chunkr-ai 0.1.0a5__py3-none-any.whl → 0.1.0a7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. chunkr_ai/__init__.py +2 -0
  2. chunkr_ai/_client.py +31 -3
  3. chunkr_ai/_constants.py +5 -5
  4. chunkr_ai/_exceptions.py +4 -0
  5. chunkr_ai/_models.py +1 -1
  6. chunkr_ai/_types.py +35 -1
  7. chunkr_ai/_utils/__init__.py +1 -0
  8. chunkr_ai/_utils/_typing.py +5 -0
  9. chunkr_ai/_version.py +1 -1
  10. chunkr_ai/resources/__init__.py +14 -0
  11. chunkr_ai/resources/files.py +3 -3
  12. chunkr_ai/resources/tasks/__init__.py +14 -0
  13. chunkr_ai/resources/tasks/extract.py +409 -0
  14. chunkr_ai/resources/tasks/parse.py +102 -346
  15. chunkr_ai/resources/tasks/tasks.py +62 -14
  16. chunkr_ai/resources/webhooks.py +193 -0
  17. chunkr_ai/types/__init__.py +27 -1
  18. chunkr_ai/types/bounding_box.py +19 -0
  19. chunkr_ai/types/cell.py +39 -0
  20. chunkr_ai/types/cell_style.py +28 -0
  21. chunkr_ai/types/chunk.py +40 -0
  22. chunkr_ai/types/chunk_processing.py +40 -0
  23. chunkr_ai/types/chunk_processing_param.py +42 -0
  24. chunkr_ai/types/extract_configuration.py +24 -0
  25. chunkr_ai/types/extract_output_response.py +19 -0
  26. chunkr_ai/types/file_create_params.py +2 -1
  27. chunkr_ai/types/file_info.py +21 -0
  28. chunkr_ai/types/generation_config.py +29 -0
  29. chunkr_ai/types/generation_config_param.py +29 -0
  30. chunkr_ai/types/llm_processing.py +36 -0
  31. chunkr_ai/types/llm_processing_param.py +36 -0
  32. chunkr_ai/types/ocr_result.py +28 -0
  33. chunkr_ai/types/page.py +27 -0
  34. chunkr_ai/types/parse_configuration.py +64 -0
  35. chunkr_ai/types/parse_configuration_param.py +65 -0
  36. chunkr_ai/types/parse_output_response.py +29 -0
  37. chunkr_ai/types/segment.py +109 -0
  38. chunkr_ai/types/segment_processing.py +228 -0
  39. chunkr_ai/types/segment_processing_param.py +229 -0
  40. chunkr_ai/types/task_extract_updated_webhook_event.py +22 -0
  41. chunkr_ai/types/task_list_params.py +7 -1
  42. chunkr_ai/types/task_parse_updated_webhook_event.py +22 -0
  43. chunkr_ai/types/task_response.py +68 -0
  44. chunkr_ai/types/tasks/__init__.py +7 -1
  45. chunkr_ai/types/tasks/extract_create_params.py +47 -0
  46. chunkr_ai/types/tasks/extract_create_response.py +214 -0
  47. chunkr_ai/types/tasks/extract_get_params.py +21 -0
  48. chunkr_ai/types/tasks/extract_get_response.py +214 -0
  49. chunkr_ai/types/tasks/parse_create_params.py +25 -805
  50. chunkr_ai/types/tasks/parse_create_response.py +55 -0
  51. chunkr_ai/types/tasks/parse_get_params.py +21 -0
  52. chunkr_ai/types/tasks/parse_get_response.py +55 -0
  53. chunkr_ai/types/unwrap_webhook_event.py +11 -0
  54. chunkr_ai/types/version_info.py +31 -0
  55. chunkr_ai/types/webhook_url_response.py +9 -0
  56. {chunkr_ai-0.1.0a5.dist-info → chunkr_ai-0.1.0a7.dist-info}/METADATA +14 -13
  57. chunkr_ai-0.1.0a7.dist-info/RECORD +86 -0
  58. chunkr_ai/types/task.py +0 -1225
  59. chunkr_ai/types/tasks/parse_update_params.py +0 -857
  60. chunkr_ai-0.1.0a5.dist-info/RECORD +0 -52
  61. {chunkr_ai-0.1.0a5.dist-info → chunkr_ai-0.1.0a7.dist-info}/WHEEL +0 -0
  62. {chunkr_ai-0.1.0a5.dist-info → chunkr_ai-0.1.0a7.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,214 @@
1
+ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
+
3
+ from typing import Optional
4
+ from datetime import datetime
5
+ from typing_extensions import Literal
6
+
7
+ from ..._models import BaseModel
8
+ from ..file_info import FileInfo
9
+ from ..version_info import VersionInfo
10
+ from ..extract_configuration import ExtractConfiguration
11
+ from ..extract_output_response import ExtractOutputResponse
12
+
13
+ __all__ = ["ExtractCreateResponse"]
14
+
15
+
16
+ class ExtractCreateResponse(BaseModel):
17
+ configuration: ExtractConfiguration
18
+
19
+ created_at: datetime
20
+ """The date and time when the task was created and queued."""
21
+
22
+ file_info: FileInfo
23
+ """Information about the input file."""
24
+
25
+ message: str
26
+ """A message describing the task's status or any errors that occurred."""
27
+
28
+ status: Literal["Starting", "Processing", "Succeeded", "Failed", "Cancelled"]
29
+ """The status of the task."""
30
+
31
+ task_id: str
32
+ """The unique identifier for the task."""
33
+
34
+ task_type: Literal["Parse", "Extract"]
35
+
36
+ version_info: VersionInfo
37
+ """Version information for the task."""
38
+
39
+ expires_at: Optional[datetime] = None
40
+ """The date and time when the task will expire."""
41
+
42
+ finished_at: Optional[datetime] = None
43
+ """The date and time when the task was finished."""
44
+
45
+ input_file_url: Optional[str] = None
46
+ """The presigned URL of the input file. Deprecated: use `file_info.url` instead."""
47
+
48
+ output: Optional[ExtractOutputResponse] = None
49
+ """The processed results of a document extraction task.
50
+
51
+ Shapes:
52
+
53
+ - `results`: JSON matching the user-provided schema.
54
+ - `citations`: mirror of `results`; only leaf positions (primitive or
55
+ array-of-primitives) contain a list of `Citation` objects supporting that field.
56
+ - `metrics`: mirror of `results`; only leaf positions contain a `Metrics` object
57
+ for that field.
58
+
59
+ Detailed shape:
60
+
61
+ - Shared structure: `results`, `citations`, and `metrics` have the same
62
+ object/array shape as the user schema. Non-leaf nodes (objects, arrays of
63
+ objects) are mirrored; only leaves carry values.
64
+ - Leaf definition:
65
+ - A leaf is either a JSON primitive (string, number, bool, or null) or an
66
+ array of primitives (including an empty array).
67
+ - Arrays of objects are not leaves; recurse into their elements (`items`
68
+ mirror index-by-index).
69
+ - Null handling:
70
+ - If a leaf in `results` is null, the corresponding position in `citations`
71
+ and `metrics` remains null.
72
+ - Arrays:
73
+ - Array of objects: `citations`/`metrics` are arrays whose elements mirror
74
+ each object and carry values at their own leaves.
75
+ - Array of primitives: treated as a single leaf. `citations[path]` is a list
76
+ of `Citation` supporting the array as a whole. `metrics[path]` is a
77
+ `Metrics` object for the array as a whole.
78
+ - Citations leaves:
79
+ - Type: JSON array of `Citation` objects.
80
+ - Each `Citation` has: `citation_id: string`, `citation_type: Segment|Word`,
81
+ `bbox: BoundingBox[]`, `content: string`, `segment_id?: string`,
82
+ `segment_type: SegmentType`, `ss_range?: string[]`.
83
+ - Segment citation: represents a full parsed segment; `segment_id` set,
84
+ `bbox` has one entry (segment box), `content` is the segment text. If the
85
+ segment is from a spreadsheet, `ss_range` contains the table range
86
+ (single-element array) or the underlying cell refs if available.
87
+ - Word citation: represents selected OCR words within a segment;
88
+ `segment_id` is null, `bbox` has one entry per word, `content` is the
89
+ whitespace-joined text of those words; `segment_type` is `Text`. If OCR
90
+ words came from spreadsheet cells, `ss_range` lists those cell refs.
91
+ - Metrics leaves:
92
+ - Type: `Metrics` object with `confidence: "High" | "Low"`, indicating whether
93
+ citations sufficiently support the item.
94
+
95
+ Example:
96
+
97
+ results
98
+
99
+ ```json
100
+ {
101
+ "invoice_id": "INV-001",
102
+ "seller": { "name": "Acme" },
103
+ "line_items": [{ "sku": "A1", "qty": 2 }],
104
+ "tags": ["urgent", "paid"],
105
+ "notes": null
106
+ }
107
+ ```
108
+
109
+ citations
110
+
111
+ ```json
112
+ {
113
+ "invoice_id": [
114
+ {
115
+ "citation_id": "abc1234",
116
+ "citation_type": "Segment",
117
+ "bbox": [{ "left": 10, "top": 20, "width": 100, "height": 18 }],
118
+ "content": "Invoice INV-001",
119
+ "segment_id": "seg_001",
120
+ "segment_type": "Text",
121
+ "ss_range": ["A1:C10"]
122
+ },
123
+ {
124
+ "citation_id": "pqr2345",
125
+ "citation_type": "Word",
126
+ "bbox": [
127
+ { "left": 12, "top": 24, "width": 36, "height": 18 },
128
+ { "left": 52, "top": 24, "width": 48, "height": 18 }
129
+ ],
130
+ "content": "INV-001",
131
+ "segment_id": null,
132
+ "segment_type": "Text",
133
+ "ss_range": ["B3", "C3"]
134
+ }
135
+ ],
136
+ "seller": {
137
+ "name": [
138
+ {
139
+ "citation_id": "def5678",
140
+ "citation_type": "Word",
141
+ "bbox": [
142
+ { "left": 45, "top": 80, "width": 30, "height": 12 },
143
+ { "left": 80, "top": 80, "width": 40, "height": 12 }
144
+ ],
145
+ "content": "Acme",
146
+ "segment_id": null,
147
+ "segment_type": "Text"
148
+ }
149
+ ]
150
+ },
151
+ "line_items": [
152
+ {
153
+ "sku": [
154
+ {
155
+ "citation_id": "ghi9012",
156
+ "citation_type": "Segment",
157
+ "bbox": [{ "left": 12, "top": 140, "width": 60, "height": 16 }],
158
+ "content": "A1",
159
+ "segment_id": "seg_010",
160
+ "segment_type": "Text",
161
+ "ss_range": ["D5:E12"]
162
+ }
163
+ ],
164
+ "qty": [
165
+ {
166
+ "citation_id": "jkl3456",
167
+ "citation_type": "Word",
168
+ "bbox": [{ "left": 85, "top": 140, "width": 12, "height": 16 }],
169
+ "content": "2",
170
+ "segment_id": null,
171
+ "segment_type": "Text",
172
+ "ss_range": ["E12"]
173
+ }
174
+ ]
175
+ }
176
+ ],
177
+ "tags": [
178
+ {
179
+ "citation_id": "mno7890",
180
+ "citation_type": "Segment",
181
+ "bbox": [{ "left": 12, "top": 200, "width": 100, "height": 16 }],
182
+ "content": "urgent paid",
183
+ "segment_id": "seg_020",
184
+ "segment_type": "Text",
185
+ "ss_range": ["A20:C25"]
186
+ }
187
+ ],
188
+ "notes": null
189
+ }
190
+ ```
191
+
192
+ metrics
193
+
194
+ ```json
195
+ {
196
+ "invoice_id": { "confidence": "High" },
197
+ "seller": { "name": { "confidence": "Low" } },
198
+ "line_items": [
199
+ { "sku": { "confidence": "High" }, "qty": { "confidence": "High" } }
200
+ ],
201
+ "tags": { "confidence": "Low" },
202
+ "notes": null
203
+ }
204
+ ```
205
+ """
206
+
207
+ source_task_id: Optional[str] = None
208
+ """The ID of the source `parse` task that was used for extraction."""
209
+
210
+ started_at: Optional[datetime] = None
211
+ """The date and time when the task was started."""
212
+
213
+ task_url: Optional[str] = None
214
+ """The presigned URL of the task."""
@@ -0,0 +1,21 @@
1
+ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing_extensions import TypedDict
6
+
7
+ __all__ = ["ExtractGetParams"]
8
+
9
+
10
+ class ExtractGetParams(TypedDict, total=False):
11
+ base64_urls: bool
12
+ """Whether to return base64 encoded URLs.
13
+
14
+ If false, the URLs will be returned as presigned URLs.
15
+ """
16
+
17
+ include_chunks: bool
18
+ """Whether to include chunks in the output response."""
19
+
20
+ wait_for_completion: bool
21
+ """Whether to wait for the task to complete."""
@@ -0,0 +1,214 @@
1
+ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
+
3
+ from typing import Optional
4
+ from datetime import datetime
5
+ from typing_extensions import Literal
6
+
7
+ from ..._models import BaseModel
8
+ from ..file_info import FileInfo
9
+ from ..version_info import VersionInfo
10
+ from ..extract_configuration import ExtractConfiguration
11
+ from ..extract_output_response import ExtractOutputResponse
12
+
13
+ __all__ = ["ExtractGetResponse"]
14
+
15
+
16
+ class ExtractGetResponse(BaseModel):
17
+ configuration: ExtractConfiguration
18
+
19
+ created_at: datetime
20
+ """The date and time when the task was created and queued."""
21
+
22
+ file_info: FileInfo
23
+ """Information about the input file."""
24
+
25
+ message: str
26
+ """A message describing the task's status or any errors that occurred."""
27
+
28
+ status: Literal["Starting", "Processing", "Succeeded", "Failed", "Cancelled"]
29
+ """The status of the task."""
30
+
31
+ task_id: str
32
+ """The unique identifier for the task."""
33
+
34
+ task_type: Literal["Parse", "Extract"]
35
+
36
+ version_info: VersionInfo
37
+ """Version information for the task."""
38
+
39
+ expires_at: Optional[datetime] = None
40
+ """The date and time when the task will expire."""
41
+
42
+ finished_at: Optional[datetime] = None
43
+ """The date and time when the task was finished."""
44
+
45
+ input_file_url: Optional[str] = None
46
+ """The presigned URL of the input file. Deprecated: use `file_info.url` instead."""
47
+
48
+ output: Optional[ExtractOutputResponse] = None
49
+ """The processed results of a document extraction task.
50
+
51
+ Shapes:
52
+
53
+ - `results`: JSON matching the user-provided schema.
54
+ - `citations`: mirror of `results`; only leaf positions (primitive or
55
+ array-of-primitives) contain a list of `Citation` objects supporting that field.
56
+ - `metrics`: mirror of `results`; only leaf positions contain a `Metrics` object
57
+ for that field.
58
+
59
+ Detailed shape:
60
+
61
+ - Shared structure: `results`, `citations`, and `metrics` have the same
62
+ object/array shape as the user schema. Non-leaf nodes (objects, arrays of
63
+ objects) are mirrored; only leaves carry values.
64
+ - Leaf definition:
65
+ - A leaf is either a JSON primitive (string, number, bool, or null) or an
66
+ array of primitives (including an empty array).
67
+ - Arrays of objects are not leaves; recurse into their elements (`items`
68
+ mirror index-by-index).
69
+ - Null handling:
70
+ - If a leaf in `results` is null, the corresponding position in `citations`
71
+ and `metrics` remains null.
72
+ - Arrays:
73
+ - Array of objects: `citations`/`metrics` are arrays whose elements mirror
74
+ each object and carry values at their own leaves.
75
+ - Array of primitives: treated as a single leaf. `citations[path]` is a list
76
+ of `Citation` supporting the array as a whole. `metrics[path]` is a
77
+ `Metrics` object for the array as a whole.
78
+ - Citations leaves:
79
+ - Type: JSON array of `Citation` objects.
80
+ - Each `Citation` has: `citation_id: string`, `citation_type: Segment|Word`,
81
+ `bbox: BoundingBox[]`, `content: string`, `segment_id?: string`,
82
+ `segment_type: SegmentType`, `ss_range?: string[]`.
83
+ - Segment citation: represents a full parsed segment; `segment_id` set,
84
+ `bbox` has one entry (segment box), `content` is the segment text. If the
85
+ segment is from a spreadsheet, `ss_range` contains the table range
86
+ (single-element array) or the underlying cell refs if available.
87
+ - Word citation: represents selected OCR words within a segment;
88
+ `segment_id` is null, `bbox` has one entry per word, `content` is the
89
+ whitespace-joined text of those words; `segment_type` is `Text`. If OCR
90
+ words came from spreadsheet cells, `ss_range` lists those cell refs.
91
+ - Metrics leaves:
92
+ - Type: `Metrics` object with `confidence: "High" | "Low"`, indicating whether
93
+ citations sufficiently support the item.
94
+
95
+ Example:
96
+
97
+ results
98
+
99
+ ```json
100
+ {
101
+ "invoice_id": "INV-001",
102
+ "seller": { "name": "Acme" },
103
+ "line_items": [{ "sku": "A1", "qty": 2 }],
104
+ "tags": ["urgent", "paid"],
105
+ "notes": null
106
+ }
107
+ ```
108
+
109
+ citations
110
+
111
+ ```json
112
+ {
113
+ "invoice_id": [
114
+ {
115
+ "citation_id": "abc1234",
116
+ "citation_type": "Segment",
117
+ "bbox": [{ "left": 10, "top": 20, "width": 100, "height": 18 }],
118
+ "content": "Invoice INV-001",
119
+ "segment_id": "seg_001",
120
+ "segment_type": "Text",
121
+ "ss_range": ["A1:C10"]
122
+ },
123
+ {
124
+ "citation_id": "pqr2345",
125
+ "citation_type": "Word",
126
+ "bbox": [
127
+ { "left": 12, "top": 24, "width": 36, "height": 18 },
128
+ { "left": 52, "top": 24, "width": 48, "height": 18 }
129
+ ],
130
+ "content": "INV-001",
131
+ "segment_id": null,
132
+ "segment_type": "Text",
133
+ "ss_range": ["B3", "C3"]
134
+ }
135
+ ],
136
+ "seller": {
137
+ "name": [
138
+ {
139
+ "citation_id": "def5678",
140
+ "citation_type": "Word",
141
+ "bbox": [
142
+ { "left": 45, "top": 80, "width": 30, "height": 12 },
143
+ { "left": 80, "top": 80, "width": 40, "height": 12 }
144
+ ],
145
+ "content": "Acme",
146
+ "segment_id": null,
147
+ "segment_type": "Text"
148
+ }
149
+ ]
150
+ },
151
+ "line_items": [
152
+ {
153
+ "sku": [
154
+ {
155
+ "citation_id": "ghi9012",
156
+ "citation_type": "Segment",
157
+ "bbox": [{ "left": 12, "top": 140, "width": 60, "height": 16 }],
158
+ "content": "A1",
159
+ "segment_id": "seg_010",
160
+ "segment_type": "Text",
161
+ "ss_range": ["D5:E12"]
162
+ }
163
+ ],
164
+ "qty": [
165
+ {
166
+ "citation_id": "jkl3456",
167
+ "citation_type": "Word",
168
+ "bbox": [{ "left": 85, "top": 140, "width": 12, "height": 16 }],
169
+ "content": "2",
170
+ "segment_id": null,
171
+ "segment_type": "Text",
172
+ "ss_range": ["E12"]
173
+ }
174
+ ]
175
+ }
176
+ ],
177
+ "tags": [
178
+ {
179
+ "citation_id": "mno7890",
180
+ "citation_type": "Segment",
181
+ "bbox": [{ "left": 12, "top": 200, "width": 100, "height": 16 }],
182
+ "content": "urgent paid",
183
+ "segment_id": "seg_020",
184
+ "segment_type": "Text",
185
+ "ss_range": ["A20:C25"]
186
+ }
187
+ ],
188
+ "notes": null
189
+ }
190
+ ```
191
+
192
+ metrics
193
+
194
+ ```json
195
+ {
196
+ "invoice_id": { "confidence": "High" },
197
+ "seller": { "name": { "confidence": "Low" } },
198
+ "line_items": [
199
+ { "sku": { "confidence": "High" }, "qty": { "confidence": "High" } }
200
+ ],
201
+ "tags": { "confidence": "Low" },
202
+ "notes": null
203
+ }
204
+ ```
205
+ """
206
+
207
+ source_task_id: Optional[str] = None
208
+ """The ID of the source `parse` task that was used for extraction."""
209
+
210
+ started_at: Optional[datetime] = None
211
+ """The date and time when the task was started."""
212
+
213
+ task_url: Optional[str] = None
214
+ """The presigned URL of the task."""