chunkr-ai 0.1.0a7__py3-none-any.whl → 0.1.0a8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunkr_ai/_base_client.py +3 -3
- chunkr_ai/_compat.py +48 -48
- chunkr_ai/_models.py +40 -40
- chunkr_ai/_utils/__init__.py +8 -2
- chunkr_ai/_utils/_compat.py +45 -0
- chunkr_ai/_utils/_datetime_parse.py +136 -0
- chunkr_ai/_utils/_transform.py +11 -1
- chunkr_ai/_utils/_typing.py +1 -1
- chunkr_ai/_utils/_utils.py +0 -1
- chunkr_ai/_version.py +1 -1
- chunkr_ai/resources/tasks/extract.py +4 -20
- chunkr_ai/resources/tasks/parse.py +4 -20
- chunkr_ai/resources/tasks/tasks.py +4 -20
- chunkr_ai/types/extract_output_response.py +45 -2
- chunkr_ai/types/task_get_params.py +0 -3
- chunkr_ai/types/tasks/extract_create_response.py +0 -147
- chunkr_ai/types/tasks/extract_get_params.py +0 -3
- chunkr_ai/types/tasks/extract_get_response.py +0 -147
- chunkr_ai/types/tasks/parse_get_params.py +0 -3
- {chunkr_ai-0.1.0a7.dist-info → chunkr_ai-0.1.0a8.dist-info}/METADATA +1 -1
- {chunkr_ai-0.1.0a7.dist-info → chunkr_ai-0.1.0a8.dist-info}/RECORD +23 -21
- {chunkr_ai-0.1.0a7.dist-info → chunkr_ai-0.1.0a8.dist-info}/WHEEL +0 -0
- {chunkr_ai-0.1.0a7.dist-info → chunkr_ai-0.1.0a8.dist-info}/licenses/LICENSE +0 -0
chunkr_ai/_utils/_utils.py
CHANGED
@@ -22,7 +22,6 @@ from typing_extensions import TypeGuard
|
|
22
22
|
import sniffio
|
23
23
|
|
24
24
|
from .._types import NotGiven, FileTypes, NotGivenOr, HeadersLike
|
25
|
-
from .._compat import parse_date as parse_date, parse_datetime as parse_datetime
|
26
25
|
|
27
26
|
_T = TypeVar("_T")
|
28
27
|
_TupleT = TypeVar("_TupleT", bound=Tuple[object, ...])
|
chunkr_ai/_version.py
CHANGED
@@ -131,7 +131,6 @@ class ExtractResource(SyncAPIResource):
|
|
131
131
|
*,
|
132
132
|
base64_urls: bool | NotGiven = NOT_GIVEN,
|
133
133
|
include_chunks: bool | NotGiven = NOT_GIVEN,
|
134
|
-
wait_for_completion: bool | NotGiven = NOT_GIVEN,
|
135
134
|
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
136
135
|
# The extra values given here take precedence over values defined on the client or passed to this method.
|
137
136
|
extra_headers: Headers | None = None,
|
@@ -140,14 +139,10 @@ class ExtractResource(SyncAPIResource):
|
|
140
139
|
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
141
140
|
) -> ExtractGetResponse:
|
142
141
|
"""
|
143
|
-
Retrieves the current state of an extract task
|
144
|
-
completion.
|
142
|
+
Retrieves the current state of an extract task.
|
145
143
|
|
146
144
|
Returns task details such as processing status, configuration, output (when
|
147
|
-
available), file metadata, and timestamps.
|
148
|
-
provided, the server will hold the request briefly. If the task does not reach a
|
149
|
-
terminal state during that window, the response will indicate a retry with
|
150
|
-
appropriate headers.
|
145
|
+
available), file metadata, and timestamps.
|
151
146
|
|
152
147
|
Typical uses:
|
153
148
|
|
@@ -161,8 +156,6 @@ class ExtractResource(SyncAPIResource):
|
|
161
156
|
|
162
157
|
include_chunks: Whether to include chunks in the output response
|
163
158
|
|
164
|
-
wait_for_completion: Whether to wait for the task to complete
|
165
|
-
|
166
159
|
extra_headers: Send extra headers
|
167
160
|
|
168
161
|
extra_query: Add additional query parameters to the request
|
@@ -184,7 +177,6 @@ class ExtractResource(SyncAPIResource):
|
|
184
177
|
{
|
185
178
|
"base64_urls": base64_urls,
|
186
179
|
"include_chunks": include_chunks,
|
187
|
-
"wait_for_completion": wait_for_completion,
|
188
180
|
},
|
189
181
|
extract_get_params.ExtractGetParams,
|
190
182
|
),
|
@@ -299,7 +291,6 @@ class AsyncExtractResource(AsyncAPIResource):
|
|
299
291
|
*,
|
300
292
|
base64_urls: bool | NotGiven = NOT_GIVEN,
|
301
293
|
include_chunks: bool | NotGiven = NOT_GIVEN,
|
302
|
-
wait_for_completion: bool | NotGiven = NOT_GIVEN,
|
303
294
|
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
304
295
|
# The extra values given here take precedence over values defined on the client or passed to this method.
|
305
296
|
extra_headers: Headers | None = None,
|
@@ -308,14 +299,10 @@ class AsyncExtractResource(AsyncAPIResource):
|
|
308
299
|
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
309
300
|
) -> ExtractGetResponse:
|
310
301
|
"""
|
311
|
-
Retrieves the current state of an extract task
|
312
|
-
completion.
|
302
|
+
Retrieves the current state of an extract task.
|
313
303
|
|
314
304
|
Returns task details such as processing status, configuration, output (when
|
315
|
-
available), file metadata, and timestamps.
|
316
|
-
provided, the server will hold the request briefly. If the task does not reach a
|
317
|
-
terminal state during that window, the response will indicate a retry with
|
318
|
-
appropriate headers.
|
305
|
+
available), file metadata, and timestamps.
|
319
306
|
|
320
307
|
Typical uses:
|
321
308
|
|
@@ -329,8 +316,6 @@ class AsyncExtractResource(AsyncAPIResource):
|
|
329
316
|
|
330
317
|
include_chunks: Whether to include chunks in the output response
|
331
318
|
|
332
|
-
wait_for_completion: Whether to wait for the task to complete
|
333
|
-
|
334
319
|
extra_headers: Send extra headers
|
335
320
|
|
336
321
|
extra_query: Add additional query parameters to the request
|
@@ -352,7 +337,6 @@ class AsyncExtractResource(AsyncAPIResource):
|
|
352
337
|
{
|
353
338
|
"base64_urls": base64_urls,
|
354
339
|
"include_chunks": include_chunks,
|
355
|
-
"wait_for_completion": wait_for_completion,
|
356
340
|
},
|
357
341
|
extract_get_params.ExtractGetParams,
|
358
342
|
),
|
@@ -174,7 +174,6 @@ class ParseResource(SyncAPIResource):
|
|
174
174
|
*,
|
175
175
|
base64_urls: bool | NotGiven = NOT_GIVEN,
|
176
176
|
include_chunks: bool | NotGiven = NOT_GIVEN,
|
177
|
-
wait_for_completion: bool | NotGiven = NOT_GIVEN,
|
178
177
|
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
179
178
|
# The extra values given here take precedence over values defined on the client or passed to this method.
|
180
179
|
extra_headers: Headers | None = None,
|
@@ -183,14 +182,10 @@ class ParseResource(SyncAPIResource):
|
|
183
182
|
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
184
183
|
) -> ParseGetResponse:
|
185
184
|
"""
|
186
|
-
Retrieves the current state of a parse task
|
187
|
-
completion.
|
185
|
+
Retrieves the current state of a parse task.
|
188
186
|
|
189
187
|
Returns task details such as processing status, configuration, output (when
|
190
|
-
available), file metadata, and timestamps.
|
191
|
-
provided, the server will hold the request briefly. If the task does not reach a
|
192
|
-
terminal state during that window, the response will indicate a retry with
|
193
|
-
appropriate headers.
|
188
|
+
available), file metadata, and timestamps.
|
194
189
|
|
195
190
|
Typical uses:
|
196
191
|
|
@@ -204,8 +199,6 @@ class ParseResource(SyncAPIResource):
|
|
204
199
|
|
205
200
|
include_chunks: Whether to include chunks in the output response
|
206
201
|
|
207
|
-
wait_for_completion: Whether to wait for the task to complete
|
208
|
-
|
209
202
|
extra_headers: Send extra headers
|
210
203
|
|
211
204
|
extra_query: Add additional query parameters to the request
|
@@ -227,7 +220,6 @@ class ParseResource(SyncAPIResource):
|
|
227
220
|
{
|
228
221
|
"base64_urls": base64_urls,
|
229
222
|
"include_chunks": include_chunks,
|
230
|
-
"wait_for_completion": wait_for_completion,
|
231
223
|
},
|
232
224
|
parse_get_params.ParseGetParams,
|
233
225
|
),
|
@@ -382,7 +374,6 @@ class AsyncParseResource(AsyncAPIResource):
|
|
382
374
|
*,
|
383
375
|
base64_urls: bool | NotGiven = NOT_GIVEN,
|
384
376
|
include_chunks: bool | NotGiven = NOT_GIVEN,
|
385
|
-
wait_for_completion: bool | NotGiven = NOT_GIVEN,
|
386
377
|
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
387
378
|
# The extra values given here take precedence over values defined on the client or passed to this method.
|
388
379
|
extra_headers: Headers | None = None,
|
@@ -391,14 +382,10 @@ class AsyncParseResource(AsyncAPIResource):
|
|
391
382
|
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
392
383
|
) -> ParseGetResponse:
|
393
384
|
"""
|
394
|
-
Retrieves the current state of a parse task
|
395
|
-
completion.
|
385
|
+
Retrieves the current state of a parse task.
|
396
386
|
|
397
387
|
Returns task details such as processing status, configuration, output (when
|
398
|
-
available), file metadata, and timestamps.
|
399
|
-
provided, the server will hold the request briefly. If the task does not reach a
|
400
|
-
terminal state during that window, the response will indicate a retry with
|
401
|
-
appropriate headers.
|
388
|
+
available), file metadata, and timestamps.
|
402
389
|
|
403
390
|
Typical uses:
|
404
391
|
|
@@ -412,8 +399,6 @@ class AsyncParseResource(AsyncAPIResource):
|
|
412
399
|
|
413
400
|
include_chunks: Whether to include chunks in the output response
|
414
401
|
|
415
|
-
wait_for_completion: Whether to wait for the task to complete
|
416
|
-
|
417
402
|
extra_headers: Send extra headers
|
418
403
|
|
419
404
|
extra_query: Add additional query parameters to the request
|
@@ -435,7 +420,6 @@ class AsyncParseResource(AsyncAPIResource):
|
|
435
420
|
{
|
436
421
|
"base64_urls": base64_urls,
|
437
422
|
"include_chunks": include_chunks,
|
438
|
-
"wait_for_completion": wait_for_completion,
|
439
423
|
},
|
440
424
|
parse_get_params.ParseGetParams,
|
441
425
|
),
|
@@ -240,7 +240,6 @@ class TasksResource(SyncAPIResource):
|
|
240
240
|
*,
|
241
241
|
base64_urls: bool | NotGiven = NOT_GIVEN,
|
242
242
|
include_chunks: bool | NotGiven = NOT_GIVEN,
|
243
|
-
wait_for_completion: bool | NotGiven = NOT_GIVEN,
|
244
243
|
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
245
244
|
# The extra values given here take precedence over values defined on the client or passed to this method.
|
246
245
|
extra_headers: Headers | None = None,
|
@@ -249,14 +248,10 @@ class TasksResource(SyncAPIResource):
|
|
249
248
|
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
250
249
|
) -> TaskResponse:
|
251
250
|
"""
|
252
|
-
Retrieves the current state of a task
|
253
|
-
completion.
|
251
|
+
Retrieves the current state of a task.
|
254
252
|
|
255
253
|
Returns task details such as processing status, configuration, output (when
|
256
|
-
available), file metadata, and timestamps.
|
257
|
-
provided, the server will hold the request briefly. If the task does not reach a
|
258
|
-
terminal state during that window, the response will indicate a retry with
|
259
|
-
appropriate headers.
|
254
|
+
available), file metadata, and timestamps.
|
260
255
|
|
261
256
|
Typical uses:
|
262
257
|
|
@@ -270,8 +265,6 @@ class TasksResource(SyncAPIResource):
|
|
270
265
|
|
271
266
|
include_chunks: Whether to include chunks in the output response
|
272
267
|
|
273
|
-
wait_for_completion: Whether to wait for the task to complete
|
274
|
-
|
275
268
|
extra_headers: Send extra headers
|
276
269
|
|
277
270
|
extra_query: Add additional query parameters to the request
|
@@ -293,7 +286,6 @@ class TasksResource(SyncAPIResource):
|
|
293
286
|
{
|
294
287
|
"base64_urls": base64_urls,
|
295
288
|
"include_chunks": include_chunks,
|
296
|
-
"wait_for_completion": wait_for_completion,
|
297
289
|
},
|
298
290
|
task_get_params.TaskGetParams,
|
299
291
|
),
|
@@ -500,7 +492,6 @@ class AsyncTasksResource(AsyncAPIResource):
|
|
500
492
|
*,
|
501
493
|
base64_urls: bool | NotGiven = NOT_GIVEN,
|
502
494
|
include_chunks: bool | NotGiven = NOT_GIVEN,
|
503
|
-
wait_for_completion: bool | NotGiven = NOT_GIVEN,
|
504
495
|
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
505
496
|
# The extra values given here take precedence over values defined on the client or passed to this method.
|
506
497
|
extra_headers: Headers | None = None,
|
@@ -509,14 +500,10 @@ class AsyncTasksResource(AsyncAPIResource):
|
|
509
500
|
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
510
501
|
) -> TaskResponse:
|
511
502
|
"""
|
512
|
-
Retrieves the current state of a task
|
513
|
-
completion.
|
503
|
+
Retrieves the current state of a task.
|
514
504
|
|
515
505
|
Returns task details such as processing status, configuration, output (when
|
516
|
-
available), file metadata, and timestamps.
|
517
|
-
provided, the server will hold the request briefly. If the task does not reach a
|
518
|
-
terminal state during that window, the response will indicate a retry with
|
519
|
-
appropriate headers.
|
506
|
+
available), file metadata, and timestamps.
|
520
507
|
|
521
508
|
Typical uses:
|
522
509
|
|
@@ -530,8 +517,6 @@ class AsyncTasksResource(AsyncAPIResource):
|
|
530
517
|
|
531
518
|
include_chunks: Whether to include chunks in the output response
|
532
519
|
|
533
|
-
wait_for_completion: Whether to wait for the task to complete
|
534
|
-
|
535
520
|
extra_headers: Send extra headers
|
536
521
|
|
537
522
|
extra_query: Add additional query parameters to the request
|
@@ -553,7 +538,6 @@ class AsyncTasksResource(AsyncAPIResource):
|
|
553
538
|
{
|
554
539
|
"base64_urls": base64_urls,
|
555
540
|
"include_chunks": include_chunks,
|
556
|
-
"wait_for_completion": wait_for_completion,
|
557
541
|
},
|
558
542
|
task_get_params.TaskGetParams,
|
559
543
|
),
|
@@ -7,13 +7,56 @@ __all__ = ["ExtractOutputResponse"]
|
|
7
7
|
|
8
8
|
class ExtractOutputResponse(BaseModel):
|
9
9
|
citations: object
|
10
|
-
"""Mirror of `results`; leaves are `Vec<Citation>` for the corresponding field
|
10
|
+
"""Mirror of `results`; leaves are `Vec<Citation>` for the corresponding field
|
11
|
+
|
12
|
+
Example:
|
13
|
+
|
14
|
+
```json
|
15
|
+
{
|
16
|
+
"field_name": [
|
17
|
+
{
|
18
|
+
"citation_id": "abc1234",
|
19
|
+
"citation_type": "Segment",
|
20
|
+
"bboxes": [
|
21
|
+
{
|
22
|
+
"left": 10,
|
23
|
+
"top": 20,
|
24
|
+
"width": 100,
|
25
|
+
"height": 18
|
26
|
+
}
|
27
|
+
],
|
28
|
+
"content": "Example content",
|
29
|
+
"segment_id": "seg_001",
|
30
|
+
"segment_type": "Text",
|
31
|
+
"page_number": 1,
|
32
|
+
"page_height": 297,
|
33
|
+
"page_width": 210,
|
34
|
+
"ss_ranges": ["A1:C10"],
|
35
|
+
"ss_sheet_name": "Sheet1"
|
36
|
+
}
|
37
|
+
]
|
38
|
+
}
|
39
|
+
```
|
40
|
+
"""
|
11
41
|
|
12
42
|
metrics: object
|
13
43
|
"""
|
14
44
|
Mirror of `results`; leaves contain a `Metrics` object for the corresponding
|
15
45
|
field
|
46
|
+
|
47
|
+
Example:
|
48
|
+
|
49
|
+
```json
|
50
|
+
{ "field_name": { "confidence": "High" } }
|
51
|
+
```
|
16
52
|
"""
|
17
53
|
|
18
54
|
results: object
|
19
|
-
"""JSON data that matches the provided schema
|
55
|
+
"""JSON data that matches the provided schema
|
56
|
+
|
57
|
+
Example:
|
58
|
+
|
59
|
+
```json
|
60
|
+
{ "field_name": "value" }
|
61
|
+
```
|
62
|
+
"""
|
@@ -55,153 +55,6 @@ class ExtractCreateResponse(BaseModel):
|
|
55
55
|
array-of-primitives) contain a `Vec<Citation>` supporting that field.
|
56
56
|
- `metrics`: mirror of `results`; only leaf positions contain a `Metrics` object
|
57
57
|
for that field.
|
58
|
-
|
59
|
-
Detailed shape:
|
60
|
-
|
61
|
-
- Shared structure: `results`, `citations`, and `metrics` have the same
|
62
|
-
object/array shape as the user schema. Non-leaf nodes (objects, arrays of
|
63
|
-
objects) are mirrored; only leaves carry values.
|
64
|
-
- Leaf definition:
|
65
|
-
- A leaf is either a JSON primitive (string, number, bool, or null) or an
|
66
|
-
array of primitives (including empty).
|
67
|
-
- Arrays of objects are not leaves; recurse into their elements (`items`
|
68
|
-
mirror index-by-index).
|
69
|
-
- Null handling:
|
70
|
-
- If a leaf in `results` is null, the corresponding position in `citations`
|
71
|
-
and `metrics` remains null.
|
72
|
-
- Arrays:
|
73
|
-
- Array of objects: `citations`/`metrics` are arrays whose elements mirror
|
74
|
-
each object and carry values at their own leaves.
|
75
|
-
- Array of primitives: treated as a single leaf. `citations[path]` is a list
|
76
|
-
of `Citation` supporting the array as a whole. `metrics[path]` is a
|
77
|
-
`Metrics` object for the array as a whole.
|
78
|
-
- Citations leaves:
|
79
|
-
- Type: JSON array of `Citation` objects.
|
80
|
-
- Each `Citation` has: `citation_id: string`, `citation_type: Segment|Word`,
|
81
|
-
`bbox: BoundingBox[]`, `content: string`, `segment_id?: string`,
|
82
|
-
`segment_type: SegmentType`, `ss_range?: string[]`.
|
83
|
-
- Segment citation: represents a full parsed segment; `segment_id` set,
|
84
|
-
`bbox` has one entry (segment box), `content` is the segment text. If the
|
85
|
-
segment is from a spreadsheet, `ss_range` contains the table range
|
86
|
-
(single-element array) or the underlying cell refs if available.
|
87
|
-
- Word citation: represents selected OCR words within a segment;
|
88
|
-
`segment_id` is null, `bbox` has one entry per word, `content` is the
|
89
|
-
whitespace-joined text of those words; `segment_type` is `Text`. If OCR
|
90
|
-
words came from spreadsheet cells, `ss_range` lists those cell refs.
|
91
|
-
- Metrics leaves:
|
92
|
-
- Type: `Metrics` object with `confidence: "High" | "Low"`, indicating whether
|
93
|
-
citations sufficiently support the item.
|
94
|
-
|
95
|
-
Example:
|
96
|
-
|
97
|
-
results
|
98
|
-
|
99
|
-
```json
|
100
|
-
{
|
101
|
-
"invoice_id": "INV-001",
|
102
|
-
"seller": { "name": "Acme" },
|
103
|
-
"line_items": [{ "sku": "A1", "qty": 2 }],
|
104
|
-
"tags": ["urgent", "paid"],
|
105
|
-
"notes": null
|
106
|
-
}
|
107
|
-
```
|
108
|
-
|
109
|
-
citations
|
110
|
-
|
111
|
-
```json
|
112
|
-
{
|
113
|
-
"invoice_id": [
|
114
|
-
{
|
115
|
-
"citation_id": "abc1234",
|
116
|
-
"citation_type": "Segment",
|
117
|
-
"bbox": [{ "left": 10, "top": 20, "width": 100, "height": 18 }],
|
118
|
-
"content": "Invoice INV-001",
|
119
|
-
"segment_id": "seg_001",
|
120
|
-
"segment_type": "Text",
|
121
|
-
"ss_range": ["A1:C10"]
|
122
|
-
},
|
123
|
-
{
|
124
|
-
"citation_id": "pqr2345",
|
125
|
-
"citation_type": "Word",
|
126
|
-
"bbox": [
|
127
|
-
{ "left": 12, "top": 24, "width": 36, "height": 18 },
|
128
|
-
{ "left": 52, "top": 24, "width": 48, "height": 18 }
|
129
|
-
],
|
130
|
-
"content": "INV-001",
|
131
|
-
"segment_id": null,
|
132
|
-
"segment_type": "Text",
|
133
|
-
"ss_range": ["B3", "C3"]
|
134
|
-
}
|
135
|
-
],
|
136
|
-
"seller": {
|
137
|
-
"name": [
|
138
|
-
{
|
139
|
-
"citation_id": "def5678",
|
140
|
-
"citation_type": "Word",
|
141
|
-
"bbox": [
|
142
|
-
{ "left": 45, "top": 80, "width": 30, "height": 12 },
|
143
|
-
{ "left": 80, "top": 80, "width": 40, "height": 12 }
|
144
|
-
],
|
145
|
-
"content": "Acme",
|
146
|
-
"segment_id": null,
|
147
|
-
"segment_type": "Text"
|
148
|
-
}
|
149
|
-
]
|
150
|
-
},
|
151
|
-
"line_items": [
|
152
|
-
{
|
153
|
-
"sku": [
|
154
|
-
{
|
155
|
-
"citation_id": "ghi9012",
|
156
|
-
"citation_type": "Segment",
|
157
|
-
"bbox": [{ "left": 12, "top": 140, "width": 60, "height": 16 }],
|
158
|
-
"content": "A1",
|
159
|
-
"segment_id": "seg_010",
|
160
|
-
"segment_type": "Text",
|
161
|
-
"ss_range": ["D5:E12"]
|
162
|
-
}
|
163
|
-
],
|
164
|
-
"qty": [
|
165
|
-
{
|
166
|
-
"citation_id": "jkl3456",
|
167
|
-
"citation_type": "Word",
|
168
|
-
"bbox": [{ "left": 85, "top": 140, "width": 12, "height": 16 }],
|
169
|
-
"content": "2",
|
170
|
-
"segment_id": null,
|
171
|
-
"segment_type": "Text",
|
172
|
-
"ss_range": ["E12"]
|
173
|
-
}
|
174
|
-
]
|
175
|
-
}
|
176
|
-
],
|
177
|
-
"tags": [
|
178
|
-
{
|
179
|
-
"citation_id": "mno7890",
|
180
|
-
"citation_type": "Segment",
|
181
|
-
"bbox": [{ "left": 12, "top": 200, "width": 100, "height": 16 }],
|
182
|
-
"content": "urgent paid",
|
183
|
-
"segment_id": "seg_020",
|
184
|
-
"segment_type": "Text",
|
185
|
-
"ss_range": ["A20:C25"]
|
186
|
-
}
|
187
|
-
],
|
188
|
-
"notes": null
|
189
|
-
}
|
190
|
-
```
|
191
|
-
|
192
|
-
metrics
|
193
|
-
|
194
|
-
```json
|
195
|
-
{
|
196
|
-
"invoice_id": { "confidence": "High" },
|
197
|
-
"seller": { "name": { "confidence": "Low" } },
|
198
|
-
"line_items": [
|
199
|
-
{ "sku": { "confidence": "High" }, "qty": { "confidence": "High" } }
|
200
|
-
],
|
201
|
-
"tags": { "confidence": "Low" },
|
202
|
-
"notes": null
|
203
|
-
}
|
204
|
-
```
|
205
58
|
"""
|
206
59
|
|
207
60
|
source_task_id: Optional[str] = None
|
@@ -55,153 +55,6 @@ class ExtractGetResponse(BaseModel):
|
|
55
55
|
array-of-primitives) contain a `Vec<Citation>` supporting that field.
|
56
56
|
- `metrics`: mirror of `results`; only leaf positions contain a `Metrics` object
|
57
57
|
for that field.
|
58
|
-
|
59
|
-
Detailed shape:
|
60
|
-
|
61
|
-
- Shared structure: `results`, `citations`, and `metrics` have the same
|
62
|
-
object/array shape as the user schema. Non-leaf nodes (objects, arrays of
|
63
|
-
objects) are mirrored; only leaves carry values.
|
64
|
-
- Leaf definition:
|
65
|
-
- A leaf is either a JSON primitive (string, number, bool, or null) or an
|
66
|
-
array of primitives (including empty).
|
67
|
-
- Arrays of objects are not leaves; recurse into their elements (`items`
|
68
|
-
mirror index-by-index).
|
69
|
-
- Null handling:
|
70
|
-
- If a leaf in `results` is null, the corresponding position in `citations`
|
71
|
-
and `metrics` remains null.
|
72
|
-
- Arrays:
|
73
|
-
- Array of objects: `citations`/`metrics` are arrays whose elements mirror
|
74
|
-
each object and carry values at their own leaves.
|
75
|
-
- Array of primitives: treated as a single leaf. `citations[path]` is a list
|
76
|
-
of `Citation` supporting the array as a whole. `metrics[path]` is a
|
77
|
-
`Metrics` object for the array as a whole.
|
78
|
-
- Citations leaves:
|
79
|
-
- Type: JSON array of `Citation` objects.
|
80
|
-
- Each `Citation` has: `citation_id: string`, `citation_type: Segment|Word`,
|
81
|
-
`bbox: BoundingBox[]`, `content: string`, `segment_id?: string`,
|
82
|
-
`segment_type: SegmentType`, `ss_range?: string[]`.
|
83
|
-
- Segment citation: represents a full parsed segment; `segment_id` set,
|
84
|
-
`bbox` has one entry (segment box), `content` is the segment text. If the
|
85
|
-
segment is from a spreadsheet, `ss_range` contains the table range
|
86
|
-
(single-element array) or the underlying cell refs if available.
|
87
|
-
- Word citation: represents selected OCR words within a segment;
|
88
|
-
`segment_id` is null, `bbox` has one entry per word, `content` is the
|
89
|
-
whitespace-joined text of those words; `segment_type` is `Text`. If OCR
|
90
|
-
words came from spreadsheet cells, `ss_range` lists those cell refs.
|
91
|
-
- Metrics leaves:
|
92
|
-
- Type: `Metrics` object with `confidence: "High" | "Low"`, indicating whether
|
93
|
-
citations sufficiently support the item.
|
94
|
-
|
95
|
-
Example:
|
96
|
-
|
97
|
-
results
|
98
|
-
|
99
|
-
```json
|
100
|
-
{
|
101
|
-
"invoice_id": "INV-001",
|
102
|
-
"seller": { "name": "Acme" },
|
103
|
-
"line_items": [{ "sku": "A1", "qty": 2 }],
|
104
|
-
"tags": ["urgent", "paid"],
|
105
|
-
"notes": null
|
106
|
-
}
|
107
|
-
```
|
108
|
-
|
109
|
-
citations
|
110
|
-
|
111
|
-
```json
|
112
|
-
{
|
113
|
-
"invoice_id": [
|
114
|
-
{
|
115
|
-
"citation_id": "abc1234",
|
116
|
-
"citation_type": "Segment",
|
117
|
-
"bbox": [{ "left": 10, "top": 20, "width": 100, "height": 18 }],
|
118
|
-
"content": "Invoice INV-001",
|
119
|
-
"segment_id": "seg_001",
|
120
|
-
"segment_type": "Text",
|
121
|
-
"ss_range": ["A1:C10"]
|
122
|
-
},
|
123
|
-
{
|
124
|
-
"citation_id": "pqr2345",
|
125
|
-
"citation_type": "Word",
|
126
|
-
"bbox": [
|
127
|
-
{ "left": 12, "top": 24, "width": 36, "height": 18 },
|
128
|
-
{ "left": 52, "top": 24, "width": 48, "height": 18 }
|
129
|
-
],
|
130
|
-
"content": "INV-001",
|
131
|
-
"segment_id": null,
|
132
|
-
"segment_type": "Text",
|
133
|
-
"ss_range": ["B3", "C3"]
|
134
|
-
}
|
135
|
-
],
|
136
|
-
"seller": {
|
137
|
-
"name": [
|
138
|
-
{
|
139
|
-
"citation_id": "def5678",
|
140
|
-
"citation_type": "Word",
|
141
|
-
"bbox": [
|
142
|
-
{ "left": 45, "top": 80, "width": 30, "height": 12 },
|
143
|
-
{ "left": 80, "top": 80, "width": 40, "height": 12 }
|
144
|
-
],
|
145
|
-
"content": "Acme",
|
146
|
-
"segment_id": null,
|
147
|
-
"segment_type": "Text"
|
148
|
-
}
|
149
|
-
]
|
150
|
-
},
|
151
|
-
"line_items": [
|
152
|
-
{
|
153
|
-
"sku": [
|
154
|
-
{
|
155
|
-
"citation_id": "ghi9012",
|
156
|
-
"citation_type": "Segment",
|
157
|
-
"bbox": [{ "left": 12, "top": 140, "width": 60, "height": 16 }],
|
158
|
-
"content": "A1",
|
159
|
-
"segment_id": "seg_010",
|
160
|
-
"segment_type": "Text",
|
161
|
-
"ss_range": ["D5:E12"]
|
162
|
-
}
|
163
|
-
],
|
164
|
-
"qty": [
|
165
|
-
{
|
166
|
-
"citation_id": "jkl3456",
|
167
|
-
"citation_type": "Word",
|
168
|
-
"bbox": [{ "left": 85, "top": 140, "width": 12, "height": 16 }],
|
169
|
-
"content": "2",
|
170
|
-
"segment_id": null,
|
171
|
-
"segment_type": "Text",
|
172
|
-
"ss_range": ["E12"]
|
173
|
-
}
|
174
|
-
]
|
175
|
-
}
|
176
|
-
],
|
177
|
-
"tags": [
|
178
|
-
{
|
179
|
-
"citation_id": "mno7890",
|
180
|
-
"citation_type": "Segment",
|
181
|
-
"bbox": [{ "left": 12, "top": 200, "width": 100, "height": 16 }],
|
182
|
-
"content": "urgent paid",
|
183
|
-
"segment_id": "seg_020",
|
184
|
-
"segment_type": "Text",
|
185
|
-
"ss_range": ["A20:C25"]
|
186
|
-
}
|
187
|
-
],
|
188
|
-
"notes": null
|
189
|
-
}
|
190
|
-
```
|
191
|
-
|
192
|
-
metrics
|
193
|
-
|
194
|
-
```json
|
195
|
-
{
|
196
|
-
"invoice_id": { "confidence": "High" },
|
197
|
-
"seller": { "name": { "confidence": "Low" } },
|
198
|
-
"line_items": [
|
199
|
-
{ "sku": { "confidence": "High" }, "qty": { "confidence": "High" } }
|
200
|
-
],
|
201
|
-
"tags": { "confidence": "Low" },
|
202
|
-
"notes": null
|
203
|
-
}
|
204
|
-
```
|
205
58
|
"""
|
206
59
|
|
207
60
|
source_task_id: Optional[str] = None
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: chunkr-ai
|
3
|
-
Version: 0.1.0a7
|
3
|
+
Version: 0.1.0a8
|
4
4
|
Summary: The official Python library for the chunkr API
|
5
5
|
Project-URL: Homepage, https://github.com/lumina-ai-inc/chunkr-python
|
6
6
|
Project-URL: Repository, https://github.com/lumina-ai-inc/chunkr-python
|