chunkr-ai 0.1.0a7__py3-none-any.whl → 0.1.0a9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunkr_ai/__init__.py +3 -1
- chunkr_ai/_base_client.py +12 -12
- chunkr_ai/_client.py +8 -8
- chunkr_ai/_compat.py +48 -48
- chunkr_ai/_models.py +50 -44
- chunkr_ai/_qs.py +7 -7
- chunkr_ai/_types.py +18 -11
- chunkr_ai/_utils/__init__.py +8 -2
- chunkr_ai/_utils/_compat.py +45 -0
- chunkr_ai/_utils/_datetime_parse.py +136 -0
- chunkr_ai/_utils/_transform.py +13 -3
- chunkr_ai/_utils/_typing.py +1 -1
- chunkr_ai/_utils/_utils.py +4 -5
- chunkr_ai/_version.py +1 -1
- chunkr_ai/resources/files.py +29 -29
- chunkr_ai/resources/health.py +3 -3
- chunkr_ai/resources/tasks/extract.py +21 -37
- chunkr_ai/resources/tasks/parse.py +29 -54
- chunkr_ai/resources/tasks/tasks.py +35 -51
- chunkr_ai/resources/webhooks.py +3 -3
- chunkr_ai/types/__init__.py +0 -2
- chunkr_ai/types/extract_output_response.py +45 -2
- chunkr_ai/types/file_info.py +3 -0
- chunkr_ai/types/ocr_result.py +6 -6
- chunkr_ai/types/parse_configuration.py +0 -4
- chunkr_ai/types/parse_configuration_param.py +0 -4
- chunkr_ai/types/segment.py +8 -5
- chunkr_ai/types/segment_processing.py +92 -2
- chunkr_ai/types/segment_processing_param.py +92 -2
- chunkr_ai/types/task_get_params.py +0 -3
- chunkr_ai/types/tasks/extract_create_response.py +0 -147
- chunkr_ai/types/tasks/extract_get_params.py +0 -3
- chunkr_ai/types/tasks/extract_get_response.py +0 -147
- chunkr_ai/types/tasks/parse_create_params.py +0 -4
- chunkr_ai/types/tasks/parse_get_params.py +0 -3
- chunkr_ai/types/version_info.py +1 -1
- {chunkr_ai-0.1.0a7.dist-info → chunkr_ai-0.1.0a9.dist-info}/METADATA +1 -1
- {chunkr_ai-0.1.0a7.dist-info → chunkr_ai-0.1.0a9.dist-info}/RECORD +40 -40
- chunkr_ai/types/llm_processing.py +0 -36
- chunkr_ai/types/llm_processing_param.py +0 -36
- {chunkr_ai-0.1.0a7.dist-info → chunkr_ai-0.1.0a9.dist-info}/WHEEL +0 -0
- {chunkr_ai-0.1.0a7.dist-info → chunkr_ai-0.1.0a9.dist-info}/licenses/LICENSE +0 -0
chunkr_ai/types/file_info.py
CHANGED
chunkr_ai/types/ocr_result.py
CHANGED
@@ -15,14 +15,14 @@ class OcrResult(BaseModel):
|
|
15
15
|
text: str
|
16
16
|
"""The recognized text of the OCR result."""
|
17
17
|
|
18
|
-
cell_ref: Optional[str] = None
|
19
|
-
"""
|
20
|
-
Excel-style cell reference (e.g., "A1" or "A1:B2") when OCR originates from a
|
21
|
-
spreadsheet cell
|
22
|
-
"""
|
23
|
-
|
24
18
|
confidence: Optional[float] = None
|
25
19
|
"""The confidence score of the recognized text."""
|
26
20
|
|
27
21
|
ocr_id: Optional[str] = None
|
28
22
|
"""The unique identifier for the OCR result."""
|
23
|
+
|
24
|
+
ss_cell_ref: Optional[str] = None
|
25
|
+
"""
|
26
|
+
Excel-style cell reference (e.g., "A1" or "A1:B2") when OCR originates from a
|
27
|
+
spreadsheet cell
|
28
|
+
"""
|
@@ -4,7 +4,6 @@ from typing import Optional
|
|
4
4
|
from typing_extensions import Literal
|
5
5
|
|
6
6
|
from .._models import BaseModel
|
7
|
-
from .llm_processing import LlmProcessing
|
8
7
|
from .chunk_processing import ChunkProcessing
|
9
8
|
from .segment_processing import SegmentProcessing
|
10
9
|
|
@@ -23,9 +22,6 @@ class ParseConfiguration(BaseModel):
|
|
23
22
|
LLM refusals etc.)
|
24
23
|
"""
|
25
24
|
|
26
|
-
llm_processing: Optional[LlmProcessing] = None
|
27
|
-
"""Controls the LLM used for the task."""
|
28
|
-
|
29
25
|
ocr_strategy: Optional[Literal["All", "Auto"]] = None
|
30
26
|
"""Controls the Optical Character Recognition (OCR) strategy.
|
31
27
|
|
@@ -5,7 +5,6 @@ from __future__ import annotations
|
|
5
5
|
from typing import Optional
|
6
6
|
from typing_extensions import Literal, TypedDict
|
7
7
|
|
8
|
-
from .llm_processing_param import LlmProcessingParam
|
9
8
|
from .chunk_processing_param import ChunkProcessingParam
|
10
9
|
from .segment_processing_param import SegmentProcessingParam
|
11
10
|
|
@@ -24,9 +23,6 @@ class ParseConfigurationParam(TypedDict, total=False):
|
|
24
23
|
LLM refusals etc.)
|
25
24
|
"""
|
26
25
|
|
27
|
-
llm_processing: LlmProcessingParam
|
28
|
-
"""Controls the LLM used for the task."""
|
29
|
-
|
30
26
|
ocr_strategy: Literal["All", "Auto"]
|
31
27
|
"""Controls the Optical Character Recognition (OCR) strategy.
|
32
28
|
|
chunkr_ai/types/segment.py
CHANGED
@@ -31,20 +31,23 @@ class Segment(BaseModel):
|
|
31
31
|
"Caption",
|
32
32
|
"Footnote",
|
33
33
|
"Formula",
|
34
|
+
"FormRegion",
|
35
|
+
"GraphicalItem",
|
36
|
+
"Legend",
|
37
|
+
"LineNumber",
|
34
38
|
"ListItem",
|
35
39
|
"Page",
|
36
40
|
"PageFooter",
|
37
41
|
"PageHeader",
|
42
|
+
"PageNumber",
|
38
43
|
"Picture",
|
39
|
-
"SectionHeader",
|
40
44
|
"Table",
|
41
45
|
"Text",
|
42
46
|
"Title",
|
47
|
+
"Unknown",
|
48
|
+
"SectionHeader",
|
43
49
|
]
|
44
|
-
"""
|
45
|
-
All the possible types for a segment. Note: Different configurations will
|
46
|
-
produce different types. Please refer to the documentation for more information.
|
47
|
-
"""
|
50
|
+
"""All the possible types for a segment."""
|
48
51
|
|
49
52
|
confidence: Optional[float] = None
|
50
53
|
"""Confidence score of the layout analysis model"""
|
@@ -47,6 +47,24 @@ class SegmentProcessing(BaseModel):
|
|
47
47
|
- `extended_context` uses the full page image as context for LLM generation.
|
48
48
|
"""
|
49
49
|
|
50
|
+
form_region: Optional[GenerationConfig] = FieldInfo(alias="FormRegion", default=None)
|
51
|
+
"""Controls the processing and generation for the segment.
|
52
|
+
|
53
|
+
- `crop_image` controls whether to crop the file's images to the segment's
|
54
|
+
bounding box. The cropped image will be stored in the segment's `image` field.
|
55
|
+
Use `All` to always crop, or `Auto` to only crop when needed for
|
56
|
+
post-processing.
|
57
|
+
- `format` specifies the output format: `Html` or `Markdown`
|
58
|
+
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
59
|
+
- `Auto`: Process content automatically
|
60
|
+
- `LLM`: Use large language models for processing
|
61
|
+
- `Ignore`: Exclude segments from final output
|
62
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
63
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
64
|
+
configuration.
|
65
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
66
|
+
"""
|
67
|
+
|
50
68
|
formula: Optional[GenerationConfig] = FieldInfo(alias="Formula", default=None)
|
51
69
|
"""Controls the processing and generation for the segment.
|
52
70
|
|
@@ -65,6 +83,60 @@ class SegmentProcessing(BaseModel):
|
|
65
83
|
- `extended_context` uses the full page image as context for LLM generation.
|
66
84
|
"""
|
67
85
|
|
86
|
+
graphical_item: Optional[GenerationConfig] = FieldInfo(alias="GraphicalItem", default=None)
|
87
|
+
"""Controls the processing and generation for the segment.
|
88
|
+
|
89
|
+
- `crop_image` controls whether to crop the file's images to the segment's
|
90
|
+
bounding box. The cropped image will be stored in the segment's `image` field.
|
91
|
+
Use `All` to always crop, or `Auto` to only crop when needed for
|
92
|
+
post-processing.
|
93
|
+
- `format` specifies the output format: `Html` or `Markdown`
|
94
|
+
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
95
|
+
- `Auto`: Process content automatically
|
96
|
+
- `LLM`: Use large language models for processing
|
97
|
+
- `Ignore`: Exclude segments from final output
|
98
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
99
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
100
|
+
configuration.
|
101
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
102
|
+
"""
|
103
|
+
|
104
|
+
legend: Optional[GenerationConfig] = FieldInfo(alias="Legend", default=None)
|
105
|
+
"""Controls the processing and generation for the segment.
|
106
|
+
|
107
|
+
- `crop_image` controls whether to crop the file's images to the segment's
|
108
|
+
bounding box. The cropped image will be stored in the segment's `image` field.
|
109
|
+
Use `All` to always crop, or `Auto` to only crop when needed for
|
110
|
+
post-processing.
|
111
|
+
- `format` specifies the output format: `Html` or `Markdown`
|
112
|
+
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
113
|
+
- `Auto`: Process content automatically
|
114
|
+
- `LLM`: Use large language models for processing
|
115
|
+
- `Ignore`: Exclude segments from final output
|
116
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
117
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
118
|
+
configuration.
|
119
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
120
|
+
"""
|
121
|
+
|
122
|
+
line_number: Optional[GenerationConfig] = FieldInfo(alias="LineNumber", default=None)
|
123
|
+
"""Controls the processing and generation for the segment.
|
124
|
+
|
125
|
+
- `crop_image` controls whether to crop the file's images to the segment's
|
126
|
+
bounding box. The cropped image will be stored in the segment's `image` field.
|
127
|
+
Use `All` to always crop, or `Auto` to only crop when needed for
|
128
|
+
post-processing.
|
129
|
+
- `format` specifies the output format: `Html` or `Markdown`
|
130
|
+
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
131
|
+
- `Auto`: Process content automatically
|
132
|
+
- `LLM`: Use large language models for processing
|
133
|
+
- `Ignore`: Exclude segments from final output
|
134
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
135
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
136
|
+
configuration.
|
137
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
138
|
+
"""
|
139
|
+
|
68
140
|
list_item: Optional[GenerationConfig] = FieldInfo(alias="ListItem", default=None)
|
69
141
|
"""Controls the processing and generation for the segment.
|
70
142
|
|
@@ -137,7 +209,7 @@ class SegmentProcessing(BaseModel):
|
|
137
209
|
- `extended_context` uses the full page image as context for LLM generation.
|
138
210
|
"""
|
139
211
|
|
140
|
-
|
212
|
+
page_number: Optional[GenerationConfig] = FieldInfo(alias="PageNumber", default=None)
|
141
213
|
"""Controls the processing and generation for the segment.
|
142
214
|
|
143
215
|
- `crop_image` controls whether to crop the file's images to the segment's
|
@@ -155,7 +227,7 @@ class SegmentProcessing(BaseModel):
|
|
155
227
|
- `extended_context` uses the full page image as context for LLM generation.
|
156
228
|
"""
|
157
229
|
|
158
|
-
|
230
|
+
picture: Optional[GenerationConfig] = FieldInfo(alias="Picture", default=None)
|
159
231
|
"""Controls the processing and generation for the segment.
|
160
232
|
|
161
233
|
- `crop_image` controls whether to crop the file's images to the segment's
|
@@ -226,3 +298,21 @@ class SegmentProcessing(BaseModel):
|
|
226
298
|
configuration.
|
227
299
|
- `extended_context` uses the full page image as context for LLM generation.
|
228
300
|
"""
|
301
|
+
|
302
|
+
unknown: Optional[GenerationConfig] = FieldInfo(alias="Unknown", default=None)
|
303
|
+
"""Controls the processing and generation for the segment.
|
304
|
+
|
305
|
+
- `crop_image` controls whether to crop the file's images to the segment's
|
306
|
+
bounding box. The cropped image will be stored in the segment's `image` field.
|
307
|
+
Use `All` to always crop, or `Auto` to only crop when needed for
|
308
|
+
post-processing.
|
309
|
+
- `format` specifies the output format: `Html` or `Markdown`
|
310
|
+
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
311
|
+
- `Auto`: Process content automatically
|
312
|
+
- `LLM`: Use large language models for processing
|
313
|
+
- `Ignore`: Exclude segments from final output
|
314
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
315
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
316
|
+
configuration.
|
317
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
318
|
+
"""
|
@@ -48,6 +48,24 @@ class SegmentProcessingParam(TypedDict, total=False):
|
|
48
48
|
- `extended_context` uses the full page image as context for LLM generation.
|
49
49
|
"""
|
50
50
|
|
51
|
+
form_region: Annotated[Optional[GenerationConfigParam], PropertyInfo(alias="FormRegion")]
|
52
|
+
"""Controls the processing and generation for the segment.
|
53
|
+
|
54
|
+
- `crop_image` controls whether to crop the file's images to the segment's
|
55
|
+
bounding box. The cropped image will be stored in the segment's `image` field.
|
56
|
+
Use `All` to always crop, or `Auto` to only crop when needed for
|
57
|
+
post-processing.
|
58
|
+
- `format` specifies the output format: `Html` or `Markdown`
|
59
|
+
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
60
|
+
- `Auto`: Process content automatically
|
61
|
+
- `LLM`: Use large language models for processing
|
62
|
+
- `Ignore`: Exclude segments from final output
|
63
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
64
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
65
|
+
configuration.
|
66
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
67
|
+
"""
|
68
|
+
|
51
69
|
formula: Annotated[Optional[GenerationConfigParam], PropertyInfo(alias="Formula")]
|
52
70
|
"""Controls the processing and generation for the segment.
|
53
71
|
|
@@ -66,6 +84,60 @@ class SegmentProcessingParam(TypedDict, total=False):
|
|
66
84
|
- `extended_context` uses the full page image as context for LLM generation.
|
67
85
|
"""
|
68
86
|
|
87
|
+
graphical_item: Annotated[Optional[GenerationConfigParam], PropertyInfo(alias="GraphicalItem")]
|
88
|
+
"""Controls the processing and generation for the segment.
|
89
|
+
|
90
|
+
- `crop_image` controls whether to crop the file's images to the segment's
|
91
|
+
bounding box. The cropped image will be stored in the segment's `image` field.
|
92
|
+
Use `All` to always crop, or `Auto` to only crop when needed for
|
93
|
+
post-processing.
|
94
|
+
- `format` specifies the output format: `Html` or `Markdown`
|
95
|
+
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
96
|
+
- `Auto`: Process content automatically
|
97
|
+
- `LLM`: Use large language models for processing
|
98
|
+
- `Ignore`: Exclude segments from final output
|
99
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
100
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
101
|
+
configuration.
|
102
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
103
|
+
"""
|
104
|
+
|
105
|
+
legend: Annotated[Optional[GenerationConfigParam], PropertyInfo(alias="Legend")]
|
106
|
+
"""Controls the processing and generation for the segment.
|
107
|
+
|
108
|
+
- `crop_image` controls whether to crop the file's images to the segment's
|
109
|
+
bounding box. The cropped image will be stored in the segment's `image` field.
|
110
|
+
Use `All` to always crop, or `Auto` to only crop when needed for
|
111
|
+
post-processing.
|
112
|
+
- `format` specifies the output format: `Html` or `Markdown`
|
113
|
+
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
114
|
+
- `Auto`: Process content automatically
|
115
|
+
- `LLM`: Use large language models for processing
|
116
|
+
- `Ignore`: Exclude segments from final output
|
117
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
118
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
119
|
+
configuration.
|
120
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
121
|
+
"""
|
122
|
+
|
123
|
+
line_number: Annotated[Optional[GenerationConfigParam], PropertyInfo(alias="LineNumber")]
|
124
|
+
"""Controls the processing and generation for the segment.
|
125
|
+
|
126
|
+
- `crop_image` controls whether to crop the file's images to the segment's
|
127
|
+
bounding box. The cropped image will be stored in the segment's `image` field.
|
128
|
+
Use `All` to always crop, or `Auto` to only crop when needed for
|
129
|
+
post-processing.
|
130
|
+
- `format` specifies the output format: `Html` or `Markdown`
|
131
|
+
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
132
|
+
- `Auto`: Process content automatically
|
133
|
+
- `LLM`: Use large language models for processing
|
134
|
+
- `Ignore`: Exclude segments from final output
|
135
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
136
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
137
|
+
configuration.
|
138
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
139
|
+
"""
|
140
|
+
|
69
141
|
list_item: Annotated[Optional[GenerationConfigParam], PropertyInfo(alias="ListItem")]
|
70
142
|
"""Controls the processing and generation for the segment.
|
71
143
|
|
@@ -138,7 +210,7 @@ class SegmentProcessingParam(TypedDict, total=False):
|
|
138
210
|
- `extended_context` uses the full page image as context for LLM generation.
|
139
211
|
"""
|
140
212
|
|
141
|
-
|
213
|
+
page_number: Annotated[Optional[GenerationConfigParam], PropertyInfo(alias="PageNumber")]
|
142
214
|
"""Controls the processing and generation for the segment.
|
143
215
|
|
144
216
|
- `crop_image` controls whether to crop the file's images to the segment's
|
@@ -156,7 +228,7 @@ class SegmentProcessingParam(TypedDict, total=False):
|
|
156
228
|
- `extended_context` uses the full page image as context for LLM generation.
|
157
229
|
"""
|
158
230
|
|
159
|
-
|
231
|
+
picture: Annotated[Optional[GenerationConfigParam], PropertyInfo(alias="Picture")]
|
160
232
|
"""Controls the processing and generation for the segment.
|
161
233
|
|
162
234
|
- `crop_image` controls whether to crop the file's images to the segment's
|
@@ -227,3 +299,21 @@ class SegmentProcessingParam(TypedDict, total=False):
|
|
227
299
|
configuration.
|
228
300
|
- `extended_context` uses the full page image as context for LLM generation.
|
229
301
|
"""
|
302
|
+
|
303
|
+
unknown: Annotated[Optional[GenerationConfigParam], PropertyInfo(alias="Unknown")]
|
304
|
+
"""Controls the processing and generation for the segment.
|
305
|
+
|
306
|
+
- `crop_image` controls whether to crop the file's images to the segment's
|
307
|
+
bounding box. The cropped image will be stored in the segment's `image` field.
|
308
|
+
Use `All` to always crop, or `Auto` to only crop when needed for
|
309
|
+
post-processing.
|
310
|
+
- `format` specifies the output format: `Html` or `Markdown`
|
311
|
+
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
312
|
+
- `Auto`: Process content automatically
|
313
|
+
- `LLM`: Use large language models for processing
|
314
|
+
- `Ignore`: Exclude segments from final output
|
315
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
316
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
317
|
+
configuration.
|
318
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
319
|
+
"""
|
@@ -55,153 +55,6 @@ class ExtractCreateResponse(BaseModel):
|
|
55
55
|
array-of-primitives) contain a `Vec<Citation>` supporting that field.
|
56
56
|
- `metrics`: mirror of `results`; only leaf positions contain a `Metrics` object
|
57
57
|
for that field.
|
58
|
-
|
59
|
-
Detailed shape:
|
60
|
-
|
61
|
-
- Shared structure: `results`, `citations`, and `metrics` have the same
|
62
|
-
object/array shape as the user schema. Non-leaf nodes (objects, arrays of
|
63
|
-
objects) are mirrored; only leaves carry values.
|
64
|
-
- Leaf definition:
|
65
|
-
- A leaf is either a JSON primitive (string, number, bool, or null) or an
|
66
|
-
array of primitives (including empty).
|
67
|
-
- Arrays of objects are not leaves; recurse into their elements (`items`
|
68
|
-
mirror index-by-index).
|
69
|
-
- Null handling:
|
70
|
-
- If a leaf in `results` is null, the corresponding position in `citations`
|
71
|
-
and `metrics` remains null.
|
72
|
-
- Arrays:
|
73
|
-
- Array of objects: `citations`/`metrics` are arrays whose elements mirror
|
74
|
-
each object and carry values at their own leaves.
|
75
|
-
- Array of primitives: treated as a single leaf. `citations[path]` is a list
|
76
|
-
of `Citation` supporting the array as a whole. `metrics[path]` is a
|
77
|
-
`Metrics` object for the array as a whole.
|
78
|
-
- Citations leaves:
|
79
|
-
- Type: JSON array of `Citation` objects.
|
80
|
-
- Each `Citation` has: `citation_id: string`, `citation_type: Segment|Word`,
|
81
|
-
`bbox: BoundingBox[]`, `content: string`, `segment_id?: string`,
|
82
|
-
`segment_type: SegmentType`, `ss_range?: string[]`.
|
83
|
-
- Segment citation: represents a full parsed segment; `segment_id` set,
|
84
|
-
`bbox` has one entry (segment box), `content` is the segment text. If the
|
85
|
-
segment is from a spreadsheet, `ss_range` contains the table range
|
86
|
-
(single-element array) or the underlying cell refs if available.
|
87
|
-
- Word citation: represents selected OCR words within a segment;
|
88
|
-
`segment_id` is null, `bbox` has one entry per word, `content` is the
|
89
|
-
whitespace-joined text of those words; `segment_type` is `Text`. If OCR
|
90
|
-
words came from spreadsheet cells, `ss_range` lists those cell refs.
|
91
|
-
- Metrics leaves:
|
92
|
-
- Type: `Metrics` object with `confidence: "High" | "Low"`, indicating whether
|
93
|
-
citations sufficiently support the item.
|
94
|
-
|
95
|
-
Example:
|
96
|
-
|
97
|
-
results
|
98
|
-
|
99
|
-
```json
|
100
|
-
{
|
101
|
-
"invoice_id": "INV-001",
|
102
|
-
"seller": { "name": "Acme" },
|
103
|
-
"line_items": [{ "sku": "A1", "qty": 2 }],
|
104
|
-
"tags": ["urgent", "paid"],
|
105
|
-
"notes": null
|
106
|
-
}
|
107
|
-
```
|
108
|
-
|
109
|
-
citations
|
110
|
-
|
111
|
-
```json
|
112
|
-
{
|
113
|
-
"invoice_id": [
|
114
|
-
{
|
115
|
-
"citation_id": "abc1234",
|
116
|
-
"citation_type": "Segment",
|
117
|
-
"bbox": [{ "left": 10, "top": 20, "width": 100, "height": 18 }],
|
118
|
-
"content": "Invoice INV-001",
|
119
|
-
"segment_id": "seg_001",
|
120
|
-
"segment_type": "Text",
|
121
|
-
"ss_range": ["A1:C10"]
|
122
|
-
},
|
123
|
-
{
|
124
|
-
"citation_id": "pqr2345",
|
125
|
-
"citation_type": "Word",
|
126
|
-
"bbox": [
|
127
|
-
{ "left": 12, "top": 24, "width": 36, "height": 18 },
|
128
|
-
{ "left": 52, "top": 24, "width": 48, "height": 18 }
|
129
|
-
],
|
130
|
-
"content": "INV-001",
|
131
|
-
"segment_id": null,
|
132
|
-
"segment_type": "Text",
|
133
|
-
"ss_range": ["B3", "C3"]
|
134
|
-
}
|
135
|
-
],
|
136
|
-
"seller": {
|
137
|
-
"name": [
|
138
|
-
{
|
139
|
-
"citation_id": "def5678",
|
140
|
-
"citation_type": "Word",
|
141
|
-
"bbox": [
|
142
|
-
{ "left": 45, "top": 80, "width": 30, "height": 12 },
|
143
|
-
{ "left": 80, "top": 80, "width": 40, "height": 12 }
|
144
|
-
],
|
145
|
-
"content": "Acme",
|
146
|
-
"segment_id": null,
|
147
|
-
"segment_type": "Text"
|
148
|
-
}
|
149
|
-
]
|
150
|
-
},
|
151
|
-
"line_items": [
|
152
|
-
{
|
153
|
-
"sku": [
|
154
|
-
{
|
155
|
-
"citation_id": "ghi9012",
|
156
|
-
"citation_type": "Segment",
|
157
|
-
"bbox": [{ "left": 12, "top": 140, "width": 60, "height": 16 }],
|
158
|
-
"content": "A1",
|
159
|
-
"segment_id": "seg_010",
|
160
|
-
"segment_type": "Text",
|
161
|
-
"ss_range": ["D5:E12"]
|
162
|
-
}
|
163
|
-
],
|
164
|
-
"qty": [
|
165
|
-
{
|
166
|
-
"citation_id": "jkl3456",
|
167
|
-
"citation_type": "Word",
|
168
|
-
"bbox": [{ "left": 85, "top": 140, "width": 12, "height": 16 }],
|
169
|
-
"content": "2",
|
170
|
-
"segment_id": null,
|
171
|
-
"segment_type": "Text",
|
172
|
-
"ss_range": ["E12"]
|
173
|
-
}
|
174
|
-
]
|
175
|
-
}
|
176
|
-
],
|
177
|
-
"tags": [
|
178
|
-
{
|
179
|
-
"citation_id": "mno7890",
|
180
|
-
"citation_type": "Segment",
|
181
|
-
"bbox": [{ "left": 12, "top": 200, "width": 100, "height": 16 }],
|
182
|
-
"content": "urgent paid",
|
183
|
-
"segment_id": "seg_020",
|
184
|
-
"segment_type": "Text",
|
185
|
-
"ss_range": ["A20:C25"]
|
186
|
-
}
|
187
|
-
],
|
188
|
-
"notes": null
|
189
|
-
}
|
190
|
-
```
|
191
|
-
|
192
|
-
metrics
|
193
|
-
|
194
|
-
```json
|
195
|
-
{
|
196
|
-
"invoice_id": { "confidence": "High" },
|
197
|
-
"seller": { "name": { "confidence": "Low" } },
|
198
|
-
"line_items": [
|
199
|
-
{ "sku": { "confidence": "High" }, "qty": { "confidence": "High" } }
|
200
|
-
],
|
201
|
-
"tags": { "confidence": "Low" },
|
202
|
-
"notes": null
|
203
|
-
}
|
204
|
-
```
|
205
58
|
"""
|
206
59
|
|
207
60
|
source_task_id: Optional[str] = None
|
@@ -55,153 +55,6 @@ class ExtractGetResponse(BaseModel):
|
|
55
55
|
array-of-primitives) contain a `Vec<Citation>` supporting that field.
|
56
56
|
- `metrics`: mirror of `results`; only leaf positions contain a `Metrics` object
|
57
57
|
for that field.
|
58
|
-
|
59
|
-
Detailed shape:
|
60
|
-
|
61
|
-
- Shared structure: `results`, `citations`, and `metrics` have the same
|
62
|
-
object/array shape as the user schema. Non-leaf nodes (objects, arrays of
|
63
|
-
objects) are mirrored; only leaves carry values.
|
64
|
-
- Leaf definition:
|
65
|
-
- A leaf is either a JSON primitive (string, number, bool, or null) or an
|
66
|
-
array of primitives (including empty).
|
67
|
-
- Arrays of objects are not leaves; recurse into their elements (`items`
|
68
|
-
mirror index-by-index).
|
69
|
-
- Null handling:
|
70
|
-
- If a leaf in `results` is null, the corresponding position in `citations`
|
71
|
-
and `metrics` remains null.
|
72
|
-
- Arrays:
|
73
|
-
- Array of objects: `citations`/`metrics` are arrays whose elements mirror
|
74
|
-
each object and carry values at their own leaves.
|
75
|
-
- Array of primitives: treated as a single leaf. `citations[path]` is a list
|
76
|
-
of `Citation` supporting the array as a whole. `metrics[path]` is a
|
77
|
-
`Metrics` object for the array as a whole.
|
78
|
-
- Citations leaves:
|
79
|
-
- Type: JSON array of `Citation` objects.
|
80
|
-
- Each `Citation` has: `citation_id: string`, `citation_type: Segment|Word`,
|
81
|
-
`bbox: BoundingBox[]`, `content: string`, `segment_id?: string`,
|
82
|
-
`segment_type: SegmentType`, `ss_range?: string[]`.
|
83
|
-
- Segment citation: represents a full parsed segment; `segment_id` set,
|
84
|
-
`bbox` has one entry (segment box), `content` is the segment text. If the
|
85
|
-
segment is from a spreadsheet, `ss_range` contains the table range
|
86
|
-
(single-element array) or the underlying cell refs if available.
|
87
|
-
- Word citation: represents selected OCR words within a segment;
|
88
|
-
`segment_id` is null, `bbox` has one entry per word, `content` is the
|
89
|
-
whitespace-joined text of those words; `segment_type` is `Text`. If OCR
|
90
|
-
words came from spreadsheet cells, `ss_range` lists those cell refs.
|
91
|
-
- Metrics leaves:
|
92
|
-
- Type: `Metrics` object with `confidence: "High" | "Low"`, indicating whether
|
93
|
-
citations sufficiently support the item.
|
94
|
-
|
95
|
-
Example:
|
96
|
-
|
97
|
-
results
|
98
|
-
|
99
|
-
```json
|
100
|
-
{
|
101
|
-
"invoice_id": "INV-001",
|
102
|
-
"seller": { "name": "Acme" },
|
103
|
-
"line_items": [{ "sku": "A1", "qty": 2 }],
|
104
|
-
"tags": ["urgent", "paid"],
|
105
|
-
"notes": null
|
106
|
-
}
|
107
|
-
```
|
108
|
-
|
109
|
-
citations
|
110
|
-
|
111
|
-
```json
|
112
|
-
{
|
113
|
-
"invoice_id": [
|
114
|
-
{
|
115
|
-
"citation_id": "abc1234",
|
116
|
-
"citation_type": "Segment",
|
117
|
-
"bbox": [{ "left": 10, "top": 20, "width": 100, "height": 18 }],
|
118
|
-
"content": "Invoice INV-001",
|
119
|
-
"segment_id": "seg_001",
|
120
|
-
"segment_type": "Text",
|
121
|
-
"ss_range": ["A1:C10"]
|
122
|
-
},
|
123
|
-
{
|
124
|
-
"citation_id": "pqr2345",
|
125
|
-
"citation_type": "Word",
|
126
|
-
"bbox": [
|
127
|
-
{ "left": 12, "top": 24, "width": 36, "height": 18 },
|
128
|
-
{ "left": 52, "top": 24, "width": 48, "height": 18 }
|
129
|
-
],
|
130
|
-
"content": "INV-001",
|
131
|
-
"segment_id": null,
|
132
|
-
"segment_type": "Text",
|
133
|
-
"ss_range": ["B3", "C3"]
|
134
|
-
}
|
135
|
-
],
|
136
|
-
"seller": {
|
137
|
-
"name": [
|
138
|
-
{
|
139
|
-
"citation_id": "def5678",
|
140
|
-
"citation_type": "Word",
|
141
|
-
"bbox": [
|
142
|
-
{ "left": 45, "top": 80, "width": 30, "height": 12 },
|
143
|
-
{ "left": 80, "top": 80, "width": 40, "height": 12 }
|
144
|
-
],
|
145
|
-
"content": "Acme",
|
146
|
-
"segment_id": null,
|
147
|
-
"segment_type": "Text"
|
148
|
-
}
|
149
|
-
]
|
150
|
-
},
|
151
|
-
"line_items": [
|
152
|
-
{
|
153
|
-
"sku": [
|
154
|
-
{
|
155
|
-
"citation_id": "ghi9012",
|
156
|
-
"citation_type": "Segment",
|
157
|
-
"bbox": [{ "left": 12, "top": 140, "width": 60, "height": 16 }],
|
158
|
-
"content": "A1",
|
159
|
-
"segment_id": "seg_010",
|
160
|
-
"segment_type": "Text",
|
161
|
-
"ss_range": ["D5:E12"]
|
162
|
-
}
|
163
|
-
],
|
164
|
-
"qty": [
|
165
|
-
{
|
166
|
-
"citation_id": "jkl3456",
|
167
|
-
"citation_type": "Word",
|
168
|
-
"bbox": [{ "left": 85, "top": 140, "width": 12, "height": 16 }],
|
169
|
-
"content": "2",
|
170
|
-
"segment_id": null,
|
171
|
-
"segment_type": "Text",
|
172
|
-
"ss_range": ["E12"]
|
173
|
-
}
|
174
|
-
]
|
175
|
-
}
|
176
|
-
],
|
177
|
-
"tags": [
|
178
|
-
{
|
179
|
-
"citation_id": "mno7890",
|
180
|
-
"citation_type": "Segment",
|
181
|
-
"bbox": [{ "left": 12, "top": 200, "width": 100, "height": 16 }],
|
182
|
-
"content": "urgent paid",
|
183
|
-
"segment_id": "seg_020",
|
184
|
-
"segment_type": "Text",
|
185
|
-
"ss_range": ["A20:C25"]
|
186
|
-
}
|
187
|
-
],
|
188
|
-
"notes": null
|
189
|
-
}
|
190
|
-
```
|
191
|
-
|
192
|
-
metrics
|
193
|
-
|
194
|
-
```json
|
195
|
-
{
|
196
|
-
"invoice_id": { "confidence": "High" },
|
197
|
-
"seller": { "name": { "confidence": "Low" } },
|
198
|
-
"line_items": [
|
199
|
-
{ "sku": { "confidence": "High" }, "qty": { "confidence": "High" } }
|
200
|
-
],
|
201
|
-
"tags": { "confidence": "Low" },
|
202
|
-
"notes": null
|
203
|
-
}
|
204
|
-
```
|
205
58
|
"""
|
206
59
|
|
207
60
|
source_task_id: Optional[str] = None
|