chunkr-ai 0.1.0a7__py3-none-any.whl → 0.1.0a9__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. chunkr_ai/__init__.py +3 -1
  2. chunkr_ai/_base_client.py +12 -12
  3. chunkr_ai/_client.py +8 -8
  4. chunkr_ai/_compat.py +48 -48
  5. chunkr_ai/_models.py +50 -44
  6. chunkr_ai/_qs.py +7 -7
  7. chunkr_ai/_types.py +18 -11
  8. chunkr_ai/_utils/__init__.py +8 -2
  9. chunkr_ai/_utils/_compat.py +45 -0
  10. chunkr_ai/_utils/_datetime_parse.py +136 -0
  11. chunkr_ai/_utils/_transform.py +13 -3
  12. chunkr_ai/_utils/_typing.py +1 -1
  13. chunkr_ai/_utils/_utils.py +4 -5
  14. chunkr_ai/_version.py +1 -1
  15. chunkr_ai/resources/files.py +29 -29
  16. chunkr_ai/resources/health.py +3 -3
  17. chunkr_ai/resources/tasks/extract.py +21 -37
  18. chunkr_ai/resources/tasks/parse.py +29 -54
  19. chunkr_ai/resources/tasks/tasks.py +35 -51
  20. chunkr_ai/resources/webhooks.py +3 -3
  21. chunkr_ai/types/__init__.py +0 -2
  22. chunkr_ai/types/extract_output_response.py +45 -2
  23. chunkr_ai/types/file_info.py +3 -0
  24. chunkr_ai/types/ocr_result.py +6 -6
  25. chunkr_ai/types/parse_configuration.py +0 -4
  26. chunkr_ai/types/parse_configuration_param.py +0 -4
  27. chunkr_ai/types/segment.py +8 -5
  28. chunkr_ai/types/segment_processing.py +92 -2
  29. chunkr_ai/types/segment_processing_param.py +92 -2
  30. chunkr_ai/types/task_get_params.py +0 -3
  31. chunkr_ai/types/tasks/extract_create_response.py +0 -147
  32. chunkr_ai/types/tasks/extract_get_params.py +0 -3
  33. chunkr_ai/types/tasks/extract_get_response.py +0 -147
  34. chunkr_ai/types/tasks/parse_create_params.py +0 -4
  35. chunkr_ai/types/tasks/parse_get_params.py +0 -3
  36. chunkr_ai/types/version_info.py +1 -1
  37. {chunkr_ai-0.1.0a7.dist-info → chunkr_ai-0.1.0a9.dist-info}/METADATA +1 -1
  38. {chunkr_ai-0.1.0a7.dist-info → chunkr_ai-0.1.0a9.dist-info}/RECORD +40 -40
  39. chunkr_ai/types/llm_processing.py +0 -36
  40. chunkr_ai/types/llm_processing_param.py +0 -36
  41. {chunkr_ai-0.1.0a7.dist-info → chunkr_ai-0.1.0a9.dist-info}/WHEEL +0 -0
  42. {chunkr_ai-0.1.0a7.dist-info → chunkr_ai-0.1.0a9.dist-info}/licenses/LICENSE +0 -0
@@ -19,3 +19,6 @@ class FileInfo(BaseModel):
19
19
 
20
20
  page_count: Optional[int] = None
21
21
  """The number of pages in the file."""
22
+
23
+ ss_cell_count: Optional[int] = None
24
+ """The number of cells in the file. Only used for spreadsheets."""
@@ -15,14 +15,14 @@ class OcrResult(BaseModel):
15
15
  text: str
16
16
  """The recognized text of the OCR result."""
17
17
 
18
- cell_ref: Optional[str] = None
19
- """
20
- Excel-style cell reference (e.g., "A1" or "A1:B2") when OCR originates from a
21
- spreadsheet cell
22
- """
23
-
24
18
  confidence: Optional[float] = None
25
19
  """The confidence score of the recognized text."""
26
20
 
27
21
  ocr_id: Optional[str] = None
28
22
  """The unique identifier for the OCR result."""
23
+
24
+ ss_cell_ref: Optional[str] = None
25
+ """
26
+ Excel-style cell reference (e.g., "A1" or "A1:B2") when OCR originates from a
27
+ spreadsheet cell
28
+ """
@@ -4,7 +4,6 @@ from typing import Optional
4
4
  from typing_extensions import Literal
5
5
 
6
6
  from .._models import BaseModel
7
- from .llm_processing import LlmProcessing
8
7
  from .chunk_processing import ChunkProcessing
9
8
  from .segment_processing import SegmentProcessing
10
9
 
@@ -23,9 +22,6 @@ class ParseConfiguration(BaseModel):
23
22
  LLM refusals etc.)
24
23
  """
25
24
 
26
- llm_processing: Optional[LlmProcessing] = None
27
- """Controls the LLM used for the task."""
28
-
29
25
  ocr_strategy: Optional[Literal["All", "Auto"]] = None
30
26
  """Controls the Optical Character Recognition (OCR) strategy.
31
27
 
@@ -5,7 +5,6 @@ from __future__ import annotations
5
5
  from typing import Optional
6
6
  from typing_extensions import Literal, TypedDict
7
7
 
8
- from .llm_processing_param import LlmProcessingParam
9
8
  from .chunk_processing_param import ChunkProcessingParam
10
9
  from .segment_processing_param import SegmentProcessingParam
11
10
 
@@ -24,9 +23,6 @@ class ParseConfigurationParam(TypedDict, total=False):
24
23
  LLM refusals etc.)
25
24
  """
26
25
 
27
- llm_processing: LlmProcessingParam
28
- """Controls the LLM used for the task."""
29
-
30
26
  ocr_strategy: Literal["All", "Auto"]
31
27
  """Controls the Optical Character Recognition (OCR) strategy.
32
28
 
@@ -31,20 +31,23 @@ class Segment(BaseModel):
31
31
  "Caption",
32
32
  "Footnote",
33
33
  "Formula",
34
+ "FormRegion",
35
+ "GraphicalItem",
36
+ "Legend",
37
+ "LineNumber",
34
38
  "ListItem",
35
39
  "Page",
36
40
  "PageFooter",
37
41
  "PageHeader",
42
+ "PageNumber",
38
43
  "Picture",
39
- "SectionHeader",
40
44
  "Table",
41
45
  "Text",
42
46
  "Title",
47
+ "Unknown",
48
+ "SectionHeader",
43
49
  ]
44
- """
45
- All the possible types for a segment. Note: Different configurations will
46
- produce different types. Please refer to the documentation for more information.
47
- """
50
+ """All the possible types for a segment."""
48
51
 
49
52
  confidence: Optional[float] = None
50
53
  """Confidence score of the layout analysis model"""
@@ -47,6 +47,24 @@ class SegmentProcessing(BaseModel):
47
47
  - `extended_context` uses the full page image as context for LLM generation.
48
48
  """
49
49
 
50
+ form_region: Optional[GenerationConfig] = FieldInfo(alias="FormRegion", default=None)
51
+ """Controls the processing and generation for the segment.
52
+
53
+ - `crop_image` controls whether to crop the file's images to the segment's
54
+ bounding box. The cropped image will be stored in the segment's `image` field.
55
+ Use `All` to always crop, or `Auto` to only crop when needed for
56
+ post-processing.
57
+ - `format` specifies the output format: `Html` or `Markdown`
58
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
59
+ - `Auto`: Process content automatically
60
+ - `LLM`: Use large language models for processing
61
+ - `Ignore`: Exclude segments from final output
62
+ - `description` enables LLM-generated descriptions for segments. **Note:** This
63
+ uses chunkr's own VLM models and is not configurable via LLM processing
64
+ configuration.
65
+ - `extended_context` uses the full page image as context for LLM generation.
66
+ """
67
+
50
68
  formula: Optional[GenerationConfig] = FieldInfo(alias="Formula", default=None)
51
69
  """Controls the processing and generation for the segment.
52
70
 
@@ -65,6 +83,60 @@ class SegmentProcessing(BaseModel):
65
83
  - `extended_context` uses the full page image as context for LLM generation.
66
84
  """
67
85
 
86
+ graphical_item: Optional[GenerationConfig] = FieldInfo(alias="GraphicalItem", default=None)
87
+ """Controls the processing and generation for the segment.
88
+
89
+ - `crop_image` controls whether to crop the file's images to the segment's
90
+ bounding box. The cropped image will be stored in the segment's `image` field.
91
+ Use `All` to always crop, or `Auto` to only crop when needed for
92
+ post-processing.
93
+ - `format` specifies the output format: `Html` or `Markdown`
94
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
95
+ - `Auto`: Process content automatically
96
+ - `LLM`: Use large language models for processing
97
+ - `Ignore`: Exclude segments from final output
98
+ - `description` enables LLM-generated descriptions for segments. **Note:** This
99
+ uses chunkr's own VLM models and is not configurable via LLM processing
100
+ configuration.
101
+ - `extended_context` uses the full page image as context for LLM generation.
102
+ """
103
+
104
+ legend: Optional[GenerationConfig] = FieldInfo(alias="Legend", default=None)
105
+ """Controls the processing and generation for the segment.
106
+
107
+ - `crop_image` controls whether to crop the file's images to the segment's
108
+ bounding box. The cropped image will be stored in the segment's `image` field.
109
+ Use `All` to always crop, or `Auto` to only crop when needed for
110
+ post-processing.
111
+ - `format` specifies the output format: `Html` or `Markdown`
112
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
113
+ - `Auto`: Process content automatically
114
+ - `LLM`: Use large language models for processing
115
+ - `Ignore`: Exclude segments from final output
116
+ - `description` enables LLM-generated descriptions for segments. **Note:** This
117
+ uses chunkr's own VLM models and is not configurable via LLM processing
118
+ configuration.
119
+ - `extended_context` uses the full page image as context for LLM generation.
120
+ """
121
+
122
+ line_number: Optional[GenerationConfig] = FieldInfo(alias="LineNumber", default=None)
123
+ """Controls the processing and generation for the segment.
124
+
125
+ - `crop_image` controls whether to crop the file's images to the segment's
126
+ bounding box. The cropped image will be stored in the segment's `image` field.
127
+ Use `All` to always crop, or `Auto` to only crop when needed for
128
+ post-processing.
129
+ - `format` specifies the output format: `Html` or `Markdown`
130
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
131
+ - `Auto`: Process content automatically
132
+ - `LLM`: Use large language models for processing
133
+ - `Ignore`: Exclude segments from final output
134
+ - `description` enables LLM-generated descriptions for segments. **Note:** This
135
+ uses chunkr's own VLM models and is not configurable via LLM processing
136
+ configuration.
137
+ - `extended_context` uses the full page image as context for LLM generation.
138
+ """
139
+
68
140
  list_item: Optional[GenerationConfig] = FieldInfo(alias="ListItem", default=None)
69
141
  """Controls the processing and generation for the segment.
70
142
 
@@ -137,7 +209,7 @@ class SegmentProcessing(BaseModel):
137
209
  - `extended_context` uses the full page image as context for LLM generation.
138
210
  """
139
211
 
140
- picture: Optional[GenerationConfig] = FieldInfo(alias="Picture", default=None)
212
+ page_number: Optional[GenerationConfig] = FieldInfo(alias="PageNumber", default=None)
141
213
  """Controls the processing and generation for the segment.
142
214
 
143
215
  - `crop_image` controls whether to crop the file's images to the segment's
@@ -155,7 +227,7 @@ class SegmentProcessing(BaseModel):
155
227
  - `extended_context` uses the full page image as context for LLM generation.
156
228
  """
157
229
 
158
- section_header: Optional[GenerationConfig] = FieldInfo(alias="SectionHeader", default=None)
230
+ picture: Optional[GenerationConfig] = FieldInfo(alias="Picture", default=None)
159
231
  """Controls the processing and generation for the segment.
160
232
 
161
233
  - `crop_image` controls whether to crop the file's images to the segment's
@@ -226,3 +298,21 @@ class SegmentProcessing(BaseModel):
226
298
  configuration.
227
299
  - `extended_context` uses the full page image as context for LLM generation.
228
300
  """
301
+
302
+ unknown: Optional[GenerationConfig] = FieldInfo(alias="Unknown", default=None)
303
+ """Controls the processing and generation for the segment.
304
+
305
+ - `crop_image` controls whether to crop the file's images to the segment's
306
+ bounding box. The cropped image will be stored in the segment's `image` field.
307
+ Use `All` to always crop, or `Auto` to only crop when needed for
308
+ post-processing.
309
+ - `format` specifies the output format: `Html` or `Markdown`
310
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
311
+ - `Auto`: Process content automatically
312
+ - `LLM`: Use large language models for processing
313
+ - `Ignore`: Exclude segments from final output
314
+ - `description` enables LLM-generated descriptions for segments. **Note:** This
315
+ uses chunkr's own VLM models and is not configurable via LLM processing
316
+ configuration.
317
+ - `extended_context` uses the full page image as context for LLM generation.
318
+ """
@@ -48,6 +48,24 @@ class SegmentProcessingParam(TypedDict, total=False):
48
48
  - `extended_context` uses the full page image as context for LLM generation.
49
49
  """
50
50
 
51
+ form_region: Annotated[Optional[GenerationConfigParam], PropertyInfo(alias="FormRegion")]
52
+ """Controls the processing and generation for the segment.
53
+
54
+ - `crop_image` controls whether to crop the file's images to the segment's
55
+ bounding box. The cropped image will be stored in the segment's `image` field.
56
+ Use `All` to always crop, or `Auto` to only crop when needed for
57
+ post-processing.
58
+ - `format` specifies the output format: `Html` or `Markdown`
59
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
60
+ - `Auto`: Process content automatically
61
+ - `LLM`: Use large language models for processing
62
+ - `Ignore`: Exclude segments from final output
63
+ - `description` enables LLM-generated descriptions for segments. **Note:** This
64
+ uses chunkr's own VLM models and is not configurable via LLM processing
65
+ configuration.
66
+ - `extended_context` uses the full page image as context for LLM generation.
67
+ """
68
+
51
69
  formula: Annotated[Optional[GenerationConfigParam], PropertyInfo(alias="Formula")]
52
70
  """Controls the processing and generation for the segment.
53
71
 
@@ -66,6 +84,60 @@ class SegmentProcessingParam(TypedDict, total=False):
66
84
  - `extended_context` uses the full page image as context for LLM generation.
67
85
  """
68
86
 
87
+ graphical_item: Annotated[Optional[GenerationConfigParam], PropertyInfo(alias="GraphicalItem")]
88
+ """Controls the processing and generation for the segment.
89
+
90
+ - `crop_image` controls whether to crop the file's images to the segment's
91
+ bounding box. The cropped image will be stored in the segment's `image` field.
92
+ Use `All` to always crop, or `Auto` to only crop when needed for
93
+ post-processing.
94
+ - `format` specifies the output format: `Html` or `Markdown`
95
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
96
+ - `Auto`: Process content automatically
97
+ - `LLM`: Use large language models for processing
98
+ - `Ignore`: Exclude segments from final output
99
+ - `description` enables LLM-generated descriptions for segments. **Note:** This
100
+ uses chunkr's own VLM models and is not configurable via LLM processing
101
+ configuration.
102
+ - `extended_context` uses the full page image as context for LLM generation.
103
+ """
104
+
105
+ legend: Annotated[Optional[GenerationConfigParam], PropertyInfo(alias="Legend")]
106
+ """Controls the processing and generation for the segment.
107
+
108
+ - `crop_image` controls whether to crop the file's images to the segment's
109
+ bounding box. The cropped image will be stored in the segment's `image` field.
110
+ Use `All` to always crop, or `Auto` to only crop when needed for
111
+ post-processing.
112
+ - `format` specifies the output format: `Html` or `Markdown`
113
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
114
+ - `Auto`: Process content automatically
115
+ - `LLM`: Use large language models for processing
116
+ - `Ignore`: Exclude segments from final output
117
+ - `description` enables LLM-generated descriptions for segments. **Note:** This
118
+ uses chunkr's own VLM models and is not configurable via LLM processing
119
+ configuration.
120
+ - `extended_context` uses the full page image as context for LLM generation.
121
+ """
122
+
123
+ line_number: Annotated[Optional[GenerationConfigParam], PropertyInfo(alias="LineNumber")]
124
+ """Controls the processing and generation for the segment.
125
+
126
+ - `crop_image` controls whether to crop the file's images to the segment's
127
+ bounding box. The cropped image will be stored in the segment's `image` field.
128
+ Use `All` to always crop, or `Auto` to only crop when needed for
129
+ post-processing.
130
+ - `format` specifies the output format: `Html` or `Markdown`
131
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
132
+ - `Auto`: Process content automatically
133
+ - `LLM`: Use large language models for processing
134
+ - `Ignore`: Exclude segments from final output
135
+ - `description` enables LLM-generated descriptions for segments. **Note:** This
136
+ uses chunkr's own VLM models and is not configurable via LLM processing
137
+ configuration.
138
+ - `extended_context` uses the full page image as context for LLM generation.
139
+ """
140
+
69
141
  list_item: Annotated[Optional[GenerationConfigParam], PropertyInfo(alias="ListItem")]
70
142
  """Controls the processing and generation for the segment.
71
143
 
@@ -138,7 +210,7 @@ class SegmentProcessingParam(TypedDict, total=False):
138
210
  - `extended_context` uses the full page image as context for LLM generation.
139
211
  """
140
212
 
141
- picture: Annotated[Optional[GenerationConfigParam], PropertyInfo(alias="Picture")]
213
+ page_number: Annotated[Optional[GenerationConfigParam], PropertyInfo(alias="PageNumber")]
142
214
  """Controls the processing and generation for the segment.
143
215
 
144
216
  - `crop_image` controls whether to crop the file's images to the segment's
@@ -156,7 +228,7 @@ class SegmentProcessingParam(TypedDict, total=False):
156
228
  - `extended_context` uses the full page image as context for LLM generation.
157
229
  """
158
230
 
159
- section_header: Annotated[Optional[GenerationConfigParam], PropertyInfo(alias="SectionHeader")]
231
+ picture: Annotated[Optional[GenerationConfigParam], PropertyInfo(alias="Picture")]
160
232
  """Controls the processing and generation for the segment.
161
233
 
162
234
  - `crop_image` controls whether to crop the file's images to the segment's
@@ -227,3 +299,21 @@ class SegmentProcessingParam(TypedDict, total=False):
227
299
  configuration.
228
300
  - `extended_context` uses the full page image as context for LLM generation.
229
301
  """
302
+
303
+ unknown: Annotated[Optional[GenerationConfigParam], PropertyInfo(alias="Unknown")]
304
+ """Controls the processing and generation for the segment.
305
+
306
+ - `crop_image` controls whether to crop the file's images to the segment's
307
+ bounding box. The cropped image will be stored in the segment's `image` field.
308
+ Use `All` to always crop, or `Auto` to only crop when needed for
309
+ post-processing.
310
+ - `format` specifies the output format: `Html` or `Markdown`
311
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
312
+ - `Auto`: Process content automatically
313
+ - `LLM`: Use large language models for processing
314
+ - `Ignore`: Exclude segments from final output
315
+ - `description` enables LLM-generated descriptions for segments. **Note:** This
316
+ uses chunkr's own VLM models and is not configurable via LLM processing
317
+ configuration.
318
+ - `extended_context` uses the full page image as context for LLM generation.
319
+ """
@@ -16,6 +16,3 @@ class TaskGetParams(TypedDict, total=False):
16
16
 
17
17
  include_chunks: bool
18
18
  """Whether to include chunks in the output response"""
19
-
20
- wait_for_completion: bool
21
- """Whether to wait for the task to complete"""
@@ -55,153 +55,6 @@ class ExtractCreateResponse(BaseModel):
55
55
  array-of-primitives) contain a `Vec<Citation>` supporting that field.
56
56
  - `metrics`: mirror of `results`; only leaf positions contain a `Metrics` object
57
57
  for that field.
58
-
59
- Detailed shape:
60
-
61
- - Shared structure: `results`, `citations`, and `metrics` have the same
62
- object/array shape as the user schema. Non-leaf nodes (objects, arrays of
63
- objects) are mirrored; only leaves carry values.
64
- - Leaf definition:
65
- - A leaf is either a JSON primitive (string, number, bool, or null) or an
66
- array of primitives (including empty).
67
- - Arrays of objects are not leaves; recurse into their elements (`items`
68
- mirror index-by-index).
69
- - Null handling:
70
- - If a leaf in `results` is null, the corresponding position in `citations`
71
- and `metrics` remains null.
72
- - Arrays:
73
- - Array of objects: `citations`/`metrics` are arrays whose elements mirror
74
- each object and carry values at their own leaves.
75
- - Array of primitives: treated as a single leaf. `citations[path]` is a list
76
- of `Citation` supporting the array as a whole. `metrics[path]` is a
77
- `Metrics` object for the array as a whole.
78
- - Citations leaves:
79
- - Type: JSON array of `Citation` objects.
80
- - Each `Citation` has: `citation_id: string`, `citation_type: Segment|Word`,
81
- `bbox: BoundingBox[]`, `content: string`, `segment_id?: string`,
82
- `segment_type: SegmentType`, `ss_range?: string[]`.
83
- - Segment citation: represents a full parsed segment; `segment_id` set,
84
- `bbox` has one entry (segment box), `content` is the segment text. If the
85
- segment is from a spreadsheet, `ss_range` contains the table range
86
- (single-element array) or the underlying cell refs if available.
87
- - Word citation: represents selected OCR words within a segment;
88
- `segment_id` is null, `bbox` has one entry per word, `content` is the
89
- whitespace-joined text of those words; `segment_type` is `Text`. If OCR
90
- words came from spreadsheet cells, `ss_range` lists those cell refs.
91
- - Metrics leaves:
92
- - Type: `Metrics` object with `confidence: "High" | "Low"`, indicating whether
93
- citations sufficiently support the item.
94
-
95
- Example:
96
-
97
- results
98
-
99
- ```json
100
- {
101
- "invoice_id": "INV-001",
102
- "seller": { "name": "Acme" },
103
- "line_items": [{ "sku": "A1", "qty": 2 }],
104
- "tags": ["urgent", "paid"],
105
- "notes": null
106
- }
107
- ```
108
-
109
- citations
110
-
111
- ```json
112
- {
113
- "invoice_id": [
114
- {
115
- "citation_id": "abc1234",
116
- "citation_type": "Segment",
117
- "bbox": [{ "left": 10, "top": 20, "width": 100, "height": 18 }],
118
- "content": "Invoice INV-001",
119
- "segment_id": "seg_001",
120
- "segment_type": "Text",
121
- "ss_range": ["A1:C10"]
122
- },
123
- {
124
- "citation_id": "pqr2345",
125
- "citation_type": "Word",
126
- "bbox": [
127
- { "left": 12, "top": 24, "width": 36, "height": 18 },
128
- { "left": 52, "top": 24, "width": 48, "height": 18 }
129
- ],
130
- "content": "INV-001",
131
- "segment_id": null,
132
- "segment_type": "Text",
133
- "ss_range": ["B3", "C3"]
134
- }
135
- ],
136
- "seller": {
137
- "name": [
138
- {
139
- "citation_id": "def5678",
140
- "citation_type": "Word",
141
- "bbox": [
142
- { "left": 45, "top": 80, "width": 30, "height": 12 },
143
- { "left": 80, "top": 80, "width": 40, "height": 12 }
144
- ],
145
- "content": "Acme",
146
- "segment_id": null,
147
- "segment_type": "Text"
148
- }
149
- ]
150
- },
151
- "line_items": [
152
- {
153
- "sku": [
154
- {
155
- "citation_id": "ghi9012",
156
- "citation_type": "Segment",
157
- "bbox": [{ "left": 12, "top": 140, "width": 60, "height": 16 }],
158
- "content": "A1",
159
- "segment_id": "seg_010",
160
- "segment_type": "Text",
161
- "ss_range": ["D5:E12"]
162
- }
163
- ],
164
- "qty": [
165
- {
166
- "citation_id": "jkl3456",
167
- "citation_type": "Word",
168
- "bbox": [{ "left": 85, "top": 140, "width": 12, "height": 16 }],
169
- "content": "2",
170
- "segment_id": null,
171
- "segment_type": "Text",
172
- "ss_range": ["E12"]
173
- }
174
- ]
175
- }
176
- ],
177
- "tags": [
178
- {
179
- "citation_id": "mno7890",
180
- "citation_type": "Segment",
181
- "bbox": [{ "left": 12, "top": 200, "width": 100, "height": 16 }],
182
- "content": "urgent paid",
183
- "segment_id": "seg_020",
184
- "segment_type": "Text",
185
- "ss_range": ["A20:C25"]
186
- }
187
- ],
188
- "notes": null
189
- }
190
- ```
191
-
192
- metrics
193
-
194
- ```json
195
- {
196
- "invoice_id": { "confidence": "High" },
197
- "seller": { "name": { "confidence": "Low" } },
198
- "line_items": [
199
- { "sku": { "confidence": "High" }, "qty": { "confidence": "High" } }
200
- ],
201
- "tags": { "confidence": "Low" },
202
- "notes": null
203
- }
204
- ```
205
58
  """
206
59
 
207
60
  source_task_id: Optional[str] = None
@@ -16,6 +16,3 @@ class ExtractGetParams(TypedDict, total=False):
16
16
 
17
17
  include_chunks: bool
18
18
  """Whether to include chunks in the output response"""
19
-
20
- wait_for_completion: bool
21
- """Whether to wait for the task to complete"""
@@ -55,153 +55,6 @@ class ExtractGetResponse(BaseModel):
55
55
  array-of-primitives) contain a `Vec<Citation>` supporting that field.
56
56
  - `metrics`: mirror of `results`; only leaf positions contain a `Metrics` object
57
57
  for that field.
58
-
59
- Detailed shape:
60
-
61
- - Shared structure: `results`, `citations`, and `metrics` have the same
62
- object/array shape as the user schema. Non-leaf nodes (objects, arrays of
63
- objects) are mirrored; only leaves carry values.
64
- - Leaf definition:
65
- - A leaf is either a JSON primitive (string, number, bool, or null) or an
66
- array of primitives (including empty).
67
- - Arrays of objects are not leaves; recurse into their elements (`items`
68
- mirror index-by-index).
69
- - Null handling:
70
- - If a leaf in `results` is null, the corresponding position in `citations`
71
- and `metrics` remains null.
72
- - Arrays:
73
- - Array of objects: `citations`/`metrics` are arrays whose elements mirror
74
- each object and carry values at their own leaves.
75
- - Array of primitives: treated as a single leaf. `citations[path]` is a list
76
- of `Citation` supporting the array as a whole. `metrics[path]` is a
77
- `Metrics` object for the array as a whole.
78
- - Citations leaves:
79
- - Type: JSON array of `Citation` objects.
80
- - Each `Citation` has: `citation_id: string`, `citation_type: Segment|Word`,
81
- `bbox: BoundingBox[]`, `content: string`, `segment_id?: string`,
82
- `segment_type: SegmentType`, `ss_range?: string[]`.
83
- - Segment citation: represents a full parsed segment; `segment_id` set,
84
- `bbox` has one entry (segment box), `content` is the segment text. If the
85
- segment is from a spreadsheet, `ss_range` contains the table range
86
- (single-element array) or the underlying cell refs if available.
87
- - Word citation: represents selected OCR words within a segment;
88
- `segment_id` is null, `bbox` has one entry per word, `content` is the
89
- whitespace-joined text of those words; `segment_type` is `Text`. If OCR
90
- words came from spreadsheet cells, `ss_range` lists those cell refs.
91
- - Metrics leaves:
92
- - Type: `Metrics` object with `confidence: "High" | "Low"`, indicating whether
93
- citations sufficiently support the item.
94
-
95
- Example:
96
-
97
- results
98
-
99
- ```json
100
- {
101
- "invoice_id": "INV-001",
102
- "seller": { "name": "Acme" },
103
- "line_items": [{ "sku": "A1", "qty": 2 }],
104
- "tags": ["urgent", "paid"],
105
- "notes": null
106
- }
107
- ```
108
-
109
- citations
110
-
111
- ```json
112
- {
113
- "invoice_id": [
114
- {
115
- "citation_id": "abc1234",
116
- "citation_type": "Segment",
117
- "bbox": [{ "left": 10, "top": 20, "width": 100, "height": 18 }],
118
- "content": "Invoice INV-001",
119
- "segment_id": "seg_001",
120
- "segment_type": "Text",
121
- "ss_range": ["A1:C10"]
122
- },
123
- {
124
- "citation_id": "pqr2345",
125
- "citation_type": "Word",
126
- "bbox": [
127
- { "left": 12, "top": 24, "width": 36, "height": 18 },
128
- { "left": 52, "top": 24, "width": 48, "height": 18 }
129
- ],
130
- "content": "INV-001",
131
- "segment_id": null,
132
- "segment_type": "Text",
133
- "ss_range": ["B3", "C3"]
134
- }
135
- ],
136
- "seller": {
137
- "name": [
138
- {
139
- "citation_id": "def5678",
140
- "citation_type": "Word",
141
- "bbox": [
142
- { "left": 45, "top": 80, "width": 30, "height": 12 },
143
- { "left": 80, "top": 80, "width": 40, "height": 12 }
144
- ],
145
- "content": "Acme",
146
- "segment_id": null,
147
- "segment_type": "Text"
148
- }
149
- ]
150
- },
151
- "line_items": [
152
- {
153
- "sku": [
154
- {
155
- "citation_id": "ghi9012",
156
- "citation_type": "Segment",
157
- "bbox": [{ "left": 12, "top": 140, "width": 60, "height": 16 }],
158
- "content": "A1",
159
- "segment_id": "seg_010",
160
- "segment_type": "Text",
161
- "ss_range": ["D5:E12"]
162
- }
163
- ],
164
- "qty": [
165
- {
166
- "citation_id": "jkl3456",
167
- "citation_type": "Word",
168
- "bbox": [{ "left": 85, "top": 140, "width": 12, "height": 16 }],
169
- "content": "2",
170
- "segment_id": null,
171
- "segment_type": "Text",
172
- "ss_range": ["E12"]
173
- }
174
- ]
175
- }
176
- ],
177
- "tags": [
178
- {
179
- "citation_id": "mno7890",
180
- "citation_type": "Segment",
181
- "bbox": [{ "left": 12, "top": 200, "width": 100, "height": 16 }],
182
- "content": "urgent paid",
183
- "segment_id": "seg_020",
184
- "segment_type": "Text",
185
- "ss_range": ["A20:C25"]
186
- }
187
- ],
188
- "notes": null
189
- }
190
- ```
191
-
192
- metrics
193
-
194
- ```json
195
- {
196
- "invoice_id": { "confidence": "High" },
197
- "seller": { "name": { "confidence": "Low" } },
198
- "line_items": [
199
- { "sku": { "confidence": "High" }, "qty": { "confidence": "High" } }
200
- ],
201
- "tags": { "confidence": "Low" },
202
- "notes": null
203
- }
204
- ```
205
58
  """
206
59
 
207
60
  source_task_id: Optional[str] = None