chunkr-ai 0.1.0a6__py3-none-any.whl → 0.1.0a8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunkr_ai/__init__.py +2 -0
- chunkr_ai/_base_client.py +3 -3
- chunkr_ai/_client.py +31 -3
- chunkr_ai/_compat.py +48 -48
- chunkr_ai/_constants.py +5 -5
- chunkr_ai/_exceptions.py +4 -0
- chunkr_ai/_models.py +41 -41
- chunkr_ai/_types.py +35 -1
- chunkr_ai/_utils/__init__.py +9 -2
- chunkr_ai/_utils/_compat.py +45 -0
- chunkr_ai/_utils/_datetime_parse.py +136 -0
- chunkr_ai/_utils/_transform.py +11 -1
- chunkr_ai/_utils/_typing.py +6 -1
- chunkr_ai/_utils/_utils.py +0 -1
- chunkr_ai/_version.py +1 -1
- chunkr_ai/resources/__init__.py +14 -0
- chunkr_ai/resources/files.py +3 -3
- chunkr_ai/resources/tasks/__init__.py +14 -0
- chunkr_ai/resources/tasks/extract.py +393 -0
- chunkr_ai/resources/tasks/parse.py +110 -286
- chunkr_ai/resources/tasks/tasks.py +64 -32
- chunkr_ai/resources/webhooks.py +193 -0
- chunkr_ai/types/__init__.py +27 -1
- chunkr_ai/types/bounding_box.py +19 -0
- chunkr_ai/types/cell.py +39 -0
- chunkr_ai/types/cell_style.py +28 -0
- chunkr_ai/types/chunk.py +40 -0
- chunkr_ai/types/chunk_processing.py +40 -0
- chunkr_ai/types/chunk_processing_param.py +42 -0
- chunkr_ai/types/extract_configuration.py +24 -0
- chunkr_ai/types/extract_output_response.py +62 -0
- chunkr_ai/types/file_create_params.py +2 -1
- chunkr_ai/types/file_info.py +21 -0
- chunkr_ai/types/generation_config.py +29 -0
- chunkr_ai/types/generation_config_param.py +29 -0
- chunkr_ai/types/llm_processing.py +36 -0
- chunkr_ai/types/llm_processing_param.py +36 -0
- chunkr_ai/types/ocr_result.py +28 -0
- chunkr_ai/types/page.py +27 -0
- chunkr_ai/types/parse_configuration.py +64 -0
- chunkr_ai/types/parse_configuration_param.py +65 -0
- chunkr_ai/types/parse_output_response.py +29 -0
- chunkr_ai/types/segment.py +109 -0
- chunkr_ai/types/segment_processing.py +228 -0
- chunkr_ai/types/segment_processing_param.py +229 -0
- chunkr_ai/types/task_extract_updated_webhook_event.py +22 -0
- chunkr_ai/types/task_get_params.py +0 -3
- chunkr_ai/types/task_list_params.py +7 -1
- chunkr_ai/types/task_parse_updated_webhook_event.py +22 -0
- chunkr_ai/types/task_response.py +68 -0
- chunkr_ai/types/tasks/__init__.py +7 -1
- chunkr_ai/types/tasks/extract_create_params.py +47 -0
- chunkr_ai/types/tasks/extract_create_response.py +67 -0
- chunkr_ai/types/tasks/extract_get_params.py +18 -0
- chunkr_ai/types/tasks/extract_get_response.py +67 -0
- chunkr_ai/types/tasks/parse_create_params.py +25 -793
- chunkr_ai/types/tasks/parse_create_response.py +55 -0
- chunkr_ai/types/tasks/parse_get_params.py +18 -0
- chunkr_ai/types/tasks/parse_get_response.py +55 -0
- chunkr_ai/types/unwrap_webhook_event.py +11 -0
- chunkr_ai/types/version_info.py +31 -0
- chunkr_ai/types/webhook_url_response.py +9 -0
- {chunkr_ai-0.1.0a6.dist-info → chunkr_ai-0.1.0a8.dist-info}/METADATA +14 -13
- chunkr_ai-0.1.0a8.dist-info/RECORD +88 -0
- chunkr_ai/types/task.py +0 -1225
- chunkr_ai/types/tasks/parse_update_params.py +0 -845
- chunkr_ai-0.1.0a6.dist-info/RECORD +0 -52
- {chunkr_ai-0.1.0a6.dist-info → chunkr_ai-0.1.0a8.dist-info}/WHEEL +0 -0
- {chunkr_ai-0.1.0a6.dist-info → chunkr_ai-0.1.0a8.dist-info}/licenses/LICENSE +0 -0
@@ -17,9 +17,13 @@ from ..._response import (
|
|
17
17
|
async_to_raw_response_wrapper,
|
18
18
|
async_to_streamed_response_wrapper,
|
19
19
|
)
|
20
|
-
from ...types.
|
21
|
-
from ...types.tasks import parse_create_params, parse_update_params
|
20
|
+
from ...types.tasks import parse_get_params, parse_create_params
|
22
21
|
from ..._base_client import make_request_options
|
22
|
+
from ...types.llm_processing_param import LlmProcessingParam
|
23
|
+
from ...types.chunk_processing_param import ChunkProcessingParam
|
24
|
+
from ...types.segment_processing_param import SegmentProcessingParam
|
25
|
+
from ...types.tasks.parse_get_response import ParseGetResponse
|
26
|
+
from ...types.tasks.parse_create_response import ParseCreateResponse
|
23
27
|
|
24
28
|
__all__ = ["ParseResource", "AsyncParseResource"]
|
25
29
|
|
@@ -48,15 +52,15 @@ class ParseResource(SyncAPIResource):
|
|
48
52
|
self,
|
49
53
|
*,
|
50
54
|
file: str,
|
51
|
-
chunk_processing:
|
52
|
-
error_handling:
|
55
|
+
chunk_processing: ChunkProcessingParam | NotGiven = NOT_GIVEN,
|
56
|
+
error_handling: Literal["Fail", "Continue"] | NotGiven = NOT_GIVEN,
|
53
57
|
expires_in: Optional[int] | NotGiven = NOT_GIVEN,
|
54
58
|
file_name: Optional[str] | NotGiven = NOT_GIVEN,
|
55
|
-
llm_processing:
|
56
|
-
ocr_strategy:
|
57
|
-
pipeline:
|
58
|
-
segment_processing: Optional[
|
59
|
-
segmentation_strategy:
|
59
|
+
llm_processing: LlmProcessingParam | NotGiven = NOT_GIVEN,
|
60
|
+
ocr_strategy: Literal["All", "Auto"] | NotGiven = NOT_GIVEN,
|
61
|
+
pipeline: Literal["Azure", "Chunkr"] | NotGiven = NOT_GIVEN,
|
62
|
+
segment_processing: Optional[SegmentProcessingParam] | NotGiven = NOT_GIVEN,
|
63
|
+
segmentation_strategy: Literal["LayoutAnalysis", "Page"] | NotGiven = NOT_GIVEN,
|
60
64
|
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
61
65
|
# The extra values given here take precedence over values defined on the client or passed to this method.
|
62
66
|
extra_headers: Headers | None = None,
|
@@ -64,17 +68,17 @@ class ParseResource(SyncAPIResource):
|
|
64
68
|
extra_body: Body | None = None,
|
65
69
|
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
66
70
|
idempotency_key: str | None = None,
|
67
|
-
) ->
|
71
|
+
) -> ParseCreateResponse:
|
68
72
|
"""
|
69
73
|
Queues a document for processing and returns a `TaskResponse` with the assigned
|
70
74
|
`task_id`, initial configuration, file metadata, and timestamps. The initial
|
71
75
|
status is `Starting`.
|
72
76
|
|
73
|
-
Creates a task and returns its metadata immediately.
|
77
|
+
Creates a parse task and returns its metadata immediately.
|
74
78
|
|
75
79
|
Args:
|
76
80
|
file:
|
77
|
-
The file to be
|
81
|
+
The file to be parsed. Supported inputs:
|
78
82
|
|
79
83
|
- `ch://files/{file_id}`: Reference to an existing file. Upload via the Files
|
80
84
|
API
|
@@ -93,7 +97,7 @@ class ParseResource(SyncAPIResource):
|
|
93
97
|
expires_in: The number of seconds until task is deleted. Expired tasks can **not** be
|
94
98
|
updated, polled or accessed via web interface.
|
95
99
|
|
96
|
-
file_name: The name of the file to be
|
100
|
+
file_name: The name of the file to be parsed. If not set a name will be generated.
|
97
101
|
|
98
102
|
llm_processing: Controls the LLM used for the task.
|
99
103
|
|
@@ -104,41 +108,26 @@ class ParseResource(SyncAPIResource):
|
|
104
108
|
text. When text layer is present the bounding boxes from the text layer are
|
105
109
|
used.
|
106
110
|
|
107
|
-
|
108
|
-
output will be unified to the Chunkr `output` format.
|
111
|
+
segment_processing: Configuration for how each document segment is processed and formatted.
|
109
112
|
|
110
|
-
|
113
|
+
Each segment has sensible defaults, but you can override specific settings:
|
111
114
|
|
112
|
-
|
115
|
+
- `format`: Output as `Html` or `Markdown`
|
116
|
+
- `strategy`: `Auto` (rule-based), `LLM` (AI-generated), or `Ignore` (skip)
|
117
|
+
- `crop_image`: Whether to crop images to segment bounds
|
118
|
+
- `extended_context`: Use full page as context for LLM processing
|
119
|
+
- `description`: Generate descriptions for segments
|
113
120
|
|
114
|
-
|
115
|
-
- How the content is produced (rule-based vs. LLM).
|
116
|
-
- The output format (`Html` or `Markdown`).
|
121
|
+
**Defaults per segment type:** Check the documentation for more details.
|
117
122
|
|
118
|
-
|
119
|
-
**descriptions** further refine behaviour.
|
120
|
-
|
121
|
-
**Default strategy per segment**
|
122
|
-
|
123
|
-
- `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` → **Auto**
|
124
|
-
(Markdown, description off)
|
125
|
-
- `Table` → **LLM** (HTML, description on)
|
126
|
-
- `Picture` → **LLM** (Markdown, description off, cropping _All_)
|
127
|
-
- `Formula`, `Page` → **LLM** (Markdown, description off)
|
128
|
-
- `PageHeader`, `PageFooter` → **Ignore** (removed from output)
|
129
|
-
|
130
|
-
**Strategy reference**
|
131
|
-
|
132
|
-
- **Auto** – rule-based content generation.
|
133
|
-
- **LLM** – generate content with an LLM.
|
134
|
-
- **Ignore** – exclude the segment entirely.
|
123
|
+
Only specify the fields you want to change - everything else uses the defaults.
|
135
124
|
|
136
125
|
segmentation_strategy:
|
137
126
|
Controls the segmentation strategy:
|
138
127
|
|
139
128
|
- `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
|
140
129
|
`Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
|
141
|
-
segmentation and better chunking.
|
130
|
+
segmentation and better chunking.
|
142
131
|
- `Page`: Treats each page as a single segment. Faster processing, but without
|
143
132
|
layout element detection and only simple chunking.
|
144
133
|
|
@@ -176,104 +165,39 @@ class ParseResource(SyncAPIResource):
|
|
176
165
|
timeout=timeout,
|
177
166
|
idempotency_key=idempotency_key,
|
178
167
|
),
|
179
|
-
cast_to=
|
168
|
+
cast_to=ParseCreateResponse,
|
180
169
|
)
|
181
170
|
|
182
|
-
def
|
171
|
+
def get(
|
183
172
|
self,
|
184
|
-
task_id: str,
|
173
|
+
task_id: Optional[str],
|
185
174
|
*,
|
186
|
-
|
187
|
-
|
188
|
-
expires_in: Optional[int] | NotGiven = NOT_GIVEN,
|
189
|
-
high_resolution: Optional[bool] | NotGiven = NOT_GIVEN,
|
190
|
-
llm_processing: Optional[parse_update_params.LlmProcessing] | NotGiven = NOT_GIVEN,
|
191
|
-
ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
|
192
|
-
pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
|
193
|
-
segment_processing: Optional[parse_update_params.SegmentProcessing] | NotGiven = NOT_GIVEN,
|
194
|
-
segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
|
175
|
+
base64_urls: bool | NotGiven = NOT_GIVEN,
|
176
|
+
include_chunks: bool | NotGiven = NOT_GIVEN,
|
195
177
|
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
196
178
|
# The extra values given here take precedence over values defined on the client or passed to this method.
|
197
179
|
extra_headers: Headers | None = None,
|
198
180
|
extra_query: Query | None = None,
|
199
181
|
extra_body: Body | None = None,
|
200
182
|
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
The
|
206
|
-
current configuration is used as the base; only provided fields are changed.
|
183
|
+
) -> ParseGetResponse:
|
184
|
+
"""
|
185
|
+
Retrieves the current state of a parse task.
|
207
186
|
|
208
|
-
|
187
|
+
Returns task details such as processing status, configuration, output (when
|
188
|
+
available), file metadata, and timestamps.
|
209
189
|
|
210
|
-
|
211
|
-
- The new configuration must differ from the current configuration.
|
190
|
+
Typical uses:
|
212
191
|
|
213
|
-
|
192
|
+
- Poll a task during processing
|
193
|
+
- Retrieve the final output once processing is complete
|
194
|
+
- Access task metadata and configuration
|
214
195
|
|
215
196
|
Args:
|
216
|
-
|
217
|
-
|
218
|
-
error_handling:
|
219
|
-
Controls how errors are handled during processing:
|
220
|
-
|
221
|
-
- `Fail`: Stops processing and fails the task when any error occurs
|
222
|
-
- `Continue`: Attempts to continue processing despite non-critical errors (eg.
|
223
|
-
LLM refusals etc.)
|
224
|
-
|
225
|
-
expires_in: The number of seconds until task is deleted. Expired tasks can **not** be
|
226
|
-
updated, polled or accessed via web interface.
|
227
|
-
|
228
|
-
high_resolution: Whether to use high-resolution images for cropping and post-processing. (Latency
|
229
|
-
penalty: ~7 seconds per page)
|
230
|
-
|
231
|
-
llm_processing: Controls the LLM used for the task.
|
232
|
-
|
233
|
-
ocr_strategy: Controls the Optical Character Recognition (OCR) strategy.
|
234
|
-
|
235
|
-
- `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
|
236
|
-
- `Auto`: Selectively applies OCR only to pages with missing or low-quality
|
237
|
-
text. When text layer is present the bounding boxes from the text layer are
|
238
|
-
used.
|
239
|
-
|
240
|
-
pipeline: Choose the provider whose models will be used for segmentation and OCR. The
|
241
|
-
output will be unified to the Chunkr `output` format.
|
197
|
+
base64_urls: Whether to return base64 encoded URLs. If false, the URLs will be returned as
|
198
|
+
presigned URLs.
|
242
199
|
|
243
|
-
|
244
|
-
|
245
|
-
Each segment uses one of three strategies. The chosen strategy controls:
|
246
|
-
|
247
|
-
- Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
|
248
|
-
- How the content is produced (rule-based vs. LLM).
|
249
|
-
- The output format (`Html` or `Markdown`).
|
250
|
-
|
251
|
-
Optional flags such as image **cropping**, **extended context**, and
|
252
|
-
**descriptions** further refine behaviour.
|
253
|
-
|
254
|
-
**Default strategy per segment**
|
255
|
-
|
256
|
-
- `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` → **Auto**
|
257
|
-
(Markdown, description off)
|
258
|
-
- `Table` → **LLM** (HTML, description on)
|
259
|
-
- `Picture` → **LLM** (Markdown, description off, cropping _All_)
|
260
|
-
- `Formula`, `Page` → **LLM** (Markdown, description off)
|
261
|
-
- `PageHeader`, `PageFooter` → **Ignore** (removed from output)
|
262
|
-
|
263
|
-
**Strategy reference**
|
264
|
-
|
265
|
-
- **Auto** – rule-based content generation.
|
266
|
-
- **LLM** – generate content with an LLM.
|
267
|
-
- **Ignore** – exclude the segment entirely.
|
268
|
-
|
269
|
-
segmentation_strategy:
|
270
|
-
Controls the segmentation strategy:
|
271
|
-
|
272
|
-
- `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
|
273
|
-
`Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
|
274
|
-
segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
|
275
|
-
- `Page`: Treats each page as a single segment. Faster processing, but without
|
276
|
-
layout element detection and only simple chunking.
|
200
|
+
include_chunks: Whether to include chunks in the output response
|
277
201
|
|
278
202
|
extra_headers: Send extra headers
|
279
203
|
|
@@ -282,35 +206,25 @@ class ParseResource(SyncAPIResource):
|
|
282
206
|
extra_body: Add additional JSON properties to the request
|
283
207
|
|
284
208
|
timeout: Override the client-level default timeout for this request, in seconds
|
285
|
-
|
286
|
-
idempotency_key: Specify a custom idempotency key for this request
|
287
209
|
"""
|
288
210
|
if not task_id:
|
289
211
|
raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}")
|
290
|
-
return self.
|
291
|
-
f"/tasks/
|
292
|
-
body=maybe_transform(
|
293
|
-
{
|
294
|
-
"chunk_processing": chunk_processing,
|
295
|
-
"error_handling": error_handling,
|
296
|
-
"expires_in": expires_in,
|
297
|
-
"high_resolution": high_resolution,
|
298
|
-
"llm_processing": llm_processing,
|
299
|
-
"ocr_strategy": ocr_strategy,
|
300
|
-
"pipeline": pipeline,
|
301
|
-
"segment_processing": segment_processing,
|
302
|
-
"segmentation_strategy": segmentation_strategy,
|
303
|
-
},
|
304
|
-
parse_update_params.ParseUpdateParams,
|
305
|
-
),
|
212
|
+
return self._get(
|
213
|
+
f"/tasks/{task_id}/parse",
|
306
214
|
options=make_request_options(
|
307
215
|
extra_headers=extra_headers,
|
308
216
|
extra_query=extra_query,
|
309
217
|
extra_body=extra_body,
|
310
218
|
timeout=timeout,
|
311
|
-
|
219
|
+
query=maybe_transform(
|
220
|
+
{
|
221
|
+
"base64_urls": base64_urls,
|
222
|
+
"include_chunks": include_chunks,
|
223
|
+
},
|
224
|
+
parse_get_params.ParseGetParams,
|
225
|
+
),
|
312
226
|
),
|
313
|
-
cast_to=
|
227
|
+
cast_to=ParseGetResponse,
|
314
228
|
)
|
315
229
|
|
316
230
|
|
@@ -338,15 +252,15 @@ class AsyncParseResource(AsyncAPIResource):
|
|
338
252
|
self,
|
339
253
|
*,
|
340
254
|
file: str,
|
341
|
-
chunk_processing:
|
342
|
-
error_handling:
|
255
|
+
chunk_processing: ChunkProcessingParam | NotGiven = NOT_GIVEN,
|
256
|
+
error_handling: Literal["Fail", "Continue"] | NotGiven = NOT_GIVEN,
|
343
257
|
expires_in: Optional[int] | NotGiven = NOT_GIVEN,
|
344
258
|
file_name: Optional[str] | NotGiven = NOT_GIVEN,
|
345
|
-
llm_processing:
|
346
|
-
ocr_strategy:
|
347
|
-
pipeline:
|
348
|
-
segment_processing: Optional[
|
349
|
-
segmentation_strategy:
|
259
|
+
llm_processing: LlmProcessingParam | NotGiven = NOT_GIVEN,
|
260
|
+
ocr_strategy: Literal["All", "Auto"] | NotGiven = NOT_GIVEN,
|
261
|
+
pipeline: Literal["Azure", "Chunkr"] | NotGiven = NOT_GIVEN,
|
262
|
+
segment_processing: Optional[SegmentProcessingParam] | NotGiven = NOT_GIVEN,
|
263
|
+
segmentation_strategy: Literal["LayoutAnalysis", "Page"] | NotGiven = NOT_GIVEN,
|
350
264
|
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
351
265
|
# The extra values given here take precedence over values defined on the client or passed to this method.
|
352
266
|
extra_headers: Headers | None = None,
|
@@ -354,17 +268,17 @@ class AsyncParseResource(AsyncAPIResource):
|
|
354
268
|
extra_body: Body | None = None,
|
355
269
|
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
356
270
|
idempotency_key: str | None = None,
|
357
|
-
) ->
|
271
|
+
) -> ParseCreateResponse:
|
358
272
|
"""
|
359
273
|
Queues a document for processing and returns a `TaskResponse` with the assigned
|
360
274
|
`task_id`, initial configuration, file metadata, and timestamps. The initial
|
361
275
|
status is `Starting`.
|
362
276
|
|
363
|
-
Creates a task and returns its metadata immediately.
|
277
|
+
Creates a parse task and returns its metadata immediately.
|
364
278
|
|
365
279
|
Args:
|
366
280
|
file:
|
367
|
-
The file to be
|
281
|
+
The file to be parsed. Supported inputs:
|
368
282
|
|
369
283
|
- `ch://files/{file_id}`: Reference to an existing file. Upload via the Files
|
370
284
|
API
|
@@ -383,7 +297,7 @@ class AsyncParseResource(AsyncAPIResource):
|
|
383
297
|
expires_in: The number of seconds until task is deleted. Expired tasks can **not** be
|
384
298
|
updated, polled or accessed via web interface.
|
385
299
|
|
386
|
-
file_name: The name of the file to be
|
300
|
+
file_name: The name of the file to be parsed. If not set a name will be generated.
|
387
301
|
|
388
302
|
llm_processing: Controls the LLM used for the task.
|
389
303
|
|
@@ -394,41 +308,26 @@ class AsyncParseResource(AsyncAPIResource):
|
|
394
308
|
text. When text layer is present the bounding boxes from the text layer are
|
395
309
|
used.
|
396
310
|
|
397
|
-
|
398
|
-
output will be unified to the Chunkr `output` format.
|
311
|
+
segment_processing: Configuration for how each document segment is processed and formatted.
|
399
312
|
|
400
|
-
|
313
|
+
Each segment has sensible defaults, but you can override specific settings:
|
401
314
|
|
402
|
-
|
315
|
+
- `format`: Output as `Html` or `Markdown`
|
316
|
+
- `strategy`: `Auto` (rule-based), `LLM` (AI-generated), or `Ignore` (skip)
|
317
|
+
- `crop_image`: Whether to crop images to segment bounds
|
318
|
+
- `extended_context`: Use full page as context for LLM processing
|
319
|
+
- `description`: Generate descriptions for segments
|
403
320
|
|
404
|
-
|
405
|
-
- How the content is produced (rule-based vs. LLM).
|
406
|
-
- The output format (`Html` or `Markdown`).
|
321
|
+
**Defaults per segment type:** Check the documentation for more details.
|
407
322
|
|
408
|
-
|
409
|
-
**descriptions** further refine behaviour.
|
410
|
-
|
411
|
-
**Default strategy per segment**
|
412
|
-
|
413
|
-
- `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` → **Auto**
|
414
|
-
(Markdown, description off)
|
415
|
-
- `Table` → **LLM** (HTML, description on)
|
416
|
-
- `Picture` → **LLM** (Markdown, description off, cropping _All_)
|
417
|
-
- `Formula`, `Page` → **LLM** (Markdown, description off)
|
418
|
-
- `PageHeader`, `PageFooter` → **Ignore** (removed from output)
|
419
|
-
|
420
|
-
**Strategy reference**
|
421
|
-
|
422
|
-
- **Auto** – rule-based content generation.
|
423
|
-
- **LLM** – generate content with an LLM.
|
424
|
-
- **Ignore** – exclude the segment entirely.
|
323
|
+
Only specify the fields you want to change - everything else uses the defaults.
|
425
324
|
|
426
325
|
segmentation_strategy:
|
427
326
|
Controls the segmentation strategy:
|
428
327
|
|
429
328
|
- `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
|
430
329
|
`Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
|
431
|
-
segmentation and better chunking.
|
330
|
+
segmentation and better chunking.
|
432
331
|
- `Page`: Treats each page as a single segment. Faster processing, but without
|
433
332
|
layout element detection and only simple chunking.
|
434
333
|
|
@@ -466,104 +365,39 @@ class AsyncParseResource(AsyncAPIResource):
|
|
466
365
|
timeout=timeout,
|
467
366
|
idempotency_key=idempotency_key,
|
468
367
|
),
|
469
|
-
cast_to=
|
368
|
+
cast_to=ParseCreateResponse,
|
470
369
|
)
|
471
370
|
|
472
|
-
async def
|
371
|
+
async def get(
|
473
372
|
self,
|
474
|
-
task_id: str,
|
373
|
+
task_id: Optional[str],
|
475
374
|
*,
|
476
|
-
|
477
|
-
|
478
|
-
expires_in: Optional[int] | NotGiven = NOT_GIVEN,
|
479
|
-
high_resolution: Optional[bool] | NotGiven = NOT_GIVEN,
|
480
|
-
llm_processing: Optional[parse_update_params.LlmProcessing] | NotGiven = NOT_GIVEN,
|
481
|
-
ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
|
482
|
-
pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
|
483
|
-
segment_processing: Optional[parse_update_params.SegmentProcessing] | NotGiven = NOT_GIVEN,
|
484
|
-
segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
|
375
|
+
base64_urls: bool | NotGiven = NOT_GIVEN,
|
376
|
+
include_chunks: bool | NotGiven = NOT_GIVEN,
|
485
377
|
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
486
378
|
# The extra values given here take precedence over values defined on the client or passed to this method.
|
487
379
|
extra_headers: Headers | None = None,
|
488
380
|
extra_query: Query | None = None,
|
489
381
|
extra_body: Body | None = None,
|
490
382
|
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
The
|
496
|
-
current configuration is used as the base; only provided fields are changed.
|
383
|
+
) -> ParseGetResponse:
|
384
|
+
"""
|
385
|
+
Retrieves the current state of a parse task.
|
497
386
|
|
498
|
-
|
387
|
+
Returns task details such as processing status, configuration, output (when
|
388
|
+
available), file metadata, and timestamps.
|
499
389
|
|
500
|
-
|
501
|
-
- The new configuration must differ from the current configuration.
|
390
|
+
Typical uses:
|
502
391
|
|
503
|
-
|
392
|
+
- Poll a task during processing
|
393
|
+
- Retrieve the final output once processing is complete
|
394
|
+
- Access task metadata and configuration
|
504
395
|
|
505
396
|
Args:
|
506
|
-
|
507
|
-
|
508
|
-
error_handling:
|
509
|
-
Controls how errors are handled during processing:
|
510
|
-
|
511
|
-
- `Fail`: Stops processing and fails the task when any error occurs
|
512
|
-
- `Continue`: Attempts to continue processing despite non-critical errors (eg.
|
513
|
-
LLM refusals etc.)
|
514
|
-
|
515
|
-
expires_in: The number of seconds until task is deleted. Expired tasks can **not** be
|
516
|
-
updated, polled or accessed via web interface.
|
517
|
-
|
518
|
-
high_resolution: Whether to use high-resolution images for cropping and post-processing. (Latency
|
519
|
-
penalty: ~7 seconds per page)
|
520
|
-
|
521
|
-
llm_processing: Controls the LLM used for the task.
|
522
|
-
|
523
|
-
ocr_strategy: Controls the Optical Character Recognition (OCR) strategy.
|
524
|
-
|
525
|
-
- `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
|
526
|
-
- `Auto`: Selectively applies OCR only to pages with missing or low-quality
|
527
|
-
text. When text layer is present the bounding boxes from the text layer are
|
528
|
-
used.
|
529
|
-
|
530
|
-
pipeline: Choose the provider whose models will be used for segmentation and OCR. The
|
531
|
-
output will be unified to the Chunkr `output` format.
|
397
|
+
base64_urls: Whether to return base64 encoded URLs. If false, the URLs will be returned as
|
398
|
+
presigned URLs.
|
532
399
|
|
533
|
-
|
534
|
-
|
535
|
-
Each segment uses one of three strategies. The chosen strategy controls:
|
536
|
-
|
537
|
-
- Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
|
538
|
-
- How the content is produced (rule-based vs. LLM).
|
539
|
-
- The output format (`Html` or `Markdown`).
|
540
|
-
|
541
|
-
Optional flags such as image **cropping**, **extended context**, and
|
542
|
-
**descriptions** further refine behaviour.
|
543
|
-
|
544
|
-
**Default strategy per segment**
|
545
|
-
|
546
|
-
- `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` → **Auto**
|
547
|
-
(Markdown, description off)
|
548
|
-
- `Table` → **LLM** (HTML, description on)
|
549
|
-
- `Picture` → **LLM** (Markdown, description off, cropping _All_)
|
550
|
-
- `Formula`, `Page` → **LLM** (Markdown, description off)
|
551
|
-
- `PageHeader`, `PageFooter` → **Ignore** (removed from output)
|
552
|
-
|
553
|
-
**Strategy reference**
|
554
|
-
|
555
|
-
- **Auto** – rule-based content generation.
|
556
|
-
- **LLM** – generate content with an LLM.
|
557
|
-
- **Ignore** – exclude the segment entirely.
|
558
|
-
|
559
|
-
segmentation_strategy:
|
560
|
-
Controls the segmentation strategy:
|
561
|
-
|
562
|
-
- `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
|
563
|
-
`Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
|
564
|
-
segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
|
565
|
-
- `Page`: Treats each page as a single segment. Faster processing, but without
|
566
|
-
layout element detection and only simple chunking.
|
400
|
+
include_chunks: Whether to include chunks in the output response
|
567
401
|
|
568
402
|
extra_headers: Send extra headers
|
569
403
|
|
@@ -572,35 +406,25 @@ class AsyncParseResource(AsyncAPIResource):
|
|
572
406
|
extra_body: Add additional JSON properties to the request
|
573
407
|
|
574
408
|
timeout: Override the client-level default timeout for this request, in seconds
|
575
|
-
|
576
|
-
idempotency_key: Specify a custom idempotency key for this request
|
577
409
|
"""
|
578
410
|
if not task_id:
|
579
411
|
raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}")
|
580
|
-
return await self.
|
581
|
-
f"/tasks/
|
582
|
-
body=await async_maybe_transform(
|
583
|
-
{
|
584
|
-
"chunk_processing": chunk_processing,
|
585
|
-
"error_handling": error_handling,
|
586
|
-
"expires_in": expires_in,
|
587
|
-
"high_resolution": high_resolution,
|
588
|
-
"llm_processing": llm_processing,
|
589
|
-
"ocr_strategy": ocr_strategy,
|
590
|
-
"pipeline": pipeline,
|
591
|
-
"segment_processing": segment_processing,
|
592
|
-
"segmentation_strategy": segmentation_strategy,
|
593
|
-
},
|
594
|
-
parse_update_params.ParseUpdateParams,
|
595
|
-
),
|
412
|
+
return await self._get(
|
413
|
+
f"/tasks/{task_id}/parse",
|
596
414
|
options=make_request_options(
|
597
415
|
extra_headers=extra_headers,
|
598
416
|
extra_query=extra_query,
|
599
417
|
extra_body=extra_body,
|
600
418
|
timeout=timeout,
|
601
|
-
|
419
|
+
query=await async_maybe_transform(
|
420
|
+
{
|
421
|
+
"base64_urls": base64_urls,
|
422
|
+
"include_chunks": include_chunks,
|
423
|
+
},
|
424
|
+
parse_get_params.ParseGetParams,
|
425
|
+
),
|
602
426
|
),
|
603
|
-
cast_to=
|
427
|
+
cast_to=ParseGetResponse,
|
604
428
|
)
|
605
429
|
|
606
430
|
|
@@ -611,8 +435,8 @@ class ParseResourceWithRawResponse:
|
|
611
435
|
self.create = to_raw_response_wrapper(
|
612
436
|
parse.create,
|
613
437
|
)
|
614
|
-
self.
|
615
|
-
parse.
|
438
|
+
self.get = to_raw_response_wrapper(
|
439
|
+
parse.get,
|
616
440
|
)
|
617
441
|
|
618
442
|
|
@@ -623,8 +447,8 @@ class AsyncParseResourceWithRawResponse:
|
|
623
447
|
self.create = async_to_raw_response_wrapper(
|
624
448
|
parse.create,
|
625
449
|
)
|
626
|
-
self.
|
627
|
-
parse.
|
450
|
+
self.get = async_to_raw_response_wrapper(
|
451
|
+
parse.get,
|
628
452
|
)
|
629
453
|
|
630
454
|
|
@@ -635,8 +459,8 @@ class ParseResourceWithStreamingResponse:
|
|
635
459
|
self.create = to_streamed_response_wrapper(
|
636
460
|
parse.create,
|
637
461
|
)
|
638
|
-
self.
|
639
|
-
parse.
|
462
|
+
self.get = to_streamed_response_wrapper(
|
463
|
+
parse.get,
|
640
464
|
)
|
641
465
|
|
642
466
|
|
@@ -647,6 +471,6 @@ class AsyncParseResourceWithStreamingResponse:
|
|
647
471
|
self.create = async_to_streamed_response_wrapper(
|
648
472
|
parse.create,
|
649
473
|
)
|
650
|
-
self.
|
651
|
-
parse.
|
474
|
+
self.get = async_to_streamed_response_wrapper(
|
475
|
+
parse.get,
|
652
476
|
)
|