chunkr-ai 0.1.0a6__py3-none-any.whl → 0.1.0a7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunkr_ai/__init__.py +2 -0
- chunkr_ai/_client.py +31 -3
- chunkr_ai/_constants.py +5 -5
- chunkr_ai/_exceptions.py +4 -0
- chunkr_ai/_models.py +1 -1
- chunkr_ai/_types.py +35 -1
- chunkr_ai/_utils/__init__.py +1 -0
- chunkr_ai/_utils/_typing.py +5 -0
- chunkr_ai/_version.py +1 -1
- chunkr_ai/resources/__init__.py +14 -0
- chunkr_ai/resources/files.py +3 -3
- chunkr_ai/resources/tasks/__init__.py +14 -0
- chunkr_ai/resources/tasks/extract.py +409 -0
- chunkr_ai/resources/tasks/parse.py +124 -284
- chunkr_ai/resources/tasks/tasks.py +62 -14
- chunkr_ai/resources/webhooks.py +193 -0
- chunkr_ai/types/__init__.py +27 -1
- chunkr_ai/types/bounding_box.py +19 -0
- chunkr_ai/types/cell.py +39 -0
- chunkr_ai/types/cell_style.py +28 -0
- chunkr_ai/types/chunk.py +40 -0
- chunkr_ai/types/chunk_processing.py +40 -0
- chunkr_ai/types/chunk_processing_param.py +42 -0
- chunkr_ai/types/extract_configuration.py +24 -0
- chunkr_ai/types/extract_output_response.py +19 -0
- chunkr_ai/types/file_create_params.py +2 -1
- chunkr_ai/types/file_info.py +21 -0
- chunkr_ai/types/generation_config.py +29 -0
- chunkr_ai/types/generation_config_param.py +29 -0
- chunkr_ai/types/llm_processing.py +36 -0
- chunkr_ai/types/llm_processing_param.py +36 -0
- chunkr_ai/types/ocr_result.py +28 -0
- chunkr_ai/types/page.py +27 -0
- chunkr_ai/types/parse_configuration.py +64 -0
- chunkr_ai/types/parse_configuration_param.py +65 -0
- chunkr_ai/types/parse_output_response.py +29 -0
- chunkr_ai/types/segment.py +109 -0
- chunkr_ai/types/segment_processing.py +228 -0
- chunkr_ai/types/segment_processing_param.py +229 -0
- chunkr_ai/types/task_extract_updated_webhook_event.py +22 -0
- chunkr_ai/types/task_list_params.py +7 -1
- chunkr_ai/types/task_parse_updated_webhook_event.py +22 -0
- chunkr_ai/types/task_response.py +68 -0
- chunkr_ai/types/tasks/__init__.py +7 -1
- chunkr_ai/types/tasks/extract_create_params.py +47 -0
- chunkr_ai/types/tasks/extract_create_response.py +214 -0
- chunkr_ai/types/tasks/extract_get_params.py +21 -0
- chunkr_ai/types/tasks/extract_get_response.py +214 -0
- chunkr_ai/types/tasks/parse_create_params.py +25 -793
- chunkr_ai/types/tasks/parse_create_response.py +55 -0
- chunkr_ai/types/tasks/parse_get_params.py +21 -0
- chunkr_ai/types/tasks/parse_get_response.py +55 -0
- chunkr_ai/types/unwrap_webhook_event.py +11 -0
- chunkr_ai/types/version_info.py +31 -0
- chunkr_ai/types/webhook_url_response.py +9 -0
- {chunkr_ai-0.1.0a6.dist-info → chunkr_ai-0.1.0a7.dist-info}/METADATA +14 -13
- chunkr_ai-0.1.0a7.dist-info/RECORD +86 -0
- chunkr_ai/types/task.py +0 -1225
- chunkr_ai/types/tasks/parse_update_params.py +0 -845
- chunkr_ai-0.1.0a6.dist-info/RECORD +0 -52
- {chunkr_ai-0.1.0a6.dist-info → chunkr_ai-0.1.0a7.dist-info}/WHEEL +0 -0
- {chunkr_ai-0.1.0a6.dist-info → chunkr_ai-0.1.0a7.dist-info}/licenses/LICENSE +0 -0
@@ -17,9 +17,13 @@ from ..._response import (
|
|
17
17
|
async_to_raw_response_wrapper,
|
18
18
|
async_to_streamed_response_wrapper,
|
19
19
|
)
|
20
|
-
from ...types.
|
21
|
-
from ...types.tasks import parse_create_params, parse_update_params
|
20
|
+
from ...types.tasks import parse_get_params, parse_create_params
|
22
21
|
from ..._base_client import make_request_options
|
22
|
+
from ...types.llm_processing_param import LlmProcessingParam
|
23
|
+
from ...types.chunk_processing_param import ChunkProcessingParam
|
24
|
+
from ...types.segment_processing_param import SegmentProcessingParam
|
25
|
+
from ...types.tasks.parse_get_response import ParseGetResponse
|
26
|
+
from ...types.tasks.parse_create_response import ParseCreateResponse
|
23
27
|
|
24
28
|
__all__ = ["ParseResource", "AsyncParseResource"]
|
25
29
|
|
@@ -48,15 +52,15 @@ class ParseResource(SyncAPIResource):
|
|
48
52
|
self,
|
49
53
|
*,
|
50
54
|
file: str,
|
51
|
-
chunk_processing:
|
52
|
-
error_handling:
|
55
|
+
chunk_processing: ChunkProcessingParam | NotGiven = NOT_GIVEN,
|
56
|
+
error_handling: Literal["Fail", "Continue"] | NotGiven = NOT_GIVEN,
|
53
57
|
expires_in: Optional[int] | NotGiven = NOT_GIVEN,
|
54
58
|
file_name: Optional[str] | NotGiven = NOT_GIVEN,
|
55
|
-
llm_processing:
|
56
|
-
ocr_strategy:
|
57
|
-
pipeline:
|
58
|
-
segment_processing: Optional[
|
59
|
-
segmentation_strategy:
|
59
|
+
llm_processing: LlmProcessingParam | NotGiven = NOT_GIVEN,
|
60
|
+
ocr_strategy: Literal["All", "Auto"] | NotGiven = NOT_GIVEN,
|
61
|
+
pipeline: Literal["Azure", "Chunkr"] | NotGiven = NOT_GIVEN,
|
62
|
+
segment_processing: Optional[SegmentProcessingParam] | NotGiven = NOT_GIVEN,
|
63
|
+
segmentation_strategy: Literal["LayoutAnalysis", "Page"] | NotGiven = NOT_GIVEN,
|
60
64
|
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
61
65
|
# The extra values given here take precedence over values defined on the client or passed to this method.
|
62
66
|
extra_headers: Headers | None = None,
|
@@ -64,17 +68,17 @@ class ParseResource(SyncAPIResource):
|
|
64
68
|
extra_body: Body | None = None,
|
65
69
|
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
66
70
|
idempotency_key: str | None = None,
|
67
|
-
) ->
|
71
|
+
) -> ParseCreateResponse:
|
68
72
|
"""
|
69
73
|
Queues a document for processing and returns a `TaskResponse` with the assigned
|
70
74
|
`task_id`, initial configuration, file metadata, and timestamps. The initial
|
71
75
|
status is `Starting`.
|
72
76
|
|
73
|
-
Creates a task and returns its metadata immediately.
|
77
|
+
Creates a parse task and returns its metadata immediately.
|
74
78
|
|
75
79
|
Args:
|
76
80
|
file:
|
77
|
-
The file to be
|
81
|
+
The file to be parsed. Supported inputs:
|
78
82
|
|
79
83
|
- `ch://files/{file_id}`: Reference to an existing file. Upload via the Files
|
80
84
|
API
|
@@ -93,7 +97,7 @@ class ParseResource(SyncAPIResource):
|
|
93
97
|
expires_in: The number of seconds until task is deleted. Expired tasks can **not** be
|
94
98
|
updated, polled or accessed via web interface.
|
95
99
|
|
96
|
-
file_name: The name of the file to be
|
100
|
+
file_name: The name of the file to be parsed. If not set a name will be generated.
|
97
101
|
|
98
102
|
llm_processing: Controls the LLM used for the task.
|
99
103
|
|
@@ -104,41 +108,26 @@ class ParseResource(SyncAPIResource):
|
|
104
108
|
text. When text layer is present the bounding boxes from the text layer are
|
105
109
|
used.
|
106
110
|
|
107
|
-
|
108
|
-
output will be unified to the Chunkr `output` format.
|
111
|
+
segment_processing: Configuration for how each document segment is processed and formatted.
|
109
112
|
|
110
|
-
|
113
|
+
Each segment has sensible defaults, but you can override specific settings:
|
111
114
|
|
112
|
-
|
115
|
+
- `format`: Output as `Html` or `Markdown`
|
116
|
+
- `strategy`: `Auto` (rule-based), `LLM` (AI-generated), or `Ignore` (skip)
|
117
|
+
- `crop_image`: Whether to crop images to segment bounds
|
118
|
+
- `extended_context`: Use full page as context for LLM processing
|
119
|
+
- `description`: Generate descriptions for segments
|
113
120
|
|
114
|
-
|
115
|
-
- How the content is produced (rule-based vs. LLM).
|
116
|
-
- The output format (`Html` or `Markdown`).
|
121
|
+
**Defaults per segment type:** Check the documentation for more details.
|
117
122
|
|
118
|
-
|
119
|
-
**descriptions** further refine behaviour.
|
120
|
-
|
121
|
-
**Default strategy per segment**
|
122
|
-
|
123
|
-
- `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` → **Auto**
|
124
|
-
(Markdown, description off)
|
125
|
-
- `Table` → **LLM** (HTML, description on)
|
126
|
-
- `Picture` → **LLM** (Markdown, description off, cropping _All_)
|
127
|
-
- `Formula`, `Page` → **LLM** (Markdown, description off)
|
128
|
-
- `PageHeader`, `PageFooter` → **Ignore** (removed from output)
|
129
|
-
|
130
|
-
**Strategy reference**
|
131
|
-
|
132
|
-
- **Auto** – rule-based content generation.
|
133
|
-
- **LLM** – generate content with an LLM.
|
134
|
-
- **Ignore** – exclude the segment entirely.
|
123
|
+
Only specify the fields you want to change - everything else uses the defaults.
|
135
124
|
|
136
125
|
segmentation_strategy:
|
137
126
|
Controls the segmentation strategy:
|
138
127
|
|
139
128
|
- `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
|
140
129
|
`Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
|
141
|
-
segmentation and better chunking.
|
130
|
+
segmentation and better chunking.
|
142
131
|
- `Page`: Treats each page as a single segment. Faster processing, but without
|
143
132
|
layout element detection and only simple chunking.
|
144
133
|
|
@@ -176,104 +165,46 @@ class ParseResource(SyncAPIResource):
|
|
176
165
|
timeout=timeout,
|
177
166
|
idempotency_key=idempotency_key,
|
178
167
|
),
|
179
|
-
cast_to=
|
168
|
+
cast_to=ParseCreateResponse,
|
180
169
|
)
|
181
170
|
|
182
|
-
def
|
171
|
+
def get(
|
183
172
|
self,
|
184
|
-
task_id: str,
|
173
|
+
task_id: Optional[str],
|
185
174
|
*,
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
high_resolution: Optional[bool] | NotGiven = NOT_GIVEN,
|
190
|
-
llm_processing: Optional[parse_update_params.LlmProcessing] | NotGiven = NOT_GIVEN,
|
191
|
-
ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
|
192
|
-
pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
|
193
|
-
segment_processing: Optional[parse_update_params.SegmentProcessing] | NotGiven = NOT_GIVEN,
|
194
|
-
segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
|
175
|
+
base64_urls: bool | NotGiven = NOT_GIVEN,
|
176
|
+
include_chunks: bool | NotGiven = NOT_GIVEN,
|
177
|
+
wait_for_completion: bool | NotGiven = NOT_GIVEN,
|
195
178
|
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
196
179
|
# The extra values given here take precedence over values defined on the client or passed to this method.
|
197
180
|
extra_headers: Headers | None = None,
|
198
181
|
extra_query: Query | None = None,
|
199
182
|
extra_body: Body | None = None,
|
200
183
|
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
The
|
206
|
-
current configuration is used as the base; only provided fields are changed.
|
184
|
+
) -> ParseGetResponse:
|
185
|
+
"""
|
186
|
+
Retrieves the current state of a parse task and, when requested, can wait for
|
187
|
+
completion.
|
207
188
|
|
208
|
-
|
189
|
+
Returns task details such as processing status, configuration, output (when
|
190
|
+
available), file metadata, and timestamps. If `wait_for_completion=true` is
|
191
|
+
provided, the server will hold the request briefly. If the task does not reach a
|
192
|
+
terminal state during that window, the response will indicate a retry with
|
193
|
+
appropriate headers.
|
209
194
|
|
210
|
-
|
211
|
-
- The new configuration must differ from the current configuration.
|
195
|
+
Typical uses:
|
212
196
|
|
213
|
-
|
197
|
+
- Poll a task during processing
|
198
|
+
- Retrieve the final output once processing is complete
|
199
|
+
- Access task metadata and configuration
|
214
200
|
|
215
201
|
Args:
|
216
|
-
|
217
|
-
|
218
|
-
error_handling:
|
219
|
-
Controls how errors are handled during processing:
|
220
|
-
|
221
|
-
- `Fail`: Stops processing and fails the task when any error occurs
|
222
|
-
- `Continue`: Attempts to continue processing despite non-critical errors (eg.
|
223
|
-
LLM refusals etc.)
|
224
|
-
|
225
|
-
expires_in: The number of seconds until task is deleted. Expired tasks can **not** be
|
226
|
-
updated, polled or accessed via web interface.
|
227
|
-
|
228
|
-
high_resolution: Whether to use high-resolution images for cropping and post-processing. (Latency
|
229
|
-
penalty: ~7 seconds per page)
|
230
|
-
|
231
|
-
llm_processing: Controls the LLM used for the task.
|
232
|
-
|
233
|
-
ocr_strategy: Controls the Optical Character Recognition (OCR) strategy.
|
234
|
-
|
235
|
-
- `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
|
236
|
-
- `Auto`: Selectively applies OCR only to pages with missing or low-quality
|
237
|
-
text. When text layer is present the bounding boxes from the text layer are
|
238
|
-
used.
|
239
|
-
|
240
|
-
pipeline: Choose the provider whose models will be used for segmentation and OCR. The
|
241
|
-
output will be unified to the Chunkr `output` format.
|
202
|
+
base64_urls: Whether to return base64 encoded URLs. If false, the URLs will be returned as
|
203
|
+
presigned URLs.
|
242
204
|
|
243
|
-
|
205
|
+
include_chunks: Whether to include chunks in the output response
|
244
206
|
|
245
|
-
|
246
|
-
|
247
|
-
- Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
|
248
|
-
- How the content is produced (rule-based vs. LLM).
|
249
|
-
- The output format (`Html` or `Markdown`).
|
250
|
-
|
251
|
-
Optional flags such as image **cropping**, **extended context**, and
|
252
|
-
**descriptions** further refine behaviour.
|
253
|
-
|
254
|
-
**Default strategy per segment**
|
255
|
-
|
256
|
-
- `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` → **Auto**
|
257
|
-
(Markdown, description off)
|
258
|
-
- `Table` → **LLM** (HTML, description on)
|
259
|
-
- `Picture` → **LLM** (Markdown, description off, cropping _All_)
|
260
|
-
- `Formula`, `Page` → **LLM** (Markdown, description off)
|
261
|
-
- `PageHeader`, `PageFooter` → **Ignore** (removed from output)
|
262
|
-
|
263
|
-
**Strategy reference**
|
264
|
-
|
265
|
-
- **Auto** – rule-based content generation.
|
266
|
-
- **LLM** – generate content with an LLM.
|
267
|
-
- **Ignore** – exclude the segment entirely.
|
268
|
-
|
269
|
-
segmentation_strategy:
|
270
|
-
Controls the segmentation strategy:
|
271
|
-
|
272
|
-
- `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
|
273
|
-
`Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
|
274
|
-
segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
|
275
|
-
- `Page`: Treats each page as a single segment. Faster processing, but without
|
276
|
-
layout element detection and only simple chunking.
|
207
|
+
wait_for_completion: Whether to wait for the task to complete
|
277
208
|
|
278
209
|
extra_headers: Send extra headers
|
279
210
|
|
@@ -282,35 +213,26 @@ class ParseResource(SyncAPIResource):
|
|
282
213
|
extra_body: Add additional JSON properties to the request
|
283
214
|
|
284
215
|
timeout: Override the client-level default timeout for this request, in seconds
|
285
|
-
|
286
|
-
idempotency_key: Specify a custom idempotency key for this request
|
287
216
|
"""
|
288
217
|
if not task_id:
|
289
218
|
raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}")
|
290
|
-
return self.
|
291
|
-
f"/tasks/
|
292
|
-
body=maybe_transform(
|
293
|
-
{
|
294
|
-
"chunk_processing": chunk_processing,
|
295
|
-
"error_handling": error_handling,
|
296
|
-
"expires_in": expires_in,
|
297
|
-
"high_resolution": high_resolution,
|
298
|
-
"llm_processing": llm_processing,
|
299
|
-
"ocr_strategy": ocr_strategy,
|
300
|
-
"pipeline": pipeline,
|
301
|
-
"segment_processing": segment_processing,
|
302
|
-
"segmentation_strategy": segmentation_strategy,
|
303
|
-
},
|
304
|
-
parse_update_params.ParseUpdateParams,
|
305
|
-
),
|
219
|
+
return self._get(
|
220
|
+
f"/tasks/{task_id}/parse",
|
306
221
|
options=make_request_options(
|
307
222
|
extra_headers=extra_headers,
|
308
223
|
extra_query=extra_query,
|
309
224
|
extra_body=extra_body,
|
310
225
|
timeout=timeout,
|
311
|
-
|
226
|
+
query=maybe_transform(
|
227
|
+
{
|
228
|
+
"base64_urls": base64_urls,
|
229
|
+
"include_chunks": include_chunks,
|
230
|
+
"wait_for_completion": wait_for_completion,
|
231
|
+
},
|
232
|
+
parse_get_params.ParseGetParams,
|
233
|
+
),
|
312
234
|
),
|
313
|
-
cast_to=
|
235
|
+
cast_to=ParseGetResponse,
|
314
236
|
)
|
315
237
|
|
316
238
|
|
@@ -338,15 +260,15 @@ class AsyncParseResource(AsyncAPIResource):
|
|
338
260
|
self,
|
339
261
|
*,
|
340
262
|
file: str,
|
341
|
-
chunk_processing:
|
342
|
-
error_handling:
|
263
|
+
chunk_processing: ChunkProcessingParam | NotGiven = NOT_GIVEN,
|
264
|
+
error_handling: Literal["Fail", "Continue"] | NotGiven = NOT_GIVEN,
|
343
265
|
expires_in: Optional[int] | NotGiven = NOT_GIVEN,
|
344
266
|
file_name: Optional[str] | NotGiven = NOT_GIVEN,
|
345
|
-
llm_processing:
|
346
|
-
ocr_strategy:
|
347
|
-
pipeline:
|
348
|
-
segment_processing: Optional[
|
349
|
-
segmentation_strategy:
|
267
|
+
llm_processing: LlmProcessingParam | NotGiven = NOT_GIVEN,
|
268
|
+
ocr_strategy: Literal["All", "Auto"] | NotGiven = NOT_GIVEN,
|
269
|
+
pipeline: Literal["Azure", "Chunkr"] | NotGiven = NOT_GIVEN,
|
270
|
+
segment_processing: Optional[SegmentProcessingParam] | NotGiven = NOT_GIVEN,
|
271
|
+
segmentation_strategy: Literal["LayoutAnalysis", "Page"] | NotGiven = NOT_GIVEN,
|
350
272
|
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
351
273
|
# The extra values given here take precedence over values defined on the client or passed to this method.
|
352
274
|
extra_headers: Headers | None = None,
|
@@ -354,17 +276,17 @@ class AsyncParseResource(AsyncAPIResource):
|
|
354
276
|
extra_body: Body | None = None,
|
355
277
|
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
356
278
|
idempotency_key: str | None = None,
|
357
|
-
) ->
|
279
|
+
) -> ParseCreateResponse:
|
358
280
|
"""
|
359
281
|
Queues a document for processing and returns a `TaskResponse` with the assigned
|
360
282
|
`task_id`, initial configuration, file metadata, and timestamps. The initial
|
361
283
|
status is `Starting`.
|
362
284
|
|
363
|
-
Creates a task and returns its metadata immediately.
|
285
|
+
Creates a parse task and returns its metadata immediately.
|
364
286
|
|
365
287
|
Args:
|
366
288
|
file:
|
367
|
-
The file to be
|
289
|
+
The file to be parsed. Supported inputs:
|
368
290
|
|
369
291
|
- `ch://files/{file_id}`: Reference to an existing file. Upload via the Files
|
370
292
|
API
|
@@ -383,7 +305,7 @@ class AsyncParseResource(AsyncAPIResource):
|
|
383
305
|
expires_in: The number of seconds until task is deleted. Expired tasks can **not** be
|
384
306
|
updated, polled or accessed via web interface.
|
385
307
|
|
386
|
-
file_name: The name of the file to be
|
308
|
+
file_name: The name of the file to be parsed. If not set a name will be generated.
|
387
309
|
|
388
310
|
llm_processing: Controls the LLM used for the task.
|
389
311
|
|
@@ -394,41 +316,26 @@ class AsyncParseResource(AsyncAPIResource):
|
|
394
316
|
text. When text layer is present the bounding boxes from the text layer are
|
395
317
|
used.
|
396
318
|
|
397
|
-
|
398
|
-
output will be unified to the Chunkr `output` format.
|
319
|
+
segment_processing: Configuration for how each document segment is processed and formatted.
|
399
320
|
|
400
|
-
|
321
|
+
Each segment has sensible defaults, but you can override specific settings:
|
401
322
|
|
402
|
-
|
323
|
+
- `format`: Output as `Html` or `Markdown`
|
324
|
+
- `strategy`: `Auto` (rule-based), `LLM` (AI-generated), or `Ignore` (skip)
|
325
|
+
- `crop_image`: Whether to crop images to segment bounds
|
326
|
+
- `extended_context`: Use full page as context for LLM processing
|
327
|
+
- `description`: Generate descriptions for segments
|
403
328
|
|
404
|
-
|
405
|
-
- How the content is produced (rule-based vs. LLM).
|
406
|
-
- The output format (`Html` or `Markdown`).
|
329
|
+
**Defaults per segment type:** Check the documentation for more details.
|
407
330
|
|
408
|
-
|
409
|
-
**descriptions** further refine behaviour.
|
410
|
-
|
411
|
-
**Default strategy per segment**
|
412
|
-
|
413
|
-
- `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` → **Auto**
|
414
|
-
(Markdown, description off)
|
415
|
-
- `Table` → **LLM** (HTML, description on)
|
416
|
-
- `Picture` → **LLM** (Markdown, description off, cropping _All_)
|
417
|
-
- `Formula`, `Page` → **LLM** (Markdown, description off)
|
418
|
-
- `PageHeader`, `PageFooter` → **Ignore** (removed from output)
|
419
|
-
|
420
|
-
**Strategy reference**
|
421
|
-
|
422
|
-
- **Auto** – rule-based content generation.
|
423
|
-
- **LLM** – generate content with an LLM.
|
424
|
-
- **Ignore** – exclude the segment entirely.
|
331
|
+
Only specify the fields you want to change - everything else uses the defaults.
|
425
332
|
|
426
333
|
segmentation_strategy:
|
427
334
|
Controls the segmentation strategy:
|
428
335
|
|
429
336
|
- `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
|
430
337
|
`Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
|
431
|
-
segmentation and better chunking.
|
338
|
+
segmentation and better chunking.
|
432
339
|
- `Page`: Treats each page as a single segment. Faster processing, but without
|
433
340
|
layout element detection and only simple chunking.
|
434
341
|
|
@@ -466,104 +373,46 @@ class AsyncParseResource(AsyncAPIResource):
|
|
466
373
|
timeout=timeout,
|
467
374
|
idempotency_key=idempotency_key,
|
468
375
|
),
|
469
|
-
cast_to=
|
376
|
+
cast_to=ParseCreateResponse,
|
470
377
|
)
|
471
378
|
|
472
|
-
async def
|
379
|
+
async def get(
|
473
380
|
self,
|
474
|
-
task_id: str,
|
381
|
+
task_id: Optional[str],
|
475
382
|
*,
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
high_resolution: Optional[bool] | NotGiven = NOT_GIVEN,
|
480
|
-
llm_processing: Optional[parse_update_params.LlmProcessing] | NotGiven = NOT_GIVEN,
|
481
|
-
ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
|
482
|
-
pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
|
483
|
-
segment_processing: Optional[parse_update_params.SegmentProcessing] | NotGiven = NOT_GIVEN,
|
484
|
-
segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
|
383
|
+
base64_urls: bool | NotGiven = NOT_GIVEN,
|
384
|
+
include_chunks: bool | NotGiven = NOT_GIVEN,
|
385
|
+
wait_for_completion: bool | NotGiven = NOT_GIVEN,
|
485
386
|
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
486
387
|
# The extra values given here take precedence over values defined on the client or passed to this method.
|
487
388
|
extra_headers: Headers | None = None,
|
488
389
|
extra_query: Query | None = None,
|
489
390
|
extra_body: Body | None = None,
|
490
391
|
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
The
|
496
|
-
current configuration is used as the base; only provided fields are changed.
|
392
|
+
) -> ParseGetResponse:
|
393
|
+
"""
|
394
|
+
Retrieves the current state of a parse task and, when requested, can wait for
|
395
|
+
completion.
|
497
396
|
|
498
|
-
|
397
|
+
Returns task details such as processing status, configuration, output (when
|
398
|
+
available), file metadata, and timestamps. If `wait_for_completion=true` is
|
399
|
+
provided, the server will hold the request briefly. If the task does not reach a
|
400
|
+
terminal state during that window, the response will indicate a retry with
|
401
|
+
appropriate headers.
|
499
402
|
|
500
|
-
|
501
|
-
- The new configuration must differ from the current configuration.
|
403
|
+
Typical uses:
|
502
404
|
|
503
|
-
|
405
|
+
- Poll a task during processing
|
406
|
+
- Retrieve the final output once processing is complete
|
407
|
+
- Access task metadata and configuration
|
504
408
|
|
505
409
|
Args:
|
506
|
-
|
507
|
-
|
508
|
-
error_handling:
|
509
|
-
Controls how errors are handled during processing:
|
510
|
-
|
511
|
-
- `Fail`: Stops processing and fails the task when any error occurs
|
512
|
-
- `Continue`: Attempts to continue processing despite non-critical errors (eg.
|
513
|
-
LLM refusals etc.)
|
514
|
-
|
515
|
-
expires_in: The number of seconds until task is deleted. Expired tasks can **not** be
|
516
|
-
updated, polled or accessed via web interface.
|
517
|
-
|
518
|
-
high_resolution: Whether to use high-resolution images for cropping and post-processing. (Latency
|
519
|
-
penalty: ~7 seconds per page)
|
520
|
-
|
521
|
-
llm_processing: Controls the LLM used for the task.
|
522
|
-
|
523
|
-
ocr_strategy: Controls the Optical Character Recognition (OCR) strategy.
|
524
|
-
|
525
|
-
- `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
|
526
|
-
- `Auto`: Selectively applies OCR only to pages with missing or low-quality
|
527
|
-
text. When text layer is present the bounding boxes from the text layer are
|
528
|
-
used.
|
529
|
-
|
530
|
-
pipeline: Choose the provider whose models will be used for segmentation and OCR. The
|
531
|
-
output will be unified to the Chunkr `output` format.
|
410
|
+
base64_urls: Whether to return base64 encoded URLs. If false, the URLs will be returned as
|
411
|
+
presigned URLs.
|
532
412
|
|
533
|
-
|
413
|
+
include_chunks: Whether to include chunks in the output response
|
534
414
|
|
535
|
-
|
536
|
-
|
537
|
-
- Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
|
538
|
-
- How the content is produced (rule-based vs. LLM).
|
539
|
-
- The output format (`Html` or `Markdown`).
|
540
|
-
|
541
|
-
Optional flags such as image **cropping**, **extended context**, and
|
542
|
-
**descriptions** further refine behaviour.
|
543
|
-
|
544
|
-
**Default strategy per segment**
|
545
|
-
|
546
|
-
- `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` → **Auto**
|
547
|
-
(Markdown, description off)
|
548
|
-
- `Table` → **LLM** (HTML, description on)
|
549
|
-
- `Picture` → **LLM** (Markdown, description off, cropping _All_)
|
550
|
-
- `Formula`, `Page` → **LLM** (Markdown, description off)
|
551
|
-
- `PageHeader`, `PageFooter` → **Ignore** (removed from output)
|
552
|
-
|
553
|
-
**Strategy reference**
|
554
|
-
|
555
|
-
- **Auto** – rule-based content generation.
|
556
|
-
- **LLM** – generate content with an LLM.
|
557
|
-
- **Ignore** – exclude the segment entirely.
|
558
|
-
|
559
|
-
segmentation_strategy:
|
560
|
-
Controls the segmentation strategy:
|
561
|
-
|
562
|
-
- `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
|
563
|
-
`Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
|
564
|
-
segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
|
565
|
-
- `Page`: Treats each page as a single segment. Faster processing, but without
|
566
|
-
layout element detection and only simple chunking.
|
415
|
+
wait_for_completion: Whether to wait for the task to complete
|
567
416
|
|
568
417
|
extra_headers: Send extra headers
|
569
418
|
|
@@ -572,35 +421,26 @@ class AsyncParseResource(AsyncAPIResource):
|
|
572
421
|
extra_body: Add additional JSON properties to the request
|
573
422
|
|
574
423
|
timeout: Override the client-level default timeout for this request, in seconds
|
575
|
-
|
576
|
-
idempotency_key: Specify a custom idempotency key for this request
|
577
424
|
"""
|
578
425
|
if not task_id:
|
579
426
|
raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}")
|
580
|
-
return await self.
|
581
|
-
f"/tasks/
|
582
|
-
body=await async_maybe_transform(
|
583
|
-
{
|
584
|
-
"chunk_processing": chunk_processing,
|
585
|
-
"error_handling": error_handling,
|
586
|
-
"expires_in": expires_in,
|
587
|
-
"high_resolution": high_resolution,
|
588
|
-
"llm_processing": llm_processing,
|
589
|
-
"ocr_strategy": ocr_strategy,
|
590
|
-
"pipeline": pipeline,
|
591
|
-
"segment_processing": segment_processing,
|
592
|
-
"segmentation_strategy": segmentation_strategy,
|
593
|
-
},
|
594
|
-
parse_update_params.ParseUpdateParams,
|
595
|
-
),
|
427
|
+
return await self._get(
|
428
|
+
f"/tasks/{task_id}/parse",
|
596
429
|
options=make_request_options(
|
597
430
|
extra_headers=extra_headers,
|
598
431
|
extra_query=extra_query,
|
599
432
|
extra_body=extra_body,
|
600
433
|
timeout=timeout,
|
601
|
-
|
434
|
+
query=await async_maybe_transform(
|
435
|
+
{
|
436
|
+
"base64_urls": base64_urls,
|
437
|
+
"include_chunks": include_chunks,
|
438
|
+
"wait_for_completion": wait_for_completion,
|
439
|
+
},
|
440
|
+
parse_get_params.ParseGetParams,
|
441
|
+
),
|
602
442
|
),
|
603
|
-
cast_to=
|
443
|
+
cast_to=ParseGetResponse,
|
604
444
|
)
|
605
445
|
|
606
446
|
|
@@ -611,8 +451,8 @@ class ParseResourceWithRawResponse:
|
|
611
451
|
self.create = to_raw_response_wrapper(
|
612
452
|
parse.create,
|
613
453
|
)
|
614
|
-
self.
|
615
|
-
parse.
|
454
|
+
self.get = to_raw_response_wrapper(
|
455
|
+
parse.get,
|
616
456
|
)
|
617
457
|
|
618
458
|
|
@@ -623,8 +463,8 @@ class AsyncParseResourceWithRawResponse:
|
|
623
463
|
self.create = async_to_raw_response_wrapper(
|
624
464
|
parse.create,
|
625
465
|
)
|
626
|
-
self.
|
627
|
-
parse.
|
466
|
+
self.get = async_to_raw_response_wrapper(
|
467
|
+
parse.get,
|
628
468
|
)
|
629
469
|
|
630
470
|
|
@@ -635,8 +475,8 @@ class ParseResourceWithStreamingResponse:
|
|
635
475
|
self.create = to_streamed_response_wrapper(
|
636
476
|
parse.create,
|
637
477
|
)
|
638
|
-
self.
|
639
|
-
parse.
|
478
|
+
self.get = to_streamed_response_wrapper(
|
479
|
+
parse.get,
|
640
480
|
)
|
641
481
|
|
642
482
|
|
@@ -647,6 +487,6 @@ class AsyncParseResourceWithStreamingResponse:
|
|
647
487
|
self.create = async_to_streamed_response_wrapper(
|
648
488
|
parse.create,
|
649
489
|
)
|
650
|
-
self.
|
651
|
-
parse.
|
490
|
+
self.get = async_to_streamed_response_wrapper(
|
491
|
+
parse.get,
|
652
492
|
)
|