chunkr-ai 0.1.0a5__py3-none-any.whl → 0.1.0a7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunkr_ai/__init__.py +2 -0
- chunkr_ai/_client.py +31 -3
- chunkr_ai/_constants.py +5 -5
- chunkr_ai/_exceptions.py +4 -0
- chunkr_ai/_models.py +1 -1
- chunkr_ai/_types.py +35 -1
- chunkr_ai/_utils/__init__.py +1 -0
- chunkr_ai/_utils/_typing.py +5 -0
- chunkr_ai/_version.py +1 -1
- chunkr_ai/resources/__init__.py +14 -0
- chunkr_ai/resources/files.py +3 -3
- chunkr_ai/resources/tasks/__init__.py +14 -0
- chunkr_ai/resources/tasks/extract.py +409 -0
- chunkr_ai/resources/tasks/parse.py +102 -346
- chunkr_ai/resources/tasks/tasks.py +62 -14
- chunkr_ai/resources/webhooks.py +193 -0
- chunkr_ai/types/__init__.py +27 -1
- chunkr_ai/types/bounding_box.py +19 -0
- chunkr_ai/types/cell.py +39 -0
- chunkr_ai/types/cell_style.py +28 -0
- chunkr_ai/types/chunk.py +40 -0
- chunkr_ai/types/chunk_processing.py +40 -0
- chunkr_ai/types/chunk_processing_param.py +42 -0
- chunkr_ai/types/extract_configuration.py +24 -0
- chunkr_ai/types/extract_output_response.py +19 -0
- chunkr_ai/types/file_create_params.py +2 -1
- chunkr_ai/types/file_info.py +21 -0
- chunkr_ai/types/generation_config.py +29 -0
- chunkr_ai/types/generation_config_param.py +29 -0
- chunkr_ai/types/llm_processing.py +36 -0
- chunkr_ai/types/llm_processing_param.py +36 -0
- chunkr_ai/types/ocr_result.py +28 -0
- chunkr_ai/types/page.py +27 -0
- chunkr_ai/types/parse_configuration.py +64 -0
- chunkr_ai/types/parse_configuration_param.py +65 -0
- chunkr_ai/types/parse_output_response.py +29 -0
- chunkr_ai/types/segment.py +109 -0
- chunkr_ai/types/segment_processing.py +228 -0
- chunkr_ai/types/segment_processing_param.py +229 -0
- chunkr_ai/types/task_extract_updated_webhook_event.py +22 -0
- chunkr_ai/types/task_list_params.py +7 -1
- chunkr_ai/types/task_parse_updated_webhook_event.py +22 -0
- chunkr_ai/types/task_response.py +68 -0
- chunkr_ai/types/tasks/__init__.py +7 -1
- chunkr_ai/types/tasks/extract_create_params.py +47 -0
- chunkr_ai/types/tasks/extract_create_response.py +214 -0
- chunkr_ai/types/tasks/extract_get_params.py +21 -0
- chunkr_ai/types/tasks/extract_get_response.py +214 -0
- chunkr_ai/types/tasks/parse_create_params.py +25 -805
- chunkr_ai/types/tasks/parse_create_response.py +55 -0
- chunkr_ai/types/tasks/parse_get_params.py +21 -0
- chunkr_ai/types/tasks/parse_get_response.py +55 -0
- chunkr_ai/types/unwrap_webhook_event.py +11 -0
- chunkr_ai/types/version_info.py +31 -0
- chunkr_ai/types/webhook_url_response.py +9 -0
- {chunkr_ai-0.1.0a5.dist-info → chunkr_ai-0.1.0a7.dist-info}/METADATA +14 -13
- chunkr_ai-0.1.0a7.dist-info/RECORD +86 -0
- chunkr_ai/types/task.py +0 -1225
- chunkr_ai/types/tasks/parse_update_params.py +0 -857
- chunkr_ai-0.1.0a5.dist-info/RECORD +0 -52
- {chunkr_ai-0.1.0a5.dist-info → chunkr_ai-0.1.0a7.dist-info}/WHEEL +0 -0
- {chunkr_ai-0.1.0a5.dist-info → chunkr_ai-0.1.0a7.dist-info}/licenses/LICENSE +0 -0
@@ -17,9 +17,13 @@ from ..._response import (
|
|
17
17
|
async_to_raw_response_wrapper,
|
18
18
|
async_to_streamed_response_wrapper,
|
19
19
|
)
|
20
|
-
from ...types.
|
21
|
-
from ...types.tasks import parse_create_params, parse_update_params
|
20
|
+
from ...types.tasks import parse_get_params, parse_create_params
|
22
21
|
from ..._base_client import make_request_options
|
22
|
+
from ...types.llm_processing_param import LlmProcessingParam
|
23
|
+
from ...types.chunk_processing_param import ChunkProcessingParam
|
24
|
+
from ...types.segment_processing_param import SegmentProcessingParam
|
25
|
+
from ...types.tasks.parse_get_response import ParseGetResponse
|
26
|
+
from ...types.tasks.parse_create_response import ParseCreateResponse
|
23
27
|
|
24
28
|
__all__ = ["ParseResource", "AsyncParseResource"]
|
25
29
|
|
@@ -48,18 +52,15 @@ class ParseResource(SyncAPIResource):
|
|
48
52
|
self,
|
49
53
|
*,
|
50
54
|
file: str,
|
51
|
-
|
52
|
-
|
53
|
-
wait_for_completion: bool | NotGiven = NOT_GIVEN,
|
54
|
-
chunk_processing: Optional[parse_create_params.ChunkProcessing] | NotGiven = NOT_GIVEN,
|
55
|
-
error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
|
55
|
+
chunk_processing: ChunkProcessingParam | NotGiven = NOT_GIVEN,
|
56
|
+
error_handling: Literal["Fail", "Continue"] | NotGiven = NOT_GIVEN,
|
56
57
|
expires_in: Optional[int] | NotGiven = NOT_GIVEN,
|
57
58
|
file_name: Optional[str] | NotGiven = NOT_GIVEN,
|
58
|
-
llm_processing:
|
59
|
-
ocr_strategy:
|
60
|
-
pipeline:
|
61
|
-
segment_processing: Optional[
|
62
|
-
segmentation_strategy:
|
59
|
+
llm_processing: LlmProcessingParam | NotGiven = NOT_GIVEN,
|
60
|
+
ocr_strategy: Literal["All", "Auto"] | NotGiven = NOT_GIVEN,
|
61
|
+
pipeline: Literal["Azure", "Chunkr"] | NotGiven = NOT_GIVEN,
|
62
|
+
segment_processing: Optional[SegmentProcessingParam] | NotGiven = NOT_GIVEN,
|
63
|
+
segmentation_strategy: Literal["LayoutAnalysis", "Page"] | NotGiven = NOT_GIVEN,
|
63
64
|
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
64
65
|
# The extra values given here take precedence over values defined on the client or passed to this method.
|
65
66
|
extra_headers: Headers | None = None,
|
@@ -67,33 +68,23 @@ class ParseResource(SyncAPIResource):
|
|
67
68
|
extra_body: Body | None = None,
|
68
69
|
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
69
70
|
idempotency_key: str | None = None,
|
70
|
-
) ->
|
71
|
+
) -> ParseCreateResponse:
|
71
72
|
"""
|
72
73
|
Queues a document for processing and returns a `TaskResponse` with the assigned
|
73
74
|
`task_id`, initial configuration, file metadata, and timestamps. The initial
|
74
75
|
status is `Starting`.
|
75
76
|
|
76
|
-
|
77
|
-
completion. If the task completes within that window, a 200 response with the
|
78
|
-
final `TaskResponse` is returned. Otherwise, the server returns a 408 or 409
|
79
|
-
with retry guidance and a body describing how long to wait before retrying.
|
77
|
+
Creates a parse task and returns its metadata immediately.
|
80
78
|
|
81
79
|
Args:
|
82
80
|
file:
|
83
|
-
The file to be
|
81
|
+
The file to be parsed. Supported inputs:
|
84
82
|
|
85
83
|
- `ch://files/{file_id}`: Reference to an existing file. Upload via the Files
|
86
84
|
API
|
87
85
|
- `http(s)://...`: Remote URL to fetch
|
88
86
|
- `data:*;base64,...` or raw base64 string
|
89
87
|
|
90
|
-
base64_urls: Whether to return base64 encoded URLs. If false, presigned URLs are returned.
|
91
|
-
|
92
|
-
include_chunks: Whether to include chunks in the output response
|
93
|
-
|
94
|
-
wait_for_completion: If true, server holds briefly and may return 200 when done; otherwise returns
|
95
|
-
408/409 with Retry-After headers
|
96
|
-
|
97
88
|
chunk_processing: Controls the setting for the chunking and post-processing of each chunk.
|
98
89
|
|
99
90
|
error_handling:
|
@@ -106,7 +97,7 @@ class ParseResource(SyncAPIResource):
|
|
106
97
|
expires_in: The number of seconds until task is deleted. Expired tasks can **not** be
|
107
98
|
updated, polled or accessed via web interface.
|
108
99
|
|
109
|
-
file_name: The name of the file to be
|
100
|
+
file_name: The name of the file to be parsed. If not set a name will be generated.
|
110
101
|
|
111
102
|
llm_processing: Controls the LLM used for the task.
|
112
103
|
|
@@ -117,41 +108,26 @@ class ParseResource(SyncAPIResource):
|
|
117
108
|
text. When text layer is present the bounding boxes from the text layer are
|
118
109
|
used.
|
119
110
|
|
120
|
-
|
121
|
-
output will be unified to the Chunkr `output` format.
|
122
|
-
|
123
|
-
segment_processing: Defines how each segment type is handled when generating the final output.
|
124
|
-
|
125
|
-
Each segment uses one of three strategies. The chosen strategy controls:
|
126
|
-
|
127
|
-
- Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
|
128
|
-
- How the content is produced (rule-based vs. LLM).
|
129
|
-
- The output format (`Html` or `Markdown`).
|
111
|
+
segment_processing: Configuration for how each document segment is processed and formatted.
|
130
112
|
|
131
|
-
|
132
|
-
**descriptions** further refine behaviour.
|
113
|
+
Each segment has sensible defaults, but you can override specific settings:
|
133
114
|
|
134
|
-
|
115
|
+
- `format`: Output as `Html` or `Markdown`
|
116
|
+
- `strategy`: `Auto` (rule-based), `LLM` (AI-generated), or `Ignore` (skip)
|
117
|
+
- `crop_image`: Whether to crop images to segment bounds
|
118
|
+
- `extended_context`: Use full page as context for LLM processing
|
119
|
+
- `description`: Generate descriptions for segments
|
135
120
|
|
136
|
-
|
137
|
-
(Markdown, description off)
|
138
|
-
- `Table` → **LLM** (HTML, description on)
|
139
|
-
- `Picture` → **LLM** (Markdown, description off, cropping _All_)
|
140
|
-
- `Formula`, `Page` → **LLM** (Markdown, description off)
|
141
|
-
- `PageHeader`, `PageFooter` → **Ignore** (removed from output)
|
121
|
+
**Defaults per segment type:** Check the documentation for more details.
|
142
122
|
|
143
|
-
|
144
|
-
|
145
|
-
- **Auto** – rule-based content generation.
|
146
|
-
- **LLM** – generate content with an LLM.
|
147
|
-
- **Ignore** – exclude the segment entirely.
|
123
|
+
Only specify the fields you want to change - everything else uses the defaults.
|
148
124
|
|
149
125
|
segmentation_strategy:
|
150
126
|
Controls the segmentation strategy:
|
151
127
|
|
152
128
|
- `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
|
153
129
|
`Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
|
154
|
-
segmentation and better chunking.
|
130
|
+
segmentation and better chunking.
|
155
131
|
- `Page`: Treats each page as a single segment. Faster processing, but without
|
156
132
|
layout element detection and only simple chunking.
|
157
133
|
|
@@ -188,126 +164,47 @@ class ParseResource(SyncAPIResource):
|
|
188
164
|
extra_body=extra_body,
|
189
165
|
timeout=timeout,
|
190
166
|
idempotency_key=idempotency_key,
|
191
|
-
query=maybe_transform(
|
192
|
-
{
|
193
|
-
"base64_urls": base64_urls,
|
194
|
-
"include_chunks": include_chunks,
|
195
|
-
"wait_for_completion": wait_for_completion,
|
196
|
-
},
|
197
|
-
parse_create_params.ParseCreateParams,
|
198
|
-
),
|
199
167
|
),
|
200
|
-
cast_to=
|
168
|
+
cast_to=ParseCreateResponse,
|
201
169
|
)
|
202
170
|
|
203
|
-
def
|
171
|
+
def get(
|
204
172
|
self,
|
205
|
-
task_id: str,
|
173
|
+
task_id: Optional[str],
|
206
174
|
*,
|
207
175
|
base64_urls: bool | NotGiven = NOT_GIVEN,
|
208
176
|
include_chunks: bool | NotGiven = NOT_GIVEN,
|
209
177
|
wait_for_completion: bool | NotGiven = NOT_GIVEN,
|
210
|
-
chunk_processing: Optional[parse_update_params.ChunkProcessing] | NotGiven = NOT_GIVEN,
|
211
|
-
error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
|
212
|
-
expires_in: Optional[int] | NotGiven = NOT_GIVEN,
|
213
|
-
high_resolution: Optional[bool] | NotGiven = NOT_GIVEN,
|
214
|
-
llm_processing: Optional[parse_update_params.LlmProcessing] | NotGiven = NOT_GIVEN,
|
215
|
-
ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
|
216
|
-
pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
|
217
|
-
segment_processing: Optional[parse_update_params.SegmentProcessing] | NotGiven = NOT_GIVEN,
|
218
|
-
segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
|
219
178
|
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
220
179
|
# The extra values given here take precedence over values defined on the client or passed to this method.
|
221
180
|
extra_headers: Headers | None = None,
|
222
181
|
extra_query: Query | None = None,
|
223
182
|
extra_body: Body | None = None,
|
224
183
|
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
The
|
230
|
-
current configuration is used as the base; only provided fields are changed.
|
184
|
+
) -> ParseGetResponse:
|
185
|
+
"""
|
186
|
+
Retrieves the current state of a parse task and, when requested, can wait for
|
187
|
+
completion.
|
231
188
|
|
232
|
-
|
189
|
+
Returns task details such as processing status, configuration, output (when
|
190
|
+
available), file metadata, and timestamps. If `wait_for_completion=true` is
|
191
|
+
provided, the server will hold the request briefly. If the task does not reach a
|
192
|
+
terminal state during that window, the response will indicate a retry with
|
193
|
+
appropriate headers.
|
233
194
|
|
234
|
-
|
235
|
-
- The new configuration must differ from the current configuration.
|
195
|
+
Typical uses:
|
236
196
|
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
guidance and a body describing how long to wait before retrying.
|
197
|
+
- Poll a task during processing
|
198
|
+
- Retrieve the final output once processing is complete
|
199
|
+
- Access task metadata and configuration
|
241
200
|
|
242
201
|
Args:
|
243
|
-
base64_urls: Whether to return base64 encoded URLs. If false,
|
202
|
+
base64_urls: Whether to return base64 encoded URLs. If false, the URLs will be returned as
|
203
|
+
presigned URLs.
|
244
204
|
|
245
205
|
include_chunks: Whether to include chunks in the output response
|
246
206
|
|
247
|
-
wait_for_completion:
|
248
|
-
408/409 with Retry-After headers
|
249
|
-
|
250
|
-
chunk_processing: Controls the setting for the chunking and post-processing of each chunk.
|
251
|
-
|
252
|
-
error_handling:
|
253
|
-
Controls how errors are handled during processing:
|
254
|
-
|
255
|
-
- `Fail`: Stops processing and fails the task when any error occurs
|
256
|
-
- `Continue`: Attempts to continue processing despite non-critical errors (eg.
|
257
|
-
LLM refusals etc.)
|
258
|
-
|
259
|
-
expires_in: The number of seconds until task is deleted. Expired tasks can **not** be
|
260
|
-
updated, polled or accessed via web interface.
|
261
|
-
|
262
|
-
high_resolution: Whether to use high-resolution images for cropping and post-processing. (Latency
|
263
|
-
penalty: ~7 seconds per page)
|
264
|
-
|
265
|
-
llm_processing: Controls the LLM used for the task.
|
266
|
-
|
267
|
-
ocr_strategy: Controls the Optical Character Recognition (OCR) strategy.
|
268
|
-
|
269
|
-
- `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
|
270
|
-
- `Auto`: Selectively applies OCR only to pages with missing or low-quality
|
271
|
-
text. When text layer is present the bounding boxes from the text layer are
|
272
|
-
used.
|
273
|
-
|
274
|
-
pipeline: Choose the provider whose models will be used for segmentation and OCR. The
|
275
|
-
output will be unified to the Chunkr `output` format.
|
276
|
-
|
277
|
-
segment_processing: Defines how each segment type is handled when generating the final output.
|
278
|
-
|
279
|
-
Each segment uses one of three strategies. The chosen strategy controls:
|
280
|
-
|
281
|
-
- Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
|
282
|
-
- How the content is produced (rule-based vs. LLM).
|
283
|
-
- The output format (`Html` or `Markdown`).
|
284
|
-
|
285
|
-
Optional flags such as image **cropping**, **extended context**, and
|
286
|
-
**descriptions** further refine behaviour.
|
287
|
-
|
288
|
-
**Default strategy per segment**
|
289
|
-
|
290
|
-
- `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` → **Auto**
|
291
|
-
(Markdown, description off)
|
292
|
-
- `Table` → **LLM** (HTML, description on)
|
293
|
-
- `Picture` → **LLM** (Markdown, description off, cropping _All_)
|
294
|
-
- `Formula`, `Page` → **LLM** (Markdown, description off)
|
295
|
-
- `PageHeader`, `PageFooter` → **Ignore** (removed from output)
|
296
|
-
|
297
|
-
**Strategy reference**
|
298
|
-
|
299
|
-
- **Auto** – rule-based content generation.
|
300
|
-
- **LLM** – generate content with an LLM.
|
301
|
-
- **Ignore** – exclude the segment entirely.
|
302
|
-
|
303
|
-
segmentation_strategy:
|
304
|
-
Controls the segmentation strategy:
|
305
|
-
|
306
|
-
- `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
|
307
|
-
`Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
|
308
|
-
segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
|
309
|
-
- `Page`: Treats each page as a single segment. Faster processing, but without
|
310
|
-
layout element detection and only simple chunking.
|
207
|
+
wait_for_completion: Whether to wait for the task to complete
|
311
208
|
|
312
209
|
extra_headers: Send extra headers
|
313
210
|
|
@@ -316,43 +213,26 @@ class ParseResource(SyncAPIResource):
|
|
316
213
|
extra_body: Add additional JSON properties to the request
|
317
214
|
|
318
215
|
timeout: Override the client-level default timeout for this request, in seconds
|
319
|
-
|
320
|
-
idempotency_key: Specify a custom idempotency key for this request
|
321
216
|
"""
|
322
217
|
if not task_id:
|
323
218
|
raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}")
|
324
|
-
return self.
|
325
|
-
f"/tasks/
|
326
|
-
body=maybe_transform(
|
327
|
-
{
|
328
|
-
"chunk_processing": chunk_processing,
|
329
|
-
"error_handling": error_handling,
|
330
|
-
"expires_in": expires_in,
|
331
|
-
"high_resolution": high_resolution,
|
332
|
-
"llm_processing": llm_processing,
|
333
|
-
"ocr_strategy": ocr_strategy,
|
334
|
-
"pipeline": pipeline,
|
335
|
-
"segment_processing": segment_processing,
|
336
|
-
"segmentation_strategy": segmentation_strategy,
|
337
|
-
},
|
338
|
-
parse_update_params.ParseUpdateParams,
|
339
|
-
),
|
219
|
+
return self._get(
|
220
|
+
f"/tasks/{task_id}/parse",
|
340
221
|
options=make_request_options(
|
341
222
|
extra_headers=extra_headers,
|
342
223
|
extra_query=extra_query,
|
343
224
|
extra_body=extra_body,
|
344
225
|
timeout=timeout,
|
345
|
-
idempotency_key=idempotency_key,
|
346
226
|
query=maybe_transform(
|
347
227
|
{
|
348
228
|
"base64_urls": base64_urls,
|
349
229
|
"include_chunks": include_chunks,
|
350
230
|
"wait_for_completion": wait_for_completion,
|
351
231
|
},
|
352
|
-
|
232
|
+
parse_get_params.ParseGetParams,
|
353
233
|
),
|
354
234
|
),
|
355
|
-
cast_to=
|
235
|
+
cast_to=ParseGetResponse,
|
356
236
|
)
|
357
237
|
|
358
238
|
|
@@ -380,18 +260,15 @@ class AsyncParseResource(AsyncAPIResource):
|
|
380
260
|
self,
|
381
261
|
*,
|
382
262
|
file: str,
|
383
|
-
|
384
|
-
|
385
|
-
wait_for_completion: bool | NotGiven = NOT_GIVEN,
|
386
|
-
chunk_processing: Optional[parse_create_params.ChunkProcessing] | NotGiven = NOT_GIVEN,
|
387
|
-
error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
|
263
|
+
chunk_processing: ChunkProcessingParam | NotGiven = NOT_GIVEN,
|
264
|
+
error_handling: Literal["Fail", "Continue"] | NotGiven = NOT_GIVEN,
|
388
265
|
expires_in: Optional[int] | NotGiven = NOT_GIVEN,
|
389
266
|
file_name: Optional[str] | NotGiven = NOT_GIVEN,
|
390
|
-
llm_processing:
|
391
|
-
ocr_strategy:
|
392
|
-
pipeline:
|
393
|
-
segment_processing: Optional[
|
394
|
-
segmentation_strategy:
|
267
|
+
llm_processing: LlmProcessingParam | NotGiven = NOT_GIVEN,
|
268
|
+
ocr_strategy: Literal["All", "Auto"] | NotGiven = NOT_GIVEN,
|
269
|
+
pipeline: Literal["Azure", "Chunkr"] | NotGiven = NOT_GIVEN,
|
270
|
+
segment_processing: Optional[SegmentProcessingParam] | NotGiven = NOT_GIVEN,
|
271
|
+
segmentation_strategy: Literal["LayoutAnalysis", "Page"] | NotGiven = NOT_GIVEN,
|
395
272
|
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
396
273
|
# The extra values given here take precedence over values defined on the client or passed to this method.
|
397
274
|
extra_headers: Headers | None = None,
|
@@ -399,33 +276,23 @@ class AsyncParseResource(AsyncAPIResource):
|
|
399
276
|
extra_body: Body | None = None,
|
400
277
|
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
401
278
|
idempotency_key: str | None = None,
|
402
|
-
) ->
|
279
|
+
) -> ParseCreateResponse:
|
403
280
|
"""
|
404
281
|
Queues a document for processing and returns a `TaskResponse` with the assigned
|
405
282
|
`task_id`, initial configuration, file metadata, and timestamps. The initial
|
406
283
|
status is `Starting`.
|
407
284
|
|
408
|
-
|
409
|
-
completion. If the task completes within that window, a 200 response with the
|
410
|
-
final `TaskResponse` is returned. Otherwise, the server returns a 408 or 409
|
411
|
-
with retry guidance and a body describing how long to wait before retrying.
|
285
|
+
Creates a parse task and returns its metadata immediately.
|
412
286
|
|
413
287
|
Args:
|
414
288
|
file:
|
415
|
-
The file to be
|
289
|
+
The file to be parsed. Supported inputs:
|
416
290
|
|
417
291
|
- `ch://files/{file_id}`: Reference to an existing file. Upload via the Files
|
418
292
|
API
|
419
293
|
- `http(s)://...`: Remote URL to fetch
|
420
294
|
- `data:*;base64,...` or raw base64 string
|
421
295
|
|
422
|
-
base64_urls: Whether to return base64 encoded URLs. If false, presigned URLs are returned.
|
423
|
-
|
424
|
-
include_chunks: Whether to include chunks in the output response
|
425
|
-
|
426
|
-
wait_for_completion: If true, server holds briefly and may return 200 when done; otherwise returns
|
427
|
-
408/409 with Retry-After headers
|
428
|
-
|
429
296
|
chunk_processing: Controls the setting for the chunking and post-processing of each chunk.
|
430
297
|
|
431
298
|
error_handling:
|
@@ -438,7 +305,7 @@ class AsyncParseResource(AsyncAPIResource):
|
|
438
305
|
expires_in: The number of seconds until task is deleted. Expired tasks can **not** be
|
439
306
|
updated, polled or accessed via web interface.
|
440
307
|
|
441
|
-
file_name: The name of the file to be
|
308
|
+
file_name: The name of the file to be parsed. If not set a name will be generated.
|
442
309
|
|
443
310
|
llm_processing: Controls the LLM used for the task.
|
444
311
|
|
@@ -449,41 +316,26 @@ class AsyncParseResource(AsyncAPIResource):
|
|
449
316
|
text. When text layer is present the bounding boxes from the text layer are
|
450
317
|
used.
|
451
318
|
|
452
|
-
|
453
|
-
output will be unified to the Chunkr `output` format.
|
454
|
-
|
455
|
-
segment_processing: Defines how each segment type is handled when generating the final output.
|
456
|
-
|
457
|
-
Each segment uses one of three strategies. The chosen strategy controls:
|
458
|
-
|
459
|
-
- Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
|
460
|
-
- How the content is produced (rule-based vs. LLM).
|
461
|
-
- The output format (`Html` or `Markdown`).
|
319
|
+
segment_processing: Configuration for how each document segment is processed and formatted.
|
462
320
|
|
463
|
-
|
464
|
-
**descriptions** further refine behaviour.
|
321
|
+
Each segment has sensible defaults, but you can override specific settings:
|
465
322
|
|
466
|
-
|
323
|
+
- `format`: Output as `Html` or `Markdown`
|
324
|
+
- `strategy`: `Auto` (rule-based), `LLM` (AI-generated), or `Ignore` (skip)
|
325
|
+
- `crop_image`: Whether to crop images to segment bounds
|
326
|
+
- `extended_context`: Use full page as context for LLM processing
|
327
|
+
- `description`: Generate descriptions for segments
|
467
328
|
|
468
|
-
|
469
|
-
(Markdown, description off)
|
470
|
-
- `Table` → **LLM** (HTML, description on)
|
471
|
-
- `Picture` → **LLM** (Markdown, description off, cropping _All_)
|
472
|
-
- `Formula`, `Page` → **LLM** (Markdown, description off)
|
473
|
-
- `PageHeader`, `PageFooter` → **Ignore** (removed from output)
|
329
|
+
**Defaults per segment type:** Check the documentation for more details.
|
474
330
|
|
475
|
-
|
476
|
-
|
477
|
-
- **Auto** – rule-based content generation.
|
478
|
-
- **LLM** – generate content with an LLM.
|
479
|
-
- **Ignore** – exclude the segment entirely.
|
331
|
+
Only specify the fields you want to change - everything else uses the defaults.
|
480
332
|
|
481
333
|
segmentation_strategy:
|
482
334
|
Controls the segmentation strategy:
|
483
335
|
|
484
336
|
- `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
|
485
337
|
`Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
|
486
|
-
segmentation and better chunking.
|
338
|
+
segmentation and better chunking.
|
487
339
|
- `Page`: Treats each page as a single segment. Faster processing, but without
|
488
340
|
layout element detection and only simple chunking.
|
489
341
|
|
@@ -520,126 +372,47 @@ class AsyncParseResource(AsyncAPIResource):
|
|
520
372
|
extra_body=extra_body,
|
521
373
|
timeout=timeout,
|
522
374
|
idempotency_key=idempotency_key,
|
523
|
-
query=await async_maybe_transform(
|
524
|
-
{
|
525
|
-
"base64_urls": base64_urls,
|
526
|
-
"include_chunks": include_chunks,
|
527
|
-
"wait_for_completion": wait_for_completion,
|
528
|
-
},
|
529
|
-
parse_create_params.ParseCreateParams,
|
530
|
-
),
|
531
375
|
),
|
532
|
-
cast_to=
|
376
|
+
cast_to=ParseCreateResponse,
|
533
377
|
)
|
534
378
|
|
535
|
-
async def
|
379
|
+
async def get(
|
536
380
|
self,
|
537
|
-
task_id: str,
|
381
|
+
task_id: Optional[str],
|
538
382
|
*,
|
539
383
|
base64_urls: bool | NotGiven = NOT_GIVEN,
|
540
384
|
include_chunks: bool | NotGiven = NOT_GIVEN,
|
541
385
|
wait_for_completion: bool | NotGiven = NOT_GIVEN,
|
542
|
-
chunk_processing: Optional[parse_update_params.ChunkProcessing] | NotGiven = NOT_GIVEN,
|
543
|
-
error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
|
544
|
-
expires_in: Optional[int] | NotGiven = NOT_GIVEN,
|
545
|
-
high_resolution: Optional[bool] | NotGiven = NOT_GIVEN,
|
546
|
-
llm_processing: Optional[parse_update_params.LlmProcessing] | NotGiven = NOT_GIVEN,
|
547
|
-
ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
|
548
|
-
pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
|
549
|
-
segment_processing: Optional[parse_update_params.SegmentProcessing] | NotGiven = NOT_GIVEN,
|
550
|
-
segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
|
551
386
|
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
552
387
|
# The extra values given here take precedence over values defined on the client or passed to this method.
|
553
388
|
extra_headers: Headers | None = None,
|
554
389
|
extra_query: Query | None = None,
|
555
390
|
extra_body: Body | None = None,
|
556
391
|
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
|
561
|
-
The
|
562
|
-
current configuration is used as the base; only provided fields are changed.
|
392
|
+
) -> ParseGetResponse:
|
393
|
+
"""
|
394
|
+
Retrieves the current state of a parse task and, when requested, can wait for
|
395
|
+
completion.
|
563
396
|
|
564
|
-
|
397
|
+
Returns task details such as processing status, configuration, output (when
|
398
|
+
available), file metadata, and timestamps. If `wait_for_completion=true` is
|
399
|
+
provided, the server will hold the request briefly. If the task does not reach a
|
400
|
+
terminal state during that window, the response will indicate a retry with
|
401
|
+
appropriate headers.
|
565
402
|
|
566
|
-
|
567
|
-
- The new configuration must differ from the current configuration.
|
403
|
+
Typical uses:
|
568
404
|
|
569
|
-
|
570
|
-
|
571
|
-
|
572
|
-
guidance and a body describing how long to wait before retrying.
|
405
|
+
- Poll a task during processing
|
406
|
+
- Retrieve the final output once processing is complete
|
407
|
+
- Access task metadata and configuration
|
573
408
|
|
574
409
|
Args:
|
575
|
-
base64_urls: Whether to return base64 encoded URLs. If false,
|
410
|
+
base64_urls: Whether to return base64 encoded URLs. If false, the URLs will be returned as
|
411
|
+
presigned URLs.
|
576
412
|
|
577
413
|
include_chunks: Whether to include chunks in the output response
|
578
414
|
|
579
|
-
wait_for_completion:
|
580
|
-
408/409 with Retry-After headers
|
581
|
-
|
582
|
-
chunk_processing: Controls the setting for the chunking and post-processing of each chunk.
|
583
|
-
|
584
|
-
error_handling:
|
585
|
-
Controls how errors are handled during processing:
|
586
|
-
|
587
|
-
- `Fail`: Stops processing and fails the task when any error occurs
|
588
|
-
- `Continue`: Attempts to continue processing despite non-critical errors (eg.
|
589
|
-
LLM refusals etc.)
|
590
|
-
|
591
|
-
expires_in: The number of seconds until task is deleted. Expired tasks can **not** be
|
592
|
-
updated, polled or accessed via web interface.
|
593
|
-
|
594
|
-
high_resolution: Whether to use high-resolution images for cropping and post-processing. (Latency
|
595
|
-
penalty: ~7 seconds per page)
|
596
|
-
|
597
|
-
llm_processing: Controls the LLM used for the task.
|
598
|
-
|
599
|
-
ocr_strategy: Controls the Optical Character Recognition (OCR) strategy.
|
600
|
-
|
601
|
-
- `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
|
602
|
-
- `Auto`: Selectively applies OCR only to pages with missing or low-quality
|
603
|
-
text. When text layer is present the bounding boxes from the text layer are
|
604
|
-
used.
|
605
|
-
|
606
|
-
pipeline: Choose the provider whose models will be used for segmentation and OCR. The
|
607
|
-
output will be unified to the Chunkr `output` format.
|
608
|
-
|
609
|
-
segment_processing: Defines how each segment type is handled when generating the final output.
|
610
|
-
|
611
|
-
Each segment uses one of three strategies. The chosen strategy controls:
|
612
|
-
|
613
|
-
- Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
|
614
|
-
- How the content is produced (rule-based vs. LLM).
|
615
|
-
- The output format (`Html` or `Markdown`).
|
616
|
-
|
617
|
-
Optional flags such as image **cropping**, **extended context**, and
|
618
|
-
**descriptions** further refine behaviour.
|
619
|
-
|
620
|
-
**Default strategy per segment**
|
621
|
-
|
622
|
-
- `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` → **Auto**
|
623
|
-
(Markdown, description off)
|
624
|
-
- `Table` → **LLM** (HTML, description on)
|
625
|
-
- `Picture` → **LLM** (Markdown, description off, cropping _All_)
|
626
|
-
- `Formula`, `Page` → **LLM** (Markdown, description off)
|
627
|
-
- `PageHeader`, `PageFooter` → **Ignore** (removed from output)
|
628
|
-
|
629
|
-
**Strategy reference**
|
630
|
-
|
631
|
-
- **Auto** – rule-based content generation.
|
632
|
-
- **LLM** – generate content with an LLM.
|
633
|
-
- **Ignore** – exclude the segment entirely.
|
634
|
-
|
635
|
-
segmentation_strategy:
|
636
|
-
Controls the segmentation strategy:
|
637
|
-
|
638
|
-
- `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
|
639
|
-
`Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
|
640
|
-
segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
|
641
|
-
- `Page`: Treats each page as a single segment. Faster processing, but without
|
642
|
-
layout element detection and only simple chunking.
|
415
|
+
wait_for_completion: Whether to wait for the task to complete
|
643
416
|
|
644
417
|
extra_headers: Send extra headers
|
645
418
|
|
@@ -648,43 +421,26 @@ class AsyncParseResource(AsyncAPIResource):
|
|
648
421
|
extra_body: Add additional JSON properties to the request
|
649
422
|
|
650
423
|
timeout: Override the client-level default timeout for this request, in seconds
|
651
|
-
|
652
|
-
idempotency_key: Specify a custom idempotency key for this request
|
653
424
|
"""
|
654
425
|
if not task_id:
|
655
426
|
raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}")
|
656
|
-
return await self.
|
657
|
-
f"/tasks/
|
658
|
-
body=await async_maybe_transform(
|
659
|
-
{
|
660
|
-
"chunk_processing": chunk_processing,
|
661
|
-
"error_handling": error_handling,
|
662
|
-
"expires_in": expires_in,
|
663
|
-
"high_resolution": high_resolution,
|
664
|
-
"llm_processing": llm_processing,
|
665
|
-
"ocr_strategy": ocr_strategy,
|
666
|
-
"pipeline": pipeline,
|
667
|
-
"segment_processing": segment_processing,
|
668
|
-
"segmentation_strategy": segmentation_strategy,
|
669
|
-
},
|
670
|
-
parse_update_params.ParseUpdateParams,
|
671
|
-
),
|
427
|
+
return await self._get(
|
428
|
+
f"/tasks/{task_id}/parse",
|
672
429
|
options=make_request_options(
|
673
430
|
extra_headers=extra_headers,
|
674
431
|
extra_query=extra_query,
|
675
432
|
extra_body=extra_body,
|
676
433
|
timeout=timeout,
|
677
|
-
idempotency_key=idempotency_key,
|
678
434
|
query=await async_maybe_transform(
|
679
435
|
{
|
680
436
|
"base64_urls": base64_urls,
|
681
437
|
"include_chunks": include_chunks,
|
682
438
|
"wait_for_completion": wait_for_completion,
|
683
439
|
},
|
684
|
-
|
440
|
+
parse_get_params.ParseGetParams,
|
685
441
|
),
|
686
442
|
),
|
687
|
-
cast_to=
|
443
|
+
cast_to=ParseGetResponse,
|
688
444
|
)
|
689
445
|
|
690
446
|
|
@@ -695,8 +451,8 @@ class ParseResourceWithRawResponse:
|
|
695
451
|
self.create = to_raw_response_wrapper(
|
696
452
|
parse.create,
|
697
453
|
)
|
698
|
-
self.
|
699
|
-
parse.
|
454
|
+
self.get = to_raw_response_wrapper(
|
455
|
+
parse.get,
|
700
456
|
)
|
701
457
|
|
702
458
|
|
@@ -707,8 +463,8 @@ class AsyncParseResourceWithRawResponse:
|
|
707
463
|
self.create = async_to_raw_response_wrapper(
|
708
464
|
parse.create,
|
709
465
|
)
|
710
|
-
self.
|
711
|
-
parse.
|
466
|
+
self.get = async_to_raw_response_wrapper(
|
467
|
+
parse.get,
|
712
468
|
)
|
713
469
|
|
714
470
|
|
@@ -719,8 +475,8 @@ class ParseResourceWithStreamingResponse:
|
|
719
475
|
self.create = to_streamed_response_wrapper(
|
720
476
|
parse.create,
|
721
477
|
)
|
722
|
-
self.
|
723
|
-
parse.
|
478
|
+
self.get = to_streamed_response_wrapper(
|
479
|
+
parse.get,
|
724
480
|
)
|
725
481
|
|
726
482
|
|
@@ -731,6 +487,6 @@ class AsyncParseResourceWithStreamingResponse:
|
|
731
487
|
self.create = async_to_streamed_response_wrapper(
|
732
488
|
parse.create,
|
733
489
|
)
|
734
|
-
self.
|
735
|
-
parse.
|
490
|
+
self.get = async_to_streamed_response_wrapper(
|
491
|
+
parse.get,
|
736
492
|
)
|