chunkr-ai 0.1.0a6__py3-none-any.whl → 0.1.0a7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. chunkr_ai/__init__.py +2 -0
  2. chunkr_ai/_client.py +31 -3
  3. chunkr_ai/_constants.py +5 -5
  4. chunkr_ai/_exceptions.py +4 -0
  5. chunkr_ai/_models.py +1 -1
  6. chunkr_ai/_types.py +35 -1
  7. chunkr_ai/_utils/__init__.py +1 -0
  8. chunkr_ai/_utils/_typing.py +5 -0
  9. chunkr_ai/_version.py +1 -1
  10. chunkr_ai/resources/__init__.py +14 -0
  11. chunkr_ai/resources/files.py +3 -3
  12. chunkr_ai/resources/tasks/__init__.py +14 -0
  13. chunkr_ai/resources/tasks/extract.py +409 -0
  14. chunkr_ai/resources/tasks/parse.py +124 -284
  15. chunkr_ai/resources/tasks/tasks.py +62 -14
  16. chunkr_ai/resources/webhooks.py +193 -0
  17. chunkr_ai/types/__init__.py +27 -1
  18. chunkr_ai/types/bounding_box.py +19 -0
  19. chunkr_ai/types/cell.py +39 -0
  20. chunkr_ai/types/cell_style.py +28 -0
  21. chunkr_ai/types/chunk.py +40 -0
  22. chunkr_ai/types/chunk_processing.py +40 -0
  23. chunkr_ai/types/chunk_processing_param.py +42 -0
  24. chunkr_ai/types/extract_configuration.py +24 -0
  25. chunkr_ai/types/extract_output_response.py +19 -0
  26. chunkr_ai/types/file_create_params.py +2 -1
  27. chunkr_ai/types/file_info.py +21 -0
  28. chunkr_ai/types/generation_config.py +29 -0
  29. chunkr_ai/types/generation_config_param.py +29 -0
  30. chunkr_ai/types/llm_processing.py +36 -0
  31. chunkr_ai/types/llm_processing_param.py +36 -0
  32. chunkr_ai/types/ocr_result.py +28 -0
  33. chunkr_ai/types/page.py +27 -0
  34. chunkr_ai/types/parse_configuration.py +64 -0
  35. chunkr_ai/types/parse_configuration_param.py +65 -0
  36. chunkr_ai/types/parse_output_response.py +29 -0
  37. chunkr_ai/types/segment.py +109 -0
  38. chunkr_ai/types/segment_processing.py +228 -0
  39. chunkr_ai/types/segment_processing_param.py +229 -0
  40. chunkr_ai/types/task_extract_updated_webhook_event.py +22 -0
  41. chunkr_ai/types/task_list_params.py +7 -1
  42. chunkr_ai/types/task_parse_updated_webhook_event.py +22 -0
  43. chunkr_ai/types/task_response.py +68 -0
  44. chunkr_ai/types/tasks/__init__.py +7 -1
  45. chunkr_ai/types/tasks/extract_create_params.py +47 -0
  46. chunkr_ai/types/tasks/extract_create_response.py +214 -0
  47. chunkr_ai/types/tasks/extract_get_params.py +21 -0
  48. chunkr_ai/types/tasks/extract_get_response.py +214 -0
  49. chunkr_ai/types/tasks/parse_create_params.py +25 -793
  50. chunkr_ai/types/tasks/parse_create_response.py +55 -0
  51. chunkr_ai/types/tasks/parse_get_params.py +21 -0
  52. chunkr_ai/types/tasks/parse_get_response.py +55 -0
  53. chunkr_ai/types/unwrap_webhook_event.py +11 -0
  54. chunkr_ai/types/version_info.py +31 -0
  55. chunkr_ai/types/webhook_url_response.py +9 -0
  56. {chunkr_ai-0.1.0a6.dist-info → chunkr_ai-0.1.0a7.dist-info}/METADATA +14 -13
  57. chunkr_ai-0.1.0a7.dist-info/RECORD +86 -0
  58. chunkr_ai/types/task.py +0 -1225
  59. chunkr_ai/types/tasks/parse_update_params.py +0 -845
  60. chunkr_ai-0.1.0a6.dist-info/RECORD +0 -52
  61. {chunkr_ai-0.1.0a6.dist-info → chunkr_ai-0.1.0a7.dist-info}/WHEEL +0 -0
  62. {chunkr_ai-0.1.0a6.dist-info → chunkr_ai-0.1.0a7.dist-info}/licenses/LICENSE +0 -0
@@ -17,9 +17,13 @@ from ..._response import (
17
17
  async_to_raw_response_wrapper,
18
18
  async_to_streamed_response_wrapper,
19
19
  )
20
- from ...types.task import Task
21
- from ...types.tasks import parse_create_params, parse_update_params
20
+ from ...types.tasks import parse_get_params, parse_create_params
22
21
  from ..._base_client import make_request_options
22
+ from ...types.llm_processing_param import LlmProcessingParam
23
+ from ...types.chunk_processing_param import ChunkProcessingParam
24
+ from ...types.segment_processing_param import SegmentProcessingParam
25
+ from ...types.tasks.parse_get_response import ParseGetResponse
26
+ from ...types.tasks.parse_create_response import ParseCreateResponse
23
27
 
24
28
  __all__ = ["ParseResource", "AsyncParseResource"]
25
29
 
@@ -48,15 +52,15 @@ class ParseResource(SyncAPIResource):
48
52
  self,
49
53
  *,
50
54
  file: str,
51
- chunk_processing: Optional[parse_create_params.ChunkProcessing] | NotGiven = NOT_GIVEN,
52
- error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
55
+ chunk_processing: ChunkProcessingParam | NotGiven = NOT_GIVEN,
56
+ error_handling: Literal["Fail", "Continue"] | NotGiven = NOT_GIVEN,
53
57
  expires_in: Optional[int] | NotGiven = NOT_GIVEN,
54
58
  file_name: Optional[str] | NotGiven = NOT_GIVEN,
55
- llm_processing: Optional[parse_create_params.LlmProcessing] | NotGiven = NOT_GIVEN,
56
- ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
57
- pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
58
- segment_processing: Optional[parse_create_params.SegmentProcessing] | NotGiven = NOT_GIVEN,
59
- segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
59
+ llm_processing: LlmProcessingParam | NotGiven = NOT_GIVEN,
60
+ ocr_strategy: Literal["All", "Auto"] | NotGiven = NOT_GIVEN,
61
+ pipeline: Literal["Azure", "Chunkr"] | NotGiven = NOT_GIVEN,
62
+ segment_processing: Optional[SegmentProcessingParam] | NotGiven = NOT_GIVEN,
63
+ segmentation_strategy: Literal["LayoutAnalysis", "Page"] | NotGiven = NOT_GIVEN,
60
64
  # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
61
65
  # The extra values given here take precedence over values defined on the client or passed to this method.
62
66
  extra_headers: Headers | None = None,
@@ -64,17 +68,17 @@ class ParseResource(SyncAPIResource):
64
68
  extra_body: Body | None = None,
65
69
  timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
66
70
  idempotency_key: str | None = None,
67
- ) -> Task:
71
+ ) -> ParseCreateResponse:
68
72
  """
69
73
  Queues a document for processing and returns a `TaskResponse` with the assigned
70
74
  `task_id`, initial configuration, file metadata, and timestamps. The initial
71
75
  status is `Starting`.
72
76
 
73
- Creates a task and returns its metadata immediately.
77
+ Creates a parse task and returns its metadata immediately.
74
78
 
75
79
  Args:
76
80
  file:
77
- The file to be uploaded. Supported inputs:
81
+ The file to be parsed. Supported inputs:
78
82
 
79
83
  - `ch://files/{file_id}`: Reference to an existing file. Upload via the Files
80
84
  API
@@ -93,7 +97,7 @@ class ParseResource(SyncAPIResource):
93
97
  expires_in: The number of seconds until task is deleted. Expired tasks can **not** be
94
98
  updated, polled or accessed via web interface.
95
99
 
96
- file_name: The name of the file to be uploaded. If not set a name will be generated.
100
+ file_name: The name of the file to be parsed. If not set a name will be generated.
97
101
 
98
102
  llm_processing: Controls the LLM used for the task.
99
103
 
@@ -104,41 +108,26 @@ class ParseResource(SyncAPIResource):
104
108
  text. When text layer is present the bounding boxes from the text layer are
105
109
  used.
106
110
 
107
- pipeline: Choose the provider whose models will be used for segmentation and OCR. The
108
- output will be unified to the Chunkr `output` format.
111
+ segment_processing: Configuration for how each document segment is processed and formatted.
109
112
 
110
- segment_processing: Defines how each segment type is handled when generating the final output.
113
+ Each segment has sensible defaults, but you can override specific settings:
111
114
 
112
- Each segment uses one of three strategies. The chosen strategy controls:
115
+ - `format`: Output as `Html` or `Markdown`
116
+ - `strategy`: `Auto` (rule-based), `LLM` (AI-generated), or `Ignore` (skip)
117
+ - `crop_image`: Whether to crop images to segment bounds
118
+ - `extended_context`: Use full page as context for LLM processing
119
+ - `description`: Generate descriptions for segments
113
120
 
114
- - Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
115
- - How the content is produced (rule-based vs. LLM).
116
- - The output format (`Html` or `Markdown`).
121
+ **Defaults per segment type:** Check the documentation for more details.
117
122
 
118
- Optional flags such as image **cropping**, **extended context**, and
119
- **descriptions** further refine behaviour.
120
-
121
- **Default strategy per segment**
122
-
123
- - `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` → **Auto**
124
- (Markdown, description off)
125
- - `Table` → **LLM** (HTML, description on)
126
- - `Picture` → **LLM** (Markdown, description off, cropping _All_)
127
- - `Formula`, `Page` → **LLM** (Markdown, description off)
128
- - `PageHeader`, `PageFooter` → **Ignore** (removed from output)
129
-
130
- **Strategy reference**
131
-
132
- - **Auto** – rule-based content generation.
133
- - **LLM** – generate content with an LLM.
134
- - **Ignore** – exclude the segment entirely.
123
+ Only specify the fields you want to change - everything else uses the defaults.
135
124
 
136
125
  segmentation_strategy:
137
126
  Controls the segmentation strategy:
138
127
 
139
128
  - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
140
129
  `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
141
- segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
130
+ segmentation and better chunking.
142
131
  - `Page`: Treats each page as a single segment. Faster processing, but without
143
132
  layout element detection and only simple chunking.
144
133
 
@@ -176,104 +165,46 @@ class ParseResource(SyncAPIResource):
176
165
  timeout=timeout,
177
166
  idempotency_key=idempotency_key,
178
167
  ),
179
- cast_to=Task,
168
+ cast_to=ParseCreateResponse,
180
169
  )
181
170
 
182
- def update(
171
+ def get(
183
172
  self,
184
- task_id: str,
173
+ task_id: Optional[str],
185
174
  *,
186
- chunk_processing: Optional[parse_update_params.ChunkProcessing] | NotGiven = NOT_GIVEN,
187
- error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
188
- expires_in: Optional[int] | NotGiven = NOT_GIVEN,
189
- high_resolution: Optional[bool] | NotGiven = NOT_GIVEN,
190
- llm_processing: Optional[parse_update_params.LlmProcessing] | NotGiven = NOT_GIVEN,
191
- ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
192
- pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
193
- segment_processing: Optional[parse_update_params.SegmentProcessing] | NotGiven = NOT_GIVEN,
194
- segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
175
+ base64_urls: bool | NotGiven = NOT_GIVEN,
176
+ include_chunks: bool | NotGiven = NOT_GIVEN,
177
+ wait_for_completion: bool | NotGiven = NOT_GIVEN,
195
178
  # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
196
179
  # The extra values given here take precedence over values defined on the client or passed to this method.
197
180
  extra_headers: Headers | None = None,
198
181
  extra_query: Query | None = None,
199
182
  extra_body: Body | None = None,
200
183
  timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
201
- idempotency_key: str | None = None,
202
- ) -> Task:
203
- """Updates an existing task's configuration and reprocesses the document.
204
-
205
- The
206
- current configuration is used as the base; only provided fields are changed.
184
+ ) -> ParseGetResponse:
185
+ """
186
+ Retrieves the current state of a parse task and, when requested, can wait for
187
+ completion.
207
188
 
208
- Requirements:
189
+ Returns task details such as processing status, configuration, output (when
190
+ available), file metadata, and timestamps. If `wait_for_completion=true` is
191
+ provided, the server will hold the request briefly. If the task does not reach a
192
+ terminal state during that window, the response will indicate a retry with
193
+ appropriate headers.
209
194
 
210
- - Task must be in a terminal state (`Succeeded` or `Failed`).
211
- - The new configuration must differ from the current configuration.
195
+ Typical uses:
212
196
 
213
- Updates a task and returns its new metadata immediately.
197
+ - Poll a task during processing
198
+ - Retrieve the final output once processing is complete
199
+ - Access task metadata and configuration
214
200
 
215
201
  Args:
216
- chunk_processing: Controls the setting for the chunking and post-processing of each chunk.
217
-
218
- error_handling:
219
- Controls how errors are handled during processing:
220
-
221
- - `Fail`: Stops processing and fails the task when any error occurs
222
- - `Continue`: Attempts to continue processing despite non-critical errors (eg.
223
- LLM refusals etc.)
224
-
225
- expires_in: The number of seconds until task is deleted. Expired tasks can **not** be
226
- updated, polled or accessed via web interface.
227
-
228
- high_resolution: Whether to use high-resolution images for cropping and post-processing. (Latency
229
- penalty: ~7 seconds per page)
230
-
231
- llm_processing: Controls the LLM used for the task.
232
-
233
- ocr_strategy: Controls the Optical Character Recognition (OCR) strategy.
234
-
235
- - `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
236
- - `Auto`: Selectively applies OCR only to pages with missing or low-quality
237
- text. When text layer is present the bounding boxes from the text layer are
238
- used.
239
-
240
- pipeline: Choose the provider whose models will be used for segmentation and OCR. The
241
- output will be unified to the Chunkr `output` format.
202
+ base64_urls: Whether to return base64 encoded URLs. If false, the URLs will be returned as
203
+ presigned URLs.
242
204
 
243
- segment_processing: Defines how each segment type is handled when generating the final output.
205
+ include_chunks: Whether to include chunks in the output response
244
206
 
245
- Each segment uses one of three strategies. The chosen strategy controls:
246
-
247
- - Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
248
- - How the content is produced (rule-based vs. LLM).
249
- - The output format (`Html` or `Markdown`).
250
-
251
- Optional flags such as image **cropping**, **extended context**, and
252
- **descriptions** further refine behaviour.
253
-
254
- **Default strategy per segment**
255
-
256
- - `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` → **Auto**
257
- (Markdown, description off)
258
- - `Table` → **LLM** (HTML, description on)
259
- - `Picture` → **LLM** (Markdown, description off, cropping _All_)
260
- - `Formula`, `Page` → **LLM** (Markdown, description off)
261
- - `PageHeader`, `PageFooter` → **Ignore** (removed from output)
262
-
263
- **Strategy reference**
264
-
265
- - **Auto** – rule-based content generation.
266
- - **LLM** – generate content with an LLM.
267
- - **Ignore** – exclude the segment entirely.
268
-
269
- segmentation_strategy:
270
- Controls the segmentation strategy:
271
-
272
- - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
273
- `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
274
- segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
275
- - `Page`: Treats each page as a single segment. Faster processing, but without
276
- layout element detection and only simple chunking.
207
+ wait_for_completion: Whether to wait for the task to complete
277
208
 
278
209
  extra_headers: Send extra headers
279
210
 
@@ -282,35 +213,26 @@ class ParseResource(SyncAPIResource):
282
213
  extra_body: Add additional JSON properties to the request
283
214
 
284
215
  timeout: Override the client-level default timeout for this request, in seconds
285
-
286
- idempotency_key: Specify a custom idempotency key for this request
287
216
  """
288
217
  if not task_id:
289
218
  raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}")
290
- return self._patch(
291
- f"/tasks/parse/{task_id}",
292
- body=maybe_transform(
293
- {
294
- "chunk_processing": chunk_processing,
295
- "error_handling": error_handling,
296
- "expires_in": expires_in,
297
- "high_resolution": high_resolution,
298
- "llm_processing": llm_processing,
299
- "ocr_strategy": ocr_strategy,
300
- "pipeline": pipeline,
301
- "segment_processing": segment_processing,
302
- "segmentation_strategy": segmentation_strategy,
303
- },
304
- parse_update_params.ParseUpdateParams,
305
- ),
219
+ return self._get(
220
+ f"/tasks/{task_id}/parse",
306
221
  options=make_request_options(
307
222
  extra_headers=extra_headers,
308
223
  extra_query=extra_query,
309
224
  extra_body=extra_body,
310
225
  timeout=timeout,
311
- idempotency_key=idempotency_key,
226
+ query=maybe_transform(
227
+ {
228
+ "base64_urls": base64_urls,
229
+ "include_chunks": include_chunks,
230
+ "wait_for_completion": wait_for_completion,
231
+ },
232
+ parse_get_params.ParseGetParams,
233
+ ),
312
234
  ),
313
- cast_to=Task,
235
+ cast_to=ParseGetResponse,
314
236
  )
315
237
 
316
238
 
@@ -338,15 +260,15 @@ class AsyncParseResource(AsyncAPIResource):
338
260
  self,
339
261
  *,
340
262
  file: str,
341
- chunk_processing: Optional[parse_create_params.ChunkProcessing] | NotGiven = NOT_GIVEN,
342
- error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
263
+ chunk_processing: ChunkProcessingParam | NotGiven = NOT_GIVEN,
264
+ error_handling: Literal["Fail", "Continue"] | NotGiven = NOT_GIVEN,
343
265
  expires_in: Optional[int] | NotGiven = NOT_GIVEN,
344
266
  file_name: Optional[str] | NotGiven = NOT_GIVEN,
345
- llm_processing: Optional[parse_create_params.LlmProcessing] | NotGiven = NOT_GIVEN,
346
- ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
347
- pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
348
- segment_processing: Optional[parse_create_params.SegmentProcessing] | NotGiven = NOT_GIVEN,
349
- segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
267
+ llm_processing: LlmProcessingParam | NotGiven = NOT_GIVEN,
268
+ ocr_strategy: Literal["All", "Auto"] | NotGiven = NOT_GIVEN,
269
+ pipeline: Literal["Azure", "Chunkr"] | NotGiven = NOT_GIVEN,
270
+ segment_processing: Optional[SegmentProcessingParam] | NotGiven = NOT_GIVEN,
271
+ segmentation_strategy: Literal["LayoutAnalysis", "Page"] | NotGiven = NOT_GIVEN,
350
272
  # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
351
273
  # The extra values given here take precedence over values defined on the client or passed to this method.
352
274
  extra_headers: Headers | None = None,
@@ -354,17 +276,17 @@ class AsyncParseResource(AsyncAPIResource):
354
276
  extra_body: Body | None = None,
355
277
  timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
356
278
  idempotency_key: str | None = None,
357
- ) -> Task:
279
+ ) -> ParseCreateResponse:
358
280
  """
359
281
  Queues a document for processing and returns a `TaskResponse` with the assigned
360
282
  `task_id`, initial configuration, file metadata, and timestamps. The initial
361
283
  status is `Starting`.
362
284
 
363
- Creates a task and returns its metadata immediately.
285
+ Creates a parse task and returns its metadata immediately.
364
286
 
365
287
  Args:
366
288
  file:
367
- The file to be uploaded. Supported inputs:
289
+ The file to be parsed. Supported inputs:
368
290
 
369
291
  - `ch://files/{file_id}`: Reference to an existing file. Upload via the Files
370
292
  API
@@ -383,7 +305,7 @@ class AsyncParseResource(AsyncAPIResource):
383
305
  expires_in: The number of seconds until task is deleted. Expired tasks can **not** be
384
306
  updated, polled or accessed via web interface.
385
307
 
386
- file_name: The name of the file to be uploaded. If not set a name will be generated.
308
+ file_name: The name of the file to be parsed. If not set a name will be generated.
387
309
 
388
310
  llm_processing: Controls the LLM used for the task.
389
311
 
@@ -394,41 +316,26 @@ class AsyncParseResource(AsyncAPIResource):
394
316
  text. When text layer is present the bounding boxes from the text layer are
395
317
  used.
396
318
 
397
- pipeline: Choose the provider whose models will be used for segmentation and OCR. The
398
- output will be unified to the Chunkr `output` format.
319
+ segment_processing: Configuration for how each document segment is processed and formatted.
399
320
 
400
- segment_processing: Defines how each segment type is handled when generating the final output.
321
+ Each segment has sensible defaults, but you can override specific settings:
401
322
 
402
- Each segment uses one of three strategies. The chosen strategy controls:
323
+ - `format`: Output as `Html` or `Markdown`
324
+ - `strategy`: `Auto` (rule-based), `LLM` (AI-generated), or `Ignore` (skip)
325
+ - `crop_image`: Whether to crop images to segment bounds
326
+ - `extended_context`: Use full page as context for LLM processing
327
+ - `description`: Generate descriptions for segments
403
328
 
404
- - Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
405
- - How the content is produced (rule-based vs. LLM).
406
- - The output format (`Html` or `Markdown`).
329
+ **Defaults per segment type:** Check the documentation for more details.
407
330
 
408
- Optional flags such as image **cropping**, **extended context**, and
409
- **descriptions** further refine behaviour.
410
-
411
- **Default strategy per segment**
412
-
413
- - `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` → **Auto**
414
- (Markdown, description off)
415
- - `Table` → **LLM** (HTML, description on)
416
- - `Picture` → **LLM** (Markdown, description off, cropping _All_)
417
- - `Formula`, `Page` → **LLM** (Markdown, description off)
418
- - `PageHeader`, `PageFooter` → **Ignore** (removed from output)
419
-
420
- **Strategy reference**
421
-
422
- - **Auto** – rule-based content generation.
423
- - **LLM** – generate content with an LLM.
424
- - **Ignore** – exclude the segment entirely.
331
+ Only specify the fields you want to change - everything else uses the defaults.
425
332
 
426
333
  segmentation_strategy:
427
334
  Controls the segmentation strategy:
428
335
 
429
336
  - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
430
337
  `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
431
- segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
338
+ segmentation and better chunking.
432
339
  - `Page`: Treats each page as a single segment. Faster processing, but without
433
340
  layout element detection and only simple chunking.
434
341
 
@@ -466,104 +373,46 @@ class AsyncParseResource(AsyncAPIResource):
466
373
  timeout=timeout,
467
374
  idempotency_key=idempotency_key,
468
375
  ),
469
- cast_to=Task,
376
+ cast_to=ParseCreateResponse,
470
377
  )
471
378
 
472
- async def update(
379
+ async def get(
473
380
  self,
474
- task_id: str,
381
+ task_id: Optional[str],
475
382
  *,
476
- chunk_processing: Optional[parse_update_params.ChunkProcessing] | NotGiven = NOT_GIVEN,
477
- error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
478
- expires_in: Optional[int] | NotGiven = NOT_GIVEN,
479
- high_resolution: Optional[bool] | NotGiven = NOT_GIVEN,
480
- llm_processing: Optional[parse_update_params.LlmProcessing] | NotGiven = NOT_GIVEN,
481
- ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
482
- pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
483
- segment_processing: Optional[parse_update_params.SegmentProcessing] | NotGiven = NOT_GIVEN,
484
- segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
383
+ base64_urls: bool | NotGiven = NOT_GIVEN,
384
+ include_chunks: bool | NotGiven = NOT_GIVEN,
385
+ wait_for_completion: bool | NotGiven = NOT_GIVEN,
485
386
  # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
486
387
  # The extra values given here take precedence over values defined on the client or passed to this method.
487
388
  extra_headers: Headers | None = None,
488
389
  extra_query: Query | None = None,
489
390
  extra_body: Body | None = None,
490
391
  timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
491
- idempotency_key: str | None = None,
492
- ) -> Task:
493
- """Updates an existing task's configuration and reprocesses the document.
494
-
495
- The
496
- current configuration is used as the base; only provided fields are changed.
392
+ ) -> ParseGetResponse:
393
+ """
394
+ Retrieves the current state of a parse task and, when requested, can wait for
395
+ completion.
497
396
 
498
- Requirements:
397
+ Returns task details such as processing status, configuration, output (when
398
+ available), file metadata, and timestamps. If `wait_for_completion=true` is
399
+ provided, the server will hold the request briefly. If the task does not reach a
400
+ terminal state during that window, the response will indicate a retry with
401
+ appropriate headers.
499
402
 
500
- - Task must be in a terminal state (`Succeeded` or `Failed`).
501
- - The new configuration must differ from the current configuration.
403
+ Typical uses:
502
404
 
503
- Updates a task and returns its new metadata immediately.
405
+ - Poll a task during processing
406
+ - Retrieve the final output once processing is complete
407
+ - Access task metadata and configuration
504
408
 
505
409
  Args:
506
- chunk_processing: Controls the setting for the chunking and post-processing of each chunk.
507
-
508
- error_handling:
509
- Controls how errors are handled during processing:
510
-
511
- - `Fail`: Stops processing and fails the task when any error occurs
512
- - `Continue`: Attempts to continue processing despite non-critical errors (eg.
513
- LLM refusals etc.)
514
-
515
- expires_in: The number of seconds until task is deleted. Expired tasks can **not** be
516
- updated, polled or accessed via web interface.
517
-
518
- high_resolution: Whether to use high-resolution images for cropping and post-processing. (Latency
519
- penalty: ~7 seconds per page)
520
-
521
- llm_processing: Controls the LLM used for the task.
522
-
523
- ocr_strategy: Controls the Optical Character Recognition (OCR) strategy.
524
-
525
- - `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
526
- - `Auto`: Selectively applies OCR only to pages with missing or low-quality
527
- text. When text layer is present the bounding boxes from the text layer are
528
- used.
529
-
530
- pipeline: Choose the provider whose models will be used for segmentation and OCR. The
531
- output will be unified to the Chunkr `output` format.
410
+ base64_urls: Whether to return base64 encoded URLs. If false, the URLs will be returned as
411
+ presigned URLs.
532
412
 
533
- segment_processing: Defines how each segment type is handled when generating the final output.
413
+ include_chunks: Whether to include chunks in the output response
534
414
 
535
- Each segment uses one of three strategies. The chosen strategy controls:
536
-
537
- - Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
538
- - How the content is produced (rule-based vs. LLM).
539
- - The output format (`Html` or `Markdown`).
540
-
541
- Optional flags such as image **cropping**, **extended context**, and
542
- **descriptions** further refine behaviour.
543
-
544
- **Default strategy per segment**
545
-
546
- - `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` → **Auto**
547
- (Markdown, description off)
548
- - `Table` → **LLM** (HTML, description on)
549
- - `Picture` → **LLM** (Markdown, description off, cropping _All_)
550
- - `Formula`, `Page` → **LLM** (Markdown, description off)
551
- - `PageHeader`, `PageFooter` → **Ignore** (removed from output)
552
-
553
- **Strategy reference**
554
-
555
- - **Auto** – rule-based content generation.
556
- - **LLM** – generate content with an LLM.
557
- - **Ignore** – exclude the segment entirely.
558
-
559
- segmentation_strategy:
560
- Controls the segmentation strategy:
561
-
562
- - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
563
- `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
564
- segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
565
- - `Page`: Treats each page as a single segment. Faster processing, but without
566
- layout element detection and only simple chunking.
415
+ wait_for_completion: Whether to wait for the task to complete
567
416
 
568
417
  extra_headers: Send extra headers
569
418
 
@@ -572,35 +421,26 @@ class AsyncParseResource(AsyncAPIResource):
572
421
  extra_body: Add additional JSON properties to the request
573
422
 
574
423
  timeout: Override the client-level default timeout for this request, in seconds
575
-
576
- idempotency_key: Specify a custom idempotency key for this request
577
424
  """
578
425
  if not task_id:
579
426
  raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}")
580
- return await self._patch(
581
- f"/tasks/parse/{task_id}",
582
- body=await async_maybe_transform(
583
- {
584
- "chunk_processing": chunk_processing,
585
- "error_handling": error_handling,
586
- "expires_in": expires_in,
587
- "high_resolution": high_resolution,
588
- "llm_processing": llm_processing,
589
- "ocr_strategy": ocr_strategy,
590
- "pipeline": pipeline,
591
- "segment_processing": segment_processing,
592
- "segmentation_strategy": segmentation_strategy,
593
- },
594
- parse_update_params.ParseUpdateParams,
595
- ),
427
+ return await self._get(
428
+ f"/tasks/{task_id}/parse",
596
429
  options=make_request_options(
597
430
  extra_headers=extra_headers,
598
431
  extra_query=extra_query,
599
432
  extra_body=extra_body,
600
433
  timeout=timeout,
601
- idempotency_key=idempotency_key,
434
+ query=await async_maybe_transform(
435
+ {
436
+ "base64_urls": base64_urls,
437
+ "include_chunks": include_chunks,
438
+ "wait_for_completion": wait_for_completion,
439
+ },
440
+ parse_get_params.ParseGetParams,
441
+ ),
602
442
  ),
603
- cast_to=Task,
443
+ cast_to=ParseGetResponse,
604
444
  )
605
445
 
606
446
 
@@ -611,8 +451,8 @@ class ParseResourceWithRawResponse:
611
451
  self.create = to_raw_response_wrapper(
612
452
  parse.create,
613
453
  )
614
- self.update = to_raw_response_wrapper(
615
- parse.update,
454
+ self.get = to_raw_response_wrapper(
455
+ parse.get,
616
456
  )
617
457
 
618
458
 
@@ -623,8 +463,8 @@ class AsyncParseResourceWithRawResponse:
623
463
  self.create = async_to_raw_response_wrapper(
624
464
  parse.create,
625
465
  )
626
- self.update = async_to_raw_response_wrapper(
627
- parse.update,
466
+ self.get = async_to_raw_response_wrapper(
467
+ parse.get,
628
468
  )
629
469
 
630
470
 
@@ -635,8 +475,8 @@ class ParseResourceWithStreamingResponse:
635
475
  self.create = to_streamed_response_wrapper(
636
476
  parse.create,
637
477
  )
638
- self.update = to_streamed_response_wrapper(
639
- parse.update,
478
+ self.get = to_streamed_response_wrapper(
479
+ parse.get,
640
480
  )
641
481
 
642
482
 
@@ -647,6 +487,6 @@ class AsyncParseResourceWithStreamingResponse:
647
487
  self.create = async_to_streamed_response_wrapper(
648
488
  parse.create,
649
489
  )
650
- self.update = async_to_streamed_response_wrapper(
651
- parse.update,
490
+ self.get = async_to_streamed_response_wrapper(
491
+ parse.get,
652
492
  )