chunkr-ai 0.1.0a6__py3-none-any.whl → 0.1.0a8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. chunkr_ai/__init__.py +2 -0
  2. chunkr_ai/_base_client.py +3 -3
  3. chunkr_ai/_client.py +31 -3
  4. chunkr_ai/_compat.py +48 -48
  5. chunkr_ai/_constants.py +5 -5
  6. chunkr_ai/_exceptions.py +4 -0
  7. chunkr_ai/_models.py +41 -41
  8. chunkr_ai/_types.py +35 -1
  9. chunkr_ai/_utils/__init__.py +9 -2
  10. chunkr_ai/_utils/_compat.py +45 -0
  11. chunkr_ai/_utils/_datetime_parse.py +136 -0
  12. chunkr_ai/_utils/_transform.py +11 -1
  13. chunkr_ai/_utils/_typing.py +6 -1
  14. chunkr_ai/_utils/_utils.py +0 -1
  15. chunkr_ai/_version.py +1 -1
  16. chunkr_ai/resources/__init__.py +14 -0
  17. chunkr_ai/resources/files.py +3 -3
  18. chunkr_ai/resources/tasks/__init__.py +14 -0
  19. chunkr_ai/resources/tasks/extract.py +393 -0
  20. chunkr_ai/resources/tasks/parse.py +110 -286
  21. chunkr_ai/resources/tasks/tasks.py +64 -32
  22. chunkr_ai/resources/webhooks.py +193 -0
  23. chunkr_ai/types/__init__.py +27 -1
  24. chunkr_ai/types/bounding_box.py +19 -0
  25. chunkr_ai/types/cell.py +39 -0
  26. chunkr_ai/types/cell_style.py +28 -0
  27. chunkr_ai/types/chunk.py +40 -0
  28. chunkr_ai/types/chunk_processing.py +40 -0
  29. chunkr_ai/types/chunk_processing_param.py +42 -0
  30. chunkr_ai/types/extract_configuration.py +24 -0
  31. chunkr_ai/types/extract_output_response.py +62 -0
  32. chunkr_ai/types/file_create_params.py +2 -1
  33. chunkr_ai/types/file_info.py +21 -0
  34. chunkr_ai/types/generation_config.py +29 -0
  35. chunkr_ai/types/generation_config_param.py +29 -0
  36. chunkr_ai/types/llm_processing.py +36 -0
  37. chunkr_ai/types/llm_processing_param.py +36 -0
  38. chunkr_ai/types/ocr_result.py +28 -0
  39. chunkr_ai/types/page.py +27 -0
  40. chunkr_ai/types/parse_configuration.py +64 -0
  41. chunkr_ai/types/parse_configuration_param.py +65 -0
  42. chunkr_ai/types/parse_output_response.py +29 -0
  43. chunkr_ai/types/segment.py +109 -0
  44. chunkr_ai/types/segment_processing.py +228 -0
  45. chunkr_ai/types/segment_processing_param.py +229 -0
  46. chunkr_ai/types/task_extract_updated_webhook_event.py +22 -0
  47. chunkr_ai/types/task_get_params.py +0 -3
  48. chunkr_ai/types/task_list_params.py +7 -1
  49. chunkr_ai/types/task_parse_updated_webhook_event.py +22 -0
  50. chunkr_ai/types/task_response.py +68 -0
  51. chunkr_ai/types/tasks/__init__.py +7 -1
  52. chunkr_ai/types/tasks/extract_create_params.py +47 -0
  53. chunkr_ai/types/tasks/extract_create_response.py +67 -0
  54. chunkr_ai/types/tasks/extract_get_params.py +18 -0
  55. chunkr_ai/types/tasks/extract_get_response.py +67 -0
  56. chunkr_ai/types/tasks/parse_create_params.py +25 -793
  57. chunkr_ai/types/tasks/parse_create_response.py +55 -0
  58. chunkr_ai/types/tasks/parse_get_params.py +18 -0
  59. chunkr_ai/types/tasks/parse_get_response.py +55 -0
  60. chunkr_ai/types/unwrap_webhook_event.py +11 -0
  61. chunkr_ai/types/version_info.py +31 -0
  62. chunkr_ai/types/webhook_url_response.py +9 -0
  63. {chunkr_ai-0.1.0a6.dist-info → chunkr_ai-0.1.0a8.dist-info}/METADATA +14 -13
  64. chunkr_ai-0.1.0a8.dist-info/RECORD +88 -0
  65. chunkr_ai/types/task.py +0 -1225
  66. chunkr_ai/types/tasks/parse_update_params.py +0 -845
  67. chunkr_ai-0.1.0a6.dist-info/RECORD +0 -52
  68. {chunkr_ai-0.1.0a6.dist-info → chunkr_ai-0.1.0a8.dist-info}/WHEEL +0 -0
  69. {chunkr_ai-0.1.0a6.dist-info → chunkr_ai-0.1.0a8.dist-info}/licenses/LICENSE +0 -0
@@ -17,9 +17,13 @@ from ..._response import (
17
17
  async_to_raw_response_wrapper,
18
18
  async_to_streamed_response_wrapper,
19
19
  )
20
- from ...types.task import Task
21
- from ...types.tasks import parse_create_params, parse_update_params
20
+ from ...types.tasks import parse_get_params, parse_create_params
22
21
  from ..._base_client import make_request_options
22
+ from ...types.llm_processing_param import LlmProcessingParam
23
+ from ...types.chunk_processing_param import ChunkProcessingParam
24
+ from ...types.segment_processing_param import SegmentProcessingParam
25
+ from ...types.tasks.parse_get_response import ParseGetResponse
26
+ from ...types.tasks.parse_create_response import ParseCreateResponse
23
27
 
24
28
  __all__ = ["ParseResource", "AsyncParseResource"]
25
29
 
@@ -48,15 +52,15 @@ class ParseResource(SyncAPIResource):
48
52
  self,
49
53
  *,
50
54
  file: str,
51
- chunk_processing: Optional[parse_create_params.ChunkProcessing] | NotGiven = NOT_GIVEN,
52
- error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
55
+ chunk_processing: ChunkProcessingParam | NotGiven = NOT_GIVEN,
56
+ error_handling: Literal["Fail", "Continue"] | NotGiven = NOT_GIVEN,
53
57
  expires_in: Optional[int] | NotGiven = NOT_GIVEN,
54
58
  file_name: Optional[str] | NotGiven = NOT_GIVEN,
55
- llm_processing: Optional[parse_create_params.LlmProcessing] | NotGiven = NOT_GIVEN,
56
- ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
57
- pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
58
- segment_processing: Optional[parse_create_params.SegmentProcessing] | NotGiven = NOT_GIVEN,
59
- segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
59
+ llm_processing: LlmProcessingParam | NotGiven = NOT_GIVEN,
60
+ ocr_strategy: Literal["All", "Auto"] | NotGiven = NOT_GIVEN,
61
+ pipeline: Literal["Azure", "Chunkr"] | NotGiven = NOT_GIVEN,
62
+ segment_processing: Optional[SegmentProcessingParam] | NotGiven = NOT_GIVEN,
63
+ segmentation_strategy: Literal["LayoutAnalysis", "Page"] | NotGiven = NOT_GIVEN,
60
64
  # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
61
65
  # The extra values given here take precedence over values defined on the client or passed to this method.
62
66
  extra_headers: Headers | None = None,
@@ -64,17 +68,17 @@ class ParseResource(SyncAPIResource):
64
68
  extra_body: Body | None = None,
65
69
  timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
66
70
  idempotency_key: str | None = None,
67
- ) -> Task:
71
+ ) -> ParseCreateResponse:
68
72
  """
69
73
  Queues a document for processing and returns a `TaskResponse` with the assigned
70
74
  `task_id`, initial configuration, file metadata, and timestamps. The initial
71
75
  status is `Starting`.
72
76
 
73
- Creates a task and returns its metadata immediately.
77
+ Creates a parse task and returns its metadata immediately.
74
78
 
75
79
  Args:
76
80
  file:
77
- The file to be uploaded. Supported inputs:
81
+ The file to be parsed. Supported inputs:
78
82
 
79
83
  - `ch://files/{file_id}`: Reference to an existing file. Upload via the Files
80
84
  API
@@ -93,7 +97,7 @@ class ParseResource(SyncAPIResource):
93
97
  expires_in: The number of seconds until task is deleted. Expired tasks can **not** be
94
98
  updated, polled or accessed via web interface.
95
99
 
96
- file_name: The name of the file to be uploaded. If not set a name will be generated.
100
+ file_name: The name of the file to be parsed. If not set a name will be generated.
97
101
 
98
102
  llm_processing: Controls the LLM used for the task.
99
103
 
@@ -104,41 +108,26 @@ class ParseResource(SyncAPIResource):
104
108
  text. When text layer is present the bounding boxes from the text layer are
105
109
  used.
106
110
 
107
- pipeline: Choose the provider whose models will be used for segmentation and OCR. The
108
- output will be unified to the Chunkr `output` format.
111
+ segment_processing: Configuration for how each document segment is processed and formatted.
109
112
 
110
- segment_processing: Defines how each segment type is handled when generating the final output.
113
+ Each segment has sensible defaults, but you can override specific settings:
111
114
 
112
- Each segment uses one of three strategies. The chosen strategy controls:
115
+ - `format`: Output as `Html` or `Markdown`
116
+ - `strategy`: `Auto` (rule-based), `LLM` (AI-generated), or `Ignore` (skip)
117
+ - `crop_image`: Whether to crop images to segment bounds
118
+ - `extended_context`: Use full page as context for LLM processing
119
+ - `description`: Generate descriptions for segments
113
120
 
114
- - Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
115
- - How the content is produced (rule-based vs. LLM).
116
- - The output format (`Html` or `Markdown`).
121
+ **Defaults per segment type:** Check the documentation for more details.
117
122
 
118
- Optional flags such as image **cropping**, **extended context**, and
119
- **descriptions** further refine behaviour.
120
-
121
- **Default strategy per segment**
122
-
123
- - `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` → **Auto**
124
- (Markdown, description off)
125
- - `Table` → **LLM** (HTML, description on)
126
- - `Picture` → **LLM** (Markdown, description off, cropping _All_)
127
- - `Formula`, `Page` → **LLM** (Markdown, description off)
128
- - `PageHeader`, `PageFooter` → **Ignore** (removed from output)
129
-
130
- **Strategy reference**
131
-
132
- - **Auto** – rule-based content generation.
133
- - **LLM** – generate content with an LLM.
134
- - **Ignore** – exclude the segment entirely.
123
+ Only specify the fields you want to change - everything else uses the defaults.
135
124
 
136
125
  segmentation_strategy:
137
126
  Controls the segmentation strategy:
138
127
 
139
128
  - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
140
129
  `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
141
- segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
130
+ segmentation and better chunking.
142
131
  - `Page`: Treats each page as a single segment. Faster processing, but without
143
132
  layout element detection and only simple chunking.
144
133
 
@@ -176,104 +165,39 @@ class ParseResource(SyncAPIResource):
176
165
  timeout=timeout,
177
166
  idempotency_key=idempotency_key,
178
167
  ),
179
- cast_to=Task,
168
+ cast_to=ParseCreateResponse,
180
169
  )
181
170
 
182
- def update(
171
+ def get(
183
172
  self,
184
- task_id: str,
173
+ task_id: Optional[str],
185
174
  *,
186
- chunk_processing: Optional[parse_update_params.ChunkProcessing] | NotGiven = NOT_GIVEN,
187
- error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
188
- expires_in: Optional[int] | NotGiven = NOT_GIVEN,
189
- high_resolution: Optional[bool] | NotGiven = NOT_GIVEN,
190
- llm_processing: Optional[parse_update_params.LlmProcessing] | NotGiven = NOT_GIVEN,
191
- ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
192
- pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
193
- segment_processing: Optional[parse_update_params.SegmentProcessing] | NotGiven = NOT_GIVEN,
194
- segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
175
+ base64_urls: bool | NotGiven = NOT_GIVEN,
176
+ include_chunks: bool | NotGiven = NOT_GIVEN,
195
177
  # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
196
178
  # The extra values given here take precedence over values defined on the client or passed to this method.
197
179
  extra_headers: Headers | None = None,
198
180
  extra_query: Query | None = None,
199
181
  extra_body: Body | None = None,
200
182
  timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
201
- idempotency_key: str | None = None,
202
- ) -> Task:
203
- """Updates an existing task's configuration and reprocesses the document.
204
-
205
- The
206
- current configuration is used as the base; only provided fields are changed.
183
+ ) -> ParseGetResponse:
184
+ """
185
+ Retrieves the current state of a parse task.
207
186
 
208
- Requirements:
187
+ Returns task details such as processing status, configuration, output (when
188
+ available), file metadata, and timestamps.
209
189
 
210
- - Task must be in a terminal state (`Succeeded` or `Failed`).
211
- - The new configuration must differ from the current configuration.
190
+ Typical uses:
212
191
 
213
- Updates a task and returns its new metadata immediately.
192
+ - Poll a task during processing
193
+ - Retrieve the final output once processing is complete
194
+ - Access task metadata and configuration
214
195
 
215
196
  Args:
216
- chunk_processing: Controls the setting for the chunking and post-processing of each chunk.
217
-
218
- error_handling:
219
- Controls how errors are handled during processing:
220
-
221
- - `Fail`: Stops processing and fails the task when any error occurs
222
- - `Continue`: Attempts to continue processing despite non-critical errors (eg.
223
- LLM refusals etc.)
224
-
225
- expires_in: The number of seconds until task is deleted. Expired tasks can **not** be
226
- updated, polled or accessed via web interface.
227
-
228
- high_resolution: Whether to use high-resolution images for cropping and post-processing. (Latency
229
- penalty: ~7 seconds per page)
230
-
231
- llm_processing: Controls the LLM used for the task.
232
-
233
- ocr_strategy: Controls the Optical Character Recognition (OCR) strategy.
234
-
235
- - `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
236
- - `Auto`: Selectively applies OCR only to pages with missing or low-quality
237
- text. When text layer is present the bounding boxes from the text layer are
238
- used.
239
-
240
- pipeline: Choose the provider whose models will be used for segmentation and OCR. The
241
- output will be unified to the Chunkr `output` format.
197
+ base64_urls: Whether to return base64 encoded URLs. If false, the URLs will be returned as
198
+ presigned URLs.
242
199
 
243
- segment_processing: Defines how each segment type is handled when generating the final output.
244
-
245
- Each segment uses one of three strategies. The chosen strategy controls:
246
-
247
- - Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
248
- - How the content is produced (rule-based vs. LLM).
249
- - The output format (`Html` or `Markdown`).
250
-
251
- Optional flags such as image **cropping**, **extended context**, and
252
- **descriptions** further refine behaviour.
253
-
254
- **Default strategy per segment**
255
-
256
- - `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` → **Auto**
257
- (Markdown, description off)
258
- - `Table` → **LLM** (HTML, description on)
259
- - `Picture` → **LLM** (Markdown, description off, cropping _All_)
260
- - `Formula`, `Page` → **LLM** (Markdown, description off)
261
- - `PageHeader`, `PageFooter` → **Ignore** (removed from output)
262
-
263
- **Strategy reference**
264
-
265
- - **Auto** – rule-based content generation.
266
- - **LLM** – generate content with an LLM.
267
- - **Ignore** – exclude the segment entirely.
268
-
269
- segmentation_strategy:
270
- Controls the segmentation strategy:
271
-
272
- - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
273
- `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
274
- segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
275
- - `Page`: Treats each page as a single segment. Faster processing, but without
276
- layout element detection and only simple chunking.
200
+ include_chunks: Whether to include chunks in the output response
277
201
 
278
202
  extra_headers: Send extra headers
279
203
 
@@ -282,35 +206,25 @@ class ParseResource(SyncAPIResource):
282
206
  extra_body: Add additional JSON properties to the request
283
207
 
284
208
  timeout: Override the client-level default timeout for this request, in seconds
285
-
286
- idempotency_key: Specify a custom idempotency key for this request
287
209
  """
288
210
  if not task_id:
289
211
  raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}")
290
- return self._patch(
291
- f"/tasks/parse/{task_id}",
292
- body=maybe_transform(
293
- {
294
- "chunk_processing": chunk_processing,
295
- "error_handling": error_handling,
296
- "expires_in": expires_in,
297
- "high_resolution": high_resolution,
298
- "llm_processing": llm_processing,
299
- "ocr_strategy": ocr_strategy,
300
- "pipeline": pipeline,
301
- "segment_processing": segment_processing,
302
- "segmentation_strategy": segmentation_strategy,
303
- },
304
- parse_update_params.ParseUpdateParams,
305
- ),
212
+ return self._get(
213
+ f"/tasks/{task_id}/parse",
306
214
  options=make_request_options(
307
215
  extra_headers=extra_headers,
308
216
  extra_query=extra_query,
309
217
  extra_body=extra_body,
310
218
  timeout=timeout,
311
- idempotency_key=idempotency_key,
219
+ query=maybe_transform(
220
+ {
221
+ "base64_urls": base64_urls,
222
+ "include_chunks": include_chunks,
223
+ },
224
+ parse_get_params.ParseGetParams,
225
+ ),
312
226
  ),
313
- cast_to=Task,
227
+ cast_to=ParseGetResponse,
314
228
  )
315
229
 
316
230
 
@@ -338,15 +252,15 @@ class AsyncParseResource(AsyncAPIResource):
338
252
  self,
339
253
  *,
340
254
  file: str,
341
- chunk_processing: Optional[parse_create_params.ChunkProcessing] | NotGiven = NOT_GIVEN,
342
- error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
255
+ chunk_processing: ChunkProcessingParam | NotGiven = NOT_GIVEN,
256
+ error_handling: Literal["Fail", "Continue"] | NotGiven = NOT_GIVEN,
343
257
  expires_in: Optional[int] | NotGiven = NOT_GIVEN,
344
258
  file_name: Optional[str] | NotGiven = NOT_GIVEN,
345
- llm_processing: Optional[parse_create_params.LlmProcessing] | NotGiven = NOT_GIVEN,
346
- ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
347
- pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
348
- segment_processing: Optional[parse_create_params.SegmentProcessing] | NotGiven = NOT_GIVEN,
349
- segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
259
+ llm_processing: LlmProcessingParam | NotGiven = NOT_GIVEN,
260
+ ocr_strategy: Literal["All", "Auto"] | NotGiven = NOT_GIVEN,
261
+ pipeline: Literal["Azure", "Chunkr"] | NotGiven = NOT_GIVEN,
262
+ segment_processing: Optional[SegmentProcessingParam] | NotGiven = NOT_GIVEN,
263
+ segmentation_strategy: Literal["LayoutAnalysis", "Page"] | NotGiven = NOT_GIVEN,
350
264
  # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
351
265
  # The extra values given here take precedence over values defined on the client or passed to this method.
352
266
  extra_headers: Headers | None = None,
@@ -354,17 +268,17 @@ class AsyncParseResource(AsyncAPIResource):
354
268
  extra_body: Body | None = None,
355
269
  timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
356
270
  idempotency_key: str | None = None,
357
- ) -> Task:
271
+ ) -> ParseCreateResponse:
358
272
  """
359
273
  Queues a document for processing and returns a `TaskResponse` with the assigned
360
274
  `task_id`, initial configuration, file metadata, and timestamps. The initial
361
275
  status is `Starting`.
362
276
 
363
- Creates a task and returns its metadata immediately.
277
+ Creates a parse task and returns its metadata immediately.
364
278
 
365
279
  Args:
366
280
  file:
367
- The file to be uploaded. Supported inputs:
281
+ The file to be parsed. Supported inputs:
368
282
 
369
283
  - `ch://files/{file_id}`: Reference to an existing file. Upload via the Files
370
284
  API
@@ -383,7 +297,7 @@ class AsyncParseResource(AsyncAPIResource):
383
297
  expires_in: The number of seconds until task is deleted. Expired tasks can **not** be
384
298
  updated, polled or accessed via web interface.
385
299
 
386
- file_name: The name of the file to be uploaded. If not set a name will be generated.
300
+ file_name: The name of the file to be parsed. If not set a name will be generated.
387
301
 
388
302
  llm_processing: Controls the LLM used for the task.
389
303
 
@@ -394,41 +308,26 @@ class AsyncParseResource(AsyncAPIResource):
394
308
  text. When text layer is present the bounding boxes from the text layer are
395
309
  used.
396
310
 
397
- pipeline: Choose the provider whose models will be used for segmentation and OCR. The
398
- output will be unified to the Chunkr `output` format.
311
+ segment_processing: Configuration for how each document segment is processed and formatted.
399
312
 
400
- segment_processing: Defines how each segment type is handled when generating the final output.
313
+ Each segment has sensible defaults, but you can override specific settings:
401
314
 
402
- Each segment uses one of three strategies. The chosen strategy controls:
315
+ - `format`: Output as `Html` or `Markdown`
316
+ - `strategy`: `Auto` (rule-based), `LLM` (AI-generated), or `Ignore` (skip)
317
+ - `crop_image`: Whether to crop images to segment bounds
318
+ - `extended_context`: Use full page as context for LLM processing
319
+ - `description`: Generate descriptions for segments
403
320
 
404
- - Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
405
- - How the content is produced (rule-based vs. LLM).
406
- - The output format (`Html` or `Markdown`).
321
+ **Defaults per segment type:** Check the documentation for more details.
407
322
 
408
- Optional flags such as image **cropping**, **extended context**, and
409
- **descriptions** further refine behaviour.
410
-
411
- **Default strategy per segment**
412
-
413
- - `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` → **Auto**
414
- (Markdown, description off)
415
- - `Table` → **LLM** (HTML, description on)
416
- - `Picture` → **LLM** (Markdown, description off, cropping _All_)
417
- - `Formula`, `Page` → **LLM** (Markdown, description off)
418
- - `PageHeader`, `PageFooter` → **Ignore** (removed from output)
419
-
420
- **Strategy reference**
421
-
422
- - **Auto** – rule-based content generation.
423
- - **LLM** – generate content with an LLM.
424
- - **Ignore** – exclude the segment entirely.
323
+ Only specify the fields you want to change - everything else uses the defaults.
425
324
 
426
325
  segmentation_strategy:
427
326
  Controls the segmentation strategy:
428
327
 
429
328
  - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
430
329
  `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
431
- segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
330
+ segmentation and better chunking.
432
331
  - `Page`: Treats each page as a single segment. Faster processing, but without
433
332
  layout element detection and only simple chunking.
434
333
 
@@ -466,104 +365,39 @@ class AsyncParseResource(AsyncAPIResource):
466
365
  timeout=timeout,
467
366
  idempotency_key=idempotency_key,
468
367
  ),
469
- cast_to=Task,
368
+ cast_to=ParseCreateResponse,
470
369
  )
471
370
 
472
- async def update(
371
+ async def get(
473
372
  self,
474
- task_id: str,
373
+ task_id: Optional[str],
475
374
  *,
476
- chunk_processing: Optional[parse_update_params.ChunkProcessing] | NotGiven = NOT_GIVEN,
477
- error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
478
- expires_in: Optional[int] | NotGiven = NOT_GIVEN,
479
- high_resolution: Optional[bool] | NotGiven = NOT_GIVEN,
480
- llm_processing: Optional[parse_update_params.LlmProcessing] | NotGiven = NOT_GIVEN,
481
- ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
482
- pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
483
- segment_processing: Optional[parse_update_params.SegmentProcessing] | NotGiven = NOT_GIVEN,
484
- segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
375
+ base64_urls: bool | NotGiven = NOT_GIVEN,
376
+ include_chunks: bool | NotGiven = NOT_GIVEN,
485
377
  # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
486
378
  # The extra values given here take precedence over values defined on the client or passed to this method.
487
379
  extra_headers: Headers | None = None,
488
380
  extra_query: Query | None = None,
489
381
  extra_body: Body | None = None,
490
382
  timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
491
- idempotency_key: str | None = None,
492
- ) -> Task:
493
- """Updates an existing task's configuration and reprocesses the document.
494
-
495
- The
496
- current configuration is used as the base; only provided fields are changed.
383
+ ) -> ParseGetResponse:
384
+ """
385
+ Retrieves the current state of a parse task.
497
386
 
498
- Requirements:
387
+ Returns task details such as processing status, configuration, output (when
388
+ available), file metadata, and timestamps.
499
389
 
500
- - Task must be in a terminal state (`Succeeded` or `Failed`).
501
- - The new configuration must differ from the current configuration.
390
+ Typical uses:
502
391
 
503
- Updates a task and returns its new metadata immediately.
392
+ - Poll a task during processing
393
+ - Retrieve the final output once processing is complete
394
+ - Access task metadata and configuration
504
395
 
505
396
  Args:
506
- chunk_processing: Controls the setting for the chunking and post-processing of each chunk.
507
-
508
- error_handling:
509
- Controls how errors are handled during processing:
510
-
511
- - `Fail`: Stops processing and fails the task when any error occurs
512
- - `Continue`: Attempts to continue processing despite non-critical errors (eg.
513
- LLM refusals etc.)
514
-
515
- expires_in: The number of seconds until task is deleted. Expired tasks can **not** be
516
- updated, polled or accessed via web interface.
517
-
518
- high_resolution: Whether to use high-resolution images for cropping and post-processing. (Latency
519
- penalty: ~7 seconds per page)
520
-
521
- llm_processing: Controls the LLM used for the task.
522
-
523
- ocr_strategy: Controls the Optical Character Recognition (OCR) strategy.
524
-
525
- - `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
526
- - `Auto`: Selectively applies OCR only to pages with missing or low-quality
527
- text. When text layer is present the bounding boxes from the text layer are
528
- used.
529
-
530
- pipeline: Choose the provider whose models will be used for segmentation and OCR. The
531
- output will be unified to the Chunkr `output` format.
397
+ base64_urls: Whether to return base64 encoded URLs. If false, the URLs will be returned as
398
+ presigned URLs.
532
399
 
533
- segment_processing: Defines how each segment type is handled when generating the final output.
534
-
535
- Each segment uses one of three strategies. The chosen strategy controls:
536
-
537
- - Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
538
- - How the content is produced (rule-based vs. LLM).
539
- - The output format (`Html` or `Markdown`).
540
-
541
- Optional flags such as image **cropping**, **extended context**, and
542
- **descriptions** further refine behaviour.
543
-
544
- **Default strategy per segment**
545
-
546
- - `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` → **Auto**
547
- (Markdown, description off)
548
- - `Table` → **LLM** (HTML, description on)
549
- - `Picture` → **LLM** (Markdown, description off, cropping _All_)
550
- - `Formula`, `Page` → **LLM** (Markdown, description off)
551
- - `PageHeader`, `PageFooter` → **Ignore** (removed from output)
552
-
553
- **Strategy reference**
554
-
555
- - **Auto** – rule-based content generation.
556
- - **LLM** – generate content with an LLM.
557
- - **Ignore** – exclude the segment entirely.
558
-
559
- segmentation_strategy:
560
- Controls the segmentation strategy:
561
-
562
- - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
563
- `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
564
- segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
565
- - `Page`: Treats each page as a single segment. Faster processing, but without
566
- layout element detection and only simple chunking.
400
+ include_chunks: Whether to include chunks in the output response
567
401
 
568
402
  extra_headers: Send extra headers
569
403
 
@@ -572,35 +406,25 @@ class AsyncParseResource(AsyncAPIResource):
572
406
  extra_body: Add additional JSON properties to the request
573
407
 
574
408
  timeout: Override the client-level default timeout for this request, in seconds
575
-
576
- idempotency_key: Specify a custom idempotency key for this request
577
409
  """
578
410
  if not task_id:
579
411
  raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}")
580
- return await self._patch(
581
- f"/tasks/parse/{task_id}",
582
- body=await async_maybe_transform(
583
- {
584
- "chunk_processing": chunk_processing,
585
- "error_handling": error_handling,
586
- "expires_in": expires_in,
587
- "high_resolution": high_resolution,
588
- "llm_processing": llm_processing,
589
- "ocr_strategy": ocr_strategy,
590
- "pipeline": pipeline,
591
- "segment_processing": segment_processing,
592
- "segmentation_strategy": segmentation_strategy,
593
- },
594
- parse_update_params.ParseUpdateParams,
595
- ),
412
+ return await self._get(
413
+ f"/tasks/{task_id}/parse",
596
414
  options=make_request_options(
597
415
  extra_headers=extra_headers,
598
416
  extra_query=extra_query,
599
417
  extra_body=extra_body,
600
418
  timeout=timeout,
601
- idempotency_key=idempotency_key,
419
+ query=await async_maybe_transform(
420
+ {
421
+ "base64_urls": base64_urls,
422
+ "include_chunks": include_chunks,
423
+ },
424
+ parse_get_params.ParseGetParams,
425
+ ),
602
426
  ),
603
- cast_to=Task,
427
+ cast_to=ParseGetResponse,
604
428
  )
605
429
 
606
430
 
@@ -611,8 +435,8 @@ class ParseResourceWithRawResponse:
611
435
  self.create = to_raw_response_wrapper(
612
436
  parse.create,
613
437
  )
614
- self.update = to_raw_response_wrapper(
615
- parse.update,
438
+ self.get = to_raw_response_wrapper(
439
+ parse.get,
616
440
  )
617
441
 
618
442
 
@@ -623,8 +447,8 @@ class AsyncParseResourceWithRawResponse:
623
447
  self.create = async_to_raw_response_wrapper(
624
448
  parse.create,
625
449
  )
626
- self.update = async_to_raw_response_wrapper(
627
- parse.update,
450
+ self.get = async_to_raw_response_wrapper(
451
+ parse.get,
628
452
  )
629
453
 
630
454
 
@@ -635,8 +459,8 @@ class ParseResourceWithStreamingResponse:
635
459
  self.create = to_streamed_response_wrapper(
636
460
  parse.create,
637
461
  )
638
- self.update = to_streamed_response_wrapper(
639
- parse.update,
462
+ self.get = to_streamed_response_wrapper(
463
+ parse.get,
640
464
  )
641
465
 
642
466
 
@@ -647,6 +471,6 @@ class AsyncParseResourceWithStreamingResponse:
647
471
  self.create = async_to_streamed_response_wrapper(
648
472
  parse.create,
649
473
  )
650
- self.update = async_to_streamed_response_wrapper(
651
- parse.update,
474
+ self.get = async_to_streamed_response_wrapper(
475
+ parse.get,
652
476
  )