chunkr-ai 0.1.0a5__py3-none-any.whl → 0.1.0a7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. chunkr_ai/__init__.py +2 -0
  2. chunkr_ai/_client.py +31 -3
  3. chunkr_ai/_constants.py +5 -5
  4. chunkr_ai/_exceptions.py +4 -0
  5. chunkr_ai/_models.py +1 -1
  6. chunkr_ai/_types.py +35 -1
  7. chunkr_ai/_utils/__init__.py +1 -0
  8. chunkr_ai/_utils/_typing.py +5 -0
  9. chunkr_ai/_version.py +1 -1
  10. chunkr_ai/resources/__init__.py +14 -0
  11. chunkr_ai/resources/files.py +3 -3
  12. chunkr_ai/resources/tasks/__init__.py +14 -0
  13. chunkr_ai/resources/tasks/extract.py +409 -0
  14. chunkr_ai/resources/tasks/parse.py +102 -346
  15. chunkr_ai/resources/tasks/tasks.py +62 -14
  16. chunkr_ai/resources/webhooks.py +193 -0
  17. chunkr_ai/types/__init__.py +27 -1
  18. chunkr_ai/types/bounding_box.py +19 -0
  19. chunkr_ai/types/cell.py +39 -0
  20. chunkr_ai/types/cell_style.py +28 -0
  21. chunkr_ai/types/chunk.py +40 -0
  22. chunkr_ai/types/chunk_processing.py +40 -0
  23. chunkr_ai/types/chunk_processing_param.py +42 -0
  24. chunkr_ai/types/extract_configuration.py +24 -0
  25. chunkr_ai/types/extract_output_response.py +19 -0
  26. chunkr_ai/types/file_create_params.py +2 -1
  27. chunkr_ai/types/file_info.py +21 -0
  28. chunkr_ai/types/generation_config.py +29 -0
  29. chunkr_ai/types/generation_config_param.py +29 -0
  30. chunkr_ai/types/llm_processing.py +36 -0
  31. chunkr_ai/types/llm_processing_param.py +36 -0
  32. chunkr_ai/types/ocr_result.py +28 -0
  33. chunkr_ai/types/page.py +27 -0
  34. chunkr_ai/types/parse_configuration.py +64 -0
  35. chunkr_ai/types/parse_configuration_param.py +65 -0
  36. chunkr_ai/types/parse_output_response.py +29 -0
  37. chunkr_ai/types/segment.py +109 -0
  38. chunkr_ai/types/segment_processing.py +228 -0
  39. chunkr_ai/types/segment_processing_param.py +229 -0
  40. chunkr_ai/types/task_extract_updated_webhook_event.py +22 -0
  41. chunkr_ai/types/task_list_params.py +7 -1
  42. chunkr_ai/types/task_parse_updated_webhook_event.py +22 -0
  43. chunkr_ai/types/task_response.py +68 -0
  44. chunkr_ai/types/tasks/__init__.py +7 -1
  45. chunkr_ai/types/tasks/extract_create_params.py +47 -0
  46. chunkr_ai/types/tasks/extract_create_response.py +214 -0
  47. chunkr_ai/types/tasks/extract_get_params.py +21 -0
  48. chunkr_ai/types/tasks/extract_get_response.py +214 -0
  49. chunkr_ai/types/tasks/parse_create_params.py +25 -805
  50. chunkr_ai/types/tasks/parse_create_response.py +55 -0
  51. chunkr_ai/types/tasks/parse_get_params.py +21 -0
  52. chunkr_ai/types/tasks/parse_get_response.py +55 -0
  53. chunkr_ai/types/unwrap_webhook_event.py +11 -0
  54. chunkr_ai/types/version_info.py +31 -0
  55. chunkr_ai/types/webhook_url_response.py +9 -0
  56. {chunkr_ai-0.1.0a5.dist-info → chunkr_ai-0.1.0a7.dist-info}/METADATA +14 -13
  57. chunkr_ai-0.1.0a7.dist-info/RECORD +86 -0
  58. chunkr_ai/types/task.py +0 -1225
  59. chunkr_ai/types/tasks/parse_update_params.py +0 -857
  60. chunkr_ai-0.1.0a5.dist-info/RECORD +0 -52
  61. {chunkr_ai-0.1.0a5.dist-info → chunkr_ai-0.1.0a7.dist-info}/WHEEL +0 -0
  62. {chunkr_ai-0.1.0a5.dist-info → chunkr_ai-0.1.0a7.dist-info}/licenses/LICENSE +0 -0
@@ -17,9 +17,13 @@ from ..._response import (
17
17
  async_to_raw_response_wrapper,
18
18
  async_to_streamed_response_wrapper,
19
19
  )
20
- from ...types.task import Task
21
- from ...types.tasks import parse_create_params, parse_update_params
20
+ from ...types.tasks import parse_get_params, parse_create_params
22
21
  from ..._base_client import make_request_options
22
+ from ...types.llm_processing_param import LlmProcessingParam
23
+ from ...types.chunk_processing_param import ChunkProcessingParam
24
+ from ...types.segment_processing_param import SegmentProcessingParam
25
+ from ...types.tasks.parse_get_response import ParseGetResponse
26
+ from ...types.tasks.parse_create_response import ParseCreateResponse
23
27
 
24
28
  __all__ = ["ParseResource", "AsyncParseResource"]
25
29
 
@@ -48,18 +52,15 @@ class ParseResource(SyncAPIResource):
48
52
  self,
49
53
  *,
50
54
  file: str,
51
- base64_urls: bool | NotGiven = NOT_GIVEN,
52
- include_chunks: bool | NotGiven = NOT_GIVEN,
53
- wait_for_completion: bool | NotGiven = NOT_GIVEN,
54
- chunk_processing: Optional[parse_create_params.ChunkProcessing] | NotGiven = NOT_GIVEN,
55
- error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
55
+ chunk_processing: ChunkProcessingParam | NotGiven = NOT_GIVEN,
56
+ error_handling: Literal["Fail", "Continue"] | NotGiven = NOT_GIVEN,
56
57
  expires_in: Optional[int] | NotGiven = NOT_GIVEN,
57
58
  file_name: Optional[str] | NotGiven = NOT_GIVEN,
58
- llm_processing: Optional[parse_create_params.LlmProcessing] | NotGiven = NOT_GIVEN,
59
- ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
60
- pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
61
- segment_processing: Optional[parse_create_params.SegmentProcessing] | NotGiven = NOT_GIVEN,
62
- segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
59
+ llm_processing: LlmProcessingParam | NotGiven = NOT_GIVEN,
60
+ ocr_strategy: Literal["All", "Auto"] | NotGiven = NOT_GIVEN,
61
+ pipeline: Literal["Azure", "Chunkr"] | NotGiven = NOT_GIVEN,
62
+ segment_processing: Optional[SegmentProcessingParam] | NotGiven = NOT_GIVEN,
63
+ segmentation_strategy: Literal["LayoutAnalysis", "Page"] | NotGiven = NOT_GIVEN,
63
64
  # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
64
65
  # The extra values given here take precedence over values defined on the client or passed to this method.
65
66
  extra_headers: Headers | None = None,
@@ -67,33 +68,23 @@ class ParseResource(SyncAPIResource):
67
68
  extra_body: Body | None = None,
68
69
  timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
69
70
  idempotency_key: str | None = None,
70
- ) -> Task:
71
+ ) -> ParseCreateResponse:
71
72
  """
72
73
  Queues a document for processing and returns a `TaskResponse` with the assigned
73
74
  `task_id`, initial configuration, file metadata, and timestamps. The initial
74
75
  status is `Starting`.
75
76
 
76
- If `wait_for_completion=true` is provided, the server waits briefly for
77
- completion. If the task completes within that window, a 200 response with the
78
- final `TaskResponse` is returned. Otherwise, the server returns a 408 or 409
79
- with retry guidance and a body describing how long to wait before retrying.
77
+ Creates a parse task and returns its metadata immediately.
80
78
 
81
79
  Args:
82
80
  file:
83
- The file to be uploaded. Supported inputs:
81
+ The file to be parsed. Supported inputs:
84
82
 
85
83
  - `ch://files/{file_id}`: Reference to an existing file. Upload via the Files
86
84
  API
87
85
  - `http(s)://...`: Remote URL to fetch
88
86
  - `data:*;base64,...` or raw base64 string
89
87
 
90
- base64_urls: Whether to return base64 encoded URLs. If false, presigned URLs are returned.
91
-
92
- include_chunks: Whether to include chunks in the output response
93
-
94
- wait_for_completion: If true, server holds briefly and may return 200 when done; otherwise returns
95
- 408/409 with Retry-After headers
96
-
97
88
  chunk_processing: Controls the setting for the chunking and post-processing of each chunk.
98
89
 
99
90
  error_handling:
@@ -106,7 +97,7 @@ class ParseResource(SyncAPIResource):
106
97
  expires_in: The number of seconds until task is deleted. Expired tasks can **not** be
107
98
  updated, polled or accessed via web interface.
108
99
 
109
- file_name: The name of the file to be uploaded. If not set a name will be generated.
100
+ file_name: The name of the file to be parsed. If not set a name will be generated.
110
101
 
111
102
  llm_processing: Controls the LLM used for the task.
112
103
 
@@ -117,41 +108,26 @@ class ParseResource(SyncAPIResource):
117
108
  text. When text layer is present the bounding boxes from the text layer are
118
109
  used.
119
110
 
120
- pipeline: Choose the provider whose models will be used for segmentation and OCR. The
121
- output will be unified to the Chunkr `output` format.
122
-
123
- segment_processing: Defines how each segment type is handled when generating the final output.
124
-
125
- Each segment uses one of three strategies. The chosen strategy controls:
126
-
127
- - Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
128
- - How the content is produced (rule-based vs. LLM).
129
- - The output format (`Html` or `Markdown`).
111
+ segment_processing: Configuration for how each document segment is processed and formatted.
130
112
 
131
- Optional flags such as image **cropping**, **extended context**, and
132
- **descriptions** further refine behaviour.
113
+ Each segment has sensible defaults, but you can override specific settings:
133
114
 
134
- **Default strategy per segment**
115
+ - `format`: Output as `Html` or `Markdown`
116
+ - `strategy`: `Auto` (rule-based), `LLM` (AI-generated), or `Ignore` (skip)
117
+ - `crop_image`: Whether to crop images to segment bounds
118
+ - `extended_context`: Use full page as context for LLM processing
119
+ - `description`: Generate descriptions for segments
135
120
 
136
- `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` → **Auto**
137
- (Markdown, description off)
138
- - `Table` → **LLM** (HTML, description on)
139
- - `Picture` → **LLM** (Markdown, description off, cropping _All_)
140
- - `Formula`, `Page` → **LLM** (Markdown, description off)
141
- - `PageHeader`, `PageFooter` → **Ignore** (removed from output)
121
+ **Defaults per segment type:** Check the documentation for more details.
142
122
 
143
- **Strategy reference**
144
-
145
- - **Auto** – rule-based content generation.
146
- - **LLM** – generate content with an LLM.
147
- - **Ignore** – exclude the segment entirely.
123
+ Only specify the fields you want to change - everything else uses the defaults.
148
124
 
149
125
  segmentation_strategy:
150
126
  Controls the segmentation strategy:
151
127
 
152
128
  - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
153
129
  `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
154
- segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
130
+ segmentation and better chunking.
155
131
  - `Page`: Treats each page as a single segment. Faster processing, but without
156
132
  layout element detection and only simple chunking.
157
133
 
@@ -188,126 +164,47 @@ class ParseResource(SyncAPIResource):
188
164
  extra_body=extra_body,
189
165
  timeout=timeout,
190
166
  idempotency_key=idempotency_key,
191
- query=maybe_transform(
192
- {
193
- "base64_urls": base64_urls,
194
- "include_chunks": include_chunks,
195
- "wait_for_completion": wait_for_completion,
196
- },
197
- parse_create_params.ParseCreateParams,
198
- ),
199
167
  ),
200
- cast_to=Task,
168
+ cast_to=ParseCreateResponse,
201
169
  )
202
170
 
203
- def update(
171
+ def get(
204
172
  self,
205
- task_id: str,
173
+ task_id: Optional[str],
206
174
  *,
207
175
  base64_urls: bool | NotGiven = NOT_GIVEN,
208
176
  include_chunks: bool | NotGiven = NOT_GIVEN,
209
177
  wait_for_completion: bool | NotGiven = NOT_GIVEN,
210
- chunk_processing: Optional[parse_update_params.ChunkProcessing] | NotGiven = NOT_GIVEN,
211
- error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
212
- expires_in: Optional[int] | NotGiven = NOT_GIVEN,
213
- high_resolution: Optional[bool] | NotGiven = NOT_GIVEN,
214
- llm_processing: Optional[parse_update_params.LlmProcessing] | NotGiven = NOT_GIVEN,
215
- ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
216
- pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
217
- segment_processing: Optional[parse_update_params.SegmentProcessing] | NotGiven = NOT_GIVEN,
218
- segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
219
178
  # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
220
179
  # The extra values given here take precedence over values defined on the client or passed to this method.
221
180
  extra_headers: Headers | None = None,
222
181
  extra_query: Query | None = None,
223
182
  extra_body: Body | None = None,
224
183
  timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
225
- idempotency_key: str | None = None,
226
- ) -> Task:
227
- """Updates an existing task's configuration and reprocesses the document.
228
-
229
- The
230
- current configuration is used as the base; only provided fields are changed.
184
+ ) -> ParseGetResponse:
185
+ """
186
+ Retrieves the current state of a parse task and, when requested, can wait for
187
+ completion.
231
188
 
232
- Requirements:
189
+ Returns task details such as processing status, configuration, output (when
190
+ available), file metadata, and timestamps. If `wait_for_completion=true` is
191
+ provided, the server will hold the request briefly. If the task does not reach a
192
+ terminal state during that window, the response will indicate a retry with
193
+ appropriate headers.
233
194
 
234
- - Task must be in a terminal state (`Succeeded` or `Failed`).
235
- - The new configuration must differ from the current configuration.
195
+ Typical uses:
236
196
 
237
- If `wait_for_completion=true` is provided, the server waits briefly for
238
- completion. If the task completes within that window, a 200 response with the
239
- final `TaskResponse` is returned. Otherwise, the server returns a 408 with retry
240
- guidance and a body describing how long to wait before retrying.
197
+ - Poll a task during processing
198
+ - Retrieve the final output once processing is complete
199
+ - Access task metadata and configuration
241
200
 
242
201
  Args:
243
- base64_urls: Whether to return base64 encoded URLs. If false, presigned URLs are returned.
202
+ base64_urls: Whether to return base64 encoded URLs. If false, the URLs will be returned as
203
+ presigned URLs.
244
204
 
245
205
  include_chunks: Whether to include chunks in the output response
246
206
 
247
- wait_for_completion: If true, server holds briefly and may return 200 when done; otherwise returns
248
- 408/409 with Retry-After headers
249
-
250
- chunk_processing: Controls the setting for the chunking and post-processing of each chunk.
251
-
252
- error_handling:
253
- Controls how errors are handled during processing:
254
-
255
- - `Fail`: Stops processing and fails the task when any error occurs
256
- - `Continue`: Attempts to continue processing despite non-critical errors (eg.
257
- LLM refusals etc.)
258
-
259
- expires_in: The number of seconds until task is deleted. Expired tasks can **not** be
260
- updated, polled or accessed via web interface.
261
-
262
- high_resolution: Whether to use high-resolution images for cropping and post-processing. (Latency
263
- penalty: ~7 seconds per page)
264
-
265
- llm_processing: Controls the LLM used for the task.
266
-
267
- ocr_strategy: Controls the Optical Character Recognition (OCR) strategy.
268
-
269
- - `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
270
- - `Auto`: Selectively applies OCR only to pages with missing or low-quality
271
- text. When text layer is present the bounding boxes from the text layer are
272
- used.
273
-
274
- pipeline: Choose the provider whose models will be used for segmentation and OCR. The
275
- output will be unified to the Chunkr `output` format.
276
-
277
- segment_processing: Defines how each segment type is handled when generating the final output.
278
-
279
- Each segment uses one of three strategies. The chosen strategy controls:
280
-
281
- - Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
282
- - How the content is produced (rule-based vs. LLM).
283
- - The output format (`Html` or `Markdown`).
284
-
285
- Optional flags such as image **cropping**, **extended context**, and
286
- **descriptions** further refine behaviour.
287
-
288
- **Default strategy per segment**
289
-
290
- - `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` → **Auto**
291
- (Markdown, description off)
292
- - `Table` → **LLM** (HTML, description on)
293
- - `Picture` → **LLM** (Markdown, description off, cropping _All_)
294
- - `Formula`, `Page` → **LLM** (Markdown, description off)
295
- - `PageHeader`, `PageFooter` → **Ignore** (removed from output)
296
-
297
- **Strategy reference**
298
-
299
- - **Auto** – rule-based content generation.
300
- - **LLM** – generate content with an LLM.
301
- - **Ignore** – exclude the segment entirely.
302
-
303
- segmentation_strategy:
304
- Controls the segmentation strategy:
305
-
306
- - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
307
- `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
308
- segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
309
- - `Page`: Treats each page as a single segment. Faster processing, but without
310
- layout element detection and only simple chunking.
207
+ wait_for_completion: Whether to wait for the task to complete
311
208
 
312
209
  extra_headers: Send extra headers
313
210
 
@@ -316,43 +213,26 @@ class ParseResource(SyncAPIResource):
316
213
  extra_body: Add additional JSON properties to the request
317
214
 
318
215
  timeout: Override the client-level default timeout for this request, in seconds
319
-
320
- idempotency_key: Specify a custom idempotency key for this request
321
216
  """
322
217
  if not task_id:
323
218
  raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}")
324
- return self._patch(
325
- f"/tasks/parse/{task_id}",
326
- body=maybe_transform(
327
- {
328
- "chunk_processing": chunk_processing,
329
- "error_handling": error_handling,
330
- "expires_in": expires_in,
331
- "high_resolution": high_resolution,
332
- "llm_processing": llm_processing,
333
- "ocr_strategy": ocr_strategy,
334
- "pipeline": pipeline,
335
- "segment_processing": segment_processing,
336
- "segmentation_strategy": segmentation_strategy,
337
- },
338
- parse_update_params.ParseUpdateParams,
339
- ),
219
+ return self._get(
220
+ f"/tasks/{task_id}/parse",
340
221
  options=make_request_options(
341
222
  extra_headers=extra_headers,
342
223
  extra_query=extra_query,
343
224
  extra_body=extra_body,
344
225
  timeout=timeout,
345
- idempotency_key=idempotency_key,
346
226
  query=maybe_transform(
347
227
  {
348
228
  "base64_urls": base64_urls,
349
229
  "include_chunks": include_chunks,
350
230
  "wait_for_completion": wait_for_completion,
351
231
  },
352
- parse_update_params.ParseUpdateParams,
232
+ parse_get_params.ParseGetParams,
353
233
  ),
354
234
  ),
355
- cast_to=Task,
235
+ cast_to=ParseGetResponse,
356
236
  )
357
237
 
358
238
 
@@ -380,18 +260,15 @@ class AsyncParseResource(AsyncAPIResource):
380
260
  self,
381
261
  *,
382
262
  file: str,
383
- base64_urls: bool | NotGiven = NOT_GIVEN,
384
- include_chunks: bool | NotGiven = NOT_GIVEN,
385
- wait_for_completion: bool | NotGiven = NOT_GIVEN,
386
- chunk_processing: Optional[parse_create_params.ChunkProcessing] | NotGiven = NOT_GIVEN,
387
- error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
263
+ chunk_processing: ChunkProcessingParam | NotGiven = NOT_GIVEN,
264
+ error_handling: Literal["Fail", "Continue"] | NotGiven = NOT_GIVEN,
388
265
  expires_in: Optional[int] | NotGiven = NOT_GIVEN,
389
266
  file_name: Optional[str] | NotGiven = NOT_GIVEN,
390
- llm_processing: Optional[parse_create_params.LlmProcessing] | NotGiven = NOT_GIVEN,
391
- ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
392
- pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
393
- segment_processing: Optional[parse_create_params.SegmentProcessing] | NotGiven = NOT_GIVEN,
394
- segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
267
+ llm_processing: LlmProcessingParam | NotGiven = NOT_GIVEN,
268
+ ocr_strategy: Literal["All", "Auto"] | NotGiven = NOT_GIVEN,
269
+ pipeline: Literal["Azure", "Chunkr"] | NotGiven = NOT_GIVEN,
270
+ segment_processing: Optional[SegmentProcessingParam] | NotGiven = NOT_GIVEN,
271
+ segmentation_strategy: Literal["LayoutAnalysis", "Page"] | NotGiven = NOT_GIVEN,
395
272
  # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
396
273
  # The extra values given here take precedence over values defined on the client or passed to this method.
397
274
  extra_headers: Headers | None = None,
@@ -399,33 +276,23 @@ class AsyncParseResource(AsyncAPIResource):
399
276
  extra_body: Body | None = None,
400
277
  timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
401
278
  idempotency_key: str | None = None,
402
- ) -> Task:
279
+ ) -> ParseCreateResponse:
403
280
  """
404
281
  Queues a document for processing and returns a `TaskResponse` with the assigned
405
282
  `task_id`, initial configuration, file metadata, and timestamps. The initial
406
283
  status is `Starting`.
407
284
 
408
- If `wait_for_completion=true` is provided, the server waits briefly for
409
- completion. If the task completes within that window, a 200 response with the
410
- final `TaskResponse` is returned. Otherwise, the server returns a 408 or 409
411
- with retry guidance and a body describing how long to wait before retrying.
285
+ Creates a parse task and returns its metadata immediately.
412
286
 
413
287
  Args:
414
288
  file:
415
- The file to be uploaded. Supported inputs:
289
+ The file to be parsed. Supported inputs:
416
290
 
417
291
  - `ch://files/{file_id}`: Reference to an existing file. Upload via the Files
418
292
  API
419
293
  - `http(s)://...`: Remote URL to fetch
420
294
  - `data:*;base64,...` or raw base64 string
421
295
 
422
- base64_urls: Whether to return base64 encoded URLs. If false, presigned URLs are returned.
423
-
424
- include_chunks: Whether to include chunks in the output response
425
-
426
- wait_for_completion: If true, server holds briefly and may return 200 when done; otherwise returns
427
- 408/409 with Retry-After headers
428
-
429
296
  chunk_processing: Controls the setting for the chunking and post-processing of each chunk.
430
297
 
431
298
  error_handling:
@@ -438,7 +305,7 @@ class AsyncParseResource(AsyncAPIResource):
438
305
  expires_in: The number of seconds until task is deleted. Expired tasks can **not** be
439
306
  updated, polled or accessed via web interface.
440
307
 
441
- file_name: The name of the file to be uploaded. If not set a name will be generated.
308
+ file_name: The name of the file to be parsed. If not set a name will be generated.
442
309
 
443
310
  llm_processing: Controls the LLM used for the task.
444
311
 
@@ -449,41 +316,26 @@ class AsyncParseResource(AsyncAPIResource):
449
316
  text. When text layer is present the bounding boxes from the text layer are
450
317
  used.
451
318
 
452
- pipeline: Choose the provider whose models will be used for segmentation and OCR. The
453
- output will be unified to the Chunkr `output` format.
454
-
455
- segment_processing: Defines how each segment type is handled when generating the final output.
456
-
457
- Each segment uses one of three strategies. The chosen strategy controls:
458
-
459
- - Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
460
- - How the content is produced (rule-based vs. LLM).
461
- - The output format (`Html` or `Markdown`).
319
+ segment_processing: Configuration for how each document segment is processed and formatted.
462
320
 
463
- Optional flags such as image **cropping**, **extended context**, and
464
- **descriptions** further refine behaviour.
321
+ Each segment has sensible defaults, but you can override specific settings:
465
322
 
466
- **Default strategy per segment**
323
+ - `format`: Output as `Html` or `Markdown`
324
+ - `strategy`: `Auto` (rule-based), `LLM` (AI-generated), or `Ignore` (skip)
325
+ - `crop_image`: Whether to crop images to segment bounds
326
+ - `extended_context`: Use full page as context for LLM processing
327
+ - `description`: Generate descriptions for segments
467
328
 
468
- `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` → **Auto**
469
- (Markdown, description off)
470
- - `Table` → **LLM** (HTML, description on)
471
- - `Picture` → **LLM** (Markdown, description off, cropping _All_)
472
- - `Formula`, `Page` → **LLM** (Markdown, description off)
473
- - `PageHeader`, `PageFooter` → **Ignore** (removed from output)
329
+ **Defaults per segment type:** Check the documentation for more details.
474
330
 
475
- **Strategy reference**
476
-
477
- - **Auto** – rule-based content generation.
478
- - **LLM** – generate content with an LLM.
479
- - **Ignore** – exclude the segment entirely.
331
+ Only specify the fields you want to change - everything else uses the defaults.
480
332
 
481
333
  segmentation_strategy:
482
334
  Controls the segmentation strategy:
483
335
 
484
336
  - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
485
337
  `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
486
- segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
338
+ segmentation and better chunking.
487
339
  - `Page`: Treats each page as a single segment. Faster processing, but without
488
340
  layout element detection and only simple chunking.
489
341
 
@@ -520,126 +372,47 @@ class AsyncParseResource(AsyncAPIResource):
520
372
  extra_body=extra_body,
521
373
  timeout=timeout,
522
374
  idempotency_key=idempotency_key,
523
- query=await async_maybe_transform(
524
- {
525
- "base64_urls": base64_urls,
526
- "include_chunks": include_chunks,
527
- "wait_for_completion": wait_for_completion,
528
- },
529
- parse_create_params.ParseCreateParams,
530
- ),
531
375
  ),
532
- cast_to=Task,
376
+ cast_to=ParseCreateResponse,
533
377
  )
534
378
 
535
- async def update(
379
+ async def get(
536
380
  self,
537
- task_id: str,
381
+ task_id: Optional[str],
538
382
  *,
539
383
  base64_urls: bool | NotGiven = NOT_GIVEN,
540
384
  include_chunks: bool | NotGiven = NOT_GIVEN,
541
385
  wait_for_completion: bool | NotGiven = NOT_GIVEN,
542
- chunk_processing: Optional[parse_update_params.ChunkProcessing] | NotGiven = NOT_GIVEN,
543
- error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
544
- expires_in: Optional[int] | NotGiven = NOT_GIVEN,
545
- high_resolution: Optional[bool] | NotGiven = NOT_GIVEN,
546
- llm_processing: Optional[parse_update_params.LlmProcessing] | NotGiven = NOT_GIVEN,
547
- ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
548
- pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
549
- segment_processing: Optional[parse_update_params.SegmentProcessing] | NotGiven = NOT_GIVEN,
550
- segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
551
386
  # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
552
387
  # The extra values given here take precedence over values defined on the client or passed to this method.
553
388
  extra_headers: Headers | None = None,
554
389
  extra_query: Query | None = None,
555
390
  extra_body: Body | None = None,
556
391
  timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
557
- idempotency_key: str | None = None,
558
- ) -> Task:
559
- """Updates an existing task's configuration and reprocesses the document.
560
-
561
- The
562
- current configuration is used as the base; only provided fields are changed.
392
+ ) -> ParseGetResponse:
393
+ """
394
+ Retrieves the current state of a parse task and, when requested, can wait for
395
+ completion.
563
396
 
564
- Requirements:
397
+ Returns task details such as processing status, configuration, output (when
398
+ available), file metadata, and timestamps. If `wait_for_completion=true` is
399
+ provided, the server will hold the request briefly. If the task does not reach a
400
+ terminal state during that window, the response will indicate a retry with
401
+ appropriate headers.
565
402
 
566
- - Task must be in a terminal state (`Succeeded` or `Failed`).
567
- - The new configuration must differ from the current configuration.
403
+ Typical uses:
568
404
 
569
- If `wait_for_completion=true` is provided, the server waits briefly for
570
- completion. If the task completes within that window, a 200 response with the
571
- final `TaskResponse` is returned. Otherwise, the server returns a 408 with retry
572
- guidance and a body describing how long to wait before retrying.
405
+ - Poll a task during processing
406
+ - Retrieve the final output once processing is complete
407
+ - Access task metadata and configuration
573
408
 
574
409
  Args:
575
- base64_urls: Whether to return base64 encoded URLs. If false, presigned URLs are returned.
410
+ base64_urls: Whether to return base64 encoded URLs. If false, the URLs will be returned as
411
+ presigned URLs.
576
412
 
577
413
  include_chunks: Whether to include chunks in the output response
578
414
 
579
- wait_for_completion: If true, server holds briefly and may return 200 when done; otherwise returns
580
- 408/409 with Retry-After headers
581
-
582
- chunk_processing: Controls the setting for the chunking and post-processing of each chunk.
583
-
584
- error_handling:
585
- Controls how errors are handled during processing:
586
-
587
- - `Fail`: Stops processing and fails the task when any error occurs
588
- - `Continue`: Attempts to continue processing despite non-critical errors (eg.
589
- LLM refusals etc.)
590
-
591
- expires_in: The number of seconds until task is deleted. Expired tasks can **not** be
592
- updated, polled or accessed via web interface.
593
-
594
- high_resolution: Whether to use high-resolution images for cropping and post-processing. (Latency
595
- penalty: ~7 seconds per page)
596
-
597
- llm_processing: Controls the LLM used for the task.
598
-
599
- ocr_strategy: Controls the Optical Character Recognition (OCR) strategy.
600
-
601
- - `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
602
- - `Auto`: Selectively applies OCR only to pages with missing or low-quality
603
- text. When text layer is present the bounding boxes from the text layer are
604
- used.
605
-
606
- pipeline: Choose the provider whose models will be used for segmentation and OCR. The
607
- output will be unified to the Chunkr `output` format.
608
-
609
- segment_processing: Defines how each segment type is handled when generating the final output.
610
-
611
- Each segment uses one of three strategies. The chosen strategy controls:
612
-
613
- - Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
614
- - How the content is produced (rule-based vs. LLM).
615
- - The output format (`Html` or `Markdown`).
616
-
617
- Optional flags such as image **cropping**, **extended context**, and
618
- **descriptions** further refine behaviour.
619
-
620
- **Default strategy per segment**
621
-
622
- - `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` → **Auto**
623
- (Markdown, description off)
624
- - `Table` → **LLM** (HTML, description on)
625
- - `Picture` → **LLM** (Markdown, description off, cropping _All_)
626
- - `Formula`, `Page` → **LLM** (Markdown, description off)
627
- - `PageHeader`, `PageFooter` → **Ignore** (removed from output)
628
-
629
- **Strategy reference**
630
-
631
- - **Auto** – rule-based content generation.
632
- - **LLM** – generate content with an LLM.
633
- - **Ignore** – exclude the segment entirely.
634
-
635
- segmentation_strategy:
636
- Controls the segmentation strategy:
637
-
638
- - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
639
- `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
640
- segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
641
- - `Page`: Treats each page as a single segment. Faster processing, but without
642
- layout element detection and only simple chunking.
415
+ wait_for_completion: Whether to wait for the task to complete
643
416
 
644
417
  extra_headers: Send extra headers
645
418
 
@@ -648,43 +421,26 @@ class AsyncParseResource(AsyncAPIResource):
648
421
  extra_body: Add additional JSON properties to the request
649
422
 
650
423
  timeout: Override the client-level default timeout for this request, in seconds
651
-
652
- idempotency_key: Specify a custom idempotency key for this request
653
424
  """
654
425
  if not task_id:
655
426
  raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}")
656
- return await self._patch(
657
- f"/tasks/parse/{task_id}",
658
- body=await async_maybe_transform(
659
- {
660
- "chunk_processing": chunk_processing,
661
- "error_handling": error_handling,
662
- "expires_in": expires_in,
663
- "high_resolution": high_resolution,
664
- "llm_processing": llm_processing,
665
- "ocr_strategy": ocr_strategy,
666
- "pipeline": pipeline,
667
- "segment_processing": segment_processing,
668
- "segmentation_strategy": segmentation_strategy,
669
- },
670
- parse_update_params.ParseUpdateParams,
671
- ),
427
+ return await self._get(
428
+ f"/tasks/{task_id}/parse",
672
429
  options=make_request_options(
673
430
  extra_headers=extra_headers,
674
431
  extra_query=extra_query,
675
432
  extra_body=extra_body,
676
433
  timeout=timeout,
677
- idempotency_key=idempotency_key,
678
434
  query=await async_maybe_transform(
679
435
  {
680
436
  "base64_urls": base64_urls,
681
437
  "include_chunks": include_chunks,
682
438
  "wait_for_completion": wait_for_completion,
683
439
  },
684
- parse_update_params.ParseUpdateParams,
440
+ parse_get_params.ParseGetParams,
685
441
  ),
686
442
  ),
687
- cast_to=Task,
443
+ cast_to=ParseGetResponse,
688
444
  )
689
445
 
690
446
 
@@ -695,8 +451,8 @@ class ParseResourceWithRawResponse:
695
451
  self.create = to_raw_response_wrapper(
696
452
  parse.create,
697
453
  )
698
- self.update = to_raw_response_wrapper(
699
- parse.update,
454
+ self.get = to_raw_response_wrapper(
455
+ parse.get,
700
456
  )
701
457
 
702
458
 
@@ -707,8 +463,8 @@ class AsyncParseResourceWithRawResponse:
707
463
  self.create = async_to_raw_response_wrapper(
708
464
  parse.create,
709
465
  )
710
- self.update = async_to_raw_response_wrapper(
711
- parse.update,
466
+ self.get = async_to_raw_response_wrapper(
467
+ parse.get,
712
468
  )
713
469
 
714
470
 
@@ -719,8 +475,8 @@ class ParseResourceWithStreamingResponse:
719
475
  self.create = to_streamed_response_wrapper(
720
476
  parse.create,
721
477
  )
722
- self.update = to_streamed_response_wrapper(
723
- parse.update,
478
+ self.get = to_streamed_response_wrapper(
479
+ parse.get,
724
480
  )
725
481
 
726
482
 
@@ -731,6 +487,6 @@ class AsyncParseResourceWithStreamingResponse:
731
487
  self.create = async_to_streamed_response_wrapper(
732
488
  parse.create,
733
489
  )
734
- self.update = async_to_streamed_response_wrapper(
735
- parse.update,
490
+ self.get = async_to_streamed_response_wrapper(
491
+ parse.get,
736
492
  )