chunkr-ai 0.1.0__py3-none-any.whl → 0.1.0a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. chunkr_ai/__init__.py +89 -2
  2. chunkr_ai/_base_client.py +1995 -0
  3. chunkr_ai/_client.py +403 -0
  4. chunkr_ai/_compat.py +219 -0
  5. chunkr_ai/_constants.py +14 -0
  6. chunkr_ai/_exceptions.py +108 -0
  7. chunkr_ai/_files.py +123 -0
  8. chunkr_ai/_models.py +829 -0
  9. chunkr_ai/_qs.py +150 -0
  10. chunkr_ai/_resource.py +43 -0
  11. chunkr_ai/_response.py +830 -0
  12. chunkr_ai/_streaming.py +333 -0
  13. chunkr_ai/_types.py +219 -0
  14. chunkr_ai/_utils/__init__.py +57 -0
  15. chunkr_ai/_utils/_logs.py +25 -0
  16. chunkr_ai/_utils/_proxy.py +65 -0
  17. chunkr_ai/_utils/_reflection.py +42 -0
  18. chunkr_ai/_utils/_resources_proxy.py +24 -0
  19. chunkr_ai/_utils/_streams.py +12 -0
  20. chunkr_ai/_utils/_sync.py +86 -0
  21. chunkr_ai/_utils/_transform.py +447 -0
  22. chunkr_ai/_utils/_typing.py +151 -0
  23. chunkr_ai/_utils/_utils.py +422 -0
  24. chunkr_ai/_version.py +4 -0
  25. chunkr_ai/lib/.keep +4 -0
  26. chunkr_ai/pagination.py +71 -0
  27. chunkr_ai/resources/__init__.py +33 -0
  28. chunkr_ai/resources/health.py +136 -0
  29. chunkr_ai/resources/task/__init__.py +33 -0
  30. chunkr_ai/resources/task/parse.py +616 -0
  31. chunkr_ai/resources/task/task.py +664 -0
  32. chunkr_ai/types/__init__.py +8 -0
  33. chunkr_ai/types/health_check_response.py +7 -0
  34. chunkr_ai/types/task/__init__.py +7 -0
  35. chunkr_ai/types/task/parse_create_params.py +806 -0
  36. chunkr_ai/types/task/parse_update_params.py +806 -0
  37. chunkr_ai/types/task/task.py +1186 -0
  38. chunkr_ai/types/task_get_params.py +18 -0
  39. chunkr_ai/types/task_list_params.py +37 -0
  40. chunkr_ai-0.1.0a2.dist-info/METADATA +504 -0
  41. chunkr_ai-0.1.0a2.dist-info/RECORD +44 -0
  42. {chunkr_ai-0.1.0.dist-info → chunkr_ai-0.1.0a2.dist-info}/WHEEL +1 -2
  43. chunkr_ai-0.1.0a2.dist-info/licenses/LICENSE +201 -0
  44. chunkr_ai/api/auth.py +0 -13
  45. chunkr_ai/api/chunkr.py +0 -103
  46. chunkr_ai/api/chunkr_base.py +0 -185
  47. chunkr_ai/api/configuration.py +0 -313
  48. chunkr_ai/api/decorators.py +0 -101
  49. chunkr_ai/api/misc.py +0 -139
  50. chunkr_ai/api/protocol.py +0 -14
  51. chunkr_ai/api/task_response.py +0 -208
  52. chunkr_ai/models.py +0 -55
  53. chunkr_ai-0.1.0.dist-info/METADATA +0 -268
  54. chunkr_ai-0.1.0.dist-info/RECORD +0 -16
  55. chunkr_ai-0.1.0.dist-info/licenses/LICENSE +0 -21
  56. chunkr_ai-0.1.0.dist-info/top_level.txt +0 -1
  57. /chunkr_ai/{api/__init__.py → py.typed} +0 -0
@@ -0,0 +1,616 @@
1
+ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional
6
+ from typing_extensions import Literal
7
+
8
+ import httpx
9
+
10
+ from ..._types import NOT_GIVEN, Body, Query, Headers, NotGiven
11
+ from ..._utils import maybe_transform, async_maybe_transform
12
+ from ..._compat import cached_property
13
+ from ..._resource import SyncAPIResource, AsyncAPIResource
14
+ from ..._response import (
15
+ to_raw_response_wrapper,
16
+ to_streamed_response_wrapper,
17
+ async_to_raw_response_wrapper,
18
+ async_to_streamed_response_wrapper,
19
+ )
20
+ from ...types.task import parse_create_params, parse_update_params
21
+ from ..._base_client import make_request_options
22
+ from ...types.task.task import Task
23
+
24
+ __all__ = ["ParseResource", "AsyncParseResource"]
25
+
26
+
27
class ParseResource(SyncAPIResource):
    """Synchronous resource for creating and updating parse tasks.

    `create` issues `POST /task/parse`; `update` issues
    `PATCH /task/{task_id}/parse`. Both cast the response to `Task`.
    """

    @cached_property
    def with_raw_response(self) -> ParseResourceWithRawResponse:
        """
        Prefix any HTTP method call with this property to get the raw response
        object back instead of the parsed content.

        For more information, see https://www.github.com/lumina-ai-inc/chunkr-python#accessing-raw-response-data-eg-headers
        """
        return ParseResourceWithRawResponse(self)

    @cached_property
    def with_streaming_response(self) -> ParseResourceWithStreamingResponse:
        """
        Like `.with_raw_response`, except the response body is not read eagerly.

        For more information, see https://www.github.com/lumina-ai-inc/chunkr-python#with_streaming_response
        """
        return ParseResourceWithStreamingResponse(self)

    def create(
        self,
        *,
        file: str,
        chunk_processing: Optional[parse_create_params.ChunkProcessing] | NotGiven = NOT_GIVEN,
        error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
        expires_in: Optional[int] | NotGiven = NOT_GIVEN,
        file_name: Optional[str] | NotGiven = NOT_GIVEN,
        llm_processing: Optional[parse_create_params.LlmProcessing] | NotGiven = NOT_GIVEN,
        ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
        pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
        segment_processing: Optional[parse_create_params.SegmentProcessing] | NotGiven = NOT_GIVEN,
        segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
        # The arguments below are an escape hatch for parameters the API accepts but that have no kwarg here.
        # Anything supplied through them wins over values set on the client or passed to this method.
        extra_headers: Headers | None = None,
        extra_query: Query | None = None,
        extra_body: Body | None = None,
        timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
    ) -> Task:
        """
        Queue a document for processing.

        The response is a TaskResponse containing:

        - Task ID for status polling
        - Initial configuration
        - File metadata
        - Processing status
        - Creation timestamp
        - Presigned URLs for file access

        The returned task will typically be in a `Starting` or `Processing` state.
        Poll the `GET /task/{task_id}` endpoint to find out when it completes.

        Args:
          file: The file to be uploaded. Can be a URL or a base64 encoded file.

          chunk_processing: Controls the setting for the chunking and post-processing of each chunk.

          error_handling:
              Controls how errors are handled during processing:

              - `Fail`: Stops processing and fails the task when any error occurs
              - `Continue`: Attempts to continue processing despite non-critical errors
                (e.g. LLM refusals)

          expires_in: The number of seconds until the task is deleted. Expired tasks can **not**
              be updated, polled or accessed via the web interface.

          file_name: The name of the file to be uploaded. If not set, a name will be generated.

          llm_processing: Controls the LLM used for the task.

          ocr_strategy: Controls the Optical Character Recognition (OCR) strategy.

              - `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
              - `Auto`: Selectively applies OCR only to pages with missing or low-quality
                text. When a text layer is present, the bounding boxes from the text layer
                are used.

          pipeline: Choose the provider whose models will be used for segmentation and OCR. The
              output will be unified to the Chunkr `output` format.

          segment_processing: Defines how each segment type is handled when generating the final output.

              Each segment uses one of three strategies. The chosen strategy controls:

              - Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
              - How the content is produced (rule-based vs. LLM).
              - The output format (`Html` or `Markdown`).

              Optional flags such as image **cropping**, **extended context**, and **LLM
              descriptions** further refine behaviour.

              ---

              **Default strategy per segment**

              - `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` →
                **Auto** (Markdown)
              - `Table` → **LLM** (HTML, description on)
              - `Picture` → **LLM** (Markdown, description off, cropping _All_)
              - `Formula`, `Page` → **LLM** (Markdown)
              - `PageHeader`, `PageFooter` → **Ignore** (removed from output)

              ---

              **Strategy reference**

              - **Auto** – rule-based content generation.
              - **LLM** – generate content with an LLM.
              - **Ignore** – exclude the segment entirely.

          segmentation_strategy:
              Controls the segmentation strategy:

              - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
                `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
                segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
              - `Page`: Treats each page as a single segment. Faster processing, but without
                layout element detection and only simple chunking.

          extra_headers: Send extra headers

          extra_query: Add additional query parameters to the request

          extra_body: Add additional JSON properties to the request

          timeout: Override the client-level default timeout for this request, in seconds

        Returns:
          The response, cast to `Task`.
        """
        # Run the keyword arguments through the ParseCreateParams transform.
        request_body = maybe_transform(
            {
                "file": file,
                "chunk_processing": chunk_processing,
                "error_handling": error_handling,
                "expires_in": expires_in,
                "file_name": file_name,
                "llm_processing": llm_processing,
                "ocr_strategy": ocr_strategy,
                "pipeline": pipeline,
                "segment_processing": segment_processing,
                "segmentation_strategy": segmentation_strategy,
            },
            parse_create_params.ParseCreateParams,
        )
        request_options = make_request_options(
            extra_headers=extra_headers,
            extra_query=extra_query,
            extra_body=extra_body,
            timeout=timeout,
        )
        return self._post(
            "/task/parse",
            body=request_body,
            options=request_options,
            cast_to=Task,
        )

    def update(
        self,
        task_id: str,
        *,
        chunk_processing: Optional[parse_update_params.ChunkProcessing] | NotGiven = NOT_GIVEN,
        error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
        expires_in: Optional[int] | NotGiven = NOT_GIVEN,
        high_resolution: Optional[bool] | NotGiven = NOT_GIVEN,
        llm_processing: Optional[parse_update_params.LlmProcessing] | NotGiven = NOT_GIVEN,
        ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
        pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
        segment_processing: Optional[parse_update_params.SegmentProcessing] | NotGiven = NOT_GIVEN,
        segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
        # The arguments below are an escape hatch for parameters the API accepts but that have no kwarg here.
        # Anything supplied through them wins over values set on the client or passed to this method.
        extra_headers: Headers | None = None,
        extra_query: Query | None = None,
        extra_body: Body | None = None,
        timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
    ) -> Task:
        """Update an existing task's configuration and reprocess the document.

        The original configuration will be used for all values that are not provided
        in the update.

        Requirements:

        - Task must have status `Succeeded` or `Failed`
        - New configuration must be different from the current one

        The returned task will typically be in a `Starting` or `Processing` state.
        Poll the `GET /task/{task_id}` endpoint to find out when it completes.

        Args:
          task_id: Identifier interpolated into the request path. Must be non-empty.

          chunk_processing: Controls the setting for the chunking and post-processing of each chunk.

          error_handling:
              Controls how errors are handled during processing:

              - `Fail`: Stops processing and fails the task when any error occurs
              - `Continue`: Attempts to continue processing despite non-critical errors
                (e.g. LLM refusals)

          expires_in: The number of seconds until the task is deleted. Expired tasks can **not**
              be updated, polled or accessed via the web interface.

          high_resolution: Whether to use high-resolution images for cropping and post-processing.
              (Latency penalty: ~7 seconds per page)

          llm_processing: Controls the LLM used for the task.

          ocr_strategy: Controls the Optical Character Recognition (OCR) strategy.

              - `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
              - `Auto`: Selectively applies OCR only to pages with missing or low-quality
                text. When a text layer is present, the bounding boxes from the text layer
                are used.

          pipeline: Choose the provider whose models will be used for segmentation and OCR. The
              output will be unified to the Chunkr `output` format.

          segment_processing: Defines how each segment type is handled when generating the final output.

              Each segment uses one of three strategies. The chosen strategy controls:

              - Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
              - How the content is produced (rule-based vs. LLM).
              - The output format (`Html` or `Markdown`).

              Optional flags such as image **cropping**, **extended context**, and **LLM
              descriptions** further refine behaviour.

              ---

              **Default strategy per segment**

              - `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` →
                **Auto** (Markdown)
              - `Table` → **LLM** (HTML, description on)
              - `Picture` → **LLM** (Markdown, description off, cropping _All_)
              - `Formula`, `Page` → **LLM** (Markdown)
              - `PageHeader`, `PageFooter` → **Ignore** (removed from output)

              ---

              **Strategy reference**

              - **Auto** – rule-based content generation.
              - **LLM** – generate content with an LLM.
              - **Ignore** – exclude the segment entirely.

          segmentation_strategy:
              Controls the segmentation strategy:

              - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
                `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
                segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
              - `Page`: Treats each page as a single segment. Faster processing, but without
                layout element detection and only simple chunking.

          extra_headers: Send extra headers

          extra_query: Add additional query parameters to the request

          extra_body: Add additional JSON properties to the request

          timeout: Override the client-level default timeout for this request, in seconds

        Returns:
          The response, cast to `Task`.

        Raises:
          ValueError: If `task_id` is empty (falsy).
        """
        # An empty id would produce the path "/task//parse", so reject it before any request is built.
        if not task_id:
            raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}")

        endpoint = f"/task/{task_id}/parse"
        # Run the keyword arguments through the ParseUpdateParams transform.
        request_body = maybe_transform(
            {
                "chunk_processing": chunk_processing,
                "error_handling": error_handling,
                "expires_in": expires_in,
                "high_resolution": high_resolution,
                "llm_processing": llm_processing,
                "ocr_strategy": ocr_strategy,
                "pipeline": pipeline,
                "segment_processing": segment_processing,
                "segmentation_strategy": segmentation_strategy,
            },
            parse_update_params.ParseUpdateParams,
        )
        request_options = make_request_options(
            extra_headers=extra_headers,
            extra_query=extra_query,
            extra_body=extra_body,
            timeout=timeout,
        )
        return self._patch(
            endpoint,
            body=request_body,
            options=request_options,
            cast_to=Task,
        )
297
+
298
+
299
class AsyncParseResource(AsyncAPIResource):
    """Asynchronous resource for creating and updating parse tasks.

    `create` issues `POST /task/parse`; `update` issues
    `PATCH /task/{task_id}/parse`. Both cast the response to `Task`.
    """

    @cached_property
    def with_raw_response(self) -> AsyncParseResourceWithRawResponse:
        """
        Prefix any HTTP method call with this property to get the raw response
        object back instead of the parsed content.

        For more information, see https://www.github.com/lumina-ai-inc/chunkr-python#accessing-raw-response-data-eg-headers
        """
        return AsyncParseResourceWithRawResponse(self)

    @cached_property
    def with_streaming_response(self) -> AsyncParseResourceWithStreamingResponse:
        """
        Like `.with_raw_response`, except the response body is not read eagerly.

        For more information, see https://www.github.com/lumina-ai-inc/chunkr-python#with_streaming_response
        """
        return AsyncParseResourceWithStreamingResponse(self)

    async def create(
        self,
        *,
        file: str,
        chunk_processing: Optional[parse_create_params.ChunkProcessing] | NotGiven = NOT_GIVEN,
        error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
        expires_in: Optional[int] | NotGiven = NOT_GIVEN,
        file_name: Optional[str] | NotGiven = NOT_GIVEN,
        llm_processing: Optional[parse_create_params.LlmProcessing] | NotGiven = NOT_GIVEN,
        ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
        pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
        segment_processing: Optional[parse_create_params.SegmentProcessing] | NotGiven = NOT_GIVEN,
        segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
        # The arguments below are an escape hatch for parameters the API accepts but that have no kwarg here.
        # Anything supplied through them wins over values set on the client or passed to this method.
        extra_headers: Headers | None = None,
        extra_query: Query | None = None,
        extra_body: Body | None = None,
        timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
    ) -> Task:
        """
        Queue a document for processing.

        The response is a TaskResponse containing:

        - Task ID for status polling
        - Initial configuration
        - File metadata
        - Processing status
        - Creation timestamp
        - Presigned URLs for file access

        The returned task will typically be in a `Starting` or `Processing` state.
        Poll the `GET /task/{task_id}` endpoint to find out when it completes.

        Args:
          file: The file to be uploaded. Can be a URL or a base64 encoded file.

          chunk_processing: Controls the setting for the chunking and post-processing of each chunk.

          error_handling:
              Controls how errors are handled during processing:

              - `Fail`: Stops processing and fails the task when any error occurs
              - `Continue`: Attempts to continue processing despite non-critical errors
                (e.g. LLM refusals)

          expires_in: The number of seconds until the task is deleted. Expired tasks can **not**
              be updated, polled or accessed via the web interface.

          file_name: The name of the file to be uploaded. If not set, a name will be generated.

          llm_processing: Controls the LLM used for the task.

          ocr_strategy: Controls the Optical Character Recognition (OCR) strategy.

              - `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
              - `Auto`: Selectively applies OCR only to pages with missing or low-quality
                text. When a text layer is present, the bounding boxes from the text layer
                are used.

          pipeline: Choose the provider whose models will be used for segmentation and OCR. The
              output will be unified to the Chunkr `output` format.

          segment_processing: Defines how each segment type is handled when generating the final output.

              Each segment uses one of three strategies. The chosen strategy controls:

              - Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
              - How the content is produced (rule-based vs. LLM).
              - The output format (`Html` or `Markdown`).

              Optional flags such as image **cropping**, **extended context**, and **LLM
              descriptions** further refine behaviour.

              ---

              **Default strategy per segment**

              - `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` →
                **Auto** (Markdown)
              - `Table` → **LLM** (HTML, description on)
              - `Picture` → **LLM** (Markdown, description off, cropping _All_)
              - `Formula`, `Page` → **LLM** (Markdown)
              - `PageHeader`, `PageFooter` → **Ignore** (removed from output)

              ---

              **Strategy reference**

              - **Auto** – rule-based content generation.
              - **LLM** – generate content with an LLM.
              - **Ignore** – exclude the segment entirely.

          segmentation_strategy:
              Controls the segmentation strategy:

              - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
                `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
                segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
              - `Page`: Treats each page as a single segment. Faster processing, but without
                layout element detection and only simple chunking.

          extra_headers: Send extra headers

          extra_query: Add additional query parameters to the request

          extra_body: Add additional JSON properties to the request

          timeout: Override the client-level default timeout for this request, in seconds

        Returns:
          The response, cast to `Task`.
        """
        # Run the keyword arguments through the ParseCreateParams transform (awaited before options are built).
        request_body = await async_maybe_transform(
            {
                "file": file,
                "chunk_processing": chunk_processing,
                "error_handling": error_handling,
                "expires_in": expires_in,
                "file_name": file_name,
                "llm_processing": llm_processing,
                "ocr_strategy": ocr_strategy,
                "pipeline": pipeline,
                "segment_processing": segment_processing,
                "segmentation_strategy": segmentation_strategy,
            },
            parse_create_params.ParseCreateParams,
        )
        request_options = make_request_options(
            extra_headers=extra_headers,
            extra_query=extra_query,
            extra_body=extra_body,
            timeout=timeout,
        )
        return await self._post(
            "/task/parse",
            body=request_body,
            options=request_options,
            cast_to=Task,
        )

    async def update(
        self,
        task_id: str,
        *,
        chunk_processing: Optional[parse_update_params.ChunkProcessing] | NotGiven = NOT_GIVEN,
        error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
        expires_in: Optional[int] | NotGiven = NOT_GIVEN,
        high_resolution: Optional[bool] | NotGiven = NOT_GIVEN,
        llm_processing: Optional[parse_update_params.LlmProcessing] | NotGiven = NOT_GIVEN,
        ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
        pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
        segment_processing: Optional[parse_update_params.SegmentProcessing] | NotGiven = NOT_GIVEN,
        segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
        # The arguments below are an escape hatch for parameters the API accepts but that have no kwarg here.
        # Anything supplied through them wins over values set on the client or passed to this method.
        extra_headers: Headers | None = None,
        extra_query: Query | None = None,
        extra_body: Body | None = None,
        timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
    ) -> Task:
        """Update an existing task's configuration and reprocess the document.

        The original configuration will be used for all values that are not provided
        in the update.

        Requirements:

        - Task must have status `Succeeded` or `Failed`
        - New configuration must be different from the current one

        The returned task will typically be in a `Starting` or `Processing` state.
        Poll the `GET /task/{task_id}` endpoint to find out when it completes.

        Args:
          task_id: Identifier interpolated into the request path. Must be non-empty.

          chunk_processing: Controls the setting for the chunking and post-processing of each chunk.

          error_handling:
              Controls how errors are handled during processing:

              - `Fail`: Stops processing and fails the task when any error occurs
              - `Continue`: Attempts to continue processing despite non-critical errors
                (e.g. LLM refusals)

          expires_in: The number of seconds until the task is deleted. Expired tasks can **not**
              be updated, polled or accessed via the web interface.

          high_resolution: Whether to use high-resolution images for cropping and post-processing.
              (Latency penalty: ~7 seconds per page)

          llm_processing: Controls the LLM used for the task.

          ocr_strategy: Controls the Optical Character Recognition (OCR) strategy.

              - `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
              - `Auto`: Selectively applies OCR only to pages with missing or low-quality
                text. When a text layer is present, the bounding boxes from the text layer
                are used.

          pipeline: Choose the provider whose models will be used for segmentation and OCR. The
              output will be unified to the Chunkr `output` format.

          segment_processing: Defines how each segment type is handled when generating the final output.

              Each segment uses one of three strategies. The chosen strategy controls:

              - Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
              - How the content is produced (rule-based vs. LLM).
              - The output format (`Html` or `Markdown`).

              Optional flags such as image **cropping**, **extended context**, and **LLM
              descriptions** further refine behaviour.

              ---

              **Default strategy per segment**

              - `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` →
                **Auto** (Markdown)
              - `Table` → **LLM** (HTML, description on)
              - `Picture` → **LLM** (Markdown, description off, cropping _All_)
              - `Formula`, `Page` → **LLM** (Markdown)
              - `PageHeader`, `PageFooter` → **Ignore** (removed from output)

              ---

              **Strategy reference**

              - **Auto** – rule-based content generation.
              - **LLM** – generate content with an LLM.
              - **Ignore** – exclude the segment entirely.

          segmentation_strategy:
              Controls the segmentation strategy:

              - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
                `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
                segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
              - `Page`: Treats each page as a single segment. Faster processing, but without
                layout element detection and only simple chunking.

          extra_headers: Send extra headers

          extra_query: Add additional query parameters to the request

          extra_body: Add additional JSON properties to the request

          timeout: Override the client-level default timeout for this request, in seconds

        Returns:
          The response, cast to `Task`.

        Raises:
          ValueError: If `task_id` is empty (falsy).
        """
        # An empty id would produce the path "/task//parse", so reject it before any request is built.
        if not task_id:
            raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}")

        endpoint = f"/task/{task_id}/parse"
        # Run the keyword arguments through the ParseUpdateParams transform (awaited before options are built).
        request_body = await async_maybe_transform(
            {
                "chunk_processing": chunk_processing,
                "error_handling": error_handling,
                "expires_in": expires_in,
                "high_resolution": high_resolution,
                "llm_processing": llm_processing,
                "ocr_strategy": ocr_strategy,
                "pipeline": pipeline,
                "segment_processing": segment_processing,
                "segmentation_strategy": segmentation_strategy,
            },
            parse_update_params.ParseUpdateParams,
        )
        request_options = make_request_options(
            extra_headers=extra_headers,
            extra_query=extra_query,
            extra_body=extra_body,
            timeout=timeout,
        )
        return await self._patch(
            endpoint,
            body=request_body,
            options=request_options,
            cast_to=Task,
        )
569
+
570
+
571
class ParseResourceWithRawResponse:
    """Companion to `ParseResource` exposing `create`/`update` via `to_raw_response_wrapper`.

    Instances are what `ParseResource.with_raw_response` returns.
    """

    def __init__(self, parse: ParseResource) -> None:
        # Keep a handle on the resource being wrapped.
        self._parse = parse

        # Same method names as the resource, each wrapped once at construction time.
        self.create = to_raw_response_wrapper(parse.create)
        self.update = to_raw_response_wrapper(parse.update)
581
+
582
+
583
class AsyncParseResourceWithRawResponse:
    """Companion to `AsyncParseResource` exposing `create`/`update` via `async_to_raw_response_wrapper`.

    Instances are what `AsyncParseResource.with_raw_response` returns.
    """

    def __init__(self, parse: AsyncParseResource) -> None:
        # Keep a handle on the resource being wrapped.
        self._parse = parse

        # Same method names as the resource, each wrapped once at construction time.
        self.create = async_to_raw_response_wrapper(parse.create)
        self.update = async_to_raw_response_wrapper(parse.update)
593
+
594
+
595
class ParseResourceWithStreamingResponse:
    """Companion to `ParseResource` exposing `create`/`update` via `to_streamed_response_wrapper`.

    Instances are what `ParseResource.with_streaming_response` returns.
    """

    def __init__(self, parse: ParseResource) -> None:
        # Keep a handle on the resource being wrapped.
        self._parse = parse

        # Same method names as the resource, each wrapped once at construction time.
        self.create = to_streamed_response_wrapper(parse.create)
        self.update = to_streamed_response_wrapper(parse.update)
605
+
606
+
607
class AsyncParseResourceWithStreamingResponse:
    """Companion to `AsyncParseResource` exposing `create`/`update` via `async_to_streamed_response_wrapper`.

    Instances are what `AsyncParseResource.with_streaming_response` returns.
    """

    def __init__(self, parse: AsyncParseResource) -> None:
        # Keep a handle on the resource being wrapped.
        self._parse = parse

        # Same method names as the resource, each wrapped once at construction time.
        self.create = async_to_streamed_response_wrapper(parse.create)
        self.update = async_to_streamed_response_wrapper(parse.update)