chunkr-ai 0.1.0a1__py3-none-any.whl → 0.1.0a3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. chunkr_ai/_client.py +18 -9
  2. chunkr_ai/_files.py +1 -1
  3. chunkr_ai/_version.py +1 -1
  4. chunkr_ai/pagination.py +61 -1
  5. chunkr_ai/resources/__init__.py +27 -13
  6. chunkr_ai/resources/files.py +712 -0
  7. chunkr_ai/resources/tasks/__init__.py +33 -0
  8. chunkr_ai/resources/tasks/parse.py +612 -0
  9. chunkr_ai/resources/tasks/tasks.py +596 -0
  10. chunkr_ai/types/__init__.py +7 -19
  11. chunkr_ai/types/delete.py +10 -0
  12. chunkr_ai/types/file.py +30 -0
  13. chunkr_ai/types/file_create_params.py +17 -0
  14. chunkr_ai/types/file_list_params.py +28 -0
  15. chunkr_ai/types/file_url.py +15 -0
  16. chunkr_ai/types/file_url_params.py +15 -0
  17. chunkr_ai/types/files_page_response.py +20 -0
  18. chunkr_ai/types/task.py +866 -27
  19. chunkr_ai/types/tasks/__init__.py +6 -0
  20. chunkr_ai/types/tasks/parse_create_params.py +844 -0
  21. chunkr_ai/types/tasks/parse_update_params.py +838 -0
  22. {chunkr_ai-0.1.0a1.dist-info → chunkr_ai-0.1.0a3.dist-info}/METADATA +39 -21
  23. chunkr_ai-0.1.0a3.dist-info/RECORD +52 -0
  24. chunkr_ai/resources/task.py +0 -1166
  25. chunkr_ai/types/auto_generation_config.py +0 -39
  26. chunkr_ai/types/auto_generation_config_param.py +0 -39
  27. chunkr_ai/types/bounding_box.py +0 -19
  28. chunkr_ai/types/chunk_processing.py +0 -40
  29. chunkr_ai/types/chunk_processing_param.py +0 -42
  30. chunkr_ai/types/ignore_generation_config.py +0 -39
  31. chunkr_ai/types/ignore_generation_config_param.py +0 -39
  32. chunkr_ai/types/llm_generation_config.py +0 -39
  33. chunkr_ai/types/llm_generation_config_param.py +0 -39
  34. chunkr_ai/types/llm_processing.py +0 -36
  35. chunkr_ai/types/llm_processing_param.py +0 -36
  36. chunkr_ai/types/picture_generation_config.py +0 -39
  37. chunkr_ai/types/picture_generation_config_param.py +0 -39
  38. chunkr_ai/types/segment_processing.py +0 -280
  39. chunkr_ai/types/segment_processing_param.py +0 -281
  40. chunkr_ai/types/table_generation_config.py +0 -39
  41. chunkr_ai/types/table_generation_config_param.py +0 -39
  42. chunkr_ai/types/task_parse_params.py +0 -90
  43. chunkr_ai/types/task_update_params.py +0 -90
  44. chunkr_ai-0.1.0a1.dist-info/RECORD +0 -58
  45. {chunkr_ai-0.1.0a1.dist-info → chunkr_ai-0.1.0a3.dist-info}/WHEEL +0 -0
  46. {chunkr_ai-0.1.0a1.dist-info → chunkr_ai-0.1.0a3.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,33 @@
1
+ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
+
3
+ from .parse import (
4
+ ParseResource,
5
+ AsyncParseResource,
6
+ ParseResourceWithRawResponse,
7
+ AsyncParseResourceWithRawResponse,
8
+ ParseResourceWithStreamingResponse,
9
+ AsyncParseResourceWithStreamingResponse,
10
+ )
11
+ from .tasks import (
12
+ TasksResource,
13
+ AsyncTasksResource,
14
+ TasksResourceWithRawResponse,
15
+ AsyncTasksResourceWithRawResponse,
16
+ TasksResourceWithStreamingResponse,
17
+ AsyncTasksResourceWithStreamingResponse,
18
+ )
19
+
20
+ __all__ = [
21
+ "ParseResource",
22
+ "AsyncParseResource",
23
+ "ParseResourceWithRawResponse",
24
+ "AsyncParseResourceWithRawResponse",
25
+ "ParseResourceWithStreamingResponse",
26
+ "AsyncParseResourceWithStreamingResponse",
27
+ "TasksResource",
28
+ "AsyncTasksResource",
29
+ "TasksResourceWithRawResponse",
30
+ "AsyncTasksResourceWithRawResponse",
31
+ "TasksResourceWithStreamingResponse",
32
+ "AsyncTasksResourceWithStreamingResponse",
33
+ ]
@@ -0,0 +1,612 @@
1
+ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional
6
+ from typing_extensions import Literal
7
+
8
+ import httpx
9
+
10
+ from ..._types import NOT_GIVEN, Body, Query, Headers, NotGiven
11
+ from ..._utils import maybe_transform, async_maybe_transform
12
+ from ..._compat import cached_property
13
+ from ..._resource import SyncAPIResource, AsyncAPIResource
14
+ from ..._response import (
15
+ to_raw_response_wrapper,
16
+ to_streamed_response_wrapper,
17
+ async_to_raw_response_wrapper,
18
+ async_to_streamed_response_wrapper,
19
+ )
20
+ from ...types.task import Task
21
+ from ...types.tasks import parse_create_params, parse_update_params
22
+ from ..._base_client import make_request_options
23
+
24
+ __all__ = ["ParseResource", "AsyncParseResource"]
25
+
26
+
27
class ParseResource(SyncAPIResource):
    """Synchronous access to the `/tasks/parse` endpoints (create and update)."""

    @cached_property
    def with_raw_response(self) -> ParseResourceWithRawResponse:
        """
        Prefix for any HTTP method call that should hand back the raw response
        object rather than the parsed content.

        For more information, see https://www.github.com/lumina-ai-inc/chunkr-python#accessing-raw-response-data-eg-headers
        """
        return ParseResourceWithRawResponse(self)

    @cached_property
    def with_streaming_response(self) -> ParseResourceWithStreamingResponse:
        """
        Like `.with_raw_response`, but the response body is not read eagerly.

        For more information, see https://www.github.com/lumina-ai-inc/chunkr-python#with_streaming_response
        """
        return ParseResourceWithStreamingResponse(self)

    def create(
        self,
        *,
        file: str,
        chunk_processing: Optional[parse_create_params.ChunkProcessing] | NotGiven = NOT_GIVEN,
        error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
        expires_in: Optional[int] | NotGiven = NOT_GIVEN,
        file_name: Optional[str] | NotGiven = NOT_GIVEN,
        llm_processing: Optional[parse_create_params.LlmProcessing] | NotGiven = NOT_GIVEN,
        ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
        pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
        segment_processing: Optional[parse_create_params.SegmentProcessing] | NotGiven = NOT_GIVEN,
        segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
        # The arguments below let callers pass parameters to the API that are not exposed as kwargs.
        # Values given here take precedence over values defined on the client or passed to this method.
        extra_headers: Headers | None = None,
        extra_query: Query | None = None,
        extra_body: Body | None = None,
        timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
    ) -> Task:
        """
        Queue a document for processing via `POST /tasks/parse`.

        The returned TaskResponse contains:

        - Task ID for status polling
        - Initial configuration
        - File metadata
        - Processing status
        - Creation timestamp
        - Presigned URLs for file access

        The task will typically be in a `Starting` or `Processing` state. Poll the
        `GET /tasks/{task_id}` endpoint to wait for completion.

        Args:
          file:
              The file to be uploaded. Supported inputs:

              - `ch://files/{file_id}`: a previously uploaded file you own
                (authorization enforced)
              - `http(s)://...`: remote URL to fetch
              - `data:*;base64,...` or raw base64 string

          chunk_processing: Settings for chunking and for the post-processing of each chunk.

          error_handling:
              How errors are handled during processing:

              - `Fail`: stop processing and fail the task when any error occurs
              - `Continue`: try to continue despite non-critical errors (e.g. LLM
                refusals)

          expires_in: Number of seconds until the task is deleted. Expired tasks can **not** be
              updated, polled or accessed via the web interface.

          file_name: Name of the file to be uploaded. A name is generated when not set.

          llm_processing: Controls the LLM used for the task.

          ocr_strategy: Optical Character Recognition (OCR) strategy:

              - `All`: process all pages with OCR (latency penalty: ~0.5 seconds per page)
              - `Auto`: apply OCR only to pages with missing or low-quality text. When a
                text layer is present, its bounding boxes are used.

          pipeline: Provider whose models are used for segmentation and OCR. The output is
              unified to the Chunkr `output` format.

          segment_processing: How each segment type is handled when generating the final output.

              Each segment uses one of three strategies:

              - **Auto** – rule-based content generation
              - **LLM** – generate content with an LLM
              - **Ignore** – exclude the segment entirely

              The chosen strategy controls whether the segment is kept (`Auto`, `LLM`)
              or skipped (`Ignore`), how the content is produced (rule-based vs. LLM)
              and the output format (`Html` or `Markdown`). Optional flags such as image
              **cropping**, **extended context** and **descriptions** further refine
              behaviour.

              Default strategy per segment:

              - `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` →
                **Auto** (Markdown, description off)
              - `Table` → **LLM** (HTML, description on)
              - `Picture` → **LLM** (Markdown, description off, cropping _All_)
              - `Formula`, `Page` → **LLM** (Markdown, description off)
              - `PageHeader`, `PageFooter` → **Ignore** (removed from output)

          segmentation_strategy:
              Segmentation strategy:

              - `LayoutAnalysis`: analyzes pages for layout elements (e.g. `Table`,
                `Picture`, `Formula`) using bounding boxes. Provides fine-grained
                segmentation and better chunking. (Latency penalty: ~TBD seconds per page)
              - `Page`: treats each page as a single segment. Faster processing, but
                without layout element detection and with only simple chunking.

          extra_headers: Send extra headers

          extra_query: Add additional query parameters to the request

          extra_body: Add additional JSON properties to the request

          timeout: Override the client-level default timeout for this request, in seconds
        """
        payload = {
            "file": file,
            "chunk_processing": chunk_processing,
            "error_handling": error_handling,
            "expires_in": expires_in,
            "file_name": file_name,
            "llm_processing": llm_processing,
            "ocr_strategy": ocr_strategy,
            "pipeline": pipeline,
            "segment_processing": segment_processing,
            "segmentation_strategy": segmentation_strategy,
        }
        request_body = maybe_transform(payload, parse_create_params.ParseCreateParams)
        request_options = make_request_options(
            extra_headers=extra_headers,
            extra_query=extra_query,
            extra_body=extra_body,
            timeout=timeout,
        )
        return self._post(
            "/tasks/parse",
            body=request_body,
            options=request_options,
            cast_to=Task,
        )

    def update(
        self,
        task_id: str,
        *,
        chunk_processing: Optional[parse_update_params.ChunkProcessing] | NotGiven = NOT_GIVEN,
        error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
        expires_in: Optional[int] | NotGiven = NOT_GIVEN,
        high_resolution: Optional[bool] | NotGiven = NOT_GIVEN,
        llm_processing: Optional[parse_update_params.LlmProcessing] | NotGiven = NOT_GIVEN,
        ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
        pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
        segment_processing: Optional[parse_update_params.SegmentProcessing] | NotGiven = NOT_GIVEN,
        segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
        # The arguments below let callers pass parameters to the API that are not exposed as kwargs.
        # Values given here take precedence over values defined on the client or passed to this method.
        extra_headers: Headers | None = None,
        extra_query: Query | None = None,
        extra_body: Body | None = None,
        timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
    ) -> Task:
        """
        Update an existing task's configuration via `PATCH /tasks/parse/{task_id}`
        and reprocess the document.

        The original configuration is used for every value not provided in the
        update.

        Requirements:

        - Task must have status `Succeeded` or `Failed`
        - New configuration must be different from the current one

        The returned task will typically be in a `Starting` or `Processing` state.
        Poll the `GET /tasks/{task_id}` endpoint to wait for completion.

        Args:
          task_id: Identifier interpolated into the request path; must be non-empty.

          chunk_processing: Settings for chunking and for the post-processing of each chunk.

          error_handling:
              How errors are handled during processing:

              - `Fail`: stop processing and fail the task when any error occurs
              - `Continue`: try to continue despite non-critical errors (e.g. LLM
                refusals)

          expires_in: Number of seconds until the task is deleted. Expired tasks can **not** be
              updated, polled or accessed via the web interface.

          high_resolution: Whether to use high-resolution images for cropping and post-processing.
              (Latency penalty: ~7 seconds per page)

          llm_processing: Controls the LLM used for the task.

          ocr_strategy: Optical Character Recognition (OCR) strategy:

              - `All`: process all pages with OCR (latency penalty: ~0.5 seconds per page)
              - `Auto`: apply OCR only to pages with missing or low-quality text. When a
                text layer is present, its bounding boxes are used.

          pipeline: Provider whose models are used for segmentation and OCR. The output is
              unified to the Chunkr `output` format.

          segment_processing: How each segment type is handled when generating the final output.

              Each segment uses one of three strategies:

              - **Auto** – rule-based content generation
              - **LLM** – generate content with an LLM
              - **Ignore** – exclude the segment entirely

              The chosen strategy controls whether the segment is kept (`Auto`, `LLM`)
              or skipped (`Ignore`), how the content is produced (rule-based vs. LLM)
              and the output format (`Html` or `Markdown`). Optional flags such as image
              **cropping**, **extended context** and **descriptions** further refine
              behaviour.

              Default strategy per segment:

              - `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` →
                **Auto** (Markdown, description off)
              - `Table` → **LLM** (HTML, description on)
              - `Picture` → **LLM** (Markdown, description off, cropping _All_)
              - `Formula`, `Page` → **LLM** (Markdown, description off)
              - `PageHeader`, `PageFooter` → **Ignore** (removed from output)

          segmentation_strategy:
              Segmentation strategy:

              - `LayoutAnalysis`: analyzes pages for layout elements (e.g. `Table`,
                `Picture`, `Formula`) using bounding boxes. Provides fine-grained
                segmentation and better chunking. (Latency penalty: ~TBD seconds per page)
              - `Page`: treats each page as a single segment. Faster processing, but
                without layout element detection and with only simple chunking.

          extra_headers: Send extra headers

          extra_query: Add additional query parameters to the request

          extra_body: Add additional JSON properties to the request

          timeout: Override the client-level default timeout for this request, in seconds

        Raises:
          ValueError: If `task_id` is empty.
        """
        # Reject an empty id up front; it would otherwise produce a path with no task segment.
        if not task_id:
            raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}")

        endpoint = f"/tasks/parse/{task_id}"
        payload = {
            "chunk_processing": chunk_processing,
            "error_handling": error_handling,
            "expires_in": expires_in,
            "high_resolution": high_resolution,
            "llm_processing": llm_processing,
            "ocr_strategy": ocr_strategy,
            "pipeline": pipeline,
            "segment_processing": segment_processing,
            "segmentation_strategy": segmentation_strategy,
        }
        request_body = maybe_transform(payload, parse_update_params.ParseUpdateParams)
        request_options = make_request_options(
            extra_headers=extra_headers,
            extra_query=extra_query,
            extra_body=extra_body,
            timeout=timeout,
        )
        return self._patch(
            endpoint,
            body=request_body,
            options=request_options,
            cast_to=Task,
        )
295
+
296
+
297
class AsyncParseResource(AsyncAPIResource):
    """Asynchronous access to the `/tasks/parse` endpoints (create and update)."""

    @cached_property
    def with_raw_response(self) -> AsyncParseResourceWithRawResponse:
        """
        Prefix for any HTTP method call that should hand back the raw response
        object rather than the parsed content.

        For more information, see https://www.github.com/lumina-ai-inc/chunkr-python#accessing-raw-response-data-eg-headers
        """
        return AsyncParseResourceWithRawResponse(self)

    @cached_property
    def with_streaming_response(self) -> AsyncParseResourceWithStreamingResponse:
        """
        Like `.with_raw_response`, but the response body is not read eagerly.

        For more information, see https://www.github.com/lumina-ai-inc/chunkr-python#with_streaming_response
        """
        return AsyncParseResourceWithStreamingResponse(self)

    async def create(
        self,
        *,
        file: str,
        chunk_processing: Optional[parse_create_params.ChunkProcessing] | NotGiven = NOT_GIVEN,
        error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
        expires_in: Optional[int] | NotGiven = NOT_GIVEN,
        file_name: Optional[str] | NotGiven = NOT_GIVEN,
        llm_processing: Optional[parse_create_params.LlmProcessing] | NotGiven = NOT_GIVEN,
        ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
        pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
        segment_processing: Optional[parse_create_params.SegmentProcessing] | NotGiven = NOT_GIVEN,
        segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
        # The arguments below let callers pass parameters to the API that are not exposed as kwargs.
        # Values given here take precedence over values defined on the client or passed to this method.
        extra_headers: Headers | None = None,
        extra_query: Query | None = None,
        extra_body: Body | None = None,
        timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
    ) -> Task:
        """
        Queue a document for processing via `POST /tasks/parse`.

        The returned TaskResponse contains:

        - Task ID for status polling
        - Initial configuration
        - File metadata
        - Processing status
        - Creation timestamp
        - Presigned URLs for file access

        The task will typically be in a `Starting` or `Processing` state. Poll the
        `GET /tasks/{task_id}` endpoint to wait for completion.

        Args:
          file:
              The file to be uploaded. Supported inputs:

              - `ch://files/{file_id}`: a previously uploaded file you own
                (authorization enforced)
              - `http(s)://...`: remote URL to fetch
              - `data:*;base64,...` or raw base64 string

          chunk_processing: Settings for chunking and for the post-processing of each chunk.

          error_handling:
              How errors are handled during processing:

              - `Fail`: stop processing and fail the task when any error occurs
              - `Continue`: try to continue despite non-critical errors (e.g. LLM
                refusals)

          expires_in: Number of seconds until the task is deleted. Expired tasks can **not** be
              updated, polled or accessed via the web interface.

          file_name: Name of the file to be uploaded. A name is generated when not set.

          llm_processing: Controls the LLM used for the task.

          ocr_strategy: Optical Character Recognition (OCR) strategy:

              - `All`: process all pages with OCR (latency penalty: ~0.5 seconds per page)
              - `Auto`: apply OCR only to pages with missing or low-quality text. When a
                text layer is present, its bounding boxes are used.

          pipeline: Provider whose models are used for segmentation and OCR. The output is
              unified to the Chunkr `output` format.

          segment_processing: How each segment type is handled when generating the final output.

              Each segment uses one of three strategies:

              - **Auto** – rule-based content generation
              - **LLM** – generate content with an LLM
              - **Ignore** – exclude the segment entirely

              The chosen strategy controls whether the segment is kept (`Auto`, `LLM`)
              or skipped (`Ignore`), how the content is produced (rule-based vs. LLM)
              and the output format (`Html` or `Markdown`). Optional flags such as image
              **cropping**, **extended context** and **descriptions** further refine
              behaviour.

              Default strategy per segment:

              - `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` →
                **Auto** (Markdown, description off)
              - `Table` → **LLM** (HTML, description on)
              - `Picture` → **LLM** (Markdown, description off, cropping _All_)
              - `Formula`, `Page` → **LLM** (Markdown, description off)
              - `PageHeader`, `PageFooter` → **Ignore** (removed from output)

          segmentation_strategy:
              Segmentation strategy:

              - `LayoutAnalysis`: analyzes pages for layout elements (e.g. `Table`,
                `Picture`, `Formula`) using bounding boxes. Provides fine-grained
                segmentation and better chunking. (Latency penalty: ~TBD seconds per page)
              - `Page`: treats each page as a single segment. Faster processing, but
                without layout element detection and with only simple chunking.

          extra_headers: Send extra headers

          extra_query: Add additional query parameters to the request

          extra_body: Add additional JSON properties to the request

          timeout: Override the client-level default timeout for this request, in seconds
        """
        payload = {
            "file": file,
            "chunk_processing": chunk_processing,
            "error_handling": error_handling,
            "expires_in": expires_in,
            "file_name": file_name,
            "llm_processing": llm_processing,
            "ocr_strategy": ocr_strategy,
            "pipeline": pipeline,
            "segment_processing": segment_processing,
            "segmentation_strategy": segmentation_strategy,
        }
        request_body = await async_maybe_transform(payload, parse_create_params.ParseCreateParams)
        request_options = make_request_options(
            extra_headers=extra_headers,
            extra_query=extra_query,
            extra_body=extra_body,
            timeout=timeout,
        )
        return await self._post(
            "/tasks/parse",
            body=request_body,
            options=request_options,
            cast_to=Task,
        )

    async def update(
        self,
        task_id: str,
        *,
        chunk_processing: Optional[parse_update_params.ChunkProcessing] | NotGiven = NOT_GIVEN,
        error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
        expires_in: Optional[int] | NotGiven = NOT_GIVEN,
        high_resolution: Optional[bool] | NotGiven = NOT_GIVEN,
        llm_processing: Optional[parse_update_params.LlmProcessing] | NotGiven = NOT_GIVEN,
        ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
        pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
        segment_processing: Optional[parse_update_params.SegmentProcessing] | NotGiven = NOT_GIVEN,
        segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
        # The arguments below let callers pass parameters to the API that are not exposed as kwargs.
        # Values given here take precedence over values defined on the client or passed to this method.
        extra_headers: Headers | None = None,
        extra_query: Query | None = None,
        extra_body: Body | None = None,
        timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
    ) -> Task:
        """
        Update an existing task's configuration via `PATCH /tasks/parse/{task_id}`
        and reprocess the document.

        The original configuration is used for every value not provided in the
        update.

        Requirements:

        - Task must have status `Succeeded` or `Failed`
        - New configuration must be different from the current one

        The returned task will typically be in a `Starting` or `Processing` state.
        Poll the `GET /tasks/{task_id}` endpoint to wait for completion.

        Args:
          task_id: Identifier interpolated into the request path; must be non-empty.

          chunk_processing: Settings for chunking and for the post-processing of each chunk.

          error_handling:
              How errors are handled during processing:

              - `Fail`: stop processing and fail the task when any error occurs
              - `Continue`: try to continue despite non-critical errors (e.g. LLM
                refusals)

          expires_in: Number of seconds until the task is deleted. Expired tasks can **not** be
              updated, polled or accessed via the web interface.

          high_resolution: Whether to use high-resolution images for cropping and post-processing.
              (Latency penalty: ~7 seconds per page)

          llm_processing: Controls the LLM used for the task.

          ocr_strategy: Optical Character Recognition (OCR) strategy:

              - `All`: process all pages with OCR (latency penalty: ~0.5 seconds per page)
              - `Auto`: apply OCR only to pages with missing or low-quality text. When a
                text layer is present, its bounding boxes are used.

          pipeline: Provider whose models are used for segmentation and OCR. The output is
              unified to the Chunkr `output` format.

          segment_processing: How each segment type is handled when generating the final output.

              Each segment uses one of three strategies:

              - **Auto** – rule-based content generation
              - **LLM** – generate content with an LLM
              - **Ignore** – exclude the segment entirely

              The chosen strategy controls whether the segment is kept (`Auto`, `LLM`)
              or skipped (`Ignore`), how the content is produced (rule-based vs. LLM)
              and the output format (`Html` or `Markdown`). Optional flags such as image
              **cropping**, **extended context** and **descriptions** further refine
              behaviour.

              Default strategy per segment:

              - `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` →
                **Auto** (Markdown, description off)
              - `Table` → **LLM** (HTML, description on)
              - `Picture` → **LLM** (Markdown, description off, cropping _All_)
              - `Formula`, `Page` → **LLM** (Markdown, description off)
              - `PageHeader`, `PageFooter` → **Ignore** (removed from output)

          segmentation_strategy:
              Segmentation strategy:

              - `LayoutAnalysis`: analyzes pages for layout elements (e.g. `Table`,
                `Picture`, `Formula`) using bounding boxes. Provides fine-grained
                segmentation and better chunking. (Latency penalty: ~TBD seconds per page)
              - `Page`: treats each page as a single segment. Faster processing, but
                without layout element detection and with only simple chunking.

          extra_headers: Send extra headers

          extra_query: Add additional query parameters to the request

          extra_body: Add additional JSON properties to the request

          timeout: Override the client-level default timeout for this request, in seconds

        Raises:
          ValueError: If `task_id` is empty.
        """
        # Reject an empty id up front; it would otherwise produce a path with no task segment.
        if not task_id:
            raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}")

        endpoint = f"/tasks/parse/{task_id}"
        payload = {
            "chunk_processing": chunk_processing,
            "error_handling": error_handling,
            "expires_in": expires_in,
            "high_resolution": high_resolution,
            "llm_processing": llm_processing,
            "ocr_strategy": ocr_strategy,
            "pipeline": pipeline,
            "segment_processing": segment_processing,
            "segmentation_strategy": segmentation_strategy,
        }
        request_body = await async_maybe_transform(payload, parse_update_params.ParseUpdateParams)
        request_options = make_request_options(
            extra_headers=extra_headers,
            extra_query=extra_query,
            extra_body=extra_body,
            timeout=timeout,
        )
        return await self._patch(
            endpoint,
            body=request_body,
            options=request_options,
            cast_to=Task,
        )
565
+
566
+
567
class ParseResourceWithRawResponse:
    """View of a `ParseResource` whose `create`/`update` are passed through `to_raw_response_wrapper`."""

    def __init__(self, parse: ParseResource) -> None:
        # Keep a handle on the wrapped resource alongside the wrapped methods.
        self._parse = parse

        wrap = to_raw_response_wrapper
        self.create = wrap(parse.create)
        self.update = wrap(parse.update)
577
+
578
+
579
class AsyncParseResourceWithRawResponse:
    """View of an `AsyncParseResource` whose `create`/`update` are passed through `async_to_raw_response_wrapper`."""

    def __init__(self, parse: AsyncParseResource) -> None:
        # Keep a handle on the wrapped resource alongside the wrapped methods.
        self._parse = parse

        wrap = async_to_raw_response_wrapper
        self.create = wrap(parse.create)
        self.update = wrap(parse.update)
589
+
590
+
591
class ParseResourceWithStreamingResponse:
    """View of a `ParseResource` whose `create`/`update` are passed through `to_streamed_response_wrapper`."""

    def __init__(self, parse: ParseResource) -> None:
        # Keep a handle on the wrapped resource alongside the wrapped methods.
        self._parse = parse

        wrap = to_streamed_response_wrapper
        self.create = wrap(parse.create)
        self.update = wrap(parse.update)
601
+
602
+
603
class AsyncParseResourceWithStreamingResponse:
    """View of an `AsyncParseResource` whose `create`/`update` are passed through `async_to_streamed_response_wrapper`."""

    def __init__(self, parse: AsyncParseResource) -> None:
        # Keep a handle on the wrapped resource alongside the wrapped methods.
        self._parse = parse

        wrap = async_to_streamed_response_wrapper
        self.create = wrap(parse.create)
        self.update = wrap(parse.update)