chunkr-ai 0.1.0__py3-none-any.whl → 0.1.0a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunkr_ai/__init__.py +89 -2
- chunkr_ai/_base_client.py +1995 -0
- chunkr_ai/_client.py +403 -0
- chunkr_ai/_compat.py +219 -0
- chunkr_ai/_constants.py +14 -0
- chunkr_ai/_exceptions.py +108 -0
- chunkr_ai/_files.py +123 -0
- chunkr_ai/_models.py +829 -0
- chunkr_ai/_qs.py +150 -0
- chunkr_ai/_resource.py +43 -0
- chunkr_ai/_response.py +830 -0
- chunkr_ai/_streaming.py +333 -0
- chunkr_ai/_types.py +219 -0
- chunkr_ai/_utils/__init__.py +57 -0
- chunkr_ai/_utils/_logs.py +25 -0
- chunkr_ai/_utils/_proxy.py +65 -0
- chunkr_ai/_utils/_reflection.py +42 -0
- chunkr_ai/_utils/_resources_proxy.py +24 -0
- chunkr_ai/_utils/_streams.py +12 -0
- chunkr_ai/_utils/_sync.py +86 -0
- chunkr_ai/_utils/_transform.py +447 -0
- chunkr_ai/_utils/_typing.py +151 -0
- chunkr_ai/_utils/_utils.py +422 -0
- chunkr_ai/_version.py +4 -0
- chunkr_ai/lib/.keep +4 -0
- chunkr_ai/pagination.py +71 -0
- chunkr_ai/resources/__init__.py +33 -0
- chunkr_ai/resources/health.py +136 -0
- chunkr_ai/resources/task/__init__.py +33 -0
- chunkr_ai/resources/task/parse.py +616 -0
- chunkr_ai/resources/task/task.py +664 -0
- chunkr_ai/types/__init__.py +8 -0
- chunkr_ai/types/health_check_response.py +7 -0
- chunkr_ai/types/task/__init__.py +7 -0
- chunkr_ai/types/task/parse_create_params.py +806 -0
- chunkr_ai/types/task/parse_update_params.py +806 -0
- chunkr_ai/types/task/task.py +1186 -0
- chunkr_ai/types/task_get_params.py +18 -0
- chunkr_ai/types/task_list_params.py +37 -0
- chunkr_ai-0.1.0a2.dist-info/METADATA +504 -0
- chunkr_ai-0.1.0a2.dist-info/RECORD +44 -0
- {chunkr_ai-0.1.0.dist-info → chunkr_ai-0.1.0a2.dist-info}/WHEEL +1 -2
- chunkr_ai-0.1.0a2.dist-info/licenses/LICENSE +201 -0
- chunkr_ai/api/auth.py +0 -13
- chunkr_ai/api/chunkr.py +0 -103
- chunkr_ai/api/chunkr_base.py +0 -185
- chunkr_ai/api/configuration.py +0 -313
- chunkr_ai/api/decorators.py +0 -101
- chunkr_ai/api/misc.py +0 -139
- chunkr_ai/api/protocol.py +0 -14
- chunkr_ai/api/task_response.py +0 -208
- chunkr_ai/models.py +0 -55
- chunkr_ai-0.1.0.dist-info/METADATA +0 -268
- chunkr_ai-0.1.0.dist-info/RECORD +0 -16
- chunkr_ai-0.1.0.dist-info/licenses/LICENSE +0 -21
- chunkr_ai-0.1.0.dist-info/top_level.txt +0 -1
- /chunkr_ai/{api/__init__.py → py.typed} +0 -0
@@ -0,0 +1,616 @@
|
|
1
|
+
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
|
5
|
+
from typing import Optional
|
6
|
+
from typing_extensions import Literal
|
7
|
+
|
8
|
+
import httpx
|
9
|
+
|
10
|
+
from ..._types import NOT_GIVEN, Body, Query, Headers, NotGiven
|
11
|
+
from ..._utils import maybe_transform, async_maybe_transform
|
12
|
+
from ..._compat import cached_property
|
13
|
+
from ..._resource import SyncAPIResource, AsyncAPIResource
|
14
|
+
from ..._response import (
|
15
|
+
to_raw_response_wrapper,
|
16
|
+
to_streamed_response_wrapper,
|
17
|
+
async_to_raw_response_wrapper,
|
18
|
+
async_to_streamed_response_wrapper,
|
19
|
+
)
|
20
|
+
from ...types.task import parse_create_params, parse_update_params
|
21
|
+
from ..._base_client import make_request_options
|
22
|
+
from ...types.task.task import Task
|
23
|
+
|
24
|
+
__all__ = ["ParseResource", "AsyncParseResource"]
|
25
|
+
|
26
|
+
|
27
|
+
class ParseResource(SyncAPIResource):
    """Synchronous access to the task parse endpoints (`/task/parse`, `/task/{task_id}/parse`)."""

    @cached_property
    def with_raw_response(self) -> ParseResourceWithRawResponse:
        """
        Prefix any HTTP method call with this property to get back the raw
        response object rather than the parsed content.

        For more information, see https://www.github.com/lumina-ai-inc/chunkr-python#accessing-raw-response-data-eg-headers
        """
        raw_view = ParseResourceWithRawResponse(self)
        return raw_view

    @cached_property
    def with_streaming_response(self) -> ParseResourceWithStreamingResponse:
        """
        Like `.with_raw_response`, except the response body is not read eagerly.

        For more information, see https://www.github.com/lumina-ai-inc/chunkr-python#with_streaming_response
        """
        streaming_view = ParseResourceWithStreamingResponse(self)
        return streaming_view

    def create(
        self,
        *,
        file: str,
        chunk_processing: Optional[parse_create_params.ChunkProcessing] | NotGiven = NOT_GIVEN,
        error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
        expires_in: Optional[int] | NotGiven = NOT_GIVEN,
        file_name: Optional[str] | NotGiven = NOT_GIVEN,
        llm_processing: Optional[parse_create_params.LlmProcessing] | NotGiven = NOT_GIVEN,
        ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
        pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
        segment_processing: Optional[parse_create_params.SegmentProcessing] | NotGiven = NOT_GIVEN,
        segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
        # The arguments below let you send parameters to the API that are not exposed as kwargs.
        # Values supplied here win over values defined on the client or passed to this method.
        extra_headers: Headers | None = None,
        extra_query: Query | None = None,
        extra_body: Body | None = None,
        timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
    ) -> Task:
        """
        Queue a document for processing (`POST /task/parse`) and return the task.

        The returned task contains:

        - Task ID for status polling
        - Initial configuration
        - File metadata
        - Processing status
        - Creation timestamp
        - Presigned URLs for file access

        The returned task will typically be in a `Starting` or `Processing` state. Use
        the `GET /task/{task_id}` endpoint to poll for completion.

        Args:
          file: The file to be uploaded. Can be a URL or a base64 encoded file.

          chunk_processing: Controls the setting for the chunking and post-processing of each chunk.

          error_handling: Controls how errors are handled during processing:

              - `Fail`: Stops processing and fails the task when any error occurs
              - `Continue`: Attempts to continue processing despite non-critical errors
                (e.g. LLM refusals etc.)

          expires_in: The number of seconds until task is deleted. Expired tasks can **not** be
              updated, polled or accessed via web interface.

          file_name: The name of the file to be uploaded. If not set a name will be generated.

          llm_processing: Controls the LLM used for the task.

          ocr_strategy: Controls the Optical Character Recognition (OCR) strategy.

              - `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
              - `Auto`: Selectively applies OCR only to pages with missing or low-quality
                text. When text layer is present the bounding boxes from the text layer
                are used.

          pipeline: Choose the provider whose models will be used for segmentation and OCR. The
              output will be unified to the Chunkr `output` format.

          segment_processing: Defines how each segment type is handled when generating the final output.

              Each segment uses one of three strategies. The chosen strategy controls:

              - Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
              - How the content is produced (rule-based vs. LLM).
              - The output format (`Html` or `Markdown`).

              Optional flags such as image **cropping**, **extended context**, and **LLM
              descriptions** further refine behaviour.

              **Default strategy per segment**

              - `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` →
                **Auto** (Markdown)
              - `Table` → **LLM** (HTML, description on)
              - `Picture` → **LLM** (Markdown, description off, cropping _All_)
              - `Formula`, `Page` → **LLM** (Markdown)
              - `PageHeader`, `PageFooter` → **Ignore** (removed from output)

              **Strategy reference**

              - **Auto** – rule-based content generation.
              - **LLM** – generate content with an LLM.
              - **Ignore** – exclude the segment entirely.

          segmentation_strategy: Controls the segmentation strategy:

              - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
                `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
                segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
              - `Page`: Treats each page as a single segment. Faster processing, but without
                layout element detection and only simple chunking.

          extra_headers: Send extra headers

          extra_query: Add additional query parameters to the request

          extra_body: Add additional JSON properties to the request

          timeout: Override the client-level default timeout for this request, in seconds
        """
        payload = {
            "file": file,
            "chunk_processing": chunk_processing,
            "error_handling": error_handling,
            "expires_in": expires_in,
            "file_name": file_name,
            "llm_processing": llm_processing,
            "ocr_strategy": ocr_strategy,
            "pipeline": pipeline,
            "segment_processing": segment_processing,
            "segmentation_strategy": segmentation_strategy,
        }
        request_body = maybe_transform(payload, parse_create_params.ParseCreateParams)
        request_options = make_request_options(
            extra_headers=extra_headers,
            extra_query=extra_query,
            extra_body=extra_body,
            timeout=timeout,
        )
        return self._post(
            "/task/parse",
            body=request_body,
            options=request_options,
            cast_to=Task,
        )

    def update(
        self,
        task_id: str,
        *,
        chunk_processing: Optional[parse_update_params.ChunkProcessing] | NotGiven = NOT_GIVEN,
        error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
        expires_in: Optional[int] | NotGiven = NOT_GIVEN,
        high_resolution: Optional[bool] | NotGiven = NOT_GIVEN,
        llm_processing: Optional[parse_update_params.LlmProcessing] | NotGiven = NOT_GIVEN,
        ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
        pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
        segment_processing: Optional[parse_update_params.SegmentProcessing] | NotGiven = NOT_GIVEN,
        segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
        # The arguments below let you send parameters to the API that are not exposed as kwargs.
        # Values supplied here win over values defined on the client or passed to this method.
        extra_headers: Headers | None = None,
        extra_query: Query | None = None,
        extra_body: Body | None = None,
        timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
    ) -> Task:
        """
        Update an existing task's configuration and reprocess the document
        (`PATCH /task/{task_id}/parse`).

        The original configuration will be used for all values that are not provided
        in the update.

        Requirements:

        - Task must have status `Succeeded` or `Failed`
        - New configuration must be different from the current one

        The returned task will typically be in a `Starting` or `Processing` state. Use
        the `GET /task/{task_id}` endpoint to poll for completion.

        Args:
          task_id: Identifier of the task to update; it is interpolated into the request
              path and must be non-empty.

          chunk_processing: Controls the setting for the chunking and post-processing of each chunk.

          error_handling: Controls how errors are handled during processing:

              - `Fail`: Stops processing and fails the task when any error occurs
              - `Continue`: Attempts to continue processing despite non-critical errors
                (e.g. LLM refusals etc.)

          expires_in: The number of seconds until task is deleted. Expired tasks can **not** be
              updated, polled or accessed via web interface.

          high_resolution: Whether to use high-resolution images for cropping and post-processing.
              (Latency penalty: ~7 seconds per page)

          llm_processing: Controls the LLM used for the task.

          ocr_strategy: Controls the Optical Character Recognition (OCR) strategy.

              - `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
              - `Auto`: Selectively applies OCR only to pages with missing or low-quality
                text. When text layer is present the bounding boxes from the text layer
                are used.

          pipeline: Choose the provider whose models will be used for segmentation and OCR. The
              output will be unified to the Chunkr `output` format.

          segment_processing: Defines how each segment type is handled when generating the final output.

              Each segment uses one of three strategies. The chosen strategy controls:

              - Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
              - How the content is produced (rule-based vs. LLM).
              - The output format (`Html` or `Markdown`).

              Optional flags such as image **cropping**, **extended context**, and **LLM
              descriptions** further refine behaviour.

              **Default strategy per segment**

              - `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` →
                **Auto** (Markdown)
              - `Table` → **LLM** (HTML, description on)
              - `Picture` → **LLM** (Markdown, description off, cropping _All_)
              - `Formula`, `Page` → **LLM** (Markdown)
              - `PageHeader`, `PageFooter` → **Ignore** (removed from output)

              **Strategy reference**

              - **Auto** – rule-based content generation.
              - **LLM** – generate content with an LLM.
              - **Ignore** – exclude the segment entirely.

          segmentation_strategy: Controls the segmentation strategy:

              - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
                `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
                segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
              - `Page`: Treats each page as a single segment. Faster processing, but without
                layout element detection and only simple chunking.

          extra_headers: Send extra headers

          extra_query: Add additional query parameters to the request

          extra_body: Add additional JSON properties to the request

          timeout: Override the client-level default timeout for this request, in seconds

        Raises:
          ValueError: If `task_id` is falsy (for example an empty string).
        """
        # Reject an empty id before it is interpolated into the request path.
        if not task_id:
            raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}")

        payload = {
            "chunk_processing": chunk_processing,
            "error_handling": error_handling,
            "expires_in": expires_in,
            "high_resolution": high_resolution,
            "llm_processing": llm_processing,
            "ocr_strategy": ocr_strategy,
            "pipeline": pipeline,
            "segment_processing": segment_processing,
            "segmentation_strategy": segmentation_strategy,
        }
        request_body = maybe_transform(payload, parse_update_params.ParseUpdateParams)
        request_options = make_request_options(
            extra_headers=extra_headers,
            extra_query=extra_query,
            extra_body=extra_body,
            timeout=timeout,
        )
        return self._patch(
            f"/task/{task_id}/parse",
            body=request_body,
            options=request_options,
            cast_to=Task,
        )
|
297
|
+
|
298
|
+
|
299
|
+
class AsyncParseResource(AsyncAPIResource):
    """Asynchronous access to the task parse endpoints (`/task/parse`, `/task/{task_id}/parse`)."""

    @cached_property
    def with_raw_response(self) -> AsyncParseResourceWithRawResponse:
        """
        Prefix any HTTP method call with this property to get back the raw
        response object rather than the parsed content.

        For more information, see https://www.github.com/lumina-ai-inc/chunkr-python#accessing-raw-response-data-eg-headers
        """
        raw_view = AsyncParseResourceWithRawResponse(self)
        return raw_view

    @cached_property
    def with_streaming_response(self) -> AsyncParseResourceWithStreamingResponse:
        """
        Like `.with_raw_response`, except the response body is not read eagerly.

        For more information, see https://www.github.com/lumina-ai-inc/chunkr-python#with_streaming_response
        """
        streaming_view = AsyncParseResourceWithStreamingResponse(self)
        return streaming_view

    async def create(
        self,
        *,
        file: str,
        chunk_processing: Optional[parse_create_params.ChunkProcessing] | NotGiven = NOT_GIVEN,
        error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
        expires_in: Optional[int] | NotGiven = NOT_GIVEN,
        file_name: Optional[str] | NotGiven = NOT_GIVEN,
        llm_processing: Optional[parse_create_params.LlmProcessing] | NotGiven = NOT_GIVEN,
        ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
        pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
        segment_processing: Optional[parse_create_params.SegmentProcessing] | NotGiven = NOT_GIVEN,
        segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
        # The arguments below let you send parameters to the API that are not exposed as kwargs.
        # Values supplied here win over values defined on the client or passed to this method.
        extra_headers: Headers | None = None,
        extra_query: Query | None = None,
        extra_body: Body | None = None,
        timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
    ) -> Task:
        """
        Queue a document for processing (`POST /task/parse`) and return the task.

        The returned task contains:

        - Task ID for status polling
        - Initial configuration
        - File metadata
        - Processing status
        - Creation timestamp
        - Presigned URLs for file access

        The returned task will typically be in a `Starting` or `Processing` state. Use
        the `GET /task/{task_id}` endpoint to poll for completion.

        Args:
          file: The file to be uploaded. Can be a URL or a base64 encoded file.

          chunk_processing: Controls the setting for the chunking and post-processing of each chunk.

          error_handling: Controls how errors are handled during processing:

              - `Fail`: Stops processing and fails the task when any error occurs
              - `Continue`: Attempts to continue processing despite non-critical errors
                (e.g. LLM refusals etc.)

          expires_in: The number of seconds until task is deleted. Expired tasks can **not** be
              updated, polled or accessed via web interface.

          file_name: The name of the file to be uploaded. If not set a name will be generated.

          llm_processing: Controls the LLM used for the task.

          ocr_strategy: Controls the Optical Character Recognition (OCR) strategy.

              - `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
              - `Auto`: Selectively applies OCR only to pages with missing or low-quality
                text. When text layer is present the bounding boxes from the text layer
                are used.

          pipeline: Choose the provider whose models will be used for segmentation and OCR. The
              output will be unified to the Chunkr `output` format.

          segment_processing: Defines how each segment type is handled when generating the final output.

              Each segment uses one of three strategies. The chosen strategy controls:

              - Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
              - How the content is produced (rule-based vs. LLM).
              - The output format (`Html` or `Markdown`).

              Optional flags such as image **cropping**, **extended context**, and **LLM
              descriptions** further refine behaviour.

              **Default strategy per segment**

              - `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` →
                **Auto** (Markdown)
              - `Table` → **LLM** (HTML, description on)
              - `Picture` → **LLM** (Markdown, description off, cropping _All_)
              - `Formula`, `Page` → **LLM** (Markdown)
              - `PageHeader`, `PageFooter` → **Ignore** (removed from output)

              **Strategy reference**

              - **Auto** – rule-based content generation.
              - **LLM** – generate content with an LLM.
              - **Ignore** – exclude the segment entirely.

          segmentation_strategy: Controls the segmentation strategy:

              - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
                `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
                segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
              - `Page`: Treats each page as a single segment. Faster processing, but without
                layout element detection and only simple chunking.

          extra_headers: Send extra headers

          extra_query: Add additional query parameters to the request

          extra_body: Add additional JSON properties to the request

          timeout: Override the client-level default timeout for this request, in seconds
        """
        payload = {
            "file": file,
            "chunk_processing": chunk_processing,
            "error_handling": error_handling,
            "expires_in": expires_in,
            "file_name": file_name,
            "llm_processing": llm_processing,
            "ocr_strategy": ocr_strategy,
            "pipeline": pipeline,
            "segment_processing": segment_processing,
            "segmentation_strategy": segmentation_strategy,
        }
        request_body = await async_maybe_transform(payload, parse_create_params.ParseCreateParams)
        request_options = make_request_options(
            extra_headers=extra_headers,
            extra_query=extra_query,
            extra_body=extra_body,
            timeout=timeout,
        )
        return await self._post(
            "/task/parse",
            body=request_body,
            options=request_options,
            cast_to=Task,
        )

    async def update(
        self,
        task_id: str,
        *,
        chunk_processing: Optional[parse_update_params.ChunkProcessing] | NotGiven = NOT_GIVEN,
        error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
        expires_in: Optional[int] | NotGiven = NOT_GIVEN,
        high_resolution: Optional[bool] | NotGiven = NOT_GIVEN,
        llm_processing: Optional[parse_update_params.LlmProcessing] | NotGiven = NOT_GIVEN,
        ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
        pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
        segment_processing: Optional[parse_update_params.SegmentProcessing] | NotGiven = NOT_GIVEN,
        segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
        # The arguments below let you send parameters to the API that are not exposed as kwargs.
        # Values supplied here win over values defined on the client or passed to this method.
        extra_headers: Headers | None = None,
        extra_query: Query | None = None,
        extra_body: Body | None = None,
        timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
    ) -> Task:
        """
        Update an existing task's configuration and reprocess the document
        (`PATCH /task/{task_id}/parse`).

        The original configuration will be used for all values that are not provided
        in the update.

        Requirements:

        - Task must have status `Succeeded` or `Failed`
        - New configuration must be different from the current one

        The returned task will typically be in a `Starting` or `Processing` state. Use
        the `GET /task/{task_id}` endpoint to poll for completion.

        Args:
          task_id: Identifier of the task to update; it is interpolated into the request
              path and must be non-empty.

          chunk_processing: Controls the setting for the chunking and post-processing of each chunk.

          error_handling: Controls how errors are handled during processing:

              - `Fail`: Stops processing and fails the task when any error occurs
              - `Continue`: Attempts to continue processing despite non-critical errors
                (e.g. LLM refusals etc.)

          expires_in: The number of seconds until task is deleted. Expired tasks can **not** be
              updated, polled or accessed via web interface.

          high_resolution: Whether to use high-resolution images for cropping and post-processing.
              (Latency penalty: ~7 seconds per page)

          llm_processing: Controls the LLM used for the task.

          ocr_strategy: Controls the Optical Character Recognition (OCR) strategy.

              - `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
              - `Auto`: Selectively applies OCR only to pages with missing or low-quality
                text. When text layer is present the bounding boxes from the text layer
                are used.

          pipeline: Choose the provider whose models will be used for segmentation and OCR. The
              output will be unified to the Chunkr `output` format.

          segment_processing: Defines how each segment type is handled when generating the final output.

              Each segment uses one of three strategies. The chosen strategy controls:

              - Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
              - How the content is produced (rule-based vs. LLM).
              - The output format (`Html` or `Markdown`).

              Optional flags such as image **cropping**, **extended context**, and **LLM
              descriptions** further refine behaviour.

              **Default strategy per segment**

              - `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` →
                **Auto** (Markdown)
              - `Table` → **LLM** (HTML, description on)
              - `Picture` → **LLM** (Markdown, description off, cropping _All_)
              - `Formula`, `Page` → **LLM** (Markdown)
              - `PageHeader`, `PageFooter` → **Ignore** (removed from output)

              **Strategy reference**

              - **Auto** – rule-based content generation.
              - **LLM** – generate content with an LLM.
              - **Ignore** – exclude the segment entirely.

          segmentation_strategy: Controls the segmentation strategy:

              - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
                `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
                segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
              - `Page`: Treats each page as a single segment. Faster processing, but without
                layout element detection and only simple chunking.

          extra_headers: Send extra headers

          extra_query: Add additional query parameters to the request

          extra_body: Add additional JSON properties to the request

          timeout: Override the client-level default timeout for this request, in seconds

        Raises:
          ValueError: If `task_id` is falsy (for example an empty string).
        """
        # Reject an empty id before it is interpolated into the request path.
        if not task_id:
            raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}")

        payload = {
            "chunk_processing": chunk_processing,
            "error_handling": error_handling,
            "expires_in": expires_in,
            "high_resolution": high_resolution,
            "llm_processing": llm_processing,
            "ocr_strategy": ocr_strategy,
            "pipeline": pipeline,
            "segment_processing": segment_processing,
            "segmentation_strategy": segmentation_strategy,
        }
        request_body = await async_maybe_transform(payload, parse_update_params.ParseUpdateParams)
        request_options = make_request_options(
            extra_headers=extra_headers,
            extra_query=extra_query,
            extra_body=extra_body,
            timeout=timeout,
        )
        return await self._patch(
            f"/task/{task_id}/parse",
            body=request_body,
            options=request_options,
            cast_to=Task,
        )
|
569
|
+
|
570
|
+
|
571
|
+
class ParseResourceWithRawResponse:
    """View over a `ParseResource` whose `create` and `update` are passed through `to_raw_response_wrapper`."""

    def __init__(self, parse: ParseResource) -> None:
        # Keep a handle on the resource whose bound methods are wrapped below.
        self._parse = parse

        self.create = to_raw_response_wrapper(parse.create)
        self.update = to_raw_response_wrapper(parse.update)
|
581
|
+
|
582
|
+
|
583
|
+
class AsyncParseResourceWithRawResponse:
    """View over an `AsyncParseResource` whose `create` and `update` are passed through `async_to_raw_response_wrapper`."""

    def __init__(self, parse: AsyncParseResource) -> None:
        # Keep a handle on the resource whose bound methods are wrapped below.
        self._parse = parse

        self.create = async_to_raw_response_wrapper(parse.create)
        self.update = async_to_raw_response_wrapper(parse.update)
|
593
|
+
|
594
|
+
|
595
|
+
class ParseResourceWithStreamingResponse:
    """View over a `ParseResource` whose `create` and `update` are passed through `to_streamed_response_wrapper`."""

    def __init__(self, parse: ParseResource) -> None:
        # Keep a handle on the resource whose bound methods are wrapped below.
        self._parse = parse

        self.create = to_streamed_response_wrapper(parse.create)
        self.update = to_streamed_response_wrapper(parse.update)
|
605
|
+
|
606
|
+
|
607
|
+
class AsyncParseResourceWithStreamingResponse:
    """View over an `AsyncParseResource` whose `create` and `update` are passed through `async_to_streamed_response_wrapper`."""

    def __init__(self, parse: AsyncParseResource) -> None:
        # Keep a handle on the resource whose bound methods are wrapped below.
        self._parse = parse

        self.create = async_to_streamed_response_wrapper(parse.create)
        self.update = async_to_streamed_response_wrapper(parse.update)
|