chunkr-ai 0.1.0a1__py3-none-any.whl → 0.1.0a3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunkr_ai/_client.py +18 -9
- chunkr_ai/_files.py +1 -1
- chunkr_ai/_version.py +1 -1
- chunkr_ai/pagination.py +61 -1
- chunkr_ai/resources/__init__.py +27 -13
- chunkr_ai/resources/files.py +712 -0
- chunkr_ai/resources/tasks/__init__.py +33 -0
- chunkr_ai/resources/tasks/parse.py +612 -0
- chunkr_ai/resources/tasks/tasks.py +596 -0
- chunkr_ai/types/__init__.py +7 -19
- chunkr_ai/types/delete.py +10 -0
- chunkr_ai/types/file.py +30 -0
- chunkr_ai/types/file_create_params.py +17 -0
- chunkr_ai/types/file_list_params.py +28 -0
- chunkr_ai/types/file_url.py +15 -0
- chunkr_ai/types/file_url_params.py +15 -0
- chunkr_ai/types/files_page_response.py +20 -0
- chunkr_ai/types/task.py +866 -27
- chunkr_ai/types/tasks/__init__.py +6 -0
- chunkr_ai/types/tasks/parse_create_params.py +844 -0
- chunkr_ai/types/tasks/parse_update_params.py +838 -0
- {chunkr_ai-0.1.0a1.dist-info → chunkr_ai-0.1.0a3.dist-info}/METADATA +39 -21
- chunkr_ai-0.1.0a3.dist-info/RECORD +52 -0
- chunkr_ai/resources/task.py +0 -1166
- chunkr_ai/types/auto_generation_config.py +0 -39
- chunkr_ai/types/auto_generation_config_param.py +0 -39
- chunkr_ai/types/bounding_box.py +0 -19
- chunkr_ai/types/chunk_processing.py +0 -40
- chunkr_ai/types/chunk_processing_param.py +0 -42
- chunkr_ai/types/ignore_generation_config.py +0 -39
- chunkr_ai/types/ignore_generation_config_param.py +0 -39
- chunkr_ai/types/llm_generation_config.py +0 -39
- chunkr_ai/types/llm_generation_config_param.py +0 -39
- chunkr_ai/types/llm_processing.py +0 -36
- chunkr_ai/types/llm_processing_param.py +0 -36
- chunkr_ai/types/picture_generation_config.py +0 -39
- chunkr_ai/types/picture_generation_config_param.py +0 -39
- chunkr_ai/types/segment_processing.py +0 -280
- chunkr_ai/types/segment_processing_param.py +0 -281
- chunkr_ai/types/table_generation_config.py +0 -39
- chunkr_ai/types/table_generation_config_param.py +0 -39
- chunkr_ai/types/task_parse_params.py +0 -90
- chunkr_ai/types/task_update_params.py +0 -90
- chunkr_ai-0.1.0a1.dist-info/RECORD +0 -58
- {chunkr_ai-0.1.0a1.dist-info → chunkr_ai-0.1.0a3.dist-info}/WHEEL +0 -0
- {chunkr_ai-0.1.0a1.dist-info → chunkr_ai-0.1.0a3.dist-info}/licenses/LICENSE +0 -0
chunkr_ai/resources/task.py
DELETED
@@ -1,1166 +0,0 @@
|
|
1
|
-
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
|
2
|
-
|
3
|
-
from __future__ import annotations
|
4
|
-
|
5
|
-
from typing import Union, Optional
|
6
|
-
from datetime import datetime
|
7
|
-
from typing_extensions import Literal
|
8
|
-
|
9
|
-
import httpx
|
10
|
-
|
11
|
-
from ..types import (
|
12
|
-
task_get_params,
|
13
|
-
task_list_params,
|
14
|
-
task_parse_params,
|
15
|
-
task_update_params,
|
16
|
-
)
|
17
|
-
from .._types import NOT_GIVEN, Body, Query, Headers, NoneType, NotGiven
|
18
|
-
from .._utils import maybe_transform, async_maybe_transform
|
19
|
-
from .._compat import cached_property
|
20
|
-
from .._resource import SyncAPIResource, AsyncAPIResource
|
21
|
-
from .._response import (
|
22
|
-
to_raw_response_wrapper,
|
23
|
-
to_streamed_response_wrapper,
|
24
|
-
async_to_raw_response_wrapper,
|
25
|
-
async_to_streamed_response_wrapper,
|
26
|
-
)
|
27
|
-
from ..pagination import SyncTasksPage, AsyncTasksPage
|
28
|
-
from ..types.task import Task
|
29
|
-
from .._base_client import AsyncPaginator, make_request_options
|
30
|
-
from ..types.llm_processing_param import LlmProcessingParam
|
31
|
-
from ..types.chunk_processing_param import ChunkProcessingParam
|
32
|
-
from ..types.segment_processing_param import SegmentProcessingParam
|
33
|
-
|
34
|
-
__all__ = ["TaskResource", "AsyncTaskResource"]
|
35
|
-
|
36
|
-
|
37
|
-
class TaskResource(SyncAPIResource):
|
38
|
-
@cached_property
|
39
|
-
def with_raw_response(self) -> TaskResourceWithRawResponse:
|
40
|
-
"""
|
41
|
-
This property can be used as a prefix for any HTTP method call to return
|
42
|
-
the raw response object instead of the parsed content.
|
43
|
-
|
44
|
-
For more information, see https://www.github.com/lumina-ai-inc/chunkr-python#accessing-raw-response-data-eg-headers
|
45
|
-
"""
|
46
|
-
return TaskResourceWithRawResponse(self)
|
47
|
-
|
48
|
-
@cached_property
|
49
|
-
def with_streaming_response(self) -> TaskResourceWithStreamingResponse:
|
50
|
-
"""
|
51
|
-
An alternative to `.with_raw_response` that doesn't eagerly read the response body.
|
52
|
-
|
53
|
-
For more information, see https://www.github.com/lumina-ai-inc/chunkr-python#with_streaming_response
|
54
|
-
"""
|
55
|
-
return TaskResourceWithStreamingResponse(self)
|
56
|
-
|
57
|
-
def update(
|
58
|
-
self,
|
59
|
-
task_id: str,
|
60
|
-
*,
|
61
|
-
chunk_processing: Optional[ChunkProcessingParam] | NotGiven = NOT_GIVEN,
|
62
|
-
error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
|
63
|
-
expires_in: Optional[int] | NotGiven = NOT_GIVEN,
|
64
|
-
high_resolution: Optional[bool] | NotGiven = NOT_GIVEN,
|
65
|
-
llm_processing: Optional[LlmProcessingParam] | NotGiven = NOT_GIVEN,
|
66
|
-
ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
|
67
|
-
pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
|
68
|
-
segment_processing: Optional[SegmentProcessingParam] | NotGiven = NOT_GIVEN,
|
69
|
-
segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
|
70
|
-
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
71
|
-
# The extra values given here take precedence over values defined on the client or passed to this method.
|
72
|
-
extra_headers: Headers | None = None,
|
73
|
-
extra_query: Query | None = None,
|
74
|
-
extra_body: Body | None = None,
|
75
|
-
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
76
|
-
) -> Task:
|
77
|
-
"""Updates an existing task's configuration and reprocesses the document.
|
78
|
-
|
79
|
-
The
|
80
|
-
original configuration will be used for all values that are not provided in the
|
81
|
-
update.
|
82
|
-
|
83
|
-
Requirements:
|
84
|
-
|
85
|
-
- Task must have status `Succeeded` or `Failed`
|
86
|
-
- New configuration must be different from the current one
|
87
|
-
|
88
|
-
The returned task will typically be in a `Starting` or `Processing` state. Use
|
89
|
-
the `GET /task/{task_id}` endpoint to poll for completion.
|
90
|
-
|
91
|
-
Args:
|
92
|
-
chunk_processing: Controls the setting for the chunking and post-processing of each chunk.
|
93
|
-
|
94
|
-
error_handling:
|
95
|
-
Controls how errors are handled during processing:
|
96
|
-
|
97
|
-
- `Fail`: Stops processing and fails the task when any error occurs
|
98
|
-
- `Continue`: Attempts to continue processing despite non-critical errors (eg.
|
99
|
-
LLM refusals etc.)
|
100
|
-
|
101
|
-
expires_in: The number of seconds until task is deleted. Expired tasks can **not** be
|
102
|
-
updated, polled or accessed via web interface.
|
103
|
-
|
104
|
-
high_resolution: Whether to use high-resolution images for cropping and post-processing. (Latency
|
105
|
-
penalty: ~7 seconds per page)
|
106
|
-
|
107
|
-
llm_processing: Controls the LLM used for the task.
|
108
|
-
|
109
|
-
ocr_strategy: Controls the Optical Character Recognition (OCR) strategy.
|
110
|
-
|
111
|
-
- `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
|
112
|
-
- `Auto`: Selectively applies OCR only to pages with missing or low-quality
|
113
|
-
text. When text layer is present the bounding boxes from the text layer are
|
114
|
-
used.
|
115
|
-
|
116
|
-
pipeline: Choose the provider whose models will be used for segmentation and OCR. The
|
117
|
-
output will be unified to the Chunkr `output` format.
|
118
|
-
|
119
|
-
segment_processing: Defines how each segment type is handled when generating the final output.
|
120
|
-
|
121
|
-
Each segment uses one of three strategies. The chosen strategy controls: •
|
122
|
-
Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`). • How the
|
123
|
-
content is produced (rule-based vs. LLM). • The output format (`Html` or
|
124
|
-
`Markdown`).
|
125
|
-
|
126
|
-
Optional flags such as image **cropping**, **extended context**, and **LLM
|
127
|
-
descriptions** further refine behaviour.
|
128
|
-
|
129
|
-
---
|
130
|
-
|
131
|
-
**Default strategy per segment** • `Title`, `SectionHeader`, `Text`, `ListItem`,
|
132
|
-
`Caption`, `Footnote` → **Auto** (Markdown) • `Table` → **LLM** (HTML,
|
133
|
-
description on) • `Picture` → **LLM** (Markdown, description on, cropping _All_)
|
134
|
-
• `Formula`, `Page` → **LLM** (Markdown) • `PageHeader`, `PageFooter` →
|
135
|
-
**Ignore** (removed from output)
|
136
|
-
|
137
|
-
---
|
138
|
-
|
139
|
-
**Strategy reference** • **Auto** – rule-based content generation. • **LLM** –
|
140
|
-
generate content with an LLM. • **Ignore** – exclude the segment entirely.
|
141
|
-
|
142
|
-
segmentation_strategy:
|
143
|
-
Controls the segmentation strategy:
|
144
|
-
|
145
|
-
- `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
|
146
|
-
`Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
|
147
|
-
segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
|
148
|
-
- `Page`: Treats each page as a single segment. Faster processing, but without
|
149
|
-
layout element detection and only simple chunking.
|
150
|
-
|
151
|
-
extra_headers: Send extra headers
|
152
|
-
|
153
|
-
extra_query: Add additional query parameters to the request
|
154
|
-
|
155
|
-
extra_body: Add additional JSON properties to the request
|
156
|
-
|
157
|
-
timeout: Override the client-level default timeout for this request, in seconds
|
158
|
-
"""
|
159
|
-
if not task_id:
|
160
|
-
raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}")
|
161
|
-
return self._patch(
|
162
|
-
f"/task/{task_id}/parse",
|
163
|
-
body=maybe_transform(
|
164
|
-
{
|
165
|
-
"chunk_processing": chunk_processing,
|
166
|
-
"error_handling": error_handling,
|
167
|
-
"expires_in": expires_in,
|
168
|
-
"high_resolution": high_resolution,
|
169
|
-
"llm_processing": llm_processing,
|
170
|
-
"ocr_strategy": ocr_strategy,
|
171
|
-
"pipeline": pipeline,
|
172
|
-
"segment_processing": segment_processing,
|
173
|
-
"segmentation_strategy": segmentation_strategy,
|
174
|
-
},
|
175
|
-
task_update_params.TaskUpdateParams,
|
176
|
-
),
|
177
|
-
options=make_request_options(
|
178
|
-
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
|
179
|
-
),
|
180
|
-
cast_to=Task,
|
181
|
-
)
|
182
|
-
|
183
|
-
def list(
|
184
|
-
self,
|
185
|
-
*,
|
186
|
-
base64_urls: bool | NotGiven = NOT_GIVEN,
|
187
|
-
cursor: Union[str, datetime] | NotGiven = NOT_GIVEN,
|
188
|
-
end: Union[str, datetime] | NotGiven = NOT_GIVEN,
|
189
|
-
include_chunks: bool | NotGiven = NOT_GIVEN,
|
190
|
-
limit: int | NotGiven = NOT_GIVEN,
|
191
|
-
sort: Literal["asc", "desc"] | NotGiven = NOT_GIVEN,
|
192
|
-
start: Union[str, datetime] | NotGiven = NOT_GIVEN,
|
193
|
-
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
194
|
-
# The extra values given here take precedence over values defined on the client or passed to this method.
|
195
|
-
extra_headers: Headers | None = None,
|
196
|
-
extra_query: Query | None = None,
|
197
|
-
extra_body: Body | None = None,
|
198
|
-
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
199
|
-
) -> SyncTasksPage[Task]:
|
200
|
-
"""Retrieves a list of tasks with cursor-based pagination.
|
201
|
-
|
202
|
-
By default, tasks are
|
203
|
-
returned in descending order (newest first).
|
204
|
-
|
205
|
-
## Default Behaviors:
|
206
|
-
|
207
|
-
- **limit**: Returns all tasks if not specified
|
208
|
-
- **start**: No start date filter (returns from beginning of time)
|
209
|
-
- **end**: No end date filter (returns up to current time)
|
210
|
-
- **cursor**: Starts from most recent tasks (no pagination offset)
|
211
|
-
- **sort**: 'desc' (descending order, newest first)
|
212
|
-
- **include_chunks**: false (excludes chunks for better performance)
|
213
|
-
- **base64_urls**: false (returns presigned URLs instead of base64)
|
214
|
-
|
215
|
-
## Common Usage Patterns:
|
216
|
-
|
217
|
-
**Basic usage (get all tasks):** `GET /api/v1/tasks`
|
218
|
-
|
219
|
-
**Get first 10 tasks:** `GET /api/v1/tasks?limit=10`
|
220
|
-
|
221
|
-
**Paginate through results:**
|
222
|
-
|
223
|
-
1. First request: `GET /api/v1/tasks?limit=10`
|
224
|
-
2. Use next_cursor from response for subsequent pages:
|
225
|
-
`GET /api/v1/tasks?limit=10&cursor=<timestamp>`
|
226
|
-
|
227
|
-
**Filter by date range:**
|
228
|
-
`GET /api/v1/tasks?start=2025-01-01T00:00:00Z&end=2025-12-31T23:59:59Z`
|
229
|
-
|
230
|
-
**Get detailed results with chunks:** `GET /api/v1/tasks?include_chunks=true`
|
231
|
-
|
232
|
-
**Get base64 encoded content:** `GET /api/v1/tasks?base64_urls=true`
|
233
|
-
|
234
|
-
**Get tasks in ascending order (oldest first):** `GET /api/v1/tasks?sort=asc`
|
235
|
-
|
236
|
-
**Get tasks in descending order (newest first, default):**
|
237
|
-
`GET /api/v1/tasks?sort=desc`
|
238
|
-
|
239
|
-
Args:
|
240
|
-
base64_urls: Whether to return base64 encoded URLs. If false, the URLs will be returned as
|
241
|
-
presigned URLs.
|
242
|
-
|
243
|
-
cursor: Cursor for pagination (timestamp)
|
244
|
-
|
245
|
-
end: End date
|
246
|
-
|
247
|
-
include_chunks: Whether to include chunks in the output response
|
248
|
-
|
249
|
-
limit: Number of tasks per page
|
250
|
-
|
251
|
-
sort: Sort order: 'asc' for ascending, 'desc' for descending (default)
|
252
|
-
|
253
|
-
start: Start date
|
254
|
-
|
255
|
-
extra_headers: Send extra headers
|
256
|
-
|
257
|
-
extra_query: Add additional query parameters to the request
|
258
|
-
|
259
|
-
extra_body: Add additional JSON properties to the request
|
260
|
-
|
261
|
-
timeout: Override the client-level default timeout for this request, in seconds
|
262
|
-
"""
|
263
|
-
return self._get_api_list(
|
264
|
-
"/tasks",
|
265
|
-
page=SyncTasksPage[Task],
|
266
|
-
options=make_request_options(
|
267
|
-
extra_headers=extra_headers,
|
268
|
-
extra_query=extra_query,
|
269
|
-
extra_body=extra_body,
|
270
|
-
timeout=timeout,
|
271
|
-
query=maybe_transform(
|
272
|
-
{
|
273
|
-
"base64_urls": base64_urls,
|
274
|
-
"cursor": cursor,
|
275
|
-
"end": end,
|
276
|
-
"include_chunks": include_chunks,
|
277
|
-
"limit": limit,
|
278
|
-
"sort": sort,
|
279
|
-
"start": start,
|
280
|
-
},
|
281
|
-
task_list_params.TaskListParams,
|
282
|
-
),
|
283
|
-
),
|
284
|
-
model=Task,
|
285
|
-
)
|
286
|
-
|
287
|
-
def delete(
|
288
|
-
self,
|
289
|
-
task_id: Optional[str],
|
290
|
-
*,
|
291
|
-
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
292
|
-
# The extra values given here take precedence over values defined on the client or passed to this method.
|
293
|
-
extra_headers: Headers | None = None,
|
294
|
-
extra_query: Query | None = None,
|
295
|
-
extra_body: Body | None = None,
|
296
|
-
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
297
|
-
) -> None:
|
298
|
-
"""
|
299
|
-
Delete a task by its ID.
|
300
|
-
|
301
|
-
Requirements:
|
302
|
-
|
303
|
-
- Task must have status `Succeeded` or `Failed`
|
304
|
-
|
305
|
-
Args:
|
306
|
-
extra_headers: Send extra headers
|
307
|
-
|
308
|
-
extra_query: Add additional query parameters to the request
|
309
|
-
|
310
|
-
extra_body: Add additional JSON properties to the request
|
311
|
-
|
312
|
-
timeout: Override the client-level default timeout for this request, in seconds
|
313
|
-
"""
|
314
|
-
if not task_id:
|
315
|
-
raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}")
|
316
|
-
extra_headers = {"Accept": "*/*", **(extra_headers or {})}
|
317
|
-
return self._delete(
|
318
|
-
f"/task/{task_id}",
|
319
|
-
options=make_request_options(
|
320
|
-
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
|
321
|
-
),
|
322
|
-
cast_to=NoneType,
|
323
|
-
)
|
324
|
-
|
325
|
-
def cancel(
|
326
|
-
self,
|
327
|
-
task_id: Optional[str],
|
328
|
-
*,
|
329
|
-
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
330
|
-
# The extra values given here take precedence over values defined on the client or passed to this method.
|
331
|
-
extra_headers: Headers | None = None,
|
332
|
-
extra_query: Query | None = None,
|
333
|
-
extra_body: Body | None = None,
|
334
|
-
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
335
|
-
) -> None:
|
336
|
-
"""
|
337
|
-
Cancel a task that hasn't started processing yet:
|
338
|
-
|
339
|
-
- For new tasks: Status will be updated to `Cancelled`
|
340
|
-
- For updating tasks: Task will revert to the previous state
|
341
|
-
|
342
|
-
Requirements:
|
343
|
-
|
344
|
-
- Task must have status `Starting`
|
345
|
-
|
346
|
-
Args:
|
347
|
-
extra_headers: Send extra headers
|
348
|
-
|
349
|
-
extra_query: Add additional query parameters to the request
|
350
|
-
|
351
|
-
extra_body: Add additional JSON properties to the request
|
352
|
-
|
353
|
-
timeout: Override the client-level default timeout for this request, in seconds
|
354
|
-
"""
|
355
|
-
if not task_id:
|
356
|
-
raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}")
|
357
|
-
extra_headers = {"Accept": "*/*", **(extra_headers or {})}
|
358
|
-
return self._get(
|
359
|
-
f"/task/{task_id}/cancel",
|
360
|
-
options=make_request_options(
|
361
|
-
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
|
362
|
-
),
|
363
|
-
cast_to=NoneType,
|
364
|
-
)
|
365
|
-
|
366
|
-
def get(
|
367
|
-
self,
|
368
|
-
task_id: Optional[str],
|
369
|
-
*,
|
370
|
-
base64_urls: bool | NotGiven = NOT_GIVEN,
|
371
|
-
include_chunks: bool | NotGiven = NOT_GIVEN,
|
372
|
-
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
373
|
-
# The extra values given here take precedence over values defined on the client or passed to this method.
|
374
|
-
extra_headers: Headers | None = None,
|
375
|
-
extra_query: Query | None = None,
|
376
|
-
extra_body: Body | None = None,
|
377
|
-
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
378
|
-
) -> Task:
|
379
|
-
"""
|
380
|
-
Retrieves detailed information about a task by its ID, including:
|
381
|
-
|
382
|
-
- Processing status
|
383
|
-
- Task configuration
|
384
|
-
- Output data (if processing is complete)
|
385
|
-
- File metadata (name, page count)
|
386
|
-
- Timestamps (created, started, finished)
|
387
|
-
- Presigned URLs for accessing files
|
388
|
-
|
389
|
-
This endpoint can be used to:
|
390
|
-
|
391
|
-
1. Poll the task status during processing
|
392
|
-
2. Retrieve the final output once processing is complete
|
393
|
-
3. Access task metadata and configuration
|
394
|
-
|
395
|
-
Args:
|
396
|
-
base64_urls: Whether to return base64 encoded URLs. If false, the URLs will be returned as
|
397
|
-
presigned URLs.
|
398
|
-
|
399
|
-
include_chunks: Whether to include chunks in the output response
|
400
|
-
|
401
|
-
extra_headers: Send extra headers
|
402
|
-
|
403
|
-
extra_query: Add additional query parameters to the request
|
404
|
-
|
405
|
-
extra_body: Add additional JSON properties to the request
|
406
|
-
|
407
|
-
timeout: Override the client-level default timeout for this request, in seconds
|
408
|
-
"""
|
409
|
-
if not task_id:
|
410
|
-
raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}")
|
411
|
-
return self._get(
|
412
|
-
f"/task/{task_id}",
|
413
|
-
options=make_request_options(
|
414
|
-
extra_headers=extra_headers,
|
415
|
-
extra_query=extra_query,
|
416
|
-
extra_body=extra_body,
|
417
|
-
timeout=timeout,
|
418
|
-
query=maybe_transform(
|
419
|
-
{
|
420
|
-
"base64_urls": base64_urls,
|
421
|
-
"include_chunks": include_chunks,
|
422
|
-
},
|
423
|
-
task_get_params.TaskGetParams,
|
424
|
-
),
|
425
|
-
),
|
426
|
-
cast_to=Task,
|
427
|
-
)
|
428
|
-
|
429
|
-
def parse(
|
430
|
-
self,
|
431
|
-
*,
|
432
|
-
file: str,
|
433
|
-
chunk_processing: Optional[ChunkProcessingParam] | NotGiven = NOT_GIVEN,
|
434
|
-
error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
|
435
|
-
expires_in: Optional[int] | NotGiven = NOT_GIVEN,
|
436
|
-
file_name: Optional[str] | NotGiven = NOT_GIVEN,
|
437
|
-
llm_processing: Optional[LlmProcessingParam] | NotGiven = NOT_GIVEN,
|
438
|
-
ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
|
439
|
-
pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
|
440
|
-
segment_processing: Optional[SegmentProcessingParam] | NotGiven = NOT_GIVEN,
|
441
|
-
segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
|
442
|
-
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
443
|
-
# The extra values given here take precedence over values defined on the client or passed to this method.
|
444
|
-
extra_headers: Headers | None = None,
|
445
|
-
extra_query: Query | None = None,
|
446
|
-
extra_body: Body | None = None,
|
447
|
-
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
448
|
-
) -> Task:
|
449
|
-
"""
|
450
|
-
Queues a document for processing and returns a TaskResponse containing:
|
451
|
-
|
452
|
-
- Task ID for status polling
|
453
|
-
- Initial configuration
|
454
|
-
- File metadata
|
455
|
-
- Processing status
|
456
|
-
- Creation timestamp
|
457
|
-
- Presigned URLs for file access
|
458
|
-
|
459
|
-
The returned task will typically be in a `Starting` or `Processing` state. Use
|
460
|
-
the `GET /task/{task_id}` endpoint to poll for completion.
|
461
|
-
|
462
|
-
Args:
|
463
|
-
file: The file to be uploaded. Can be a URL or a base64 encoded file.
|
464
|
-
|
465
|
-
chunk_processing: Controls the setting for the chunking and post-processing of each chunk.
|
466
|
-
|
467
|
-
error_handling:
|
468
|
-
Controls how errors are handled during processing:
|
469
|
-
|
470
|
-
- `Fail`: Stops processing and fails the task when any error occurs
|
471
|
-
- `Continue`: Attempts to continue processing despite non-critical errors (eg.
|
472
|
-
LLM refusals etc.)
|
473
|
-
|
474
|
-
expires_in: The number of seconds until task is deleted. Expired tasks can **not** be
|
475
|
-
updated, polled or accessed via web interface.
|
476
|
-
|
477
|
-
file_name: The name of the file to be uploaded. If not set a name will be generated.
|
478
|
-
|
479
|
-
llm_processing: Controls the LLM used for the task.
|
480
|
-
|
481
|
-
ocr_strategy: Controls the Optical Character Recognition (OCR) strategy.
|
482
|
-
|
483
|
-
- `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
|
484
|
-
- `Auto`: Selectively applies OCR only to pages with missing or low-quality
|
485
|
-
text. When text layer is present the bounding boxes from the text layer are
|
486
|
-
used.
|
487
|
-
|
488
|
-
pipeline: Choose the provider whose models will be used for segmentation and OCR. The
|
489
|
-
output will be unified to the Chunkr `output` format.
|
490
|
-
|
491
|
-
segment_processing: Defines how each segment type is handled when generating the final output.
|
492
|
-
|
493
|
-
Each segment uses one of three strategies. The chosen strategy controls: •
|
494
|
-
Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`). • How the
|
495
|
-
content is produced (rule-based vs. LLM). • The output format (`Html` or
|
496
|
-
`Markdown`).
|
497
|
-
|
498
|
-
Optional flags such as image **cropping**, **extended context**, and **LLM
|
499
|
-
descriptions** further refine behaviour.
|
500
|
-
|
501
|
-
---
|
502
|
-
|
503
|
-
**Default strategy per segment** • `Title`, `SectionHeader`, `Text`, `ListItem`,
|
504
|
-
`Caption`, `Footnote` → **Auto** (Markdown) • `Table` → **LLM** (HTML,
|
505
|
-
description on) • `Picture` → **LLM** (Markdown, description on, cropping _All_)
|
506
|
-
• `Formula`, `Page` → **LLM** (Markdown) • `PageHeader`, `PageFooter` →
|
507
|
-
**Ignore** (removed from output)
|
508
|
-
|
509
|
-
---
|
510
|
-
|
511
|
-
**Strategy reference** • **Auto** – rule-based content generation. • **LLM** –
|
512
|
-
generate content with an LLM. • **Ignore** – exclude the segment entirely.
|
513
|
-
|
514
|
-
segmentation_strategy:
|
515
|
-
Controls the segmentation strategy:
|
516
|
-
|
517
|
-
- `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
|
518
|
-
`Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
|
519
|
-
segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
|
520
|
-
- `Page`: Treats each page as a single segment. Faster processing, but without
|
521
|
-
layout element detection and only simple chunking.
|
522
|
-
|
523
|
-
extra_headers: Send extra headers
|
524
|
-
|
525
|
-
extra_query: Add additional query parameters to the request
|
526
|
-
|
527
|
-
extra_body: Add additional JSON properties to the request
|
528
|
-
|
529
|
-
timeout: Override the client-level default timeout for this request, in seconds
|
530
|
-
"""
|
531
|
-
return self._post(
|
532
|
-
"/task/parse",
|
533
|
-
body=maybe_transform(
|
534
|
-
{
|
535
|
-
"file": file,
|
536
|
-
"chunk_processing": chunk_processing,
|
537
|
-
"error_handling": error_handling,
|
538
|
-
"expires_in": expires_in,
|
539
|
-
"file_name": file_name,
|
540
|
-
"llm_processing": llm_processing,
|
541
|
-
"ocr_strategy": ocr_strategy,
|
542
|
-
"pipeline": pipeline,
|
543
|
-
"segment_processing": segment_processing,
|
544
|
-
"segmentation_strategy": segmentation_strategy,
|
545
|
-
},
|
546
|
-
task_parse_params.TaskParseParams,
|
547
|
-
),
|
548
|
-
options=make_request_options(
|
549
|
-
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
|
550
|
-
),
|
551
|
-
cast_to=Task,
|
552
|
-
)
|
553
|
-
|
554
|
-
|
555
|
-
class AsyncTaskResource(AsyncAPIResource):
|
556
|
-
@cached_property
|
557
|
-
def with_raw_response(self) -> AsyncTaskResourceWithRawResponse:
|
558
|
-
"""
|
559
|
-
This property can be used as a prefix for any HTTP method call to return
|
560
|
-
the raw response object instead of the parsed content.
|
561
|
-
|
562
|
-
For more information, see https://www.github.com/lumina-ai-inc/chunkr-python#accessing-raw-response-data-eg-headers
|
563
|
-
"""
|
564
|
-
return AsyncTaskResourceWithRawResponse(self)
|
565
|
-
|
566
|
-
@cached_property
|
567
|
-
def with_streaming_response(self) -> AsyncTaskResourceWithStreamingResponse:
|
568
|
-
"""
|
569
|
-
An alternative to `.with_raw_response` that doesn't eagerly read the response body.
|
570
|
-
|
571
|
-
For more information, see https://www.github.com/lumina-ai-inc/chunkr-python#with_streaming_response
|
572
|
-
"""
|
573
|
-
return AsyncTaskResourceWithStreamingResponse(self)
|
574
|
-
|
575
|
-
async def update(
|
576
|
-
self,
|
577
|
-
task_id: str,
|
578
|
-
*,
|
579
|
-
chunk_processing: Optional[ChunkProcessingParam] | NotGiven = NOT_GIVEN,
|
580
|
-
error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
|
581
|
-
expires_in: Optional[int] | NotGiven = NOT_GIVEN,
|
582
|
-
high_resolution: Optional[bool] | NotGiven = NOT_GIVEN,
|
583
|
-
llm_processing: Optional[LlmProcessingParam] | NotGiven = NOT_GIVEN,
|
584
|
-
ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
|
585
|
-
pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
|
586
|
-
segment_processing: Optional[SegmentProcessingParam] | NotGiven = NOT_GIVEN,
|
587
|
-
segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
|
588
|
-
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
589
|
-
# The extra values given here take precedence over values defined on the client or passed to this method.
|
590
|
-
extra_headers: Headers | None = None,
|
591
|
-
extra_query: Query | None = None,
|
592
|
-
extra_body: Body | None = None,
|
593
|
-
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
594
|
-
) -> Task:
|
595
|
-
"""Updates an existing task's configuration and reprocesses the document.
|
596
|
-
|
597
|
-
The
|
598
|
-
original configuration will be used for all values that are not provided in the
|
599
|
-
update.
|
600
|
-
|
601
|
-
Requirements:
|
602
|
-
|
603
|
-
- Task must have status `Succeeded` or `Failed`
|
604
|
-
- New configuration must be different from the current one
|
605
|
-
|
606
|
-
The returned task will typically be in a `Starting` or `Processing` state. Use
|
607
|
-
the `GET /task/{task_id}` endpoint to poll for completion.
|
608
|
-
|
609
|
-
Args:
|
610
|
-
chunk_processing: Controls the setting for the chunking and post-processing of each chunk.
|
611
|
-
|
612
|
-
error_handling:
|
613
|
-
Controls how errors are handled during processing:
|
614
|
-
|
615
|
-
- `Fail`: Stops processing and fails the task when any error occurs
|
616
|
-
- `Continue`: Attempts to continue processing despite non-critical errors (eg.
|
617
|
-
LLM refusals etc.)
|
618
|
-
|
619
|
-
expires_in: The number of seconds until task is deleted. Expired tasks can **not** be
|
620
|
-
updated, polled or accessed via web interface.
|
621
|
-
|
622
|
-
high_resolution: Whether to use high-resolution images for cropping and post-processing. (Latency
|
623
|
-
penalty: ~7 seconds per page)
|
624
|
-
|
625
|
-
llm_processing: Controls the LLM used for the task.
|
626
|
-
|
627
|
-
ocr_strategy: Controls the Optical Character Recognition (OCR) strategy.
|
628
|
-
|
629
|
-
- `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
|
630
|
-
- `Auto`: Selectively applies OCR only to pages with missing or low-quality
|
631
|
-
text. When text layer is present the bounding boxes from the text layer are
|
632
|
-
used.
|
633
|
-
|
634
|
-
pipeline: Choose the provider whose models will be used for segmentation and OCR. The
|
635
|
-
output will be unified to the Chunkr `output` format.
|
636
|
-
|
637
|
-
segment_processing: Defines how each segment type is handled when generating the final output.
|
638
|
-
|
639
|
-
Each segment uses one of three strategies. The chosen strategy controls: •
|
640
|
-
Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`). • How the
|
641
|
-
content is produced (rule-based vs. LLM). • The output format (`Html` or
|
642
|
-
`Markdown`).
|
643
|
-
|
644
|
-
Optional flags such as image **cropping**, **extended context**, and **LLM
|
645
|
-
descriptions** further refine behaviour.
|
646
|
-
|
647
|
-
---
|
648
|
-
|
649
|
-
**Default strategy per segment** • `Title`, `SectionHeader`, `Text`, `ListItem`,
|
650
|
-
`Caption`, `Footnote` → **Auto** (Markdown) • `Table` → **LLM** (HTML,
|
651
|
-
description on) • `Picture` → **LLM** (Markdown, description on, cropping _All_)
|
652
|
-
• `Formula`, `Page` → **LLM** (Markdown) • `PageHeader`, `PageFooter` →
|
653
|
-
**Ignore** (removed from output)
|
654
|
-
|
655
|
-
---
|
656
|
-
|
657
|
-
**Strategy reference** • **Auto** – rule-based content generation. • **LLM** –
|
658
|
-
generate content with an LLM. • **Ignore** – exclude the segment entirely.
|
659
|
-
|
660
|
-
segmentation_strategy:
|
661
|
-
Controls the segmentation strategy:
|
662
|
-
|
663
|
-
- `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
|
664
|
-
`Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
|
665
|
-
segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
|
666
|
-
- `Page`: Treats each page as a single segment. Faster processing, but without
|
667
|
-
layout element detection and only simple chunking.
|
668
|
-
|
669
|
-
extra_headers: Send extra headers
|
670
|
-
|
671
|
-
extra_query: Add additional query parameters to the request
|
672
|
-
|
673
|
-
extra_body: Add additional JSON properties to the request
|
674
|
-
|
675
|
-
timeout: Override the client-level default timeout for this request, in seconds
|
676
|
-
"""
|
677
|
-
if not task_id:
|
678
|
-
raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}")
|
679
|
-
return await self._patch(
|
680
|
-
f"/task/{task_id}/parse",
|
681
|
-
body=await async_maybe_transform(
|
682
|
-
{
|
683
|
-
"chunk_processing": chunk_processing,
|
684
|
-
"error_handling": error_handling,
|
685
|
-
"expires_in": expires_in,
|
686
|
-
"high_resolution": high_resolution,
|
687
|
-
"llm_processing": llm_processing,
|
688
|
-
"ocr_strategy": ocr_strategy,
|
689
|
-
"pipeline": pipeline,
|
690
|
-
"segment_processing": segment_processing,
|
691
|
-
"segmentation_strategy": segmentation_strategy,
|
692
|
-
},
|
693
|
-
task_update_params.TaskUpdateParams,
|
694
|
-
),
|
695
|
-
options=make_request_options(
|
696
|
-
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
|
697
|
-
),
|
698
|
-
cast_to=Task,
|
699
|
-
)
|
700
|
-
|
701
|
-
def list(
|
702
|
-
self,
|
703
|
-
*,
|
704
|
-
base64_urls: bool | NotGiven = NOT_GIVEN,
|
705
|
-
cursor: Union[str, datetime] | NotGiven = NOT_GIVEN,
|
706
|
-
end: Union[str, datetime] | NotGiven = NOT_GIVEN,
|
707
|
-
include_chunks: bool | NotGiven = NOT_GIVEN,
|
708
|
-
limit: int | NotGiven = NOT_GIVEN,
|
709
|
-
sort: Literal["asc", "desc"] | NotGiven = NOT_GIVEN,
|
710
|
-
start: Union[str, datetime] | NotGiven = NOT_GIVEN,
|
711
|
-
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
712
|
-
# The extra values given here take precedence over values defined on the client or passed to this method.
|
713
|
-
extra_headers: Headers | None = None,
|
714
|
-
extra_query: Query | None = None,
|
715
|
-
extra_body: Body | None = None,
|
716
|
-
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
717
|
-
) -> AsyncPaginator[Task, AsyncTasksPage[Task]]:
|
718
|
-
"""Retrieves a list of tasks with cursor-based pagination.
|
719
|
-
|
720
|
-
By default, tasks are
|
721
|
-
returned in descending order (newest first).
|
722
|
-
|
723
|
-
## Default Behaviors:
|
724
|
-
|
725
|
-
- **limit**: Returns all tasks if not specified
|
726
|
-
- **start**: No start date filter (returns from beginning of time)
|
727
|
-
- **end**: No end date filter (returns up to current time)
|
728
|
-
- **cursor**: Starts from most recent tasks (no pagination offset)
|
729
|
-
- **sort**: 'desc' (descending order, newest first)
|
730
|
-
- **include_chunks**: false (excludes chunks for better performance)
|
731
|
-
- **base64_urls**: false (returns presigned URLs instead of base64)
|
732
|
-
|
733
|
-
## Common Usage Patterns:
|
734
|
-
|
735
|
-
**Basic usage (get all tasks):** `GET /api/v1/tasks`
|
736
|
-
|
737
|
-
**Get first 10 tasks:** `GET /api/v1/tasks?limit=10`
|
738
|
-
|
739
|
-
**Paginate through results:**
|
740
|
-
|
741
|
-
1. First request: `GET /api/v1/tasks?limit=10`
|
742
|
-
2. Use next_cursor from response for subsequent pages:
|
743
|
-
`GET /api/v1/tasks?limit=10&cursor=<timestamp>`
|
744
|
-
|
745
|
-
**Filter by date range:**
|
746
|
-
`GET /api/v1/tasks?start=2025-01-01T00:00:00Z&end=2025-12-31T23:59:59Z`
|
747
|
-
|
748
|
-
**Get detailed results with chunks:** `GET /api/v1/tasks?include_chunks=true`
|
749
|
-
|
750
|
-
**Get base64 encoded content:** `GET /api/v1/tasks?base64_urls=true`
|
751
|
-
|
752
|
-
**Get tasks in ascending order (oldest first):** `GET /api/v1/tasks?sort=asc`
|
753
|
-
|
754
|
-
**Get tasks in descending order (newest first, default):**
|
755
|
-
`GET /api/v1/tasks?sort=desc`
|
756
|
-
|
757
|
-
Args:
|
758
|
-
base64_urls: Whether to return base64 encoded URLs. If false, the URLs will be returned as
|
759
|
-
presigned URLs.
|
760
|
-
|
761
|
-
cursor: Cursor for pagination (timestamp)
|
762
|
-
|
763
|
-
end: End date
|
764
|
-
|
765
|
-
include_chunks: Whether to include chunks in the output response
|
766
|
-
|
767
|
-
limit: Number of tasks per page
|
768
|
-
|
769
|
-
sort: Sort order: 'asc' for ascending, 'desc' for descending (default)
|
770
|
-
|
771
|
-
start: Start date
|
772
|
-
|
773
|
-
extra_headers: Send extra headers
|
774
|
-
|
775
|
-
extra_query: Add additional query parameters to the request
|
776
|
-
|
777
|
-
extra_body: Add additional JSON properties to the request
|
778
|
-
|
779
|
-
timeout: Override the client-level default timeout for this request, in seconds
|
780
|
-
"""
|
781
|
-
return self._get_api_list(
|
782
|
-
"/tasks",
|
783
|
-
page=AsyncTasksPage[Task],
|
784
|
-
options=make_request_options(
|
785
|
-
extra_headers=extra_headers,
|
786
|
-
extra_query=extra_query,
|
787
|
-
extra_body=extra_body,
|
788
|
-
timeout=timeout,
|
789
|
-
query=maybe_transform(
|
790
|
-
{
|
791
|
-
"base64_urls": base64_urls,
|
792
|
-
"cursor": cursor,
|
793
|
-
"end": end,
|
794
|
-
"include_chunks": include_chunks,
|
795
|
-
"limit": limit,
|
796
|
-
"sort": sort,
|
797
|
-
"start": start,
|
798
|
-
},
|
799
|
-
task_list_params.TaskListParams,
|
800
|
-
),
|
801
|
-
),
|
802
|
-
model=Task,
|
803
|
-
)
|
804
|
-
|
805
|
-
async def delete(
|
806
|
-
self,
|
807
|
-
task_id: Optional[str],
|
808
|
-
*,
|
809
|
-
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
810
|
-
# The extra values given here take precedence over values defined on the client or passed to this method.
|
811
|
-
extra_headers: Headers | None = None,
|
812
|
-
extra_query: Query | None = None,
|
813
|
-
extra_body: Body | None = None,
|
814
|
-
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
815
|
-
) -> None:
|
816
|
-
"""
|
817
|
-
Delete a task by its ID.
|
818
|
-
|
819
|
-
Requirements:
|
820
|
-
|
821
|
-
- Task must have status `Succeeded` or `Failed`
|
822
|
-
|
823
|
-
Args:
|
824
|
-
extra_headers: Send extra headers
|
825
|
-
|
826
|
-
extra_query: Add additional query parameters to the request
|
827
|
-
|
828
|
-
extra_body: Add additional JSON properties to the request
|
829
|
-
|
830
|
-
timeout: Override the client-level default timeout for this request, in seconds
|
831
|
-
"""
|
832
|
-
if not task_id:
|
833
|
-
raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}")
|
834
|
-
extra_headers = {"Accept": "*/*", **(extra_headers or {})}
|
835
|
-
return await self._delete(
|
836
|
-
f"/task/{task_id}",
|
837
|
-
options=make_request_options(
|
838
|
-
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
|
839
|
-
),
|
840
|
-
cast_to=NoneType,
|
841
|
-
)
|
842
|
-
|
843
|
-
async def cancel(
|
844
|
-
self,
|
845
|
-
task_id: Optional[str],
|
846
|
-
*,
|
847
|
-
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
848
|
-
# The extra values given here take precedence over values defined on the client or passed to this method.
|
849
|
-
extra_headers: Headers | None = None,
|
850
|
-
extra_query: Query | None = None,
|
851
|
-
extra_body: Body | None = None,
|
852
|
-
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
853
|
-
) -> None:
|
854
|
-
"""
|
855
|
-
Cancel a task that hasn't started processing yet:
|
856
|
-
|
857
|
-
- For new tasks: Status will be updated to `Cancelled`
|
858
|
-
- For updating tasks: Task will revert to the previous state
|
859
|
-
|
860
|
-
Requirements:
|
861
|
-
|
862
|
-
- Task must have status `Starting`
|
863
|
-
|
864
|
-
Args:
|
865
|
-
extra_headers: Send extra headers
|
866
|
-
|
867
|
-
extra_query: Add additional query parameters to the request
|
868
|
-
|
869
|
-
extra_body: Add additional JSON properties to the request
|
870
|
-
|
871
|
-
timeout: Override the client-level default timeout for this request, in seconds
|
872
|
-
"""
|
873
|
-
if not task_id:
|
874
|
-
raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}")
|
875
|
-
extra_headers = {"Accept": "*/*", **(extra_headers or {})}
|
876
|
-
return await self._get(
|
877
|
-
f"/task/{task_id}/cancel",
|
878
|
-
options=make_request_options(
|
879
|
-
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
|
880
|
-
),
|
881
|
-
cast_to=NoneType,
|
882
|
-
)
|
883
|
-
|
884
|
-
async def get(
|
885
|
-
self,
|
886
|
-
task_id: Optional[str],
|
887
|
-
*,
|
888
|
-
base64_urls: bool | NotGiven = NOT_GIVEN,
|
889
|
-
include_chunks: bool | NotGiven = NOT_GIVEN,
|
890
|
-
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
891
|
-
# The extra values given here take precedence over values defined on the client or passed to this method.
|
892
|
-
extra_headers: Headers | None = None,
|
893
|
-
extra_query: Query | None = None,
|
894
|
-
extra_body: Body | None = None,
|
895
|
-
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
896
|
-
) -> Task:
|
897
|
-
"""
|
898
|
-
Retrieves detailed information about a task by its ID, including:
|
899
|
-
|
900
|
-
- Processing status
|
901
|
-
- Task configuration
|
902
|
-
- Output data (if processing is complete)
|
903
|
-
- File metadata (name, page count)
|
904
|
-
- Timestamps (created, started, finished)
|
905
|
-
- Presigned URLs for accessing files
|
906
|
-
|
907
|
-
This endpoint can be used to:
|
908
|
-
|
909
|
-
1. Poll the task status during processing
|
910
|
-
2. Retrieve the final output once processing is complete
|
911
|
-
3. Access task metadata and configuration
|
912
|
-
|
913
|
-
Args:
|
914
|
-
base64_urls: Whether to return base64 encoded URLs. If false, the URLs will be returned as
|
915
|
-
presigned URLs.
|
916
|
-
|
917
|
-
include_chunks: Whether to include chunks in the output response
|
918
|
-
|
919
|
-
extra_headers: Send extra headers
|
920
|
-
|
921
|
-
extra_query: Add additional query parameters to the request
|
922
|
-
|
923
|
-
extra_body: Add additional JSON properties to the request
|
924
|
-
|
925
|
-
timeout: Override the client-level default timeout for this request, in seconds
|
926
|
-
"""
|
927
|
-
if not task_id:
|
928
|
-
raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}")
|
929
|
-
return await self._get(
|
930
|
-
f"/task/{task_id}",
|
931
|
-
options=make_request_options(
|
932
|
-
extra_headers=extra_headers,
|
933
|
-
extra_query=extra_query,
|
934
|
-
extra_body=extra_body,
|
935
|
-
timeout=timeout,
|
936
|
-
query=await async_maybe_transform(
|
937
|
-
{
|
938
|
-
"base64_urls": base64_urls,
|
939
|
-
"include_chunks": include_chunks,
|
940
|
-
},
|
941
|
-
task_get_params.TaskGetParams,
|
942
|
-
),
|
943
|
-
),
|
944
|
-
cast_to=Task,
|
945
|
-
)
|
946
|
-
|
947
|
-
async def parse(
|
948
|
-
self,
|
949
|
-
*,
|
950
|
-
file: str,
|
951
|
-
chunk_processing: Optional[ChunkProcessingParam] | NotGiven = NOT_GIVEN,
|
952
|
-
error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
|
953
|
-
expires_in: Optional[int] | NotGiven = NOT_GIVEN,
|
954
|
-
file_name: Optional[str] | NotGiven = NOT_GIVEN,
|
955
|
-
llm_processing: Optional[LlmProcessingParam] | NotGiven = NOT_GIVEN,
|
956
|
-
ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
|
957
|
-
pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
|
958
|
-
segment_processing: Optional[SegmentProcessingParam] | NotGiven = NOT_GIVEN,
|
959
|
-
segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
|
960
|
-
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
961
|
-
# The extra values given here take precedence over values defined on the client or passed to this method.
|
962
|
-
extra_headers: Headers | None = None,
|
963
|
-
extra_query: Query | None = None,
|
964
|
-
extra_body: Body | None = None,
|
965
|
-
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
966
|
-
) -> Task:
|
967
|
-
"""
|
968
|
-
Queues a document for processing and returns a TaskResponse containing:
|
969
|
-
|
970
|
-
- Task ID for status polling
|
971
|
-
- Initial configuration
|
972
|
-
- File metadata
|
973
|
-
- Processing status
|
974
|
-
- Creation timestamp
|
975
|
-
- Presigned URLs for file access
|
976
|
-
|
977
|
-
The returned task will typically be in a `Starting` or `Processing` state. Use
|
978
|
-
the `GET /task/{task_id}` endpoint to poll for completion.
|
979
|
-
|
980
|
-
Args:
|
981
|
-
file: The file to be uploaded. Can be a URL or a base64 encoded file.
|
982
|
-
|
983
|
-
chunk_processing: Controls the setting for the chunking and post-processing of each chunk.
|
984
|
-
|
985
|
-
error_handling:
|
986
|
-
Controls how errors are handled during processing:
|
987
|
-
|
988
|
-
- `Fail`: Stops processing and fails the task when any error occurs
|
989
|
-
- `Continue`: Attempts to continue processing despite non-critical errors (eg.
|
990
|
-
LLM refusals etc.)
|
991
|
-
|
992
|
-
expires_in: The number of seconds until task is deleted. Expired tasks can **not** be
|
993
|
-
updated, polled or accessed via web interface.
|
994
|
-
|
995
|
-
file_name: The name of the file to be uploaded. If not set a name will be generated.
|
996
|
-
|
997
|
-
llm_processing: Controls the LLM used for the task.
|
998
|
-
|
999
|
-
ocr_strategy: Controls the Optical Character Recognition (OCR) strategy.
|
1000
|
-
|
1001
|
-
- `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
|
1002
|
-
- `Auto`: Selectively applies OCR only to pages with missing or low-quality
|
1003
|
-
text. When text layer is present the bounding boxes from the text layer are
|
1004
|
-
used.
|
1005
|
-
|
1006
|
-
pipeline: Choose the provider whose models will be used for segmentation and OCR. The
|
1007
|
-
output will be unified to the Chunkr `output` format.
|
1008
|
-
|
1009
|
-
segment_processing: Defines how each segment type is handled when generating the final output.
|
1010
|
-
|
1011
|
-
Each segment uses one of three strategies. The chosen strategy controls: •
|
1012
|
-
Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`). • How the
|
1013
|
-
content is produced (rule-based vs. LLM). • The output format (`Html` or
|
1014
|
-
`Markdown`).
|
1015
|
-
|
1016
|
-
Optional flags such as image **cropping**, **extended context**, and **LLM
|
1017
|
-
descriptions** further refine behaviour.
|
1018
|
-
|
1019
|
-
---
|
1020
|
-
|
1021
|
-
**Default strategy per segment** • `Title`, `SectionHeader`, `Text`, `ListItem`,
|
1022
|
-
`Caption`, `Footnote` → **Auto** (Markdown) • `Table` → **LLM** (HTML,
|
1023
|
-
description on) • `Picture` → **LLM** (Markdown, description on, cropping _All_)
|
1024
|
-
• `Formula`, `Page` → **LLM** (Markdown) • `PageHeader`, `PageFooter` →
|
1025
|
-
**Ignore** (removed from output)
|
1026
|
-
|
1027
|
-
---
|
1028
|
-
|
1029
|
-
**Strategy reference** • **Auto** – rule-based content generation. • **LLM** –
|
1030
|
-
generate content with an LLM. • **Ignore** – exclude the segment entirely.
|
1031
|
-
|
1032
|
-
segmentation_strategy:
|
1033
|
-
Controls the segmentation strategy:
|
1034
|
-
|
1035
|
-
- `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
|
1036
|
-
`Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
|
1037
|
-
segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
|
1038
|
-
- `Page`: Treats each page as a single segment. Faster processing, but without
|
1039
|
-
layout element detection and only simple chunking.
|
1040
|
-
|
1041
|
-
extra_headers: Send extra headers
|
1042
|
-
|
1043
|
-
extra_query: Add additional query parameters to the request
|
1044
|
-
|
1045
|
-
extra_body: Add additional JSON properties to the request
|
1046
|
-
|
1047
|
-
timeout: Override the client-level default timeout for this request, in seconds
|
1048
|
-
"""
|
1049
|
-
return await self._post(
|
1050
|
-
"/task/parse",
|
1051
|
-
body=await async_maybe_transform(
|
1052
|
-
{
|
1053
|
-
"file": file,
|
1054
|
-
"chunk_processing": chunk_processing,
|
1055
|
-
"error_handling": error_handling,
|
1056
|
-
"expires_in": expires_in,
|
1057
|
-
"file_name": file_name,
|
1058
|
-
"llm_processing": llm_processing,
|
1059
|
-
"ocr_strategy": ocr_strategy,
|
1060
|
-
"pipeline": pipeline,
|
1061
|
-
"segment_processing": segment_processing,
|
1062
|
-
"segmentation_strategy": segmentation_strategy,
|
1063
|
-
},
|
1064
|
-
task_parse_params.TaskParseParams,
|
1065
|
-
),
|
1066
|
-
options=make_request_options(
|
1067
|
-
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
|
1068
|
-
),
|
1069
|
-
cast_to=Task,
|
1070
|
-
)
|
1071
|
-
|
1072
|
-
|
1073
|
-
class TaskResourceWithRawResponse:
|
1074
|
-
def __init__(self, task: TaskResource) -> None:
|
1075
|
-
self._task = task
|
1076
|
-
|
1077
|
-
self.update = to_raw_response_wrapper(
|
1078
|
-
task.update,
|
1079
|
-
)
|
1080
|
-
self.list = to_raw_response_wrapper(
|
1081
|
-
task.list,
|
1082
|
-
)
|
1083
|
-
self.delete = to_raw_response_wrapper(
|
1084
|
-
task.delete,
|
1085
|
-
)
|
1086
|
-
self.cancel = to_raw_response_wrapper(
|
1087
|
-
task.cancel,
|
1088
|
-
)
|
1089
|
-
self.get = to_raw_response_wrapper(
|
1090
|
-
task.get,
|
1091
|
-
)
|
1092
|
-
self.parse = to_raw_response_wrapper(
|
1093
|
-
task.parse,
|
1094
|
-
)
|
1095
|
-
|
1096
|
-
|
1097
|
-
class AsyncTaskResourceWithRawResponse:
|
1098
|
-
def __init__(self, task: AsyncTaskResource) -> None:
|
1099
|
-
self._task = task
|
1100
|
-
|
1101
|
-
self.update = async_to_raw_response_wrapper(
|
1102
|
-
task.update,
|
1103
|
-
)
|
1104
|
-
self.list = async_to_raw_response_wrapper(
|
1105
|
-
task.list,
|
1106
|
-
)
|
1107
|
-
self.delete = async_to_raw_response_wrapper(
|
1108
|
-
task.delete,
|
1109
|
-
)
|
1110
|
-
self.cancel = async_to_raw_response_wrapper(
|
1111
|
-
task.cancel,
|
1112
|
-
)
|
1113
|
-
self.get = async_to_raw_response_wrapper(
|
1114
|
-
task.get,
|
1115
|
-
)
|
1116
|
-
self.parse = async_to_raw_response_wrapper(
|
1117
|
-
task.parse,
|
1118
|
-
)
|
1119
|
-
|
1120
|
-
|
1121
|
-
class TaskResourceWithStreamingResponse:
|
1122
|
-
def __init__(self, task: TaskResource) -> None:
|
1123
|
-
self._task = task
|
1124
|
-
|
1125
|
-
self.update = to_streamed_response_wrapper(
|
1126
|
-
task.update,
|
1127
|
-
)
|
1128
|
-
self.list = to_streamed_response_wrapper(
|
1129
|
-
task.list,
|
1130
|
-
)
|
1131
|
-
self.delete = to_streamed_response_wrapper(
|
1132
|
-
task.delete,
|
1133
|
-
)
|
1134
|
-
self.cancel = to_streamed_response_wrapper(
|
1135
|
-
task.cancel,
|
1136
|
-
)
|
1137
|
-
self.get = to_streamed_response_wrapper(
|
1138
|
-
task.get,
|
1139
|
-
)
|
1140
|
-
self.parse = to_streamed_response_wrapper(
|
1141
|
-
task.parse,
|
1142
|
-
)
|
1143
|
-
|
1144
|
-
|
1145
|
-
class AsyncTaskResourceWithStreamingResponse:
|
1146
|
-
def __init__(self, task: AsyncTaskResource) -> None:
|
1147
|
-
self._task = task
|
1148
|
-
|
1149
|
-
self.update = async_to_streamed_response_wrapper(
|
1150
|
-
task.update,
|
1151
|
-
)
|
1152
|
-
self.list = async_to_streamed_response_wrapper(
|
1153
|
-
task.list,
|
1154
|
-
)
|
1155
|
-
self.delete = async_to_streamed_response_wrapper(
|
1156
|
-
task.delete,
|
1157
|
-
)
|
1158
|
-
self.cancel = async_to_streamed_response_wrapper(
|
1159
|
-
task.cancel,
|
1160
|
-
)
|
1161
|
-
self.get = async_to_streamed_response_wrapper(
|
1162
|
-
task.get,
|
1163
|
-
)
|
1164
|
-
self.parse = async_to_streamed_response_wrapper(
|
1165
|
-
task.parse,
|
1166
|
-
)
|