chunkr-ai 0.1.0a4__py3-none-any.whl → 0.1.0a5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunkr_ai/__init__.py +0 -3
- chunkr_ai/_client.py +4 -0
- chunkr_ai/_constants.py +5 -5
- chunkr_ai/_version.py +1 -1
- chunkr_ai/resources/files.py +32 -4
- chunkr_ai/resources/tasks/parse.py +208 -84
- chunkr_ai/resources/tasks/tasks.py +46 -24
- chunkr_ai/types/task.py +18 -11
- chunkr_ai/types/task_get_params.py +3 -0
- chunkr_ai/types/tasks/parse_create_params.py +32 -13
- chunkr_ai/types/tasks/parse_update_params.py +30 -11
- {chunkr_ai-0.1.0a4.dist-info → chunkr_ai-0.1.0a5.dist-info}/METADATA +4 -4
- {chunkr_ai-0.1.0a4.dist-info → chunkr_ai-0.1.0a5.dist-info}/RECORD +15 -16
- chunkr_ai/lib/tasks_poll.py +0 -122
- {chunkr_ai-0.1.0a4.dist-info → chunkr_ai-0.1.0a5.dist-info}/WHEEL +0 -0
- {chunkr_ai-0.1.0a4.dist-info → chunkr_ai-0.1.0a5.dist-info}/licenses/LICENSE +0 -0
chunkr_ai/__init__.py
CHANGED
@@ -72,9 +72,6 @@ __all__ = [
|
|
72
72
|
]
|
73
73
|
|
74
74
|
if not _t.TYPE_CHECKING:
|
75
|
-
# Load custom helpers that monkey-patch generated types.
|
76
|
-
# This keeps custom code separate from generated files, per Stainless guidance.
|
77
|
-
from .lib import tasks_poll as _tasks_poll # noqa: F401
|
78
75
|
from ._utils._resources_proxy import resources as resources
|
79
76
|
|
80
77
|
_setup_logging()
|
chunkr_ai/_client.py
CHANGED
@@ -95,6 +95,8 @@ class Chunkr(SyncAPIClient):
|
|
95
95
|
_strict_response_validation=_strict_response_validation,
|
96
96
|
)
|
97
97
|
|
98
|
+
self._idempotency_header = "Idempotency-Key"
|
99
|
+
|
98
100
|
self.tasks = tasks.TasksResource(self)
|
99
101
|
self.files = files.FilesResource(self)
|
100
102
|
self.health = health.HealthResource(self)
|
@@ -267,6 +269,8 @@ class AsyncChunkr(AsyncAPIClient):
|
|
267
269
|
_strict_response_validation=_strict_response_validation,
|
268
270
|
)
|
269
271
|
|
272
|
+
self._idempotency_header = "Idempotency-Key"
|
273
|
+
|
270
274
|
self.tasks = tasks.AsyncTasksResource(self)
|
271
275
|
self.files = files.AsyncFilesResource(self)
|
272
276
|
self.health = health.AsyncHealthResource(self)
|
chunkr_ai/_constants.py
CHANGED
@@ -5,10 +5,10 @@ import httpx
|
|
5
5
|
RAW_RESPONSE_HEADER = "X-Stainless-Raw-Response"
|
6
6
|
OVERRIDE_CAST_TO_HEADER = "____stainless_override_cast_to"
|
7
7
|
|
8
|
-
# default timeout is
|
9
|
-
DEFAULT_TIMEOUT = httpx.Timeout(timeout=
|
10
|
-
DEFAULT_MAX_RETRIES =
|
8
|
+
# default timeout is 30 seconds
|
9
|
+
DEFAULT_TIMEOUT = httpx.Timeout(timeout=30, connect=5.0)
|
10
|
+
DEFAULT_MAX_RETRIES = 50
|
11
11
|
DEFAULT_CONNECTION_LIMITS = httpx.Limits(max_connections=100, max_keepalive_connections=20)
|
12
12
|
|
13
|
-
INITIAL_RETRY_DELAY = 0
|
14
|
-
MAX_RETRY_DELAY =
|
13
|
+
INITIAL_RETRY_DELAY = 1.0
|
14
|
+
MAX_RETRY_DELAY = 10.0
|
chunkr_ai/_version.py
CHANGED
chunkr_ai/resources/files.py
CHANGED
@@ -59,6 +59,7 @@ class FilesResource(SyncAPIResource):
|
|
59
59
|
extra_query: Query | None = None,
|
60
60
|
extra_body: Body | None = None,
|
61
61
|
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
62
|
+
idempotency_key: str | None = None,
|
62
63
|
) -> File:
|
63
64
|
"""
|
64
65
|
Accepts multipart/form-data with fields:
|
@@ -78,6 +79,8 @@ class FilesResource(SyncAPIResource):
|
|
78
79
|
extra_body: Add additional JSON properties to the request
|
79
80
|
|
80
81
|
timeout: Override the client-level default timeout for this request, in seconds
|
82
|
+
|
83
|
+
idempotency_key: Specify a custom idempotency key for this request
|
81
84
|
"""
|
82
85
|
body = deepcopy_minimal(
|
83
86
|
{
|
@@ -95,7 +98,11 @@ class FilesResource(SyncAPIResource):
|
|
95
98
|
body=maybe_transform(body, file_create_params.FileCreateParams),
|
96
99
|
files=files,
|
97
100
|
options=make_request_options(
|
98
|
-
extra_headers=extra_headers,
|
101
|
+
extra_headers=extra_headers,
|
102
|
+
extra_query=extra_query,
|
103
|
+
extra_body=extra_body,
|
104
|
+
timeout=timeout,
|
105
|
+
idempotency_key=idempotency_key,
|
99
106
|
),
|
100
107
|
cast_to=File,
|
101
108
|
)
|
@@ -170,6 +177,7 @@ class FilesResource(SyncAPIResource):
|
|
170
177
|
extra_query: Query | None = None,
|
171
178
|
extra_body: Body | None = None,
|
172
179
|
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
180
|
+
idempotency_key: str | None = None,
|
173
181
|
) -> Delete:
|
174
182
|
"""Delete file contents and scrub sensitive metadata.
|
175
183
|
|
@@ -184,13 +192,19 @@ class FilesResource(SyncAPIResource):
|
|
184
192
|
extra_body: Add additional JSON properties to the request
|
185
193
|
|
186
194
|
timeout: Override the client-level default timeout for this request, in seconds
|
195
|
+
|
196
|
+
idempotency_key: Specify a custom idempotency key for this request
|
187
197
|
"""
|
188
198
|
if not file_id:
|
189
199
|
raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
|
190
200
|
return self._delete(
|
191
201
|
f"/files/{file_id}",
|
192
202
|
options=make_request_options(
|
193
|
-
extra_headers=extra_headers,
|
203
|
+
extra_headers=extra_headers,
|
204
|
+
extra_query=extra_query,
|
205
|
+
extra_body=extra_body,
|
206
|
+
timeout=timeout,
|
207
|
+
idempotency_key=idempotency_key,
|
194
208
|
),
|
195
209
|
cast_to=Delete,
|
196
210
|
)
|
@@ -353,6 +367,7 @@ class AsyncFilesResource(AsyncAPIResource):
|
|
353
367
|
extra_query: Query | None = None,
|
354
368
|
extra_body: Body | None = None,
|
355
369
|
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
370
|
+
idempotency_key: str | None = None,
|
356
371
|
) -> File:
|
357
372
|
"""
|
358
373
|
Accepts multipart/form-data with fields:
|
@@ -372,6 +387,8 @@ class AsyncFilesResource(AsyncAPIResource):
|
|
372
387
|
extra_body: Add additional JSON properties to the request
|
373
388
|
|
374
389
|
timeout: Override the client-level default timeout for this request, in seconds
|
390
|
+
|
391
|
+
idempotency_key: Specify a custom idempotency key for this request
|
375
392
|
"""
|
376
393
|
body = deepcopy_minimal(
|
377
394
|
{
|
@@ -389,7 +406,11 @@ class AsyncFilesResource(AsyncAPIResource):
|
|
389
406
|
body=await async_maybe_transform(body, file_create_params.FileCreateParams),
|
390
407
|
files=files,
|
391
408
|
options=make_request_options(
|
392
|
-
extra_headers=extra_headers,
|
409
|
+
extra_headers=extra_headers,
|
410
|
+
extra_query=extra_query,
|
411
|
+
extra_body=extra_body,
|
412
|
+
timeout=timeout,
|
413
|
+
idempotency_key=idempotency_key,
|
393
414
|
),
|
394
415
|
cast_to=File,
|
395
416
|
)
|
@@ -464,6 +485,7 @@ class AsyncFilesResource(AsyncAPIResource):
|
|
464
485
|
extra_query: Query | None = None,
|
465
486
|
extra_body: Body | None = None,
|
466
487
|
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
488
|
+
idempotency_key: str | None = None,
|
467
489
|
) -> Delete:
|
468
490
|
"""Delete file contents and scrub sensitive metadata.
|
469
491
|
|
@@ -478,13 +500,19 @@ class AsyncFilesResource(AsyncAPIResource):
|
|
478
500
|
extra_body: Add additional JSON properties to the request
|
479
501
|
|
480
502
|
timeout: Override the client-level default timeout for this request, in seconds
|
503
|
+
|
504
|
+
idempotency_key: Specify a custom idempotency key for this request
|
481
505
|
"""
|
482
506
|
if not file_id:
|
483
507
|
raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
|
484
508
|
return await self._delete(
|
485
509
|
f"/files/{file_id}",
|
486
510
|
options=make_request_options(
|
487
|
-
extra_headers=extra_headers,
|
511
|
+
extra_headers=extra_headers,
|
512
|
+
extra_query=extra_query,
|
513
|
+
extra_body=extra_body,
|
514
|
+
timeout=timeout,
|
515
|
+
idempotency_key=idempotency_key,
|
488
516
|
),
|
489
517
|
cast_to=Delete,
|
490
518
|
)
|
@@ -48,6 +48,9 @@ class ParseResource(SyncAPIResource):
|
|
48
48
|
self,
|
49
49
|
*,
|
50
50
|
file: str,
|
51
|
+
base64_urls: bool | NotGiven = NOT_GIVEN,
|
52
|
+
include_chunks: bool | NotGiven = NOT_GIVEN,
|
53
|
+
wait_for_completion: bool | NotGiven = NOT_GIVEN,
|
51
54
|
chunk_processing: Optional[parse_create_params.ChunkProcessing] | NotGiven = NOT_GIVEN,
|
52
55
|
error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
|
53
56
|
expires_in: Optional[int] | NotGiven = NOT_GIVEN,
|
@@ -63,29 +66,34 @@ class ParseResource(SyncAPIResource):
|
|
63
66
|
extra_query: Query | None = None,
|
64
67
|
extra_body: Body | None = None,
|
65
68
|
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
69
|
+
idempotency_key: str | None = None,
|
66
70
|
) -> Task:
|
67
71
|
"""
|
68
|
-
Queues a document for processing and returns a TaskResponse
|
72
|
+
Queues a document for processing and returns a `TaskResponse` with the assigned
|
73
|
+
`task_id`, initial configuration, file metadata, and timestamps. The initial
|
74
|
+
status is `Starting`.
|
69
75
|
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
- Creation timestamp
|
75
|
-
- Presigned URLs for file access
|
76
|
-
|
77
|
-
The returned task will typically be in a `Starting` or `Processing` state. Use
|
78
|
-
the `GET /tasks/{task_id}` endpoint to poll for completion.
|
76
|
+
If `wait_for_completion=true` is provided, the server waits briefly for
|
77
|
+
completion. If the task completes within that window, a 200 response with the
|
78
|
+
final `TaskResponse` is returned. Otherwise, the server returns a 408 or 409
|
79
|
+
with retry guidance and a body describing how long to wait before retrying.
|
79
80
|
|
80
81
|
Args:
|
81
82
|
file:
|
82
83
|
The file to be uploaded. Supported inputs:
|
83
84
|
|
84
|
-
- `ch://files/{file_id}`:
|
85
|
-
|
85
|
+
- `ch://files/{file_id}`: Reference to an existing file. Upload via the Files
|
86
|
+
API
|
86
87
|
- `http(s)://...`: Remote URL to fetch
|
87
88
|
- `data:*;base64,...` or raw base64 string
|
88
89
|
|
90
|
+
base64_urls: Whether to return base64 encoded URLs. If false, presigned URLs are returned.
|
91
|
+
|
92
|
+
include_chunks: Whether to include chunks in the output response
|
93
|
+
|
94
|
+
wait_for_completion: If true, server holds briefly and may return 200 when done; otherwise returns
|
95
|
+
408/409 with Retry-After headers
|
96
|
+
|
89
97
|
chunk_processing: Controls the setting for the chunking and post-processing of each chunk.
|
90
98
|
|
91
99
|
error_handling:
|
@@ -114,22 +122,29 @@ class ParseResource(SyncAPIResource):
|
|
114
122
|
|
115
123
|
segment_processing: Defines how each segment type is handled when generating the final output.
|
116
124
|
|
117
|
-
Each segment uses one of three strategies. The chosen strategy controls:
|
118
|
-
|
119
|
-
|
120
|
-
|
125
|
+
Each segment uses one of three strategies. The chosen strategy controls:
|
126
|
+
|
127
|
+
- Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
|
128
|
+
- How the content is produced (rule-based vs. LLM).
|
129
|
+
- The output format (`Html` or `Markdown`).
|
121
130
|
|
122
131
|
Optional flags such as image **cropping**, **extended context**, and
|
123
132
|
**descriptions** further refine behaviour.
|
124
133
|
|
125
|
-
**Default strategy per segment**
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
134
|
+
**Default strategy per segment**
|
135
|
+
|
136
|
+
- `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` → **Auto**
|
137
|
+
(Markdown, description off)
|
138
|
+
- `Table` → **LLM** (HTML, description on)
|
139
|
+
- `Picture` → **LLM** (Markdown, description off, cropping _All_)
|
140
|
+
- `Formula`, `Page` → **LLM** (Markdown, description off)
|
141
|
+
- `PageHeader`, `PageFooter` → **Ignore** (removed from output)
|
130
142
|
|
131
|
-
**Strategy reference**
|
132
|
-
|
143
|
+
**Strategy reference**
|
144
|
+
|
145
|
+
- **Auto** – rule-based content generation.
|
146
|
+
- **LLM** – generate content with an LLM.
|
147
|
+
- **Ignore** – exclude the segment entirely.
|
133
148
|
|
134
149
|
segmentation_strategy:
|
135
150
|
Controls the segmentation strategy:
|
@@ -147,6 +162,8 @@ class ParseResource(SyncAPIResource):
|
|
147
162
|
extra_body: Add additional JSON properties to the request
|
148
163
|
|
149
164
|
timeout: Override the client-level default timeout for this request, in seconds
|
165
|
+
|
166
|
+
idempotency_key: Specify a custom idempotency key for this request
|
150
167
|
"""
|
151
168
|
return self._post(
|
152
169
|
"/tasks/parse",
|
@@ -166,7 +183,19 @@ class ParseResource(SyncAPIResource):
|
|
166
183
|
parse_create_params.ParseCreateParams,
|
167
184
|
),
|
168
185
|
options=make_request_options(
|
169
|
-
extra_headers=extra_headers,
|
186
|
+
extra_headers=extra_headers,
|
187
|
+
extra_query=extra_query,
|
188
|
+
extra_body=extra_body,
|
189
|
+
timeout=timeout,
|
190
|
+
idempotency_key=idempotency_key,
|
191
|
+
query=maybe_transform(
|
192
|
+
{
|
193
|
+
"base64_urls": base64_urls,
|
194
|
+
"include_chunks": include_chunks,
|
195
|
+
"wait_for_completion": wait_for_completion,
|
196
|
+
},
|
197
|
+
parse_create_params.ParseCreateParams,
|
198
|
+
),
|
170
199
|
),
|
171
200
|
cast_to=Task,
|
172
201
|
)
|
@@ -175,6 +204,9 @@ class ParseResource(SyncAPIResource):
|
|
175
204
|
self,
|
176
205
|
task_id: str,
|
177
206
|
*,
|
207
|
+
base64_urls: bool | NotGiven = NOT_GIVEN,
|
208
|
+
include_chunks: bool | NotGiven = NOT_GIVEN,
|
209
|
+
wait_for_completion: bool | NotGiven = NOT_GIVEN,
|
178
210
|
chunk_processing: Optional[parse_update_params.ChunkProcessing] | NotGiven = NOT_GIVEN,
|
179
211
|
error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
|
180
212
|
expires_in: Optional[int] | NotGiven = NOT_GIVEN,
|
@@ -190,22 +222,31 @@ class ParseResource(SyncAPIResource):
|
|
190
222
|
extra_query: Query | None = None,
|
191
223
|
extra_body: Body | None = None,
|
192
224
|
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
225
|
+
idempotency_key: str | None = None,
|
193
226
|
) -> Task:
|
194
227
|
"""Updates an existing task's configuration and reprocesses the document.
|
195
228
|
|
196
229
|
The
|
197
|
-
|
198
|
-
update.
|
230
|
+
current configuration is used as the base; only provided fields are changed.
|
199
231
|
|
200
232
|
Requirements:
|
201
233
|
|
202
|
-
- Task must
|
203
|
-
-
|
234
|
+
- Task must be in a terminal state (`Succeeded` or `Failed`).
|
235
|
+
- The new configuration must differ from the current configuration.
|
204
236
|
|
205
|
-
|
206
|
-
the
|
237
|
+
If `wait_for_completion=true` is provided, the server waits briefly for
|
238
|
+
completion. If the task completes within that window, a 200 response with the
|
239
|
+
final `TaskResponse` is returned. Otherwise, the server returns a 408 with retry
|
240
|
+
guidance and a body describing how long to wait before retrying.
|
207
241
|
|
208
242
|
Args:
|
243
|
+
base64_urls: Whether to return base64 encoded URLs. If false, presigned URLs are returned.
|
244
|
+
|
245
|
+
include_chunks: Whether to include chunks in the output response
|
246
|
+
|
247
|
+
wait_for_completion: If true, server holds briefly and may return 200 when done; otherwise returns
|
248
|
+
408/409 with Retry-After headers
|
249
|
+
|
209
250
|
chunk_processing: Controls the setting for the chunking and post-processing of each chunk.
|
210
251
|
|
211
252
|
error_handling:
|
@@ -235,22 +276,29 @@ class ParseResource(SyncAPIResource):
|
|
235
276
|
|
236
277
|
segment_processing: Defines how each segment type is handled when generating the final output.
|
237
278
|
|
238
|
-
Each segment uses one of three strategies. The chosen strategy controls:
|
239
|
-
|
240
|
-
|
241
|
-
|
279
|
+
Each segment uses one of three strategies. The chosen strategy controls:
|
280
|
+
|
281
|
+
- Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
|
282
|
+
- How the content is produced (rule-based vs. LLM).
|
283
|
+
- The output format (`Html` or `Markdown`).
|
242
284
|
|
243
285
|
Optional flags such as image **cropping**, **extended context**, and
|
244
286
|
**descriptions** further refine behaviour.
|
245
287
|
|
246
|
-
**Default strategy per segment**
|
247
|
-
`Caption`, `Footnote` → **Auto** (Markdown, description off) • `Table` → **LLM**
|
248
|
-
(HTML, description on) • `Picture` → **LLM** (Markdown, description off,
|
249
|
-
cropping _All_) • `Formula`, `Page` → **LLM** (Markdown, description off) •
|
250
|
-
`PageHeader`, `PageFooter` → **Ignore** (removed from output)
|
288
|
+
**Default strategy per segment**
|
251
289
|
|
252
|
-
|
253
|
-
|
290
|
+
- `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` → **Auto**
|
291
|
+
(Markdown, description off)
|
292
|
+
- `Table` → **LLM** (HTML, description on)
|
293
|
+
- `Picture` → **LLM** (Markdown, description off, cropping _All_)
|
294
|
+
- `Formula`, `Page` → **LLM** (Markdown, description off)
|
295
|
+
- `PageHeader`, `PageFooter` → **Ignore** (removed from output)
|
296
|
+
|
297
|
+
**Strategy reference**
|
298
|
+
|
299
|
+
- **Auto** – rule-based content generation.
|
300
|
+
- **LLM** – generate content with an LLM.
|
301
|
+
- **Ignore** – exclude the segment entirely.
|
254
302
|
|
255
303
|
segmentation_strategy:
|
256
304
|
Controls the segmentation strategy:
|
@@ -268,6 +316,8 @@ class ParseResource(SyncAPIResource):
|
|
268
316
|
extra_body: Add additional JSON properties to the request
|
269
317
|
|
270
318
|
timeout: Override the client-level default timeout for this request, in seconds
|
319
|
+
|
320
|
+
idempotency_key: Specify a custom idempotency key for this request
|
271
321
|
"""
|
272
322
|
if not task_id:
|
273
323
|
raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}")
|
@@ -288,7 +338,19 @@ class ParseResource(SyncAPIResource):
|
|
288
338
|
parse_update_params.ParseUpdateParams,
|
289
339
|
),
|
290
340
|
options=make_request_options(
|
291
|
-
extra_headers=extra_headers,
|
341
|
+
extra_headers=extra_headers,
|
342
|
+
extra_query=extra_query,
|
343
|
+
extra_body=extra_body,
|
344
|
+
timeout=timeout,
|
345
|
+
idempotency_key=idempotency_key,
|
346
|
+
query=maybe_transform(
|
347
|
+
{
|
348
|
+
"base64_urls": base64_urls,
|
349
|
+
"include_chunks": include_chunks,
|
350
|
+
"wait_for_completion": wait_for_completion,
|
351
|
+
},
|
352
|
+
parse_update_params.ParseUpdateParams,
|
353
|
+
),
|
292
354
|
),
|
293
355
|
cast_to=Task,
|
294
356
|
)
|
@@ -318,6 +380,9 @@ class AsyncParseResource(AsyncAPIResource):
|
|
318
380
|
self,
|
319
381
|
*,
|
320
382
|
file: str,
|
383
|
+
base64_urls: bool | NotGiven = NOT_GIVEN,
|
384
|
+
include_chunks: bool | NotGiven = NOT_GIVEN,
|
385
|
+
wait_for_completion: bool | NotGiven = NOT_GIVEN,
|
321
386
|
chunk_processing: Optional[parse_create_params.ChunkProcessing] | NotGiven = NOT_GIVEN,
|
322
387
|
error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
|
323
388
|
expires_in: Optional[int] | NotGiven = NOT_GIVEN,
|
@@ -333,29 +398,34 @@ class AsyncParseResource(AsyncAPIResource):
|
|
333
398
|
extra_query: Query | None = None,
|
334
399
|
extra_body: Body | None = None,
|
335
400
|
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
401
|
+
idempotency_key: str | None = None,
|
336
402
|
) -> Task:
|
337
403
|
"""
|
338
|
-
Queues a document for processing and returns a TaskResponse
|
404
|
+
Queues a document for processing and returns a `TaskResponse` with the assigned
|
405
|
+
`task_id`, initial configuration, file metadata, and timestamps. The initial
|
406
|
+
status is `Starting`.
|
339
407
|
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
- Creation timestamp
|
345
|
-
- Presigned URLs for file access
|
346
|
-
|
347
|
-
The returned task will typically be in a `Starting` or `Processing` state. Use
|
348
|
-
the `GET /tasks/{task_id}` endpoint to poll for completion.
|
408
|
+
If `wait_for_completion=true` is provided, the server waits briefly for
|
409
|
+
completion. If the task completes within that window, a 200 response with the
|
410
|
+
final `TaskResponse` is returned. Otherwise, the server returns a 408 or 409
|
411
|
+
with retry guidance and a body describing how long to wait before retrying.
|
349
412
|
|
350
413
|
Args:
|
351
414
|
file:
|
352
415
|
The file to be uploaded. Supported inputs:
|
353
416
|
|
354
|
-
- `ch://files/{file_id}`:
|
355
|
-
|
417
|
+
- `ch://files/{file_id}`: Reference to an existing file. Upload via the Files
|
418
|
+
API
|
356
419
|
- `http(s)://...`: Remote URL to fetch
|
357
420
|
- `data:*;base64,...` or raw base64 string
|
358
421
|
|
422
|
+
base64_urls: Whether to return base64 encoded URLs. If false, presigned URLs are returned.
|
423
|
+
|
424
|
+
include_chunks: Whether to include chunks in the output response
|
425
|
+
|
426
|
+
wait_for_completion: If true, server holds briefly and may return 200 when done; otherwise returns
|
427
|
+
408/409 with Retry-After headers
|
428
|
+
|
359
429
|
chunk_processing: Controls the setting for the chunking and post-processing of each chunk.
|
360
430
|
|
361
431
|
error_handling:
|
@@ -384,22 +454,29 @@ class AsyncParseResource(AsyncAPIResource):
|
|
384
454
|
|
385
455
|
segment_processing: Defines how each segment type is handled when generating the final output.
|
386
456
|
|
387
|
-
Each segment uses one of three strategies. The chosen strategy controls:
|
388
|
-
|
389
|
-
|
390
|
-
|
457
|
+
Each segment uses one of three strategies. The chosen strategy controls:
|
458
|
+
|
459
|
+
- Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
|
460
|
+
- How the content is produced (rule-based vs. LLM).
|
461
|
+
- The output format (`Html` or `Markdown`).
|
391
462
|
|
392
463
|
Optional flags such as image **cropping**, **extended context**, and
|
393
464
|
**descriptions** further refine behaviour.
|
394
465
|
|
395
|
-
**Default strategy per segment**
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
466
|
+
**Default strategy per segment**
|
467
|
+
|
468
|
+
- `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` → **Auto**
|
469
|
+
(Markdown, description off)
|
470
|
+
- `Table` → **LLM** (HTML, description on)
|
471
|
+
- `Picture` → **LLM** (Markdown, description off, cropping _All_)
|
472
|
+
- `Formula`, `Page` → **LLM** (Markdown, description off)
|
473
|
+
- `PageHeader`, `PageFooter` → **Ignore** (removed from output)
|
400
474
|
|
401
|
-
**Strategy reference**
|
402
|
-
|
475
|
+
**Strategy reference**
|
476
|
+
|
477
|
+
- **Auto** – rule-based content generation.
|
478
|
+
- **LLM** – generate content with an LLM.
|
479
|
+
- **Ignore** – exclude the segment entirely.
|
403
480
|
|
404
481
|
segmentation_strategy:
|
405
482
|
Controls the segmentation strategy:
|
@@ -417,6 +494,8 @@ class AsyncParseResource(AsyncAPIResource):
|
|
417
494
|
extra_body: Add additional JSON properties to the request
|
418
495
|
|
419
496
|
timeout: Override the client-level default timeout for this request, in seconds
|
497
|
+
|
498
|
+
idempotency_key: Specify a custom idempotency key for this request
|
420
499
|
"""
|
421
500
|
return await self._post(
|
422
501
|
"/tasks/parse",
|
@@ -436,7 +515,19 @@ class AsyncParseResource(AsyncAPIResource):
|
|
436
515
|
parse_create_params.ParseCreateParams,
|
437
516
|
),
|
438
517
|
options=make_request_options(
|
439
|
-
extra_headers=extra_headers,
|
518
|
+
extra_headers=extra_headers,
|
519
|
+
extra_query=extra_query,
|
520
|
+
extra_body=extra_body,
|
521
|
+
timeout=timeout,
|
522
|
+
idempotency_key=idempotency_key,
|
523
|
+
query=await async_maybe_transform(
|
524
|
+
{
|
525
|
+
"base64_urls": base64_urls,
|
526
|
+
"include_chunks": include_chunks,
|
527
|
+
"wait_for_completion": wait_for_completion,
|
528
|
+
},
|
529
|
+
parse_create_params.ParseCreateParams,
|
530
|
+
),
|
440
531
|
),
|
441
532
|
cast_to=Task,
|
442
533
|
)
|
@@ -445,6 +536,9 @@ class AsyncParseResource(AsyncAPIResource):
|
|
445
536
|
self,
|
446
537
|
task_id: str,
|
447
538
|
*,
|
539
|
+
base64_urls: bool | NotGiven = NOT_GIVEN,
|
540
|
+
include_chunks: bool | NotGiven = NOT_GIVEN,
|
541
|
+
wait_for_completion: bool | NotGiven = NOT_GIVEN,
|
448
542
|
chunk_processing: Optional[parse_update_params.ChunkProcessing] | NotGiven = NOT_GIVEN,
|
449
543
|
error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
|
450
544
|
expires_in: Optional[int] | NotGiven = NOT_GIVEN,
|
@@ -460,22 +554,31 @@ class AsyncParseResource(AsyncAPIResource):
|
|
460
554
|
extra_query: Query | None = None,
|
461
555
|
extra_body: Body | None = None,
|
462
556
|
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
557
|
+
idempotency_key: str | None = None,
|
463
558
|
) -> Task:
|
464
559
|
"""Updates an existing task's configuration and reprocesses the document.
|
465
560
|
|
466
561
|
The
|
467
|
-
|
468
|
-
update.
|
562
|
+
current configuration is used as the base; only provided fields are changed.
|
469
563
|
|
470
564
|
Requirements:
|
471
565
|
|
472
|
-
- Task must
|
473
|
-
-
|
566
|
+
- Task must be in a terminal state (`Succeeded` or `Failed`).
|
567
|
+
- The new configuration must differ from the current configuration.
|
474
568
|
|
475
|
-
|
476
|
-
the
|
569
|
+
If `wait_for_completion=true` is provided, the server waits briefly for
|
570
|
+
completion. If the task completes within that window, a 200 response with the
|
571
|
+
final `TaskResponse` is returned. Otherwise, the server returns a 408 with retry
|
572
|
+
guidance and a body describing how long to wait before retrying.
|
477
573
|
|
478
574
|
Args:
|
575
|
+
base64_urls: Whether to return base64 encoded URLs. If false, presigned URLs are returned.
|
576
|
+
|
577
|
+
include_chunks: Whether to include chunks in the output response
|
578
|
+
|
579
|
+
wait_for_completion: If true, server holds briefly and may return 200 when done; otherwise returns
|
580
|
+
408/409 with Retry-After headers
|
581
|
+
|
479
582
|
chunk_processing: Controls the setting for the chunking and post-processing of each chunk.
|
480
583
|
|
481
584
|
error_handling:
|
@@ -505,22 +608,29 @@ class AsyncParseResource(AsyncAPIResource):
|
|
505
608
|
|
506
609
|
segment_processing: Defines how each segment type is handled when generating the final output.
|
507
610
|
|
508
|
-
Each segment uses one of three strategies. The chosen strategy controls:
|
509
|
-
|
510
|
-
|
511
|
-
|
611
|
+
Each segment uses one of three strategies. The chosen strategy controls:
|
612
|
+
|
613
|
+
- Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
|
614
|
+
- How the content is produced (rule-based vs. LLM).
|
615
|
+
- The output format (`Html` or `Markdown`).
|
512
616
|
|
513
617
|
Optional flags such as image **cropping**, **extended context**, and
|
514
618
|
**descriptions** further refine behaviour.
|
515
619
|
|
516
|
-
**Default strategy per segment**
|
517
|
-
`Caption`, `Footnote` → **Auto** (Markdown, description off) • `Table` → **LLM**
|
518
|
-
(HTML, description on) • `Picture` → **LLM** (Markdown, description off,
|
519
|
-
cropping _All_) • `Formula`, `Page` → **LLM** (Markdown, description off) •
|
520
|
-
`PageHeader`, `PageFooter` → **Ignore** (removed from output)
|
620
|
+
**Default strategy per segment**
|
521
621
|
|
522
|
-
|
523
|
-
|
622
|
+
- `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` → **Auto**
|
623
|
+
(Markdown, description off)
|
624
|
+
- `Table` → **LLM** (HTML, description on)
|
625
|
+
- `Picture` → **LLM** (Markdown, description off, cropping _All_)
|
626
|
+
- `Formula`, `Page` → **LLM** (Markdown, description off)
|
627
|
+
- `PageHeader`, `PageFooter` → **Ignore** (removed from output)
|
628
|
+
|
629
|
+
**Strategy reference**
|
630
|
+
|
631
|
+
- **Auto** – rule-based content generation.
|
632
|
+
- **LLM** – generate content with an LLM.
|
633
|
+
- **Ignore** – exclude the segment entirely.
|
524
634
|
|
525
635
|
segmentation_strategy:
|
526
636
|
Controls the segmentation strategy:
|
@@ -538,6 +648,8 @@ class AsyncParseResource(AsyncAPIResource):
|
|
538
648
|
extra_body: Add additional JSON properties to the request
|
539
649
|
|
540
650
|
timeout: Override the client-level default timeout for this request, in seconds
|
651
|
+
|
652
|
+
idempotency_key: Specify a custom idempotency key for this request
|
541
653
|
"""
|
542
654
|
if not task_id:
|
543
655
|
raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}")
|
@@ -558,7 +670,19 @@ class AsyncParseResource(AsyncAPIResource):
|
|
558
670
|
parse_update_params.ParseUpdateParams,
|
559
671
|
),
|
560
672
|
options=make_request_options(
|
561
|
-
extra_headers=extra_headers,
|
673
|
+
extra_headers=extra_headers,
|
674
|
+
extra_query=extra_query,
|
675
|
+
extra_body=extra_body,
|
676
|
+
timeout=timeout,
|
677
|
+
idempotency_key=idempotency_key,
|
678
|
+
query=await async_maybe_transform(
|
679
|
+
{
|
680
|
+
"base64_urls": base64_urls,
|
681
|
+
"include_chunks": include_chunks,
|
682
|
+
"wait_for_completion": wait_for_completion,
|
683
|
+
},
|
684
|
+
parse_update_params.ParseUpdateParams,
|
685
|
+
),
|
562
686
|
),
|
563
687
|
cast_to=Task,
|
564
688
|
)
|
@@ -138,6 +138,7 @@ class TasksResource(SyncAPIResource):
|
|
138
138
|
extra_query: Query | None = None,
|
139
139
|
extra_body: Body | None = None,
|
140
140
|
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
141
|
+
idempotency_key: str | None = None,
|
141
142
|
) -> None:
|
142
143
|
"""
|
143
144
|
Delete a task by its ID.
|
@@ -154,6 +155,8 @@ class TasksResource(SyncAPIResource):
|
|
154
155
|
extra_body: Add additional JSON properties to the request
|
155
156
|
|
156
157
|
timeout: Override the client-level default timeout for this request, in seconds
|
158
|
+
|
159
|
+
idempotency_key: Specify a custom idempotency key for this request
|
157
160
|
"""
|
158
161
|
if not task_id:
|
159
162
|
raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}")
|
@@ -161,7 +164,11 @@ class TasksResource(SyncAPIResource):
|
|
161
164
|
return self._delete(
|
162
165
|
f"/tasks/{task_id}",
|
163
166
|
options=make_request_options(
|
164
|
-
extra_headers=extra_headers,
|
167
|
+
extra_headers=extra_headers,
|
168
|
+
extra_query=extra_query,
|
169
|
+
extra_body=extra_body,
|
170
|
+
timeout=timeout,
|
171
|
+
idempotency_key=idempotency_key,
|
165
172
|
),
|
166
173
|
cast_to=NoneType,
|
167
174
|
)
|
@@ -213,6 +220,7 @@ class TasksResource(SyncAPIResource):
|
|
213
220
|
*,
|
214
221
|
base64_urls: bool | NotGiven = NOT_GIVEN,
|
215
222
|
include_chunks: bool | NotGiven = NOT_GIVEN,
|
223
|
+
wait_for_completion: bool | NotGiven = NOT_GIVEN,
|
216
224
|
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
217
225
|
# The extra values given here take precedence over values defined on the client or passed to this method.
|
218
226
|
extra_headers: Headers | None = None,
|
@@ -221,20 +229,20 @@ class TasksResource(SyncAPIResource):
|
|
221
229
|
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
222
230
|
) -> Task:
|
223
231
|
"""
|
224
|
-
Retrieves
|
232
|
+
Retrieves the current state of a task and, when requested, waits briefly for
|
233
|
+
completion.
|
225
234
|
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
- Presigned URLs for accessing files
|
235
|
+
Returns task details such as processing status, configuration, output (when
|
236
|
+
available), file metadata, and timestamps. If `wait_for_completion=true` is
|
237
|
+
provided, the server will hold the request briefly. If the task does not reach a
|
238
|
+
terminal state during that window, the response will indicate a retry with
|
239
|
+
appropriate headers.
|
232
240
|
|
233
|
-
|
241
|
+
Typical uses:
|
234
242
|
|
235
|
-
|
236
|
-
|
237
|
-
|
243
|
+
- Poll a task during processing
|
244
|
+
- Retrieve the final output once processing is complete
|
245
|
+
- Access task metadata and configuration
|
238
246
|
|
239
247
|
Args:
|
240
248
|
base64_urls: Whether to return base64 encoded URLs. If false, the URLs will be returned as
|
@@ -242,6 +250,8 @@ class TasksResource(SyncAPIResource):
|
|
242
250
|
|
243
251
|
include_chunks: Whether to include chunks in the output response
|
244
252
|
|
253
|
+
wait_for_completion: Whether to wait for the task to complete
|
254
|
+
|
245
255
|
extra_headers: Send extra headers
|
246
256
|
|
247
257
|
extra_query: Add additional query parameters to the request
|
@@ -263,6 +273,7 @@ class TasksResource(SyncAPIResource):
|
|
263
273
|
{
|
264
274
|
"base64_urls": base64_urls,
|
265
275
|
"include_chunks": include_chunks,
|
276
|
+
"wait_for_completion": wait_for_completion,
|
266
277
|
},
|
267
278
|
task_get_params.TaskGetParams,
|
268
279
|
),
|
@@ -375,6 +386,7 @@ class AsyncTasksResource(AsyncAPIResource):
|
|
375
386
|
extra_query: Query | None = None,
|
376
387
|
extra_body: Body | None = None,
|
377
388
|
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
389
|
+
idempotency_key: str | None = None,
|
378
390
|
) -> None:
|
379
391
|
"""
|
380
392
|
Delete a task by its ID.
|
@@ -391,6 +403,8 @@ class AsyncTasksResource(AsyncAPIResource):
|
|
391
403
|
extra_body: Add additional JSON properties to the request
|
392
404
|
|
393
405
|
timeout: Override the client-level default timeout for this request, in seconds
|
406
|
+
|
407
|
+
idempotency_key: Specify a custom idempotency key for this request
|
394
408
|
"""
|
395
409
|
if not task_id:
|
396
410
|
raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}")
|
@@ -398,7 +412,11 @@ class AsyncTasksResource(AsyncAPIResource):
|
|
398
412
|
return await self._delete(
|
399
413
|
f"/tasks/{task_id}",
|
400
414
|
options=make_request_options(
|
401
|
-
extra_headers=extra_headers,
|
415
|
+
extra_headers=extra_headers,
|
416
|
+
extra_query=extra_query,
|
417
|
+
extra_body=extra_body,
|
418
|
+
timeout=timeout,
|
419
|
+
idempotency_key=idempotency_key,
|
402
420
|
),
|
403
421
|
cast_to=NoneType,
|
404
422
|
)
|
@@ -450,6 +468,7 @@ class AsyncTasksResource(AsyncAPIResource):
|
|
450
468
|
*,
|
451
469
|
base64_urls: bool | NotGiven = NOT_GIVEN,
|
452
470
|
include_chunks: bool | NotGiven = NOT_GIVEN,
|
471
|
+
wait_for_completion: bool | NotGiven = NOT_GIVEN,
|
453
472
|
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
|
454
473
|
# The extra values given here take precedence over values defined on the client or passed to this method.
|
455
474
|
extra_headers: Headers | None = None,
|
@@ -458,20 +477,20 @@ class AsyncTasksResource(AsyncAPIResource):
|
|
458
477
|
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
|
459
478
|
) -> Task:
|
460
479
|
"""
|
461
|
-
Retrieves
|
480
|
+
Retrieves the current state of a task and, when requested, waits briefly for
|
481
|
+
completion.
|
462
482
|
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
- Presigned URLs for accessing files
|
483
|
+
Returns task details such as processing status, configuration, output (when
|
484
|
+
available), file metadata, and timestamps. If `wait_for_completion=true` is
|
485
|
+
provided, the server will hold the request briefly. If the task does not reach a
|
486
|
+
terminal state during that window, the response will indicate a retry with
|
487
|
+
appropriate headers.
|
469
488
|
|
470
|
-
|
489
|
+
Typical uses:
|
471
490
|
|
472
|
-
|
473
|
-
|
474
|
-
|
491
|
+
- Poll a task during processing
|
492
|
+
- Retrieve the final output once processing is complete
|
493
|
+
- Access task metadata and configuration
|
475
494
|
|
476
495
|
Args:
|
477
496
|
base64_urls: Whether to return base64 encoded URLs. If false, the URLs will be returned as
|
@@ -479,6 +498,8 @@ class AsyncTasksResource(AsyncAPIResource):
|
|
479
498
|
|
480
499
|
include_chunks: Whether to include chunks in the output response
|
481
500
|
|
501
|
+
wait_for_completion: Whether to wait for the task to complete
|
502
|
+
|
482
503
|
extra_headers: Send extra headers
|
483
504
|
|
484
505
|
extra_query: Add additional query parameters to the request
|
@@ -500,6 +521,7 @@ class AsyncTasksResource(AsyncAPIResource):
|
|
500
521
|
{
|
501
522
|
"base64_urls": base64_urls,
|
502
523
|
"include_chunks": include_chunks,
|
524
|
+
"wait_for_completion": wait_for_completion,
|
503
525
|
},
|
504
526
|
task_get_params.TaskGetParams,
|
505
527
|
),
|
chunkr_ai/types/task.py
CHANGED
@@ -827,22 +827,29 @@ class Configuration(BaseModel):
|
|
827
827
|
segment_processing: ConfigurationSegmentProcessing
|
828
828
|
"""Defines how each segment type is handled when generating the final output.
|
829
829
|
|
830
|
-
Each segment uses one of three strategies. The chosen strategy controls:
|
831
|
-
|
832
|
-
|
833
|
-
|
830
|
+
Each segment uses one of three strategies. The chosen strategy controls:
|
831
|
+
|
832
|
+
- Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
|
833
|
+
- How the content is produced (rule-based vs. LLM).
|
834
|
+
- The output format (`Html` or `Markdown`).
|
834
835
|
|
835
836
|
Optional flags such as image **cropping**, **extended context**, and
|
836
837
|
**descriptions** further refine behaviour.
|
837
838
|
|
838
|
-
**Default strategy per segment**
|
839
|
-
|
840
|
-
|
841
|
-
|
842
|
-
|
839
|
+
**Default strategy per segment**
|
840
|
+
|
841
|
+
- `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` → **Auto**
|
842
|
+
(Markdown, description off)
|
843
|
+
- `Table` → **LLM** (HTML, description on)
|
844
|
+
- `Picture` → **LLM** (Markdown, description off, cropping _All_)
|
845
|
+
- `Formula`, `Page` → **LLM** (Markdown, description off)
|
846
|
+
- `PageHeader`, `PageFooter` → **Ignore** (removed from output)
|
847
|
+
|
848
|
+
**Strategy reference**
|
843
849
|
|
844
|
-
|
845
|
-
generate content with an LLM.
|
850
|
+
- **Auto** – rule-based content generation.
|
851
|
+
- **LLM** – generate content with an LLM.
|
852
|
+
- **Ignore** – exclude the segment entirely.
|
846
853
|
"""
|
847
854
|
|
848
855
|
segmentation_strategy: Literal["LayoutAnalysis", "Page"]
|
@@ -36,12 +36,24 @@ class ParseCreateParams(TypedDict, total=False):
|
|
36
36
|
file: Required[str]
|
37
37
|
"""The file to be uploaded. Supported inputs:
|
38
38
|
|
39
|
-
- `ch://files/{file_id}`:
|
40
|
-
|
39
|
+
- `ch://files/{file_id}`: Reference to an existing file. Upload via the Files
|
40
|
+
API
|
41
41
|
- `http(s)://...`: Remote URL to fetch
|
42
42
|
- `data:*;base64,...` or raw base64 string
|
43
43
|
"""
|
44
44
|
|
45
|
+
base64_urls: bool
|
46
|
+
"""Whether to return base64 encoded URLs. If false, presigned URLs are returned."""
|
47
|
+
|
48
|
+
include_chunks: bool
|
49
|
+
"""Whether to include chunks in the output response"""
|
50
|
+
|
51
|
+
wait_for_completion: bool
|
52
|
+
"""
|
53
|
+
If true, server holds briefly and may return 200 when done; otherwise returns
|
54
|
+
408/409 with Retry-After headers
|
55
|
+
"""
|
56
|
+
|
45
57
|
chunk_processing: Optional[ChunkProcessing]
|
46
58
|
"""Controls the setting for the chunking and post-processing of each chunk."""
|
47
59
|
|
@@ -83,22 +95,29 @@ class ParseCreateParams(TypedDict, total=False):
|
|
83
95
|
segment_processing: Optional[SegmentProcessing]
|
84
96
|
"""Defines how each segment type is handled when generating the final output.
|
85
97
|
|
86
|
-
Each segment uses one of three strategies. The chosen strategy controls:
|
87
|
-
|
88
|
-
|
89
|
-
|
98
|
+
Each segment uses one of three strategies. The chosen strategy controls:
|
99
|
+
|
100
|
+
- Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
|
101
|
+
- How the content is produced (rule-based vs. LLM).
|
102
|
+
- The output format (`Html` or `Markdown`).
|
90
103
|
|
91
104
|
Optional flags such as image **cropping**, **extended context**, and
|
92
105
|
**descriptions** further refine behaviour.
|
93
106
|
|
94
|
-
**Default strategy per segment**
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
107
|
+
**Default strategy per segment**
|
108
|
+
|
109
|
+
- `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` → **Auto**
|
110
|
+
(Markdown, description off)
|
111
|
+
- `Table` → **LLM** (HTML, description on)
|
112
|
+
- `Picture` → **LLM** (Markdown, description off, cropping _All_)
|
113
|
+
- `Formula`, `Page` → **LLM** (Markdown, description off)
|
114
|
+
- `PageHeader`, `PageFooter` → **Ignore** (removed from output)
|
115
|
+
|
116
|
+
**Strategy reference**
|
99
117
|
|
100
|
-
|
101
|
-
generate content with an LLM.
|
118
|
+
- **Auto** – rule-based content generation.
|
119
|
+
- **LLM** – generate content with an LLM.
|
120
|
+
- **Ignore** – exclude the segment entirely.
|
102
121
|
"""
|
103
122
|
|
104
123
|
segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]]
|
@@ -33,6 +33,18 @@ __all__ = [
|
|
33
33
|
|
34
34
|
|
35
35
|
class ParseUpdateParams(TypedDict, total=False):
|
36
|
+
base64_urls: bool
|
37
|
+
"""Whether to return base64 encoded URLs. If false, presigned URLs are returned."""
|
38
|
+
|
39
|
+
include_chunks: bool
|
40
|
+
"""Whether to include chunks in the output response"""
|
41
|
+
|
42
|
+
wait_for_completion: bool
|
43
|
+
"""
|
44
|
+
If true, server holds briefly and may return 200 when done; otherwise returns
|
45
|
+
408/409 with Retry-After headers
|
46
|
+
"""
|
47
|
+
|
36
48
|
chunk_processing: Optional[ChunkProcessing]
|
37
49
|
"""Controls the setting for the chunking and post-processing of each chunk."""
|
38
50
|
|
@@ -77,22 +89,29 @@ class ParseUpdateParams(TypedDict, total=False):
|
|
77
89
|
segment_processing: Optional[SegmentProcessing]
|
78
90
|
"""Defines how each segment type is handled when generating the final output.
|
79
91
|
|
80
|
-
Each segment uses one of three strategies. The chosen strategy controls:
|
81
|
-
|
82
|
-
|
83
|
-
|
92
|
+
Each segment uses one of three strategies. The chosen strategy controls:
|
93
|
+
|
94
|
+
- Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
|
95
|
+
- How the content is produced (rule-based vs. LLM).
|
96
|
+
- The output format (`Html` or `Markdown`).
|
84
97
|
|
85
98
|
Optional flags such as image **cropping**, **extended context**, and
|
86
99
|
**descriptions** further refine behaviour.
|
87
100
|
|
88
|
-
**Default strategy per segment**
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
101
|
+
**Default strategy per segment**
|
102
|
+
|
103
|
+
- `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` → **Auto**
|
104
|
+
(Markdown, description off)
|
105
|
+
- `Table` → **LLM** (HTML, description on)
|
106
|
+
- `Picture` → **LLM** (Markdown, description off, cropping _All_)
|
107
|
+
- `Formula`, `Page` → **LLM** (Markdown, description off)
|
108
|
+
- `PageHeader`, `PageFooter` → **Ignore** (removed from output)
|
109
|
+
|
110
|
+
**Strategy reference**
|
93
111
|
|
94
|
-
|
95
|
-
generate content with an LLM.
|
112
|
+
- **Auto** – rule-based content generation.
|
113
|
+
- **LLM** – generate content with an LLM.
|
114
|
+
- **Ignore** – exclude the segment entirely.
|
96
115
|
"""
|
97
116
|
|
98
117
|
segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]]
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: chunkr-ai
|
3
|
-
Version: 0.1.
|
3
|
+
Version: 0.1.0a5
|
4
4
|
Summary: The official Python library for the chunkr API
|
5
5
|
Project-URL: Homepage, https://github.com/lumina-ai-inc/chunkr-python
|
6
6
|
Project-URL: Repository, https://github.com/lumina-ai-inc/chunkr-python
|
@@ -299,7 +299,7 @@ Error codes are as follows:
|
|
299
299
|
|
300
300
|
### Retries
|
301
301
|
|
302
|
-
Certain errors are automatically retried
|
302
|
+
Certain errors are automatically retried 50 times by default, with a short exponential backoff.
|
303
303
|
Connection errors (for example, due to a network connectivity problem), 408 Request Timeout, 409 Conflict,
|
304
304
|
429 Rate Limit, and >=500 Internal errors are all retried by default.
|
305
305
|
|
@@ -322,7 +322,7 @@ client.with_options(max_retries=5).tasks.parse.create(
|
|
322
322
|
|
323
323
|
### Timeouts
|
324
324
|
|
325
|
-
By default requests time out after
|
325
|
+
By default requests time out after 30 seconds. You can configure this with a `timeout` option,
|
326
326
|
which accepts a float or an [`httpx.Timeout`](https://www.python-httpx.org/advanced/timeouts/#fine-tuning-the-configuration) object:
|
327
327
|
|
328
328
|
```python
|
@@ -330,7 +330,7 @@ from chunkr_ai import Chunkr
|
|
330
330
|
|
331
331
|
# Configure the default for all requests:
|
332
332
|
client = Chunkr(
|
333
|
-
# 20 seconds (default is
|
333
|
+
# 20 seconds (default is 30 seconds)
|
334
334
|
timeout=20.0,
|
335
335
|
)
|
336
336
|
|
@@ -1,8 +1,8 @@
|
|
1
|
-
chunkr_ai/__init__.py,sha256=
|
1
|
+
chunkr_ai/__init__.py,sha256=scS30uHiCpLbaalKTAJSCFSTqnu_b9R5JCkTu2hmbzU,2587
|
2
2
|
chunkr_ai/_base_client.py,sha256=Nv5b_rmVdmmPbF42mlOfymbSC6lxcYsrsvBhKSBDXWQ,67038
|
3
|
-
chunkr_ai/_client.py,sha256=
|
3
|
+
chunkr_ai/_client.py,sha256=yn0QdzDkm0M6Ft2-ItmfJpUxQnJVoWa29tSv_2g3KDQ,15975
|
4
4
|
chunkr_ai/_compat.py,sha256=VWemUKbj6DDkQ-O4baSpHVLJafotzeXmCQGJugfVTIw,6580
|
5
|
-
chunkr_ai/_constants.py,sha256=
|
5
|
+
chunkr_ai/_constants.py,sha256=SZppb_i55UWs0n0_MRbw7s0Hy_TOGQu9q7FVd-fCwgM,466
|
6
6
|
chunkr_ai/_exceptions.py,sha256=ClgXUcwf4qhBTXnK4LzUPQCFdFldRxAlcYdOFFgpTxA,3220
|
7
7
|
chunkr_ai/_files.py,sha256=SUFtic_gwSzbvhLtMdQ7TBem8szrqZE2nZFFMRa0KTw,3619
|
8
8
|
chunkr_ai/_models.py,sha256=KvjsMfb88XZlFUKVoOxr8OyDj47MhoH2OKqWNEbBhk4,30010
|
@@ -11,7 +11,7 @@ chunkr_ai/_resource.py,sha256=f5tiwjxcKdbeMor8idoHtMFTUhqD9yc2xXtq5rqeLLk,1100
|
|
11
11
|
chunkr_ai/_response.py,sha256=xXNpF53hiYARmAW7npKuxQ5UHAEjgAzm7ME_L3eIstY,28800
|
12
12
|
chunkr_ai/_streaming.py,sha256=ZmyrVWk7-AWkLAATR55WgNxnyFzYmaqJt2LthA_PTqQ,10100
|
13
13
|
chunkr_ai/_types.py,sha256=dnzU2Q2tLcuk29QFEcnPC1wp0-4XB4Cpef_3AnRhV5Y,6200
|
14
|
-
chunkr_ai/_version.py,sha256=
|
14
|
+
chunkr_ai/_version.py,sha256=W1WwLVPdlihFXegK9LX1wYOXmi6mf953UnIguIoR8TA,169
|
15
15
|
chunkr_ai/pagination.py,sha256=bT-ErcJ80YlKBV6tWq2s9uqg-wv7o66SKe_AgUAGrKc,3533
|
16
16
|
chunkr_ai/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
17
17
|
chunkr_ai/_utils/__init__.py,sha256=PNZ_QJuzZEgyYXqkO1HVhGkj5IU9bglVUcw7H-Knjzw,2062
|
@@ -25,13 +25,12 @@ chunkr_ai/_utils/_transform.py,sha256=n7kskEWz6o__aoNvhFoGVyDoalNe6mJwp-g7BWkdj8
|
|
25
25
|
chunkr_ai/_utils/_typing.py,sha256=D0DbbNu8GnYQTSICnTSHDGsYXj8TcAKyhejb0XcnjtY,4602
|
26
26
|
chunkr_ai/_utils/_utils.py,sha256=ts4CiiuNpFiGB6YMdkQRh2SZvYvsl7mAF-JWHCcLDf4,12312
|
27
27
|
chunkr_ai/lib/.keep,sha256=wuNrz-5SXo3jJaJOJgz4vFHM41YH_g20F5cRQo0vLes,224
|
28
|
-
chunkr_ai/lib/tasks_poll.py,sha256=3yosl_hH5j6NVNH9mANqneAW0FJSbIV9dMoTcF-OdJU,3341
|
29
28
|
chunkr_ai/resources/__init__.py,sha256=K-axuAEg2pJQl45N5ao1tm8AnRwpQVVNp_b6qSMgB6A,1426
|
30
|
-
chunkr_ai/resources/files.py,sha256=
|
29
|
+
chunkr_ai/resources/files.py,sha256=iX6LbX2PqM6kFKNoLxS_R9OGaVSnnZJ8U0dCUxNBGIM,27184
|
31
30
|
chunkr_ai/resources/health.py,sha256=XTvUtRs5hEK-uccb_40mcIex85eEUo1a171nQUjpSOs,4965
|
32
31
|
chunkr_ai/resources/tasks/__init__.py,sha256=W-sclAx_Kfm7OBGlSs694QzNCMkewtz9LU9KRcb8Ud0,976
|
33
|
-
chunkr_ai/resources/tasks/parse.py,sha256=
|
34
|
-
chunkr_ai/resources/tasks/tasks.py,sha256=
|
32
|
+
chunkr_ai/resources/tasks/parse.py,sha256=NDGtWPtrukPG6lwhLYP2kI3vTq0W2I_c8N9crj9OnJo,33441
|
33
|
+
chunkr_ai/resources/tasks/tasks.py,sha256=XkNulmXZz4N6UXaG-6EdS-WAyncTgwMY3BYbJBqeEGw,22745
|
35
34
|
chunkr_ai/types/__init__.py,sha256=DSRAMgXVRTZM2t8s2yrFU-FHt3FTs_wpZfVILH1zjJ0,728
|
36
35
|
chunkr_ai/types/delete.py,sha256=EU78fjXpc8-fqvgcFTuJ0ejs5u_UjbhOz5frkeUHvxY,225
|
37
36
|
chunkr_ai/types/file.py,sha256=kOxR0g-3A-qOxz2cjuTcq0wFMqPoph9uQuLYQ56zb-c,718
|
@@ -41,13 +40,13 @@ chunkr_ai/types/file_url.py,sha256=L434WnOXkNmt59dJiaAgT1_3pN3BIsxm2q14zHQK6xY,3
|
|
41
40
|
chunkr_ai/types/file_url_params.py,sha256=ZHfKiy_6B25StdDemulavGcsPggNNMKLWf6KN7xfPTY,413
|
42
41
|
chunkr_ai/types/files_list_response.py,sha256=ggSRWhTzZWjcDXxStyCzrYICXXB5TqnL2j-SN9mHH_g,506
|
43
42
|
chunkr_ai/types/health_check_response.py,sha256=6Zn5YYHCQf2RgMjDlf39mtiTPqfaBfC9Vv599U_rKCI,200
|
44
|
-
chunkr_ai/types/task.py,sha256=
|
45
|
-
chunkr_ai/types/task_get_params.py,sha256=
|
43
|
+
chunkr_ai/types/task.py,sha256=L8vE_q0Hej_YuJM_rd_bZOg8kHbithsFx6fOQYpH0cY,46702
|
44
|
+
chunkr_ai/types/task_get_params.py,sha256=yGMHRfkbLzQpRLdF_Dj-8TqcioEhDNWyVbEt50xDAP0,542
|
46
45
|
chunkr_ai/types/task_list_params.py,sha256=fCku42QW6QUsLmZgKJBaxisGvUcmcQ5fa6LgHHRIwiQ,1043
|
47
46
|
chunkr_ai/types/tasks/__init__.py,sha256=VdLEmQvgPoiykSEYaRhkMYVaIueGDkR4P_MjCq9SbQY,267
|
48
|
-
chunkr_ai/types/tasks/parse_create_params.py,sha256=
|
49
|
-
chunkr_ai/types/tasks/parse_update_params.py,sha256=
|
50
|
-
chunkr_ai-0.1.
|
51
|
-
chunkr_ai-0.1.
|
52
|
-
chunkr_ai-0.1.
|
53
|
-
chunkr_ai-0.1.
|
47
|
+
chunkr_ai/types/tasks/parse_create_params.py,sha256=tQLvgfhjdgIDKsEejFPLs-guQ8trDmvTC3BVWWXBaNg,34686
|
48
|
+
chunkr_ai/types/tasks/parse_update_params.py,sha256=QSUqh2Hb1B5KYEJJqlCJ1XfvoGLVKuOsQz9PceeqHjk,34474
|
49
|
+
chunkr_ai-0.1.0a5.dist-info/METADATA,sha256=7-RwQM4pkLESzFisF-3Ofl7jMebFT3eLAXe8Bbl15vU,16446
|
50
|
+
chunkr_ai-0.1.0a5.dist-info/WHEEL,sha256=C2FUgwZgiLbznR-k0b_5k3Ai_1aASOXDss3lzCUsUug,87
|
51
|
+
chunkr_ai-0.1.0a5.dist-info/licenses/LICENSE,sha256=3FDRL-L-DFkrFy8yJpb1Nxhuztm0PB2kawcCgK5utFg,11336
|
52
|
+
chunkr_ai-0.1.0a5.dist-info/RECORD,,
|
chunkr_ai/lib/tasks_poll.py
DELETED
@@ -1,122 +0,0 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
|
3
|
-
"""
|
4
|
-
Custom helpers for task polling.
|
5
|
-
|
6
|
-
This module adds `Task.poll()` and `Task.apoll()` methods at runtime to the
|
7
|
-
generated `Task` model, without modifying generated code directly.
|
8
|
-
|
9
|
-
Usage:
|
10
|
-
task = client.tasks.get(task_id)
|
11
|
-
task = task.poll(client) # blocks until terminal state
|
12
|
-
|
13
|
-
# async
|
14
|
-
task = await async_client.tasks.get(task_id)
|
15
|
-
task = await task.apoll(async_client)
|
16
|
-
"""
|
17
|
-
|
18
|
-
import time
|
19
|
-
import asyncio
|
20
|
-
from typing import Protocol, cast
|
21
|
-
|
22
|
-
from .._types import NOT_GIVEN, NotGiven
|
23
|
-
from .._client import Chunkr, AsyncChunkr
|
24
|
-
from ..types.task import Task as _Task
|
25
|
-
from .._exceptions import ChunkrError
|
26
|
-
|
27
|
-
TERMINAL_STATUSES = {"Succeeded", "Failed", "Cancelled"}
|
28
|
-
|
29
|
-
|
30
|
-
def _task_poll(
|
31
|
-
self: _Task,
|
32
|
-
client: Chunkr,
|
33
|
-
*,
|
34
|
-
interval: float = 0.5,
|
35
|
-
timeout: float = 600.0,
|
36
|
-
include_chunks: bool | NotGiven = NOT_GIVEN,
|
37
|
-
base64_urls: bool | NotGiven = NOT_GIVEN,
|
38
|
-
) -> _Task:
|
39
|
-
"""Poll the task until it reaches a terminal status.
|
40
|
-
|
41
|
-
Args:
|
42
|
-
client: Synchronous Chunkr client instance.
|
43
|
-
interval: Seconds to sleep between polls.
|
44
|
-
timeout: Maximum total seconds to wait before raising an error.
|
45
|
-
include_chunks: Whether to include chunks in the output response for each poll.
|
46
|
-
base64_urls: Whether to return base64 encoded URLs.
|
47
|
-
"""
|
48
|
-
start_time = time.monotonic()
|
49
|
-
current: _Task = self
|
50
|
-
|
51
|
-
class _TasksGetProtocol(Protocol):
|
52
|
-
def get(
|
53
|
-
self,
|
54
|
-
task_id: str,
|
55
|
-
*,
|
56
|
-
base64_urls: bool | NotGiven = NOT_GIVEN,
|
57
|
-
include_chunks: bool | NotGiven = NOT_GIVEN,
|
58
|
-
) -> _Task: ...
|
59
|
-
|
60
|
-
resource = cast(_TasksGetProtocol, client.tasks)
|
61
|
-
|
62
|
-
while current.status not in TERMINAL_STATUSES:
|
63
|
-
if time.monotonic() - start_time > timeout:
|
64
|
-
raise ChunkrError("Task polling timed out.")
|
65
|
-
|
66
|
-
if interval > 0:
|
67
|
-
time.sleep(interval)
|
68
|
-
|
69
|
-
current = resource.get(
|
70
|
-
current.task_id,
|
71
|
-
include_chunks=include_chunks,
|
72
|
-
base64_urls=base64_urls,
|
73
|
-
)
|
74
|
-
|
75
|
-
return current
|
76
|
-
|
77
|
-
|
78
|
-
async def _task_apoll(
|
79
|
-
self: _Task,
|
80
|
-
client: AsyncChunkr,
|
81
|
-
*,
|
82
|
-
interval: float = 0.5,
|
83
|
-
timeout: float = 600.0,
|
84
|
-
include_chunks: bool | NotGiven = NOT_GIVEN,
|
85
|
-
base64_urls: bool | NotGiven = NOT_GIVEN,
|
86
|
-
) -> _Task:
|
87
|
-
"""Async poll the task until it reaches a terminal status."""
|
88
|
-
start_time = time.monotonic()
|
89
|
-
current: _Task = self
|
90
|
-
|
91
|
-
class _AsyncTasksGetProtocol(Protocol):
|
92
|
-
async def get(
|
93
|
-
self,
|
94
|
-
task_id: str,
|
95
|
-
*,
|
96
|
-
base64_urls: bool | NotGiven = NOT_GIVEN,
|
97
|
-
include_chunks: bool | NotGiven = NOT_GIVEN,
|
98
|
-
) -> _Task: ...
|
99
|
-
|
100
|
-
aresource = cast(_AsyncTasksGetProtocol, client.tasks)
|
101
|
-
|
102
|
-
while current.status not in TERMINAL_STATUSES:
|
103
|
-
if time.monotonic() - start_time > timeout:
|
104
|
-
raise ChunkrError("Task polling timed out.")
|
105
|
-
|
106
|
-
if interval > 0:
|
107
|
-
await asyncio.sleep(interval)
|
108
|
-
|
109
|
-
current = await aresource.get(
|
110
|
-
current.task_id,
|
111
|
-
include_chunks=include_chunks,
|
112
|
-
base64_urls=base64_urls,
|
113
|
-
)
|
114
|
-
|
115
|
-
return current
|
116
|
-
|
117
|
-
|
118
|
-
# Attach methods to the generated Task model
|
119
|
-
_Task.poll = _task_poll # type: ignore[attr-defined]
|
120
|
-
_Task.apoll = _task_apoll # type: ignore[attr-defined]
|
121
|
-
|
122
|
-
|
File without changes
|
File without changes
|