chunkr-ai 0.1.0a6__py3-none-any.whl → 0.1.0a8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunkr_ai/__init__.py +2 -0
- chunkr_ai/_base_client.py +3 -3
- chunkr_ai/_client.py +31 -3
- chunkr_ai/_compat.py +48 -48
- chunkr_ai/_constants.py +5 -5
- chunkr_ai/_exceptions.py +4 -0
- chunkr_ai/_models.py +41 -41
- chunkr_ai/_types.py +35 -1
- chunkr_ai/_utils/__init__.py +9 -2
- chunkr_ai/_utils/_compat.py +45 -0
- chunkr_ai/_utils/_datetime_parse.py +136 -0
- chunkr_ai/_utils/_transform.py +11 -1
- chunkr_ai/_utils/_typing.py +6 -1
- chunkr_ai/_utils/_utils.py +0 -1
- chunkr_ai/_version.py +1 -1
- chunkr_ai/resources/__init__.py +14 -0
- chunkr_ai/resources/files.py +3 -3
- chunkr_ai/resources/tasks/__init__.py +14 -0
- chunkr_ai/resources/tasks/extract.py +393 -0
- chunkr_ai/resources/tasks/parse.py +110 -286
- chunkr_ai/resources/tasks/tasks.py +64 -32
- chunkr_ai/resources/webhooks.py +193 -0
- chunkr_ai/types/__init__.py +27 -1
- chunkr_ai/types/bounding_box.py +19 -0
- chunkr_ai/types/cell.py +39 -0
- chunkr_ai/types/cell_style.py +28 -0
- chunkr_ai/types/chunk.py +40 -0
- chunkr_ai/types/chunk_processing.py +40 -0
- chunkr_ai/types/chunk_processing_param.py +42 -0
- chunkr_ai/types/extract_configuration.py +24 -0
- chunkr_ai/types/extract_output_response.py +62 -0
- chunkr_ai/types/file_create_params.py +2 -1
- chunkr_ai/types/file_info.py +21 -0
- chunkr_ai/types/generation_config.py +29 -0
- chunkr_ai/types/generation_config_param.py +29 -0
- chunkr_ai/types/llm_processing.py +36 -0
- chunkr_ai/types/llm_processing_param.py +36 -0
- chunkr_ai/types/ocr_result.py +28 -0
- chunkr_ai/types/page.py +27 -0
- chunkr_ai/types/parse_configuration.py +64 -0
- chunkr_ai/types/parse_configuration_param.py +65 -0
- chunkr_ai/types/parse_output_response.py +29 -0
- chunkr_ai/types/segment.py +109 -0
- chunkr_ai/types/segment_processing.py +228 -0
- chunkr_ai/types/segment_processing_param.py +229 -0
- chunkr_ai/types/task_extract_updated_webhook_event.py +22 -0
- chunkr_ai/types/task_get_params.py +0 -3
- chunkr_ai/types/task_list_params.py +7 -1
- chunkr_ai/types/task_parse_updated_webhook_event.py +22 -0
- chunkr_ai/types/task_response.py +68 -0
- chunkr_ai/types/tasks/__init__.py +7 -1
- chunkr_ai/types/tasks/extract_create_params.py +47 -0
- chunkr_ai/types/tasks/extract_create_response.py +67 -0
- chunkr_ai/types/tasks/extract_get_params.py +18 -0
- chunkr_ai/types/tasks/extract_get_response.py +67 -0
- chunkr_ai/types/tasks/parse_create_params.py +25 -793
- chunkr_ai/types/tasks/parse_create_response.py +55 -0
- chunkr_ai/types/tasks/parse_get_params.py +18 -0
- chunkr_ai/types/tasks/parse_get_response.py +55 -0
- chunkr_ai/types/unwrap_webhook_event.py +11 -0
- chunkr_ai/types/version_info.py +31 -0
- chunkr_ai/types/webhook_url_response.py +9 -0
- {chunkr_ai-0.1.0a6.dist-info → chunkr_ai-0.1.0a8.dist-info}/METADATA +14 -13
- chunkr_ai-0.1.0a8.dist-info/RECORD +88 -0
- chunkr_ai/types/task.py +0 -1225
- chunkr_ai/types/tasks/parse_update_params.py +0 -845
- chunkr_ai-0.1.0a6.dist-info/RECORD +0 -52
- {chunkr_ai-0.1.0a6.dist-info → chunkr_ai-0.1.0a8.dist-info}/WHEEL +0 -0
- {chunkr_ai-0.1.0a6.dist-info → chunkr_ai-0.1.0a8.dist-info}/licenses/LICENSE +0 -0
chunkr_ai/types/task.py
DELETED
@@ -1,1225 +0,0 @@
|
|
1
|
-
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
|
2
|
-
|
3
|
-
from typing import List, Union, Optional
|
4
|
-
from datetime import datetime
|
5
|
-
from typing_extensions import Literal, TypeAlias
|
6
|
-
|
7
|
-
from pydantic import Field as FieldInfo
|
8
|
-
|
9
|
-
from .._models import BaseModel
|
10
|
-
|
11
|
-
__all__ = [
|
12
|
-
"Task",
|
13
|
-
"Configuration",
|
14
|
-
"ConfigurationChunkProcessing",
|
15
|
-
"ConfigurationChunkProcessingTokenizer",
|
16
|
-
"ConfigurationChunkProcessingTokenizerEnum",
|
17
|
-
"ConfigurationChunkProcessingTokenizerString",
|
18
|
-
"ConfigurationLlmProcessing",
|
19
|
-
"ConfigurationLlmProcessingFallbackStrategy",
|
20
|
-
"ConfigurationLlmProcessingFallbackStrategyModel",
|
21
|
-
"ConfigurationSegmentProcessing",
|
22
|
-
"ConfigurationSegmentProcessingCaption",
|
23
|
-
"ConfigurationSegmentProcessingFootnote",
|
24
|
-
"ConfigurationSegmentProcessingFormula",
|
25
|
-
"ConfigurationSegmentProcessingListItem",
|
26
|
-
"ConfigurationSegmentProcessingPage",
|
27
|
-
"ConfigurationSegmentProcessingPageFooter",
|
28
|
-
"ConfigurationSegmentProcessingPageHeader",
|
29
|
-
"ConfigurationSegmentProcessingPicture",
|
30
|
-
"ConfigurationSegmentProcessingSectionHeader",
|
31
|
-
"ConfigurationSegmentProcessingTable",
|
32
|
-
"ConfigurationSegmentProcessingText",
|
33
|
-
"ConfigurationSegmentProcessingTitle",
|
34
|
-
"ConfigurationClientVersion",
|
35
|
-
"ConfigurationClientVersionManualSDK",
|
36
|
-
"ConfigurationClientVersionGeneratedSDK",
|
37
|
-
"Output",
|
38
|
-
"OutputChunk",
|
39
|
-
"OutputChunkSegment",
|
40
|
-
"OutputChunkSegmentBbox",
|
41
|
-
"OutputChunkSegmentOcr",
|
42
|
-
"OutputChunkSegmentOcrBbox",
|
43
|
-
"OutputChunkSegmentSSCell",
|
44
|
-
"OutputChunkSegmentSSCellStyle",
|
45
|
-
"OutputChunkSegmentSSHeaderBbox",
|
46
|
-
"OutputChunkSegmentSSHeaderOcr",
|
47
|
-
"OutputChunkSegmentSSHeaderOcrBbox",
|
48
|
-
"OutputPage",
|
49
|
-
]
|
50
|
-
|
51
|
-
|
52
|
-
class ConfigurationChunkProcessingTokenizerEnum(BaseModel):
|
53
|
-
enum: Literal["Word", "Cl100kBase", "XlmRobertaBase", "BertBaseUncased"] = FieldInfo(alias="Enum")
|
54
|
-
"""Use one of the predefined tokenizer types"""
|
55
|
-
|
56
|
-
|
57
|
-
class ConfigurationChunkProcessingTokenizerString(BaseModel):
|
58
|
-
string: str = FieldInfo(alias="String")
|
59
|
-
"""
|
60
|
-
Use any Hugging Face tokenizer by specifying its model ID Examples:
|
61
|
-
"Qwen/Qwen-tokenizer", "facebook/bart-large"
|
62
|
-
"""
|
63
|
-
|
64
|
-
|
65
|
-
ConfigurationChunkProcessingTokenizer: TypeAlias = Union[
|
66
|
-
ConfigurationChunkProcessingTokenizerEnum, ConfigurationChunkProcessingTokenizerString
|
67
|
-
]
|
68
|
-
|
69
|
-
|
70
|
-
class ConfigurationChunkProcessing(BaseModel):
|
71
|
-
ignore_headers_and_footers: Optional[bool] = None
|
72
|
-
"""DEPRECATED: use `segment_processing.ignore` instead"""
|
73
|
-
|
74
|
-
target_length: Optional[int] = None
|
75
|
-
"""The target number of words in each chunk.
|
76
|
-
|
77
|
-
If 0, each chunk will contain a single segment.
|
78
|
-
"""
|
79
|
-
|
80
|
-
tokenizer: Optional[ConfigurationChunkProcessingTokenizer] = None
|
81
|
-
"""The tokenizer to use for the chunking process."""
|
82
|
-
|
83
|
-
|
84
|
-
class ConfigurationLlmProcessingFallbackStrategyModel(BaseModel):
|
85
|
-
model: str = FieldInfo(alias="Model")
|
86
|
-
"""Use a specific model as fallback"""
|
87
|
-
|
88
|
-
|
89
|
-
ConfigurationLlmProcessingFallbackStrategy: TypeAlias = Union[
|
90
|
-
Literal["None", "Default"], ConfigurationLlmProcessingFallbackStrategyModel
|
91
|
-
]
|
92
|
-
|
93
|
-
|
94
|
-
class ConfigurationLlmProcessing(BaseModel):
|
95
|
-
fallback_strategy: Optional[ConfigurationLlmProcessingFallbackStrategy] = None
|
96
|
-
"""The fallback strategy to use for the LLMs in the task."""
|
97
|
-
|
98
|
-
llm_model_id: Optional[str] = None
|
99
|
-
"""The ID of the model to use for the task.
|
100
|
-
|
101
|
-
If not provided, the default model will be used. Please check the documentation
|
102
|
-
for the model you want to use.
|
103
|
-
"""
|
104
|
-
|
105
|
-
max_completion_tokens: Optional[int] = None
|
106
|
-
"""The maximum number of tokens to generate."""
|
107
|
-
|
108
|
-
temperature: Optional[float] = None
|
109
|
-
"""The temperature to use for the LLM."""
|
110
|
-
|
111
|
-
|
112
|
-
class ConfigurationSegmentProcessingCaption(BaseModel):
|
113
|
-
crop_image: Optional[Literal["All", "Auto"]] = None
|
114
|
-
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
115
|
-
|
116
|
-
- `All` crops all images in the item
|
117
|
-
- `Auto` crops images only if required for post-processing
|
118
|
-
"""
|
119
|
-
|
120
|
-
description: Optional[bool] = None
|
121
|
-
"""Generate LLM descriptions for this segment"""
|
122
|
-
|
123
|
-
embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
|
124
|
-
"""**DEPRECATED**: `embed` field is auto populated"""
|
125
|
-
|
126
|
-
extended_context: Optional[bool] = None
|
127
|
-
"""Use the full page image as context for LLM generation"""
|
128
|
-
|
129
|
-
format: Optional[Literal["Html", "Markdown"]] = None
|
130
|
-
|
131
|
-
html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
132
|
-
"""**DEPRECATED**: Use `format: html` and `strategy` instead."""
|
133
|
-
|
134
|
-
llm: Optional[str] = None
|
135
|
-
"""**DEPRECATED**: use description instead"""
|
136
|
-
|
137
|
-
markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
138
|
-
"""**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
|
139
|
-
|
140
|
-
strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
141
|
-
|
142
|
-
|
143
|
-
class ConfigurationSegmentProcessingFootnote(BaseModel):
|
144
|
-
crop_image: Optional[Literal["All", "Auto"]] = None
|
145
|
-
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
146
|
-
|
147
|
-
- `All` crops all images in the item
|
148
|
-
- `Auto` crops images only if required for post-processing
|
149
|
-
"""
|
150
|
-
|
151
|
-
description: Optional[bool] = None
|
152
|
-
"""Generate LLM descriptions for this segment"""
|
153
|
-
|
154
|
-
embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
|
155
|
-
"""**DEPRECATED**: `embed` field is auto populated"""
|
156
|
-
|
157
|
-
extended_context: Optional[bool] = None
|
158
|
-
"""Use the full page image as context for LLM generation"""
|
159
|
-
|
160
|
-
format: Optional[Literal["Html", "Markdown"]] = None
|
161
|
-
|
162
|
-
html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
163
|
-
"""**DEPRECATED**: Use `format: html` and `strategy` instead."""
|
164
|
-
|
165
|
-
llm: Optional[str] = None
|
166
|
-
"""**DEPRECATED**: use description instead"""
|
167
|
-
|
168
|
-
markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
169
|
-
"""**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
|
170
|
-
|
171
|
-
strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
172
|
-
|
173
|
-
|
174
|
-
class ConfigurationSegmentProcessingFormula(BaseModel):
|
175
|
-
crop_image: Optional[Literal["All", "Auto"]] = None
|
176
|
-
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
177
|
-
|
178
|
-
- `All` crops all images in the item
|
179
|
-
- `Auto` crops images only if required for post-processing
|
180
|
-
"""
|
181
|
-
|
182
|
-
description: Optional[bool] = None
|
183
|
-
"""Generate LLM descriptions for this segment"""
|
184
|
-
|
185
|
-
embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
|
186
|
-
"""**DEPRECATED**: `embed` field is auto populated"""
|
187
|
-
|
188
|
-
extended_context: Optional[bool] = None
|
189
|
-
"""Use the full page image as context for LLM generation"""
|
190
|
-
|
191
|
-
format: Optional[Literal["Html", "Markdown"]] = None
|
192
|
-
|
193
|
-
html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
194
|
-
"""**DEPRECATED**: Use `format: html` and `strategy` instead."""
|
195
|
-
|
196
|
-
llm: Optional[str] = None
|
197
|
-
"""**DEPRECATED**: use description instead"""
|
198
|
-
|
199
|
-
markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
200
|
-
"""**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
|
201
|
-
|
202
|
-
strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
203
|
-
|
204
|
-
|
205
|
-
class ConfigurationSegmentProcessingListItem(BaseModel):
|
206
|
-
crop_image: Optional[Literal["All", "Auto"]] = None
|
207
|
-
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
208
|
-
|
209
|
-
- `All` crops all images in the item
|
210
|
-
- `Auto` crops images only if required for post-processing
|
211
|
-
"""
|
212
|
-
|
213
|
-
description: Optional[bool] = None
|
214
|
-
"""Generate LLM descriptions for this segment"""
|
215
|
-
|
216
|
-
embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
|
217
|
-
"""**DEPRECATED**: `embed` field is auto populated"""
|
218
|
-
|
219
|
-
extended_context: Optional[bool] = None
|
220
|
-
"""Use the full page image as context for LLM generation"""
|
221
|
-
|
222
|
-
format: Optional[Literal["Html", "Markdown"]] = None
|
223
|
-
|
224
|
-
html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
225
|
-
"""**DEPRECATED**: Use `format: html` and `strategy` instead."""
|
226
|
-
|
227
|
-
llm: Optional[str] = None
|
228
|
-
"""**DEPRECATED**: use description instead"""
|
229
|
-
|
230
|
-
markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
231
|
-
"""**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
|
232
|
-
|
233
|
-
strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
234
|
-
|
235
|
-
|
236
|
-
class ConfigurationSegmentProcessingPage(BaseModel):
|
237
|
-
crop_image: Optional[Literal["All", "Auto"]] = None
|
238
|
-
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
239
|
-
|
240
|
-
- `All` crops all images in the item
|
241
|
-
- `Auto` crops images only if required for post-processing
|
242
|
-
"""
|
243
|
-
|
244
|
-
description: Optional[bool] = None
|
245
|
-
"""Generate LLM descriptions for this segment"""
|
246
|
-
|
247
|
-
embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
|
248
|
-
"""**DEPRECATED**: `embed` field is auto populated"""
|
249
|
-
|
250
|
-
extended_context: Optional[bool] = None
|
251
|
-
"""Use the full page image as context for LLM generation"""
|
252
|
-
|
253
|
-
format: Optional[Literal["Html", "Markdown"]] = None
|
254
|
-
|
255
|
-
html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
256
|
-
"""**DEPRECATED**: Use `format: html` and `strategy` instead."""
|
257
|
-
|
258
|
-
llm: Optional[str] = None
|
259
|
-
"""**DEPRECATED**: use description instead"""
|
260
|
-
|
261
|
-
markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
262
|
-
"""**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
|
263
|
-
|
264
|
-
strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
265
|
-
|
266
|
-
|
267
|
-
class ConfigurationSegmentProcessingPageFooter(BaseModel):
|
268
|
-
crop_image: Optional[Literal["All", "Auto"]] = None
|
269
|
-
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
270
|
-
|
271
|
-
- `All` crops all images in the item
|
272
|
-
- `Auto` crops images only if required for post-processing
|
273
|
-
"""
|
274
|
-
|
275
|
-
description: Optional[bool] = None
|
276
|
-
"""Generate LLM descriptions for this segment"""
|
277
|
-
|
278
|
-
embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
|
279
|
-
"""**DEPRECATED**: `embed` field is auto populated"""
|
280
|
-
|
281
|
-
extended_context: Optional[bool] = None
|
282
|
-
"""Use the full page image as context for LLM generation"""
|
283
|
-
|
284
|
-
format: Optional[Literal["Html", "Markdown"]] = None
|
285
|
-
|
286
|
-
html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
287
|
-
"""**DEPRECATED**: Use `format: html` and `strategy` instead."""
|
288
|
-
|
289
|
-
llm: Optional[str] = None
|
290
|
-
"""**DEPRECATED**: use description instead"""
|
291
|
-
|
292
|
-
markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
293
|
-
"""**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
|
294
|
-
|
295
|
-
strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
296
|
-
|
297
|
-
|
298
|
-
class ConfigurationSegmentProcessingPageHeader(BaseModel):
|
299
|
-
crop_image: Optional[Literal["All", "Auto"]] = None
|
300
|
-
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
301
|
-
|
302
|
-
- `All` crops all images in the item
|
303
|
-
- `Auto` crops images only if required for post-processing
|
304
|
-
"""
|
305
|
-
|
306
|
-
description: Optional[bool] = None
|
307
|
-
"""Generate LLM descriptions for this segment"""
|
308
|
-
|
309
|
-
embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
|
310
|
-
"""**DEPRECATED**: `embed` field is auto populated"""
|
311
|
-
|
312
|
-
extended_context: Optional[bool] = None
|
313
|
-
"""Use the full page image as context for LLM generation"""
|
314
|
-
|
315
|
-
format: Optional[Literal["Html", "Markdown"]] = None
|
316
|
-
|
317
|
-
html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
318
|
-
"""**DEPRECATED**: Use `format: html` and `strategy` instead."""
|
319
|
-
|
320
|
-
llm: Optional[str] = None
|
321
|
-
"""**DEPRECATED**: use description instead"""
|
322
|
-
|
323
|
-
markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
324
|
-
"""**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
|
325
|
-
|
326
|
-
strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
327
|
-
|
328
|
-
|
329
|
-
class ConfigurationSegmentProcessingPicture(BaseModel):
|
330
|
-
crop_image: Optional[Literal["All", "Auto"]] = None
|
331
|
-
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
332
|
-
|
333
|
-
- `All` crops all images in the item
|
334
|
-
- `Auto` crops images only if required for post-processing
|
335
|
-
"""
|
336
|
-
|
337
|
-
description: Optional[bool] = None
|
338
|
-
"""Generate LLM descriptions for this segment"""
|
339
|
-
|
340
|
-
embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
|
341
|
-
"""**DEPRECATED**: `embed` field is auto populated"""
|
342
|
-
|
343
|
-
extended_context: Optional[bool] = None
|
344
|
-
"""Use the full page image as context for LLM generation"""
|
345
|
-
|
346
|
-
format: Optional[Literal["Html", "Markdown"]] = None
|
347
|
-
|
348
|
-
html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
349
|
-
"""**DEPRECATED**: Use `format: html` and `strategy` instead."""
|
350
|
-
|
351
|
-
llm: Optional[str] = None
|
352
|
-
"""**DEPRECATED**: use description instead"""
|
353
|
-
|
354
|
-
markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
355
|
-
"""**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
|
356
|
-
|
357
|
-
strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
358
|
-
|
359
|
-
|
360
|
-
class ConfigurationSegmentProcessingSectionHeader(BaseModel):
|
361
|
-
crop_image: Optional[Literal["All", "Auto"]] = None
|
362
|
-
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
363
|
-
|
364
|
-
- `All` crops all images in the item
|
365
|
-
- `Auto` crops images only if required for post-processing
|
366
|
-
"""
|
367
|
-
|
368
|
-
description: Optional[bool] = None
|
369
|
-
"""Generate LLM descriptions for this segment"""
|
370
|
-
|
371
|
-
embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
|
372
|
-
"""**DEPRECATED**: `embed` field is auto populated"""
|
373
|
-
|
374
|
-
extended_context: Optional[bool] = None
|
375
|
-
"""Use the full page image as context for LLM generation"""
|
376
|
-
|
377
|
-
format: Optional[Literal["Html", "Markdown"]] = None
|
378
|
-
|
379
|
-
html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
380
|
-
"""**DEPRECATED**: Use `format: html` and `strategy` instead."""
|
381
|
-
|
382
|
-
llm: Optional[str] = None
|
383
|
-
"""**DEPRECATED**: use description instead"""
|
384
|
-
|
385
|
-
markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
386
|
-
"""**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
|
387
|
-
|
388
|
-
strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
389
|
-
|
390
|
-
|
391
|
-
class ConfigurationSegmentProcessingTable(BaseModel):
|
392
|
-
crop_image: Optional[Literal["All", "Auto"]] = None
|
393
|
-
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
394
|
-
|
395
|
-
- `All` crops all images in the item
|
396
|
-
- `Auto` crops images only if required for post-processing
|
397
|
-
"""
|
398
|
-
|
399
|
-
description: Optional[bool] = None
|
400
|
-
"""Generate LLM descriptions for this segment"""
|
401
|
-
|
402
|
-
embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
|
403
|
-
"""**DEPRECATED**: `embed` field is auto populated"""
|
404
|
-
|
405
|
-
extended_context: Optional[bool] = None
|
406
|
-
"""Use the full page image as context for LLM generation"""
|
407
|
-
|
408
|
-
format: Optional[Literal["Html", "Markdown"]] = None
|
409
|
-
|
410
|
-
html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
411
|
-
"""**DEPRECATED**: Use `format: html` and `strategy` instead."""
|
412
|
-
|
413
|
-
llm: Optional[str] = None
|
414
|
-
"""**DEPRECATED**: use description instead"""
|
415
|
-
|
416
|
-
markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
417
|
-
"""**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
|
418
|
-
|
419
|
-
strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
420
|
-
|
421
|
-
|
422
|
-
class ConfigurationSegmentProcessingText(BaseModel):
|
423
|
-
crop_image: Optional[Literal["All", "Auto"]] = None
|
424
|
-
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
425
|
-
|
426
|
-
- `All` crops all images in the item
|
427
|
-
- `Auto` crops images only if required for post-processing
|
428
|
-
"""
|
429
|
-
|
430
|
-
description: Optional[bool] = None
|
431
|
-
"""Generate LLM descriptions for this segment"""
|
432
|
-
|
433
|
-
embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
|
434
|
-
"""**DEPRECATED**: `embed` field is auto populated"""
|
435
|
-
|
436
|
-
extended_context: Optional[bool] = None
|
437
|
-
"""Use the full page image as context for LLM generation"""
|
438
|
-
|
439
|
-
format: Optional[Literal["Html", "Markdown"]] = None
|
440
|
-
|
441
|
-
html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
442
|
-
"""**DEPRECATED**: Use `format: html` and `strategy` instead."""
|
443
|
-
|
444
|
-
llm: Optional[str] = None
|
445
|
-
"""**DEPRECATED**: use description instead"""
|
446
|
-
|
447
|
-
markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
448
|
-
"""**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
|
449
|
-
|
450
|
-
strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
451
|
-
|
452
|
-
|
453
|
-
class ConfigurationSegmentProcessingTitle(BaseModel):
|
454
|
-
crop_image: Optional[Literal["All", "Auto"]] = None
|
455
|
-
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
456
|
-
|
457
|
-
- `All` crops all images in the item
|
458
|
-
- `Auto` crops images only if required for post-processing
|
459
|
-
"""
|
460
|
-
|
461
|
-
description: Optional[bool] = None
|
462
|
-
"""Generate LLM descriptions for this segment"""
|
463
|
-
|
464
|
-
embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
|
465
|
-
"""**DEPRECATED**: `embed` field is auto populated"""
|
466
|
-
|
467
|
-
extended_context: Optional[bool] = None
|
468
|
-
"""Use the full page image as context for LLM generation"""
|
469
|
-
|
470
|
-
format: Optional[Literal["Html", "Markdown"]] = None
|
471
|
-
|
472
|
-
html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
473
|
-
"""**DEPRECATED**: Use `format: html` and `strategy` instead."""
|
474
|
-
|
475
|
-
llm: Optional[str] = None
|
476
|
-
"""**DEPRECATED**: use description instead"""
|
477
|
-
|
478
|
-
markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
479
|
-
"""**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
|
480
|
-
|
481
|
-
strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
482
|
-
|
483
|
-
|
484
|
-
class ConfigurationSegmentProcessing(BaseModel):
|
485
|
-
caption: Optional[ConfigurationSegmentProcessingCaption] = FieldInfo(alias="Caption", default=None)
|
486
|
-
"""Controls the processing and generation for the segment.
|
487
|
-
|
488
|
-
- `crop_image` controls whether to crop the file's images to the segment's
|
489
|
-
bounding box. The cropped image will be stored in the segment's `image` field.
|
490
|
-
Use `All` to always crop, or `Auto` to only crop when needed for
|
491
|
-
post-processing.
|
492
|
-
- `format` specifies the output format: `Html` or `Markdown`
|
493
|
-
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
494
|
-
- `Auto`: Process content automatically
|
495
|
-
- `LLM`: Use large language models for processing
|
496
|
-
- `Ignore`: Exclude segments from final output
|
497
|
-
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
498
|
-
uses chunkr's own VLM models and is not configurable via LLM processing
|
499
|
-
configuration.
|
500
|
-
- `extended_context` uses the full page image as context for LLM generation.
|
501
|
-
|
502
|
-
**Deprecated fields (for backwards compatibility):**
|
503
|
-
|
504
|
-
- `llm` - **DEPRECATED**: Use `description` instead
|
505
|
-
- `embed_sources` - **DEPRECATED**: Embed field is auto-populated
|
506
|
-
- `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
|
507
|
-
- `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
|
508
|
-
"""
|
509
|
-
|
510
|
-
footnote: Optional[ConfigurationSegmentProcessingFootnote] = FieldInfo(alias="Footnote", default=None)
|
511
|
-
"""Controls the processing and generation for the segment.
|
512
|
-
|
513
|
-
- `crop_image` controls whether to crop the file's images to the segment's
|
514
|
-
bounding box. The cropped image will be stored in the segment's `image` field.
|
515
|
-
Use `All` to always crop, or `Auto` to only crop when needed for
|
516
|
-
post-processing.
|
517
|
-
- `format` specifies the output format: `Html` or `Markdown`
|
518
|
-
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
519
|
-
- `Auto`: Process content automatically
|
520
|
-
- `LLM`: Use large language models for processing
|
521
|
-
- `Ignore`: Exclude segments from final output
|
522
|
-
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
523
|
-
uses chunkr's own VLM models and is not configurable via LLM processing
|
524
|
-
configuration.
|
525
|
-
- `extended_context` uses the full page image as context for LLM generation.
|
526
|
-
|
527
|
-
**Deprecated fields (for backwards compatibility):**
|
528
|
-
|
529
|
-
- `llm` - **DEPRECATED**: Use `description` instead
|
530
|
-
- `embed_sources` - **DEPRECATED**: Embed field is auto-populated
|
531
|
-
- `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
|
532
|
-
- `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
|
533
|
-
"""
|
534
|
-
|
535
|
-
formula: Optional[ConfigurationSegmentProcessingFormula] = FieldInfo(alias="Formula", default=None)
|
536
|
-
"""Controls the processing and generation for the segment.
|
537
|
-
|
538
|
-
- `crop_image` controls whether to crop the file's images to the segment's
|
539
|
-
bounding box. The cropped image will be stored in the segment's `image` field.
|
540
|
-
Use `All` to always crop, or `Auto` to only crop when needed for
|
541
|
-
post-processing.
|
542
|
-
- `format` specifies the output format: `Html` or `Markdown`
|
543
|
-
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
544
|
-
- `Auto`: Process content automatically
|
545
|
-
- `LLM`: Use large language models for processing
|
546
|
-
- `Ignore`: Exclude segments from final output
|
547
|
-
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
548
|
-
uses chunkr's own VLM models and is not configurable via LLM processing
|
549
|
-
configuration.
|
550
|
-
- `extended_context` uses the full page image as context for LLM generation.
|
551
|
-
|
552
|
-
**Deprecated fields (for backwards compatibility):**
|
553
|
-
|
554
|
-
- `llm` - **DEPRECATED**: Use `description` instead
|
555
|
-
- `embed_sources` - **DEPRECATED**: Embed field is auto-populated
|
556
|
-
- `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
|
557
|
-
- `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
|
558
|
-
"""
|
559
|
-
|
560
|
-
list_item: Optional[ConfigurationSegmentProcessingListItem] = FieldInfo(alias="ListItem", default=None)
|
561
|
-
"""Controls the processing and generation for the segment.
|
562
|
-
|
563
|
-
- `crop_image` controls whether to crop the file's images to the segment's
|
564
|
-
bounding box. The cropped image will be stored in the segment's `image` field.
|
565
|
-
Use `All` to always crop, or `Auto` to only crop when needed for
|
566
|
-
post-processing.
|
567
|
-
- `format` specifies the output format: `Html` or `Markdown`
|
568
|
-
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
569
|
-
- `Auto`: Process content automatically
|
570
|
-
- `LLM`: Use large language models for processing
|
571
|
-
- `Ignore`: Exclude segments from final output
|
572
|
-
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
573
|
-
uses chunkr's own VLM models and is not configurable via LLM processing
|
574
|
-
configuration.
|
575
|
-
- `extended_context` uses the full page image as context for LLM generation.
|
576
|
-
|
577
|
-
**Deprecated fields (for backwards compatibility):**
|
578
|
-
|
579
|
-
- `llm` - **DEPRECATED**: Use `description` instead
|
580
|
-
- `embed_sources` - **DEPRECATED**: Embed field is auto-populated
|
581
|
-
- `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
|
582
|
-
- `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
|
583
|
-
"""
|
584
|
-
|
585
|
-
page: Optional[ConfigurationSegmentProcessingPage] = FieldInfo(alias="Page", default=None)
|
586
|
-
"""Controls the processing and generation for the segment.
|
587
|
-
|
588
|
-
- `crop_image` controls whether to crop the file's images to the segment's
|
589
|
-
bounding box. The cropped image will be stored in the segment's `image` field.
|
590
|
-
Use `All` to always crop, or `Auto` to only crop when needed for
|
591
|
-
post-processing.
|
592
|
-
- `format` specifies the output format: `Html` or `Markdown`
|
593
|
-
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
594
|
-
- `Auto`: Process content automatically
|
595
|
-
- `LLM`: Use large language models for processing
|
596
|
-
- `Ignore`: Exclude segments from final output
|
597
|
-
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
598
|
-
uses chunkr's own VLM models and is not configurable via LLM processing
|
599
|
-
configuration.
|
600
|
-
- `extended_context` uses the full page image as context for LLM generation.
|
601
|
-
|
602
|
-
**Deprecated fields (for backwards compatibility):**
|
603
|
-
|
604
|
-
- `llm` - **DEPRECATED**: Use `description` instead
|
605
|
-
- `embed_sources` - **DEPRECATED**: Embed field is auto-populated
|
606
|
-
- `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
|
607
|
-
- `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
|
608
|
-
"""
|
609
|
-
|
610
|
-
page_footer: Optional[ConfigurationSegmentProcessingPageFooter] = FieldInfo(alias="PageFooter", default=None)
|
611
|
-
"""Controls the processing and generation for the segment.
|
612
|
-
|
613
|
-
- `crop_image` controls whether to crop the file's images to the segment's
|
614
|
-
bounding box. The cropped image will be stored in the segment's `image` field.
|
615
|
-
Use `All` to always crop, or `Auto` to only crop when needed for
|
616
|
-
post-processing.
|
617
|
-
- `format` specifies the output format: `Html` or `Markdown`
|
618
|
-
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
619
|
-
- `Auto`: Process content automatically
|
620
|
-
- `LLM`: Use large language models for processing
|
621
|
-
- `Ignore`: Exclude segments from final output
|
622
|
-
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
623
|
-
uses chunkr's own VLM models and is not configurable via LLM processing
|
624
|
-
configuration.
|
625
|
-
- `extended_context` uses the full page image as context for LLM generation.
|
626
|
-
|
627
|
-
**Deprecated fields (for backwards compatibility):**
|
628
|
-
|
629
|
-
- `llm` - **DEPRECATED**: Use `description` instead
|
630
|
-
- `embed_sources` - **DEPRECATED**: Embed field is auto-populated
|
631
|
-
- `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
|
632
|
-
- `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
|
633
|
-
"""
|
634
|
-
|
635
|
-
page_header: Optional[ConfigurationSegmentProcessingPageHeader] = FieldInfo(alias="PageHeader", default=None)
|
636
|
-
"""Controls the processing and generation for the segment.
|
637
|
-
|
638
|
-
- `crop_image` controls whether to crop the file's images to the segment's
|
639
|
-
bounding box. The cropped image will be stored in the segment's `image` field.
|
640
|
-
Use `All` to always crop, or `Auto` to only crop when needed for
|
641
|
-
post-processing.
|
642
|
-
- `format` specifies the output format: `Html` or `Markdown`
|
643
|
-
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
644
|
-
- `Auto`: Process content automatically
|
645
|
-
- `LLM`: Use large language models for processing
|
646
|
-
- `Ignore`: Exclude segments from final output
|
647
|
-
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
648
|
-
uses chunkr's own VLM models and is not configurable via LLM processing
|
649
|
-
configuration.
|
650
|
-
- `extended_context` uses the full page image as context for LLM generation.
|
651
|
-
|
652
|
-
**Deprecated fields (for backwards compatibility):**
|
653
|
-
|
654
|
-
- `llm` - **DEPRECATED**: Use `description` instead
|
655
|
-
- `embed_sources` - **DEPRECATED**: Embed field is auto-populated
|
656
|
-
- `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
|
657
|
-
- `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
|
658
|
-
"""
|
659
|
-
|
660
|
-
picture: Optional[ConfigurationSegmentProcessingPicture] = FieldInfo(alias="Picture", default=None)
|
661
|
-
"""Controls the processing and generation for the segment.
|
662
|
-
|
663
|
-
- `crop_image` controls whether to crop the file's images to the segment's
|
664
|
-
bounding box. The cropped image will be stored in the segment's `image` field.
|
665
|
-
Use `All` to always crop, or `Auto` to only crop when needed for
|
666
|
-
post-processing.
|
667
|
-
- `format` specifies the output format: `Html` or `Markdown`
|
668
|
-
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
669
|
-
- `Auto`: Process content automatically
|
670
|
-
- `LLM`: Use large language models for processing
|
671
|
-
- `Ignore`: Exclude segments from final output
|
672
|
-
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
673
|
-
uses chunkr's own VLM models and is not configurable via LLM processing
|
674
|
-
configuration.
|
675
|
-
- `extended_context` uses the full page image as context for LLM generation.
|
676
|
-
|
677
|
-
**Deprecated fields (for backwards compatibility):**
|
678
|
-
|
679
|
-
- `llm` - **DEPRECATED**: Use `description` instead
|
680
|
-
- `embed_sources` - **DEPRECATED**: Embed field is auto-populated
|
681
|
-
- `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
|
682
|
-
- `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
|
683
|
-
"""
|
684
|
-
|
685
|
-
section_header: Optional[ConfigurationSegmentProcessingSectionHeader] = FieldInfo(
|
686
|
-
alias="SectionHeader", default=None
|
687
|
-
)
|
688
|
-
"""Controls the processing and generation for the segment.
|
689
|
-
|
690
|
-
- `crop_image` controls whether to crop the file's images to the segment's
|
691
|
-
bounding box. The cropped image will be stored in the segment's `image` field.
|
692
|
-
Use `All` to always crop, or `Auto` to only crop when needed for
|
693
|
-
post-processing.
|
694
|
-
- `format` specifies the output format: `Html` or `Markdown`
|
695
|
-
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
696
|
-
- `Auto`: Process content automatically
|
697
|
-
- `LLM`: Use large language models for processing
|
698
|
-
- `Ignore`: Exclude segments from final output
|
699
|
-
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
700
|
-
uses chunkr's own VLM models and is not configurable via LLM processing
|
701
|
-
configuration.
|
702
|
-
- `extended_context` uses the full page image as context for LLM generation.
|
703
|
-
|
704
|
-
**Deprecated fields (for backwards compatibility):**
|
705
|
-
|
706
|
-
- `llm` - **DEPRECATED**: Use `description` instead
|
707
|
-
- `embed_sources` - **DEPRECATED**: Embed field is auto-populated
|
708
|
-
- `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
|
709
|
-
- `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
|
710
|
-
"""
|
711
|
-
|
712
|
-
table: Optional[ConfigurationSegmentProcessingTable] = FieldInfo(alias="Table", default=None)
|
713
|
-
"""Controls the processing and generation for the segment.
|
714
|
-
|
715
|
-
- `crop_image` controls whether to crop the file's images to the segment's
|
716
|
-
bounding box. The cropped image will be stored in the segment's `image` field.
|
717
|
-
Use `All` to always crop, or `Auto` to only crop when needed for
|
718
|
-
post-processing.
|
719
|
-
- `format` specifies the output format: `Html` or `Markdown`
|
720
|
-
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
721
|
-
- `Auto`: Process content automatically
|
722
|
-
- `LLM`: Use large language models for processing
|
723
|
-
- `Ignore`: Exclude segments from final output
|
724
|
-
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
725
|
-
uses chunkr's own VLM models and is not configurable via LLM processing
|
726
|
-
configuration.
|
727
|
-
- `extended_context` uses the full page image as context for LLM generation.
|
728
|
-
|
729
|
-
**Deprecated fields (for backwards compatibility):**
|
730
|
-
|
731
|
-
- `llm` - **DEPRECATED**: Use `description` instead
|
732
|
-
- `embed_sources` - **DEPRECATED**: Embed field is auto-populated
|
733
|
-
- `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
|
734
|
-
- `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
|
735
|
-
"""
|
736
|
-
|
737
|
-
text: Optional[ConfigurationSegmentProcessingText] = FieldInfo(alias="Text", default=None)
|
738
|
-
"""Controls the processing and generation for the segment.
|
739
|
-
|
740
|
-
- `crop_image` controls whether to crop the file's images to the segment's
|
741
|
-
bounding box. The cropped image will be stored in the segment's `image` field.
|
742
|
-
Use `All` to always crop, or `Auto` to only crop when needed for
|
743
|
-
post-processing.
|
744
|
-
- `format` specifies the output format: `Html` or `Markdown`
|
745
|
-
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
746
|
-
- `Auto`: Process content automatically
|
747
|
-
- `LLM`: Use large language models for processing
|
748
|
-
- `Ignore`: Exclude segments from final output
|
749
|
-
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
750
|
-
uses chunkr's own VLM models and is not configurable via LLM processing
|
751
|
-
configuration.
|
752
|
-
- `extended_context` uses the full page image as context for LLM generation.
|
753
|
-
|
754
|
-
**Deprecated fields (for backwards compatibility):**
|
755
|
-
|
756
|
-
- `llm` - **DEPRECATED**: Use `description` instead
|
757
|
-
- `embed_sources` - **DEPRECATED**: Embed field is auto-populated
|
758
|
-
- `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
|
759
|
-
- `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
|
760
|
-
"""
|
761
|
-
|
762
|
-
title: Optional[ConfigurationSegmentProcessingTitle] = FieldInfo(alias="Title", default=None)
|
763
|
-
"""Controls the processing and generation for the segment.
|
764
|
-
|
765
|
-
- `crop_image` controls whether to crop the file's images to the segment's
|
766
|
-
bounding box. The cropped image will be stored in the segment's `image` field.
|
767
|
-
Use `All` to always crop, or `Auto` to only crop when needed for
|
768
|
-
post-processing.
|
769
|
-
- `format` specifies the output format: `Html` or `Markdown`
|
770
|
-
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
771
|
-
- `Auto`: Process content automatically
|
772
|
-
- `LLM`: Use large language models for processing
|
773
|
-
- `Ignore`: Exclude segments from final output
|
774
|
-
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
775
|
-
uses chunkr's own VLM models and is not configurable via LLM processing
|
776
|
-
configuration.
|
777
|
-
- `extended_context` uses the full page image as context for LLM generation.
|
778
|
-
|
779
|
-
**Deprecated fields (for backwards compatibility):**
|
780
|
-
|
781
|
-
- `llm` - **DEPRECATED**: Use `description` instead
|
782
|
-
- `embed_sources` - **DEPRECATED**: Embed field is auto-populated
|
783
|
-
- `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
|
784
|
-
- `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
|
785
|
-
"""
|
786
|
-
|
787
|
-
|
788
|
-
class ConfigurationClientVersionManualSDK(BaseModel):
|
789
|
-
manual_sdk: str = FieldInfo(alias="ManualSdk")
|
790
|
-
"""Current manually-maintained SDK"""
|
791
|
-
|
792
|
-
|
793
|
-
class ConfigurationClientVersionGeneratedSDK(BaseModel):
|
794
|
-
generated_sdk: str = FieldInfo(alias="GeneratedSdk")
|
795
|
-
"""Future auto-generated SDK"""
|
796
|
-
|
797
|
-
|
798
|
-
ConfigurationClientVersion: TypeAlias = Union[
|
799
|
-
Literal["Legacy"], ConfigurationClientVersionManualSDK, ConfigurationClientVersionGeneratedSDK, None
|
800
|
-
]
|
801
|
-
|
802
|
-
|
803
|
-
class Configuration(BaseModel):
|
804
|
-
chunk_processing: ConfigurationChunkProcessing
|
805
|
-
"""Controls the setting for the chunking and post-processing of each chunk."""
|
806
|
-
|
807
|
-
error_handling: Literal["Fail", "Continue"]
|
808
|
-
"""Controls how errors are handled during processing:
|
809
|
-
|
810
|
-
- `Fail`: Stops processing and fails the task when any error occurs
|
811
|
-
- `Continue`: Attempts to continue processing despite non-critical errors (e.g.
|
812
|
-
LLM refusals etc.)
|
813
|
-
"""
|
814
|
-
|
815
|
-
llm_processing: ConfigurationLlmProcessing
|
816
|
-
"""Controls the LLM used for the task."""
|
817
|
-
|
818
|
-
ocr_strategy: Literal["All", "Auto"]
|
819
|
-
"""Controls the Optical Character Recognition (OCR) strategy.
|
820
|
-
|
821
|
-
- `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
|
822
|
-
- `Auto`: Selectively applies OCR only to pages with missing or low-quality
|
823
|
-
text. When text layer is present the bounding boxes from the text layer are
|
824
|
-
used.
|
825
|
-
"""
|
826
|
-
|
827
|
-
segment_processing: ConfigurationSegmentProcessing
|
828
|
-
"""Defines how each segment type is handled when generating the final output.
|
829
|
-
|
830
|
-
Each segment uses one of three strategies. The chosen strategy controls:
|
831
|
-
|
832
|
-
- Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
|
833
|
-
- How the content is produced (rule-based vs. LLM).
|
834
|
-
- The output format (`Html` or `Markdown`).
|
835
|
-
|
836
|
-
Optional flags such as image **cropping**, **extended context**, and
|
837
|
-
**descriptions** further refine behaviour.
|
838
|
-
|
839
|
-
**Default strategy per segment**
|
840
|
-
|
841
|
-
- `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` → **Auto**
|
842
|
-
(Markdown, description off)
|
843
|
-
- `Table` → **LLM** (HTML, description on)
|
844
|
-
- `Picture` → **LLM** (Markdown, description off, cropping _All_)
|
845
|
-
- `Formula`, `Page` → **LLM** (Markdown, description off)
|
846
|
-
- `PageHeader`, `PageFooter` → **Ignore** (removed from output)
|
847
|
-
|
848
|
-
**Strategy reference**
|
849
|
-
|
850
|
-
- **Auto** – rule-based content generation.
|
851
|
-
- **LLM** – generate content with an LLM.
|
852
|
-
- **Ignore** – exclude the segment entirely.
|
853
|
-
"""
|
854
|
-
|
855
|
-
segmentation_strategy: Literal["LayoutAnalysis", "Page"]
|
856
|
-
"""Controls the segmentation strategy:
|
857
|
-
|
858
|
-
- `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
|
859
|
-
`Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
|
860
|
-
segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
|
861
|
-
- `Page`: Treats each page as a single segment. Faster processing, but without
|
862
|
-
layout element detection and only simple chunking.
|
863
|
-
"""
|
864
|
-
|
865
|
-
client_version: Optional[ConfigurationClientVersion] = None
|
866
|
-
"""Client version for backwards compatibility processing"""
|
867
|
-
|
868
|
-
expires_in: Optional[int] = None
|
869
|
-
"""
|
870
|
-
The number of seconds until task is deleted. Expired tasks can **not** be
|
871
|
-
updated, polled or accessed via web interface.
|
872
|
-
"""
|
873
|
-
|
874
|
-
high_resolution: Optional[bool] = None
|
875
|
-
"""Whether to use high-resolution images for cropping and post-processing."""
|
876
|
-
|
877
|
-
input_file_url: Optional[str] = None
|
878
|
-
"""The presigned URL of the input file."""
|
879
|
-
|
880
|
-
pipeline: Optional[Literal["Azure", "Chunkr"]] = None
|
881
|
-
|
882
|
-
target_chunk_length: Optional[int] = None
|
883
|
-
"""The target number of words in each chunk.
|
884
|
-
|
885
|
-
If 0, each chunk will contain a single segment.
|
886
|
-
"""
|
887
|
-
|
888
|
-
|
889
|
-
class OutputChunkSegmentBbox(BaseModel):
|
890
|
-
height: float
|
891
|
-
"""The height of the bounding box."""
|
892
|
-
|
893
|
-
left: float
|
894
|
-
"""The left coordinate of the bounding box."""
|
895
|
-
|
896
|
-
top: float
|
897
|
-
"""The top coordinate of the bounding box."""
|
898
|
-
|
899
|
-
width: float
|
900
|
-
"""The width of the bounding box."""
|
901
|
-
|
902
|
-
|
903
|
-
class OutputChunkSegmentOcrBbox(BaseModel):
|
904
|
-
height: float
|
905
|
-
"""The height of the bounding box."""
|
906
|
-
|
907
|
-
left: float
|
908
|
-
"""The left coordinate of the bounding box."""
|
909
|
-
|
910
|
-
top: float
|
911
|
-
"""The top coordinate of the bounding box."""
|
912
|
-
|
913
|
-
width: float
|
914
|
-
"""The width of the bounding box."""
|
915
|
-
|
916
|
-
|
917
|
-
class OutputChunkSegmentOcr(BaseModel):
|
918
|
-
bbox: OutputChunkSegmentOcrBbox
|
919
|
-
"""Bounding box for an item. It is used for chunks, segments and OCR results."""
|
920
|
-
|
921
|
-
text: str
|
922
|
-
"""The recognized text of the OCR result."""
|
923
|
-
|
924
|
-
confidence: Optional[float] = None
|
925
|
-
"""The confidence score of the recognized text."""
|
926
|
-
|
927
|
-
|
928
|
-
class OutputChunkSegmentSSCellStyle(BaseModel):
|
929
|
-
align: Optional[Literal["Left", "Center", "Right", "Justify"]] = None
|
930
|
-
"""Alignment of the cell content."""
|
931
|
-
|
932
|
-
bg_color: Optional[str] = None
|
933
|
-
"""Background color of the cell (e.g., "#FFFFFF" or "#DAE3F3")."""
|
934
|
-
|
935
|
-
font_face: Optional[str] = None
|
936
|
-
"""Font face/family of the cell (e.g., "Arial", "Daytona")."""
|
937
|
-
|
938
|
-
is_bold: Optional[bool] = None
|
939
|
-
"""Whether the cell content is bold."""
|
940
|
-
|
941
|
-
text_color: Optional[str] = None
|
942
|
-
"""Text color of the cell (e.g., "#000000" or "red")."""
|
943
|
-
|
944
|
-
valign: Optional[Literal["Top", "Middle", "Bottom", "Baseline"]] = None
|
945
|
-
"""Vertical alignment of the cell content."""
|
946
|
-
|
947
|
-
|
948
|
-
class OutputChunkSegmentSSCell(BaseModel):
|
949
|
-
cell_id: str
|
950
|
-
"""The cell ID."""
|
951
|
-
|
952
|
-
range: str
|
953
|
-
"""Range of the cell."""
|
954
|
-
|
955
|
-
text: str
|
956
|
-
"""Text content of the cell."""
|
957
|
-
|
958
|
-
formula: Optional[str] = None
|
959
|
-
"""Formula of the cell."""
|
960
|
-
|
961
|
-
hyperlink: Optional[str] = None
|
962
|
-
"""Hyperlink URL if the cell contains a link (e.g., "https://www.chunkr.ai")."""
|
963
|
-
|
964
|
-
style: Optional[OutputChunkSegmentSSCellStyle] = None
|
965
|
-
"""Styling information for the cell including colors, fonts, and formatting."""
|
966
|
-
|
967
|
-
value: Optional[str] = None
|
968
|
-
"""The computed/evaluated value of the cell.
|
969
|
-
|
970
|
-
This represents the actual result after evaluating any formulas, as opposed to
|
971
|
-
the raw text content. For cells with formulas, this is the calculated result;
|
972
|
-
for cells with static content, this is typically the same as the text field.
|
973
|
-
|
974
|
-
Example: text might show "3.14" (formatted to 2 decimal places) while value
|
975
|
-
could be "3.141592653589793" (full precision).
|
976
|
-
"""
|
977
|
-
|
978
|
-
|
979
|
-
class OutputChunkSegmentSSHeaderBbox(BaseModel):
|
980
|
-
height: float
|
981
|
-
"""The height of the bounding box."""
|
982
|
-
|
983
|
-
left: float
|
984
|
-
"""The left coordinate of the bounding box."""
|
985
|
-
|
986
|
-
top: float
|
987
|
-
"""The top coordinate of the bounding box."""
|
988
|
-
|
989
|
-
width: float
|
990
|
-
"""The width of the bounding box."""
|
991
|
-
|
992
|
-
|
993
|
-
class OutputChunkSegmentSSHeaderOcrBbox(BaseModel):
|
994
|
-
height: float
|
995
|
-
"""The height of the bounding box."""
|
996
|
-
|
997
|
-
left: float
|
998
|
-
"""The left coordinate of the bounding box."""
|
999
|
-
|
1000
|
-
top: float
|
1001
|
-
"""The top coordinate of the bounding box."""
|
1002
|
-
|
1003
|
-
width: float
|
1004
|
-
"""The width of the bounding box."""
|
1005
|
-
|
1006
|
-
|
1007
|
-
class OutputChunkSegmentSSHeaderOcr(BaseModel):
|
1008
|
-
bbox: OutputChunkSegmentSSHeaderOcrBbox
|
1009
|
-
"""Bounding box for an item. It is used for chunks, segments and OCR results."""
|
1010
|
-
|
1011
|
-
text: str
|
1012
|
-
"""The recognized text of the OCR result."""
|
1013
|
-
|
1014
|
-
confidence: Optional[float] = None
|
1015
|
-
"""The confidence score of the recognized text."""
|
1016
|
-
|
1017
|
-
|
1018
|
-
class OutputChunkSegment(BaseModel):
|
1019
|
-
bbox: OutputChunkSegmentBbox
|
1020
|
-
"""Bounding box for an item. It is used for chunks, segments and OCR results."""
|
1021
|
-
|
1022
|
-
page_height: float
|
1023
|
-
"""Height of the page/sheet containing the segment."""
|
1024
|
-
|
1025
|
-
page_number: int
|
1026
|
-
"""Page number/Sheet number of the segment."""
|
1027
|
-
|
1028
|
-
page_width: float
|
1029
|
-
"""Width of the page/sheet containing the segment."""
|
1030
|
-
|
1031
|
-
segment_id: str
|
1032
|
-
"""Unique identifier for the segment."""
|
1033
|
-
|
1034
|
-
segment_type: Literal[
|
1035
|
-
"Caption",
|
1036
|
-
"Footnote",
|
1037
|
-
"Formula",
|
1038
|
-
"ListItem",
|
1039
|
-
"Page",
|
1040
|
-
"PageFooter",
|
1041
|
-
"PageHeader",
|
1042
|
-
"Picture",
|
1043
|
-
"SectionHeader",
|
1044
|
-
"Table",
|
1045
|
-
"Text",
|
1046
|
-
"Title",
|
1047
|
-
]
|
1048
|
-
"""
|
1049
|
-
All the possible types for a segment. Note: Different configurations will
|
1050
|
-
produce different types. Please refer to the documentation for more information.
|
1051
|
-
"""
|
1052
|
-
|
1053
|
-
confidence: Optional[float] = None
|
1054
|
-
"""Confidence score of the layout analysis model"""
|
1055
|
-
|
1056
|
-
content: Optional[str] = None
|
1057
|
-
"""
|
1058
|
-
Content of the segment, will be either HTML or Markdown, depending on format
|
1059
|
-
chosen.
|
1060
|
-
"""
|
1061
|
-
|
1062
|
-
description: Optional[str] = None
|
1063
|
-
"""Description of the segment, generated by the LLM."""
|
1064
|
-
|
1065
|
-
embed: Optional[str] = None
|
1066
|
-
"""Embeddable content of the segment."""
|
1067
|
-
|
1068
|
-
html: Optional[str] = None
|
1069
|
-
"""HTML representation of the segment."""
|
1070
|
-
|
1071
|
-
image: Optional[str] = None
|
1072
|
-
"""Presigned URL to the image of the segment."""
|
1073
|
-
|
1074
|
-
llm: Optional[str] = None
|
1075
|
-
"""LLM representation of the segment."""
|
1076
|
-
|
1077
|
-
markdown: Optional[str] = None
|
1078
|
-
"""Markdown representation of the segment."""
|
1079
|
-
|
1080
|
-
ocr: Optional[List[OutputChunkSegmentOcr]] = None
|
1081
|
-
"""OCR results for the segment."""
|
1082
|
-
|
1083
|
-
segment_length: Optional[int] = None
|
1084
|
-
"""Length of the segment in tokens."""
|
1085
|
-
|
1086
|
-
ss_cells: Optional[List[OutputChunkSegmentSSCell]] = None
|
1087
|
-
"""Cells of the segment. Only used for Spreadsheets."""
|
1088
|
-
|
1089
|
-
ss_header_bbox: Optional[OutputChunkSegmentSSHeaderBbox] = None
|
1090
|
-
"""Bounding box of the header of the segment, if found.
|
1091
|
-
|
1092
|
-
Only used for Spreadsheets.
|
1093
|
-
"""
|
1094
|
-
|
1095
|
-
ss_header_ocr: Optional[List[OutputChunkSegmentSSHeaderOcr]] = None
|
1096
|
-
"""OCR results of the header of the segment, if found. Only used for Spreadsheets."""
|
1097
|
-
|
1098
|
-
ss_header_range: Optional[str] = None
|
1099
|
-
"""
|
1100
|
-
Header range of the segment, if found. The header can have overlap with the
|
1101
|
-
`segment.range` if the table contains the header; if the header is located in a
|
1102
|
-
different sheet, the header range will have no overlap with the `segment.range`.
|
1103
|
-
Only used for Spreadsheets.
|
1104
|
-
"""
|
1105
|
-
|
1106
|
-
ss_header_text: Optional[str] = None
|
1107
|
-
"""Text content of the header of the segment, if found.
|
1108
|
-
|
1109
|
-
Only used for Spreadsheets.
|
1110
|
-
"""
|
1111
|
-
|
1112
|
-
ss_range: Optional[str] = None
|
1113
|
-
"""Range of the segment in Excel notation (e.g., A1:B5).
|
1114
|
-
|
1115
|
-
Only used for Spreadsheets.
|
1116
|
-
"""
|
1117
|
-
|
1118
|
-
ss_sheet_name: Optional[str] = None
|
1119
|
-
"""Name of the sheet containing the segment. Only used for Spreadsheets."""
|
1120
|
-
|
1121
|
-
text: Optional[str] = None
|
1122
|
-
"""Text content of the segment. Calculated from the OCR results."""
|
1123
|
-
|
1124
|
-
|
1125
|
-
class OutputChunk(BaseModel):
|
1126
|
-
chunk_length: int
|
1127
|
-
"""The total number of tokens in the `embed` field of the chunk.
|
1128
|
-
|
1129
|
-
Calculated by the `tokenizer`.
|
1130
|
-
"""
|
1131
|
-
|
1132
|
-
segments: List[OutputChunkSegment]
|
1133
|
-
"""
|
1134
|
-
Collection of document segments that form this chunk. When
|
1135
|
-
`target_chunk_length` > 0, contains the maximum number of segments that fit
|
1136
|
-
within that length (segments remain intact). Otherwise, contains exactly one
|
1137
|
-
segment.
|
1138
|
-
"""
|
1139
|
-
|
1140
|
-
chunk_id: Optional[str] = None
|
1141
|
-
"""The unique identifier for the chunk."""
|
1142
|
-
|
1143
|
-
content: Optional[str] = None
|
1144
|
-
"""The content of the chunk.
|
1145
|
-
|
1146
|
-
This is the text that is generated by combining the `content` field from each
|
1147
|
-
segment. Can be provided as context to the LLM.
|
1148
|
-
"""
|
1149
|
-
|
1150
|
-
embed: Optional[str] = None
|
1151
|
-
"""Suggested text to be embedded for the chunk.
|
1152
|
-
|
1153
|
-
This text is generated by combining the `embed` field from each segment.
|
1154
|
-
"""
|
1155
|
-
|
1156
|
-
|
1157
|
-
class OutputPage(BaseModel):
|
1158
|
-
image: str
|
1159
|
-
"""The presigned URL of the page/sheet image."""
|
1160
|
-
|
1161
|
-
page_height: float
|
1162
|
-
"""The height of the page/sheet."""
|
1163
|
-
|
1164
|
-
page_number: int
|
1165
|
-
"""The number of this page/sheet within the file."""
|
1166
|
-
|
1167
|
-
page_width: float
|
1168
|
-
"""The width of the page/sheet."""
|
1169
|
-
|
1170
|
-
dpi: Optional[float] = None
|
1171
|
-
"""DPI of the page/sheet. All cropped images are scaled to this DPI."""
|
1172
|
-
|
1173
|
-
ss_sheet_name: Optional[str] = None
|
1174
|
-
"""The name of the sheet containing the page. Only used for Spreadsheets."""
|
1175
|
-
|
1176
|
-
|
1177
|
-
class Output(BaseModel):
|
1178
|
-
chunks: List[OutputChunk]
|
1179
|
-
"""Collection of document chunks, where each chunk contains one or more segments"""
|
1180
|
-
|
1181
|
-
file_name: Optional[str] = None
|
1182
|
-
"""The name of the file."""
|
1183
|
-
|
1184
|
-
mime_type: Optional[str] = None
|
1185
|
-
"""The MIME type of the file."""
|
1186
|
-
|
1187
|
-
page_count: Optional[int] = None
|
1188
|
-
"""The number of pages in the file."""
|
1189
|
-
|
1190
|
-
pages: Optional[List[OutputPage]] = None
|
1191
|
-
"""The pages of the file. Includes the image and metadata for each page."""
|
1192
|
-
|
1193
|
-
pdf_url: Optional[str] = None
|
1194
|
-
"""The presigned URL of the PDF file."""
|
1195
|
-
|
1196
|
-
|
1197
|
-
class Task(BaseModel):
|
1198
|
-
configuration: Configuration
|
1199
|
-
|
1200
|
-
created_at: datetime
|
1201
|
-
"""The date and time when the task was created and queued."""
|
1202
|
-
|
1203
|
-
message: str
|
1204
|
-
"""A message describing the task's status or any errors that occurred."""
|
1205
|
-
|
1206
|
-
status: Literal["Starting", "Processing", "Succeeded", "Failed", "Cancelled"]
|
1207
|
-
"""The status of the task."""
|
1208
|
-
|
1209
|
-
task_id: str
|
1210
|
-
"""The unique identifier for the task."""
|
1211
|
-
|
1212
|
-
expires_at: Optional[datetime] = None
|
1213
|
-
"""The date and time when the task will expire."""
|
1214
|
-
|
1215
|
-
finished_at: Optional[datetime] = None
|
1216
|
-
"""The date and time when the task was finished."""
|
1217
|
-
|
1218
|
-
output: Optional[Output] = None
|
1219
|
-
"""The processed results of a document analysis task"""
|
1220
|
-
|
1221
|
-
started_at: Optional[datetime] = None
|
1222
|
-
"""The date and time when the task was started."""
|
1223
|
-
|
1224
|
-
task_url: Optional[str] = None
|
1225
|
-
"""The presigned URL of the task."""
|