chunkr-ai 0.1.0a1__py3-none-any.whl → 0.1.0a3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunkr_ai/_client.py +18 -9
- chunkr_ai/_files.py +1 -1
- chunkr_ai/_version.py +1 -1
- chunkr_ai/pagination.py +61 -1
- chunkr_ai/resources/__init__.py +27 -13
- chunkr_ai/resources/files.py +712 -0
- chunkr_ai/resources/tasks/__init__.py +33 -0
- chunkr_ai/resources/tasks/parse.py +612 -0
- chunkr_ai/resources/tasks/tasks.py +596 -0
- chunkr_ai/types/__init__.py +7 -19
- chunkr_ai/types/delete.py +10 -0
- chunkr_ai/types/file.py +30 -0
- chunkr_ai/types/file_create_params.py +17 -0
- chunkr_ai/types/file_list_params.py +28 -0
- chunkr_ai/types/file_url.py +15 -0
- chunkr_ai/types/file_url_params.py +15 -0
- chunkr_ai/types/files_page_response.py +20 -0
- chunkr_ai/types/task.py +866 -27
- chunkr_ai/types/tasks/__init__.py +6 -0
- chunkr_ai/types/tasks/parse_create_params.py +844 -0
- chunkr_ai/types/tasks/parse_update_params.py +838 -0
- {chunkr_ai-0.1.0a1.dist-info → chunkr_ai-0.1.0a3.dist-info}/METADATA +39 -21
- chunkr_ai-0.1.0a3.dist-info/RECORD +52 -0
- chunkr_ai/resources/task.py +0 -1166
- chunkr_ai/types/auto_generation_config.py +0 -39
- chunkr_ai/types/auto_generation_config_param.py +0 -39
- chunkr_ai/types/bounding_box.py +0 -19
- chunkr_ai/types/chunk_processing.py +0 -40
- chunkr_ai/types/chunk_processing_param.py +0 -42
- chunkr_ai/types/ignore_generation_config.py +0 -39
- chunkr_ai/types/ignore_generation_config_param.py +0 -39
- chunkr_ai/types/llm_generation_config.py +0 -39
- chunkr_ai/types/llm_generation_config_param.py +0 -39
- chunkr_ai/types/llm_processing.py +0 -36
- chunkr_ai/types/llm_processing_param.py +0 -36
- chunkr_ai/types/picture_generation_config.py +0 -39
- chunkr_ai/types/picture_generation_config_param.py +0 -39
- chunkr_ai/types/segment_processing.py +0 -280
- chunkr_ai/types/segment_processing_param.py +0 -281
- chunkr_ai/types/table_generation_config.py +0 -39
- chunkr_ai/types/table_generation_config_param.py +0 -39
- chunkr_ai/types/task_parse_params.py +0 -90
- chunkr_ai/types/task_update_params.py +0 -90
- chunkr_ai-0.1.0a1.dist-info/RECORD +0 -58
- {chunkr_ai-0.1.0a1.dist-info → chunkr_ai-0.1.0a3.dist-info}/WHEEL +0 -0
- {chunkr_ai-0.1.0a1.dist-info → chunkr_ai-0.1.0a3.dist-info}/licenses/LICENSE +0 -0
chunkr_ai/types/task.py
CHANGED
@@ -1,31 +1,807 @@
|
|
1
1
|
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
|
2
2
|
|
3
|
-
from typing import List, Optional
|
3
|
+
from typing import List, Union, Optional
|
4
4
|
from datetime import datetime
|
5
|
-
from typing_extensions import Literal
|
5
|
+
from typing_extensions import Literal, TypeAlias
|
6
|
+
|
7
|
+
from pydantic import Field as FieldInfo
|
6
8
|
|
7
9
|
from .._models import BaseModel
|
8
|
-
from .bounding_box import BoundingBox
|
9
|
-
from .llm_processing import LlmProcessing
|
10
|
-
from .chunk_processing import ChunkProcessing
|
11
|
-
from .segment_processing import SegmentProcessing
|
12
10
|
|
13
11
|
__all__ = [
|
14
12
|
"Task",
|
15
13
|
"Configuration",
|
14
|
+
"ConfigurationChunkProcessing",
|
15
|
+
"ConfigurationChunkProcessingTokenizer",
|
16
|
+
"ConfigurationChunkProcessingTokenizerEnum",
|
17
|
+
"ConfigurationChunkProcessingTokenizerString",
|
18
|
+
"ConfigurationLlmProcessing",
|
19
|
+
"ConfigurationLlmProcessingFallbackStrategy",
|
20
|
+
"ConfigurationLlmProcessingFallbackStrategyModel",
|
21
|
+
"ConfigurationSegmentProcessing",
|
22
|
+
"ConfigurationSegmentProcessingCaption",
|
23
|
+
"ConfigurationSegmentProcessingFootnote",
|
24
|
+
"ConfigurationSegmentProcessingFormula",
|
25
|
+
"ConfigurationSegmentProcessingListItem",
|
26
|
+
"ConfigurationSegmentProcessingPage",
|
27
|
+
"ConfigurationSegmentProcessingPageFooter",
|
28
|
+
"ConfigurationSegmentProcessingPageHeader",
|
29
|
+
"ConfigurationSegmentProcessingPicture",
|
30
|
+
"ConfigurationSegmentProcessingSectionHeader",
|
31
|
+
"ConfigurationSegmentProcessingTable",
|
32
|
+
"ConfigurationSegmentProcessingText",
|
33
|
+
"ConfigurationSegmentProcessingTitle",
|
34
|
+
"ConfigurationClientVersion",
|
35
|
+
"ConfigurationClientVersionManualSDK",
|
36
|
+
"ConfigurationClientVersionGeneratedSDK",
|
16
37
|
"Output",
|
17
38
|
"OutputChunk",
|
18
39
|
"OutputChunkSegment",
|
40
|
+
"OutputChunkSegmentBbox",
|
19
41
|
"OutputChunkSegmentOcr",
|
42
|
+
"OutputChunkSegmentOcrBbox",
|
20
43
|
"OutputChunkSegmentSSCell",
|
21
44
|
"OutputChunkSegmentSSCellStyle",
|
45
|
+
"OutputChunkSegmentSSHeaderBbox",
|
22
46
|
"OutputChunkSegmentSSHeaderOcr",
|
47
|
+
"OutputChunkSegmentSSHeaderOcrBbox",
|
23
48
|
"OutputPage",
|
24
49
|
]
|
25
50
|
|
26
51
|
|
52
|
+
class ConfigurationChunkProcessingTokenizerEnum(BaseModel):
|
53
|
+
enum: Literal["Word", "Cl100kBase", "XlmRobertaBase", "BertBaseUncased"] = FieldInfo(alias="Enum")
|
54
|
+
"""Use one of the predefined tokenizer types"""
|
55
|
+
|
56
|
+
|
57
|
+
class ConfigurationChunkProcessingTokenizerString(BaseModel):
|
58
|
+
string: str = FieldInfo(alias="String")
|
59
|
+
"""
|
60
|
+
Use any Hugging Face tokenizer by specifying its model ID. Examples:
|
61
|
+
"Qwen/Qwen-tokenizer", "facebook/bart-large"
|
62
|
+
"""
|
63
|
+
|
64
|
+
|
65
|
+
ConfigurationChunkProcessingTokenizer: TypeAlias = Union[
|
66
|
+
ConfigurationChunkProcessingTokenizerEnum, ConfigurationChunkProcessingTokenizerString
|
67
|
+
]
|
68
|
+
|
69
|
+
|
70
|
+
class ConfigurationChunkProcessing(BaseModel):
|
71
|
+
ignore_headers_and_footers: Optional[bool] = None
|
72
|
+
"""DEPRECATED: use `segment_processing.ignore` instead"""
|
73
|
+
|
74
|
+
target_length: Optional[int] = None
|
75
|
+
"""The target number of words in each chunk.
|
76
|
+
|
77
|
+
If 0, each chunk will contain a single segment.
|
78
|
+
"""
|
79
|
+
|
80
|
+
tokenizer: Optional[ConfigurationChunkProcessingTokenizer] = None
|
81
|
+
"""The tokenizer to use for the chunking process."""
|
82
|
+
|
83
|
+
|
84
|
+
class ConfigurationLlmProcessingFallbackStrategyModel(BaseModel):
|
85
|
+
model: str = FieldInfo(alias="Model")
|
86
|
+
"""Use a specific model as fallback"""
|
87
|
+
|
88
|
+
|
89
|
+
ConfigurationLlmProcessingFallbackStrategy: TypeAlias = Union[
|
90
|
+
Literal["None", "Default"], ConfigurationLlmProcessingFallbackStrategyModel
|
91
|
+
]
|
92
|
+
|
93
|
+
|
94
|
+
class ConfigurationLlmProcessing(BaseModel):
|
95
|
+
fallback_strategy: Optional[ConfigurationLlmProcessingFallbackStrategy] = None
|
96
|
+
"""The fallback strategy to use for the LLMs in the task."""
|
97
|
+
|
98
|
+
llm_model_id: Optional[str] = None
|
99
|
+
"""The ID of the model to use for the task.
|
100
|
+
|
101
|
+
If not provided, the default model will be used. Please check the documentation
|
102
|
+
for the model you want to use.
|
103
|
+
"""
|
104
|
+
|
105
|
+
max_completion_tokens: Optional[int] = None
|
106
|
+
"""The maximum number of tokens to generate."""
|
107
|
+
|
108
|
+
temperature: Optional[float] = None
|
109
|
+
"""The temperature to use for the LLM."""
|
110
|
+
|
111
|
+
|
112
|
+
class ConfigurationSegmentProcessingCaption(BaseModel):
|
113
|
+
crop_image: Optional[Literal["All", "Auto"]] = None
|
114
|
+
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
115
|
+
|
116
|
+
- `All` crops all images in the item
|
117
|
+
- `Auto` crops images only if required for post-processing
|
118
|
+
"""
|
119
|
+
|
120
|
+
description: Optional[bool] = None
|
121
|
+
"""Generate LLM descriptions for this segment"""
|
122
|
+
|
123
|
+
embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
|
124
|
+
"""**DEPRECATED**: `embed` field is auto populated"""
|
125
|
+
|
126
|
+
extended_context: Optional[bool] = None
|
127
|
+
"""Use the full page image as context for LLM generation"""
|
128
|
+
|
129
|
+
format: Optional[Literal["Html", "Markdown"]] = None
|
130
|
+
|
131
|
+
html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
132
|
+
"""**DEPRECATED**: Use `format: Html` and `strategy` instead."""
|
133
|
+
|
134
|
+
llm: Optional[str] = None
|
135
|
+
"""**DEPRECATED**: use description instead"""
|
136
|
+
|
137
|
+
markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
138
|
+
"""**DEPRECATED**: Use `format: Markdown` and `strategy` instead."""
|
139
|
+
|
140
|
+
strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
141
|
+
|
142
|
+
|
143
|
+
class ConfigurationSegmentProcessingFootnote(BaseModel):
|
144
|
+
crop_image: Optional[Literal["All", "Auto"]] = None
|
145
|
+
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
146
|
+
|
147
|
+
- `All` crops all images in the item
|
148
|
+
- `Auto` crops images only if required for post-processing
|
149
|
+
"""
|
150
|
+
|
151
|
+
description: Optional[bool] = None
|
152
|
+
"""Generate LLM descriptions for this segment"""
|
153
|
+
|
154
|
+
embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
|
155
|
+
"""**DEPRECATED**: `embed` field is auto populated"""
|
156
|
+
|
157
|
+
extended_context: Optional[bool] = None
|
158
|
+
"""Use the full page image as context for LLM generation"""
|
159
|
+
|
160
|
+
format: Optional[Literal["Html", "Markdown"]] = None
|
161
|
+
|
162
|
+
html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
163
|
+
"""**DEPRECATED**: Use `format: Html` and `strategy` instead."""
|
164
|
+
|
165
|
+
llm: Optional[str] = None
|
166
|
+
"""**DEPRECATED**: use description instead"""
|
167
|
+
|
168
|
+
markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
169
|
+
"""**DEPRECATED**: Use `format: Markdown` and `strategy` instead."""
|
170
|
+
|
171
|
+
strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
172
|
+
|
173
|
+
|
174
|
+
class ConfigurationSegmentProcessingFormula(BaseModel):
|
175
|
+
crop_image: Optional[Literal["All", "Auto"]] = None
|
176
|
+
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
177
|
+
|
178
|
+
- `All` crops all images in the item
|
179
|
+
- `Auto` crops images only if required for post-processing
|
180
|
+
"""
|
181
|
+
|
182
|
+
description: Optional[bool] = None
|
183
|
+
"""Generate LLM descriptions for this segment"""
|
184
|
+
|
185
|
+
embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
|
186
|
+
"""**DEPRECATED**: `embed` field is auto populated"""
|
187
|
+
|
188
|
+
extended_context: Optional[bool] = None
|
189
|
+
"""Use the full page image as context for LLM generation"""
|
190
|
+
|
191
|
+
format: Optional[Literal["Html", "Markdown"]] = None
|
192
|
+
|
193
|
+
html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
194
|
+
"""**DEPRECATED**: Use `format: Html` and `strategy` instead."""
|
195
|
+
|
196
|
+
llm: Optional[str] = None
|
197
|
+
"""**DEPRECATED**: use description instead"""
|
198
|
+
|
199
|
+
markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
200
|
+
"""**DEPRECATED**: Use `format: Markdown` and `strategy` instead."""
|
201
|
+
|
202
|
+
strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
203
|
+
|
204
|
+
|
205
|
+
class ConfigurationSegmentProcessingListItem(BaseModel):
|
206
|
+
crop_image: Optional[Literal["All", "Auto"]] = None
|
207
|
+
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
208
|
+
|
209
|
+
- `All` crops all images in the item
|
210
|
+
- `Auto` crops images only if required for post-processing
|
211
|
+
"""
|
212
|
+
|
213
|
+
description: Optional[bool] = None
|
214
|
+
"""Generate LLM descriptions for this segment"""
|
215
|
+
|
216
|
+
embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
|
217
|
+
"""**DEPRECATED**: `embed` field is auto populated"""
|
218
|
+
|
219
|
+
extended_context: Optional[bool] = None
|
220
|
+
"""Use the full page image as context for LLM generation"""
|
221
|
+
|
222
|
+
format: Optional[Literal["Html", "Markdown"]] = None
|
223
|
+
|
224
|
+
html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
225
|
+
"""**DEPRECATED**: Use `format: Html` and `strategy` instead."""
|
226
|
+
|
227
|
+
llm: Optional[str] = None
|
228
|
+
"""**DEPRECATED**: use description instead"""
|
229
|
+
|
230
|
+
markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
231
|
+
"""**DEPRECATED**: Use `format: Markdown` and `strategy` instead."""
|
232
|
+
|
233
|
+
strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
234
|
+
|
235
|
+
|
236
|
+
class ConfigurationSegmentProcessingPage(BaseModel):
|
237
|
+
crop_image: Optional[Literal["All", "Auto"]] = None
|
238
|
+
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
239
|
+
|
240
|
+
- `All` crops all images in the item
|
241
|
+
- `Auto` crops images only if required for post-processing
|
242
|
+
"""
|
243
|
+
|
244
|
+
description: Optional[bool] = None
|
245
|
+
"""Generate LLM descriptions for this segment"""
|
246
|
+
|
247
|
+
embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
|
248
|
+
"""**DEPRECATED**: `embed` field is auto populated"""
|
249
|
+
|
250
|
+
extended_context: Optional[bool] = None
|
251
|
+
"""Use the full page image as context for LLM generation"""
|
252
|
+
|
253
|
+
format: Optional[Literal["Html", "Markdown"]] = None
|
254
|
+
|
255
|
+
html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
256
|
+
"""**DEPRECATED**: Use `format: Html` and `strategy` instead."""
|
257
|
+
|
258
|
+
llm: Optional[str] = None
|
259
|
+
"""**DEPRECATED**: use description instead"""
|
260
|
+
|
261
|
+
markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
262
|
+
"""**DEPRECATED**: Use `format: Markdown` and `strategy` instead."""
|
263
|
+
|
264
|
+
strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
265
|
+
|
266
|
+
|
267
|
+
class ConfigurationSegmentProcessingPageFooter(BaseModel):
|
268
|
+
crop_image: Optional[Literal["All", "Auto"]] = None
|
269
|
+
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
270
|
+
|
271
|
+
- `All` crops all images in the item
|
272
|
+
- `Auto` crops images only if required for post-processing
|
273
|
+
"""
|
274
|
+
|
275
|
+
description: Optional[bool] = None
|
276
|
+
"""Generate LLM descriptions for this segment"""
|
277
|
+
|
278
|
+
embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
|
279
|
+
"""**DEPRECATED**: `embed` field is auto populated"""
|
280
|
+
|
281
|
+
extended_context: Optional[bool] = None
|
282
|
+
"""Use the full page image as context for LLM generation"""
|
283
|
+
|
284
|
+
format: Optional[Literal["Html", "Markdown"]] = None
|
285
|
+
|
286
|
+
html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
287
|
+
"""**DEPRECATED**: Use `format: Html` and `strategy` instead."""
|
288
|
+
|
289
|
+
llm: Optional[str] = None
|
290
|
+
"""**DEPRECATED**: use description instead"""
|
291
|
+
|
292
|
+
markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
293
|
+
"""**DEPRECATED**: Use `format: Markdown` and `strategy` instead."""
|
294
|
+
|
295
|
+
strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
296
|
+
|
297
|
+
|
298
|
+
class ConfigurationSegmentProcessingPageHeader(BaseModel):
|
299
|
+
crop_image: Optional[Literal["All", "Auto"]] = None
|
300
|
+
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
301
|
+
|
302
|
+
- `All` crops all images in the item
|
303
|
+
- `Auto` crops images only if required for post-processing
|
304
|
+
"""
|
305
|
+
|
306
|
+
description: Optional[bool] = None
|
307
|
+
"""Generate LLM descriptions for this segment"""
|
308
|
+
|
309
|
+
embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
|
310
|
+
"""**DEPRECATED**: `embed` field is auto populated"""
|
311
|
+
|
312
|
+
extended_context: Optional[bool] = None
|
313
|
+
"""Use the full page image as context for LLM generation"""
|
314
|
+
|
315
|
+
format: Optional[Literal["Html", "Markdown"]] = None
|
316
|
+
|
317
|
+
html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
318
|
+
"""**DEPRECATED**: Use `format: Html` and `strategy` instead."""
|
319
|
+
|
320
|
+
llm: Optional[str] = None
|
321
|
+
"""**DEPRECATED**: use description instead"""
|
322
|
+
|
323
|
+
markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
324
|
+
"""**DEPRECATED**: Use `format: Markdown` and `strategy` instead."""
|
325
|
+
|
326
|
+
strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
327
|
+
|
328
|
+
|
329
|
+
class ConfigurationSegmentProcessingPicture(BaseModel):
|
330
|
+
crop_image: Optional[Literal["All", "Auto"]] = None
|
331
|
+
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
332
|
+
|
333
|
+
- `All` crops all images in the item
|
334
|
+
- `Auto` crops images only if required for post-processing
|
335
|
+
"""
|
336
|
+
|
337
|
+
description: Optional[bool] = None
|
338
|
+
"""Generate LLM descriptions for this segment"""
|
339
|
+
|
340
|
+
embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
|
341
|
+
"""**DEPRECATED**: `embed` field is auto populated"""
|
342
|
+
|
343
|
+
extended_context: Optional[bool] = None
|
344
|
+
"""Use the full page image as context for LLM generation"""
|
345
|
+
|
346
|
+
format: Optional[Literal["Html", "Markdown"]] = None
|
347
|
+
|
348
|
+
html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
349
|
+
"""**DEPRECATED**: Use `format: Html` and `strategy` instead."""
|
350
|
+
|
351
|
+
llm: Optional[str] = None
|
352
|
+
"""**DEPRECATED**: use description instead"""
|
353
|
+
|
354
|
+
markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
355
|
+
"""**DEPRECATED**: Use `format: Markdown` and `strategy` instead."""
|
356
|
+
|
357
|
+
strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
358
|
+
|
359
|
+
|
360
|
+
class ConfigurationSegmentProcessingSectionHeader(BaseModel):
|
361
|
+
crop_image: Optional[Literal["All", "Auto"]] = None
|
362
|
+
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
363
|
+
|
364
|
+
- `All` crops all images in the item
|
365
|
+
- `Auto` crops images only if required for post-processing
|
366
|
+
"""
|
367
|
+
|
368
|
+
description: Optional[bool] = None
|
369
|
+
"""Generate LLM descriptions for this segment"""
|
370
|
+
|
371
|
+
embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
|
372
|
+
"""**DEPRECATED**: `embed` field is auto populated"""
|
373
|
+
|
374
|
+
extended_context: Optional[bool] = None
|
375
|
+
"""Use the full page image as context for LLM generation"""
|
376
|
+
|
377
|
+
format: Optional[Literal["Html", "Markdown"]] = None
|
378
|
+
|
379
|
+
html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
380
|
+
"""**DEPRECATED**: Use `format: Html` and `strategy` instead."""
|
381
|
+
|
382
|
+
llm: Optional[str] = None
|
383
|
+
"""**DEPRECATED**: use description instead"""
|
384
|
+
|
385
|
+
markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
386
|
+
"""**DEPRECATED**: Use `format: Markdown` and `strategy` instead."""
|
387
|
+
|
388
|
+
strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
389
|
+
|
390
|
+
|
391
|
+
class ConfigurationSegmentProcessingTable(BaseModel):
|
392
|
+
crop_image: Optional[Literal["All", "Auto"]] = None
|
393
|
+
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
394
|
+
|
395
|
+
- `All` crops all images in the item
|
396
|
+
- `Auto` crops images only if required for post-processing
|
397
|
+
"""
|
398
|
+
|
399
|
+
description: Optional[bool] = None
|
400
|
+
"""Generate LLM descriptions for this segment"""
|
401
|
+
|
402
|
+
embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
|
403
|
+
"""**DEPRECATED**: `embed` field is auto populated"""
|
404
|
+
|
405
|
+
extended_context: Optional[bool] = None
|
406
|
+
"""Use the full page image as context for LLM generation"""
|
407
|
+
|
408
|
+
format: Optional[Literal["Html", "Markdown"]] = None
|
409
|
+
|
410
|
+
html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
411
|
+
"""**DEPRECATED**: Use `format: Html` and `strategy` instead."""
|
412
|
+
|
413
|
+
llm: Optional[str] = None
|
414
|
+
"""**DEPRECATED**: use description instead"""
|
415
|
+
|
416
|
+
markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
417
|
+
"""**DEPRECATED**: Use `format: Markdown` and `strategy` instead."""
|
418
|
+
|
419
|
+
strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
420
|
+
|
421
|
+
|
422
|
+
class ConfigurationSegmentProcessingText(BaseModel):
|
423
|
+
crop_image: Optional[Literal["All", "Auto"]] = None
|
424
|
+
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
425
|
+
|
426
|
+
- `All` crops all images in the item
|
427
|
+
- `Auto` crops images only if required for post-processing
|
428
|
+
"""
|
429
|
+
|
430
|
+
description: Optional[bool] = None
|
431
|
+
"""Generate LLM descriptions for this segment"""
|
432
|
+
|
433
|
+
embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
|
434
|
+
"""**DEPRECATED**: `embed` field is auto populated"""
|
435
|
+
|
436
|
+
extended_context: Optional[bool] = None
|
437
|
+
"""Use the full page image as context for LLM generation"""
|
438
|
+
|
439
|
+
format: Optional[Literal["Html", "Markdown"]] = None
|
440
|
+
|
441
|
+
html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
442
|
+
"""**DEPRECATED**: Use `format: Html` and `strategy` instead."""
|
443
|
+
|
444
|
+
llm: Optional[str] = None
|
445
|
+
"""**DEPRECATED**: use description instead"""
|
446
|
+
|
447
|
+
markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
448
|
+
"""**DEPRECATED**: Use `format: Markdown` and `strategy` instead."""
|
449
|
+
|
450
|
+
strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
451
|
+
|
452
|
+
|
453
|
+
class ConfigurationSegmentProcessingTitle(BaseModel):
|
454
|
+
crop_image: Optional[Literal["All", "Auto"]] = None
|
455
|
+
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
456
|
+
|
457
|
+
- `All` crops all images in the item
|
458
|
+
- `Auto` crops images only if required for post-processing
|
459
|
+
"""
|
460
|
+
|
461
|
+
description: Optional[bool] = None
|
462
|
+
"""Generate LLM descriptions for this segment"""
|
463
|
+
|
464
|
+
embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
|
465
|
+
"""**DEPRECATED**: `embed` field is auto populated"""
|
466
|
+
|
467
|
+
extended_context: Optional[bool] = None
|
468
|
+
"""Use the full page image as context for LLM generation"""
|
469
|
+
|
470
|
+
format: Optional[Literal["Html", "Markdown"]] = None
|
471
|
+
|
472
|
+
html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
473
|
+
"""**DEPRECATED**: Use `format: Html` and `strategy` instead."""
|
474
|
+
|
475
|
+
llm: Optional[str] = None
|
476
|
+
"""**DEPRECATED**: use description instead"""
|
477
|
+
|
478
|
+
markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
479
|
+
"""**DEPRECATED**: Use `format: Markdown` and `strategy` instead."""
|
480
|
+
|
481
|
+
strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
482
|
+
|
483
|
+
|
484
|
+
class ConfigurationSegmentProcessing(BaseModel):
|
485
|
+
caption: Optional[ConfigurationSegmentProcessingCaption] = FieldInfo(alias="Caption", default=None)
|
486
|
+
"""Controls the processing and generation for the segment.
|
487
|
+
|
488
|
+
- `crop_image` controls whether to crop the file's images to the segment's
|
489
|
+
bounding box. The cropped image will be stored in the segment's `image` field.
|
490
|
+
Use `All` to always crop, or `Auto` to only crop when needed for
|
491
|
+
post-processing.
|
492
|
+
- `format` specifies the output format: `Html` or `Markdown`
|
493
|
+
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
494
|
+
- `Auto`: Process content automatically
|
495
|
+
- `LLM`: Use large language models for processing
|
496
|
+
- `Ignore`: Exclude segments from final output
|
497
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
498
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
499
|
+
configuration.
|
500
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
501
|
+
|
502
|
+
**Deprecated fields (for backwards compatibility):**
|
503
|
+
|
504
|
+
- `llm` - **DEPRECATED**: Use `description` instead
|
505
|
+
- `embed_sources` - **DEPRECATED**: Embed field is auto-populated
|
506
|
+
- `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
|
507
|
+
- `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
|
508
|
+
"""
|
509
|
+
|
510
|
+
footnote: Optional[ConfigurationSegmentProcessingFootnote] = FieldInfo(alias="Footnote", default=None)
|
511
|
+
"""Controls the processing and generation for the segment.
|
512
|
+
|
513
|
+
- `crop_image` controls whether to crop the file's images to the segment's
|
514
|
+
bounding box. The cropped image will be stored in the segment's `image` field.
|
515
|
+
Use `All` to always crop, or `Auto` to only crop when needed for
|
516
|
+
post-processing.
|
517
|
+
- `format` specifies the output format: `Html` or `Markdown`
|
518
|
+
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
519
|
+
- `Auto`: Process content automatically
|
520
|
+
- `LLM`: Use large language models for processing
|
521
|
+
- `Ignore`: Exclude segments from final output
|
522
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
523
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
524
|
+
configuration.
|
525
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
526
|
+
|
527
|
+
**Deprecated fields (for backwards compatibility):**
|
528
|
+
|
529
|
+
- `llm` - **DEPRECATED**: Use `description` instead
|
530
|
+
- `embed_sources` - **DEPRECATED**: Embed field is auto-populated
|
531
|
+
- `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
|
532
|
+
- `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
|
533
|
+
"""
|
534
|
+
|
535
|
+
formula: Optional[ConfigurationSegmentProcessingFormula] = FieldInfo(alias="Formula", default=None)
|
536
|
+
"""Controls the processing and generation for the segment.
|
537
|
+
|
538
|
+
- `crop_image` controls whether to crop the file's images to the segment's
|
539
|
+
bounding box. The cropped image will be stored in the segment's `image` field.
|
540
|
+
Use `All` to always crop, or `Auto` to only crop when needed for
|
541
|
+
post-processing.
|
542
|
+
- `format` specifies the output format: `Html` or `Markdown`
|
543
|
+
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
544
|
+
- `Auto`: Process content automatically
|
545
|
+
- `LLM`: Use large language models for processing
|
546
|
+
- `Ignore`: Exclude segments from final output
|
547
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
548
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
549
|
+
configuration.
|
550
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
551
|
+
|
552
|
+
**Deprecated fields (for backwards compatibility):**
|
553
|
+
|
554
|
+
- `llm` - **DEPRECATED**: Use `description` instead
|
555
|
+
- `embed_sources` - **DEPRECATED**: Embed field is auto-populated
|
556
|
+
- `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
|
557
|
+
- `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
|
558
|
+
"""
|
559
|
+
|
560
|
+
list_item: Optional[ConfigurationSegmentProcessingListItem] = FieldInfo(alias="ListItem", default=None)
|
561
|
+
"""Controls the processing and generation for the segment.
|
562
|
+
|
563
|
+
- `crop_image` controls whether to crop the file's images to the segment's
|
564
|
+
bounding box. The cropped image will be stored in the segment's `image` field.
|
565
|
+
Use `All` to always crop, or `Auto` to only crop when needed for
|
566
|
+
post-processing.
|
567
|
+
- `format` specifies the output format: `Html` or `Markdown`
|
568
|
+
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
569
|
+
- `Auto`: Process content automatically
|
570
|
+
- `LLM`: Use large language models for processing
|
571
|
+
- `Ignore`: Exclude segments from final output
|
572
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
573
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
574
|
+
configuration.
|
575
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
576
|
+
|
577
|
+
**Deprecated fields (for backwards compatibility):**
|
578
|
+
|
579
|
+
- `llm` - **DEPRECATED**: Use `description` instead
|
580
|
+
- `embed_sources` - **DEPRECATED**: Embed field is auto-populated
|
581
|
+
- `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
|
582
|
+
- `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
|
583
|
+
"""
|
584
|
+
|
585
|
+
page: Optional[ConfigurationSegmentProcessingPage] = FieldInfo(alias="Page", default=None)
|
586
|
+
"""Controls the processing and generation for the segment.
|
587
|
+
|
588
|
+
- `crop_image` controls whether to crop the file's images to the segment's
|
589
|
+
bounding box. The cropped image will be stored in the segment's `image` field.
|
590
|
+
Use `All` to always crop, or `Auto` to only crop when needed for
|
591
|
+
post-processing.
|
592
|
+
- `format` specifies the output format: `Html` or `Markdown`
|
593
|
+
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
594
|
+
- `Auto`: Process content automatically
|
595
|
+
- `LLM`: Use large language models for processing
|
596
|
+
- `Ignore`: Exclude segments from final output
|
597
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
598
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
599
|
+
configuration.
|
600
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
601
|
+
|
602
|
+
**Deprecated fields (for backwards compatibility):**
|
603
|
+
|
604
|
+
- `llm` - **DEPRECATED**: Use `description` instead
|
605
|
+
- `embed_sources` - **DEPRECATED**: Embed field is auto-populated
|
606
|
+
- `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
|
607
|
+
- `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
|
608
|
+
"""
|
609
|
+
|
610
|
+
page_footer: Optional[ConfigurationSegmentProcessingPageFooter] = FieldInfo(alias="PageFooter", default=None)
|
611
|
+
"""Controls the processing and generation for the segment.
|
612
|
+
|
613
|
+
- `crop_image` controls whether to crop the file's images to the segment's
|
614
|
+
bounding box. The cropped image will be stored in the segment's `image` field.
|
615
|
+
Use `All` to always crop, or `Auto` to only crop when needed for
|
616
|
+
post-processing.
|
617
|
+
- `format` specifies the output format: `Html` or `Markdown`
|
618
|
+
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
619
|
+
- `Auto`: Process content automatically
|
620
|
+
- `LLM`: Use large language models for processing
|
621
|
+
- `Ignore`: Exclude segments from final output
|
622
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
623
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
624
|
+
configuration.
|
625
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
626
|
+
|
627
|
+
**Deprecated fields (for backwards compatibility):**
|
628
|
+
|
629
|
+
- `llm` - **DEPRECATED**: Use `description` instead
|
630
|
+
- `embed_sources` - **DEPRECATED**: Embed field is auto-populated
|
631
|
+
- `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
|
632
|
+
- `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
|
633
|
+
"""
|
634
|
+
|
635
|
+
page_header: Optional[ConfigurationSegmentProcessingPageHeader] = FieldInfo(alias="PageHeader", default=None)
|
636
|
+
"""Controls the processing and generation for the segment.
|
637
|
+
|
638
|
+
- `crop_image` controls whether to crop the file's images to the segment's
|
639
|
+
bounding box. The cropped image will be stored in the segment's `image` field.
|
640
|
+
Use `All` to always crop, or `Auto` to only crop when needed for
|
641
|
+
post-processing.
|
642
|
+
- `format` specifies the output format: `Html` or `Markdown`
|
643
|
+
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
644
|
+
- `Auto`: Process content automatically
|
645
|
+
- `LLM`: Use large language models for processing
|
646
|
+
- `Ignore`: Exclude segments from final output
|
647
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
648
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
649
|
+
configuration.
|
650
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
651
|
+
|
652
|
+
**Deprecated fields (for backwards compatibility):**
|
653
|
+
|
654
|
+
- `llm` - **DEPRECATED**: Use `description` instead
|
655
|
+
- `embed_sources` - **DEPRECATED**: Embed field is auto-populated
|
656
|
+
- `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
|
657
|
+
- `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
|
658
|
+
"""
|
659
|
+
|
660
|
+
picture: Optional[ConfigurationSegmentProcessingPicture] = FieldInfo(alias="Picture", default=None)
|
661
|
+
"""Controls the processing and generation for the segment.
|
662
|
+
|
663
|
+
- `crop_image` controls whether to crop the file's images to the segment's
|
664
|
+
bounding box. The cropped image will be stored in the segment's `image` field.
|
665
|
+
Use `All` to always crop, or `Auto` to only crop when needed for
|
666
|
+
post-processing.
|
667
|
+
- `format` specifies the output format: `Html` or `Markdown`
|
668
|
+
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
669
|
+
- `Auto`: Process content automatically
|
670
|
+
- `LLM`: Use large language models for processing
|
671
|
+
- `Ignore`: Exclude segments from final output
|
672
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
673
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
674
|
+
configuration.
|
675
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
676
|
+
|
677
|
+
**Deprecated fields (for backwards compatibility):**
|
678
|
+
|
679
|
+
- `llm` - **DEPRECATED**: Use `description` instead
|
680
|
+
- `embed_sources` - **DEPRECATED**: Embed field is auto-populated
|
681
|
+
- `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
|
682
|
+
- `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
|
683
|
+
"""
|
684
|
+
|
685
|
+
section_header: Optional[ConfigurationSegmentProcessingSectionHeader] = FieldInfo(
|
686
|
+
alias="SectionHeader", default=None
|
687
|
+
)
|
688
|
+
"""Controls the processing and generation for the segment.
|
689
|
+
|
690
|
+
- `crop_image` controls whether to crop the file's images to the segment's
|
691
|
+
bounding box. The cropped image will be stored in the segment's `image` field.
|
692
|
+
Use `All` to always crop, or `Auto` to only crop when needed for
|
693
|
+
post-processing.
|
694
|
+
- `format` specifies the output format: `Html` or `Markdown`
|
695
|
+
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
696
|
+
- `Auto`: Process content automatically
|
697
|
+
- `LLM`: Use large language models for processing
|
698
|
+
- `Ignore`: Exclude segments from final output
|
699
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
700
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
701
|
+
configuration.
|
702
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
703
|
+
|
704
|
+
**Deprecated fields (for backwards compatibility):**
|
705
|
+
|
706
|
+
- `llm` - **DEPRECATED**: Use `description` instead
|
707
|
+
- `embed_sources` - **DEPRECATED**: Embed field is auto-populated
|
708
|
+
- `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
|
709
|
+
- `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
|
710
|
+
"""
|
711
|
+
|
712
|
+
table: Optional[ConfigurationSegmentProcessingTable] = FieldInfo(alias="Table", default=None)
|
713
|
+
"""Controls the processing and generation for the segment.
|
714
|
+
|
715
|
+
- `crop_image` controls whether to crop the file's images to the segment's
|
716
|
+
bounding box. The cropped image will be stored in the segment's `image` field.
|
717
|
+
Use `All` to always crop, or `Auto` to only crop when needed for
|
718
|
+
post-processing.
|
719
|
+
- `format` specifies the output format: `Html` or `Markdown`
|
720
|
+
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
721
|
+
- `Auto`: Process content automatically
|
722
|
+
- `LLM`: Use large language models for processing
|
723
|
+
- `Ignore`: Exclude segments from final output
|
724
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
725
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
726
|
+
configuration.
|
727
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
728
|
+
|
729
|
+
**Deprecated fields (for backwards compatibility):**
|
730
|
+
|
731
|
+
- `llm` - **DEPRECATED**: Use `description` instead
|
732
|
+
- `embed_sources` - **DEPRECATED**: Embed field is auto-populated
|
733
|
+
- `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
|
734
|
+
- `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
|
735
|
+
"""
|
736
|
+
|
737
|
+
text: Optional[ConfigurationSegmentProcessingText] = FieldInfo(alias="Text", default=None)
|
738
|
+
"""Controls the processing and generation for the segment.
|
739
|
+
|
740
|
+
- `crop_image` controls whether to crop the file's images to the segment's
|
741
|
+
bounding box. The cropped image will be stored in the segment's `image` field.
|
742
|
+
Use `All` to always crop, or `Auto` to only crop when needed for
|
743
|
+
post-processing.
|
744
|
+
- `format` specifies the output format: `Html` or `Markdown`
|
745
|
+
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
746
|
+
- `Auto`: Process content automatically
|
747
|
+
- `LLM`: Use large language models for processing
|
748
|
+
- `Ignore`: Exclude segments from final output
|
749
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
750
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
751
|
+
configuration.
|
752
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
753
|
+
|
754
|
+
**Deprecated fields (for backwards compatibility):**
|
755
|
+
|
756
|
+
- `llm` - **DEPRECATED**: Use `description` instead
|
757
|
+
- `embed_sources` - **DEPRECATED**: Embed field is auto-populated
|
758
|
+
- `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
|
759
|
+
- `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
|
760
|
+
"""
|
761
|
+
|
762
|
+
title: Optional[ConfigurationSegmentProcessingTitle] = FieldInfo(alias="Title", default=None)
|
763
|
+
"""Controls the processing and generation for the segment.
|
764
|
+
|
765
|
+
- `crop_image` controls whether to crop the file's images to the segment's
|
766
|
+
bounding box. The cropped image will be stored in the segment's `image` field.
|
767
|
+
Use `All` to always crop, or `Auto` to only crop when needed for
|
768
|
+
post-processing.
|
769
|
+
- `format` specifies the output format: `Html` or `Markdown`
|
770
|
+
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
771
|
+
- `Auto`: Process content automatically
|
772
|
+
- `LLM`: Use large language models for processing
|
773
|
+
- `Ignore`: Exclude segments from final output
|
774
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
775
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
776
|
+
configuration.
|
777
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
778
|
+
|
779
|
+
**Deprecated fields (for backwards compatibility):**
|
780
|
+
|
781
|
+
- `llm` - **DEPRECATED**: Use `description` instead
|
782
|
+
- `embed_sources` - **DEPRECATED**: Embed field is auto-populated
|
783
|
+
- `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
|
784
|
+
- `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
|
785
|
+
"""
|
786
|
+
|
787
|
+
|
788
|
+
class ConfigurationClientVersionManualSDK(BaseModel):
|
789
|
+
manual_sdk: str = FieldInfo(alias="ManualSdk")
|
790
|
+
"""Current manually-maintained SDK"""
|
791
|
+
|
792
|
+
|
793
|
+
class ConfigurationClientVersionGeneratedSDK(BaseModel):
|
794
|
+
generated_sdk: str = FieldInfo(alias="GeneratedSdk")
|
795
|
+
"""Future auto-generated SDK"""
|
796
|
+
|
797
|
+
|
798
|
+
ConfigurationClientVersion: TypeAlias = Union[
|
799
|
+
Literal["Legacy"], ConfigurationClientVersionManualSDK, ConfigurationClientVersionGeneratedSDK, None
|
800
|
+
]
|
801
|
+
|
802
|
+
|
27
803
|
class Configuration(BaseModel):
|
28
|
-
chunk_processing:
|
804
|
+
chunk_processing: ConfigurationChunkProcessing
|
29
805
|
"""Controls the setting for the chunking and post-processing of each chunk."""
|
30
806
|
|
31
807
|
error_handling: Literal["Fail", "Continue"]
|
@@ -36,7 +812,7 @@ class Configuration(BaseModel):
|
|
36
812
|
LLM refusals etc.)
|
37
813
|
"""
|
38
814
|
|
39
|
-
llm_processing:
|
815
|
+
llm_processing: ConfigurationLlmProcessing
|
40
816
|
"""Controls the LLM used for the task."""
|
41
817
|
|
42
818
|
ocr_strategy: Literal["All", "Auto"]
|
@@ -48,7 +824,7 @@ class Configuration(BaseModel):
|
|
48
824
|
used.
|
49
825
|
"""
|
50
826
|
|
51
|
-
segment_processing:
|
827
|
+
segment_processing: ConfigurationSegmentProcessing
|
52
828
|
"""Defines how each segment type is handled when generating the final output.
|
53
829
|
|
54
830
|
Each segment uses one of three strategies. The chosen strategy controls: •
|
@@ -56,18 +832,14 @@ class Configuration(BaseModel):
|
|
56
832
|
content is produced (rule-based vs. LLM). • The output format (`Html` or
|
57
833
|
`Markdown`).
|
58
834
|
|
59
|
-
Optional flags such as image **cropping**, **extended context**, and
|
60
|
-
descriptions** further refine behaviour.
|
61
|
-
|
62
|
-
---
|
835
|
+
Optional flags such as image **cropping**, **extended context**, and
|
836
|
+
**descriptions** further refine behaviour.
|
63
837
|
|
64
838
|
**Default strategy per segment** • `Title`, `SectionHeader`, `Text`, `ListItem`,
|
65
|
-
`Caption`, `Footnote` → **Auto** (Markdown) • `Table` → **LLM**
|
66
|
-
description on) • `Picture` → **LLM** (Markdown, description
|
67
|
-
• `Formula`, `Page` → **LLM** (Markdown) •
|
68
|
-
**Ignore** (removed from output)
|
69
|
-
|
70
|
-
---
|
839
|
+
`Caption`, `Footnote` → **Auto** (Markdown, description off) • `Table` → **LLM**
|
840
|
+
(HTML, description on) • `Picture` → **LLM** (Markdown, description off,
|
841
|
+
cropping _All_) • `Formula`, `Page` → **LLM** (Markdown, description off) •
|
842
|
+
`PageHeader`, `PageFooter` → **Ignore** (removed from output)
|
71
843
|
|
72
844
|
**Strategy reference** • **Auto** – rule-based content generation. • **LLM** –
|
73
845
|
generate content with an LLM. • **Ignore** – exclude the segment entirely.
|
@@ -83,6 +855,9 @@ class Configuration(BaseModel):
|
|
83
855
|
layout element detection and only simple chunking.
|
84
856
|
"""
|
85
857
|
|
858
|
+
client_version: Optional[ConfigurationClientVersion] = None
|
859
|
+
"""Client version for backwards compatibility processing"""
|
860
|
+
|
86
861
|
expires_in: Optional[int] = None
|
87
862
|
"""
|
88
863
|
The number of seconds until task is deleted. Expired tasks can **not** be
|
@@ -104,8 +879,36 @@ class Configuration(BaseModel):
|
|
104
879
|
"""
|
105
880
|
|
106
881
|
|
882
|
+
class OutputChunkSegmentBbox(BaseModel):
|
883
|
+
height: float
|
884
|
+
"""The height of the bounding box."""
|
885
|
+
|
886
|
+
left: float
|
887
|
+
"""The left coordinate of the bounding box."""
|
888
|
+
|
889
|
+
top: float
|
890
|
+
"""The top coordinate of the bounding box."""
|
891
|
+
|
892
|
+
width: float
|
893
|
+
"""The width of the bounding box."""
|
894
|
+
|
895
|
+
|
896
|
+
class OutputChunkSegmentOcrBbox(BaseModel):
|
897
|
+
height: float
|
898
|
+
"""The height of the bounding box."""
|
899
|
+
|
900
|
+
left: float
|
901
|
+
"""The left coordinate of the bounding box."""
|
902
|
+
|
903
|
+
top: float
|
904
|
+
"""The top coordinate of the bounding box."""
|
905
|
+
|
906
|
+
width: float
|
907
|
+
"""The width of the bounding box."""
|
908
|
+
|
909
|
+
|
107
910
|
class OutputChunkSegmentOcr(BaseModel):
|
108
|
-
bbox:
|
911
|
+
bbox: OutputChunkSegmentOcrBbox
|
109
912
|
"""Bounding box for an item. It is used for chunks, segments and OCR results."""
|
110
913
|
|
111
914
|
text: str
|
@@ -166,8 +969,36 @@ class OutputChunkSegmentSSCell(BaseModel):
|
|
166
969
|
"""
|
167
970
|
|
168
971
|
|
972
|
+
class OutputChunkSegmentSSHeaderBbox(BaseModel):
|
973
|
+
height: float
|
974
|
+
"""The height of the bounding box."""
|
975
|
+
|
976
|
+
left: float
|
977
|
+
"""The left coordinate of the bounding box."""
|
978
|
+
|
979
|
+
top: float
|
980
|
+
"""The top coordinate of the bounding box."""
|
981
|
+
|
982
|
+
width: float
|
983
|
+
"""The width of the bounding box."""
|
984
|
+
|
985
|
+
|
986
|
+
class OutputChunkSegmentSSHeaderOcrBbox(BaseModel):
|
987
|
+
height: float
|
988
|
+
"""The height of the bounding box."""
|
989
|
+
|
990
|
+
left: float
|
991
|
+
"""The left coordinate of the bounding box."""
|
992
|
+
|
993
|
+
top: float
|
994
|
+
"""The top coordinate of the bounding box."""
|
995
|
+
|
996
|
+
width: float
|
997
|
+
"""The width of the bounding box."""
|
998
|
+
|
999
|
+
|
169
1000
|
class OutputChunkSegmentSSHeaderOcr(BaseModel):
|
170
|
-
bbox:
|
1001
|
+
bbox: OutputChunkSegmentSSHeaderOcrBbox
|
171
1002
|
"""Bounding box for an item. It is used for chunks, segments and OCR results."""
|
172
1003
|
|
173
1004
|
text: str
|
@@ -178,7 +1009,7 @@ class OutputChunkSegmentSSHeaderOcr(BaseModel):
|
|
178
1009
|
|
179
1010
|
|
180
1011
|
class OutputChunkSegment(BaseModel):
|
181
|
-
bbox:
|
1012
|
+
bbox: OutputChunkSegmentBbox
|
182
1013
|
"""Bounding box for an item. It is used for chunks, segments and OCR results."""
|
183
1014
|
|
184
1015
|
page_height: float
|
@@ -248,7 +1079,7 @@ class OutputChunkSegment(BaseModel):
|
|
248
1079
|
ss_cells: Optional[List[OutputChunkSegmentSSCell]] = None
|
249
1080
|
"""Cells of the segment. Only used for Spreadsheets."""
|
250
1081
|
|
251
|
-
ss_header_bbox: Optional[
|
1082
|
+
ss_header_bbox: Optional[OutputChunkSegmentSSHeaderBbox] = None
|
252
1083
|
"""Bounding box of the header of the segment, if found.
|
253
1084
|
|
254
1085
|
Only used for Spreadsheets.
|
@@ -286,7 +1117,10 @@ class OutputChunkSegment(BaseModel):
|
|
286
1117
|
|
287
1118
|
class OutputChunk(BaseModel):
|
288
1119
|
chunk_length: int
|
289
|
-
"""The total number of tokens in the
|
1120
|
+
"""The total number of tokens in the `embed` field of the chunk.
|
1121
|
+
|
1122
|
+
Calculated by the `tokenizer`.
|
1123
|
+
"""
|
290
1124
|
|
291
1125
|
segments: List[OutputChunkSegment]
|
292
1126
|
"""
|
@@ -299,12 +1133,17 @@ class OutputChunk(BaseModel):
|
|
299
1133
|
chunk_id: Optional[str] = None
|
300
1134
|
"""The unique identifier for the chunk."""
|
301
1135
|
|
1136
|
+
content: Optional[str] = None
|
1137
|
+
"""The content of the chunk.
|
1138
|
+
|
1139
|
+
This is the text that is generated by combining the `content` field from each
|
1140
|
+
segment. Can be provided as context to the LLM.
|
1141
|
+
"""
|
1142
|
+
|
302
1143
|
embed: Optional[str] = None
|
303
1144
|
"""Suggested text to be embedded for the chunk.
|
304
1145
|
|
305
|
-
This text is generated by combining the embed
|
306
|
-
according to the configured embed sources (HTML, Markdown, LLM, or Content). Can
|
307
|
-
be configured using `embed_sources` in the `SegmentProcessing` configuration.
|
1146
|
+
This text is generated by combining the `embed` field from each segment.
|
308
1147
|
"""
|
309
1148
|
|
310
1149
|
|