chunkr-ai 0.1.0a6__py3-none-any.whl → 0.1.0a8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunkr_ai/__init__.py +2 -0
- chunkr_ai/_base_client.py +3 -3
- chunkr_ai/_client.py +31 -3
- chunkr_ai/_compat.py +48 -48
- chunkr_ai/_constants.py +5 -5
- chunkr_ai/_exceptions.py +4 -0
- chunkr_ai/_models.py +41 -41
- chunkr_ai/_types.py +35 -1
- chunkr_ai/_utils/__init__.py +9 -2
- chunkr_ai/_utils/_compat.py +45 -0
- chunkr_ai/_utils/_datetime_parse.py +136 -0
- chunkr_ai/_utils/_transform.py +11 -1
- chunkr_ai/_utils/_typing.py +6 -1
- chunkr_ai/_utils/_utils.py +0 -1
- chunkr_ai/_version.py +1 -1
- chunkr_ai/resources/__init__.py +14 -0
- chunkr_ai/resources/files.py +3 -3
- chunkr_ai/resources/tasks/__init__.py +14 -0
- chunkr_ai/resources/tasks/extract.py +393 -0
- chunkr_ai/resources/tasks/parse.py +110 -286
- chunkr_ai/resources/tasks/tasks.py +64 -32
- chunkr_ai/resources/webhooks.py +193 -0
- chunkr_ai/types/__init__.py +27 -1
- chunkr_ai/types/bounding_box.py +19 -0
- chunkr_ai/types/cell.py +39 -0
- chunkr_ai/types/cell_style.py +28 -0
- chunkr_ai/types/chunk.py +40 -0
- chunkr_ai/types/chunk_processing.py +40 -0
- chunkr_ai/types/chunk_processing_param.py +42 -0
- chunkr_ai/types/extract_configuration.py +24 -0
- chunkr_ai/types/extract_output_response.py +62 -0
- chunkr_ai/types/file_create_params.py +2 -1
- chunkr_ai/types/file_info.py +21 -0
- chunkr_ai/types/generation_config.py +29 -0
- chunkr_ai/types/generation_config_param.py +29 -0
- chunkr_ai/types/llm_processing.py +36 -0
- chunkr_ai/types/llm_processing_param.py +36 -0
- chunkr_ai/types/ocr_result.py +28 -0
- chunkr_ai/types/page.py +27 -0
- chunkr_ai/types/parse_configuration.py +64 -0
- chunkr_ai/types/parse_configuration_param.py +65 -0
- chunkr_ai/types/parse_output_response.py +29 -0
- chunkr_ai/types/segment.py +109 -0
- chunkr_ai/types/segment_processing.py +228 -0
- chunkr_ai/types/segment_processing_param.py +229 -0
- chunkr_ai/types/task_extract_updated_webhook_event.py +22 -0
- chunkr_ai/types/task_get_params.py +0 -3
- chunkr_ai/types/task_list_params.py +7 -1
- chunkr_ai/types/task_parse_updated_webhook_event.py +22 -0
- chunkr_ai/types/task_response.py +68 -0
- chunkr_ai/types/tasks/__init__.py +7 -1
- chunkr_ai/types/tasks/extract_create_params.py +47 -0
- chunkr_ai/types/tasks/extract_create_response.py +67 -0
- chunkr_ai/types/tasks/extract_get_params.py +18 -0
- chunkr_ai/types/tasks/extract_get_response.py +67 -0
- chunkr_ai/types/tasks/parse_create_params.py +25 -793
- chunkr_ai/types/tasks/parse_create_response.py +55 -0
- chunkr_ai/types/tasks/parse_get_params.py +18 -0
- chunkr_ai/types/tasks/parse_get_response.py +55 -0
- chunkr_ai/types/unwrap_webhook_event.py +11 -0
- chunkr_ai/types/version_info.py +31 -0
- chunkr_ai/types/webhook_url_response.py +9 -0
- {chunkr_ai-0.1.0a6.dist-info → chunkr_ai-0.1.0a8.dist-info}/METADATA +14 -13
- chunkr_ai-0.1.0a8.dist-info/RECORD +88 -0
- chunkr_ai/types/task.py +0 -1225
- chunkr_ai/types/tasks/parse_update_params.py +0 -845
- chunkr_ai-0.1.0a6.dist-info/RECORD +0 -52
- {chunkr_ai-0.1.0a6.dist-info → chunkr_ai-0.1.0a8.dist-info}/WHEEL +0 -0
- {chunkr_ai-0.1.0a6.dist-info → chunkr_ai-0.1.0a8.dist-info}/licenses/LICENSE +0 -0
@@ -2,39 +2,19 @@
|
|
2
2
|
|
3
3
|
from __future__ import annotations
|
4
4
|
|
5
|
-
from typing import
|
6
|
-
from typing_extensions import Literal, Required,
|
5
|
+
from typing import Optional
|
6
|
+
from typing_extensions import Literal, Required, TypedDict
|
7
7
|
|
8
|
-
from
|
8
|
+
from ..llm_processing_param import LlmProcessingParam
|
9
|
+
from ..chunk_processing_param import ChunkProcessingParam
|
10
|
+
from ..segment_processing_param import SegmentProcessingParam
|
9
11
|
|
10
|
-
__all__ = [
|
11
|
-
"ParseCreateParams",
|
12
|
-
"ChunkProcessing",
|
13
|
-
"ChunkProcessingTokenizer",
|
14
|
-
"ChunkProcessingTokenizerEnum",
|
15
|
-
"ChunkProcessingTokenizerString",
|
16
|
-
"LlmProcessing",
|
17
|
-
"LlmProcessingFallbackStrategy",
|
18
|
-
"LlmProcessingFallbackStrategyModel",
|
19
|
-
"SegmentProcessing",
|
20
|
-
"SegmentProcessingCaption",
|
21
|
-
"SegmentProcessingFootnote",
|
22
|
-
"SegmentProcessingFormula",
|
23
|
-
"SegmentProcessingListItem",
|
24
|
-
"SegmentProcessingPage",
|
25
|
-
"SegmentProcessingPageFooter",
|
26
|
-
"SegmentProcessingPageHeader",
|
27
|
-
"SegmentProcessingPicture",
|
28
|
-
"SegmentProcessingSectionHeader",
|
29
|
-
"SegmentProcessingTable",
|
30
|
-
"SegmentProcessingText",
|
31
|
-
"SegmentProcessingTitle",
|
32
|
-
]
|
12
|
+
__all__ = ["ParseCreateParams"]
|
33
13
|
|
34
14
|
|
35
15
|
class ParseCreateParams(TypedDict, total=False):
|
36
16
|
file: Required[str]
|
37
|
-
"""The file to be
|
17
|
+
"""The file to be parsed. Supported inputs:
|
38
18
|
|
39
19
|
- `ch://files/{file_id}`: Reference to an existing file. Upload via the Files
|
40
20
|
API
|
@@ -42,10 +22,10 @@ class ParseCreateParams(TypedDict, total=False):
|
|
42
22
|
- `data:*;base64,...` or raw base64 string
|
43
23
|
"""
|
44
24
|
|
45
|
-
chunk_processing:
|
25
|
+
chunk_processing: ChunkProcessingParam
|
46
26
|
"""Controls the setting for the chunking and post-processing of each chunk."""
|
47
27
|
|
48
|
-
error_handling:
|
28
|
+
error_handling: Literal["Fail", "Continue"]
|
49
29
|
"""Controls how errors are handled during processing:
|
50
30
|
|
51
31
|
- `Fail`: Stops processing and fails the task when any error occurs
|
@@ -60,12 +40,12 @@ class ParseCreateParams(TypedDict, total=False):
|
|
60
40
|
"""
|
61
41
|
|
62
42
|
file_name: Optional[str]
|
63
|
-
"""The name of the file to be
|
43
|
+
"""The name of the file to be parsed. If not set a name will be generated."""
|
64
44
|
|
65
|
-
llm_processing:
|
45
|
+
llm_processing: LlmProcessingParam
|
66
46
|
"""Controls the LLM used for the task."""
|
67
47
|
|
68
|
-
ocr_strategy:
|
48
|
+
ocr_strategy: Literal["All", "Auto"]
|
69
49
|
"""Controls the Optical Character Recognition (OCR) strategy.
|
70
50
|
|
71
51
|
- `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
|
@@ -74,778 +54,30 @@ class ParseCreateParams(TypedDict, total=False):
|
|
74
54
|
used.
|
75
55
|
"""
|
76
56
|
|
77
|
-
pipeline:
|
78
|
-
"""
|
79
|
-
Choose the provider whose models will be used for segmentation and OCR. The
|
80
|
-
output will be unified to the Chunkr `output` format.
|
81
|
-
"""
|
82
|
-
|
83
|
-
segment_processing: Optional[SegmentProcessing]
|
84
|
-
"""Defines how each segment type is handled when generating the final output.
|
85
|
-
|
86
|
-
Each segment uses one of three strategies. The chosen strategy controls:
|
57
|
+
pipeline: Literal["Azure", "Chunkr"]
|
87
58
|
|
88
|
-
|
89
|
-
|
90
|
-
- The output format (`Html` or `Markdown`).
|
59
|
+
segment_processing: Optional[SegmentProcessingParam]
|
60
|
+
"""Configuration for how each document segment is processed and formatted.
|
91
61
|
|
92
|
-
|
93
|
-
**descriptions** further refine behaviour.
|
62
|
+
Each segment has sensible defaults, but you can override specific settings:
|
94
63
|
|
95
|
-
|
64
|
+
- `format`: Output as `Html` or `Markdown`
|
65
|
+
- `strategy`: `Auto` (rule-based), `LLM` (AI-generated), or `Ignore` (skip)
|
66
|
+
- `crop_image`: Whether to crop images to segment bounds
|
67
|
+
- `extended_context`: Use full page as context for LLM processing
|
68
|
+
- `description`: Generate descriptions for segments
|
96
69
|
|
97
|
-
|
98
|
-
(Markdown, description off)
|
99
|
-
- `Table` → **LLM** (HTML, description on)
|
100
|
-
- `Picture` → **LLM** (Markdown, description off, cropping _All_)
|
101
|
-
- `Formula`, `Page` → **LLM** (Markdown, description off)
|
102
|
-
- `PageHeader`, `PageFooter` → **Ignore** (removed from output)
|
70
|
+
**Defaults per segment type:** Check the documentation for more details.
|
103
71
|
|
104
|
-
|
105
|
-
|
106
|
-
- **Auto** – rule-based content generation.
|
107
|
-
- **LLM** – generate content with an LLM.
|
108
|
-
- **Ignore** – exclude the segment entirely.
|
72
|
+
Only specify the fields you want to change - everything else uses the defaults.
|
109
73
|
"""
|
110
74
|
|
111
|
-
segmentation_strategy:
|
75
|
+
segmentation_strategy: Literal["LayoutAnalysis", "Page"]
|
112
76
|
"""Controls the segmentation strategy:
|
113
77
|
|
114
78
|
- `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
|
115
79
|
`Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
|
116
|
-
segmentation and better chunking.
|
80
|
+
segmentation and better chunking.
|
117
81
|
- `Page`: Treats each page as a single segment. Faster processing, but without
|
118
82
|
layout element detection and only simple chunking.
|
119
83
|
"""
|
120
|
-
|
121
|
-
|
122
|
-
class ChunkProcessingTokenizerEnum(TypedDict, total=False):
|
123
|
-
enum: Required[
|
124
|
-
Annotated[Literal["Word", "Cl100kBase", "XlmRobertaBase", "BertBaseUncased"], PropertyInfo(alias="Enum")]
|
125
|
-
]
|
126
|
-
"""Use one of the predefined tokenizer types"""
|
127
|
-
|
128
|
-
|
129
|
-
class ChunkProcessingTokenizerString(TypedDict, total=False):
|
130
|
-
string: Required[Annotated[str, PropertyInfo(alias="String")]]
|
131
|
-
"""
|
132
|
-
Use any Hugging Face tokenizer by specifying its model ID Examples:
|
133
|
-
"Qwen/Qwen-tokenizer", "facebook/bart-large"
|
134
|
-
"""
|
135
|
-
|
136
|
-
|
137
|
-
ChunkProcessingTokenizer: TypeAlias = Union[ChunkProcessingTokenizerEnum, ChunkProcessingTokenizerString]
|
138
|
-
|
139
|
-
|
140
|
-
class ChunkProcessing(TypedDict, total=False):
|
141
|
-
ignore_headers_and_footers: Optional[bool]
|
142
|
-
"""DEPRECATED: use `segment_processing.ignore` instead"""
|
143
|
-
|
144
|
-
target_length: int
|
145
|
-
"""The target number of words in each chunk.
|
146
|
-
|
147
|
-
If 0, each chunk will contain a single segment.
|
148
|
-
"""
|
149
|
-
|
150
|
-
tokenizer: ChunkProcessingTokenizer
|
151
|
-
"""The tokenizer to use for the chunking process."""
|
152
|
-
|
153
|
-
|
154
|
-
class LlmProcessingFallbackStrategyModel(TypedDict, total=False):
|
155
|
-
model: Required[Annotated[str, PropertyInfo(alias="Model")]]
|
156
|
-
"""Use a specific model as fallback"""
|
157
|
-
|
158
|
-
|
159
|
-
LlmProcessingFallbackStrategy: TypeAlias = Union[Literal["None", "Default"], LlmProcessingFallbackStrategyModel]
|
160
|
-
|
161
|
-
|
162
|
-
class LlmProcessing(TypedDict, total=False):
|
163
|
-
fallback_strategy: LlmProcessingFallbackStrategy
|
164
|
-
"""The fallback strategy to use for the LLMs in the task."""
|
165
|
-
|
166
|
-
llm_model_id: Optional[str]
|
167
|
-
"""The ID of the model to use for the task.
|
168
|
-
|
169
|
-
If not provided, the default model will be used. Please check the documentation
|
170
|
-
for the model you want to use.
|
171
|
-
"""
|
172
|
-
|
173
|
-
max_completion_tokens: Optional[int]
|
174
|
-
"""The maximum number of tokens to generate."""
|
175
|
-
|
176
|
-
temperature: float
|
177
|
-
"""The temperature to use for the LLM."""
|
178
|
-
|
179
|
-
|
180
|
-
class SegmentProcessingCaption(TypedDict, total=False):
|
181
|
-
crop_image: Literal["All", "Auto"]
|
182
|
-
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
183
|
-
|
184
|
-
- `All` crops all images in the item
|
185
|
-
- `Auto` crops images only if required for post-processing
|
186
|
-
"""
|
187
|
-
|
188
|
-
description: bool
|
189
|
-
"""Generate LLM descriptions for this segment"""
|
190
|
-
|
191
|
-
embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
|
192
|
-
"""**DEPRECATED**: `embed` field is auto populated"""
|
193
|
-
|
194
|
-
extended_context: bool
|
195
|
-
"""Use the full page image as context for LLM generation"""
|
196
|
-
|
197
|
-
format: Literal["Html", "Markdown"]
|
198
|
-
|
199
|
-
html: Optional[Literal["LLM", "Auto", "Ignore"]]
|
200
|
-
"""**DEPRECATED**: Use `format: html` and `strategy` instead."""
|
201
|
-
|
202
|
-
llm: Optional[str]
|
203
|
-
"""**DEPRECATED**: use description instead"""
|
204
|
-
|
205
|
-
markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
|
206
|
-
"""**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
|
207
|
-
|
208
|
-
strategy: Literal["LLM", "Auto", "Ignore"]
|
209
|
-
|
210
|
-
|
211
|
-
class SegmentProcessingFootnote(TypedDict, total=False):
|
212
|
-
crop_image: Literal["All", "Auto"]
|
213
|
-
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
214
|
-
|
215
|
-
- `All` crops all images in the item
|
216
|
-
- `Auto` crops images only if required for post-processing
|
217
|
-
"""
|
218
|
-
|
219
|
-
description: bool
|
220
|
-
"""Generate LLM descriptions for this segment"""
|
221
|
-
|
222
|
-
embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
|
223
|
-
"""**DEPRECATED**: `embed` field is auto populated"""
|
224
|
-
|
225
|
-
extended_context: bool
|
226
|
-
"""Use the full page image as context for LLM generation"""
|
227
|
-
|
228
|
-
format: Literal["Html", "Markdown"]
|
229
|
-
|
230
|
-
html: Optional[Literal["LLM", "Auto", "Ignore"]]
|
231
|
-
"""**DEPRECATED**: Use `format: html` and `strategy` instead."""
|
232
|
-
|
233
|
-
llm: Optional[str]
|
234
|
-
"""**DEPRECATED**: use description instead"""
|
235
|
-
|
236
|
-
markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
|
237
|
-
"""**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
|
238
|
-
|
239
|
-
strategy: Literal["LLM", "Auto", "Ignore"]
|
240
|
-
|
241
|
-
|
242
|
-
class SegmentProcessingFormula(TypedDict, total=False):
|
243
|
-
crop_image: Literal["All", "Auto"]
|
244
|
-
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
245
|
-
|
246
|
-
- `All` crops all images in the item
|
247
|
-
- `Auto` crops images only if required for post-processing
|
248
|
-
"""
|
249
|
-
|
250
|
-
description: bool
|
251
|
-
"""Generate LLM descriptions for this segment"""
|
252
|
-
|
253
|
-
embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
|
254
|
-
"""**DEPRECATED**: `embed` field is auto populated"""
|
255
|
-
|
256
|
-
extended_context: bool
|
257
|
-
"""Use the full page image as context for LLM generation"""
|
258
|
-
|
259
|
-
format: Literal["Html", "Markdown"]
|
260
|
-
|
261
|
-
html: Optional[Literal["LLM", "Auto", "Ignore"]]
|
262
|
-
"""**DEPRECATED**: Use `format: html` and `strategy` instead."""
|
263
|
-
|
264
|
-
llm: Optional[str]
|
265
|
-
"""**DEPRECATED**: use description instead"""
|
266
|
-
|
267
|
-
markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
|
268
|
-
"""**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
|
269
|
-
|
270
|
-
strategy: Literal["LLM", "Auto", "Ignore"]
|
271
|
-
|
272
|
-
|
273
|
-
class SegmentProcessingListItem(TypedDict, total=False):
|
274
|
-
crop_image: Literal["All", "Auto"]
|
275
|
-
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
276
|
-
|
277
|
-
- `All` crops all images in the item
|
278
|
-
- `Auto` crops images only if required for post-processing
|
279
|
-
"""
|
280
|
-
|
281
|
-
description: bool
|
282
|
-
"""Generate LLM descriptions for this segment"""
|
283
|
-
|
284
|
-
embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
|
285
|
-
"""**DEPRECATED**: `embed` field is auto populated"""
|
286
|
-
|
287
|
-
extended_context: bool
|
288
|
-
"""Use the full page image as context for LLM generation"""
|
289
|
-
|
290
|
-
format: Literal["Html", "Markdown"]
|
291
|
-
|
292
|
-
html: Optional[Literal["LLM", "Auto", "Ignore"]]
|
293
|
-
"""**DEPRECATED**: Use `format: html` and `strategy` instead."""
|
294
|
-
|
295
|
-
llm: Optional[str]
|
296
|
-
"""**DEPRECATED**: use description instead"""
|
297
|
-
|
298
|
-
markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
|
299
|
-
"""**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
|
300
|
-
|
301
|
-
strategy: Literal["LLM", "Auto", "Ignore"]
|
302
|
-
|
303
|
-
|
304
|
-
class SegmentProcessingPage(TypedDict, total=False):
|
305
|
-
crop_image: Literal["All", "Auto"]
|
306
|
-
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
307
|
-
|
308
|
-
- `All` crops all images in the item
|
309
|
-
- `Auto` crops images only if required for post-processing
|
310
|
-
"""
|
311
|
-
|
312
|
-
description: bool
|
313
|
-
"""Generate LLM descriptions for this segment"""
|
314
|
-
|
315
|
-
embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
|
316
|
-
"""**DEPRECATED**: `embed` field is auto populated"""
|
317
|
-
|
318
|
-
extended_context: bool
|
319
|
-
"""Use the full page image as context for LLM generation"""
|
320
|
-
|
321
|
-
format: Literal["Html", "Markdown"]
|
322
|
-
|
323
|
-
html: Optional[Literal["LLM", "Auto", "Ignore"]]
|
324
|
-
"""**DEPRECATED**: Use `format: html` and `strategy` instead."""
|
325
|
-
|
326
|
-
llm: Optional[str]
|
327
|
-
"""**DEPRECATED**: use description instead"""
|
328
|
-
|
329
|
-
markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
|
330
|
-
"""**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
|
331
|
-
|
332
|
-
strategy: Literal["LLM", "Auto", "Ignore"]
|
333
|
-
|
334
|
-
|
335
|
-
class SegmentProcessingPageFooter(TypedDict, total=False):
|
336
|
-
crop_image: Literal["All", "Auto"]
|
337
|
-
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
338
|
-
|
339
|
-
- `All` crops all images in the item
|
340
|
-
- `Auto` crops images only if required for post-processing
|
341
|
-
"""
|
342
|
-
|
343
|
-
description: bool
|
344
|
-
"""Generate LLM descriptions for this segment"""
|
345
|
-
|
346
|
-
embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
|
347
|
-
"""**DEPRECATED**: `embed` field is auto populated"""
|
348
|
-
|
349
|
-
extended_context: bool
|
350
|
-
"""Use the full page image as context for LLM generation"""
|
351
|
-
|
352
|
-
format: Literal["Html", "Markdown"]
|
353
|
-
|
354
|
-
html: Optional[Literal["LLM", "Auto", "Ignore"]]
|
355
|
-
"""**DEPRECATED**: Use `format: html` and `strategy` instead."""
|
356
|
-
|
357
|
-
llm: Optional[str]
|
358
|
-
"""**DEPRECATED**: use description instead"""
|
359
|
-
|
360
|
-
markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
|
361
|
-
"""**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
|
362
|
-
|
363
|
-
strategy: Literal["LLM", "Auto", "Ignore"]
|
364
|
-
|
365
|
-
|
366
|
-
class SegmentProcessingPageHeader(TypedDict, total=False):
|
367
|
-
crop_image: Literal["All", "Auto"]
|
368
|
-
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
369
|
-
|
370
|
-
- `All` crops all images in the item
|
371
|
-
- `Auto` crops images only if required for post-processing
|
372
|
-
"""
|
373
|
-
|
374
|
-
description: bool
|
375
|
-
"""Generate LLM descriptions for this segment"""
|
376
|
-
|
377
|
-
embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
|
378
|
-
"""**DEPRECATED**: `embed` field is auto populated"""
|
379
|
-
|
380
|
-
extended_context: bool
|
381
|
-
"""Use the full page image as context for LLM generation"""
|
382
|
-
|
383
|
-
format: Literal["Html", "Markdown"]
|
384
|
-
|
385
|
-
html: Optional[Literal["LLM", "Auto", "Ignore"]]
|
386
|
-
"""**DEPRECATED**: Use `format: html` and `strategy` instead."""
|
387
|
-
|
388
|
-
llm: Optional[str]
|
389
|
-
"""**DEPRECATED**: use description instead"""
|
390
|
-
|
391
|
-
markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
|
392
|
-
"""**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
|
393
|
-
|
394
|
-
strategy: Literal["LLM", "Auto", "Ignore"]
|
395
|
-
|
396
|
-
|
397
|
-
class SegmentProcessingPicture(TypedDict, total=False):
|
398
|
-
crop_image: Literal["All", "Auto"]
|
399
|
-
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
400
|
-
|
401
|
-
- `All` crops all images in the item
|
402
|
-
- `Auto` crops images only if required for post-processing
|
403
|
-
"""
|
404
|
-
|
405
|
-
description: bool
|
406
|
-
"""Generate LLM descriptions for this segment"""
|
407
|
-
|
408
|
-
embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
|
409
|
-
"""**DEPRECATED**: `embed` field is auto populated"""
|
410
|
-
|
411
|
-
extended_context: bool
|
412
|
-
"""Use the full page image as context for LLM generation"""
|
413
|
-
|
414
|
-
format: Literal["Html", "Markdown"]
|
415
|
-
|
416
|
-
html: Optional[Literal["LLM", "Auto", "Ignore"]]
|
417
|
-
"""**DEPRECATED**: Use `format: html` and `strategy` instead."""
|
418
|
-
|
419
|
-
llm: Optional[str]
|
420
|
-
"""**DEPRECATED**: use description instead"""
|
421
|
-
|
422
|
-
markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
|
423
|
-
"""**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
|
424
|
-
|
425
|
-
strategy: Literal["LLM", "Auto", "Ignore"]
|
426
|
-
|
427
|
-
|
428
|
-
class SegmentProcessingSectionHeader(TypedDict, total=False):
|
429
|
-
crop_image: Literal["All", "Auto"]
|
430
|
-
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
431
|
-
|
432
|
-
- `All` crops all images in the item
|
433
|
-
- `Auto` crops images only if required for post-processing
|
434
|
-
"""
|
435
|
-
|
436
|
-
description: bool
|
437
|
-
"""Generate LLM descriptions for this segment"""
|
438
|
-
|
439
|
-
embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
|
440
|
-
"""**DEPRECATED**: `embed` field is auto populated"""
|
441
|
-
|
442
|
-
extended_context: bool
|
443
|
-
"""Use the full page image as context for LLM generation"""
|
444
|
-
|
445
|
-
format: Literal["Html", "Markdown"]
|
446
|
-
|
447
|
-
html: Optional[Literal["LLM", "Auto", "Ignore"]]
|
448
|
-
"""**DEPRECATED**: Use `format: html` and `strategy` instead."""
|
449
|
-
|
450
|
-
llm: Optional[str]
|
451
|
-
"""**DEPRECATED**: use description instead"""
|
452
|
-
|
453
|
-
markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
|
454
|
-
"""**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
|
455
|
-
|
456
|
-
strategy: Literal["LLM", "Auto", "Ignore"]
|
457
|
-
|
458
|
-
|
459
|
-
class SegmentProcessingTable(TypedDict, total=False):
|
460
|
-
crop_image: Literal["All", "Auto"]
|
461
|
-
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
462
|
-
|
463
|
-
- `All` crops all images in the item
|
464
|
-
- `Auto` crops images only if required for post-processing
|
465
|
-
"""
|
466
|
-
|
467
|
-
description: bool
|
468
|
-
"""Generate LLM descriptions for this segment"""
|
469
|
-
|
470
|
-
embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
|
471
|
-
"""**DEPRECATED**: `embed` field is auto populated"""
|
472
|
-
|
473
|
-
extended_context: bool
|
474
|
-
"""Use the full page image as context for LLM generation"""
|
475
|
-
|
476
|
-
format: Literal["Html", "Markdown"]
|
477
|
-
|
478
|
-
html: Optional[Literal["LLM", "Auto", "Ignore"]]
|
479
|
-
"""**DEPRECATED**: Use `format: html` and `strategy` instead."""
|
480
|
-
|
481
|
-
llm: Optional[str]
|
482
|
-
"""**DEPRECATED**: use description instead"""
|
483
|
-
|
484
|
-
markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
|
485
|
-
"""**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
|
486
|
-
|
487
|
-
strategy: Literal["LLM", "Auto", "Ignore"]
|
488
|
-
|
489
|
-
|
490
|
-
class SegmentProcessingText(TypedDict, total=False):
|
491
|
-
crop_image: Literal["All", "Auto"]
|
492
|
-
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
493
|
-
|
494
|
-
- `All` crops all images in the item
|
495
|
-
- `Auto` crops images only if required for post-processing
|
496
|
-
"""
|
497
|
-
|
498
|
-
description: bool
|
499
|
-
"""Generate LLM descriptions for this segment"""
|
500
|
-
|
501
|
-
embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
|
502
|
-
"""**DEPRECATED**: `embed` field is auto populated"""
|
503
|
-
|
504
|
-
extended_context: bool
|
505
|
-
"""Use the full page image as context for LLM generation"""
|
506
|
-
|
507
|
-
format: Literal["Html", "Markdown"]
|
508
|
-
|
509
|
-
html: Optional[Literal["LLM", "Auto", "Ignore"]]
|
510
|
-
"""**DEPRECATED**: Use `format: html` and `strategy` instead."""
|
511
|
-
|
512
|
-
llm: Optional[str]
|
513
|
-
"""**DEPRECATED**: use description instead"""
|
514
|
-
|
515
|
-
markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
|
516
|
-
"""**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
|
517
|
-
|
518
|
-
strategy: Literal["LLM", "Auto", "Ignore"]
|
519
|
-
|
520
|
-
|
521
|
-
class SegmentProcessingTitle(TypedDict, total=False):
|
522
|
-
crop_image: Literal["All", "Auto"]
|
523
|
-
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
524
|
-
|
525
|
-
- `All` crops all images in the item
|
526
|
-
- `Auto` crops images only if required for post-processing
|
527
|
-
"""
|
528
|
-
|
529
|
-
description: bool
|
530
|
-
"""Generate LLM descriptions for this segment"""
|
531
|
-
|
532
|
-
embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
|
533
|
-
"""**DEPRECATED**: `embed` field is auto populated"""
|
534
|
-
|
535
|
-
extended_context: bool
|
536
|
-
"""Use the full page image as context for LLM generation"""
|
537
|
-
|
538
|
-
format: Literal["Html", "Markdown"]
|
539
|
-
|
540
|
-
html: Optional[Literal["LLM", "Auto", "Ignore"]]
|
541
|
-
"""**DEPRECATED**: Use `format: html` and `strategy` instead."""
|
542
|
-
|
543
|
-
llm: Optional[str]
|
544
|
-
"""**DEPRECATED**: use description instead"""
|
545
|
-
|
546
|
-
markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
|
547
|
-
"""**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
|
548
|
-
|
549
|
-
strategy: Literal["LLM", "Auto", "Ignore"]
|
550
|
-
|
551
|
-
|
552
|
-
class SegmentProcessing(TypedDict, total=False):
|
553
|
-
caption: Annotated[Optional[SegmentProcessingCaption], PropertyInfo(alias="Caption")]
|
554
|
-
"""Controls the processing and generation for the segment.
|
555
|
-
|
556
|
-
- `crop_image` controls whether to crop the file's images to the segment's
|
557
|
-
bounding box. The cropped image will be stored in the segment's `image` field.
|
558
|
-
Use `All` to always crop, or `Auto` to only crop when needed for
|
559
|
-
post-processing.
|
560
|
-
- `format` specifies the output format: `Html` or `Markdown`
|
561
|
-
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
562
|
-
- `Auto`: Process content automatically
|
563
|
-
- `LLM`: Use large language models for processing
|
564
|
-
- `Ignore`: Exclude segments from final output
|
565
|
-
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
566
|
-
uses chunkr's own VLM models and is not configurable via LLM processing
|
567
|
-
configuration.
|
568
|
-
- `extended_context` uses the full page image as context for LLM generation.
|
569
|
-
|
570
|
-
**Deprecated fields (for backwards compatibility):**
|
571
|
-
|
572
|
-
- `llm` - **DEPRECATED**: Use `description` instead
|
573
|
-
- `embed_sources` - **DEPRECATED**: Embed field is auto-populated
|
574
|
-
- `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
|
575
|
-
- `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
|
576
|
-
"""
|
577
|
-
|
578
|
-
footnote: Annotated[Optional[SegmentProcessingFootnote], PropertyInfo(alias="Footnote")]
|
579
|
-
"""Controls the processing and generation for the segment.
|
580
|
-
|
581
|
-
- `crop_image` controls whether to crop the file's images to the segment's
|
582
|
-
bounding box. The cropped image will be stored in the segment's `image` field.
|
583
|
-
Use `All` to always crop, or `Auto` to only crop when needed for
|
584
|
-
post-processing.
|
585
|
-
- `format` specifies the output format: `Html` or `Markdown`
|
586
|
-
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
587
|
-
- `Auto`: Process content automatically
|
588
|
-
- `LLM`: Use large language models for processing
|
589
|
-
- `Ignore`: Exclude segments from final output
|
590
|
-
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
591
|
-
uses chunkr's own VLM models and is not configurable via LLM processing
|
592
|
-
configuration.
|
593
|
-
- `extended_context` uses the full page image as context for LLM generation.
|
594
|
-
|
595
|
-
**Deprecated fields (for backwards compatibility):**
|
596
|
-
|
597
|
-
- `llm` - **DEPRECATED**: Use `description` instead
|
598
|
-
- `embed_sources` - **DEPRECATED**: Embed field is auto-populated
|
599
|
-
- `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
|
600
|
-
- `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
|
601
|
-
"""
|
602
|
-
|
603
|
-
formula: Annotated[Optional[SegmentProcessingFormula], PropertyInfo(alias="Formula")]
|
604
|
-
"""Controls the processing and generation for the segment.
|
605
|
-
|
606
|
-
- `crop_image` controls whether to crop the file's images to the segment's
|
607
|
-
bounding box. The cropped image will be stored in the segment's `image` field.
|
608
|
-
Use `All` to always crop, or `Auto` to only crop when needed for
|
609
|
-
post-processing.
|
610
|
-
- `format` specifies the output format: `Html` or `Markdown`
|
611
|
-
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
612
|
-
- `Auto`: Process content automatically
|
613
|
-
- `LLM`: Use large language models for processing
|
614
|
-
- `Ignore`: Exclude segments from final output
|
615
|
-
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
616
|
-
uses chunkr's own VLM models and is not configurable via LLM processing
|
617
|
-
configuration.
|
618
|
-
- `extended_context` uses the full page image as context for LLM generation.
|
619
|
-
|
620
|
-
**Deprecated fields (for backwards compatibility):**
|
621
|
-
|
622
|
-
- `llm` - **DEPRECATED**: Use `description` instead
|
623
|
-
- `embed_sources` - **DEPRECATED**: Embed field is auto-populated
|
624
|
-
- `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
|
625
|
-
- `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
|
626
|
-
"""
|
627
|
-
|
628
|
-
list_item: Annotated[Optional[SegmentProcessingListItem], PropertyInfo(alias="ListItem")]
|
629
|
-
"""Controls the processing and generation for the segment.
|
630
|
-
|
631
|
-
- `crop_image` controls whether to crop the file's images to the segment's
|
632
|
-
bounding box. The cropped image will be stored in the segment's `image` field.
|
633
|
-
Use `All` to always crop, or `Auto` to only crop when needed for
|
634
|
-
post-processing.
|
635
|
-
- `format` specifies the output format: `Html` or `Markdown`
|
636
|
-
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
637
|
-
- `Auto`: Process content automatically
|
638
|
-
- `LLM`: Use large language models for processing
|
639
|
-
- `Ignore`: Exclude segments from final output
|
640
|
-
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
641
|
-
uses chunkr's own VLM models and is not configurable via LLM processing
|
642
|
-
configuration.
|
643
|
-
- `extended_context` uses the full page image as context for LLM generation.
|
644
|
-
|
645
|
-
**Deprecated fields (for backwards compatibility):**
|
646
|
-
|
647
|
-
- `llm` - **DEPRECATED**: Use `description` instead
|
648
|
-
- `embed_sources` - **DEPRECATED**: Embed field is auto-populated
|
649
|
-
- `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
|
650
|
-
- `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
|
651
|
-
"""
|
652
|
-
|
653
|
-
page: Annotated[Optional[SegmentProcessingPage], PropertyInfo(alias="Page")]
|
654
|
-
"""Controls the processing and generation for the segment.
|
655
|
-
|
656
|
-
- `crop_image` controls whether to crop the file's images to the segment's
|
657
|
-
bounding box. The cropped image will be stored in the segment's `image` field.
|
658
|
-
Use `All` to always crop, or `Auto` to only crop when needed for
|
659
|
-
post-processing.
|
660
|
-
- `format` specifies the output format: `Html` or `Markdown`
|
661
|
-
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
662
|
-
- `Auto`: Process content automatically
|
663
|
-
- `LLM`: Use large language models for processing
|
664
|
-
- `Ignore`: Exclude segments from final output
|
665
|
-
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
666
|
-
uses chunkr's own VLM models and is not configurable via LLM processing
|
667
|
-
configuration.
|
668
|
-
- `extended_context` uses the full page image as context for LLM generation.
|
669
|
-
|
670
|
-
**Deprecated fields (for backwards compatibility):**
|
671
|
-
|
672
|
-
- `llm` - **DEPRECATED**: Use `description` instead
|
673
|
-
- `embed_sources` - **DEPRECATED**: Embed field is auto-populated
|
674
|
-
- `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
|
675
|
-
- `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
|
676
|
-
"""
|
677
|
-
|
678
|
-
page_footer: Annotated[Optional[SegmentProcessingPageFooter], PropertyInfo(alias="PageFooter")]
|
679
|
-
"""Controls the processing and generation for the segment.
|
680
|
-
|
681
|
-
- `crop_image` controls whether to crop the file's images to the segment's
|
682
|
-
bounding box. The cropped image will be stored in the segment's `image` field.
|
683
|
-
Use `All` to always crop, or `Auto` to only crop when needed for
|
684
|
-
post-processing.
|
685
|
-
- `format` specifies the output format: `Html` or `Markdown`
|
686
|
-
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
687
|
-
- `Auto`: Process content automatically
|
688
|
-
- `LLM`: Use large language models for processing
|
689
|
-
- `Ignore`: Exclude segments from final output
|
690
|
-
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
691
|
-
uses chunkr's own VLM models and is not configurable via LLM processing
|
692
|
-
configuration.
|
693
|
-
- `extended_context` uses the full page image as context for LLM generation.
|
694
|
-
|
695
|
-
**Deprecated fields (for backwards compatibility):**
|
696
|
-
|
697
|
-
- `llm` - **DEPRECATED**: Use `description` instead
|
698
|
-
- `embed_sources` - **DEPRECATED**: Embed field is auto-populated
|
699
|
-
- `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
|
700
|
-
- `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
|
701
|
-
"""
|
702
|
-
|
703
|
-
page_header: Annotated[Optional[SegmentProcessingPageHeader], PropertyInfo(alias="PageHeader")]
|
704
|
-
"""Controls the processing and generation for the segment.
|
705
|
-
|
706
|
-
- `crop_image` controls whether to crop the file's images to the segment's
|
707
|
-
bounding box. The cropped image will be stored in the segment's `image` field.
|
708
|
-
Use `All` to always crop, or `Auto` to only crop when needed for
|
709
|
-
post-processing.
|
710
|
-
- `format` specifies the output format: `Html` or `Markdown`
|
711
|
-
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
712
|
-
- `Auto`: Process content automatically
|
713
|
-
- `LLM`: Use large language models for processing
|
714
|
-
- `Ignore`: Exclude segments from final output
|
715
|
-
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
716
|
-
uses chunkr's own VLM models and is not configurable via LLM processing
|
717
|
-
configuration.
|
718
|
-
- `extended_context` uses the full page image as context for LLM generation.
|
719
|
-
|
720
|
-
**Deprecated fields (for backwards compatibility):**
|
721
|
-
|
722
|
-
- `llm` - **DEPRECATED**: Use `description` instead
|
723
|
-
- `embed_sources` - **DEPRECATED**: Embed field is auto-populated
|
724
|
-
- `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
|
725
|
-
- `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
|
726
|
-
"""
|
727
|
-
|
728
|
-
picture: Annotated[Optional[SegmentProcessingPicture], PropertyInfo(alias="Picture")]
|
729
|
-
"""Controls the processing and generation for the segment.
|
730
|
-
|
731
|
-
- `crop_image` controls whether to crop the file's images to the segment's
|
732
|
-
bounding box. The cropped image will be stored in the segment's `image` field.
|
733
|
-
Use `All` to always crop, or `Auto` to only crop when needed for
|
734
|
-
post-processing.
|
735
|
-
- `format` specifies the output format: `Html` or `Markdown`
|
736
|
-
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
737
|
-
- `Auto`: Process content automatically
|
738
|
-
- `LLM`: Use large language models for processing
|
739
|
-
- `Ignore`: Exclude segments from final output
|
740
|
-
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
741
|
-
uses chunkr's own VLM models and is not configurable via LLM processing
|
742
|
-
configuration.
|
743
|
-
- `extended_context` uses the full page image as context for LLM generation.
|
744
|
-
|
745
|
-
**Deprecated fields (for backwards compatibility):**
|
746
|
-
|
747
|
-
- `llm` - **DEPRECATED**: Use `description` instead
|
748
|
-
- `embed_sources` - **DEPRECATED**: Embed field is auto-populated
|
749
|
-
- `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
|
750
|
-
- `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
|
751
|
-
"""
|
752
|
-
|
753
|
-
section_header: Annotated[Optional[SegmentProcessingSectionHeader], PropertyInfo(alias="SectionHeader")]
|
754
|
-
"""Controls the processing and generation for the segment.
|
755
|
-
|
756
|
-
- `crop_image` controls whether to crop the file's images to the segment's
|
757
|
-
bounding box. The cropped image will be stored in the segment's `image` field.
|
758
|
-
Use `All` to always crop, or `Auto` to only crop when needed for
|
759
|
-
post-processing.
|
760
|
-
- `format` specifies the output format: `Html` or `Markdown`
|
761
|
-
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
762
|
-
- `Auto`: Process content automatically
|
763
|
-
- `LLM`: Use large language models for processing
|
764
|
-
- `Ignore`: Exclude segments from final output
|
765
|
-
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
766
|
-
uses chunkr's own VLM models and is not configurable via LLM processing
|
767
|
-
configuration.
|
768
|
-
- `extended_context` uses the full page image as context for LLM generation.
|
769
|
-
|
770
|
-
**Deprecated fields (for backwards compatibility):**
|
771
|
-
|
772
|
-
- `llm` - **DEPRECATED**: Use `description` instead
|
773
|
-
- `embed_sources` - **DEPRECATED**: Embed field is auto-populated
|
774
|
-
- `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
|
775
|
-
- `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
|
776
|
-
"""
|
777
|
-
|
778
|
-
table: Annotated[Optional[SegmentProcessingTable], PropertyInfo(alias="Table")]
|
779
|
-
"""Controls the processing and generation for the segment.
|
780
|
-
|
781
|
-
- `crop_image` controls whether to crop the file's images to the segment's
|
782
|
-
bounding box. The cropped image will be stored in the segment's `image` field.
|
783
|
-
Use `All` to always crop, or `Auto` to only crop when needed for
|
784
|
-
post-processing.
|
785
|
-
- `format` specifies the output format: `Html` or `Markdown`
|
786
|
-
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
787
|
-
- `Auto`: Process content automatically
|
788
|
-
- `LLM`: Use large language models for processing
|
789
|
-
- `Ignore`: Exclude segments from final output
|
790
|
-
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
791
|
-
uses chunkr's own VLM models and is not configurable via LLM processing
|
792
|
-
configuration.
|
793
|
-
- `extended_context` uses the full page image as context for LLM generation.
|
794
|
-
|
795
|
-
**Deprecated fields (for backwards compatibility):**
|
796
|
-
|
797
|
-
- `llm` - **DEPRECATED**: Use `description` instead
|
798
|
-
- `embed_sources` - **DEPRECATED**: Embed field is auto-populated
|
799
|
-
- `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
|
800
|
-
- `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
|
801
|
-
"""
|
802
|
-
|
803
|
-
text: Annotated[Optional[SegmentProcessingText], PropertyInfo(alias="Text")]
|
804
|
-
"""Controls the processing and generation for the segment.
|
805
|
-
|
806
|
-
- `crop_image` controls whether to crop the file's images to the segment's
|
807
|
-
bounding box. The cropped image will be stored in the segment's `image` field.
|
808
|
-
Use `All` to always crop, or `Auto` to only crop when needed for
|
809
|
-
post-processing.
|
810
|
-
- `format` specifies the output format: `Html` or `Markdown`
|
811
|
-
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
812
|
-
- `Auto`: Process content automatically
|
813
|
-
- `LLM`: Use large language models for processing
|
814
|
-
- `Ignore`: Exclude segments from final output
|
815
|
-
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
816
|
-
uses chunkr's own VLM models and is not configurable via LLM processing
|
817
|
-
configuration.
|
818
|
-
- `extended_context` uses the full page image as context for LLM generation.
|
819
|
-
|
820
|
-
**Deprecated fields (for backwards compatibility):**
|
821
|
-
|
822
|
-
- `llm` - **DEPRECATED**: Use `description` instead
|
823
|
-
- `embed_sources` - **DEPRECATED**: Embed field is auto-populated
|
824
|
-
- `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
|
825
|
-
- `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
|
826
|
-
"""
|
827
|
-
|
828
|
-
title: Annotated[Optional[SegmentProcessingTitle], PropertyInfo(alias="Title")]
|
829
|
-
"""Controls the processing and generation for the segment.
|
830
|
-
|
831
|
-
- `crop_image` controls whether to crop the file's images to the segment's
|
832
|
-
bounding box. The cropped image will be stored in the segment's `image` field.
|
833
|
-
Use `All` to always crop, or `Auto` to only crop when needed for
|
834
|
-
post-processing.
|
835
|
-
- `format` specifies the output format: `Html` or `Markdown`
|
836
|
-
- `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
|
837
|
-
- `Auto`: Process content automatically
|
838
|
-
- `LLM`: Use large language models for processing
|
839
|
-
- `Ignore`: Exclude segments from final output
|
840
|
-
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
841
|
-
uses chunkr's own VLM models and is not configurable via LLM processing
|
842
|
-
configuration.
|
843
|
-
- `extended_context` uses the full page image as context for LLM generation.
|
844
|
-
|
845
|
-
**Deprecated fields (for backwards compatibility):**
|
846
|
-
|
847
|
-
- `llm` - **DEPRECATED**: Use `description` instead
|
848
|
-
- `embed_sources` - **DEPRECATED**: Embed field is auto-populated
|
849
|
-
- `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
|
850
|
-
- `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
|
851
|
-
"""
|