chunkr-ai 0.1.0a6__py3-none-any.whl → 0.1.0a7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. chunkr_ai/__init__.py +2 -0
  2. chunkr_ai/_client.py +31 -3
  3. chunkr_ai/_constants.py +5 -5
  4. chunkr_ai/_exceptions.py +4 -0
  5. chunkr_ai/_models.py +1 -1
  6. chunkr_ai/_types.py +35 -1
  7. chunkr_ai/_utils/__init__.py +1 -0
  8. chunkr_ai/_utils/_typing.py +5 -0
  9. chunkr_ai/_version.py +1 -1
  10. chunkr_ai/resources/__init__.py +14 -0
  11. chunkr_ai/resources/files.py +3 -3
  12. chunkr_ai/resources/tasks/__init__.py +14 -0
  13. chunkr_ai/resources/tasks/extract.py +409 -0
  14. chunkr_ai/resources/tasks/parse.py +124 -284
  15. chunkr_ai/resources/tasks/tasks.py +62 -14
  16. chunkr_ai/resources/webhooks.py +193 -0
  17. chunkr_ai/types/__init__.py +27 -1
  18. chunkr_ai/types/bounding_box.py +19 -0
  19. chunkr_ai/types/cell.py +39 -0
  20. chunkr_ai/types/cell_style.py +28 -0
  21. chunkr_ai/types/chunk.py +40 -0
  22. chunkr_ai/types/chunk_processing.py +40 -0
  23. chunkr_ai/types/chunk_processing_param.py +42 -0
  24. chunkr_ai/types/extract_configuration.py +24 -0
  25. chunkr_ai/types/extract_output_response.py +19 -0
  26. chunkr_ai/types/file_create_params.py +2 -1
  27. chunkr_ai/types/file_info.py +21 -0
  28. chunkr_ai/types/generation_config.py +29 -0
  29. chunkr_ai/types/generation_config_param.py +29 -0
  30. chunkr_ai/types/llm_processing.py +36 -0
  31. chunkr_ai/types/llm_processing_param.py +36 -0
  32. chunkr_ai/types/ocr_result.py +28 -0
  33. chunkr_ai/types/page.py +27 -0
  34. chunkr_ai/types/parse_configuration.py +64 -0
  35. chunkr_ai/types/parse_configuration_param.py +65 -0
  36. chunkr_ai/types/parse_output_response.py +29 -0
  37. chunkr_ai/types/segment.py +109 -0
  38. chunkr_ai/types/segment_processing.py +228 -0
  39. chunkr_ai/types/segment_processing_param.py +229 -0
  40. chunkr_ai/types/task_extract_updated_webhook_event.py +22 -0
  41. chunkr_ai/types/task_list_params.py +7 -1
  42. chunkr_ai/types/task_parse_updated_webhook_event.py +22 -0
  43. chunkr_ai/types/task_response.py +68 -0
  44. chunkr_ai/types/tasks/__init__.py +7 -1
  45. chunkr_ai/types/tasks/extract_create_params.py +47 -0
  46. chunkr_ai/types/tasks/extract_create_response.py +214 -0
  47. chunkr_ai/types/tasks/extract_get_params.py +21 -0
  48. chunkr_ai/types/tasks/extract_get_response.py +214 -0
  49. chunkr_ai/types/tasks/parse_create_params.py +25 -793
  50. chunkr_ai/types/tasks/parse_create_response.py +55 -0
  51. chunkr_ai/types/tasks/parse_get_params.py +21 -0
  52. chunkr_ai/types/tasks/parse_get_response.py +55 -0
  53. chunkr_ai/types/unwrap_webhook_event.py +11 -0
  54. chunkr_ai/types/version_info.py +31 -0
  55. chunkr_ai/types/webhook_url_response.py +9 -0
  56. {chunkr_ai-0.1.0a6.dist-info → chunkr_ai-0.1.0a7.dist-info}/METADATA +14 -13
  57. chunkr_ai-0.1.0a7.dist-info/RECORD +86 -0
  58. chunkr_ai/types/task.py +0 -1225
  59. chunkr_ai/types/tasks/parse_update_params.py +0 -845
  60. chunkr_ai-0.1.0a6.dist-info/RECORD +0 -52
  61. {chunkr_ai-0.1.0a6.dist-info → chunkr_ai-0.1.0a7.dist-info}/WHEEL +0 -0
  62. {chunkr_ai-0.1.0a6.dist-info → chunkr_ai-0.1.0a7.dist-info}/licenses/LICENSE +0 -0
@@ -2,39 +2,19 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- from typing import List, Union, Optional
6
- from typing_extensions import Literal, Required, Annotated, TypeAlias, TypedDict
5
+ from typing import Optional
6
+ from typing_extensions import Literal, Required, TypedDict
7
7
 
8
- from ..._utils import PropertyInfo
8
+ from ..llm_processing_param import LlmProcessingParam
9
+ from ..chunk_processing_param import ChunkProcessingParam
10
+ from ..segment_processing_param import SegmentProcessingParam
9
11
 
10
- __all__ = [
11
- "ParseCreateParams",
12
- "ChunkProcessing",
13
- "ChunkProcessingTokenizer",
14
- "ChunkProcessingTokenizerEnum",
15
- "ChunkProcessingTokenizerString",
16
- "LlmProcessing",
17
- "LlmProcessingFallbackStrategy",
18
- "LlmProcessingFallbackStrategyModel",
19
- "SegmentProcessing",
20
- "SegmentProcessingCaption",
21
- "SegmentProcessingFootnote",
22
- "SegmentProcessingFormula",
23
- "SegmentProcessingListItem",
24
- "SegmentProcessingPage",
25
- "SegmentProcessingPageFooter",
26
- "SegmentProcessingPageHeader",
27
- "SegmentProcessingPicture",
28
- "SegmentProcessingSectionHeader",
29
- "SegmentProcessingTable",
30
- "SegmentProcessingText",
31
- "SegmentProcessingTitle",
32
- ]
12
+ __all__ = ["ParseCreateParams"]
33
13
 
34
14
 
35
15
  class ParseCreateParams(TypedDict, total=False):
36
16
  file: Required[str]
37
- """The file to be uploaded. Supported inputs:
17
+ """The file to be parsed. Supported inputs:
38
18
 
39
19
  - `ch://files/{file_id}`: Reference to an existing file. Upload via the Files
40
20
  API
@@ -42,10 +22,10 @@ class ParseCreateParams(TypedDict, total=False):
42
22
  - `data:*;base64,...` or raw base64 string
43
23
  """
44
24
 
45
- chunk_processing: Optional[ChunkProcessing]
25
+ chunk_processing: ChunkProcessingParam
46
26
  """Controls the setting for the chunking and post-processing of each chunk."""
47
27
 
48
- error_handling: Optional[Literal["Fail", "Continue"]]
28
+ error_handling: Literal["Fail", "Continue"]
49
29
  """Controls how errors are handled during processing:
50
30
 
51
31
  - `Fail`: Stops processing and fails the task when any error occurs
@@ -60,12 +40,12 @@ class ParseCreateParams(TypedDict, total=False):
60
40
  """
61
41
 
62
42
  file_name: Optional[str]
63
- """The name of the file to be uploaded. If not set a name will be generated."""
43
+ """The name of the file to be parsed. If not set a name will be generated."""
64
44
 
65
- llm_processing: Optional[LlmProcessing]
45
+ llm_processing: LlmProcessingParam
66
46
  """Controls the LLM used for the task."""
67
47
 
68
- ocr_strategy: Optional[Literal["All", "Auto"]]
48
+ ocr_strategy: Literal["All", "Auto"]
69
49
  """Controls the Optical Character Recognition (OCR) strategy.
70
50
 
71
51
  - `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
@@ -74,778 +54,30 @@ class ParseCreateParams(TypedDict, total=False):
74
54
  used.
75
55
  """
76
56
 
77
- pipeline: Optional[Literal["Azure", "Chunkr"]]
78
- """
79
- Choose the provider whose models will be used for segmentation and OCR. The
80
- output will be unified to the Chunkr `output` format.
81
- """
82
-
83
- segment_processing: Optional[SegmentProcessing]
84
- """Defines how each segment type is handled when generating the final output.
85
-
86
- Each segment uses one of three strategies. The chosen strategy controls:
57
+ pipeline: Literal["Azure", "Chunkr"]
87
58
 
88
- - Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
89
- - How the content is produced (rule-based vs. LLM).
90
- - The output format (`Html` or `Markdown`).
59
+ segment_processing: Optional[SegmentProcessingParam]
60
+ """Configuration for how each document segment is processed and formatted.
91
61
 
92
- Optional flags such as image **cropping**, **extended context**, and
93
- **descriptions** further refine behaviour.
62
+ Each segment has sensible defaults, but you can override specific settings:
94
63
 
95
- **Default strategy per segment**
64
+ - `format`: Output as `Html` or `Markdown`
65
+ - `strategy`: `Auto` (rule-based), `LLM` (AI-generated), or `Ignore` (skip)
66
+ - `crop_image`: Whether to crop images to segment bounds
67
+ - `extended_context`: Use full page as context for LLM processing
68
+ - `description`: Generate descriptions for segments
96
69
 
97
- - `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` **Auto**
98
- (Markdown, description off)
99
- - `Table` → **LLM** (HTML, description on)
100
- - `Picture` → **LLM** (Markdown, description off, cropping _All_)
101
- - `Formula`, `Page` → **LLM** (Markdown, description off)
102
- - `PageHeader`, `PageFooter` → **Ignore** (removed from output)
70
+ **Defaults per segment type:** Check the documentation for more details.
103
71
 
104
- **Strategy reference**
105
-
106
- - **Auto** – rule-based content generation.
107
- - **LLM** – generate content with an LLM.
108
- - **Ignore** – exclude the segment entirely.
72
+ Only specify the fields you want to change - everything else uses the defaults.
109
73
  """
110
74
 
111
- segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]]
75
+ segmentation_strategy: Literal["LayoutAnalysis", "Page"]
112
76
  """Controls the segmentation strategy:
113
77
 
114
78
  - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
115
79
  `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
116
- segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
80
+ segmentation and better chunking.
117
81
  - `Page`: Treats each page as a single segment. Faster processing, but without
118
82
  layout element detection and only simple chunking.
119
83
  """
120
-
121
-
122
- class ChunkProcessingTokenizerEnum(TypedDict, total=False):
123
- enum: Required[
124
- Annotated[Literal["Word", "Cl100kBase", "XlmRobertaBase", "BertBaseUncased"], PropertyInfo(alias="Enum")]
125
- ]
126
- """Use one of the predefined tokenizer types"""
127
-
128
-
129
- class ChunkProcessingTokenizerString(TypedDict, total=False):
130
- string: Required[Annotated[str, PropertyInfo(alias="String")]]
131
- """
132
- Use any Hugging Face tokenizer by specifying its model ID Examples:
133
- "Qwen/Qwen-tokenizer", "facebook/bart-large"
134
- """
135
-
136
-
137
- ChunkProcessingTokenizer: TypeAlias = Union[ChunkProcessingTokenizerEnum, ChunkProcessingTokenizerString]
138
-
139
-
140
- class ChunkProcessing(TypedDict, total=False):
141
- ignore_headers_and_footers: Optional[bool]
142
- """DEPRECATED: use `segment_processing.ignore` instead"""
143
-
144
- target_length: int
145
- """The target number of words in each chunk.
146
-
147
- If 0, each chunk will contain a single segment.
148
- """
149
-
150
- tokenizer: ChunkProcessingTokenizer
151
- """The tokenizer to use for the chunking process."""
152
-
153
-
154
- class LlmProcessingFallbackStrategyModel(TypedDict, total=False):
155
- model: Required[Annotated[str, PropertyInfo(alias="Model")]]
156
- """Use a specific model as fallback"""
157
-
158
-
159
- LlmProcessingFallbackStrategy: TypeAlias = Union[Literal["None", "Default"], LlmProcessingFallbackStrategyModel]
160
-
161
-
162
- class LlmProcessing(TypedDict, total=False):
163
- fallback_strategy: LlmProcessingFallbackStrategy
164
- """The fallback strategy to use for the LLMs in the task."""
165
-
166
- llm_model_id: Optional[str]
167
- """The ID of the model to use for the task.
168
-
169
- If not provided, the default model will be used. Please check the documentation
170
- for the model you want to use.
171
- """
172
-
173
- max_completion_tokens: Optional[int]
174
- """The maximum number of tokens to generate."""
175
-
176
- temperature: float
177
- """The temperature to use for the LLM."""
178
-
179
-
180
- class SegmentProcessingCaption(TypedDict, total=False):
181
- crop_image: Literal["All", "Auto"]
182
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
183
-
184
- - `All` crops all images in the item
185
- - `Auto` crops images only if required for post-processing
186
- """
187
-
188
- description: bool
189
- """Generate LLM descriptions for this segment"""
190
-
191
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
192
- """**DEPRECATED**: `embed` field is auto populated"""
193
-
194
- extended_context: bool
195
- """Use the full page image as context for LLM generation"""
196
-
197
- format: Literal["Html", "Markdown"]
198
-
199
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
200
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
201
-
202
- llm: Optional[str]
203
- """**DEPRECATED**: use description instead"""
204
-
205
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
206
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
207
-
208
- strategy: Literal["LLM", "Auto", "Ignore"]
209
-
210
-
211
- class SegmentProcessingFootnote(TypedDict, total=False):
212
- crop_image: Literal["All", "Auto"]
213
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
214
-
215
- - `All` crops all images in the item
216
- - `Auto` crops images only if required for post-processing
217
- """
218
-
219
- description: bool
220
- """Generate LLM descriptions for this segment"""
221
-
222
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
223
- """**DEPRECATED**: `embed` field is auto populated"""
224
-
225
- extended_context: bool
226
- """Use the full page image as context for LLM generation"""
227
-
228
- format: Literal["Html", "Markdown"]
229
-
230
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
231
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
232
-
233
- llm: Optional[str]
234
- """**DEPRECATED**: use description instead"""
235
-
236
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
237
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
238
-
239
- strategy: Literal["LLM", "Auto", "Ignore"]
240
-
241
-
242
- class SegmentProcessingFormula(TypedDict, total=False):
243
- crop_image: Literal["All", "Auto"]
244
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
245
-
246
- - `All` crops all images in the item
247
- - `Auto` crops images only if required for post-processing
248
- """
249
-
250
- description: bool
251
- """Generate LLM descriptions for this segment"""
252
-
253
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
254
- """**DEPRECATED**: `embed` field is auto populated"""
255
-
256
- extended_context: bool
257
- """Use the full page image as context for LLM generation"""
258
-
259
- format: Literal["Html", "Markdown"]
260
-
261
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
262
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
263
-
264
- llm: Optional[str]
265
- """**DEPRECATED**: use description instead"""
266
-
267
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
268
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
269
-
270
- strategy: Literal["LLM", "Auto", "Ignore"]
271
-
272
-
273
- class SegmentProcessingListItem(TypedDict, total=False):
274
- crop_image: Literal["All", "Auto"]
275
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
276
-
277
- - `All` crops all images in the item
278
- - `Auto` crops images only if required for post-processing
279
- """
280
-
281
- description: bool
282
- """Generate LLM descriptions for this segment"""
283
-
284
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
285
- """**DEPRECATED**: `embed` field is auto populated"""
286
-
287
- extended_context: bool
288
- """Use the full page image as context for LLM generation"""
289
-
290
- format: Literal["Html", "Markdown"]
291
-
292
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
293
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
294
-
295
- llm: Optional[str]
296
- """**DEPRECATED**: use description instead"""
297
-
298
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
299
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
300
-
301
- strategy: Literal["LLM", "Auto", "Ignore"]
302
-
303
-
304
- class SegmentProcessingPage(TypedDict, total=False):
305
- crop_image: Literal["All", "Auto"]
306
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
307
-
308
- - `All` crops all images in the item
309
- - `Auto` crops images only if required for post-processing
310
- """
311
-
312
- description: bool
313
- """Generate LLM descriptions for this segment"""
314
-
315
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
316
- """**DEPRECATED**: `embed` field is auto populated"""
317
-
318
- extended_context: bool
319
- """Use the full page image as context for LLM generation"""
320
-
321
- format: Literal["Html", "Markdown"]
322
-
323
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
324
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
325
-
326
- llm: Optional[str]
327
- """**DEPRECATED**: use description instead"""
328
-
329
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
330
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
331
-
332
- strategy: Literal["LLM", "Auto", "Ignore"]
333
-
334
-
335
- class SegmentProcessingPageFooter(TypedDict, total=False):
336
- crop_image: Literal["All", "Auto"]
337
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
338
-
339
- - `All` crops all images in the item
340
- - `Auto` crops images only if required for post-processing
341
- """
342
-
343
- description: bool
344
- """Generate LLM descriptions for this segment"""
345
-
346
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
347
- """**DEPRECATED**: `embed` field is auto populated"""
348
-
349
- extended_context: bool
350
- """Use the full page image as context for LLM generation"""
351
-
352
- format: Literal["Html", "Markdown"]
353
-
354
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
355
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
356
-
357
- llm: Optional[str]
358
- """**DEPRECATED**: use description instead"""
359
-
360
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
361
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
362
-
363
- strategy: Literal["LLM", "Auto", "Ignore"]
364
-
365
-
366
- class SegmentProcessingPageHeader(TypedDict, total=False):
367
- crop_image: Literal["All", "Auto"]
368
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
369
-
370
- - `All` crops all images in the item
371
- - `Auto` crops images only if required for post-processing
372
- """
373
-
374
- description: bool
375
- """Generate LLM descriptions for this segment"""
376
-
377
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
378
- """**DEPRECATED**: `embed` field is auto populated"""
379
-
380
- extended_context: bool
381
- """Use the full page image as context for LLM generation"""
382
-
383
- format: Literal["Html", "Markdown"]
384
-
385
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
386
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
387
-
388
- llm: Optional[str]
389
- """**DEPRECATED**: use description instead"""
390
-
391
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
392
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
393
-
394
- strategy: Literal["LLM", "Auto", "Ignore"]
395
-
396
-
397
- class SegmentProcessingPicture(TypedDict, total=False):
398
- crop_image: Literal["All", "Auto"]
399
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
400
-
401
- - `All` crops all images in the item
402
- - `Auto` crops images only if required for post-processing
403
- """
404
-
405
- description: bool
406
- """Generate LLM descriptions for this segment"""
407
-
408
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
409
- """**DEPRECATED**: `embed` field is auto populated"""
410
-
411
- extended_context: bool
412
- """Use the full page image as context for LLM generation"""
413
-
414
- format: Literal["Html", "Markdown"]
415
-
416
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
417
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
418
-
419
- llm: Optional[str]
420
- """**DEPRECATED**: use description instead"""
421
-
422
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
423
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
424
-
425
- strategy: Literal["LLM", "Auto", "Ignore"]
426
-
427
-
428
- class SegmentProcessingSectionHeader(TypedDict, total=False):
429
- crop_image: Literal["All", "Auto"]
430
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
431
-
432
- - `All` crops all images in the item
433
- - `Auto` crops images only if required for post-processing
434
- """
435
-
436
- description: bool
437
- """Generate LLM descriptions for this segment"""
438
-
439
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
440
- """**DEPRECATED**: `embed` field is auto populated"""
441
-
442
- extended_context: bool
443
- """Use the full page image as context for LLM generation"""
444
-
445
- format: Literal["Html", "Markdown"]
446
-
447
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
448
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
449
-
450
- llm: Optional[str]
451
- """**DEPRECATED**: use description instead"""
452
-
453
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
454
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
455
-
456
- strategy: Literal["LLM", "Auto", "Ignore"]
457
-
458
-
459
- class SegmentProcessingTable(TypedDict, total=False):
460
- crop_image: Literal["All", "Auto"]
461
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
462
-
463
- - `All` crops all images in the item
464
- - `Auto` crops images only if required for post-processing
465
- """
466
-
467
- description: bool
468
- """Generate LLM descriptions for this segment"""
469
-
470
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
471
- """**DEPRECATED**: `embed` field is auto populated"""
472
-
473
- extended_context: bool
474
- """Use the full page image as context for LLM generation"""
475
-
476
- format: Literal["Html", "Markdown"]
477
-
478
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
479
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
480
-
481
- llm: Optional[str]
482
- """**DEPRECATED**: use description instead"""
483
-
484
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
485
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
486
-
487
- strategy: Literal["LLM", "Auto", "Ignore"]
488
-
489
-
490
- class SegmentProcessingText(TypedDict, total=False):
491
- crop_image: Literal["All", "Auto"]
492
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
493
-
494
- - `All` crops all images in the item
495
- - `Auto` crops images only if required for post-processing
496
- """
497
-
498
- description: bool
499
- """Generate LLM descriptions for this segment"""
500
-
501
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
502
- """**DEPRECATED**: `embed` field is auto populated"""
503
-
504
- extended_context: bool
505
- """Use the full page image as context for LLM generation"""
506
-
507
- format: Literal["Html", "Markdown"]
508
-
509
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
510
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
511
-
512
- llm: Optional[str]
513
- """**DEPRECATED**: use description instead"""
514
-
515
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
516
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
517
-
518
- strategy: Literal["LLM", "Auto", "Ignore"]
519
-
520
-
521
- class SegmentProcessingTitle(TypedDict, total=False):
522
- crop_image: Literal["All", "Auto"]
523
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
524
-
525
- - `All` crops all images in the item
526
- - `Auto` crops images only if required for post-processing
527
- """
528
-
529
- description: bool
530
- """Generate LLM descriptions for this segment"""
531
-
532
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
533
- """**DEPRECATED**: `embed` field is auto populated"""
534
-
535
- extended_context: bool
536
- """Use the full page image as context for LLM generation"""
537
-
538
- format: Literal["Html", "Markdown"]
539
-
540
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
541
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
542
-
543
- llm: Optional[str]
544
- """**DEPRECATED**: use description instead"""
545
-
546
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
547
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
548
-
549
- strategy: Literal["LLM", "Auto", "Ignore"]
550
-
551
-
552
- class SegmentProcessing(TypedDict, total=False):
553
- caption: Annotated[Optional[SegmentProcessingCaption], PropertyInfo(alias="Caption")]
554
- """Controls the processing and generation for the segment.
555
-
556
- - `crop_image` controls whether to crop the file's images to the segment's
557
- bounding box. The cropped image will be stored in the segment's `image` field.
558
- Use `All` to always crop, or `Auto` to only crop when needed for
559
- post-processing.
560
- - `format` specifies the output format: `Html` or `Markdown`
561
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
562
- - `Auto`: Process content automatically
563
- - `LLM`: Use large language models for processing
564
- - `Ignore`: Exclude segments from final output
565
- - `description` enables LLM-generated descriptions for segments. **Note:** This
566
- uses chunkr's own VLM models and is not configurable via LLM processing
567
- configuration.
568
- - `extended_context` uses the full page image as context for LLM generation.
569
-
570
- **Deprecated fields (for backwards compatibility):**
571
-
572
- - `llm` - **DEPRECATED**: Use `description` instead
573
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
574
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
575
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
576
- """
577
-
578
- footnote: Annotated[Optional[SegmentProcessingFootnote], PropertyInfo(alias="Footnote")]
579
- """Controls the processing and generation for the segment.
580
-
581
- - `crop_image` controls whether to crop the file's images to the segment's
582
- bounding box. The cropped image will be stored in the segment's `image` field.
583
- Use `All` to always crop, or `Auto` to only crop when needed for
584
- post-processing.
585
- - `format` specifies the output format: `Html` or `Markdown`
586
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
587
- - `Auto`: Process content automatically
588
- - `LLM`: Use large language models for processing
589
- - `Ignore`: Exclude segments from final output
590
- - `description` enables LLM-generated descriptions for segments. **Note:** This
591
- uses chunkr's own VLM models and is not configurable via LLM processing
592
- configuration.
593
- - `extended_context` uses the full page image as context for LLM generation.
594
-
595
- **Deprecated fields (for backwards compatibility):**
596
-
597
- - `llm` - **DEPRECATED**: Use `description` instead
598
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
599
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
600
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
601
- """
602
-
603
- formula: Annotated[Optional[SegmentProcessingFormula], PropertyInfo(alias="Formula")]
604
- """Controls the processing and generation for the segment.
605
-
606
- - `crop_image` controls whether to crop the file's images to the segment's
607
- bounding box. The cropped image will be stored in the segment's `image` field.
608
- Use `All` to always crop, or `Auto` to only crop when needed for
609
- post-processing.
610
- - `format` specifies the output format: `Html` or `Markdown`
611
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
612
- - `Auto`: Process content automatically
613
- - `LLM`: Use large language models for processing
614
- - `Ignore`: Exclude segments from final output
615
- - `description` enables LLM-generated descriptions for segments. **Note:** This
616
- uses chunkr's own VLM models and is not configurable via LLM processing
617
- configuration.
618
- - `extended_context` uses the full page image as context for LLM generation.
619
-
620
- **Deprecated fields (for backwards compatibility):**
621
-
622
- - `llm` - **DEPRECATED**: Use `description` instead
623
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
624
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
625
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
626
- """
627
-
628
- list_item: Annotated[Optional[SegmentProcessingListItem], PropertyInfo(alias="ListItem")]
629
- """Controls the processing and generation for the segment.
630
-
631
- - `crop_image` controls whether to crop the file's images to the segment's
632
- bounding box. The cropped image will be stored in the segment's `image` field.
633
- Use `All` to always crop, or `Auto` to only crop when needed for
634
- post-processing.
635
- - `format` specifies the output format: `Html` or `Markdown`
636
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
637
- - `Auto`: Process content automatically
638
- - `LLM`: Use large language models for processing
639
- - `Ignore`: Exclude segments from final output
640
- - `description` enables LLM-generated descriptions for segments. **Note:** This
641
- uses chunkr's own VLM models and is not configurable via LLM processing
642
- configuration.
643
- - `extended_context` uses the full page image as context for LLM generation.
644
-
645
- **Deprecated fields (for backwards compatibility):**
646
-
647
- - `llm` - **DEPRECATED**: Use `description` instead
648
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
649
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
650
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
651
- """
652
-
653
- page: Annotated[Optional[SegmentProcessingPage], PropertyInfo(alias="Page")]
654
- """Controls the processing and generation for the segment.
655
-
656
- - `crop_image` controls whether to crop the file's images to the segment's
657
- bounding box. The cropped image will be stored in the segment's `image` field.
658
- Use `All` to always crop, or `Auto` to only crop when needed for
659
- post-processing.
660
- - `format` specifies the output format: `Html` or `Markdown`
661
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
662
- - `Auto`: Process content automatically
663
- - `LLM`: Use large language models for processing
664
- - `Ignore`: Exclude segments from final output
665
- - `description` enables LLM-generated descriptions for segments. **Note:** This
666
- uses chunkr's own VLM models and is not configurable via LLM processing
667
- configuration.
668
- - `extended_context` uses the full page image as context for LLM generation.
669
-
670
- **Deprecated fields (for backwards compatibility):**
671
-
672
- - `llm` - **DEPRECATED**: Use `description` instead
673
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
674
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
675
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
676
- """
677
-
678
- page_footer: Annotated[Optional[SegmentProcessingPageFooter], PropertyInfo(alias="PageFooter")]
679
- """Controls the processing and generation for the segment.
680
-
681
- - `crop_image` controls whether to crop the file's images to the segment's
682
- bounding box. The cropped image will be stored in the segment's `image` field.
683
- Use `All` to always crop, or `Auto` to only crop when needed for
684
- post-processing.
685
- - `format` specifies the output format: `Html` or `Markdown`
686
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
687
- - `Auto`: Process content automatically
688
- - `LLM`: Use large language models for processing
689
- - `Ignore`: Exclude segments from final output
690
- - `description` enables LLM-generated descriptions for segments. **Note:** This
691
- uses chunkr's own VLM models and is not configurable via LLM processing
692
- configuration.
693
- - `extended_context` uses the full page image as context for LLM generation.
694
-
695
- **Deprecated fields (for backwards compatibility):**
696
-
697
- - `llm` - **DEPRECATED**: Use `description` instead
698
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
699
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
700
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
701
- """
702
-
703
- page_header: Annotated[Optional[SegmentProcessingPageHeader], PropertyInfo(alias="PageHeader")]
704
- """Controls the processing and generation for the segment.
705
-
706
- - `crop_image` controls whether to crop the file's images to the segment's
707
- bounding box. The cropped image will be stored in the segment's `image` field.
708
- Use `All` to always crop, or `Auto` to only crop when needed for
709
- post-processing.
710
- - `format` specifies the output format: `Html` or `Markdown`
711
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
712
- - `Auto`: Process content automatically
713
- - `LLM`: Use large language models for processing
714
- - `Ignore`: Exclude segments from final output
715
- - `description` enables LLM-generated descriptions for segments. **Note:** This
716
- uses chunkr's own VLM models and is not configurable via LLM processing
717
- configuration.
718
- - `extended_context` uses the full page image as context for LLM generation.
719
-
720
- **Deprecated fields (for backwards compatibility):**
721
-
722
- - `llm` - **DEPRECATED**: Use `description` instead
723
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
724
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
725
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
726
- """
727
-
728
- picture: Annotated[Optional[SegmentProcessingPicture], PropertyInfo(alias="Picture")]
729
- """Controls the processing and generation for the segment.
730
-
731
- - `crop_image` controls whether to crop the file's images to the segment's
732
- bounding box. The cropped image will be stored in the segment's `image` field.
733
- Use `All` to always crop, or `Auto` to only crop when needed for
734
- post-processing.
735
- - `format` specifies the output format: `Html` or `Markdown`
736
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
737
- - `Auto`: Process content automatically
738
- - `LLM`: Use large language models for processing
739
- - `Ignore`: Exclude segments from final output
740
- - `description` enables LLM-generated descriptions for segments. **Note:** This
741
- uses chunkr's own VLM models and is not configurable via LLM processing
742
- configuration.
743
- - `extended_context` uses the full page image as context for LLM generation.
744
-
745
- **Deprecated fields (for backwards compatibility):**
746
-
747
- - `llm` - **DEPRECATED**: Use `description` instead
748
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
749
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
750
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
751
- """
752
-
753
- section_header: Annotated[Optional[SegmentProcessingSectionHeader], PropertyInfo(alias="SectionHeader")]
754
- """Controls the processing and generation for the segment.
755
-
756
- - `crop_image` controls whether to crop the file's images to the segment's
757
- bounding box. The cropped image will be stored in the segment's `image` field.
758
- Use `All` to always crop, or `Auto` to only crop when needed for
759
- post-processing.
760
- - `format` specifies the output format: `Html` or `Markdown`
761
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
762
- - `Auto`: Process content automatically
763
- - `LLM`: Use large language models for processing
764
- - `Ignore`: Exclude segments from final output
765
- - `description` enables LLM-generated descriptions for segments. **Note:** This
766
- uses chunkr's own VLM models and is not configurable via LLM processing
767
- configuration.
768
- - `extended_context` uses the full page image as context for LLM generation.
769
-
770
- **Deprecated fields (for backwards compatibility):**
771
-
772
- - `llm` - **DEPRECATED**: Use `description` instead
773
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
774
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
775
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
776
- """
777
-
778
- table: Annotated[Optional[SegmentProcessingTable], PropertyInfo(alias="Table")]
779
- """Controls the processing and generation for the segment.
780
-
781
- - `crop_image` controls whether to crop the file's images to the segment's
782
- bounding box. The cropped image will be stored in the segment's `image` field.
783
- Use `All` to always crop, or `Auto` to only crop when needed for
784
- post-processing.
785
- - `format` specifies the output format: `Html` or `Markdown`
786
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
787
- - `Auto`: Process content automatically
788
- - `LLM`: Use large language models for processing
789
- - `Ignore`: Exclude segments from final output
790
- - `description` enables LLM-generated descriptions for segments. **Note:** This
791
- uses chunkr's own VLM models and is not configurable via LLM processing
792
- configuration.
793
- - `extended_context` uses the full page image as context for LLM generation.
794
-
795
- **Deprecated fields (for backwards compatibility):**
796
-
797
- - `llm` - **DEPRECATED**: Use `description` instead
798
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
799
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
800
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
801
- """
802
-
803
- text: Annotated[Optional[SegmentProcessingText], PropertyInfo(alias="Text")]
804
- """Controls the processing and generation for the segment.
805
-
806
- - `crop_image` controls whether to crop the file's images to the segment's
807
- bounding box. The cropped image will be stored in the segment's `image` field.
808
- Use `All` to always crop, or `Auto` to only crop when needed for
809
- post-processing.
810
- - `format` specifies the output format: `Html` or `Markdown`
811
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
812
- - `Auto`: Process content automatically
813
- - `LLM`: Use large language models for processing
814
- - `Ignore`: Exclude segments from final output
815
- - `description` enables LLM-generated descriptions for segments. **Note:** This
816
- uses chunkr's own VLM models and is not configurable via LLM processing
817
- configuration.
818
- - `extended_context` uses the full page image as context for LLM generation.
819
-
820
- **Deprecated fields (for backwards compatibility):**
821
-
822
- - `llm` - **DEPRECATED**: Use `description` instead
823
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
824
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
825
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
826
- """
827
-
828
- title: Annotated[Optional[SegmentProcessingTitle], PropertyInfo(alias="Title")]
829
- """Controls the processing and generation for the segment.
830
-
831
- - `crop_image` controls whether to crop the file's images to the segment's
832
- bounding box. The cropped image will be stored in the segment's `image` field.
833
- Use `All` to always crop, or `Auto` to only crop when needed for
834
- post-processing.
835
- - `format` specifies the output format: `Html` or `Markdown`
836
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
837
- - `Auto`: Process content automatically
838
- - `LLM`: Use large language models for processing
839
- - `Ignore`: Exclude segments from final output
840
- - `description` enables LLM-generated descriptions for segments. **Note:** This
841
- uses chunkr's own VLM models and is not configurable via LLM processing
842
- configuration.
843
- - `extended_context` uses the full page image as context for LLM generation.
844
-
845
- **Deprecated fields (for backwards compatibility):**
846
-
847
- - `llm` - **DEPRECATED**: Use `description` instead
848
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
849
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
850
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
851
- """