chunkr-ai 0.1.0a1__py3-none-any.whl → 0.1.0a3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. chunkr_ai/_client.py +18 -9
  2. chunkr_ai/_files.py +1 -1
  3. chunkr_ai/_version.py +1 -1
  4. chunkr_ai/pagination.py +61 -1
  5. chunkr_ai/resources/__init__.py +27 -13
  6. chunkr_ai/resources/files.py +712 -0
  7. chunkr_ai/resources/tasks/__init__.py +33 -0
  8. chunkr_ai/resources/tasks/parse.py +612 -0
  9. chunkr_ai/resources/tasks/tasks.py +596 -0
  10. chunkr_ai/types/__init__.py +7 -19
  11. chunkr_ai/types/delete.py +10 -0
  12. chunkr_ai/types/file.py +30 -0
  13. chunkr_ai/types/file_create_params.py +17 -0
  14. chunkr_ai/types/file_list_params.py +28 -0
  15. chunkr_ai/types/file_url.py +15 -0
  16. chunkr_ai/types/file_url_params.py +15 -0
  17. chunkr_ai/types/files_page_response.py +20 -0
  18. chunkr_ai/types/task.py +866 -27
  19. chunkr_ai/types/tasks/__init__.py +6 -0
  20. chunkr_ai/types/tasks/parse_create_params.py +844 -0
  21. chunkr_ai/types/tasks/parse_update_params.py +838 -0
  22. {chunkr_ai-0.1.0a1.dist-info → chunkr_ai-0.1.0a3.dist-info}/METADATA +39 -21
  23. chunkr_ai-0.1.0a3.dist-info/RECORD +52 -0
  24. chunkr_ai/resources/task.py +0 -1166
  25. chunkr_ai/types/auto_generation_config.py +0 -39
  26. chunkr_ai/types/auto_generation_config_param.py +0 -39
  27. chunkr_ai/types/bounding_box.py +0 -19
  28. chunkr_ai/types/chunk_processing.py +0 -40
  29. chunkr_ai/types/chunk_processing_param.py +0 -42
  30. chunkr_ai/types/ignore_generation_config.py +0 -39
  31. chunkr_ai/types/ignore_generation_config_param.py +0 -39
  32. chunkr_ai/types/llm_generation_config.py +0 -39
  33. chunkr_ai/types/llm_generation_config_param.py +0 -39
  34. chunkr_ai/types/llm_processing.py +0 -36
  35. chunkr_ai/types/llm_processing_param.py +0 -36
  36. chunkr_ai/types/picture_generation_config.py +0 -39
  37. chunkr_ai/types/picture_generation_config_param.py +0 -39
  38. chunkr_ai/types/segment_processing.py +0 -280
  39. chunkr_ai/types/segment_processing_param.py +0 -281
  40. chunkr_ai/types/table_generation_config.py +0 -39
  41. chunkr_ai/types/table_generation_config_param.py +0 -39
  42. chunkr_ai/types/task_parse_params.py +0 -90
  43. chunkr_ai/types/task_update_params.py +0 -90
  44. chunkr_ai-0.1.0a1.dist-info/RECORD +0 -58
  45. {chunkr_ai-0.1.0a1.dist-info → chunkr_ai-0.1.0a3.dist-info}/WHEEL +0 -0
  46. {chunkr_ai-0.1.0a1.dist-info → chunkr_ai-0.1.0a3.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,844 @@
1
+ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import List, Union, Optional
6
+ from typing_extensions import Literal, Required, Annotated, TypeAlias, TypedDict
7
+
8
+ from ..._utils import PropertyInfo
9
+
10
+ __all__ = [
11
+ "ParseCreateParams",
12
+ "ChunkProcessing",
13
+ "ChunkProcessingTokenizer",
14
+ "ChunkProcessingTokenizerEnum",
15
+ "ChunkProcessingTokenizerString",
16
+ "LlmProcessing",
17
+ "LlmProcessingFallbackStrategy",
18
+ "LlmProcessingFallbackStrategyModel",
19
+ "SegmentProcessing",
20
+ "SegmentProcessingCaption",
21
+ "SegmentProcessingFootnote",
22
+ "SegmentProcessingFormula",
23
+ "SegmentProcessingListItem",
24
+ "SegmentProcessingPage",
25
+ "SegmentProcessingPageFooter",
26
+ "SegmentProcessingPageHeader",
27
+ "SegmentProcessingPicture",
28
+ "SegmentProcessingSectionHeader",
29
+ "SegmentProcessingTable",
30
+ "SegmentProcessingText",
31
+ "SegmentProcessingTitle",
32
+ ]
33
+
34
+
35
class ParseCreateParams(TypedDict, total=False):
    """Keyword parameters for the parse-create request.

    Only `file` is required; every other key may be omitted.
    """

    file: Required[str]
    """The file to be uploaded. Supported inputs:

    - `ch://files/{file_id}`: References a previously uploaded file you own
      (authorization enforced)
    - `http(s)://...`: Remote URL to fetch
    - `data:*;base64,...` or raw base64 string
    """

    chunk_processing: Optional[ChunkProcessing]
    """Controls the setting for the chunking and post-processing of each chunk."""

    error_handling: Optional[Literal["Fail", "Continue"]]
    """Controls how errors are handled during processing:

    - `Fail`: Stops processing and fails the task when any error occurs
    - `Continue`: Attempts to continue processing despite non-critical errors
      (e.g. LLM refusals)
    """

    expires_in: Optional[int]
    """
    The number of seconds until task is deleted. Expired tasks can **not** be
    updated, polled or accessed via web interface.
    """

    file_name: Optional[str]
    """The name of the file to be uploaded. If not set a name will be generated."""

    llm_processing: Optional[LlmProcessing]
    """Controls the LLM used for the task."""

    ocr_strategy: Optional[Literal["All", "Auto"]]
    """Controls the Optical Character Recognition (OCR) strategy.

    - `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
    - `Auto`: Selectively applies OCR only to pages with missing or low-quality
      text. When text layer is present the bounding boxes from the text layer are
      used.
    """

    pipeline: Optional[Literal["Azure", "Chunkr"]]
    """
    Choose the provider whose models will be used for segmentation and OCR. The
    output will be unified to the Chunkr `output` format.
    """

    segment_processing: Optional[SegmentProcessing]
    """Defines how each segment type is handled when generating the final output.

    Each segment uses one of three strategies. The chosen strategy controls:

    - Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
    - How the content is produced (rule-based vs. LLM).
    - The output format (`Html` or `Markdown`).

    Optional flags such as image **cropping**, **extended context**, and
    **descriptions** further refine behaviour.

    **Default strategy per segment**

    - `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` →
      **Auto** (Markdown, description off)
    - `Table` → **LLM** (Html, description on)
    - `Picture` → **LLM** (Markdown, description off, cropping _All_)
    - `Formula`, `Page` → **LLM** (Markdown, description off)
    - `PageHeader`, `PageFooter` → **Ignore** (removed from output)

    **Strategy reference**

    - **Auto** – rule-based content generation.
    - **LLM** – generate content with an LLM.
    - **Ignore** – exclude the segment entirely.
    """

    segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]]
    """Controls the segmentation strategy:

    - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
      `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
      segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
    - `Page`: Treats each page as a single segment. Faster processing, but without
      layout element detection and only simple chunking.
    """
113
+
114
+
115
class ChunkProcessingTokenizerEnum(TypedDict, total=False):
    """Tokenizer selection by predefined type; the key is aliased to `Enum`."""

    enum: Required[
        Annotated[Literal["Word", "Cl100kBase", "XlmRobertaBase", "BertBaseUncased"], PropertyInfo(alias="Enum")]
    ]
    """Use one of the predefined tokenizer types"""


class ChunkProcessingTokenizerString(TypedDict, total=False):
    """Tokenizer selection by model ID; the key is aliased to `String`."""

    string: Required[Annotated[str, PropertyInfo(alias="String")]]
    """
    Use any Hugging Face tokenizer by specifying its model ID. Examples:
    "Qwen/Qwen-tokenizer", "facebook/bart-large"
    """


# A tokenizer is given either as a predefined type or as a model ID string.
ChunkProcessingTokenizer: TypeAlias = Union[ChunkProcessingTokenizerEnum, ChunkProcessingTokenizerString]
131
+
132
+
133
class ChunkProcessing(TypedDict, total=False):
    """Chunking settings; every key is optional."""

    ignore_headers_and_footers: Optional[bool]
    """DEPRECATED: use `segment_processing.ignore` instead"""

    target_length: int
    """The target number of words in each chunk.

    If 0, each chunk will contain a single segment.
    """

    tokenizer: ChunkProcessingTokenizer
    """The tokenizer to use for the chunking process."""
145
+
146
+
147
class LlmProcessingFallbackStrategyModel(TypedDict, total=False):
    """Fallback given as a specific model; the key is aliased to `Model`."""

    model: Required[Annotated[str, PropertyInfo(alias="Model")]]
    """Use a specific model as fallback"""


# Either one of the literal strategies ("None" / "Default") or a specific model.
LlmProcessingFallbackStrategy: TypeAlias = Union[Literal["None", "Default"], LlmProcessingFallbackStrategyModel]
153
+
154
+
155
class LlmProcessing(TypedDict, total=False):
    """LLM settings for the task; every key is optional."""

    fallback_strategy: LlmProcessingFallbackStrategy
    """The fallback strategy to use for the LLMs in the task."""

    llm_model_id: Optional[str]
    """The ID of the model to use for the task.

    If not provided, the default model will be used. Please check the documentation
    for the model you want to use.
    """

    max_completion_tokens: Optional[int]
    """The maximum number of tokens to generate."""

    temperature: float
    """The temperature to use for the LLM."""
171
+
172
+
173
class SegmentProcessingCaption(TypedDict, total=False):
    """Generation settings for `Caption` segments; every key is optional."""

    crop_image: Literal["All", "Auto"]
    """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)

    - `All` crops all images in the item
    - `Auto` crops images only if required for post-processing
    """

    description: bool
    """Generate LLM descriptions for this segment"""

    embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
    """**DEPRECATED**: `embed` field is auto populated"""

    extended_context: bool
    """Use the full page image as context for LLM generation"""

    format: Literal["Html", "Markdown"]
    """Output format for the segment: `Html` or `Markdown`."""

    html: Optional[Literal["LLM", "Auto", "Ignore"]]
    """**DEPRECATED**: Use `format: Html` and `strategy` instead."""

    llm: Optional[str]
    """**DEPRECATED**: Use `description` instead."""

    markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
    """**DEPRECATED**: Use `format: Markdown` and `strategy` instead."""

    strategy: Literal["LLM", "Auto", "Ignore"]
    """How the content is generated: `Auto`, `LLM`, or `Ignore` (segment excluded)."""
202
+
203
+
204
class SegmentProcessingFootnote(TypedDict, total=False):
    """Generation settings for `Footnote` segments; every key is optional."""

    crop_image: Literal["All", "Auto"]
    """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)

    - `All` crops all images in the item
    - `Auto` crops images only if required for post-processing
    """

    description: bool
    """Generate LLM descriptions for this segment"""

    embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
    """**DEPRECATED**: `embed` field is auto populated"""

    extended_context: bool
    """Use the full page image as context for LLM generation"""

    format: Literal["Html", "Markdown"]
    """Output format for the segment: `Html` or `Markdown`."""

    html: Optional[Literal["LLM", "Auto", "Ignore"]]
    """**DEPRECATED**: Use `format: Html` and `strategy` instead."""

    llm: Optional[str]
    """**DEPRECATED**: Use `description` instead."""

    markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
    """**DEPRECATED**: Use `format: Markdown` and `strategy` instead."""

    strategy: Literal["LLM", "Auto", "Ignore"]
    """How the content is generated: `Auto`, `LLM`, or `Ignore` (segment excluded)."""
233
+
234
+
235
class SegmentProcessingFormula(TypedDict, total=False):
    """Generation settings for `Formula` segments; every key is optional."""

    crop_image: Literal["All", "Auto"]
    """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)

    - `All` crops all images in the item
    - `Auto` crops images only if required for post-processing
    """

    description: bool
    """Generate LLM descriptions for this segment"""

    embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
    """**DEPRECATED**: `embed` field is auto populated"""

    extended_context: bool
    """Use the full page image as context for LLM generation"""

    format: Literal["Html", "Markdown"]
    """Output format for the segment: `Html` or `Markdown`."""

    html: Optional[Literal["LLM", "Auto", "Ignore"]]
    """**DEPRECATED**: Use `format: Html` and `strategy` instead."""

    llm: Optional[str]
    """**DEPRECATED**: Use `description` instead."""

    markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
    """**DEPRECATED**: Use `format: Markdown` and `strategy` instead."""

    strategy: Literal["LLM", "Auto", "Ignore"]
    """How the content is generated: `Auto`, `LLM`, or `Ignore` (segment excluded)."""
264
+
265
+
266
class SegmentProcessingListItem(TypedDict, total=False):
    """Generation settings for `ListItem` segments; every key is optional."""

    crop_image: Literal["All", "Auto"]
    """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)

    - `All` crops all images in the item
    - `Auto` crops images only if required for post-processing
    """

    description: bool
    """Generate LLM descriptions for this segment"""

    embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
    """**DEPRECATED**: `embed` field is auto populated"""

    extended_context: bool
    """Use the full page image as context for LLM generation"""

    format: Literal["Html", "Markdown"]
    """Output format for the segment: `Html` or `Markdown`."""

    html: Optional[Literal["LLM", "Auto", "Ignore"]]
    """**DEPRECATED**: Use `format: Html` and `strategy` instead."""

    llm: Optional[str]
    """**DEPRECATED**: Use `description` instead."""

    markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
    """**DEPRECATED**: Use `format: Markdown` and `strategy` instead."""

    strategy: Literal["LLM", "Auto", "Ignore"]
    """How the content is generated: `Auto`, `LLM`, or `Ignore` (segment excluded)."""
295
+
296
+
297
class SegmentProcessingPage(TypedDict, total=False):
    """Generation settings for `Page` segments; every key is optional."""

    crop_image: Literal["All", "Auto"]
    """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)

    - `All` crops all images in the item
    - `Auto` crops images only if required for post-processing
    """

    description: bool
    """Generate LLM descriptions for this segment"""

    embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
    """**DEPRECATED**: `embed` field is auto populated"""

    extended_context: bool
    """Use the full page image as context for LLM generation"""

    format: Literal["Html", "Markdown"]
    """Output format for the segment: `Html` or `Markdown`."""

    html: Optional[Literal["LLM", "Auto", "Ignore"]]
    """**DEPRECATED**: Use `format: Html` and `strategy` instead."""

    llm: Optional[str]
    """**DEPRECATED**: Use `description` instead."""

    markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
    """**DEPRECATED**: Use `format: Markdown` and `strategy` instead."""

    strategy: Literal["LLM", "Auto", "Ignore"]
    """How the content is generated: `Auto`, `LLM`, or `Ignore` (segment excluded)."""
326
+
327
+
328
class SegmentProcessingPageFooter(TypedDict, total=False):
    """Generation settings for `PageFooter` segments; every key is optional."""

    crop_image: Literal["All", "Auto"]
    """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)

    - `All` crops all images in the item
    - `Auto` crops images only if required for post-processing
    """

    description: bool
    """Generate LLM descriptions for this segment"""

    embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
    """**DEPRECATED**: `embed` field is auto populated"""

    extended_context: bool
    """Use the full page image as context for LLM generation"""

    format: Literal["Html", "Markdown"]
    """Output format for the segment: `Html` or `Markdown`."""

    html: Optional[Literal["LLM", "Auto", "Ignore"]]
    """**DEPRECATED**: Use `format: Html` and `strategy` instead."""

    llm: Optional[str]
    """**DEPRECATED**: Use `description` instead."""

    markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
    """**DEPRECATED**: Use `format: Markdown` and `strategy` instead."""

    strategy: Literal["LLM", "Auto", "Ignore"]
    """How the content is generated: `Auto`, `LLM`, or `Ignore` (segment excluded)."""
357
+
358
+
359
class SegmentProcessingPageHeader(TypedDict, total=False):
    """Generation settings for `PageHeader` segments; every key is optional."""

    crop_image: Literal["All", "Auto"]
    """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)

    - `All` crops all images in the item
    - `Auto` crops images only if required for post-processing
    """

    description: bool
    """Generate LLM descriptions for this segment"""

    embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
    """**DEPRECATED**: `embed` field is auto populated"""

    extended_context: bool
    """Use the full page image as context for LLM generation"""

    format: Literal["Html", "Markdown"]
    """Output format for the segment: `Html` or `Markdown`."""

    html: Optional[Literal["LLM", "Auto", "Ignore"]]
    """**DEPRECATED**: Use `format: Html` and `strategy` instead."""

    llm: Optional[str]
    """**DEPRECATED**: Use `description` instead."""

    markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
    """**DEPRECATED**: Use `format: Markdown` and `strategy` instead."""

    strategy: Literal["LLM", "Auto", "Ignore"]
    """How the content is generated: `Auto`, `LLM`, or `Ignore` (segment excluded)."""
388
+
389
+
390
class SegmentProcessingPicture(TypedDict, total=False):
    """Generation settings for `Picture` segments; every key is optional."""

    crop_image: Literal["All", "Auto"]
    """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)

    - `All` crops all images in the item
    - `Auto` crops images only if required for post-processing
    """

    description: bool
    """Generate LLM descriptions for this segment"""

    embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
    """**DEPRECATED**: `embed` field is auto populated"""

    extended_context: bool
    """Use the full page image as context for LLM generation"""

    format: Literal["Html", "Markdown"]
    """Output format for the segment: `Html` or `Markdown`."""

    html: Optional[Literal["LLM", "Auto", "Ignore"]]
    """**DEPRECATED**: Use `format: Html` and `strategy` instead."""

    llm: Optional[str]
    """**DEPRECATED**: Use `description` instead."""

    markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
    """**DEPRECATED**: Use `format: Markdown` and `strategy` instead."""

    strategy: Literal["LLM", "Auto", "Ignore"]
    """How the content is generated: `Auto`, `LLM`, or `Ignore` (segment excluded)."""
419
+
420
+
421
class SegmentProcessingSectionHeader(TypedDict, total=False):
    """Generation settings for `SectionHeader` segments; every key is optional."""

    crop_image: Literal["All", "Auto"]
    """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)

    - `All` crops all images in the item
    - `Auto` crops images only if required for post-processing
    """

    description: bool
    """Generate LLM descriptions for this segment"""

    embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
    """**DEPRECATED**: `embed` field is auto populated"""

    extended_context: bool
    """Use the full page image as context for LLM generation"""

    format: Literal["Html", "Markdown"]
    """Output format for the segment: `Html` or `Markdown`."""

    html: Optional[Literal["LLM", "Auto", "Ignore"]]
    """**DEPRECATED**: Use `format: Html` and `strategy` instead."""

    llm: Optional[str]
    """**DEPRECATED**: Use `description` instead."""

    markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
    """**DEPRECATED**: Use `format: Markdown` and `strategy` instead."""

    strategy: Literal["LLM", "Auto", "Ignore"]
    """How the content is generated: `Auto`, `LLM`, or `Ignore` (segment excluded)."""
450
+
451
+
452
class SegmentProcessingTable(TypedDict, total=False):
    """Generation settings for `Table` segments; every key is optional."""

    crop_image: Literal["All", "Auto"]
    """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)

    - `All` crops all images in the item
    - `Auto` crops images only if required for post-processing
    """

    description: bool
    """Generate LLM descriptions for this segment"""

    embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
    """**DEPRECATED**: `embed` field is auto populated"""

    extended_context: bool
    """Use the full page image as context for LLM generation"""

    format: Literal["Html", "Markdown"]
    """Output format for the segment: `Html` or `Markdown`."""

    html: Optional[Literal["LLM", "Auto", "Ignore"]]
    """**DEPRECATED**: Use `format: Html` and `strategy` instead."""

    llm: Optional[str]
    """**DEPRECATED**: Use `description` instead."""

    markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
    """**DEPRECATED**: Use `format: Markdown` and `strategy` instead."""

    strategy: Literal["LLM", "Auto", "Ignore"]
    """How the content is generated: `Auto`, `LLM`, or `Ignore` (segment excluded)."""
481
+
482
+
483
class SegmentProcessingText(TypedDict, total=False):
    """Generation settings for `Text` segments; every key is optional."""

    crop_image: Literal["All", "Auto"]
    """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)

    - `All` crops all images in the item
    - `Auto` crops images only if required for post-processing
    """

    description: bool
    """Generate LLM descriptions for this segment"""

    embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
    """**DEPRECATED**: `embed` field is auto populated"""

    extended_context: bool
    """Use the full page image as context for LLM generation"""

    format: Literal["Html", "Markdown"]
    """Output format for the segment: `Html` or `Markdown`."""

    html: Optional[Literal["LLM", "Auto", "Ignore"]]
    """**DEPRECATED**: Use `format: Html` and `strategy` instead."""

    llm: Optional[str]
    """**DEPRECATED**: Use `description` instead."""

    markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
    """**DEPRECATED**: Use `format: Markdown` and `strategy` instead."""

    strategy: Literal["LLM", "Auto", "Ignore"]
    """How the content is generated: `Auto`, `LLM`, or `Ignore` (segment excluded)."""
512
+
513
+
514
class SegmentProcessingTitle(TypedDict, total=False):
    """Generation settings for `Title` segments; every key is optional."""

    crop_image: Literal["All", "Auto"]
    """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)

    - `All` crops all images in the item
    - `Auto` crops images only if required for post-processing
    """

    description: bool
    """Generate LLM descriptions for this segment"""

    embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
    """**DEPRECATED**: `embed` field is auto populated"""

    extended_context: bool
    """Use the full page image as context for LLM generation"""

    format: Literal["Html", "Markdown"]
    """Output format for the segment: `Html` or `Markdown`."""

    html: Optional[Literal["LLM", "Auto", "Ignore"]]
    """**DEPRECATED**: Use `format: Html` and `strategy` instead."""

    llm: Optional[str]
    """**DEPRECATED**: Use `description` instead."""

    markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
    """**DEPRECATED**: Use `format: Markdown` and `strategy` instead."""

    strategy: Literal["LLM", "Auto", "Ignore"]
    """How the content is generated: `Auto`, `LLM`, or `Ignore` (segment excluded)."""
543
+
544
+
545
+ class SegmentProcessing(TypedDict, total=False):
546
+ caption: Annotated[Optional[SegmentProcessingCaption], PropertyInfo(alias="Caption")]
547
+ """Controls the processing and generation for the segment.
548
+
549
+ - `crop_image` controls whether to crop the file's images to the segment's
550
+ bounding box. The cropped image will be stored in the segment's `image` field.
551
+ Use `All` to always crop, or `Auto` to only crop when needed for
552
+ post-processing.
553
+ - `format` specifies the output format: `Html` or `Markdown`
554
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
555
+ - `Auto`: Process content automatically
556
+ - `LLM`: Use large language models for processing
557
+ - `Ignore`: Exclude segments from final output
558
+ - `description` enables LLM-generated descriptions for segments. **Note:** This
559
+ uses chunkr's own VLM models and is not configurable via LLM processing
560
+ configuration.
561
+ - `extended_context` uses the full page image as context for LLM generation.
562
+
563
+ **Deprecated fields (for backwards compatibility):**
564
+
565
+ - `llm` - **DEPRECATED**: Use `description` instead
566
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
567
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
568
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
569
+ """
570
+
571
+ footnote: Annotated[Optional[SegmentProcessingFootnote], PropertyInfo(alias="Footnote")]
572
+ """Controls the processing and generation for the segment.
573
+
574
+ - `crop_image` controls whether to crop the file's images to the segment's
575
+ bounding box. The cropped image will be stored in the segment's `image` field.
576
+ Use `All` to always crop, or `Auto` to only crop when needed for
577
+ post-processing.
578
+ - `format` specifies the output format: `Html` or `Markdown`
579
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
580
+ - `Auto`: Process content automatically
581
+ - `LLM`: Use large language models for processing
582
+ - `Ignore`: Exclude segments from final output
583
+ - `description` enables LLM-generated descriptions for segments. **Note:** This
584
+ uses chunkr's own VLM models and is not configurable via LLM processing
585
+ configuration.
586
+ - `extended_context` uses the full page image as context for LLM generation.
587
+
588
+ **Deprecated fields (for backwards compatibility):**
589
+
590
+ - `llm` - **DEPRECATED**: Use `description` instead
591
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
592
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
593
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
594
+ """
595
+
596
+ formula: Annotated[Optional[SegmentProcessingFormula], PropertyInfo(alias="Formula")]
597
+ """Controls the processing and generation for the segment.
598
+
599
+ - `crop_image` controls whether to crop the file's images to the segment's
600
+ bounding box. The cropped image will be stored in the segment's `image` field.
601
+ Use `All` to always crop, or `Auto` to only crop when needed for
602
+ post-processing.
603
+ - `format` specifies the output format: `Html` or `Markdown`
604
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
605
+ - `Auto`: Process content automatically
606
+ - `LLM`: Use large language models for processing
607
+ - `Ignore`: Exclude segments from final output
608
+ - `description` enables LLM-generated descriptions for segments. **Note:** This
609
+ uses chunkr's own VLM models and is not configurable via LLM processing
610
+ configuration.
611
+ - `extended_context` uses the full page image as context for LLM generation.
612
+
613
+ **Deprecated fields (for backwards compatibility):**
614
+
615
+ - `llm` - **DEPRECATED**: Use `description` instead
616
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
617
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
618
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
619
+ """
620
+
621
+ list_item: Annotated[Optional[SegmentProcessingListItem], PropertyInfo(alias="ListItem")]
622
+ """Controls the processing and generation for the segment.
623
+
624
+ - `crop_image` controls whether to crop the file's images to the segment's
625
+ bounding box. The cropped image will be stored in the segment's `image` field.
626
+ Use `All` to always crop, or `Auto` to only crop when needed for
627
+ post-processing.
628
+ - `format` specifies the output format: `Html` or `Markdown`
629
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
630
+ - `Auto`: Process content automatically
631
+ - `LLM`: Use large language models for processing
632
+ - `Ignore`: Exclude segments from final output
633
+ - `description` enables LLM-generated descriptions for segments. **Note:** This
634
+ uses chunkr's own VLM models and is not configurable via LLM processing
635
+ configuration.
636
+ - `extended_context` uses the full page image as context for LLM generation.
637
+
638
+ **Deprecated fields (for backwards compatibility):**
639
+
640
+ - `llm` - **DEPRECATED**: Use `description` instead
641
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
642
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
643
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
644
+ """
645
+
646
+ page: Annotated[Optional[SegmentProcessingPage], PropertyInfo(alias="Page")]
647
+ """Controls the processing and generation for the segment.
648
+
649
+ - `crop_image` controls whether to crop the file's images to the segment's
650
+ bounding box. The cropped image will be stored in the segment's `image` field.
651
+ Use `All` to always crop, or `Auto` to only crop when needed for
652
+ post-processing.
653
+ - `format` specifies the output format: `Html` or `Markdown`
654
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
655
+ - `Auto`: Process content automatically
656
+ - `LLM`: Use large language models for processing
657
+ - `Ignore`: Exclude segments from final output
658
+ - `description` enables LLM-generated descriptions for segments. **Note:** This
659
+ uses chunkr's own VLM models and is not configurable via LLM processing
660
+ configuration.
661
+ - `extended_context` uses the full page image as context for LLM generation.
662
+
663
+ **Deprecated fields (for backwards compatibility):**
664
+
665
+ - `llm` - **DEPRECATED**: Use `description` instead
666
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
667
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
668
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
669
+ """
670
+
671
+ page_footer: Annotated[Optional[SegmentProcessingPageFooter], PropertyInfo(alias="PageFooter")]
672
+ """Controls the processing and generation for the segment.
673
+
674
+ - `crop_image` controls whether to crop the file's images to the segment's
675
+ bounding box. The cropped image will be stored in the segment's `image` field.
676
+ Use `All` to always crop, or `Auto` to only crop when needed for
677
+ post-processing.
678
+ - `format` specifies the output format: `Html` or `Markdown`
679
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
680
+ - `Auto`: Process content automatically
681
+ - `LLM`: Use large language models for processing
682
+ - `Ignore`: Exclude segments from final output
683
+ - `description` enables LLM-generated descriptions for segments. **Note:** This
684
+ uses chunkr's own VLM models and is not configurable via LLM processing
685
+ configuration.
686
+ - `extended_context` uses the full page image as context for LLM generation.
687
+
688
+ **Deprecated fields (for backwards compatibility):**
689
+
690
+ - `llm` - **DEPRECATED**: Use `description` instead
691
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
692
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
693
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
694
+ """
695
+
696
+ page_header: Annotated[Optional[SegmentProcessingPageHeader], PropertyInfo(alias="PageHeader")]
697
+ """Controls the processing and generation for the segment.
698
+
699
+ - `crop_image` controls whether to crop the file's images to the segment's
700
+ bounding box. The cropped image will be stored in the segment's `image` field.
701
+ Use `All` to always crop, or `Auto` to only crop when needed for
702
+ post-processing.
703
+ - `format` specifies the output format: `Html` or `Markdown`
704
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
705
+ - `Auto`: Process content automatically
706
+ - `LLM`: Use large language models for processing
707
+ - `Ignore`: Exclude segments from final output
708
+ - `description` enables LLM-generated descriptions for segments. **Note:** This
709
+ uses chunkr's own VLM models and is not configurable via LLM processing
710
+ configuration.
711
+ - `extended_context` uses the full page image as context for LLM generation.
712
+
713
+ **Deprecated fields (for backwards compatibility):**
714
+
715
+ - `llm` - **DEPRECATED**: Use `description` instead
716
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
717
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
718
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
719
+ """
720
+
721
+ picture: Annotated[Optional[SegmentProcessingPicture], PropertyInfo(alias="Picture")]
722
+ """Controls the processing and generation for the segment.
723
+
724
+ - `crop_image` controls whether to crop the file's images to the segment's
725
+ bounding box. The cropped image will be stored in the segment's `image` field.
726
+ Use `All` to always crop, or `Auto` to only crop when needed for
727
+ post-processing.
728
+ - `format` specifies the output format: `Html` or `Markdown`
729
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
730
+ - `Auto`: Process content automatically
731
+ - `LLM`: Use large language models for processing
732
+ - `Ignore`: Exclude segments from final output
733
+ - `description` enables LLM-generated descriptions for segments. **Note:** This
734
+ uses chunkr's own VLM models and is not configurable via LLM processing
735
+ configuration.
736
+ - `extended_context` uses the full page image as context for LLM generation.
737
+
738
+ **Deprecated fields (for backwards compatibility):**
739
+
740
+ - `llm` - **DEPRECATED**: Use `description` instead
741
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
742
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
743
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
744
+ """
745
+
746
+ section_header: Annotated[Optional[SegmentProcessingSectionHeader], PropertyInfo(alias="SectionHeader")]
747
+ """Controls the processing and generation for the segment.
748
+
749
+ - `crop_image` controls whether to crop the file's images to the segment's
750
+ bounding box. The cropped image will be stored in the segment's `image` field.
751
+ Use `All` to always crop, or `Auto` to only crop when needed for
752
+ post-processing.
753
+ - `format` specifies the output format: `Html` or `Markdown`
754
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
755
+ - `Auto`: Process content automatically
756
+ - `LLM`: Use large language models for processing
757
+ - `Ignore`: Exclude segments from final output
758
+ - `description` enables LLM-generated descriptions for segments. **Note:** This
759
+ uses chunkr's own VLM models and is not configurable via LLM processing
760
+ configuration.
761
+ - `extended_context` uses the full page image as context for LLM generation.
762
+
763
+ **Deprecated fields (for backwards compatibility):**
764
+
765
+ - `llm` - **DEPRECATED**: Use `description` instead
766
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
767
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
768
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
769
+ """
770
+
771
+ table: Annotated[Optional[SegmentProcessingTable], PropertyInfo(alias="Table")]
772
+ """Controls the processing and generation for the segment.
773
+
774
+ - `crop_image` controls whether to crop the file's images to the segment's
775
+ bounding box. The cropped image will be stored in the segment's `image` field.
776
+ Use `All` to always crop, or `Auto` to only crop when needed for
777
+ post-processing.
778
+ - `format` specifies the output format: `Html` or `Markdown`
779
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
780
+ - `Auto`: Process content automatically
781
+ - `LLM`: Use large language models for processing
782
+ - `Ignore`: Exclude segments from final output
783
+ - `description` enables LLM-generated descriptions for segments. **Note:** This
784
+ uses chunkr's own VLM models and is not configurable via LLM processing
785
+ configuration.
786
+ - `extended_context` uses the full page image as context for LLM generation.
787
+
788
+ **Deprecated fields (for backwards compatibility):**
789
+
790
+ - `llm` - **DEPRECATED**: Use `description` instead
791
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
792
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
793
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
794
+ """
795
+
796
+ text: Annotated[Optional[SegmentProcessingText], PropertyInfo(alias="Text")]
797
+ """Controls the processing and generation for the segment.
798
+
799
+ - `crop_image` controls whether to crop the file's images to the segment's
800
+ bounding box. The cropped image will be stored in the segment's `image` field.
801
+ Use `All` to always crop, or `Auto` to only crop when needed for
802
+ post-processing.
803
+ - `format` specifies the output format: `Html` or `Markdown`
804
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
805
+ - `Auto`: Process content automatically
806
+ - `LLM`: Use large language models for processing
807
+ - `Ignore`: Exclude segments from final output
808
+ - `description` enables LLM-generated descriptions for segments. **Note:** This
809
+ uses chunkr's own VLM models and is not configurable via LLM processing
810
+ configuration.
811
+ - `extended_context` uses the full page image as context for LLM generation.
812
+
813
+ **Deprecated fields (for backwards compatibility):**
814
+
815
+ - `llm` - **DEPRECATED**: Use `description` instead
816
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
817
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
818
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
819
+ """
820
+
821
+ title: Annotated[Optional[SegmentProcessingTitle], PropertyInfo(alias="Title")]
822
+ """Controls the processing and generation for the segment.
823
+
824
+ - `crop_image` controls whether to crop the file's images to the segment's
825
+ bounding box. The cropped image will be stored in the segment's `image` field.
826
+ Use `All` to always crop, or `Auto` to only crop when needed for
827
+ post-processing.
828
+ - `format` specifies the output format: `Html` or `Markdown`
829
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
830
+ - `Auto`: Process content automatically
831
+ - `LLM`: Use large language models for processing
832
+ - `Ignore`: Exclude segments from final output
833
+ - `description` enables LLM-generated descriptions for segments. **Note:** This
834
+ uses chunkr's own VLM models and is not configurable via LLM processing
835
+ configuration.
836
+ - `extended_context` uses the full page image as context for LLM generation.
837
+
838
+ **Deprecated fields (for backwards compatibility):**
839
+
840
+ - `llm` - **DEPRECATED**: Use `description` instead
841
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
842
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
843
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
844
+ """