chunkr-ai 0.1.0a1__py3-none-any.whl → 0.1.0a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. chunkr_ai/_client.py +2 -1
  2. chunkr_ai/_version.py +1 -1
  3. chunkr_ai/resources/task/__init__.py +33 -0
  4. chunkr_ai/resources/{task.py → task/parse.py} +146 -696
  5. chunkr_ai/resources/task/task.py +664 -0
  6. chunkr_ai/types/__init__.py +0 -19
  7. chunkr_ai/types/task/__init__.py +7 -0
  8. chunkr_ai/types/task/parse_create_params.py +806 -0
  9. chunkr_ai/types/task/parse_update_params.py +806 -0
  10. chunkr_ai/types/task/task.py +1186 -0
  11. {chunkr_ai-0.1.0a1.dist-info → chunkr_ai-0.1.0a2.dist-info}/METADATA +12 -12
  12. {chunkr_ai-0.1.0a1.dist-info → chunkr_ai-0.1.0a2.dist-info}/RECORD +14 -28
  13. chunkr_ai/types/auto_generation_config.py +0 -39
  14. chunkr_ai/types/auto_generation_config_param.py +0 -39
  15. chunkr_ai/types/bounding_box.py +0 -19
  16. chunkr_ai/types/chunk_processing.py +0 -40
  17. chunkr_ai/types/chunk_processing_param.py +0 -42
  18. chunkr_ai/types/ignore_generation_config.py +0 -39
  19. chunkr_ai/types/ignore_generation_config_param.py +0 -39
  20. chunkr_ai/types/llm_generation_config.py +0 -39
  21. chunkr_ai/types/llm_generation_config_param.py +0 -39
  22. chunkr_ai/types/llm_processing.py +0 -36
  23. chunkr_ai/types/llm_processing_param.py +0 -36
  24. chunkr_ai/types/picture_generation_config.py +0 -39
  25. chunkr_ai/types/picture_generation_config_param.py +0 -39
  26. chunkr_ai/types/segment_processing.py +0 -280
  27. chunkr_ai/types/segment_processing_param.py +0 -281
  28. chunkr_ai/types/table_generation_config.py +0 -39
  29. chunkr_ai/types/table_generation_config_param.py +0 -39
  30. chunkr_ai/types/task.py +0 -379
  31. chunkr_ai/types/task_parse_params.py +0 -90
  32. chunkr_ai/types/task_update_params.py +0 -90
  33. {chunkr_ai-0.1.0a1.dist-info → chunkr_ai-0.1.0a2.dist-info}/WHEEL +0 -0
  34. {chunkr_ai-0.1.0a1.dist-info → chunkr_ai-0.1.0a2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,806 @@
1
+ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import List, Union, Optional
6
+ from typing_extensions import Literal, Required, Annotated, TypeAlias, TypedDict
7
+
8
+ from ..._utils import PropertyInfo
9
+
10
+ __all__ = [
11
+ "ParseUpdateParams",
12
+ "ChunkProcessing",
13
+ "ChunkProcessingTokenizer",
14
+ "ChunkProcessingTokenizerEnum",
15
+ "ChunkProcessingTokenizerString",
16
+ "LlmProcessing",
17
+ "LlmProcessingFallbackStrategy",
18
+ "LlmProcessingFallbackStrategyModel",
19
+ "SegmentProcessing",
20
+ "SegmentProcessingCaption",
21
+ "SegmentProcessingFootnote",
22
+ "SegmentProcessingFormula",
23
+ "SegmentProcessingListItem",
24
+ "SegmentProcessingPage",
25
+ "SegmentProcessingPageFooter",
26
+ "SegmentProcessingPageHeader",
27
+ "SegmentProcessingPicture",
28
+ "SegmentProcessingSectionHeader",
29
+ "SegmentProcessingTable",
30
+ "SegmentProcessingText",
31
+ "SegmentProcessingTitle",
32
+ ]
33
+
34
+
35
+ class ParseUpdateParams(TypedDict, total=False):
36
+ chunk_processing: Optional[ChunkProcessing]
37
+ """Controls the setting for the chunking and post-processing of each chunk."""
38
+
39
+ error_handling: Optional[Literal["Fail", "Continue"]]
40
+ """Controls how errors are handled during processing:
41
+
42
+ - `Fail`: Stops processing and fails the task when any error occurs
43
+ - `Continue`: Attempts to continue processing despite non-critical errors (e.g.,
44
+ LLM refusals)
45
+ """
46
+
47
+ expires_in: Optional[int]
48
+ """
49
+ The number of seconds until the task is deleted. Expired tasks can **not** be
50
+ updated, polled or accessed via the web interface.
51
+ """
52
+
53
+ high_resolution: Optional[bool]
54
+ """Whether to use high-resolution images for cropping and post-processing.
55
+
56
+ (Latency penalty: ~7 seconds per page)
57
+ """
58
+
59
+ llm_processing: Optional[LlmProcessing]
60
+ """Controls the LLM used for the task."""
61
+
62
+ ocr_strategy: Optional[Literal["All", "Auto"]]
63
+ """Controls the Optical Character Recognition (OCR) strategy.
64
+
65
+ - `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
66
+ - `Auto`: Selectively applies OCR only to pages with missing or low-quality
67
+ text. When text layer is present the bounding boxes from the text layer are
68
+ used.
69
+ """
70
+
71
+ pipeline: Optional[Literal["Azure", "Chunkr"]]
72
+ """
73
+ Choose the provider whose models will be used for segmentation and OCR. The
74
+ output will be unified to the Chunkr `output` format.
75
+ """
76
+
77
+ segment_processing: Optional[SegmentProcessing]
78
+ """Defines how each segment type is handled when generating the final output.
79
+
80
+ Each segment uses one of three strategies. The chosen strategy controls: •
81
+ Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`). • How the
82
+ content is produced (rule-based vs. LLM). • The output format (`Html` or
83
+ `Markdown`).
84
+
85
+ Optional flags such as image **cropping**, **extended context**, and **LLM
86
+ descriptions** further refine behaviour.
87
+
88
+ ---
89
+
90
+ **Default strategy per segment** • `Title`, `SectionHeader`, `Text`, `ListItem`,
91
+ `Caption`, `Footnote` → **Auto** (Markdown) • `Table` → **LLM** (HTML,
92
+ description on) • `Picture` → **LLM** (Markdown, description off, cropping
93
+ _All_) • `Formula`, `Page` → **LLM** (Markdown) • `PageHeader`, `PageFooter` →
94
+ **Ignore** (removed from output)
95
+
96
+ ---
97
+
98
+ **Strategy reference** • **Auto** – rule-based content generation. • **LLM** –
99
+ generate content with an LLM. • **Ignore** – exclude the segment entirely.
100
+ """
101
+
102
+ segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]]
103
+ """Controls the segmentation strategy:
104
+
105
+ - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
106
+ `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
107
+ segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
108
+ - `Page`: Treats each page as a single segment. Faster processing, but without
109
+ layout element detection and only simple chunking.
110
+ """
111
+
112
+
113
+ class ChunkProcessingTokenizerEnum(TypedDict, total=False):
114
+ enum: Required[
115
+ Annotated[Literal["Word", "Cl100kBase", "XlmRobertaBase", "BertBaseUncased"], PropertyInfo(alias="Enum")]
116
+ ]
117
+ """Use one of the predefined tokenizer types"""
118
+
119
+
120
+ class ChunkProcessingTokenizerString(TypedDict, total=False):
121
+ string: Required[Annotated[str, PropertyInfo(alias="String")]]
122
+ """
123
+ Use any Hugging Face tokenizer by specifying its model ID. Examples:
124
+ "Qwen/Qwen-tokenizer", "facebook/bart-large"
125
+ """
126
+
127
+
128
+ ChunkProcessingTokenizer: TypeAlias = Union[ChunkProcessingTokenizerEnum, ChunkProcessingTokenizerString]
129
+
130
+
131
+ class ChunkProcessing(TypedDict, total=False):
132
+ ignore_headers_and_footers: Optional[bool]
133
+ """DEPRECATED: use `segment_processing.ignore` instead"""
134
+
135
+ target_length: int
136
+ """The target number of words in each chunk.
137
+
138
+ If 0, each chunk will contain a single segment.
139
+ """
140
+
141
+ tokenizer: ChunkProcessingTokenizer
142
+ """The tokenizer to use for the chunking process."""
143
+
144
+
145
+ class LlmProcessingFallbackStrategyModel(TypedDict, total=False):
146
+ model: Required[Annotated[str, PropertyInfo(alias="Model")]]
147
+ """Use a specific model as fallback"""
148
+
149
+
150
+ LlmProcessingFallbackStrategy: TypeAlias = Union[Literal["None", "Default"], LlmProcessingFallbackStrategyModel]
151
+
152
+
153
+ class LlmProcessing(TypedDict, total=False):
154
+ fallback_strategy: LlmProcessingFallbackStrategy
155
+ """The fallback strategy to use for the LLMs in the task."""
156
+
157
+ llm_model_id: Optional[str]
158
+ """The ID of the model to use for the task.
159
+
160
+ If not provided, the default model will be used. Please check the documentation
161
+ for the model you want to use.
162
+ """
163
+
164
+ max_completion_tokens: Optional[int]
165
+ """The maximum number of tokens to generate."""
166
+
167
+ temperature: float
168
+ """The temperature to use for the LLM."""
169
+
170
+
171
+ class SegmentProcessingCaption(TypedDict, total=False):
172
+ crop_image: Literal["All", "Auto"]
173
+ """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
174
+
175
+ - `All` crops all images in the item
176
+ - `Auto` crops images only if required for post-processing
177
+ """
178
+
179
+ description: bool
180
+ """Generate LLM descriptions for this segment"""
181
+
182
+ embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
183
+ """**DEPRECATED**: `embed` field is auto populated"""
184
+
185
+ extended_context: bool
186
+ """Use the full page image as context for LLM generation"""
187
+
188
+ format: Literal["Html", "Markdown"]
189
+
190
+ html: Optional[Literal["LLM", "Auto", "Ignore"]]
191
+ """**DEPRECATED**: Use `format: html` and `strategy` instead."""
192
+
193
+ llm: Optional[str]
194
+ """**DEPRECATED**: use description instead"""
195
+
196
+ markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
197
+ """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
198
+
199
+ strategy: Literal["LLM", "Auto", "Ignore"]
200
+
201
+
202
+ class SegmentProcessingFootnote(TypedDict, total=False):
203
+ crop_image: Literal["All", "Auto"]
204
+ """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
205
+
206
+ - `All` crops all images in the item
207
+ - `Auto` crops images only if required for post-processing
208
+ """
209
+
210
+ description: bool
211
+ """Generate LLM descriptions for this segment"""
212
+
213
+ embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
214
+ """**DEPRECATED**: `embed` field is auto populated"""
215
+
216
+ extended_context: bool
217
+ """Use the full page image as context for LLM generation"""
218
+
219
+ format: Literal["Html", "Markdown"]
220
+
221
+ html: Optional[Literal["LLM", "Auto", "Ignore"]]
222
+ """**DEPRECATED**: Use `format: html` and `strategy` instead."""
223
+
224
+ llm: Optional[str]
225
+ """**DEPRECATED**: use description instead"""
226
+
227
+ markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
228
+ """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
229
+
230
+ strategy: Literal["LLM", "Auto", "Ignore"]
231
+
232
+
233
+ class SegmentProcessingFormula(TypedDict, total=False):
234
+ crop_image: Literal["All", "Auto"]
235
+ """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
236
+
237
+ - `All` crops all images in the item
238
+ - `Auto` crops images only if required for post-processing
239
+ """
240
+
241
+ description: bool
242
+ """Generate LLM descriptions for this segment"""
243
+
244
+ embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
245
+ """**DEPRECATED**: `embed` field is auto populated"""
246
+
247
+ extended_context: bool
248
+ """Use the full page image as context for LLM generation"""
249
+
250
+ format: Literal["Html", "Markdown"]
251
+
252
+ html: Optional[Literal["LLM", "Auto", "Ignore"]]
253
+ """**DEPRECATED**: Use `format: html` and `strategy` instead."""
254
+
255
+ llm: Optional[str]
256
+ """**DEPRECATED**: use description instead"""
257
+
258
+ markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
259
+ """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
260
+
261
+ strategy: Literal["LLM", "Auto", "Ignore"]
262
+
263
+
264
+ class SegmentProcessingListItem(TypedDict, total=False):
265
+ crop_image: Literal["All", "Auto"]
266
+ """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
267
+
268
+ - `All` crops all images in the item
269
+ - `Auto` crops images only if required for post-processing
270
+ """
271
+
272
+ description: bool
273
+ """Generate LLM descriptions for this segment"""
274
+
275
+ embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
276
+ """**DEPRECATED**: `embed` field is auto populated"""
277
+
278
+ extended_context: bool
279
+ """Use the full page image as context for LLM generation"""
280
+
281
+ format: Literal["Html", "Markdown"]
282
+
283
+ html: Optional[Literal["LLM", "Auto", "Ignore"]]
284
+ """**DEPRECATED**: Use `format: html` and `strategy` instead."""
285
+
286
+ llm: Optional[str]
287
+ """**DEPRECATED**: use description instead"""
288
+
289
+ markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
290
+ """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
291
+
292
+ strategy: Literal["LLM", "Auto", "Ignore"]
293
+
294
+
295
+ class SegmentProcessingPage(TypedDict, total=False):
296
+ crop_image: Literal["All", "Auto"]
297
+ """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
298
+
299
+ - `All` crops all images in the item
300
+ - `Auto` crops images only if required for post-processing
301
+ """
302
+
303
+ description: bool
304
+ """Generate LLM descriptions for this segment"""
305
+
306
+ embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
307
+ """**DEPRECATED**: `embed` field is auto populated"""
308
+
309
+ extended_context: bool
310
+ """Use the full page image as context for LLM generation"""
311
+
312
+ format: Literal["Html", "Markdown"]
313
+
314
+ html: Optional[Literal["LLM", "Auto", "Ignore"]]
315
+ """**DEPRECATED**: Use `format: html` and `strategy` instead."""
316
+
317
+ llm: Optional[str]
318
+ """**DEPRECATED**: use description instead"""
319
+
320
+ markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
321
+ """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
322
+
323
+ strategy: Literal["LLM", "Auto", "Ignore"]
324
+
325
+
326
+ class SegmentProcessingPageFooter(TypedDict, total=False):
327
+ crop_image: Literal["All", "Auto"]
328
+ """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
329
+
330
+ - `All` crops all images in the item
331
+ - `Auto` crops images only if required for post-processing
332
+ """
333
+
334
+ description: bool
335
+ """Generate LLM descriptions for this segment"""
336
+
337
+ embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
338
+ """**DEPRECATED**: `embed` field is auto populated"""
339
+
340
+ extended_context: bool
341
+ """Use the full page image as context for LLM generation"""
342
+
343
+ format: Literal["Html", "Markdown"]
344
+
345
+ html: Optional[Literal["LLM", "Auto", "Ignore"]]
346
+ """**DEPRECATED**: Use `format: html` and `strategy` instead."""
347
+
348
+ llm: Optional[str]
349
+ """**DEPRECATED**: use description instead"""
350
+
351
+ markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
352
+ """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
353
+
354
+ strategy: Literal["LLM", "Auto", "Ignore"]
355
+
356
+
357
+ class SegmentProcessingPageHeader(TypedDict, total=False):
358
+ crop_image: Literal["All", "Auto"]
359
+ """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
360
+
361
+ - `All` crops all images in the item
362
+ - `Auto` crops images only if required for post-processing
363
+ """
364
+
365
+ description: bool
366
+ """Generate LLM descriptions for this segment"""
367
+
368
+ embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
369
+ """**DEPRECATED**: `embed` field is auto populated"""
370
+
371
+ extended_context: bool
372
+ """Use the full page image as context for LLM generation"""
373
+
374
+ format: Literal["Html", "Markdown"]
375
+
376
+ html: Optional[Literal["LLM", "Auto", "Ignore"]]
377
+ """**DEPRECATED**: Use `format: html` and `strategy` instead."""
378
+
379
+ llm: Optional[str]
380
+ """**DEPRECATED**: use description instead"""
381
+
382
+ markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
383
+ """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
384
+
385
+ strategy: Literal["LLM", "Auto", "Ignore"]
386
+
387
+
388
+ class SegmentProcessingPicture(TypedDict, total=False):
389
+ crop_image: Literal["All", "Auto"]
390
+ """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
391
+
392
+ - `All` crops all images in the item
393
+ - `Auto` crops images only if required for post-processing
394
+ """
395
+
396
+ description: bool
397
+ """Generate LLM descriptions for this segment"""
398
+
399
+ embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
400
+ """**DEPRECATED**: `embed` field is auto populated"""
401
+
402
+ extended_context: bool
403
+ """Use the full page image as context for LLM generation"""
404
+
405
+ format: Literal["Html", "Markdown"]
406
+
407
+ html: Optional[Literal["LLM", "Auto", "Ignore"]]
408
+ """**DEPRECATED**: Use `format: html` and `strategy` instead."""
409
+
410
+ llm: Optional[str]
411
+ """**DEPRECATED**: use description instead"""
412
+
413
+ markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
414
+ """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
415
+
416
+ strategy: Literal["LLM", "Auto", "Ignore"]
417
+
418
+
419
+ class SegmentProcessingSectionHeader(TypedDict, total=False):
420
+ crop_image: Literal["All", "Auto"]
421
+ """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
422
+
423
+ - `All` crops all images in the item
424
+ - `Auto` crops images only if required for post-processing
425
+ """
426
+
427
+ description: bool
428
+ """Generate LLM descriptions for this segment"""
429
+
430
+ embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
431
+ """**DEPRECATED**: `embed` field is auto populated"""
432
+
433
+ extended_context: bool
434
+ """Use the full page image as context for LLM generation"""
435
+
436
+ format: Literal["Html", "Markdown"]
437
+
438
+ html: Optional[Literal["LLM", "Auto", "Ignore"]]
439
+ """**DEPRECATED**: Use `format: html` and `strategy` instead."""
440
+
441
+ llm: Optional[str]
442
+ """**DEPRECATED**: use description instead"""
443
+
444
+ markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
445
+ """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
446
+
447
+ strategy: Literal["LLM", "Auto", "Ignore"]
448
+
449
+
450
+ class SegmentProcessingTable(TypedDict, total=False):
451
+ crop_image: Literal["All", "Auto"]
452
+ """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
453
+
454
+ - `All` crops all images in the item
455
+ - `Auto` crops images only if required for post-processing
456
+ """
457
+
458
+ description: bool
459
+ """Generate LLM descriptions for this segment"""
460
+
461
+ embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
462
+ """**DEPRECATED**: `embed` field is auto populated"""
463
+
464
+ extended_context: bool
465
+ """Use the full page image as context for LLM generation"""
466
+
467
+ format: Literal["Html", "Markdown"]
468
+
469
+ html: Optional[Literal["LLM", "Auto", "Ignore"]]
470
+ """**DEPRECATED**: Use `format: html` and `strategy` instead."""
471
+
472
+ llm: Optional[str]
473
+ """**DEPRECATED**: use description instead"""
474
+
475
+ markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
476
+ """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
477
+
478
+ strategy: Literal["LLM", "Auto", "Ignore"]
479
+
480
+
481
+ class SegmentProcessingText(TypedDict, total=False):
482
+ crop_image: Literal["All", "Auto"]
483
+ """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
484
+
485
+ - `All` crops all images in the item
486
+ - `Auto` crops images only if required for post-processing
487
+ """
488
+
489
+ description: bool
490
+ """Generate LLM descriptions for this segment"""
491
+
492
+ embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
493
+ """**DEPRECATED**: `embed` field is auto populated"""
494
+
495
+ extended_context: bool
496
+ """Use the full page image as context for LLM generation"""
497
+
498
+ format: Literal["Html", "Markdown"]
499
+
500
+ html: Optional[Literal["LLM", "Auto", "Ignore"]]
501
+ """**DEPRECATED**: Use `format: html` and `strategy` instead."""
502
+
503
+ llm: Optional[str]
504
+ """**DEPRECATED**: use description instead"""
505
+
506
+ markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
507
+ """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
508
+
509
+ strategy: Literal["LLM", "Auto", "Ignore"]
510
+
511
+
512
+ class SegmentProcessingTitle(TypedDict, total=False):
513
+ crop_image: Literal["All", "Auto"]
514
+ """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
515
+
516
+ - `All` crops all images in the item
517
+ - `Auto` crops images only if required for post-processing
518
+ """
519
+
520
+ description: bool
521
+ """Generate LLM descriptions for this segment"""
522
+
523
+ embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
524
+ """**DEPRECATED**: `embed` field is auto populated"""
525
+
526
+ extended_context: bool
527
+ """Use the full page image as context for LLM generation"""
528
+
529
+ format: Literal["Html", "Markdown"]
530
+
531
+ html: Optional[Literal["LLM", "Auto", "Ignore"]]
532
+ """**DEPRECATED**: Use `format: html` and `strategy` instead."""
533
+
534
+ llm: Optional[str]
535
+ """**DEPRECATED**: use description instead"""
536
+
537
+ markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
538
+ """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
539
+
540
+ strategy: Literal["LLM", "Auto", "Ignore"]
541
+
542
+
543
+ class SegmentProcessing(TypedDict, total=False):
544
+ caption: Annotated[Optional[SegmentProcessingCaption], PropertyInfo(alias="Caption")]
545
+ """Controls the processing and generation for the segment.
546
+
547
+ - `crop_image` controls whether to crop the file's images to the segment's
548
+ bounding box. The cropped image will be stored in the segment's `image` field.
549
+ Use `All` to always crop, or `Auto` to only crop when needed for
550
+ post-processing.
551
+ - `format` specifies the output format: `Html` or `Markdown`
552
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
553
+ - `Auto`: Process content automatically
554
+ - `LLM`: Use large language models for processing
555
+ - `Ignore`: Exclude segments from final output
556
+ - `description` enables LLM-generated descriptions for segments
557
+
558
+ **Deprecated fields (for backwards compatibility):**
559
+
560
+ - `llm` - **DEPRECATED**: Use `description` instead
561
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
562
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
563
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
564
+ """
565
+
566
+ footnote: Annotated[Optional[SegmentProcessingFootnote], PropertyInfo(alias="Footnote")]
567
+ """Controls the processing and generation for the segment.
568
+
569
+ - `crop_image` controls whether to crop the file's images to the segment's
570
+ bounding box. The cropped image will be stored in the segment's `image` field.
571
+ Use `All` to always crop, or `Auto` to only crop when needed for
572
+ post-processing.
573
+ - `format` specifies the output format: `Html` or `Markdown`
574
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
575
+ - `Auto`: Process content automatically
576
+ - `LLM`: Use large language models for processing
577
+ - `Ignore`: Exclude segments from final output
578
+ - `description` enables LLM-generated descriptions for segments
579
+
580
+ **Deprecated fields (for backwards compatibility):**
581
+
582
+ - `llm` - **DEPRECATED**: Use `description` instead
583
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
584
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
585
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
586
+ """
587
+
588
+ formula: Annotated[Optional[SegmentProcessingFormula], PropertyInfo(alias="Formula")]
589
+ """Controls the processing and generation for the segment.
590
+
591
+ - `crop_image` controls whether to crop the file's images to the segment's
592
+ bounding box. The cropped image will be stored in the segment's `image` field.
593
+ Use `All` to always crop, or `Auto` to only crop when needed for
594
+ post-processing.
595
+ - `format` specifies the output format: `Html` or `Markdown`
596
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
597
+ - `Auto`: Process content automatically
598
+ - `LLM`: Use large language models for processing
599
+ - `Ignore`: Exclude segments from final output
600
+ - `description` enables LLM-generated descriptions for segments
601
+
602
+ **Deprecated fields (for backwards compatibility):**
603
+
604
+ - `llm` - **DEPRECATED**: Use `description` instead
605
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
606
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
607
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
608
+ """
609
+
610
+ list_item: Annotated[Optional[SegmentProcessingListItem], PropertyInfo(alias="ListItem")]
611
+ """Controls the processing and generation for the segment.
612
+
613
+ - `crop_image` controls whether to crop the file's images to the segment's
614
+ bounding box. The cropped image will be stored in the segment's `image` field.
615
+ Use `All` to always crop, or `Auto` to only crop when needed for
616
+ post-processing.
617
+ - `format` specifies the output format: `Html` or `Markdown`
618
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
619
+ - `Auto`: Process content automatically
620
+ - `LLM`: Use large language models for processing
621
+ - `Ignore`: Exclude segments from final output
622
+ - `description` enables LLM-generated descriptions for segments
623
+
624
+ **Deprecated fields (for backwards compatibility):**
625
+
626
+ - `llm` - **DEPRECATED**: Use `description` instead
627
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
628
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
629
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
630
+ """
631
+
632
+ page: Annotated[Optional[SegmentProcessingPage], PropertyInfo(alias="Page")]
633
+ """Controls the processing and generation for the segment.
634
+
635
+ - `crop_image` controls whether to crop the file's images to the segment's
636
+ bounding box. The cropped image will be stored in the segment's `image` field.
637
+ Use `All` to always crop, or `Auto` to only crop when needed for
638
+ post-processing.
639
+ - `format` specifies the output format: `Html` or `Markdown`
640
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
641
+ - `Auto`: Process content automatically
642
+ - `LLM`: Use large language models for processing
643
+ - `Ignore`: Exclude segments from final output
644
+ - `description` enables LLM-generated descriptions for segments
645
+
646
+ **Deprecated fields (for backwards compatibility):**
647
+
648
+ - `llm` - **DEPRECATED**: Use `description` instead
649
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
650
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
651
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
652
+ """
653
+
654
+ page_footer: Annotated[Optional[SegmentProcessingPageFooter], PropertyInfo(alias="PageFooter")]
655
+ """Controls the processing and generation for the segment.
656
+
657
+ - `crop_image` controls whether to crop the file's images to the segment's
658
+ bounding box. The cropped image will be stored in the segment's `image` field.
659
+ Use `All` to always crop, or `Auto` to only crop when needed for
660
+ post-processing.
661
+ - `format` specifies the output format: `Html` or `Markdown`
662
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
663
+ - `Auto`: Process content automatically
664
+ - `LLM`: Use large language models for processing
665
+ - `Ignore`: Exclude segments from final output
666
+ - `description` enables LLM-generated descriptions for segments
667
+
668
+ **Deprecated fields (for backwards compatibility):**
669
+
670
+ - `llm` - **DEPRECATED**: Use `description` instead
671
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
672
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
673
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
674
+ """
675
+
676
+ page_header: Annotated[Optional[SegmentProcessingPageHeader], PropertyInfo(alias="PageHeader")]
677
+ """Controls the processing and generation for the segment.
678
+
679
+ - `crop_image` controls whether to crop the file's images to the segment's
680
+ bounding box. The cropped image will be stored in the segment's `image` field.
681
+ Use `All` to always crop, or `Auto` to only crop when needed for
682
+ post-processing.
683
+ - `format` specifies the output format: `Html` or `Markdown`
684
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
685
+ - `Auto`: Process content automatically
686
+ - `LLM`: Use large language models for processing
687
+ - `Ignore`: Exclude segments from final output
688
+ - `description` enables LLM-generated descriptions for segments
689
+
690
+ **Deprecated fields (for backwards compatibility):**
691
+
692
+ - `llm` - **DEPRECATED**: Use `description` instead
693
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
694
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
695
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
696
+ """
697
+
698
+ picture: Annotated[Optional[SegmentProcessingPicture], PropertyInfo(alias="Picture")]
699
+ """Controls the processing and generation for the segment.
700
+
701
+ - `crop_image` controls whether to crop the file's images to the segment's
702
+ bounding box. The cropped image will be stored in the segment's `image` field.
703
+ Use `All` to always crop, or `Auto` to only crop when needed for
704
+ post-processing.
705
+ - `format` specifies the output format: `Html` or `Markdown`
706
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
707
+ - `Auto`: Process content automatically
708
+ - `LLM`: Use large language models for processing
709
+ - `Ignore`: Exclude segments from final output
710
+ - `description` enables LLM-generated descriptions for segments
711
+
712
+ **Deprecated fields (for backwards compatibility):**
713
+
714
+ - `llm` - **DEPRECATED**: Use `description` instead
715
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
716
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
717
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
718
+ """
719
+
720
+ section_header: Annotated[Optional[SegmentProcessingSectionHeader], PropertyInfo(alias="SectionHeader")]
721
+ """Controls the processing and generation for the segment.
722
+
723
+ - `crop_image` controls whether to crop the file's images to the segment's
724
+ bounding box. The cropped image will be stored in the segment's `image` field.
725
+ Use `All` to always crop, or `Auto` to only crop when needed for
726
+ post-processing.
727
+ - `format` specifies the output format: `Html` or `Markdown`
728
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
729
+   - `Auto`: Process content automatically
730
+   - `LLM`: Use large language models for processing
731
+   - `Ignore`: Exclude segments from final output
732
+ - `description` enables LLM-generated descriptions for segments
733
+
734
+ **Deprecated fields (for backwards compatibility):**
735
+
736
+ - `llm` - **DEPRECATED**: Use `description` instead
737
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
738
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
739
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
740
+ """
741
+
742
+ table: Annotated[Optional[SegmentProcessingTable], PropertyInfo(alias="Table")]
743
+ """Controls the processing and generation for the segment.
744
+
745
+ - `crop_image` controls whether to crop the file's images to the segment's
746
+ bounding box. The cropped image will be stored in the segment's `image` field.
747
+ Use `All` to always crop, or `Auto` to only crop when needed for
748
+ post-processing.
749
+ - `format` specifies the output format: `Html` or `Markdown`
750
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
751
+   - `Auto`: Process content automatically
752
+   - `LLM`: Use large language models for processing
753
+   - `Ignore`: Exclude segments from final output
754
+ - `description` enables LLM-generated descriptions for segments
755
+
756
+ **Deprecated fields (for backwards compatibility):**
757
+
758
+ - `llm` - **DEPRECATED**: Use `description` instead
759
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
760
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
761
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
762
+ """
763
+
764
+ text: Annotated[Optional[SegmentProcessingText], PropertyInfo(alias="Text")]
765
+ """Controls the processing and generation for the segment.
766
+
767
+ - `crop_image` controls whether to crop the file's images to the segment's
768
+ bounding box. The cropped image will be stored in the segment's `image` field.
769
+ Use `All` to always crop, or `Auto` to only crop when needed for
770
+ post-processing.
771
+ - `format` specifies the output format: `Html` or `Markdown`
772
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
773
+   - `Auto`: Process content automatically
774
+   - `LLM`: Use large language models for processing
775
+   - `Ignore`: Exclude segments from final output
776
+ - `description` enables LLM-generated descriptions for segments
777
+
778
+ **Deprecated fields (for backwards compatibility):**
779
+
780
+ - `llm` - **DEPRECATED**: Use `description` instead
781
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
782
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
783
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
784
+ """
785
+
786
+ title: Annotated[Optional[SegmentProcessingTitle], PropertyInfo(alias="Title")]
787
+ """Controls the processing and generation for the segment.
788
+
789
+ - `crop_image` controls whether to crop the file's images to the segment's
790
+ bounding box. The cropped image will be stored in the segment's `image` field.
791
+ Use `All` to always crop, or `Auto` to only crop when needed for
792
+ post-processing.
793
+ - `format` specifies the output format: `Html` or `Markdown`
794
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
795
+   - `Auto`: Process content automatically
796
+   - `LLM`: Use large language models for processing
797
+   - `Ignore`: Exclude segments from final output
798
+ - `description` enables LLM-generated descriptions for segments
799
+
800
+ **Deprecated fields (for backwards compatibility):**
801
+
802
+ - `llm` - **DEPRECATED**: Use `description` instead
803
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
804
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
805
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
806
+ """