chunkr-ai 0.1.0a5__py3-none-any.whl → 0.1.0a7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. chunkr_ai/__init__.py +2 -0
  2. chunkr_ai/_client.py +31 -3
  3. chunkr_ai/_constants.py +5 -5
  4. chunkr_ai/_exceptions.py +4 -0
  5. chunkr_ai/_models.py +1 -1
  6. chunkr_ai/_types.py +35 -1
  7. chunkr_ai/_utils/__init__.py +1 -0
  8. chunkr_ai/_utils/_typing.py +5 -0
  9. chunkr_ai/_version.py +1 -1
  10. chunkr_ai/resources/__init__.py +14 -0
  11. chunkr_ai/resources/files.py +3 -3
  12. chunkr_ai/resources/tasks/__init__.py +14 -0
  13. chunkr_ai/resources/tasks/extract.py +409 -0
  14. chunkr_ai/resources/tasks/parse.py +102 -346
  15. chunkr_ai/resources/tasks/tasks.py +62 -14
  16. chunkr_ai/resources/webhooks.py +193 -0
  17. chunkr_ai/types/__init__.py +27 -1
  18. chunkr_ai/types/bounding_box.py +19 -0
  19. chunkr_ai/types/cell.py +39 -0
  20. chunkr_ai/types/cell_style.py +28 -0
  21. chunkr_ai/types/chunk.py +40 -0
  22. chunkr_ai/types/chunk_processing.py +40 -0
  23. chunkr_ai/types/chunk_processing_param.py +42 -0
  24. chunkr_ai/types/extract_configuration.py +24 -0
  25. chunkr_ai/types/extract_output_response.py +19 -0
  26. chunkr_ai/types/file_create_params.py +2 -1
  27. chunkr_ai/types/file_info.py +21 -0
  28. chunkr_ai/types/generation_config.py +29 -0
  29. chunkr_ai/types/generation_config_param.py +29 -0
  30. chunkr_ai/types/llm_processing.py +36 -0
  31. chunkr_ai/types/llm_processing_param.py +36 -0
  32. chunkr_ai/types/ocr_result.py +28 -0
  33. chunkr_ai/types/page.py +27 -0
  34. chunkr_ai/types/parse_configuration.py +64 -0
  35. chunkr_ai/types/parse_configuration_param.py +65 -0
  36. chunkr_ai/types/parse_output_response.py +29 -0
  37. chunkr_ai/types/segment.py +109 -0
  38. chunkr_ai/types/segment_processing.py +228 -0
  39. chunkr_ai/types/segment_processing_param.py +229 -0
  40. chunkr_ai/types/task_extract_updated_webhook_event.py +22 -0
  41. chunkr_ai/types/task_list_params.py +7 -1
  42. chunkr_ai/types/task_parse_updated_webhook_event.py +22 -0
  43. chunkr_ai/types/task_response.py +68 -0
  44. chunkr_ai/types/tasks/__init__.py +7 -1
  45. chunkr_ai/types/tasks/extract_create_params.py +47 -0
  46. chunkr_ai/types/tasks/extract_create_response.py +214 -0
  47. chunkr_ai/types/tasks/extract_get_params.py +21 -0
  48. chunkr_ai/types/tasks/extract_get_response.py +214 -0
  49. chunkr_ai/types/tasks/parse_create_params.py +25 -805
  50. chunkr_ai/types/tasks/parse_create_response.py +55 -0
  51. chunkr_ai/types/tasks/parse_get_params.py +21 -0
  52. chunkr_ai/types/tasks/parse_get_response.py +55 -0
  53. chunkr_ai/types/unwrap_webhook_event.py +11 -0
  54. chunkr_ai/types/version_info.py +31 -0
  55. chunkr_ai/types/webhook_url_response.py +9 -0
  56. {chunkr_ai-0.1.0a5.dist-info → chunkr_ai-0.1.0a7.dist-info}/METADATA +14 -13
  57. chunkr_ai-0.1.0a7.dist-info/RECORD +86 -0
  58. chunkr_ai/types/task.py +0 -1225
  59. chunkr_ai/types/tasks/parse_update_params.py +0 -857
  60. chunkr_ai-0.1.0a5.dist-info/RECORD +0 -52
  61. {chunkr_ai-0.1.0a5.dist-info → chunkr_ai-0.1.0a7.dist-info}/WHEEL +0 -0
  62. {chunkr_ai-0.1.0a5.dist-info → chunkr_ai-0.1.0a7.dist-info}/licenses/LICENSE +0 -0
chunkr_ai/types/tasks/parse_update_params.py (file removed)
@@ -1,857 +0,0 @@
1
- # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
-
3
- from __future__ import annotations
4
-
5
- from typing import List, Union, Optional
6
- from typing_extensions import Literal, Required, Annotated, TypeAlias, TypedDict
7
-
8
- from ..._utils import PropertyInfo
9
-
10
- __all__ = [
11
- "ParseUpdateParams",
12
- "ChunkProcessing",
13
- "ChunkProcessingTokenizer",
14
- "ChunkProcessingTokenizerEnum",
15
- "ChunkProcessingTokenizerString",
16
- "LlmProcessing",
17
- "LlmProcessingFallbackStrategy",
18
- "LlmProcessingFallbackStrategyModel",
19
- "SegmentProcessing",
20
- "SegmentProcessingCaption",
21
- "SegmentProcessingFootnote",
22
- "SegmentProcessingFormula",
23
- "SegmentProcessingListItem",
24
- "SegmentProcessingPage",
25
- "SegmentProcessingPageFooter",
26
- "SegmentProcessingPageHeader",
27
- "SegmentProcessingPicture",
28
- "SegmentProcessingSectionHeader",
29
- "SegmentProcessingTable",
30
- "SegmentProcessingText",
31
- "SegmentProcessingTitle",
32
- ]
33
-
34
-
35
- class ParseUpdateParams(TypedDict, total=False):
36
- base64_urls: bool
37
- """Whether to return base64 encoded URLs. If false, presigned URLs are returned."""
38
-
39
- include_chunks: bool
40
- """Whether to include chunks in the output response"""
41
-
42
- wait_for_completion: bool
43
- """
44
- If true, server holds briefly and may return 200 when done; otherwise returns
45
- 408/409 with Retry-After headers
46
- """
47
-
48
- chunk_processing: Optional[ChunkProcessing]
49
- """Controls the setting for the chunking and post-processing of each chunk."""
50
-
51
- error_handling: Optional[Literal["Fail", "Continue"]]
52
- """Controls how errors are handled during processing:
53
-
54
- - `Fail`: Stops processing and fails the task when any error occurs
55
- - `Continue`: Attempts to continue processing despite non-critical errors (eg.
56
- LLM refusals etc.)
57
- """
58
-
59
- expires_in: Optional[int]
60
- """
61
- The number of seconds until task is deleted. Expired tasks can **not** be
62
- updated, polled or accessed via web interface.
63
- """
64
-
65
- high_resolution: Optional[bool]
66
- """Whether to use high-resolution images for cropping and post-processing.
67
-
68
- (Latency penalty: ~7 seconds per page)
69
- """
70
-
71
- llm_processing: Optional[LlmProcessing]
72
- """Controls the LLM used for the task."""
73
-
74
- ocr_strategy: Optional[Literal["All", "Auto"]]
75
- """Controls the Optical Character Recognition (OCR) strategy.
76
-
77
- - `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
78
- - `Auto`: Selectively applies OCR only to pages with missing or low-quality
79
- text. When text layer is present the bounding boxes from the text layer are
80
- used.
81
- """
82
-
83
- pipeline: Optional[Literal["Azure", "Chunkr"]]
84
- """
85
- Choose the provider whose models will be used for segmentation and OCR. The
86
- output will be unified to the Chunkr `output` format.
87
- """
88
-
89
- segment_processing: Optional[SegmentProcessing]
90
- """Defines how each segment type is handled when generating the final output.
91
-
92
- Each segment uses one of three strategies. The chosen strategy controls:
93
-
94
- - Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
95
- - How the content is produced (rule-based vs. LLM).
96
- - The output format (`Html` or `Markdown`).
97
-
98
- Optional flags such as image **cropping**, **extended context**, and
99
- **descriptions** further refine behaviour.
100
-
101
- **Default strategy per segment**
102
-
103
- - `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` → **Auto**
104
- (Markdown, description off)
105
- - `Table` → **LLM** (HTML, description on)
106
- - `Picture` → **LLM** (Markdown, description off, cropping _All_)
107
- - `Formula`, `Page` → **LLM** (Markdown, description off)
108
- - `PageHeader`, `PageFooter` → **Ignore** (removed from output)
109
-
110
- **Strategy reference**
111
-
112
- - **Auto** – rule-based content generation.
113
- - **LLM** – generate content with an LLM.
114
- - **Ignore** – exclude the segment entirely.
115
- """
116
-
117
- segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]]
118
- """Controls the segmentation strategy:
119
-
120
- - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
121
- `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
122
- segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
123
- - `Page`: Treats each page as a single segment. Faster processing, but without
124
- layout element detection and only simple chunking.
125
- """
126
-
127
-
128
- class ChunkProcessingTokenizerEnum(TypedDict, total=False):
129
- enum: Required[
130
- Annotated[Literal["Word", "Cl100kBase", "XlmRobertaBase", "BertBaseUncased"], PropertyInfo(alias="Enum")]
131
- ]
132
- """Use one of the predefined tokenizer types"""
133
-
134
-
135
- class ChunkProcessingTokenizerString(TypedDict, total=False):
136
- string: Required[Annotated[str, PropertyInfo(alias="String")]]
137
- """
138
- Use any Hugging Face tokenizer by specifying its model ID Examples:
139
- "Qwen/Qwen-tokenizer", "facebook/bart-large"
140
- """
141
-
142
-
143
- ChunkProcessingTokenizer: TypeAlias = Union[ChunkProcessingTokenizerEnum, ChunkProcessingTokenizerString]
144
-
145
-
146
- class ChunkProcessing(TypedDict, total=False):
147
- ignore_headers_and_footers: Optional[bool]
148
- """DEPRECATED: use `segment_processing.ignore` instead"""
149
-
150
- target_length: int
151
- """The target number of words in each chunk.
152
-
153
- If 0, each chunk will contain a single segment.
154
- """
155
-
156
- tokenizer: ChunkProcessingTokenizer
157
- """The tokenizer to use for the chunking process."""
158
-
159
-
160
- class LlmProcessingFallbackStrategyModel(TypedDict, total=False):
161
- model: Required[Annotated[str, PropertyInfo(alias="Model")]]
162
- """Use a specific model as fallback"""
163
-
164
-
165
- LlmProcessingFallbackStrategy: TypeAlias = Union[Literal["None", "Default"], LlmProcessingFallbackStrategyModel]
166
-
167
-
168
- class LlmProcessing(TypedDict, total=False):
169
- fallback_strategy: LlmProcessingFallbackStrategy
170
- """The fallback strategy to use for the LLMs in the task."""
171
-
172
- llm_model_id: Optional[str]
173
- """The ID of the model to use for the task.
174
-
175
- If not provided, the default model will be used. Please check the documentation
176
- for the model you want to use.
177
- """
178
-
179
- max_completion_tokens: Optional[int]
180
- """The maximum number of tokens to generate."""
181
-
182
- temperature: float
183
- """The temperature to use for the LLM."""
184
-
185
-
186
- class SegmentProcessingCaption(TypedDict, total=False):
187
- crop_image: Literal["All", "Auto"]
188
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
189
-
190
- - `All` crops all images in the item
191
- - `Auto` crops images only if required for post-processing
192
- """
193
-
194
- description: bool
195
- """Generate LLM descriptions for this segment"""
196
-
197
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
198
- """**DEPRECATED**: `embed` field is auto populated"""
199
-
200
- extended_context: bool
201
- """Use the full page image as context for LLM generation"""
202
-
203
- format: Literal["Html", "Markdown"]
204
-
205
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
206
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
207
-
208
- llm: Optional[str]
209
- """**DEPRECATED**: use description instead"""
210
-
211
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
212
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
213
-
214
- strategy: Literal["LLM", "Auto", "Ignore"]
215
-
216
-
217
- class SegmentProcessingFootnote(TypedDict, total=False):
218
- crop_image: Literal["All", "Auto"]
219
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
220
-
221
- - `All` crops all images in the item
222
- - `Auto` crops images only if required for post-processing
223
- """
224
-
225
- description: bool
226
- """Generate LLM descriptions for this segment"""
227
-
228
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
229
- """**DEPRECATED**: `embed` field is auto populated"""
230
-
231
- extended_context: bool
232
- """Use the full page image as context for LLM generation"""
233
-
234
- format: Literal["Html", "Markdown"]
235
-
236
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
237
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
238
-
239
- llm: Optional[str]
240
- """**DEPRECATED**: use description instead"""
241
-
242
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
243
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
244
-
245
- strategy: Literal["LLM", "Auto", "Ignore"]
246
-
247
-
248
- class SegmentProcessingFormula(TypedDict, total=False):
249
- crop_image: Literal["All", "Auto"]
250
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
251
-
252
- - `All` crops all images in the item
253
- - `Auto` crops images only if required for post-processing
254
- """
255
-
256
- description: bool
257
- """Generate LLM descriptions for this segment"""
258
-
259
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
260
- """**DEPRECATED**: `embed` field is auto populated"""
261
-
262
- extended_context: bool
263
- """Use the full page image as context for LLM generation"""
264
-
265
- format: Literal["Html", "Markdown"]
266
-
267
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
268
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
269
-
270
- llm: Optional[str]
271
- """**DEPRECATED**: use description instead"""
272
-
273
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
274
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
275
-
276
- strategy: Literal["LLM", "Auto", "Ignore"]
277
-
278
-
279
- class SegmentProcessingListItem(TypedDict, total=False):
280
- crop_image: Literal["All", "Auto"]
281
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
282
-
283
- - `All` crops all images in the item
284
- - `Auto` crops images only if required for post-processing
285
- """
286
-
287
- description: bool
288
- """Generate LLM descriptions for this segment"""
289
-
290
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
291
- """**DEPRECATED**: `embed` field is auto populated"""
292
-
293
- extended_context: bool
294
- """Use the full page image as context for LLM generation"""
295
-
296
- format: Literal["Html", "Markdown"]
297
-
298
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
299
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
300
-
301
- llm: Optional[str]
302
- """**DEPRECATED**: use description instead"""
303
-
304
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
305
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
306
-
307
- strategy: Literal["LLM", "Auto", "Ignore"]
308
-
309
-
310
- class SegmentProcessingPage(TypedDict, total=False):
311
- crop_image: Literal["All", "Auto"]
312
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
313
-
314
- - `All` crops all images in the item
315
- - `Auto` crops images only if required for post-processing
316
- """
317
-
318
- description: bool
319
- """Generate LLM descriptions for this segment"""
320
-
321
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
322
- """**DEPRECATED**: `embed` field is auto populated"""
323
-
324
- extended_context: bool
325
- """Use the full page image as context for LLM generation"""
326
-
327
- format: Literal["Html", "Markdown"]
328
-
329
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
330
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
331
-
332
- llm: Optional[str]
333
- """**DEPRECATED**: use description instead"""
334
-
335
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
336
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
337
-
338
- strategy: Literal["LLM", "Auto", "Ignore"]
339
-
340
-
341
- class SegmentProcessingPageFooter(TypedDict, total=False):
342
- crop_image: Literal["All", "Auto"]
343
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
344
-
345
- - `All` crops all images in the item
346
- - `Auto` crops images only if required for post-processing
347
- """
348
-
349
- description: bool
350
- """Generate LLM descriptions for this segment"""
351
-
352
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
353
- """**DEPRECATED**: `embed` field is auto populated"""
354
-
355
- extended_context: bool
356
- """Use the full page image as context for LLM generation"""
357
-
358
- format: Literal["Html", "Markdown"]
359
-
360
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
361
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
362
-
363
- llm: Optional[str]
364
- """**DEPRECATED**: use description instead"""
365
-
366
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
367
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
368
-
369
- strategy: Literal["LLM", "Auto", "Ignore"]
370
-
371
-
372
- class SegmentProcessingPageHeader(TypedDict, total=False):
373
- crop_image: Literal["All", "Auto"]
374
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
375
-
376
- - `All` crops all images in the item
377
- - `Auto` crops images only if required for post-processing
378
- """
379
-
380
- description: bool
381
- """Generate LLM descriptions for this segment"""
382
-
383
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
384
- """**DEPRECATED**: `embed` field is auto populated"""
385
-
386
- extended_context: bool
387
- """Use the full page image as context for LLM generation"""
388
-
389
- format: Literal["Html", "Markdown"]
390
-
391
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
392
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
393
-
394
- llm: Optional[str]
395
- """**DEPRECATED**: use description instead"""
396
-
397
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
398
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
399
-
400
- strategy: Literal["LLM", "Auto", "Ignore"]
401
-
402
-
403
- class SegmentProcessingPicture(TypedDict, total=False):
404
- crop_image: Literal["All", "Auto"]
405
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
406
-
407
- - `All` crops all images in the item
408
- - `Auto` crops images only if required for post-processing
409
- """
410
-
411
- description: bool
412
- """Generate LLM descriptions for this segment"""
413
-
414
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
415
- """**DEPRECATED**: `embed` field is auto populated"""
416
-
417
- extended_context: bool
418
- """Use the full page image as context for LLM generation"""
419
-
420
- format: Literal["Html", "Markdown"]
421
-
422
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
423
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
424
-
425
- llm: Optional[str]
426
- """**DEPRECATED**: use description instead"""
427
-
428
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
429
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
430
-
431
- strategy: Literal["LLM", "Auto", "Ignore"]
432
-
433
-
434
- class SegmentProcessingSectionHeader(TypedDict, total=False):
435
- crop_image: Literal["All", "Auto"]
436
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
437
-
438
- - `All` crops all images in the item
439
- - `Auto` crops images only if required for post-processing
440
- """
441
-
442
- description: bool
443
- """Generate LLM descriptions for this segment"""
444
-
445
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
446
- """**DEPRECATED**: `embed` field is auto populated"""
447
-
448
- extended_context: bool
449
- """Use the full page image as context for LLM generation"""
450
-
451
- format: Literal["Html", "Markdown"]
452
-
453
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
454
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
455
-
456
- llm: Optional[str]
457
- """**DEPRECATED**: use description instead"""
458
-
459
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
460
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
461
-
462
- strategy: Literal["LLM", "Auto", "Ignore"]
463
-
464
-
465
- class SegmentProcessingTable(TypedDict, total=False):
466
- crop_image: Literal["All", "Auto"]
467
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
468
-
469
- - `All` crops all images in the item
470
- - `Auto` crops images only if required for post-processing
471
- """
472
-
473
- description: bool
474
- """Generate LLM descriptions for this segment"""
475
-
476
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
477
- """**DEPRECATED**: `embed` field is auto populated"""
478
-
479
- extended_context: bool
480
- """Use the full page image as context for LLM generation"""
481
-
482
- format: Literal["Html", "Markdown"]
483
-
484
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
485
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
486
-
487
- llm: Optional[str]
488
- """**DEPRECATED**: use description instead"""
489
-
490
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
491
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
492
-
493
- strategy: Literal["LLM", "Auto", "Ignore"]
494
-
495
-
496
- class SegmentProcessingText(TypedDict, total=False):
497
- crop_image: Literal["All", "Auto"]
498
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
499
-
500
- - `All` crops all images in the item
501
- - `Auto` crops images only if required for post-processing
502
- """
503
-
504
- description: bool
505
- """Generate LLM descriptions for this segment"""
506
-
507
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
508
- """**DEPRECATED**: `embed` field is auto populated"""
509
-
510
- extended_context: bool
511
- """Use the full page image as context for LLM generation"""
512
-
513
- format: Literal["Html", "Markdown"]
514
-
515
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
516
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
517
-
518
- llm: Optional[str]
519
- """**DEPRECATED**: use description instead"""
520
-
521
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
522
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
523
-
524
- strategy: Literal["LLM", "Auto", "Ignore"]
525
-
526
-
527
- class SegmentProcessingTitle(TypedDict, total=False):
528
- crop_image: Literal["All", "Auto"]
529
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
530
-
531
- - `All` crops all images in the item
532
- - `Auto` crops images only if required for post-processing
533
- """
534
-
535
- description: bool
536
- """Generate LLM descriptions for this segment"""
537
-
538
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
539
- """**DEPRECATED**: `embed` field is auto populated"""
540
-
541
- extended_context: bool
542
- """Use the full page image as context for LLM generation"""
543
-
544
- format: Literal["Html", "Markdown"]
545
-
546
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
547
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
548
-
549
- llm: Optional[str]
550
- """**DEPRECATED**: use description instead"""
551
-
552
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
553
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
554
-
555
- strategy: Literal["LLM", "Auto", "Ignore"]
556
-
557
-
558
- class SegmentProcessing(TypedDict, total=False):
559
- caption: Annotated[Optional[SegmentProcessingCaption], PropertyInfo(alias="Caption")]
560
- """Controls the processing and generation for the segment.
561
-
562
- - `crop_image` controls whether to crop the file's images to the segment's
563
- bounding box. The cropped image will be stored in the segment's `image` field.
564
- Use `All` to always crop, or `Auto` to only crop when needed for
565
- post-processing.
566
- - `format` specifies the output format: `Html` or `Markdown`
567
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
568
- - `Auto`: Process content automatically
569
- - `LLM`: Use large language models for processing
570
- - `Ignore`: Exclude segments from final output
571
- - `description` enables LLM-generated descriptions for segments. **Note:** This
572
- uses chunkr's own VLM models and is not configurable via LLM processing
573
- configuration.
574
- - `extended_context` uses the full page image as context for LLM generation.
575
-
576
- **Deprecated fields (for backwards compatibility):**
577
-
578
- - `llm` - **DEPRECATED**: Use `description` instead
579
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
580
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
581
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
582
- """
583
-
584
- footnote: Annotated[Optional[SegmentProcessingFootnote], PropertyInfo(alias="Footnote")]
585
- """Controls the processing and generation for the segment.
586
-
587
- - `crop_image` controls whether to crop the file's images to the segment's
588
- bounding box. The cropped image will be stored in the segment's `image` field.
589
- Use `All` to always crop, or `Auto` to only crop when needed for
590
- post-processing.
591
- - `format` specifies the output format: `Html` or `Markdown`
592
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
593
- - `Auto`: Process content automatically
594
- - `LLM`: Use large language models for processing
595
- - `Ignore`: Exclude segments from final output
596
- - `description` enables LLM-generated descriptions for segments. **Note:** This
597
- uses chunkr's own VLM models and is not configurable via LLM processing
598
- configuration.
599
- - `extended_context` uses the full page image as context for LLM generation.
600
-
601
- **Deprecated fields (for backwards compatibility):**
602
-
603
- - `llm` - **DEPRECATED**: Use `description` instead
604
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
605
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
606
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
607
- """
608
-
609
- formula: Annotated[Optional[SegmentProcessingFormula], PropertyInfo(alias="Formula")]
610
- """Controls the processing and generation for the segment.
611
-
612
- - `crop_image` controls whether to crop the file's images to the segment's
613
- bounding box. The cropped image will be stored in the segment's `image` field.
614
- Use `All` to always crop, or `Auto` to only crop when needed for
615
- post-processing.
616
- - `format` specifies the output format: `Html` or `Markdown`
617
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
618
- - `Auto`: Process content automatically
619
- - `LLM`: Use large language models for processing
620
- - `Ignore`: Exclude segments from final output
621
- - `description` enables LLM-generated descriptions for segments. **Note:** This
622
- uses chunkr's own VLM models and is not configurable via LLM processing
623
- configuration.
624
- - `extended_context` uses the full page image as context for LLM generation.
625
-
626
- **Deprecated fields (for backwards compatibility):**
627
-
628
- - `llm` - **DEPRECATED**: Use `description` instead
629
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
630
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
631
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
632
- """
633
-
634
- list_item: Annotated[Optional[SegmentProcessingListItem], PropertyInfo(alias="ListItem")]
635
- """Controls the processing and generation for the segment.
636
-
637
- - `crop_image` controls whether to crop the file's images to the segment's
638
- bounding box. The cropped image will be stored in the segment's `image` field.
639
- Use `All` to always crop, or `Auto` to only crop when needed for
640
- post-processing.
641
- - `format` specifies the output format: `Html` or `Markdown`
642
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
643
- - `Auto`: Process content automatically
644
- - `LLM`: Use large language models for processing
645
- - `Ignore`: Exclude segments from final output
646
- - `description` enables LLM-generated descriptions for segments. **Note:** This
647
- uses chunkr's own VLM models and is not configurable via LLM processing
648
- configuration.
649
- - `extended_context` uses the full page image as context for LLM generation.
650
-
651
- **Deprecated fields (for backwards compatibility):**
652
-
653
- - `llm` - **DEPRECATED**: Use `description` instead
654
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
655
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
656
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
657
- """
658
-
659
- page: Annotated[Optional[SegmentProcessingPage], PropertyInfo(alias="Page")]
660
- """Controls the processing and generation for the segment.
661
-
662
- - `crop_image` controls whether to crop the file's images to the segment's
663
- bounding box. The cropped image will be stored in the segment's `image` field.
664
- Use `All` to always crop, or `Auto` to only crop when needed for
665
- post-processing.
666
- - `format` specifies the output format: `Html` or `Markdown`
667
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
668
- - `Auto`: Process content automatically
669
- - `LLM`: Use large language models for processing
670
- - `Ignore`: Exclude segments from final output
671
- - `description` enables LLM-generated descriptions for segments. **Note:** This
672
- uses chunkr's own VLM models and is not configurable via LLM processing
673
- configuration.
674
- - `extended_context` uses the full page image as context for LLM generation.
675
-
676
- **Deprecated fields (for backwards compatibility):**
677
-
678
- - `llm` - **DEPRECATED**: Use `description` instead
679
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
680
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
681
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
682
- """
683
-
684
- page_footer: Annotated[Optional[SegmentProcessingPageFooter], PropertyInfo(alias="PageFooter")]
685
- """Controls the processing and generation for the segment.
686
-
687
- - `crop_image` controls whether to crop the file's images to the segment's
688
- bounding box. The cropped image will be stored in the segment's `image` field.
689
- Use `All` to always crop, or `Auto` to only crop when needed for
690
- post-processing.
691
- - `format` specifies the output format: `Html` or `Markdown`
692
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
693
- - `Auto`: Process content automatically
694
- - `LLM`: Use large language models for processing
695
- - `Ignore`: Exclude segments from final output
696
- - `description` enables LLM-generated descriptions for segments. **Note:** This
697
- uses chunkr's own VLM models and is not configurable via LLM processing
698
- configuration.
699
- - `extended_context` uses the full page image as context for LLM generation.
700
-
701
- **Deprecated fields (for backwards compatibility):**
702
-
703
- - `llm` - **DEPRECATED**: Use `description` instead
704
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
705
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
706
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
707
- """
708
-
709
- page_header: Annotated[Optional[SegmentProcessingPageHeader], PropertyInfo(alias="PageHeader")]
710
- """Controls the processing and generation for the segment.
711
-
712
- - `crop_image` controls whether to crop the file's images to the segment's
713
- bounding box. The cropped image will be stored in the segment's `image` field.
714
- Use `All` to always crop, or `Auto` to only crop when needed for
715
- post-processing.
716
- - `format` specifies the output format: `Html` or `Markdown`
717
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
718
- - `Auto`: Process content automatically
719
- - `LLM`: Use large language models for processing
720
- - `Ignore`: Exclude segments from final output
721
- - `description` enables LLM-generated descriptions for segments. **Note:** This
722
- uses chunkr's own VLM models and is not configurable via LLM processing
723
- configuration.
724
- - `extended_context` uses the full page image as context for LLM generation.
725
-
726
- **Deprecated fields (for backwards compatibility):**
727
-
728
- - `llm` - **DEPRECATED**: Use `description` instead
729
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
730
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
731
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
732
- """
733
-
734
- picture: Annotated[Optional[SegmentProcessingPicture], PropertyInfo(alias="Picture")]
735
- """Controls the processing and generation for the segment.
736
-
737
- - `crop_image` controls whether to crop the file's images to the segment's
738
- bounding box. The cropped image will be stored in the segment's `image` field.
739
- Use `All` to always crop, or `Auto` to only crop when needed for
740
- post-processing.
741
- - `format` specifies the output format: `Html` or `Markdown`
742
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
743
- - `Auto`: Process content automatically
744
- - `LLM`: Use large language models for processing
745
- - `Ignore`: Exclude segments from final output
746
- - `description` enables LLM-generated descriptions for segments. **Note:** This
747
- uses chunkr's own VLM models and is not configurable via LLM processing
748
- configuration.
749
- - `extended_context` uses the full page image as context for LLM generation.
750
-
751
- **Deprecated fields (for backwards compatibility):**
752
-
753
- - `llm` - **DEPRECATED**: Use `description` instead
754
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
755
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
756
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
757
- """
758
-
759
- section_header: Annotated[Optional[SegmentProcessingSectionHeader], PropertyInfo(alias="SectionHeader")]
760
- """Controls the processing and generation for the segment.
761
-
762
- - `crop_image` controls whether to crop the file's images to the segment's
763
- bounding box. The cropped image will be stored in the segment's `image` field.
764
- Use `All` to always crop, or `Auto` to only crop when needed for
765
- post-processing.
766
- - `format` specifies the output format: `Html` or `Markdown`
767
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
768
- - `Auto`: Process content automatically
769
- - `LLM`: Use large language models for processing
770
- - `Ignore`: Exclude segments from final output
771
- - `description` enables LLM-generated descriptions for segments. **Note:** This
772
- uses chunkr's own VLM models and is not configurable via LLM processing
773
- configuration.
774
- - `extended_context` uses the full page image as context for LLM generation.
775
-
776
- **Deprecated fields (for backwards compatibility):**
777
-
778
- - `llm` - **DEPRECATED**: Use `description` instead
779
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
780
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
781
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
782
- """
783
-
784
- table: Annotated[Optional[SegmentProcessingTable], PropertyInfo(alias="Table")]
785
- """Controls the processing and generation for the segment.
786
-
787
- - `crop_image` controls whether to crop the file's images to the segment's
788
- bounding box. The cropped image will be stored in the segment's `image` field.
789
- Use `All` to always crop, or `Auto` to only crop when needed for
790
- post-processing.
791
- - `format` specifies the output format: `Html` or `Markdown`
792
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
793
- - `Auto`: Process content automatically
794
- - `LLM`: Use large language models for processing
795
- - `Ignore`: Exclude segments from final output
796
- - `description` enables LLM-generated descriptions for segments. **Note:** This
797
- uses chunkr's own VLM models and is not configurable via LLM processing
798
- configuration.
799
- - `extended_context` uses the full page image as context for LLM generation.
800
-
801
- **Deprecated fields (for backwards compatibility):**
802
-
803
- - `llm` - **DEPRECATED**: Use `description` instead
804
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
805
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
806
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
807
- """
808
-
809
- text: Annotated[Optional[SegmentProcessingText], PropertyInfo(alias="Text")]
810
- """Controls the processing and generation for the segment.
811
-
812
- - `crop_image` controls whether to crop the file's images to the segment's
813
- bounding box. The cropped image will be stored in the segment's `image` field.
814
- Use `All` to always crop, or `Auto` to only crop when needed for
815
- post-processing.
816
- - `format` specifies the output format: `Html` or `Markdown`
817
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
818
- - `Auto`: Process content automatically
819
- - `LLM`: Use large language models for processing
820
- - `Ignore`: Exclude segments from final output
821
- - `description` enables LLM-generated descriptions for segments. **Note:** This
822
- uses chunkr's own VLM models and is not configurable via LLM processing
823
- configuration.
824
- - `extended_context` uses the full page image as context for LLM generation.
825
-
826
- **Deprecated fields (for backwards compatibility):**
827
-
828
- - `llm` - **DEPRECATED**: Use `description` instead
829
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
830
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
831
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
832
- """
833
-
834
- title: Annotated[Optional[SegmentProcessingTitle], PropertyInfo(alias="Title")]
835
- """Controls the processing and generation for the segment.
836
-
837
- - `crop_image` controls whether to crop the file's images to the segment's
838
- bounding box. The cropped image will be stored in the segment's `image` field.
839
- Use `All` to always crop, or `Auto` to only crop when needed for
840
- post-processing.
841
- - `format` specifies the output format: `Html` or `Markdown`
842
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
843
- - `Auto`: Process content automatically
844
- - `LLM`: Use large language models for processing
845
- - `Ignore`: Exclude segments from final output
846
- - `description` enables LLM-generated descriptions for segments. **Note:** This
847
- uses chunkr's own VLM models and is not configurable via LLM processing
848
- configuration.
849
- - `extended_context` uses the full page image as context for LLM generation.
850
-
851
- **Deprecated fields (for backwards compatibility):**
852
-
853
- - `llm` - **DEPRECATED**: Use `description` instead
854
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
855
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
856
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
857
- """