chunkr-ai 0.1.0a6__py3-none-any.whl → 0.1.0a8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. chunkr_ai/__init__.py +2 -0
  2. chunkr_ai/_base_client.py +3 -3
  3. chunkr_ai/_client.py +31 -3
  4. chunkr_ai/_compat.py +48 -48
  5. chunkr_ai/_constants.py +5 -5
  6. chunkr_ai/_exceptions.py +4 -0
  7. chunkr_ai/_models.py +41 -41
  8. chunkr_ai/_types.py +35 -1
  9. chunkr_ai/_utils/__init__.py +9 -2
  10. chunkr_ai/_utils/_compat.py +45 -0
  11. chunkr_ai/_utils/_datetime_parse.py +136 -0
  12. chunkr_ai/_utils/_transform.py +11 -1
  13. chunkr_ai/_utils/_typing.py +6 -1
  14. chunkr_ai/_utils/_utils.py +0 -1
  15. chunkr_ai/_version.py +1 -1
  16. chunkr_ai/resources/__init__.py +14 -0
  17. chunkr_ai/resources/files.py +3 -3
  18. chunkr_ai/resources/tasks/__init__.py +14 -0
  19. chunkr_ai/resources/tasks/extract.py +393 -0
  20. chunkr_ai/resources/tasks/parse.py +110 -286
  21. chunkr_ai/resources/tasks/tasks.py +64 -32
  22. chunkr_ai/resources/webhooks.py +193 -0
  23. chunkr_ai/types/__init__.py +27 -1
  24. chunkr_ai/types/bounding_box.py +19 -0
  25. chunkr_ai/types/cell.py +39 -0
  26. chunkr_ai/types/cell_style.py +28 -0
  27. chunkr_ai/types/chunk.py +40 -0
  28. chunkr_ai/types/chunk_processing.py +40 -0
  29. chunkr_ai/types/chunk_processing_param.py +42 -0
  30. chunkr_ai/types/extract_configuration.py +24 -0
  31. chunkr_ai/types/extract_output_response.py +62 -0
  32. chunkr_ai/types/file_create_params.py +2 -1
  33. chunkr_ai/types/file_info.py +21 -0
  34. chunkr_ai/types/generation_config.py +29 -0
  35. chunkr_ai/types/generation_config_param.py +29 -0
  36. chunkr_ai/types/llm_processing.py +36 -0
  37. chunkr_ai/types/llm_processing_param.py +36 -0
  38. chunkr_ai/types/ocr_result.py +28 -0
  39. chunkr_ai/types/page.py +27 -0
  40. chunkr_ai/types/parse_configuration.py +64 -0
  41. chunkr_ai/types/parse_configuration_param.py +65 -0
  42. chunkr_ai/types/parse_output_response.py +29 -0
  43. chunkr_ai/types/segment.py +109 -0
  44. chunkr_ai/types/segment_processing.py +228 -0
  45. chunkr_ai/types/segment_processing_param.py +229 -0
  46. chunkr_ai/types/task_extract_updated_webhook_event.py +22 -0
  47. chunkr_ai/types/task_get_params.py +0 -3
  48. chunkr_ai/types/task_list_params.py +7 -1
  49. chunkr_ai/types/task_parse_updated_webhook_event.py +22 -0
  50. chunkr_ai/types/task_response.py +68 -0
  51. chunkr_ai/types/tasks/__init__.py +7 -1
  52. chunkr_ai/types/tasks/extract_create_params.py +47 -0
  53. chunkr_ai/types/tasks/extract_create_response.py +67 -0
  54. chunkr_ai/types/tasks/extract_get_params.py +18 -0
  55. chunkr_ai/types/tasks/extract_get_response.py +67 -0
  56. chunkr_ai/types/tasks/parse_create_params.py +25 -793
  57. chunkr_ai/types/tasks/parse_create_response.py +55 -0
  58. chunkr_ai/types/tasks/parse_get_params.py +18 -0
  59. chunkr_ai/types/tasks/parse_get_response.py +55 -0
  60. chunkr_ai/types/unwrap_webhook_event.py +11 -0
  61. chunkr_ai/types/version_info.py +31 -0
  62. chunkr_ai/types/webhook_url_response.py +9 -0
  63. {chunkr_ai-0.1.0a6.dist-info → chunkr_ai-0.1.0a8.dist-info}/METADATA +14 -13
  64. chunkr_ai-0.1.0a8.dist-info/RECORD +88 -0
  65. chunkr_ai/types/task.py +0 -1225
  66. chunkr_ai/types/tasks/parse_update_params.py +0 -845
  67. chunkr_ai-0.1.0a6.dist-info/RECORD +0 -52
  68. {chunkr_ai-0.1.0a6.dist-info → chunkr_ai-0.1.0a8.dist-info}/WHEEL +0 -0
  69. {chunkr_ai-0.1.0a6.dist-info → chunkr_ai-0.1.0a8.dist-info}/licenses/LICENSE +0 -0
@@ -1,845 +0,0 @@
1
- # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
-
3
- from __future__ import annotations
4
-
5
- from typing import List, Union, Optional
6
- from typing_extensions import Literal, Required, Annotated, TypeAlias, TypedDict
7
-
8
- from ..._utils import PropertyInfo
9
-
10
- __all__ = [
11
- "ParseUpdateParams",
12
- "ChunkProcessing",
13
- "ChunkProcessingTokenizer",
14
- "ChunkProcessingTokenizerEnum",
15
- "ChunkProcessingTokenizerString",
16
- "LlmProcessing",
17
- "LlmProcessingFallbackStrategy",
18
- "LlmProcessingFallbackStrategyModel",
19
- "SegmentProcessing",
20
- "SegmentProcessingCaption",
21
- "SegmentProcessingFootnote",
22
- "SegmentProcessingFormula",
23
- "SegmentProcessingListItem",
24
- "SegmentProcessingPage",
25
- "SegmentProcessingPageFooter",
26
- "SegmentProcessingPageHeader",
27
- "SegmentProcessingPicture",
28
- "SegmentProcessingSectionHeader",
29
- "SegmentProcessingTable",
30
- "SegmentProcessingText",
31
- "SegmentProcessingTitle",
32
- ]
33
-
34
-
35
- class ParseUpdateParams(TypedDict, total=False):
36
- chunk_processing: Optional[ChunkProcessing]
37
- """Controls the setting for the chunking and post-processing of each chunk."""
38
-
39
- error_handling: Optional[Literal["Fail", "Continue"]]
40
- """Controls how errors are handled during processing:
41
-
42
- - `Fail`: Stops processing and fails the task when any error occurs
43
- - `Continue`: Attempts to continue processing despite non-critical errors (eg.
44
- LLM refusals etc.)
45
- """
46
-
47
- expires_in: Optional[int]
48
- """
49
- The number of seconds until task is deleted. Expired tasks can **not** be
50
- updated, polled or accessed via web interface.
51
- """
52
-
53
- high_resolution: Optional[bool]
54
- """Whether to use high-resolution images for cropping and post-processing.
55
-
56
- (Latency penalty: ~7 seconds per page)
57
- """
58
-
59
- llm_processing: Optional[LlmProcessing]
60
- """Controls the LLM used for the task."""
61
-
62
- ocr_strategy: Optional[Literal["All", "Auto"]]
63
- """Controls the Optical Character Recognition (OCR) strategy.
64
-
65
- - `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
66
- - `Auto`: Selectively applies OCR only to pages with missing or low-quality
67
- text. When text layer is present the bounding boxes from the text layer are
68
- used.
69
- """
70
-
71
- pipeline: Optional[Literal["Azure", "Chunkr"]]
72
- """
73
- Choose the provider whose models will be used for segmentation and OCR. The
74
- output will be unified to the Chunkr `output` format.
75
- """
76
-
77
- segment_processing: Optional[SegmentProcessing]
78
- """Defines how each segment type is handled when generating the final output.
79
-
80
- Each segment uses one of three strategies. The chosen strategy controls:
81
-
82
- - Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
83
- - How the content is produced (rule-based vs. LLM).
84
- - The output format (`Html` or `Markdown`).
85
-
86
- Optional flags such as image **cropping**, **extended context**, and
87
- **descriptions** further refine behaviour.
88
-
89
- **Default strategy per segment**
90
-
91
- - `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` → **Auto**
92
- (Markdown, description off)
93
- - `Table` → **LLM** (HTML, description on)
94
- - `Picture` → **LLM** (Markdown, description off, cropping _All_)
95
- - `Formula`, `Page` → **LLM** (Markdown, description off)
96
- - `PageHeader`, `PageFooter` → **Ignore** (removed from output)
97
-
98
- **Strategy reference**
99
-
100
- - **Auto** – rule-based content generation.
101
- - **LLM** – generate content with an LLM.
102
- - **Ignore** – exclude the segment entirely.
103
- """
104
-
105
- segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]]
106
- """Controls the segmentation strategy:
107
-
108
- - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
109
- `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
110
- segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
111
- - `Page`: Treats each page as a single segment. Faster processing, but without
112
- layout element detection and only simple chunking.
113
- """
114
-
115
-
116
- class ChunkProcessingTokenizerEnum(TypedDict, total=False):
117
- enum: Required[
118
- Annotated[Literal["Word", "Cl100kBase", "XlmRobertaBase", "BertBaseUncased"], PropertyInfo(alias="Enum")]
119
- ]
120
- """Use one of the predefined tokenizer types"""
121
-
122
-
123
- class ChunkProcessingTokenizerString(TypedDict, total=False):
124
- string: Required[Annotated[str, PropertyInfo(alias="String")]]
125
- """
126
- Use any Hugging Face tokenizer by specifying its model ID Examples:
127
- "Qwen/Qwen-tokenizer", "facebook/bart-large"
128
- """
129
-
130
-
131
- ChunkProcessingTokenizer: TypeAlias = Union[ChunkProcessingTokenizerEnum, ChunkProcessingTokenizerString]
132
-
133
-
134
- class ChunkProcessing(TypedDict, total=False):
135
- ignore_headers_and_footers: Optional[bool]
136
- """DEPRECATED: use `segment_processing.ignore` instead"""
137
-
138
- target_length: int
139
- """The target number of words in each chunk.
140
-
141
- If 0, each chunk will contain a single segment.
142
- """
143
-
144
- tokenizer: ChunkProcessingTokenizer
145
- """The tokenizer to use for the chunking process."""
146
-
147
-
148
- class LlmProcessingFallbackStrategyModel(TypedDict, total=False):
149
- model: Required[Annotated[str, PropertyInfo(alias="Model")]]
150
- """Use a specific model as fallback"""
151
-
152
-
153
- LlmProcessingFallbackStrategy: TypeAlias = Union[Literal["None", "Default"], LlmProcessingFallbackStrategyModel]
154
-
155
-
156
- class LlmProcessing(TypedDict, total=False):
157
- fallback_strategy: LlmProcessingFallbackStrategy
158
- """The fallback strategy to use for the LLMs in the task."""
159
-
160
- llm_model_id: Optional[str]
161
- """The ID of the model to use for the task.
162
-
163
- If not provided, the default model will be used. Please check the documentation
164
- for the model you want to use.
165
- """
166
-
167
- max_completion_tokens: Optional[int]
168
- """The maximum number of tokens to generate."""
169
-
170
- temperature: float
171
- """The temperature to use for the LLM."""
172
-
173
-
174
- class SegmentProcessingCaption(TypedDict, total=False):
175
- crop_image: Literal["All", "Auto"]
176
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
177
-
178
- - `All` crops all images in the item
179
- - `Auto` crops images only if required for post-processing
180
- """
181
-
182
- description: bool
183
- """Generate LLM descriptions for this segment"""
184
-
185
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
186
- """**DEPRECATED**: `embed` field is auto populated"""
187
-
188
- extended_context: bool
189
- """Use the full page image as context for LLM generation"""
190
-
191
- format: Literal["Html", "Markdown"]
192
-
193
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
194
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
195
-
196
- llm: Optional[str]
197
- """**DEPRECATED**: use description instead"""
198
-
199
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
200
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
201
-
202
- strategy: Literal["LLM", "Auto", "Ignore"]
203
-
204
-
205
- class SegmentProcessingFootnote(TypedDict, total=False):
206
- crop_image: Literal["All", "Auto"]
207
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
208
-
209
- - `All` crops all images in the item
210
- - `Auto` crops images only if required for post-processing
211
- """
212
-
213
- description: bool
214
- """Generate LLM descriptions for this segment"""
215
-
216
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
217
- """**DEPRECATED**: `embed` field is auto populated"""
218
-
219
- extended_context: bool
220
- """Use the full page image as context for LLM generation"""
221
-
222
- format: Literal["Html", "Markdown"]
223
-
224
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
225
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
226
-
227
- llm: Optional[str]
228
- """**DEPRECATED**: use description instead"""
229
-
230
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
231
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
232
-
233
- strategy: Literal["LLM", "Auto", "Ignore"]
234
-
235
-
236
- class SegmentProcessingFormula(TypedDict, total=False):
237
- crop_image: Literal["All", "Auto"]
238
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
239
-
240
- - `All` crops all images in the item
241
- - `Auto` crops images only if required for post-processing
242
- """
243
-
244
- description: bool
245
- """Generate LLM descriptions for this segment"""
246
-
247
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
248
- """**DEPRECATED**: `embed` field is auto populated"""
249
-
250
- extended_context: bool
251
- """Use the full page image as context for LLM generation"""
252
-
253
- format: Literal["Html", "Markdown"]
254
-
255
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
256
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
257
-
258
- llm: Optional[str]
259
- """**DEPRECATED**: use description instead"""
260
-
261
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
262
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
263
-
264
- strategy: Literal["LLM", "Auto", "Ignore"]
265
-
266
-
267
- class SegmentProcessingListItem(TypedDict, total=False):
268
- crop_image: Literal["All", "Auto"]
269
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
270
-
271
- - `All` crops all images in the item
272
- - `Auto` crops images only if required for post-processing
273
- """
274
-
275
- description: bool
276
- """Generate LLM descriptions for this segment"""
277
-
278
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
279
- """**DEPRECATED**: `embed` field is auto populated"""
280
-
281
- extended_context: bool
282
- """Use the full page image as context for LLM generation"""
283
-
284
- format: Literal["Html", "Markdown"]
285
-
286
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
287
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
288
-
289
- llm: Optional[str]
290
- """**DEPRECATED**: use description instead"""
291
-
292
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
293
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
294
-
295
- strategy: Literal["LLM", "Auto", "Ignore"]
296
-
297
-
298
- class SegmentProcessingPage(TypedDict, total=False):
299
- crop_image: Literal["All", "Auto"]
300
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
301
-
302
- - `All` crops all images in the item
303
- - `Auto` crops images only if required for post-processing
304
- """
305
-
306
- description: bool
307
- """Generate LLM descriptions for this segment"""
308
-
309
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
310
- """**DEPRECATED**: `embed` field is auto populated"""
311
-
312
- extended_context: bool
313
- """Use the full page image as context for LLM generation"""
314
-
315
- format: Literal["Html", "Markdown"]
316
-
317
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
318
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
319
-
320
- llm: Optional[str]
321
- """**DEPRECATED**: use description instead"""
322
-
323
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
324
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
325
-
326
- strategy: Literal["LLM", "Auto", "Ignore"]
327
-
328
-
329
- class SegmentProcessingPageFooter(TypedDict, total=False):
330
- crop_image: Literal["All", "Auto"]
331
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
332
-
333
- - `All` crops all images in the item
334
- - `Auto` crops images only if required for post-processing
335
- """
336
-
337
- description: bool
338
- """Generate LLM descriptions for this segment"""
339
-
340
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
341
- """**DEPRECATED**: `embed` field is auto populated"""
342
-
343
- extended_context: bool
344
- """Use the full page image as context for LLM generation"""
345
-
346
- format: Literal["Html", "Markdown"]
347
-
348
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
349
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
350
-
351
- llm: Optional[str]
352
- """**DEPRECATED**: use description instead"""
353
-
354
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
355
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
356
-
357
- strategy: Literal["LLM", "Auto", "Ignore"]
358
-
359
-
360
- class SegmentProcessingPageHeader(TypedDict, total=False):
361
- crop_image: Literal["All", "Auto"]
362
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
363
-
364
- - `All` crops all images in the item
365
- - `Auto` crops images only if required for post-processing
366
- """
367
-
368
- description: bool
369
- """Generate LLM descriptions for this segment"""
370
-
371
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
372
- """**DEPRECATED**: `embed` field is auto populated"""
373
-
374
- extended_context: bool
375
- """Use the full page image as context for LLM generation"""
376
-
377
- format: Literal["Html", "Markdown"]
378
-
379
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
380
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
381
-
382
- llm: Optional[str]
383
- """**DEPRECATED**: use description instead"""
384
-
385
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
386
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
387
-
388
- strategy: Literal["LLM", "Auto", "Ignore"]
389
-
390
-
391
- class SegmentProcessingPicture(TypedDict, total=False):
392
- crop_image: Literal["All", "Auto"]
393
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
394
-
395
- - `All` crops all images in the item
396
- - `Auto` crops images only if required for post-processing
397
- """
398
-
399
- description: bool
400
- """Generate LLM descriptions for this segment"""
401
-
402
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
403
- """**DEPRECATED**: `embed` field is auto populated"""
404
-
405
- extended_context: bool
406
- """Use the full page image as context for LLM generation"""
407
-
408
- format: Literal["Html", "Markdown"]
409
-
410
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
411
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
412
-
413
- llm: Optional[str]
414
- """**DEPRECATED**: use description instead"""
415
-
416
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
417
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
418
-
419
- strategy: Literal["LLM", "Auto", "Ignore"]
420
-
421
-
422
- class SegmentProcessingSectionHeader(TypedDict, total=False):
423
- crop_image: Literal["All", "Auto"]
424
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
425
-
426
- - `All` crops all images in the item
427
- - `Auto` crops images only if required for post-processing
428
- """
429
-
430
- description: bool
431
- """Generate LLM descriptions for this segment"""
432
-
433
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
434
- """**DEPRECATED**: `embed` field is auto populated"""
435
-
436
- extended_context: bool
437
- """Use the full page image as context for LLM generation"""
438
-
439
- format: Literal["Html", "Markdown"]
440
-
441
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
442
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
443
-
444
- llm: Optional[str]
445
- """**DEPRECATED**: use description instead"""
446
-
447
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
448
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
449
-
450
- strategy: Literal["LLM", "Auto", "Ignore"]
451
-
452
-
453
- class SegmentProcessingTable(TypedDict, total=False):
454
- crop_image: Literal["All", "Auto"]
455
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
456
-
457
- - `All` crops all images in the item
458
- - `Auto` crops images only if required for post-processing
459
- """
460
-
461
- description: bool
462
- """Generate LLM descriptions for this segment"""
463
-
464
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
465
- """**DEPRECATED**: `embed` field is auto populated"""
466
-
467
- extended_context: bool
468
- """Use the full page image as context for LLM generation"""
469
-
470
- format: Literal["Html", "Markdown"]
471
-
472
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
473
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
474
-
475
- llm: Optional[str]
476
- """**DEPRECATED**: use description instead"""
477
-
478
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
479
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
480
-
481
- strategy: Literal["LLM", "Auto", "Ignore"]
482
-
483
-
484
- class SegmentProcessingText(TypedDict, total=False):
485
- crop_image: Literal["All", "Auto"]
486
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
487
-
488
- - `All` crops all images in the item
489
- - `Auto` crops images only if required for post-processing
490
- """
491
-
492
- description: bool
493
- """Generate LLM descriptions for this segment"""
494
-
495
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
496
- """**DEPRECATED**: `embed` field is auto populated"""
497
-
498
- extended_context: bool
499
- """Use the full page image as context for LLM generation"""
500
-
501
- format: Literal["Html", "Markdown"]
502
-
503
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
504
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
505
-
506
- llm: Optional[str]
507
- """**DEPRECATED**: use description instead"""
508
-
509
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
510
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
511
-
512
- strategy: Literal["LLM", "Auto", "Ignore"]
513
-
514
-
515
- class SegmentProcessingTitle(TypedDict, total=False):
516
- crop_image: Literal["All", "Auto"]
517
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
518
-
519
- - `All` crops all images in the item
520
- - `Auto` crops images only if required for post-processing
521
- """
522
-
523
- description: bool
524
- """Generate LLM descriptions for this segment"""
525
-
526
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
527
- """**DEPRECATED**: `embed` field is auto populated"""
528
-
529
- extended_context: bool
530
- """Use the full page image as context for LLM generation"""
531
-
532
- format: Literal["Html", "Markdown"]
533
-
534
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
535
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
536
-
537
- llm: Optional[str]
538
- """**DEPRECATED**: use description instead"""
539
-
540
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
541
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
542
-
543
- strategy: Literal["LLM", "Auto", "Ignore"]
544
-
545
-
546
- class SegmentProcessing(TypedDict, total=False):
547
- caption: Annotated[Optional[SegmentProcessingCaption], PropertyInfo(alias="Caption")]
548
- """Controls the processing and generation for the segment.
549
-
550
- - `crop_image` controls whether to crop the file's images to the segment's
551
- bounding box. The cropped image will be stored in the segment's `image` field.
552
- Use `All` to always crop, or `Auto` to only crop when needed for
553
- post-processing.
554
- - `format` specifies the output format: `Html` or `Markdown`
555
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
556
- - `Auto`: Process content automatically
557
- - `LLM`: Use large language models for processing
558
- - `Ignore`: Exclude segments from final output
559
- - `description` enables LLM-generated descriptions for segments. **Note:** This
560
- uses chunkr's own VLM models and is not configurable via LLM processing
561
- configuration.
562
- - `extended_context` uses the full page image as context for LLM generation.
563
-
564
- **Deprecated fields (for backwards compatibility):**
565
-
566
- - `llm` - **DEPRECATED**: Use `description` instead
567
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
568
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
569
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
570
- """
571
-
572
- footnote: Annotated[Optional[SegmentProcessingFootnote], PropertyInfo(alias="Footnote")]
573
- """Controls the processing and generation for the segment.
574
-
575
- - `crop_image` controls whether to crop the file's images to the segment's
576
- bounding box. The cropped image will be stored in the segment's `image` field.
577
- Use `All` to always crop, or `Auto` to only crop when needed for
578
- post-processing.
579
- - `format` specifies the output format: `Html` or `Markdown`
580
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
581
- - `Auto`: Process content automatically
582
- - `LLM`: Use large language models for processing
583
- - `Ignore`: Exclude segments from final output
584
- - `description` enables LLM-generated descriptions for segments. **Note:** This
585
- uses chunkr's own VLM models and is not configurable via LLM processing
586
- configuration.
587
- - `extended_context` uses the full page image as context for LLM generation.
588
-
589
- **Deprecated fields (for backwards compatibility):**
590
-
591
- - `llm` - **DEPRECATED**: Use `description` instead
592
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
593
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
594
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
595
- """
596
-
597
- formula: Annotated[Optional[SegmentProcessingFormula], PropertyInfo(alias="Formula")]
598
- """Controls the processing and generation for the segment.
599
-
600
- - `crop_image` controls whether to crop the file's images to the segment's
601
- bounding box. The cropped image will be stored in the segment's `image` field.
602
- Use `All` to always crop, or `Auto` to only crop when needed for
603
- post-processing.
604
- - `format` specifies the output format: `Html` or `Markdown`
605
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
606
- - `Auto`: Process content automatically
607
- - `LLM`: Use large language models for processing
608
- - `Ignore`: Exclude segments from final output
609
- - `description` enables LLM-generated descriptions for segments. **Note:** This
610
- uses chunkr's own VLM models and is not configurable via LLM processing
611
- configuration.
612
- - `extended_context` uses the full page image as context for LLM generation.
613
-
614
- **Deprecated fields (for backwards compatibility):**
615
-
616
- - `llm` - **DEPRECATED**: Use `description` instead
617
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
618
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
619
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
620
- """
621
-
622
- list_item: Annotated[Optional[SegmentProcessingListItem], PropertyInfo(alias="ListItem")]
623
- """Controls the processing and generation for the segment.
624
-
625
- - `crop_image` controls whether to crop the file's images to the segment's
626
- bounding box. The cropped image will be stored in the segment's `image` field.
627
- Use `All` to always crop, or `Auto` to only crop when needed for
628
- post-processing.
629
- - `format` specifies the output format: `Html` or `Markdown`
630
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
631
- - `Auto`: Process content automatically
632
- - `LLM`: Use large language models for processing
633
- - `Ignore`: Exclude segments from final output
634
- - `description` enables LLM-generated descriptions for segments. **Note:** This
635
- uses chunkr's own VLM models and is not configurable via LLM processing
636
- configuration.
637
- - `extended_context` uses the full page image as context for LLM generation.
638
-
639
- **Deprecated fields (for backwards compatibility):**
640
-
641
- - `llm` - **DEPRECATED**: Use `description` instead
642
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
643
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
644
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
645
- """
646
-
647
- page: Annotated[Optional[SegmentProcessingPage], PropertyInfo(alias="Page")]
648
- """Controls the processing and generation for the segment.
649
-
650
- - `crop_image` controls whether to crop the file's images to the segment's
651
- bounding box. The cropped image will be stored in the segment's `image` field.
652
- Use `All` to always crop, or `Auto` to only crop when needed for
653
- post-processing.
654
- - `format` specifies the output format: `Html` or `Markdown`
655
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
656
- - `Auto`: Process content automatically
657
- - `LLM`: Use large language models for processing
658
- - `Ignore`: Exclude segments from final output
659
- - `description` enables LLM-generated descriptions for segments. **Note:** This
660
- uses chunkr's own VLM models and is not configurable via LLM processing
661
- configuration.
662
- - `extended_context` uses the full page image as context for LLM generation.
663
-
664
- **Deprecated fields (for backwards compatibility):**
665
-
666
- - `llm` - **DEPRECATED**: Use `description` instead
667
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
668
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
669
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
670
- """
671
-
672
- page_footer: Annotated[Optional[SegmentProcessingPageFooter], PropertyInfo(alias="PageFooter")]
673
- """Controls the processing and generation for the segment.
674
-
675
- - `crop_image` controls whether to crop the file's images to the segment's
676
- bounding box. The cropped image will be stored in the segment's `image` field.
677
- Use `All` to always crop, or `Auto` to only crop when needed for
678
- post-processing.
679
- - `format` specifies the output format: `Html` or `Markdown`
680
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
681
- - `Auto`: Process content automatically
682
- - `LLM`: Use large language models for processing
683
- - `Ignore`: Exclude segments from final output
684
- - `description` enables LLM-generated descriptions for segments. **Note:** This
685
- uses chunkr's own VLM models and is not configurable via LLM processing
686
- configuration.
687
- - `extended_context` uses the full page image as context for LLM generation.
688
-
689
- **Deprecated fields (for backwards compatibility):**
690
-
691
- - `llm` - **DEPRECATED**: Use `description` instead
692
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
693
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
694
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
695
- """
696
-
697
- page_header: Annotated[Optional[SegmentProcessingPageHeader], PropertyInfo(alias="PageHeader")]
698
- """Controls the processing and generation for the segment.
699
-
700
- - `crop_image` controls whether to crop the file's images to the segment's
701
- bounding box. The cropped image will be stored in the segment's `image` field.
702
- Use `All` to always crop, or `Auto` to only crop when needed for
703
- post-processing.
704
- - `format` specifies the output format: `Html` or `Markdown`
705
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
706
- - `Auto`: Process content automatically
707
- - `LLM`: Use large language models for processing
708
- - `Ignore`: Exclude segments from final output
709
- - `description` enables LLM-generated descriptions for segments. **Note:** This
710
- uses chunkr's own VLM models and is not configurable via LLM processing
711
- configuration.
712
- - `extended_context` uses the full page image as context for LLM generation.
713
-
714
- **Deprecated fields (for backwards compatibility):**
715
-
716
- - `llm` - **DEPRECATED**: Use `description` instead
717
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
718
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
719
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
720
- """
721
-
722
- picture: Annotated[Optional[SegmentProcessingPicture], PropertyInfo(alias="Picture")]
723
- """Controls the processing and generation for the segment.
724
-
725
- - `crop_image` controls whether to crop the file's images to the segment's
726
- bounding box. The cropped image will be stored in the segment's `image` field.
727
- Use `All` to always crop, or `Auto` to only crop when needed for
728
- post-processing.
729
- - `format` specifies the output format: `Html` or `Markdown`
730
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
731
- - `Auto`: Process content automatically
732
- - `LLM`: Use large language models for processing
733
- - `Ignore`: Exclude segments from final output
734
- - `description` enables LLM-generated descriptions for segments. **Note:** This
735
- uses chunkr's own VLM models and is not configurable via LLM processing
736
- configuration.
737
- - `extended_context` uses the full page image as context for LLM generation.
738
-
739
- **Deprecated fields (for backwards compatibility):**
740
-
741
- - `llm` - **DEPRECATED**: Use `description` instead
742
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
743
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
744
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
745
- """
746
-
747
- section_header: Annotated[Optional[SegmentProcessingSectionHeader], PropertyInfo(alias="SectionHeader")]
748
- """Controls the processing and generation for the segment.
749
-
750
- - `crop_image` controls whether to crop the file's images to the segment's
751
- bounding box. The cropped image will be stored in the segment's `image` field.
752
- Use `All` to always crop, or `Auto` to only crop when needed for
753
- post-processing.
754
- - `format` specifies the output format: `Html` or `Markdown`
755
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
756
- - `Auto`: Process content automatically
757
- - `LLM`: Use large language models for processing
758
- - `Ignore`: Exclude segments from final output
759
- - `description` enables LLM-generated descriptions for segments. **Note:** This
760
- uses chunkr's own VLM models and is not configurable via LLM processing
761
- configuration.
762
- - `extended_context` uses the full page image as context for LLM generation.
763
-
764
- **Deprecated fields (for backwards compatibility):**
765
-
766
- - `llm` - **DEPRECATED**: Use `description` instead
767
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
768
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
769
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
770
- """
771
-
772
- table: Annotated[Optional[SegmentProcessingTable], PropertyInfo(alias="Table")]
773
- """Controls the processing and generation for the segment.
774
-
775
- - `crop_image` controls whether to crop the file's images to the segment's
776
- bounding box. The cropped image will be stored in the segment's `image` field.
777
- Use `All` to always crop, or `Auto` to only crop when needed for
778
- post-processing.
779
- - `format` specifies the output format: `Html` or `Markdown`
780
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
781
- - `Auto`: Process content automatically
782
- - `LLM`: Use large language models for processing
783
- - `Ignore`: Exclude segments from final output
784
- - `description` enables LLM-generated descriptions for segments. **Note:** This
785
- uses chunkr's own VLM models and is not configurable via LLM processing
786
- configuration.
787
- - `extended_context` uses the full page image as context for LLM generation.
788
-
789
- **Deprecated fields (for backwards compatibility):**
790
-
791
- - `llm` - **DEPRECATED**: Use `description` instead
792
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
793
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
794
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
795
- """
796
-
797
- text: Annotated[Optional[SegmentProcessingText], PropertyInfo(alias="Text")]
798
- """Controls the processing and generation for the segment.
799
-
800
- - `crop_image` controls whether to crop the file's images to the segment's
801
- bounding box. The cropped image will be stored in the segment's `image` field.
802
- Use `All` to always crop, or `Auto` to only crop when needed for
803
- post-processing.
804
- - `format` specifies the output format: `Html` or `Markdown`
805
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
806
- - `Auto`: Process content automatically
807
- - `LLM`: Use large language models for processing
808
- - `Ignore`: Exclude segments from final output
809
- - `description` enables LLM-generated descriptions for segments. **Note:** This
810
- uses chunkr's own VLM models and is not configurable via LLM processing
811
- configuration.
812
- - `extended_context` uses the full page image as context for LLM generation.
813
-
814
- **Deprecated fields (for backwards compatibility):**
815
-
816
- - `llm` - **DEPRECATED**: Use `description` instead
817
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
818
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
819
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
820
- """
821
-
822
- title: Annotated[Optional[SegmentProcessingTitle], PropertyInfo(alias="Title")]
823
- """Controls the processing and generation for the segment.
824
-
825
- - `crop_image` controls whether to crop the file's images to the segment's
826
- bounding box. The cropped image will be stored in the segment's `image` field.
827
- Use `All` to always crop, or `Auto` to only crop when needed for
828
- post-processing.
829
- - `format` specifies the output format: `Html` or `Markdown`
830
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
831
- - `Auto`: Process content automatically
832
- - `LLM`: Use large language models for processing
833
- - `Ignore`: Exclude segments from final output
834
- - `description` enables LLM-generated descriptions for segments. **Note:** This
835
- uses chunkr's own VLM models and is not configurable via LLM processing
836
- configuration.
837
- - `extended_context` uses the full page image as context for LLM generation.
838
-
839
- **Deprecated fields (for backwards compatibility):**
840
-
841
- - `llm` - **DEPRECATED**: Use `description` instead
842
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
843
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
844
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
845
- """