chunkr-ai 0.1.0a5__py3-none-any.whl → 0.1.0a7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. chunkr_ai/__init__.py +2 -0
  2. chunkr_ai/_client.py +31 -3
  3. chunkr_ai/_constants.py +5 -5
  4. chunkr_ai/_exceptions.py +4 -0
  5. chunkr_ai/_models.py +1 -1
  6. chunkr_ai/_types.py +35 -1
  7. chunkr_ai/_utils/__init__.py +1 -0
  8. chunkr_ai/_utils/_typing.py +5 -0
  9. chunkr_ai/_version.py +1 -1
  10. chunkr_ai/resources/__init__.py +14 -0
  11. chunkr_ai/resources/files.py +3 -3
  12. chunkr_ai/resources/tasks/__init__.py +14 -0
  13. chunkr_ai/resources/tasks/extract.py +409 -0
  14. chunkr_ai/resources/tasks/parse.py +102 -346
  15. chunkr_ai/resources/tasks/tasks.py +62 -14
  16. chunkr_ai/resources/webhooks.py +193 -0
  17. chunkr_ai/types/__init__.py +27 -1
  18. chunkr_ai/types/bounding_box.py +19 -0
  19. chunkr_ai/types/cell.py +39 -0
  20. chunkr_ai/types/cell_style.py +28 -0
  21. chunkr_ai/types/chunk.py +40 -0
  22. chunkr_ai/types/chunk_processing.py +40 -0
  23. chunkr_ai/types/chunk_processing_param.py +42 -0
  24. chunkr_ai/types/extract_configuration.py +24 -0
  25. chunkr_ai/types/extract_output_response.py +19 -0
  26. chunkr_ai/types/file_create_params.py +2 -1
  27. chunkr_ai/types/file_info.py +21 -0
  28. chunkr_ai/types/generation_config.py +29 -0
  29. chunkr_ai/types/generation_config_param.py +29 -0
  30. chunkr_ai/types/llm_processing.py +36 -0
  31. chunkr_ai/types/llm_processing_param.py +36 -0
  32. chunkr_ai/types/ocr_result.py +28 -0
  33. chunkr_ai/types/page.py +27 -0
  34. chunkr_ai/types/parse_configuration.py +64 -0
  35. chunkr_ai/types/parse_configuration_param.py +65 -0
  36. chunkr_ai/types/parse_output_response.py +29 -0
  37. chunkr_ai/types/segment.py +109 -0
  38. chunkr_ai/types/segment_processing.py +228 -0
  39. chunkr_ai/types/segment_processing_param.py +229 -0
  40. chunkr_ai/types/task_extract_updated_webhook_event.py +22 -0
  41. chunkr_ai/types/task_list_params.py +7 -1
  42. chunkr_ai/types/task_parse_updated_webhook_event.py +22 -0
  43. chunkr_ai/types/task_response.py +68 -0
  44. chunkr_ai/types/tasks/__init__.py +7 -1
  45. chunkr_ai/types/tasks/extract_create_params.py +47 -0
  46. chunkr_ai/types/tasks/extract_create_response.py +214 -0
  47. chunkr_ai/types/tasks/extract_get_params.py +21 -0
  48. chunkr_ai/types/tasks/extract_get_response.py +214 -0
  49. chunkr_ai/types/tasks/parse_create_params.py +25 -805
  50. chunkr_ai/types/tasks/parse_create_response.py +55 -0
  51. chunkr_ai/types/tasks/parse_get_params.py +21 -0
  52. chunkr_ai/types/tasks/parse_get_response.py +55 -0
  53. chunkr_ai/types/unwrap_webhook_event.py +11 -0
  54. chunkr_ai/types/version_info.py +31 -0
  55. chunkr_ai/types/webhook_url_response.py +9 -0
  56. {chunkr_ai-0.1.0a5.dist-info → chunkr_ai-0.1.0a7.dist-info}/METADATA +14 -13
  57. chunkr_ai-0.1.0a7.dist-info/RECORD +86 -0
  58. chunkr_ai/types/task.py +0 -1225
  59. chunkr_ai/types/tasks/parse_update_params.py +0 -857
  60. chunkr_ai-0.1.0a5.dist-info/RECORD +0 -52
  61. {chunkr_ai-0.1.0a5.dist-info → chunkr_ai-0.1.0a7.dist-info}/WHEEL +0 -0
  62. {chunkr_ai-0.1.0a5.dist-info → chunkr_ai-0.1.0a7.dist-info}/licenses/LICENSE +0 -0
@@ -2,39 +2,19 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- from typing import List, Union, Optional
6
- from typing_extensions import Literal, Required, Annotated, TypeAlias, TypedDict
5
+ from typing import Optional
6
+ from typing_extensions import Literal, Required, TypedDict
7
7
 
8
- from ..._utils import PropertyInfo
8
+ from ..llm_processing_param import LlmProcessingParam
9
+ from ..chunk_processing_param import ChunkProcessingParam
10
+ from ..segment_processing_param import SegmentProcessingParam
9
11
 
10
- __all__ = [
11
- "ParseCreateParams",
12
- "ChunkProcessing",
13
- "ChunkProcessingTokenizer",
14
- "ChunkProcessingTokenizerEnum",
15
- "ChunkProcessingTokenizerString",
16
- "LlmProcessing",
17
- "LlmProcessingFallbackStrategy",
18
- "LlmProcessingFallbackStrategyModel",
19
- "SegmentProcessing",
20
- "SegmentProcessingCaption",
21
- "SegmentProcessingFootnote",
22
- "SegmentProcessingFormula",
23
- "SegmentProcessingListItem",
24
- "SegmentProcessingPage",
25
- "SegmentProcessingPageFooter",
26
- "SegmentProcessingPageHeader",
27
- "SegmentProcessingPicture",
28
- "SegmentProcessingSectionHeader",
29
- "SegmentProcessingTable",
30
- "SegmentProcessingText",
31
- "SegmentProcessingTitle",
32
- ]
12
+ __all__ = ["ParseCreateParams"]
33
13
 
34
14
 
35
15
  class ParseCreateParams(TypedDict, total=False):
36
16
  file: Required[str]
37
- """The file to be uploaded. Supported inputs:
17
+ """The file to be parsed. Supported inputs:
38
18
 
39
19
  - `ch://files/{file_id}`: Reference to an existing file. Upload via the Files
40
20
  API
@@ -42,22 +22,10 @@ class ParseCreateParams(TypedDict, total=False):
42
22
  - `data:*;base64,...` or raw base64 string
43
23
  """
44
24
 
45
- base64_urls: bool
46
- """Whether to return base64 encoded URLs. If false, presigned URLs are returned."""
47
-
48
- include_chunks: bool
49
- """Whether to include chunks in the output response"""
50
-
51
- wait_for_completion: bool
52
- """
53
- If true, server holds briefly and may return 200 when done; otherwise returns
54
- 408/409 with Retry-After headers
55
- """
56
-
57
- chunk_processing: Optional[ChunkProcessing]
25
+ chunk_processing: ChunkProcessingParam
58
26
  """Controls the setting for the chunking and post-processing of each chunk."""
59
27
 
60
- error_handling: Optional[Literal["Fail", "Continue"]]
28
+ error_handling: Literal["Fail", "Continue"]
61
29
  """Controls how errors are handled during processing:
62
30
 
63
31
  - `Fail`: Stops processing and fails the task when any error occurs
@@ -72,12 +40,12 @@ class ParseCreateParams(TypedDict, total=False):
72
40
  """
73
41
 
74
42
  file_name: Optional[str]
75
- """The name of the file to be uploaded. If not set a name will be generated."""
43
+ """The name of the file to be parsed. If not set a name will be generated."""
76
44
 
77
- llm_processing: Optional[LlmProcessing]
45
+ llm_processing: LlmProcessingParam
78
46
  """Controls the LLM used for the task."""
79
47
 
80
- ocr_strategy: Optional[Literal["All", "Auto"]]
48
+ ocr_strategy: Literal["All", "Auto"]
81
49
  """Controls the Optical Character Recognition (OCR) strategy.
82
50
 
83
51
  - `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
@@ -86,778 +54,30 @@ class ParseCreateParams(TypedDict, total=False):
86
54
  used.
87
55
  """
88
56
 
89
- pipeline: Optional[Literal["Azure", "Chunkr"]]
90
- """
91
- Choose the provider whose models will be used for segmentation and OCR. The
92
- output will be unified to the Chunkr `output` format.
93
- """
94
-
95
- segment_processing: Optional[SegmentProcessing]
96
- """Defines how each segment type is handled when generating the final output.
57
+ pipeline: Literal["Azure", "Chunkr"]
97
58
 
98
- Each segment uses one of three strategies. The chosen strategy controls:
59
+ segment_processing: Optional[SegmentProcessingParam]
60
+ """Configuration for how each document segment is processed and formatted.
99
61
 
100
- - Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
101
- - How the content is produced (rule-based vs. LLM).
102
- - The output format (`Html` or `Markdown`).
62
+ Each segment has sensible defaults, but you can override specific settings:
103
63
 
104
- Optional flags such as image **cropping**, **extended context**, and
105
- **descriptions** further refine behaviour.
64
+ - `format`: Output as `Html` or `Markdown`
65
+ - `strategy`: `Auto` (rule-based), `LLM` (AI-generated), or `Ignore` (skip)
66
+ - `crop_image`: Whether to crop images to segment bounds
67
+ - `extended_context`: Use full page as context for LLM processing
68
+ - `description`: Generate descriptions for segments
106
69
 
107
- **Default strategy per segment**
70
+ **Defaults per segment type:** Check the documentation for more details.
108
71
 
109
- - `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` **Auto**
110
- (Markdown, description off)
111
- - `Table` → **LLM** (HTML, description on)
112
- - `Picture` → **LLM** (Markdown, description off, cropping _All_)
113
- - `Formula`, `Page` → **LLM** (Markdown, description off)
114
- - `PageHeader`, `PageFooter` → **Ignore** (removed from output)
115
-
116
- **Strategy reference**
117
-
118
- - **Auto** – rule-based content generation.
119
- - **LLM** – generate content with an LLM.
120
- - **Ignore** – exclude the segment entirely.
72
+ Only specify the fields you want to change - everything else uses the defaults.
121
73
  """
122
74
 
123
- segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]]
75
+ segmentation_strategy: Literal["LayoutAnalysis", "Page"]
124
76
  """Controls the segmentation strategy:
125
77
 
126
78
  - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
127
79
  `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
128
- segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
80
+ segmentation and better chunking.
129
81
  - `Page`: Treats each page as a single segment. Faster processing, but without
130
82
  layout element detection and only simple chunking.
131
83
  """
132
-
133
-
134
- class ChunkProcessingTokenizerEnum(TypedDict, total=False):
135
- enum: Required[
136
- Annotated[Literal["Word", "Cl100kBase", "XlmRobertaBase", "BertBaseUncased"], PropertyInfo(alias="Enum")]
137
- ]
138
- """Use one of the predefined tokenizer types"""
139
-
140
-
141
- class ChunkProcessingTokenizerString(TypedDict, total=False):
142
- string: Required[Annotated[str, PropertyInfo(alias="String")]]
143
- """
144
- Use any Hugging Face tokenizer by specifying its model ID Examples:
145
- "Qwen/Qwen-tokenizer", "facebook/bart-large"
146
- """
147
-
148
-
149
- ChunkProcessingTokenizer: TypeAlias = Union[ChunkProcessingTokenizerEnum, ChunkProcessingTokenizerString]
150
-
151
-
152
- class ChunkProcessing(TypedDict, total=False):
153
- ignore_headers_and_footers: Optional[bool]
154
- """DEPRECATED: use `segment_processing.ignore` instead"""
155
-
156
- target_length: int
157
- """The target number of words in each chunk.
158
-
159
- If 0, each chunk will contain a single segment.
160
- """
161
-
162
- tokenizer: ChunkProcessingTokenizer
163
- """The tokenizer to use for the chunking process."""
164
-
165
-
166
- class LlmProcessingFallbackStrategyModel(TypedDict, total=False):
167
- model: Required[Annotated[str, PropertyInfo(alias="Model")]]
168
- """Use a specific model as fallback"""
169
-
170
-
171
- LlmProcessingFallbackStrategy: TypeAlias = Union[Literal["None", "Default"], LlmProcessingFallbackStrategyModel]
172
-
173
-
174
- class LlmProcessing(TypedDict, total=False):
175
- fallback_strategy: LlmProcessingFallbackStrategy
176
- """The fallback strategy to use for the LLMs in the task."""
177
-
178
- llm_model_id: Optional[str]
179
- """The ID of the model to use for the task.
180
-
181
- If not provided, the default model will be used. Please check the documentation
182
- for the model you want to use.
183
- """
184
-
185
- max_completion_tokens: Optional[int]
186
- """The maximum number of tokens to generate."""
187
-
188
- temperature: float
189
- """The temperature to use for the LLM."""
190
-
191
-
192
- class SegmentProcessingCaption(TypedDict, total=False):
193
- crop_image: Literal["All", "Auto"]
194
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
195
-
196
- - `All` crops all images in the item
197
- - `Auto` crops images only if required for post-processing
198
- """
199
-
200
- description: bool
201
- """Generate LLM descriptions for this segment"""
202
-
203
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
204
- """**DEPRECATED**: `embed` field is auto populated"""
205
-
206
- extended_context: bool
207
- """Use the full page image as context for LLM generation"""
208
-
209
- format: Literal["Html", "Markdown"]
210
-
211
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
212
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
213
-
214
- llm: Optional[str]
215
- """**DEPRECATED**: use description instead"""
216
-
217
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
218
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
219
-
220
- strategy: Literal["LLM", "Auto", "Ignore"]
221
-
222
-
223
- class SegmentProcessingFootnote(TypedDict, total=False):
224
- crop_image: Literal["All", "Auto"]
225
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
226
-
227
- - `All` crops all images in the item
228
- - `Auto` crops images only if required for post-processing
229
- """
230
-
231
- description: bool
232
- """Generate LLM descriptions for this segment"""
233
-
234
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
235
- """**DEPRECATED**: `embed` field is auto populated"""
236
-
237
- extended_context: bool
238
- """Use the full page image as context for LLM generation"""
239
-
240
- format: Literal["Html", "Markdown"]
241
-
242
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
243
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
244
-
245
- llm: Optional[str]
246
- """**DEPRECATED**: use description instead"""
247
-
248
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
249
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
250
-
251
- strategy: Literal["LLM", "Auto", "Ignore"]
252
-
253
-
254
- class SegmentProcessingFormula(TypedDict, total=False):
255
- crop_image: Literal["All", "Auto"]
256
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
257
-
258
- - `All` crops all images in the item
259
- - `Auto` crops images only if required for post-processing
260
- """
261
-
262
- description: bool
263
- """Generate LLM descriptions for this segment"""
264
-
265
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
266
- """**DEPRECATED**: `embed` field is auto populated"""
267
-
268
- extended_context: bool
269
- """Use the full page image as context for LLM generation"""
270
-
271
- format: Literal["Html", "Markdown"]
272
-
273
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
274
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
275
-
276
- llm: Optional[str]
277
- """**DEPRECATED**: use description instead"""
278
-
279
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
280
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
281
-
282
- strategy: Literal["LLM", "Auto", "Ignore"]
283
-
284
-
285
- class SegmentProcessingListItem(TypedDict, total=False):
286
- crop_image: Literal["All", "Auto"]
287
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
288
-
289
- - `All` crops all images in the item
290
- - `Auto` crops images only if required for post-processing
291
- """
292
-
293
- description: bool
294
- """Generate LLM descriptions for this segment"""
295
-
296
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
297
- """**DEPRECATED**: `embed` field is auto populated"""
298
-
299
- extended_context: bool
300
- """Use the full page image as context for LLM generation"""
301
-
302
- format: Literal["Html", "Markdown"]
303
-
304
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
305
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
306
-
307
- llm: Optional[str]
308
- """**DEPRECATED**: use description instead"""
309
-
310
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
311
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
312
-
313
- strategy: Literal["LLM", "Auto", "Ignore"]
314
-
315
-
316
- class SegmentProcessingPage(TypedDict, total=False):
317
- crop_image: Literal["All", "Auto"]
318
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
319
-
320
- - `All` crops all images in the item
321
- - `Auto` crops images only if required for post-processing
322
- """
323
-
324
- description: bool
325
- """Generate LLM descriptions for this segment"""
326
-
327
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
328
- """**DEPRECATED**: `embed` field is auto populated"""
329
-
330
- extended_context: bool
331
- """Use the full page image as context for LLM generation"""
332
-
333
- format: Literal["Html", "Markdown"]
334
-
335
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
336
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
337
-
338
- llm: Optional[str]
339
- """**DEPRECATED**: use description instead"""
340
-
341
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
342
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
343
-
344
- strategy: Literal["LLM", "Auto", "Ignore"]
345
-
346
-
347
- class SegmentProcessingPageFooter(TypedDict, total=False):
348
- crop_image: Literal["All", "Auto"]
349
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
350
-
351
- - `All` crops all images in the item
352
- - `Auto` crops images only if required for post-processing
353
- """
354
-
355
- description: bool
356
- """Generate LLM descriptions for this segment"""
357
-
358
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
359
- """**DEPRECATED**: `embed` field is auto populated"""
360
-
361
- extended_context: bool
362
- """Use the full page image as context for LLM generation"""
363
-
364
- format: Literal["Html", "Markdown"]
365
-
366
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
367
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
368
-
369
- llm: Optional[str]
370
- """**DEPRECATED**: use description instead"""
371
-
372
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
373
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
374
-
375
- strategy: Literal["LLM", "Auto", "Ignore"]
376
-
377
-
378
- class SegmentProcessingPageHeader(TypedDict, total=False):
379
- crop_image: Literal["All", "Auto"]
380
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
381
-
382
- - `All` crops all images in the item
383
- - `Auto` crops images only if required for post-processing
384
- """
385
-
386
- description: bool
387
- """Generate LLM descriptions for this segment"""
388
-
389
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
390
- """**DEPRECATED**: `embed` field is auto populated"""
391
-
392
- extended_context: bool
393
- """Use the full page image as context for LLM generation"""
394
-
395
- format: Literal["Html", "Markdown"]
396
-
397
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
398
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
399
-
400
- llm: Optional[str]
401
- """**DEPRECATED**: use description instead"""
402
-
403
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
404
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
405
-
406
- strategy: Literal["LLM", "Auto", "Ignore"]
407
-
408
-
409
- class SegmentProcessingPicture(TypedDict, total=False):
410
- crop_image: Literal["All", "Auto"]
411
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
412
-
413
- - `All` crops all images in the item
414
- - `Auto` crops images only if required for post-processing
415
- """
416
-
417
- description: bool
418
- """Generate LLM descriptions for this segment"""
419
-
420
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
421
- """**DEPRECATED**: `embed` field is auto populated"""
422
-
423
- extended_context: bool
424
- """Use the full page image as context for LLM generation"""
425
-
426
- format: Literal["Html", "Markdown"]
427
-
428
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
429
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
430
-
431
- llm: Optional[str]
432
- """**DEPRECATED**: use description instead"""
433
-
434
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
435
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
436
-
437
- strategy: Literal["LLM", "Auto", "Ignore"]
438
-
439
-
440
- class SegmentProcessingSectionHeader(TypedDict, total=False):
441
- crop_image: Literal["All", "Auto"]
442
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
443
-
444
- - `All` crops all images in the item
445
- - `Auto` crops images only if required for post-processing
446
- """
447
-
448
- description: bool
449
- """Generate LLM descriptions for this segment"""
450
-
451
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
452
- """**DEPRECATED**: `embed` field is auto populated"""
453
-
454
- extended_context: bool
455
- """Use the full page image as context for LLM generation"""
456
-
457
- format: Literal["Html", "Markdown"]
458
-
459
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
460
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
461
-
462
- llm: Optional[str]
463
- """**DEPRECATED**: use description instead"""
464
-
465
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
466
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
467
-
468
- strategy: Literal["LLM", "Auto", "Ignore"]
469
-
470
-
471
- class SegmentProcessingTable(TypedDict, total=False):
472
- crop_image: Literal["All", "Auto"]
473
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
474
-
475
- - `All` crops all images in the item
476
- - `Auto` crops images only if required for post-processing
477
- """
478
-
479
- description: bool
480
- """Generate LLM descriptions for this segment"""
481
-
482
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
483
- """**DEPRECATED**: `embed` field is auto populated"""
484
-
485
- extended_context: bool
486
- """Use the full page image as context for LLM generation"""
487
-
488
- format: Literal["Html", "Markdown"]
489
-
490
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
491
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
492
-
493
- llm: Optional[str]
494
- """**DEPRECATED**: use description instead"""
495
-
496
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
497
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
498
-
499
- strategy: Literal["LLM", "Auto", "Ignore"]
500
-
501
-
502
- class SegmentProcessingText(TypedDict, total=False):
503
- crop_image: Literal["All", "Auto"]
504
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
505
-
506
- - `All` crops all images in the item
507
- - `Auto` crops images only if required for post-processing
508
- """
509
-
510
- description: bool
511
- """Generate LLM descriptions for this segment"""
512
-
513
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
514
- """**DEPRECATED**: `embed` field is auto populated"""
515
-
516
- extended_context: bool
517
- """Use the full page image as context for LLM generation"""
518
-
519
- format: Literal["Html", "Markdown"]
520
-
521
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
522
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
523
-
524
- llm: Optional[str]
525
- """**DEPRECATED**: use description instead"""
526
-
527
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
528
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
529
-
530
- strategy: Literal["LLM", "Auto", "Ignore"]
531
-
532
-
533
- class SegmentProcessingTitle(TypedDict, total=False):
534
- crop_image: Literal["All", "Auto"]
535
- """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
536
-
537
- - `All` crops all images in the item
538
- - `Auto` crops images only if required for post-processing
539
- """
540
-
541
- description: bool
542
- """Generate LLM descriptions for this segment"""
543
-
544
- embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]]
545
- """**DEPRECATED**: `embed` field is auto populated"""
546
-
547
- extended_context: bool
548
- """Use the full page image as context for LLM generation"""
549
-
550
- format: Literal["Html", "Markdown"]
551
-
552
- html: Optional[Literal["LLM", "Auto", "Ignore"]]
553
- """**DEPRECATED**: Use `format: html` and `strategy` instead."""
554
-
555
- llm: Optional[str]
556
- """**DEPRECATED**: use description instead"""
557
-
558
- markdown: Optional[Literal["LLM", "Auto", "Ignore"]]
559
- """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
560
-
561
- strategy: Literal["LLM", "Auto", "Ignore"]
562
-
563
-
564
- class SegmentProcessing(TypedDict, total=False):
565
- caption: Annotated[Optional[SegmentProcessingCaption], PropertyInfo(alias="Caption")]
566
- """Controls the processing and generation for the segment.
567
-
568
- - `crop_image` controls whether to crop the file's images to the segment's
569
- bounding box. The cropped image will be stored in the segment's `image` field.
570
- Use `All` to always crop, or `Auto` to only crop when needed for
571
- post-processing.
572
- - `format` specifies the output format: `Html` or `Markdown`
573
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
574
- - `Auto`: Process content automatically
575
- - `LLM`: Use large language models for processing
576
- - `Ignore`: Exclude segments from final output
577
- - `description` enables LLM-generated descriptions for segments. **Note:** This
578
- uses chunkr's own VLM models and is not configurable via LLM processing
579
- configuration.
580
- - `extended_context` uses the full page image as context for LLM generation.
581
-
582
- **Deprecated fields (for backwards compatibility):**
583
-
584
- - `llm` - **DEPRECATED**: Use `description` instead
585
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
586
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
587
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
588
- """
589
-
590
- footnote: Annotated[Optional[SegmentProcessingFootnote], PropertyInfo(alias="Footnote")]
591
- """Controls the processing and generation for the segment.
592
-
593
- - `crop_image` controls whether to crop the file's images to the segment's
594
- bounding box. The cropped image will be stored in the segment's `image` field.
595
- Use `All` to always crop, or `Auto` to only crop when needed for
596
- post-processing.
597
- - `format` specifies the output format: `Html` or `Markdown`
598
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
599
- - `Auto`: Process content automatically
600
- - `LLM`: Use large language models for processing
601
- - `Ignore`: Exclude segments from final output
602
- - `description` enables LLM-generated descriptions for segments. **Note:** This
603
- uses chunkr's own VLM models and is not configurable via LLM processing
604
- configuration.
605
- - `extended_context` uses the full page image as context for LLM generation.
606
-
607
- **Deprecated fields (for backwards compatibility):**
608
-
609
- - `llm` - **DEPRECATED**: Use `description` instead
610
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
611
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
612
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
613
- """
614
-
615
- formula: Annotated[Optional[SegmentProcessingFormula], PropertyInfo(alias="Formula")]
616
- """Controls the processing and generation for the segment.
617
-
618
- - `crop_image` controls whether to crop the file's images to the segment's
619
- bounding box. The cropped image will be stored in the segment's `image` field.
620
- Use `All` to always crop, or `Auto` to only crop when needed for
621
- post-processing.
622
- - `format` specifies the output format: `Html` or `Markdown`
623
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
624
- - `Auto`: Process content automatically
625
- - `LLM`: Use large language models for processing
626
- - `Ignore`: Exclude segments from final output
627
- - `description` enables LLM-generated descriptions for segments. **Note:** This
628
- uses chunkr's own VLM models and is not configurable via LLM processing
629
- configuration.
630
- - `extended_context` uses the full page image as context for LLM generation.
631
-
632
- **Deprecated fields (for backwards compatibility):**
633
-
634
- - `llm` - **DEPRECATED**: Use `description` instead
635
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
636
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
637
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
638
- """
639
-
640
- list_item: Annotated[Optional[SegmentProcessingListItem], PropertyInfo(alias="ListItem")]
641
- """Controls the processing and generation for the segment.
642
-
643
- - `crop_image` controls whether to crop the file's images to the segment's
644
- bounding box. The cropped image will be stored in the segment's `image` field.
645
- Use `All` to always crop, or `Auto` to only crop when needed for
646
- post-processing.
647
- - `format` specifies the output format: `Html` or `Markdown`
648
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
649
- - `Auto`: Process content automatically
650
- - `LLM`: Use large language models for processing
651
- - `Ignore`: Exclude segments from final output
652
- - `description` enables LLM-generated descriptions for segments. **Note:** This
653
- uses chunkr's own VLM models and is not configurable via LLM processing
654
- configuration.
655
- - `extended_context` uses the full page image as context for LLM generation.
656
-
657
- **Deprecated fields (for backwards compatibility):**
658
-
659
- - `llm` - **DEPRECATED**: Use `description` instead
660
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
661
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
662
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
663
- """
664
-
665
- page: Annotated[Optional[SegmentProcessingPage], PropertyInfo(alias="Page")]
666
- """Controls the processing and generation for the segment.
667
-
668
- - `crop_image` controls whether to crop the file's images to the segment's
669
- bounding box. The cropped image will be stored in the segment's `image` field.
670
- Use `All` to always crop, or `Auto` to only crop when needed for
671
- post-processing.
672
- - `format` specifies the output format: `Html` or `Markdown`
673
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
674
- - `Auto`: Process content automatically
675
- - `LLM`: Use large language models for processing
676
- - `Ignore`: Exclude segments from final output
677
- - `description` enables LLM-generated descriptions for segments. **Note:** This
678
- uses chunkr's own VLM models and is not configurable via LLM processing
679
- configuration.
680
- - `extended_context` uses the full page image as context for LLM generation.
681
-
682
- **Deprecated fields (for backwards compatibility):**
683
-
684
- - `llm` - **DEPRECATED**: Use `description` instead
685
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
686
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
687
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
688
- """
689
-
690
- page_footer: Annotated[Optional[SegmentProcessingPageFooter], PropertyInfo(alias="PageFooter")]
691
- """Controls the processing and generation for the segment.
692
-
693
- - `crop_image` controls whether to crop the file's images to the segment's
694
- bounding box. The cropped image will be stored in the segment's `image` field.
695
- Use `All` to always crop, or `Auto` to only crop when needed for
696
- post-processing.
697
- - `format` specifies the output format: `Html` or `Markdown`
698
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
699
- - `Auto`: Process content automatically
700
- - `LLM`: Use large language models for processing
701
- - `Ignore`: Exclude segments from final output
702
- - `description` enables LLM-generated descriptions for segments. **Note:** This
703
- uses chunkr's own VLM models and is not configurable via LLM processing
704
- configuration.
705
- - `extended_context` uses the full page image as context for LLM generation.
706
-
707
- **Deprecated fields (for backwards compatibility):**
708
-
709
- - `llm` - **DEPRECATED**: Use `description` instead
710
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
711
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
712
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
713
- """
714
-
715
- page_header: Annotated[Optional[SegmentProcessingPageHeader], PropertyInfo(alias="PageHeader")]
716
- """Controls the processing and generation for the segment.
717
-
718
- - `crop_image` controls whether to crop the file's images to the segment's
719
- bounding box. The cropped image will be stored in the segment's `image` field.
720
- Use `All` to always crop, or `Auto` to only crop when needed for
721
- post-processing.
722
- - `format` specifies the output format: `Html` or `Markdown`
723
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
724
- - `Auto`: Process content automatically
725
- - `LLM`: Use large language models for processing
726
- - `Ignore`: Exclude segments from final output
727
- - `description` enables LLM-generated descriptions for segments. **Note:** This
728
- uses chunkr's own VLM models and is not configurable via LLM processing
729
- configuration.
730
- - `extended_context` uses the full page image as context for LLM generation.
731
-
732
- **Deprecated fields (for backwards compatibility):**
733
-
734
- - `llm` - **DEPRECATED**: Use `description` instead
735
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
736
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
737
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
738
- """
739
-
740
- picture: Annotated[Optional[SegmentProcessingPicture], PropertyInfo(alias="Picture")]
741
- """Controls the processing and generation for the segment.
742
-
743
- - `crop_image` controls whether to crop the file's images to the segment's
744
- bounding box. The cropped image will be stored in the segment's `image` field.
745
- Use `All` to always crop, or `Auto` to only crop when needed for
746
- post-processing.
747
- - `format` specifies the output format: `Html` or `Markdown`
748
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
749
- - `Auto`: Process content automatically
750
- - `LLM`: Use large language models for processing
751
- - `Ignore`: Exclude segments from final output
752
- - `description` enables LLM-generated descriptions for segments. **Note:** This
753
- uses chunkr's own VLM models and is not configurable via LLM processing
754
- configuration.
755
- - `extended_context` uses the full page image as context for LLM generation.
756
-
757
- **Deprecated fields (for backwards compatibility):**
758
-
759
- - `llm` - **DEPRECATED**: Use `description` instead
760
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
761
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
762
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
763
- """
764
-
765
- section_header: Annotated[Optional[SegmentProcessingSectionHeader], PropertyInfo(alias="SectionHeader")]
766
- """Controls the processing and generation for the segment.
767
-
768
- - `crop_image` controls whether to crop the file's images to the segment's
769
- bounding box. The cropped image will be stored in the segment's `image` field.
770
- Use `All` to always crop, or `Auto` to only crop when needed for
771
- post-processing.
772
- - `format` specifies the output format: `Html` or `Markdown`
773
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
774
- - `Auto`: Process content automatically
775
- - `LLM`: Use large language models for processing
776
- - `Ignore`: Exclude segments from final output
777
- - `description` enables LLM-generated descriptions for segments. **Note:** This
778
- uses chunkr's own VLM models and is not configurable via LLM processing
779
- configuration.
780
- - `extended_context` uses the full page image as context for LLM generation.
781
-
782
- **Deprecated fields (for backwards compatibility):**
783
-
784
- - `llm` - **DEPRECATED**: Use `description` instead
785
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
786
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
787
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
788
- """
789
-
790
- table: Annotated[Optional[SegmentProcessingTable], PropertyInfo(alias="Table")]
791
- """Controls the processing and generation for the segment.
792
-
793
- - `crop_image` controls whether to crop the file's images to the segment's
794
- bounding box. The cropped image will be stored in the segment's `image` field.
795
- Use `All` to always crop, or `Auto` to only crop when needed for
796
- post-processing.
797
- - `format` specifies the output format: `Html` or `Markdown`
798
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
799
- - `Auto`: Process content automatically
800
- - `LLM`: Use large language models for processing
801
- - `Ignore`: Exclude segments from final output
802
- - `description` enables LLM-generated descriptions for segments. **Note:** This
803
- uses chunkr's own VLM models and is not configurable via LLM processing
804
- configuration.
805
- - `extended_context` uses the full page image as context for LLM generation.
806
-
807
- **Deprecated fields (for backwards compatibility):**
808
-
809
- - `llm` - **DEPRECATED**: Use `description` instead
810
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
811
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
812
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
813
- """
814
-
815
- text: Annotated[Optional[SegmentProcessingText], PropertyInfo(alias="Text")]
816
- """Controls the processing and generation for the segment.
817
-
818
- - `crop_image` controls whether to crop the file's images to the segment's
819
- bounding box. The cropped image will be stored in the segment's `image` field.
820
- Use `All` to always crop, or `Auto` to only crop when needed for
821
- post-processing.
822
- - `format` specifies the output format: `Html` or `Markdown`
823
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
824
- - `Auto`: Process content automatically
825
- - `LLM`: Use large language models for processing
826
- - `Ignore`: Exclude segments from final output
827
- - `description` enables LLM-generated descriptions for segments. **Note:** This
828
- uses chunkr's own VLM models and is not configurable via LLM processing
829
- configuration.
830
- - `extended_context` uses the full page image as context for LLM generation.
831
-
832
- **Deprecated fields (for backwards compatibility):**
833
-
834
- - `llm` - **DEPRECATED**: Use `description` instead
835
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
836
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
837
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
838
- """
839
-
840
- title: Annotated[Optional[SegmentProcessingTitle], PropertyInfo(alias="Title")]
841
- """Controls the processing and generation for the segment.
842
-
843
- - `crop_image` controls whether to crop the file's images to the segment's
844
- bounding box. The cropped image will be stored in the segment's `image` field.
845
- Use `All` to always crop, or `Auto` to only crop when needed for
846
- post-processing.
847
- - `format` specifies the output format: `Html` or `Markdown`
848
- - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
849
- - `Auto`: Process content automatically
850
- - `LLM`: Use large language models for processing
851
- - `Ignore`: Exclude segments from final output
852
- - `description` enables LLM-generated descriptions for segments. **Note:** This
853
- uses chunkr's own VLM models and is not configurable via LLM processing
854
- configuration.
855
- - `extended_context` uses the full page image as context for LLM generation.
856
-
857
- **Deprecated fields (for backwards compatibility):**
858
-
859
- - `llm` - **DEPRECATED**: Use `description` instead
860
- - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
861
- - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
862
- - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
863
- """