chunkr-ai 0.1.0a1__py3-none-any.whl → 0.1.0a3__py3-none-any.whl

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (46)
  1. chunkr_ai/_client.py +18 -9
  2. chunkr_ai/_files.py +1 -1
  3. chunkr_ai/_version.py +1 -1
  4. chunkr_ai/pagination.py +61 -1
  5. chunkr_ai/resources/__init__.py +27 -13
  6. chunkr_ai/resources/files.py +712 -0
  7. chunkr_ai/resources/tasks/__init__.py +33 -0
  8. chunkr_ai/resources/tasks/parse.py +612 -0
  9. chunkr_ai/resources/tasks/tasks.py +596 -0
  10. chunkr_ai/types/__init__.py +7 -19
  11. chunkr_ai/types/delete.py +10 -0
  12. chunkr_ai/types/file.py +30 -0
  13. chunkr_ai/types/file_create_params.py +17 -0
  14. chunkr_ai/types/file_list_params.py +28 -0
  15. chunkr_ai/types/file_url.py +15 -0
  16. chunkr_ai/types/file_url_params.py +15 -0
  17. chunkr_ai/types/files_page_response.py +20 -0
  18. chunkr_ai/types/task.py +866 -27
  19. chunkr_ai/types/tasks/__init__.py +6 -0
  20. chunkr_ai/types/tasks/parse_create_params.py +844 -0
  21. chunkr_ai/types/tasks/parse_update_params.py +838 -0
  22. {chunkr_ai-0.1.0a1.dist-info → chunkr_ai-0.1.0a3.dist-info}/METADATA +39 -21
  23. chunkr_ai-0.1.0a3.dist-info/RECORD +52 -0
  24. chunkr_ai/resources/task.py +0 -1166
  25. chunkr_ai/types/auto_generation_config.py +0 -39
  26. chunkr_ai/types/auto_generation_config_param.py +0 -39
  27. chunkr_ai/types/bounding_box.py +0 -19
  28. chunkr_ai/types/chunk_processing.py +0 -40
  29. chunkr_ai/types/chunk_processing_param.py +0 -42
  30. chunkr_ai/types/ignore_generation_config.py +0 -39
  31. chunkr_ai/types/ignore_generation_config_param.py +0 -39
  32. chunkr_ai/types/llm_generation_config.py +0 -39
  33. chunkr_ai/types/llm_generation_config_param.py +0 -39
  34. chunkr_ai/types/llm_processing.py +0 -36
  35. chunkr_ai/types/llm_processing_param.py +0 -36
  36. chunkr_ai/types/picture_generation_config.py +0 -39
  37. chunkr_ai/types/picture_generation_config_param.py +0 -39
  38. chunkr_ai/types/segment_processing.py +0 -280
  39. chunkr_ai/types/segment_processing_param.py +0 -281
  40. chunkr_ai/types/table_generation_config.py +0 -39
  41. chunkr_ai/types/table_generation_config_param.py +0 -39
  42. chunkr_ai/types/task_parse_params.py +0 -90
  43. chunkr_ai/types/task_update_params.py +0 -90
  44. chunkr_ai-0.1.0a1.dist-info/RECORD +0 -58
  45. {chunkr_ai-0.1.0a1.dist-info → chunkr_ai-0.1.0a3.dist-info}/WHEEL +0 -0
  46. {chunkr_ai-0.1.0a1.dist-info → chunkr_ai-0.1.0a3.dist-info}/licenses/LICENSE +0 -0
chunkr_ai/types/task.py CHANGED
@@ -1,31 +1,807 @@
1
1
  # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
2
 
3
- from typing import List, Optional
3
+ from typing import List, Union, Optional
4
4
  from datetime import datetime
5
- from typing_extensions import Literal
5
+ from typing_extensions import Literal, TypeAlias
6
+
7
+ from pydantic import Field as FieldInfo
6
8
 
7
9
  from .._models import BaseModel
8
- from .bounding_box import BoundingBox
9
- from .llm_processing import LlmProcessing
10
- from .chunk_processing import ChunkProcessing
11
- from .segment_processing import SegmentProcessing
12
10
 
13
11
  __all__ = [
14
12
  "Task",
15
13
  "Configuration",
14
+ "ConfigurationChunkProcessing",
15
+ "ConfigurationChunkProcessingTokenizer",
16
+ "ConfigurationChunkProcessingTokenizerEnum",
17
+ "ConfigurationChunkProcessingTokenizerString",
18
+ "ConfigurationLlmProcessing",
19
+ "ConfigurationLlmProcessingFallbackStrategy",
20
+ "ConfigurationLlmProcessingFallbackStrategyModel",
21
+ "ConfigurationSegmentProcessing",
22
+ "ConfigurationSegmentProcessingCaption",
23
+ "ConfigurationSegmentProcessingFootnote",
24
+ "ConfigurationSegmentProcessingFormula",
25
+ "ConfigurationSegmentProcessingListItem",
26
+ "ConfigurationSegmentProcessingPage",
27
+ "ConfigurationSegmentProcessingPageFooter",
28
+ "ConfigurationSegmentProcessingPageHeader",
29
+ "ConfigurationSegmentProcessingPicture",
30
+ "ConfigurationSegmentProcessingSectionHeader",
31
+ "ConfigurationSegmentProcessingTable",
32
+ "ConfigurationSegmentProcessingText",
33
+ "ConfigurationSegmentProcessingTitle",
34
+ "ConfigurationClientVersion",
35
+ "ConfigurationClientVersionManualSDK",
36
+ "ConfigurationClientVersionGeneratedSDK",
16
37
  "Output",
17
38
  "OutputChunk",
18
39
  "OutputChunkSegment",
40
+ "OutputChunkSegmentBbox",
19
41
  "OutputChunkSegmentOcr",
42
+ "OutputChunkSegmentOcrBbox",
20
43
  "OutputChunkSegmentSSCell",
21
44
  "OutputChunkSegmentSSCellStyle",
45
+ "OutputChunkSegmentSSHeaderBbox",
22
46
  "OutputChunkSegmentSSHeaderOcr",
47
+ "OutputChunkSegmentSSHeaderOcrBbox",
23
48
  "OutputPage",
24
49
  ]
25
50
 
26
51
 
52
+ class ConfigurationChunkProcessingTokenizerEnum(BaseModel):
53
+ enum: Literal["Word", "Cl100kBase", "XlmRobertaBase", "BertBaseUncased"] = FieldInfo(alias="Enum")
54
+ """Use one of the predefined tokenizer types"""
55
+
56
+
57
+ class ConfigurationChunkProcessingTokenizerString(BaseModel):
58
+ string: str = FieldInfo(alias="String")
59
+ """
60
+ Use any Hugging Face tokenizer by specifying its model ID Examples:
61
+ "Qwen/Qwen-tokenizer", "facebook/bart-large"
62
+ """
63
+
64
+
65
+ ConfigurationChunkProcessingTokenizer: TypeAlias = Union[
66
+ ConfigurationChunkProcessingTokenizerEnum, ConfigurationChunkProcessingTokenizerString
67
+ ]
68
+
69
+
70
+ class ConfigurationChunkProcessing(BaseModel):
71
+ ignore_headers_and_footers: Optional[bool] = None
72
+ """DEPRECATED: use `segment_processing.ignore` instead"""
73
+
74
+ target_length: Optional[int] = None
75
+ """The target number of words in each chunk.
76
+
77
+ If 0, each chunk will contain a single segment.
78
+ """
79
+
80
+ tokenizer: Optional[ConfigurationChunkProcessingTokenizer] = None
81
+ """The tokenizer to use for the chunking process."""
82
+
83
+
84
+ class ConfigurationLlmProcessingFallbackStrategyModel(BaseModel):
85
+ model: str = FieldInfo(alias="Model")
86
+ """Use a specific model as fallback"""
87
+
88
+
89
+ ConfigurationLlmProcessingFallbackStrategy: TypeAlias = Union[
90
+ Literal["None", "Default"], ConfigurationLlmProcessingFallbackStrategyModel
91
+ ]
92
+
93
+
94
+ class ConfigurationLlmProcessing(BaseModel):
95
+ fallback_strategy: Optional[ConfigurationLlmProcessingFallbackStrategy] = None
96
+ """The fallback strategy to use for the LLMs in the task."""
97
+
98
+ llm_model_id: Optional[str] = None
99
+ """The ID of the model to use for the task.
100
+
101
+ If not provided, the default model will be used. Please check the documentation
102
+ for the model you want to use.
103
+ """
104
+
105
+ max_completion_tokens: Optional[int] = None
106
+ """The maximum number of tokens to generate."""
107
+
108
+ temperature: Optional[float] = None
109
+ """The temperature to use for the LLM."""
110
+
111
+
112
+ class ConfigurationSegmentProcessingCaption(BaseModel):
113
+ crop_image: Optional[Literal["All", "Auto"]] = None
114
+ """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
115
+
116
+ - `All` crops all images in the item
117
+ - `Auto` crops images only if required for post-processing
118
+ """
119
+
120
+ description: Optional[bool] = None
121
+ """Generate LLM descriptions for this segment"""
122
+
123
+ embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
124
+ """**DEPRECATED**: `embed` field is auto populated"""
125
+
126
+ extended_context: Optional[bool] = None
127
+ """Use the full page image as context for LLM generation"""
128
+
129
+ format: Optional[Literal["Html", "Markdown"]] = None
130
+
131
+ html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
132
+ """**DEPRECATED**: Use `format: html` and `strategy` instead."""
133
+
134
+ llm: Optional[str] = None
135
+ """**DEPRECATED**: use description instead"""
136
+
137
+ markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
138
+ """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
139
+
140
+ strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
141
+
142
+
143
+ class ConfigurationSegmentProcessingFootnote(BaseModel):
144
+ crop_image: Optional[Literal["All", "Auto"]] = None
145
+ """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
146
+
147
+ - `All` crops all images in the item
148
+ - `Auto` crops images only if required for post-processing
149
+ """
150
+
151
+ description: Optional[bool] = None
152
+ """Generate LLM descriptions for this segment"""
153
+
154
+ embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
155
+ """**DEPRECATED**: `embed` field is auto populated"""
156
+
157
+ extended_context: Optional[bool] = None
158
+ """Use the full page image as context for LLM generation"""
159
+
160
+ format: Optional[Literal["Html", "Markdown"]] = None
161
+
162
+ html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
163
+ """**DEPRECATED**: Use `format: html` and `strategy` instead."""
164
+
165
+ llm: Optional[str] = None
166
+ """**DEPRECATED**: use description instead"""
167
+
168
+ markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
169
+ """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
170
+
171
+ strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
172
+
173
+
174
+ class ConfigurationSegmentProcessingFormula(BaseModel):
175
+ crop_image: Optional[Literal["All", "Auto"]] = None
176
+ """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
177
+
178
+ - `All` crops all images in the item
179
+ - `Auto` crops images only if required for post-processing
180
+ """
181
+
182
+ description: Optional[bool] = None
183
+ """Generate LLM descriptions for this segment"""
184
+
185
+ embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
186
+ """**DEPRECATED**: `embed` field is auto populated"""
187
+
188
+ extended_context: Optional[bool] = None
189
+ """Use the full page image as context for LLM generation"""
190
+
191
+ format: Optional[Literal["Html", "Markdown"]] = None
192
+
193
+ html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
194
+ """**DEPRECATED**: Use `format: html` and `strategy` instead."""
195
+
196
+ llm: Optional[str] = None
197
+ """**DEPRECATED**: use description instead"""
198
+
199
+ markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
200
+ """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
201
+
202
+ strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
203
+
204
+
205
+ class ConfigurationSegmentProcessingListItem(BaseModel):
206
+ crop_image: Optional[Literal["All", "Auto"]] = None
207
+ """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
208
+
209
+ - `All` crops all images in the item
210
+ - `Auto` crops images only if required for post-processing
211
+ """
212
+
213
+ description: Optional[bool] = None
214
+ """Generate LLM descriptions for this segment"""
215
+
216
+ embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
217
+ """**DEPRECATED**: `embed` field is auto populated"""
218
+
219
+ extended_context: Optional[bool] = None
220
+ """Use the full page image as context for LLM generation"""
221
+
222
+ format: Optional[Literal["Html", "Markdown"]] = None
223
+
224
+ html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
225
+ """**DEPRECATED**: Use `format: html` and `strategy` instead."""
226
+
227
+ llm: Optional[str] = None
228
+ """**DEPRECATED**: use description instead"""
229
+
230
+ markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
231
+ """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
232
+
233
+ strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
234
+
235
+
236
+ class ConfigurationSegmentProcessingPage(BaseModel):
237
+ crop_image: Optional[Literal["All", "Auto"]] = None
238
+ """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
239
+
240
+ - `All` crops all images in the item
241
+ - `Auto` crops images only if required for post-processing
242
+ """
243
+
244
+ description: Optional[bool] = None
245
+ """Generate LLM descriptions for this segment"""
246
+
247
+ embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
248
+ """**DEPRECATED**: `embed` field is auto populated"""
249
+
250
+ extended_context: Optional[bool] = None
251
+ """Use the full page image as context for LLM generation"""
252
+
253
+ format: Optional[Literal["Html", "Markdown"]] = None
254
+
255
+ html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
256
+ """**DEPRECATED**: Use `format: html` and `strategy` instead."""
257
+
258
+ llm: Optional[str] = None
259
+ """**DEPRECATED**: use description instead"""
260
+
261
+ markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
262
+ """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
263
+
264
+ strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
265
+
266
+
267
+ class ConfigurationSegmentProcessingPageFooter(BaseModel):
268
+ crop_image: Optional[Literal["All", "Auto"]] = None
269
+ """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
270
+
271
+ - `All` crops all images in the item
272
+ - `Auto` crops images only if required for post-processing
273
+ """
274
+
275
+ description: Optional[bool] = None
276
+ """Generate LLM descriptions for this segment"""
277
+
278
+ embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
279
+ """**DEPRECATED**: `embed` field is auto populated"""
280
+
281
+ extended_context: Optional[bool] = None
282
+ """Use the full page image as context for LLM generation"""
283
+
284
+ format: Optional[Literal["Html", "Markdown"]] = None
285
+
286
+ html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
287
+ """**DEPRECATED**: Use `format: html` and `strategy` instead."""
288
+
289
+ llm: Optional[str] = None
290
+ """**DEPRECATED**: use description instead"""
291
+
292
+ markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
293
+ """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
294
+
295
+ strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
296
+
297
+
298
+ class ConfigurationSegmentProcessingPageHeader(BaseModel):
299
+ crop_image: Optional[Literal["All", "Auto"]] = None
300
+ """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
301
+
302
+ - `All` crops all images in the item
303
+ - `Auto` crops images only if required for post-processing
304
+ """
305
+
306
+ description: Optional[bool] = None
307
+ """Generate LLM descriptions for this segment"""
308
+
309
+ embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
310
+ """**DEPRECATED**: `embed` field is auto populated"""
311
+
312
+ extended_context: Optional[bool] = None
313
+ """Use the full page image as context for LLM generation"""
314
+
315
+ format: Optional[Literal["Html", "Markdown"]] = None
316
+
317
+ html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
318
+ """**DEPRECATED**: Use `format: html` and `strategy` instead."""
319
+
320
+ llm: Optional[str] = None
321
+ """**DEPRECATED**: use description instead"""
322
+
323
+ markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
324
+ """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
325
+
326
+ strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
327
+
328
+
329
+ class ConfigurationSegmentProcessingPicture(BaseModel):
330
+ crop_image: Optional[Literal["All", "Auto"]] = None
331
+ """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
332
+
333
+ - `All` crops all images in the item
334
+ - `Auto` crops images only if required for post-processing
335
+ """
336
+
337
+ description: Optional[bool] = None
338
+ """Generate LLM descriptions for this segment"""
339
+
340
+ embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
341
+ """**DEPRECATED**: `embed` field is auto populated"""
342
+
343
+ extended_context: Optional[bool] = None
344
+ """Use the full page image as context for LLM generation"""
345
+
346
+ format: Optional[Literal["Html", "Markdown"]] = None
347
+
348
+ html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
349
+ """**DEPRECATED**: Use `format: html` and `strategy` instead."""
350
+
351
+ llm: Optional[str] = None
352
+ """**DEPRECATED**: use description instead"""
353
+
354
+ markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
355
+ """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
356
+
357
+ strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
358
+
359
+
360
+ class ConfigurationSegmentProcessingSectionHeader(BaseModel):
361
+ crop_image: Optional[Literal["All", "Auto"]] = None
362
+ """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
363
+
364
+ - `All` crops all images in the item
365
+ - `Auto` crops images only if required for post-processing
366
+ """
367
+
368
+ description: Optional[bool] = None
369
+ """Generate LLM descriptions for this segment"""
370
+
371
+ embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
372
+ """**DEPRECATED**: `embed` field is auto populated"""
373
+
374
+ extended_context: Optional[bool] = None
375
+ """Use the full page image as context for LLM generation"""
376
+
377
+ format: Optional[Literal["Html", "Markdown"]] = None
378
+
379
+ html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
380
+ """**DEPRECATED**: Use `format: html` and `strategy` instead."""
381
+
382
+ llm: Optional[str] = None
383
+ """**DEPRECATED**: use description instead"""
384
+
385
+ markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
386
+ """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
387
+
388
+ strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
389
+
390
+
391
+ class ConfigurationSegmentProcessingTable(BaseModel):
392
+ crop_image: Optional[Literal["All", "Auto"]] = None
393
+ """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
394
+
395
+ - `All` crops all images in the item
396
+ - `Auto` crops images only if required for post-processing
397
+ """
398
+
399
+ description: Optional[bool] = None
400
+ """Generate LLM descriptions for this segment"""
401
+
402
+ embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
403
+ """**DEPRECATED**: `embed` field is auto populated"""
404
+
405
+ extended_context: Optional[bool] = None
406
+ """Use the full page image as context for LLM generation"""
407
+
408
+ format: Optional[Literal["Html", "Markdown"]] = None
409
+
410
+ html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
411
+ """**DEPRECATED**: Use `format: html` and `strategy` instead."""
412
+
413
+ llm: Optional[str] = None
414
+ """**DEPRECATED**: use description instead"""
415
+
416
+ markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
417
+ """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
418
+
419
+ strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
420
+
421
+
422
+ class ConfigurationSegmentProcessingText(BaseModel):
423
+ crop_image: Optional[Literal["All", "Auto"]] = None
424
+ """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
425
+
426
+ - `All` crops all images in the item
427
+ - `Auto` crops images only if required for post-processing
428
+ """
429
+
430
+ description: Optional[bool] = None
431
+ """Generate LLM descriptions for this segment"""
432
+
433
+ embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
434
+ """**DEPRECATED**: `embed` field is auto populated"""
435
+
436
+ extended_context: Optional[bool] = None
437
+ """Use the full page image as context for LLM generation"""
438
+
439
+ format: Optional[Literal["Html", "Markdown"]] = None
440
+
441
+ html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
442
+ """**DEPRECATED**: Use `format: html` and `strategy` instead."""
443
+
444
+ llm: Optional[str] = None
445
+ """**DEPRECATED**: use description instead"""
446
+
447
+ markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
448
+ """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
449
+
450
+ strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
451
+
452
+
453
+ class ConfigurationSegmentProcessingTitle(BaseModel):
454
+ crop_image: Optional[Literal["All", "Auto"]] = None
455
+ """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
456
+
457
+ - `All` crops all images in the item
458
+ - `Auto` crops images only if required for post-processing
459
+ """
460
+
461
+ description: Optional[bool] = None
462
+ """Generate LLM descriptions for this segment"""
463
+
464
+ embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
465
+ """**DEPRECATED**: `embed` field is auto populated"""
466
+
467
+ extended_context: Optional[bool] = None
468
+ """Use the full page image as context for LLM generation"""
469
+
470
+ format: Optional[Literal["Html", "Markdown"]] = None
471
+
472
+ html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
473
+ """**DEPRECATED**: Use `format: html` and `strategy` instead."""
474
+
475
+ llm: Optional[str] = None
476
+ """**DEPRECATED**: use description instead"""
477
+
478
+ markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
479
+ """**DEPRECATED**: Use `format: markdown` and `strategy` instead."""
480
+
481
+ strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
482
+
483
+
484
+ class ConfigurationSegmentProcessing(BaseModel):
485
+ caption: Optional[ConfigurationSegmentProcessingCaption] = FieldInfo(alias="Caption", default=None)
486
+ """Controls the processing and generation for the segment.
487
+
488
+ - `crop_image` controls whether to crop the file's images to the segment's
489
+ bounding box. The cropped image will be stored in the segment's `image` field.
490
+ Use `All` to always crop, or `Auto` to only crop when needed for
491
+ post-processing.
492
+ - `format` specifies the output format: `Html` or `Markdown`
493
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
494
+ - `Auto`: Process content automatically
495
+ - `LLM`: Use large language models for processing
496
+ - `Ignore`: Exclude segments from final output
497
+ - `description` enables LLM-generated descriptions for segments. **Note:** This
498
+ uses chunkr's own VLM models and is not configurable via LLM processing
499
+ configuration.
500
+ - `extended_context` uses the full page image as context for LLM generation.
501
+
502
+ **Deprecated fields (for backwards compatibility):**
503
+
504
+ - `llm` - **DEPRECATED**: Use `description` instead
505
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
506
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
507
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
508
+ """
509
+
510
+ footnote: Optional[ConfigurationSegmentProcessingFootnote] = FieldInfo(alias="Footnote", default=None)
511
+ """Controls the processing and generation for the segment.
512
+
513
+ - `crop_image` controls whether to crop the file's images to the segment's
514
+ bounding box. The cropped image will be stored in the segment's `image` field.
515
+ Use `All` to always crop, or `Auto` to only crop when needed for
516
+ post-processing.
517
+ - `format` specifies the output format: `Html` or `Markdown`
518
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
519
+ - `Auto`: Process content automatically
520
+ - `LLM`: Use large language models for processing
521
+ - `Ignore`: Exclude segments from final output
522
+ - `description` enables LLM-generated descriptions for segments. **Note:** This
523
+ uses chunkr's own VLM models and is not configurable via LLM processing
524
+ configuration.
525
+ - `extended_context` uses the full page image as context for LLM generation.
526
+
527
+ **Deprecated fields (for backwards compatibility):**
528
+
529
+ - `llm` - **DEPRECATED**: Use `description` instead
530
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
531
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
532
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
533
+ """
534
+
535
+ formula: Optional[ConfigurationSegmentProcessingFormula] = FieldInfo(alias="Formula", default=None)
536
+ """Controls the processing and generation for the segment.
537
+
538
+ - `crop_image` controls whether to crop the file's images to the segment's
539
+ bounding box. The cropped image will be stored in the segment's `image` field.
540
+ Use `All` to always crop, or `Auto` to only crop when needed for
541
+ post-processing.
542
+ - `format` specifies the output format: `Html` or `Markdown`
543
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
544
+ - `Auto`: Process content automatically
545
+ - `LLM`: Use large language models for processing
546
+ - `Ignore`: Exclude segments from final output
547
+ - `description` enables LLM-generated descriptions for segments. **Note:** This
548
+ uses chunkr's own VLM models and is not configurable via LLM processing
549
+ configuration.
550
+ - `extended_context` uses the full page image as context for LLM generation.
551
+
552
+ **Deprecated fields (for backwards compatibility):**
553
+
554
+ - `llm` - **DEPRECATED**: Use `description` instead
555
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
556
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
557
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
558
+ """
559
+
560
+ list_item: Optional[ConfigurationSegmentProcessingListItem] = FieldInfo(alias="ListItem", default=None)
561
+ """Controls the processing and generation for the segment.
562
+
563
+ - `crop_image` controls whether to crop the file's images to the segment's
564
+ bounding box. The cropped image will be stored in the segment's `image` field.
565
+ Use `All` to always crop, or `Auto` to only crop when needed for
566
+ post-processing.
567
+ - `format` specifies the output format: `Html` or `Markdown`
568
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
569
+ - `Auto`: Process content automatically
570
+ - `LLM`: Use large language models for processing
571
+ - `Ignore`: Exclude segments from final output
572
+ - `description` enables LLM-generated descriptions for segments. **Note:** This
573
+ uses chunkr's own VLM models and is not configurable via LLM processing
574
+ configuration.
575
+ - `extended_context` uses the full page image as context for LLM generation.
576
+
577
+ **Deprecated fields (for backwards compatibility):**
578
+
579
+ - `llm` - **DEPRECATED**: Use `description` instead
580
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
581
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
582
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
583
+ """
584
+
585
+ page: Optional[ConfigurationSegmentProcessingPage] = FieldInfo(alias="Page", default=None)
586
+ """Controls the processing and generation for the segment.
587
+
588
+ - `crop_image` controls whether to crop the file's images to the segment's
589
+ bounding box. The cropped image will be stored in the segment's `image` field.
590
+ Use `All` to always crop, or `Auto` to only crop when needed for
591
+ post-processing.
592
+ - `format` specifies the output format: `Html` or `Markdown`
593
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
594
+ - `Auto`: Process content automatically
595
+ - `LLM`: Use large language models for processing
596
+ - `Ignore`: Exclude segments from final output
597
+ - `description` enables LLM-generated descriptions for segments. **Note:** This
598
+ uses chunkr's own VLM models and is not configurable via LLM processing
599
+ configuration.
600
+ - `extended_context` uses the full page image as context for LLM generation.
601
+
602
+ **Deprecated fields (for backwards compatibility):**
603
+
604
+ - `llm` - **DEPRECATED**: Use `description` instead
605
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
606
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
607
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
608
+ """
609
+
610
+ page_footer: Optional[ConfigurationSegmentProcessingPageFooter] = FieldInfo(alias="PageFooter", default=None)
611
+ """Controls the processing and generation for the segment.
612
+
613
+ - `crop_image` controls whether to crop the file's images to the segment's
614
+ bounding box. The cropped image will be stored in the segment's `image` field.
615
+ Use `All` to always crop, or `Auto` to only crop when needed for
616
+ post-processing.
617
+ - `format` specifies the output format: `Html` or `Markdown`
618
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
619
+ - `Auto`: Process content automatically
620
+ - `LLM`: Use large language models for processing
621
+ - `Ignore`: Exclude segments from final output
622
+ - `description` enables LLM-generated descriptions for segments. **Note:** This
623
+ uses chunkr's own VLM models and is not configurable via LLM processing
624
+ configuration.
625
+ - `extended_context` uses the full page image as context for LLM generation.
626
+
627
+ **Deprecated fields (for backwards compatibility):**
628
+
629
+ - `llm` - **DEPRECATED**: Use `description` instead
630
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
631
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
632
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
633
+ """
634
+
635
+ page_header: Optional[ConfigurationSegmentProcessingPageHeader] = FieldInfo(alias="PageHeader", default=None)
636
+ """Controls the processing and generation for the segment.
637
+
638
+ - `crop_image` controls whether to crop the file's images to the segment's
639
+ bounding box. The cropped image will be stored in the segment's `image` field.
640
+ Use `All` to always crop, or `Auto` to only crop when needed for
641
+ post-processing.
642
+ - `format` specifies the output format: `Html` or `Markdown`
643
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
644
+ - `Auto`: Process content automatically
645
+ - `LLM`: Use large language models for processing
646
+ - `Ignore`: Exclude segments from final output
647
+ - `description` enables LLM-generated descriptions for segments. **Note:** This
648
+ uses chunkr's own VLM models and is not configurable via LLM processing
649
+ configuration.
650
+ - `extended_context` uses the full page image as context for LLM generation.
651
+
652
+ **Deprecated fields (for backwards compatibility):**
653
+
654
+ - `llm` - **DEPRECATED**: Use `description` instead
655
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
656
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
657
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
658
+ """
659
+
660
+ picture: Optional[ConfigurationSegmentProcessingPicture] = FieldInfo(alias="Picture", default=None)
661
+ """Controls the processing and generation for the segment.
662
+
663
+ - `crop_image` controls whether to crop the file's images to the segment's
664
+ bounding box. The cropped image will be stored in the segment's `image` field.
665
+ Use `All` to always crop, or `Auto` to only crop when needed for
666
+ post-processing.
667
+ - `format` specifies the output format: `Html` or `Markdown`
668
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
669
+ - `Auto`: Process content automatically
670
+ - `LLM`: Use large language models for processing
671
+ - `Ignore`: Exclude segments from final output
672
+ - `description` enables LLM-generated descriptions for segments. **Note:** This
673
+ uses chunkr's own VLM models and is not configurable via LLM processing
674
+ configuration.
675
+ - `extended_context` uses the full page image as context for LLM generation.
676
+
677
+ **Deprecated fields (for backwards compatibility):**
678
+
679
+ - `llm` - **DEPRECATED**: Use `description` instead
680
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
681
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
682
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
683
+ """
684
+
685
+ section_header: Optional[ConfigurationSegmentProcessingSectionHeader] = FieldInfo(
686
+ alias="SectionHeader", default=None
687
+ )
688
+ """Controls the processing and generation for the segment.
689
+
690
+ - `crop_image` controls whether to crop the file's images to the segment's
691
+ bounding box. The cropped image will be stored in the segment's `image` field.
692
+ Use `All` to always crop, or `Auto` to only crop when needed for
693
+ post-processing.
694
+ - `format` specifies the output format: `Html` or `Markdown`
695
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
696
+ - `Auto`: Process content automatically
697
+ - `LLM`: Use large language models for processing
698
+ - `Ignore`: Exclude segments from final output
699
+ - `description` enables LLM-generated descriptions for segments. **Note:** This
700
+ uses chunkr's own VLM models and is not configurable via LLM processing
701
+ configuration.
702
+ - `extended_context` uses the full page image as context for LLM generation.
703
+
704
+ **Deprecated fields (for backwards compatibility):**
705
+
706
+ - `llm` - **DEPRECATED**: Use `description` instead
707
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
708
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
709
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
710
+ """
711
+
712
+ table: Optional[ConfigurationSegmentProcessingTable] = FieldInfo(alias="Table", default=None)
713
+ """Controls the processing and generation for the segment.
714
+
715
+ - `crop_image` controls whether to crop the file's images to the segment's
716
+ bounding box. The cropped image will be stored in the segment's `image` field.
717
+ Use `All` to always crop, or `Auto` to only crop when needed for
718
+ post-processing.
719
+ - `format` specifies the output format: `Html` or `Markdown`
720
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
721
+ - `Auto`: Process content automatically
722
+ - `LLM`: Use large language models for processing
723
+ - `Ignore`: Exclude segments from final output
724
+ - `description` enables LLM-generated descriptions for segments. **Note:** This
725
+ uses chunkr's own VLM models and is not configurable via LLM processing
726
+ configuration.
727
+ - `extended_context` uses the full page image as context for LLM generation.
728
+
729
+ **Deprecated fields (for backwards compatibility):**
730
+
731
+ - `llm` - **DEPRECATED**: Use `description` instead
732
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
733
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
734
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
735
+ """
736
+
737
+ text: Optional[ConfigurationSegmentProcessingText] = FieldInfo(alias="Text", default=None)
738
+ """Controls the processing and generation for the segment.
739
+
740
+ - `crop_image` controls whether to crop the file's images to the segment's
741
+ bounding box. The cropped image will be stored in the segment's `image` field.
742
+ Use `All` to always crop, or `Auto` to only crop when needed for
743
+ post-processing.
744
+ - `format` specifies the output format: `Html` or `Markdown`
745
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
746
+ - `Auto`: Process content automatically
747
+ - `LLM`: Use large language models for processing
748
+ - `Ignore`: Exclude segments from final output
749
+ - `description` enables LLM-generated descriptions for segments. **Note:** This
750
+ uses chunkr's own VLM models and is not configurable via LLM processing
751
+ configuration.
752
+ - `extended_context` uses the full page image as context for LLM generation.
753
+
754
+ **Deprecated fields (for backwards compatibility):**
755
+
756
+ - `llm` - **DEPRECATED**: Use `description` instead
757
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
758
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
759
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
760
+ """
761
+
762
+ title: Optional[ConfigurationSegmentProcessingTitle] = FieldInfo(alias="Title", default=None)
763
+ """Controls the processing and generation for the segment.
764
+
765
+ - `crop_image` controls whether to crop the file's images to the segment's
766
+ bounding box. The cropped image will be stored in the segment's `image` field.
767
+ Use `All` to always crop, or `Auto` to only crop when needed for
768
+ post-processing.
769
+ - `format` specifies the output format: `Html` or `Markdown`
770
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
771
+ - `Auto`: Process content automatically
772
+ - `LLM`: Use large language models for processing
773
+ - `Ignore`: Exclude segments from final output
774
+ - `description` enables LLM-generated descriptions for segments. **Note:** This
775
+ uses chunkr's own VLM models and is not configurable via LLM processing
776
+ configuration.
777
+ - `extended_context` uses the full page image as context for LLM generation.
778
+
779
+ **Deprecated fields (for backwards compatibility):**
780
+
781
+ - `llm` - **DEPRECATED**: Use `description` instead
782
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
783
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
784
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
785
+ """
786
+
787
+
788
+ class ConfigurationClientVersionManualSDK(BaseModel):
789
+ manual_sdk: str = FieldInfo(alias="ManualSdk")
790
+ """Current manually-maintained SDK"""
791
+
792
+
793
+ class ConfigurationClientVersionGeneratedSDK(BaseModel):
794
+ generated_sdk: str = FieldInfo(alias="GeneratedSdk")
795
+ """Future auto-generated SDK"""
796
+
797
+
798
+ ConfigurationClientVersion: TypeAlias = Union[
799
+ Literal["Legacy"], ConfigurationClientVersionManualSDK, ConfigurationClientVersionGeneratedSDK, None
800
+ ]
801
+
802
+
27
803
  class Configuration(BaseModel):
28
- chunk_processing: ChunkProcessing
804
+ chunk_processing: ConfigurationChunkProcessing
29
805
  """Controls the setting for the chunking and post-processing of each chunk."""
30
806
 
31
807
  error_handling: Literal["Fail", "Continue"]
@@ -36,7 +812,7 @@ class Configuration(BaseModel):
36
812
  LLM refusals etc.)
37
813
  """
38
814
 
39
- llm_processing: LlmProcessing
815
+ llm_processing: ConfigurationLlmProcessing
40
816
  """Controls the LLM used for the task."""
41
817
 
42
818
  ocr_strategy: Literal["All", "Auto"]
@@ -48,7 +824,7 @@ class Configuration(BaseModel):
48
824
  used.
49
825
  """
50
826
 
51
- segment_processing: SegmentProcessing
827
+ segment_processing: ConfigurationSegmentProcessing
52
828
  """Defines how each segment type is handled when generating the final output.
53
829
 
54
830
  Each segment uses one of three strategies. The chosen strategy controls: •
@@ -56,18 +832,14 @@ class Configuration(BaseModel):
56
832
  content is produced (rule-based vs. LLM). • The output format (`Html` or
57
833
  `Markdown`).
58
834
 
59
- Optional flags such as image **cropping**, **extended context**, and **LLM
60
- descriptions** further refine behaviour.
61
-
62
- ---
835
+ Optional flags such as image **cropping**, **extended context**, and
836
+ **descriptions** further refine behaviour.
63
837
 
64
838
  **Default strategy per segment** • `Title`, `SectionHeader`, `Text`, `ListItem`,
65
- `Caption`, `Footnote` → **Auto** (Markdown) • `Table` → **LLM** (HTML,
66
- description on) • `Picture` → **LLM** (Markdown, description on, cropping _All_)
67
- • `Formula`, `Page` → **LLM** (Markdown) • `PageHeader`, `PageFooter` →
68
- **Ignore** (removed from output)
69
-
70
- ---
839
+ `Caption`, `Footnote` → **Auto** (Markdown, description off) • `Table` → **LLM**
840
+ (HTML, description on) • `Picture` → **LLM** (Markdown, description off,
841
+ cropping _All_) • `Formula`, `Page` → **LLM** (Markdown, description off) •
842
+ `PageHeader`, `PageFooter` → **Ignore** (removed from output)
71
843
 
72
844
  **Strategy reference** • **Auto** – rule-based content generation. • **LLM** –
73
845
  generate content with an LLM. • **Ignore** – exclude the segment entirely.
@@ -83,6 +855,9 @@ class Configuration(BaseModel):
83
855
  layout element detection and only simple chunking.
84
856
  """
85
857
 
858
+ client_version: Optional[ConfigurationClientVersion] = None
859
+ """Client version for backwards compatibility processing"""
860
+
86
861
  expires_in: Optional[int] = None
87
862
  """
88
863
  The number of seconds until task is deleted. Expired tasks can **not** be
@@ -104,8 +879,36 @@ class Configuration(BaseModel):
104
879
  """
105
880
 
106
881
 
882
+ class OutputChunkSegmentBbox(BaseModel):
883
+ height: float
884
+ """The height of the bounding box."""
885
+
886
+ left: float
887
+ """The left coordinate of the bounding box."""
888
+
889
+ top: float
890
+ """The top coordinate of the bounding box."""
891
+
892
+ width: float
893
+ """The width of the bounding box."""
894
+
895
+
896
+ class OutputChunkSegmentOcrBbox(BaseModel):
897
+ height: float
898
+ """The height of the bounding box."""
899
+
900
+ left: float
901
+ """The left coordinate of the bounding box."""
902
+
903
+ top: float
904
+ """The top coordinate of the bounding box."""
905
+
906
+ width: float
907
+ """The width of the bounding box."""
908
+
909
+
107
910
  class OutputChunkSegmentOcr(BaseModel):
108
- bbox: BoundingBox
911
+ bbox: OutputChunkSegmentOcrBbox
109
912
  """Bounding box for an item. It is used for chunks, segments and OCR results."""
110
913
 
111
914
  text: str
@@ -166,8 +969,36 @@ class OutputChunkSegmentSSCell(BaseModel):
166
969
  """
167
970
 
168
971
 
972
+ class OutputChunkSegmentSSHeaderBbox(BaseModel):
973
+ height: float
974
+ """The height of the bounding box."""
975
+
976
+ left: float
977
+ """The left coordinate of the bounding box."""
978
+
979
+ top: float
980
+ """The top coordinate of the bounding box."""
981
+
982
+ width: float
983
+ """The width of the bounding box."""
984
+
985
+
986
+ class OutputChunkSegmentSSHeaderOcrBbox(BaseModel):
987
+ height: float
988
+ """The height of the bounding box."""
989
+
990
+ left: float
991
+ """The left coordinate of the bounding box."""
992
+
993
+ top: float
994
+ """The top coordinate of the bounding box."""
995
+
996
+ width: float
997
+ """The width of the bounding box."""
998
+
999
+
169
1000
  class OutputChunkSegmentSSHeaderOcr(BaseModel):
170
- bbox: BoundingBox
1001
+ bbox: OutputChunkSegmentSSHeaderOcrBbox
171
1002
  """Bounding box for an item. It is used for chunks, segments and OCR results."""
172
1003
 
173
1004
  text: str
@@ -178,7 +1009,7 @@ class OutputChunkSegmentSSHeaderOcr(BaseModel):
178
1009
 
179
1010
 
180
1011
  class OutputChunkSegment(BaseModel):
181
- bbox: BoundingBox
1012
+ bbox: OutputChunkSegmentBbox
182
1013
  """Bounding box for an item. It is used for chunks, segments and OCR results."""
183
1014
 
184
1015
  page_height: float
@@ -248,7 +1079,7 @@ class OutputChunkSegment(BaseModel):
248
1079
  ss_cells: Optional[List[OutputChunkSegmentSSCell]] = None
249
1080
  """Cells of the segment. Only used for Spreadsheets."""
250
1081
 
251
- ss_header_bbox: Optional[BoundingBox] = None
1082
+ ss_header_bbox: Optional[OutputChunkSegmentSSHeaderBbox] = None
252
1083
  """Bounding box of the header of the segment, if found.
253
1084
 
254
1085
  Only used for Spreadsheets.
@@ -286,7 +1117,10 @@ class OutputChunkSegment(BaseModel):
286
1117
 
287
1118
  class OutputChunk(BaseModel):
288
1119
  chunk_length: int
289
- """The total number of tokens in the chunk. Calculated by the `tokenizer`."""
1120
+ """The total number of tokens in the `embed` field of the chunk.
1121
+
1122
+ Calculated by the `tokenizer`.
1123
+ """
290
1124
 
291
1125
  segments: List[OutputChunkSegment]
292
1126
  """
@@ -299,12 +1133,17 @@ class OutputChunk(BaseModel):
299
1133
  chunk_id: Optional[str] = None
300
1134
  """The unique identifier for the chunk."""
301
1135
 
1136
+ content: Optional[str] = None
1137
+ """The content of the chunk.
1138
+
1139
+ This is the text that is generated by combining the `content` field from each
1140
+ segment. Can be provided as context to the LLM.
1141
+ """
1142
+
302
1143
  embed: Optional[str] = None
303
1144
  """Suggested text to be embedded for the chunk.
304
1145
 
305
- This text is generated by combining the embed content from each segment
306
- according to the configured embed sources (HTML, Markdown, LLM, or Content). Can
307
- be configured using `embed_sources` in the `SegmentProcessing` configuration.
1146
+ This text is generated by combining the `embed` field from each segment.
308
1147
  """
309
1148
 
310
1149