chunkr-ai 0.1.0__py3-none-any.whl → 0.1.0a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57):
  1. chunkr_ai/__init__.py +89 -2
  2. chunkr_ai/_base_client.py +1995 -0
  3. chunkr_ai/_client.py +403 -0
  4. chunkr_ai/_compat.py +219 -0
  5. chunkr_ai/_constants.py +14 -0
  6. chunkr_ai/_exceptions.py +108 -0
  7. chunkr_ai/_files.py +123 -0
  8. chunkr_ai/_models.py +829 -0
  9. chunkr_ai/_qs.py +150 -0
  10. chunkr_ai/_resource.py +43 -0
  11. chunkr_ai/_response.py +830 -0
  12. chunkr_ai/_streaming.py +333 -0
  13. chunkr_ai/_types.py +219 -0
  14. chunkr_ai/_utils/__init__.py +57 -0
  15. chunkr_ai/_utils/_logs.py +25 -0
  16. chunkr_ai/_utils/_proxy.py +65 -0
  17. chunkr_ai/_utils/_reflection.py +42 -0
  18. chunkr_ai/_utils/_resources_proxy.py +24 -0
  19. chunkr_ai/_utils/_streams.py +12 -0
  20. chunkr_ai/_utils/_sync.py +86 -0
  21. chunkr_ai/_utils/_transform.py +447 -0
  22. chunkr_ai/_utils/_typing.py +151 -0
  23. chunkr_ai/_utils/_utils.py +422 -0
  24. chunkr_ai/_version.py +4 -0
  25. chunkr_ai/lib/.keep +4 -0
  26. chunkr_ai/pagination.py +71 -0
  27. chunkr_ai/resources/__init__.py +33 -0
  28. chunkr_ai/resources/health.py +136 -0
  29. chunkr_ai/resources/task/__init__.py +33 -0
  30. chunkr_ai/resources/task/parse.py +616 -0
  31. chunkr_ai/resources/task/task.py +664 -0
  32. chunkr_ai/types/__init__.py +8 -0
  33. chunkr_ai/types/health_check_response.py +7 -0
  34. chunkr_ai/types/task/__init__.py +7 -0
  35. chunkr_ai/types/task/parse_create_params.py +806 -0
  36. chunkr_ai/types/task/parse_update_params.py +806 -0
  37. chunkr_ai/types/task/task.py +1186 -0
  38. chunkr_ai/types/task_get_params.py +18 -0
  39. chunkr_ai/types/task_list_params.py +37 -0
  40. chunkr_ai-0.1.0a2.dist-info/METADATA +504 -0
  41. chunkr_ai-0.1.0a2.dist-info/RECORD +44 -0
  42. {chunkr_ai-0.1.0.dist-info → chunkr_ai-0.1.0a2.dist-info}/WHEEL +1 -2
  43. chunkr_ai-0.1.0a2.dist-info/licenses/LICENSE +201 -0
  44. chunkr_ai/api/auth.py +0 -13
  45. chunkr_ai/api/chunkr.py +0 -103
  46. chunkr_ai/api/chunkr_base.py +0 -185
  47. chunkr_ai/api/configuration.py +0 -313
  48. chunkr_ai/api/decorators.py +0 -101
  49. chunkr_ai/api/misc.py +0 -139
  50. chunkr_ai/api/protocol.py +0 -14
  51. chunkr_ai/api/task_response.py +0 -208
  52. chunkr_ai/models.py +0 -55
  53. chunkr_ai-0.1.0.dist-info/METADATA +0 -268
  54. chunkr_ai-0.1.0.dist-info/RECORD +0 -16
  55. chunkr_ai-0.1.0.dist-info/licenses/LICENSE +0 -21
  56. chunkr_ai-0.1.0.dist-info/top_level.txt +0 -1
  57. /chunkr_ai/{api/__init__.py → py.typed} +0 -0
@@ -0,0 +1,1186 @@
1
+ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
+
3
+ from typing import List, Union, Optional
4
+ from datetime import datetime
5
+ from typing_extensions import Literal, TypeAlias
6
+
7
+ from pydantic import Field as FieldInfo
8
+
9
+ from ..._models import BaseModel
10
+
11
# Public names exported by this module. The order is kept exactly as generated.
__all__ = [
    "Task",
    "Configuration",
    "ConfigurationChunkProcessing",
    "ConfigurationChunkProcessingTokenizer",
    "ConfigurationChunkProcessingTokenizerEnum",
    "ConfigurationChunkProcessingTokenizerString",
    "ConfigurationLlmProcessing",
    "ConfigurationLlmProcessingFallbackStrategy",
    "ConfigurationLlmProcessingFallbackStrategyModel",
    "ConfigurationSegmentProcessing",
    "ConfigurationSegmentProcessingCaption",
    "ConfigurationSegmentProcessingFootnote",
    "ConfigurationSegmentProcessingFormula",
    "ConfigurationSegmentProcessingListItem",
    "ConfigurationSegmentProcessingPage",
    "ConfigurationSegmentProcessingPageFooter",
    "ConfigurationSegmentProcessingPageHeader",
    "ConfigurationSegmentProcessingPicture",
    "ConfigurationSegmentProcessingSectionHeader",
    "ConfigurationSegmentProcessingTable",
    "ConfigurationSegmentProcessingText",
    "ConfigurationSegmentProcessingTitle",
    "ConfigurationClientVersion",
    "ConfigurationClientVersionManualSDK",
    "ConfigurationClientVersionGeneratedSDK",
    "Output",
    "OutputChunk",
    "OutputChunkSegment",
    "OutputChunkSegmentBbox",
    "OutputChunkSegmentOcr",
    "OutputChunkSegmentOcrBbox",
    "OutputChunkSegmentSSCell",
    "OutputChunkSegmentSSCellStyle",
    "OutputChunkSegmentSSHeaderBbox",
    "OutputChunkSegmentSSHeaderOcr",
    "OutputChunkSegmentSSHeaderOcrBbox",
    "OutputPage",
]
50
+
51
+
52
# Tokenizer variant that selects one of the predefined tokenizer types.
# The value is carried under the wire alias "Enum".
class ConfigurationChunkProcessingTokenizerEnum(BaseModel):
    enum: Literal["Word", "Cl100kBase", "XlmRobertaBase", "BertBaseUncased"] = FieldInfo(alias="Enum")
    """One of the predefined tokenizer types."""
55
+
56
+
57
# Tokenizer variant that names a tokenizer by string.
# The value is carried under the wire alias "String".
class ConfigurationChunkProcessingTokenizerString(BaseModel):
    string: str = FieldInfo(alias="String")
    """
    Use any Hugging Face tokenizer by specifying its model ID.

    Examples: "Qwen/Qwen-tokenizer", "facebook/bart-large"
    """
63
+
64
+
65
# A tokenizer setting is either the predefined-enum wrapper or the
# string (model ID) wrapper.
ConfigurationChunkProcessingTokenizer: TypeAlias = Union[
    ConfigurationChunkProcessingTokenizerEnum,
    ConfigurationChunkProcessingTokenizerString,
]
68
+
69
+
70
# Chunking options. Every field is optional and defaults to None.
class ConfigurationChunkProcessing(BaseModel):
    ignore_headers_and_footers: Optional[bool] = None
    """DEPRECATED: use `segment_processing.ignore` instead."""

    target_length: Optional[int] = None
    """Target number of words in each chunk.

    A value of 0 makes each chunk contain a single segment.
    """

    tokenizer: Optional[ConfigurationChunkProcessingTokenizer] = None
    """Tokenizer used for the chunking process."""
82
+
83
+
84
# Fallback-strategy variant that names a specific model.
# The value is carried under the wire alias "Model".
class ConfigurationLlmProcessingFallbackStrategyModel(BaseModel):
    model: str = FieldInfo(alias="Model")
    """The specific model to use as fallback."""
87
+
88
+
89
# A fallback strategy is either one of the literal strings "None" / "Default"
# or a wrapper object naming a specific fallback model.
ConfigurationLlmProcessingFallbackStrategy: TypeAlias = Union[
    Literal["None", "Default"],
    ConfigurationLlmProcessingFallbackStrategyModel,
]
92
+
93
+
94
# LLM options for a task. Every field is optional and defaults to None.
class ConfigurationLlmProcessing(BaseModel):
    fallback_strategy: Optional[ConfigurationLlmProcessingFallbackStrategy] = None
    """Fallback strategy to use for the LLMs in the task."""

    llm_model_id: Optional[str] = None
    """ID of the model to use for the task.

    When not provided, the default model is used. Please check the documentation
    for the model you want to use.
    """

    max_completion_tokens: Optional[int] = None
    """Maximum number of tokens to generate."""

    temperature: Optional[float] = None
    """Temperature to use for the LLM."""
110
+
111
+
112
# Processing/generation options referenced by `ConfigurationSegmentProcessing`
# under the alias "Caption". Every field is optional and defaults to None.
class ConfigurationSegmentProcessingCaption(BaseModel):
    crop_image: Optional[Literal["All", "Auto"]] = None
    """Cropping strategy for an item (e.g. segment, chunk, etc.).

    - `All`: crop all images in the item
    - `Auto`: crop images only if required for post-processing
    """

    description: Optional[bool] = None
    """Whether to generate LLM descriptions for this segment."""

    embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
    """**DEPRECATED**: the `embed` field is auto populated."""

    extended_context: Optional[bool] = None
    """Whether to use the full page image as context for LLM generation."""

    # Output format: `Html` or `Markdown`.
    format: Optional[Literal["Html", "Markdown"]] = None

    html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
    """**DEPRECATED**: use `format: html` and `strategy` instead."""

    llm: Optional[str] = None
    """**DEPRECATED**: use `description` instead."""

    markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
    """**DEPRECATED**: use `format: markdown` and `strategy` instead."""

    # How the content is generated: `Auto` (process automatically), `LLM`
    # (use large language models) or `Ignore` (exclude from final output).
    strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
141
+
142
+
143
# Processing/generation options referenced by `ConfigurationSegmentProcessing`
# under the alias "Footnote". Every field is optional and defaults to None.
class ConfigurationSegmentProcessingFootnote(BaseModel):
    crop_image: Optional[Literal["All", "Auto"]] = None
    """Cropping strategy for an item (e.g. segment, chunk, etc.).

    - `All`: crop all images in the item
    - `Auto`: crop images only if required for post-processing
    """

    description: Optional[bool] = None
    """Whether to generate LLM descriptions for this segment."""

    embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
    """**DEPRECATED**: the `embed` field is auto populated."""

    extended_context: Optional[bool] = None
    """Whether to use the full page image as context for LLM generation."""

    # Output format: `Html` or `Markdown`.
    format: Optional[Literal["Html", "Markdown"]] = None

    html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
    """**DEPRECATED**: use `format: html` and `strategy` instead."""

    llm: Optional[str] = None
    """**DEPRECATED**: use `description` instead."""

    markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
    """**DEPRECATED**: use `format: markdown` and `strategy` instead."""

    # How the content is generated: `Auto` (process automatically), `LLM`
    # (use large language models) or `Ignore` (exclude from final output).
    strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
172
+
173
+
174
# Processing/generation options referenced by `ConfigurationSegmentProcessing`
# under the alias "Formula". Every field is optional and defaults to None.
class ConfigurationSegmentProcessingFormula(BaseModel):
    crop_image: Optional[Literal["All", "Auto"]] = None
    """Cropping strategy for an item (e.g. segment, chunk, etc.).

    - `All`: crop all images in the item
    - `Auto`: crop images only if required for post-processing
    """

    description: Optional[bool] = None
    """Whether to generate LLM descriptions for this segment."""

    embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
    """**DEPRECATED**: the `embed` field is auto populated."""

    extended_context: Optional[bool] = None
    """Whether to use the full page image as context for LLM generation."""

    # Output format: `Html` or `Markdown`.
    format: Optional[Literal["Html", "Markdown"]] = None

    html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
    """**DEPRECATED**: use `format: html` and `strategy` instead."""

    llm: Optional[str] = None
    """**DEPRECATED**: use `description` instead."""

    markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
    """**DEPRECATED**: use `format: markdown` and `strategy` instead."""

    # How the content is generated: `Auto` (process automatically), `LLM`
    # (use large language models) or `Ignore` (exclude from final output).
    strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
203
+
204
+
205
# Processing/generation options referenced by `ConfigurationSegmentProcessing`
# under the alias "ListItem". Every field is optional and defaults to None.
class ConfigurationSegmentProcessingListItem(BaseModel):
    crop_image: Optional[Literal["All", "Auto"]] = None
    """Cropping strategy for an item (e.g. segment, chunk, etc.).

    - `All`: crop all images in the item
    - `Auto`: crop images only if required for post-processing
    """

    description: Optional[bool] = None
    """Whether to generate LLM descriptions for this segment."""

    embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
    """**DEPRECATED**: the `embed` field is auto populated."""

    extended_context: Optional[bool] = None
    """Whether to use the full page image as context for LLM generation."""

    # Output format: `Html` or `Markdown`.
    format: Optional[Literal["Html", "Markdown"]] = None

    html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
    """**DEPRECATED**: use `format: html` and `strategy` instead."""

    llm: Optional[str] = None
    """**DEPRECATED**: use `description` instead."""

    markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
    """**DEPRECATED**: use `format: markdown` and `strategy` instead."""

    # How the content is generated: `Auto` (process automatically), `LLM`
    # (use large language models) or `Ignore` (exclude from final output).
    strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
234
+
235
+
236
# Processing/generation options referenced by `ConfigurationSegmentProcessing`
# under the alias "Page". Every field is optional and defaults to None.
class ConfigurationSegmentProcessingPage(BaseModel):
    crop_image: Optional[Literal["All", "Auto"]] = None
    """Cropping strategy for an item (e.g. segment, chunk, etc.).

    - `All`: crop all images in the item
    - `Auto`: crop images only if required for post-processing
    """

    description: Optional[bool] = None
    """Whether to generate LLM descriptions for this segment."""

    embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
    """**DEPRECATED**: the `embed` field is auto populated."""

    extended_context: Optional[bool] = None
    """Whether to use the full page image as context for LLM generation."""

    # Output format: `Html` or `Markdown`.
    format: Optional[Literal["Html", "Markdown"]] = None

    html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
    """**DEPRECATED**: use `format: html` and `strategy` instead."""

    llm: Optional[str] = None
    """**DEPRECATED**: use `description` instead."""

    markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
    """**DEPRECATED**: use `format: markdown` and `strategy` instead."""

    # How the content is generated: `Auto` (process automatically), `LLM`
    # (use large language models) or `Ignore` (exclude from final output).
    strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
265
+
266
+
267
# Processing/generation options referenced by `ConfigurationSegmentProcessing`
# under the alias "PageFooter". Every field is optional and defaults to None.
class ConfigurationSegmentProcessingPageFooter(BaseModel):
    crop_image: Optional[Literal["All", "Auto"]] = None
    """Cropping strategy for an item (e.g. segment, chunk, etc.).

    - `All`: crop all images in the item
    - `Auto`: crop images only if required for post-processing
    """

    description: Optional[bool] = None
    """Whether to generate LLM descriptions for this segment."""

    embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
    """**DEPRECATED**: the `embed` field is auto populated."""

    extended_context: Optional[bool] = None
    """Whether to use the full page image as context for LLM generation."""

    # Output format: `Html` or `Markdown`.
    format: Optional[Literal["Html", "Markdown"]] = None

    html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
    """**DEPRECATED**: use `format: html` and `strategy` instead."""

    llm: Optional[str] = None
    """**DEPRECATED**: use `description` instead."""

    markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
    """**DEPRECATED**: use `format: markdown` and `strategy` instead."""

    # How the content is generated: `Auto` (process automatically), `LLM`
    # (use large language models) or `Ignore` (exclude from final output).
    strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
296
+
297
+
298
# Processing/generation options referenced by `ConfigurationSegmentProcessing`
# under the alias "PageHeader". Every field is optional and defaults to None.
class ConfigurationSegmentProcessingPageHeader(BaseModel):
    crop_image: Optional[Literal["All", "Auto"]] = None
    """Cropping strategy for an item (e.g. segment, chunk, etc.).

    - `All`: crop all images in the item
    - `Auto`: crop images only if required for post-processing
    """

    description: Optional[bool] = None
    """Whether to generate LLM descriptions for this segment."""

    embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
    """**DEPRECATED**: the `embed` field is auto populated."""

    extended_context: Optional[bool] = None
    """Whether to use the full page image as context for LLM generation."""

    # Output format: `Html` or `Markdown`.
    format: Optional[Literal["Html", "Markdown"]] = None

    html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
    """**DEPRECATED**: use `format: html` and `strategy` instead."""

    llm: Optional[str] = None
    """**DEPRECATED**: use `description` instead."""

    markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
    """**DEPRECATED**: use `format: markdown` and `strategy` instead."""

    # How the content is generated: `Auto` (process automatically), `LLM`
    # (use large language models) or `Ignore` (exclude from final output).
    strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
327
+
328
+
329
# Processing/generation options referenced by `ConfigurationSegmentProcessing`
# under the alias "Picture". Every field is optional and defaults to None.
class ConfigurationSegmentProcessingPicture(BaseModel):
    crop_image: Optional[Literal["All", "Auto"]] = None
    """Cropping strategy for an item (e.g. segment, chunk, etc.).

    - `All`: crop all images in the item
    - `Auto`: crop images only if required for post-processing
    """

    description: Optional[bool] = None
    """Whether to generate LLM descriptions for this segment."""

    embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
    """**DEPRECATED**: the `embed` field is auto populated."""

    extended_context: Optional[bool] = None
    """Whether to use the full page image as context for LLM generation."""

    # Output format: `Html` or `Markdown`.
    format: Optional[Literal["Html", "Markdown"]] = None

    html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
    """**DEPRECATED**: use `format: html` and `strategy` instead."""

    llm: Optional[str] = None
    """**DEPRECATED**: use `description` instead."""

    markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
    """**DEPRECATED**: use `format: markdown` and `strategy` instead."""

    # How the content is generated: `Auto` (process automatically), `LLM`
    # (use large language models) or `Ignore` (exclude from final output).
    strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
358
+
359
+
360
# Processing/generation options referenced by `ConfigurationSegmentProcessing`
# under the alias "SectionHeader". Every field is optional and defaults to None.
class ConfigurationSegmentProcessingSectionHeader(BaseModel):
    crop_image: Optional[Literal["All", "Auto"]] = None
    """Cropping strategy for an item (e.g. segment, chunk, etc.).

    - `All`: crop all images in the item
    - `Auto`: crop images only if required for post-processing
    """

    description: Optional[bool] = None
    """Whether to generate LLM descriptions for this segment."""

    embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
    """**DEPRECATED**: the `embed` field is auto populated."""

    extended_context: Optional[bool] = None
    """Whether to use the full page image as context for LLM generation."""

    # Output format: `Html` or `Markdown`.
    format: Optional[Literal["Html", "Markdown"]] = None

    html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
    """**DEPRECATED**: use `format: html` and `strategy` instead."""

    llm: Optional[str] = None
    """**DEPRECATED**: use `description` instead."""

    markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
    """**DEPRECATED**: use `format: markdown` and `strategy` instead."""

    # How the content is generated: `Auto` (process automatically), `LLM`
    # (use large language models) or `Ignore` (exclude from final output).
    strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
389
+
390
+
391
# Processing/generation options referenced by `ConfigurationSegmentProcessing`
# under the alias "Table". Every field is optional and defaults to None.
class ConfigurationSegmentProcessingTable(BaseModel):
    crop_image: Optional[Literal["All", "Auto"]] = None
    """Cropping strategy for an item (e.g. segment, chunk, etc.).

    - `All`: crop all images in the item
    - `Auto`: crop images only if required for post-processing
    """

    description: Optional[bool] = None
    """Whether to generate LLM descriptions for this segment."""

    embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
    """**DEPRECATED**: the `embed` field is auto populated."""

    extended_context: Optional[bool] = None
    """Whether to use the full page image as context for LLM generation."""

    # Output format: `Html` or `Markdown`.
    format: Optional[Literal["Html", "Markdown"]] = None

    html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
    """**DEPRECATED**: use `format: html` and `strategy` instead."""

    llm: Optional[str] = None
    """**DEPRECATED**: use `description` instead."""

    markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
    """**DEPRECATED**: use `format: markdown` and `strategy` instead."""

    # How the content is generated: `Auto` (process automatically), `LLM`
    # (use large language models) or `Ignore` (exclude from final output).
    strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
420
+
421
+
422
# Processing/generation options referenced by `ConfigurationSegmentProcessing`
# under the alias "Text". Every field is optional and defaults to None.
class ConfigurationSegmentProcessingText(BaseModel):
    crop_image: Optional[Literal["All", "Auto"]] = None
    """Cropping strategy for an item (e.g. segment, chunk, etc.).

    - `All`: crop all images in the item
    - `Auto`: crop images only if required for post-processing
    """

    description: Optional[bool] = None
    """Whether to generate LLM descriptions for this segment."""

    embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
    """**DEPRECATED**: the `embed` field is auto populated."""

    extended_context: Optional[bool] = None
    """Whether to use the full page image as context for LLM generation."""

    # Output format: `Html` or `Markdown`.
    format: Optional[Literal["Html", "Markdown"]] = None

    html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
    """**DEPRECATED**: use `format: html` and `strategy` instead."""

    llm: Optional[str] = None
    """**DEPRECATED**: use `description` instead."""

    markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
    """**DEPRECATED**: use `format: markdown` and `strategy` instead."""

    # How the content is generated: `Auto` (process automatically), `LLM`
    # (use large language models) or `Ignore` (exclude from final output).
    strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
451
+
452
+
453
# Segment processing/generation options; presumably attached to the title
# entry of `ConfigurationSegmentProcessing` (that field is outside this view —
# TODO confirm). Every field is optional and defaults to None.
class ConfigurationSegmentProcessingTitle(BaseModel):
    crop_image: Optional[Literal["All", "Auto"]] = None
    """Cropping strategy for an item (e.g. segment, chunk, etc.).

    - `All`: crop all images in the item
    - `Auto`: crop images only if required for post-processing
    """

    description: Optional[bool] = None
    """Whether to generate LLM descriptions for this segment."""

    embed_sources: Optional[List[Literal["Content", "HTML", "Markdown", "LLM"]]] = None
    """**DEPRECATED**: the `embed` field is auto populated."""

    extended_context: Optional[bool] = None
    """Whether to use the full page image as context for LLM generation."""

    # Output format: `Html` or `Markdown`.
    format: Optional[Literal["Html", "Markdown"]] = None

    html: Optional[Literal["LLM", "Auto", "Ignore"]] = None
    """**DEPRECATED**: use `format: html` and `strategy` instead."""

    llm: Optional[str] = None
    """**DEPRECATED**: use `description` instead."""

    markdown: Optional[Literal["LLM", "Auto", "Ignore"]] = None
    """**DEPRECATED**: use `format: markdown` and `strategy` instead."""

    # How the content is generated: `Auto` (process automatically), `LLM`
    # (use large language models) or `Ignore` (exclude from final output).
    strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
482
+
483
+
484
+ class ConfigurationSegmentProcessing(BaseModel):
485
+ caption: Optional[ConfigurationSegmentProcessingCaption] = FieldInfo(alias="Caption", default=None)
486
+ """Controls the processing and generation for the segment.
487
+
488
+ - `crop_image` controls whether to crop the file's images to the segment's
489
+ bounding box. The cropped image will be stored in the segment's `image` field.
490
+ Use `All` to always crop, or `Auto` to only crop when needed for
491
+ post-processing.
492
+ - `format` specifies the output format: `Html` or `Markdown`
493
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
494
+ - `Auto`: Process content automatically
495
+ - `LLM`: Use large language models for processing
496
+ - `Ignore`: Exclude segments from final output
497
+ - `description` enables LLM-generated descriptions for segments
498
+
499
+ **Deprecated fields (for backwards compatibility):**
500
+
501
+ - `llm` - **DEPRECATED**: Use `description` instead
502
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
503
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
504
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
505
+ """
506
+
507
+ footnote: Optional[ConfigurationSegmentProcessingFootnote] = FieldInfo(alias="Footnote", default=None)
508
+ """Controls the processing and generation for the segment.
509
+
510
+ - `crop_image` controls whether to crop the file's images to the segment's
511
+ bounding box. The cropped image will be stored in the segment's `image` field.
512
+ Use `All` to always crop, or `Auto` to only crop when needed for
513
+ post-processing.
514
+ - `format` specifies the output format: `Html` or `Markdown`
515
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
516
+ - `Auto`: Process content automatically
517
+ - `LLM`: Use large language models for processing
518
+ - `Ignore`: Exclude segments from final output
519
+ - `description` enables LLM-generated descriptions for segments
520
+
521
+ **Deprecated fields (for backwards compatibility):**
522
+
523
+ - `llm` - **DEPRECATED**: Use `description` instead
524
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
525
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
526
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
527
+ """
528
+
529
+ formula: Optional[ConfigurationSegmentProcessingFormula] = FieldInfo(alias="Formula", default=None)
530
+ """Controls the processing and generation for the segment.
531
+
532
+ - `crop_image` controls whether to crop the file's images to the segment's
533
+ bounding box. The cropped image will be stored in the segment's `image` field.
534
+ Use `All` to always crop, or `Auto` to only crop when needed for
535
+ post-processing.
536
+ - `format` specifies the output format: `Html` or `Markdown`
537
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
538
+ - `Auto`: Process content automatically
539
+ - `LLM`: Use large language models for processing
540
+ - `Ignore`: Exclude segments from final output
541
+ - `description` enables LLM-generated descriptions for segments
542
+
543
+ **Deprecated fields (for backwards compatibility):**
544
+
545
+ - `llm` - **DEPRECATED**: Use `description` instead
546
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
547
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
548
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
549
+ """
550
+
551
+ list_item: Optional[ConfigurationSegmentProcessingListItem] = FieldInfo(alias="ListItem", default=None)
552
+ """Controls the processing and generation for the segment.
553
+
554
+ - `crop_image` controls whether to crop the file's images to the segment's
555
+ bounding box. The cropped image will be stored in the segment's `image` field.
556
+ Use `All` to always crop, or `Auto` to only crop when needed for
557
+ post-processing.
558
+ - `format` specifies the output format: `Html` or `Markdown`
559
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
560
+ - `Auto`: Process content automatically
561
+ - `LLM`: Use large language models for processing
562
+ - `Ignore`: Exclude segments from final output
563
+ - `description` enables LLM-generated descriptions for segments
564
+
565
+ **Deprecated fields (for backwards compatibility):**
566
+
567
+ - `llm` - **DEPRECATED**: Use `description` instead
568
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
569
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
570
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
571
+ """
572
+
573
+ page: Optional[ConfigurationSegmentProcessingPage] = FieldInfo(alias="Page", default=None)
574
+ """Controls the processing and generation for the segment.
575
+
576
+ - `crop_image` controls whether to crop the file's images to the segment's
577
+ bounding box. The cropped image will be stored in the segment's `image` field.
578
+ Use `All` to always crop, or `Auto` to only crop when needed for
579
+ post-processing.
580
+ - `format` specifies the output format: `Html` or `Markdown`
581
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
582
+ - `Auto`: Process content automatically
583
+ - `LLM`: Use large language models for processing
584
+ - `Ignore`: Exclude segments from final output
585
+ - `description` enables LLM-generated descriptions for segments
586
+
587
+ **Deprecated fields (for backwards compatibility):**
588
+
589
+ - `llm` - **DEPRECATED**: Use `description` instead
590
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
591
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
592
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
593
+ """
594
+
595
+ page_footer: Optional[ConfigurationSegmentProcessingPageFooter] = FieldInfo(alias="PageFooter", default=None)
596
+ """Controls the processing and generation for the segment.
597
+
598
+ - `crop_image` controls whether to crop the file's images to the segment's
599
+ bounding box. The cropped image will be stored in the segment's `image` field.
600
+ Use `All` to always crop, or `Auto` to only crop when needed for
601
+ post-processing.
602
+ - `format` specifies the output format: `Html` or `Markdown`
603
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
604
+ - `Auto`: Process content automatically
605
+ - `LLM`: Use large language models for processing
606
+ - `Ignore`: Exclude segments from final output
607
+ - `description` enables LLM-generated descriptions for segments
608
+
609
+ **Deprecated fields (for backwards compatibility):**
610
+
611
+ - `llm` - **DEPRECATED**: Use `description` instead
612
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
613
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
614
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
615
+ """
616
+
617
+ page_header: Optional[ConfigurationSegmentProcessingPageHeader] = FieldInfo(alias="PageHeader", default=None)
618
+ """Controls the processing and generation for the segment.
619
+
620
+ - `crop_image` controls whether to crop the file's images to the segment's
621
+ bounding box. The cropped image will be stored in the segment's `image` field.
622
+ Use `All` to always crop, or `Auto` to only crop when needed for
623
+ post-processing.
624
+ - `format` specifies the output format: `Html` or `Markdown`
625
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
626
+ - `Auto`: Process content automatically
627
+ - `LLM`: Use large language models for processing
628
+ - `Ignore`: Exclude segments from final output
629
+ - `description` enables LLM-generated descriptions for segments
630
+
631
+ **Deprecated fields (for backwards compatibility):**
632
+
633
+ - `llm` - **DEPRECATED**: Use `description` instead
634
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
635
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
636
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
637
+ """
638
+
639
+ picture: Optional[ConfigurationSegmentProcessingPicture] = FieldInfo(alias="Picture", default=None)
640
+ """Controls the processing and generation for the segment.
641
+
642
+ - `crop_image` controls whether to crop the file's images to the segment's
643
+ bounding box. The cropped image will be stored in the segment's `image` field.
644
+ Use `All` to always crop, or `Auto` to only crop when needed for
645
+ post-processing.
646
+ - `format` specifies the output format: `Html` or `Markdown`
647
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
648
+ - `Auto`: Process content automatically
649
+ - `LLM`: Use large language models for processing
650
+ - `Ignore`: Exclude segments from final output
651
+ - `description` enables LLM-generated descriptions for segments
652
+
653
+ **Deprecated fields (for backwards compatibility):**
654
+
655
+ - `llm` - **DEPRECATED**: Use `description` instead
656
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
657
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
658
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
659
+ """
660
+
661
+ section_header: Optional[ConfigurationSegmentProcessingSectionHeader] = FieldInfo(
662
+ alias="SectionHeader", default=None
663
+ )
664
+ """Controls the processing and generation for the segment.
665
+
666
+ - `crop_image` controls whether to crop the file's images to the segment's
667
+ bounding box. The cropped image will be stored in the segment's `image` field.
668
+ Use `All` to always crop, or `Auto` to only crop when needed for
669
+ post-processing.
670
+ - `format` specifies the output format: `Html` or `Markdown`
671
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
672
+ - `Auto`: Process content automatically
673
+ - `LLM`: Use large language models for processing
674
+ - `Ignore`: Exclude segments from final output
675
+ - `description` enables LLM-generated descriptions for segments
676
+
677
+ **Deprecated fields (for backwards compatibility):**
678
+
679
+ - `llm` - **DEPRECATED**: Use `description` instead
680
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
681
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
682
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
683
+ """
684
+
685
+ table: Optional[ConfigurationSegmentProcessingTable] = FieldInfo(alias="Table", default=None)
686
+ """Controls the processing and generation for the segment.
687
+
688
+ - `crop_image` controls whether to crop the file's images to the segment's
689
+ bounding box. The cropped image will be stored in the segment's `image` field.
690
+ Use `All` to always crop, or `Auto` to only crop when needed for
691
+ post-processing.
692
+ - `format` specifies the output format: `Html` or `Markdown`
693
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
694
+ - `Auto`: Process content automatically
695
+ - `LLM`: Use large language models for processing
696
+ - `Ignore`: Exclude segments from final output
697
+ - `description` enables LLM-generated descriptions for segments
698
+
699
+ **Deprecated fields (for backwards compatibility):**
700
+
701
+ - `llm` - **DEPRECATED**: Use `description` instead
702
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
703
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
704
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
705
+ """
706
+
707
+ text: Optional[ConfigurationSegmentProcessingText] = FieldInfo(alias="Text", default=None)
708
+ """Controls the processing and generation for the segment.
709
+
710
+ - `crop_image` controls whether to crop the file's images to the segment's
711
+ bounding box. The cropped image will be stored in the segment's `image` field.
712
+ Use `All` to always crop, or `Auto` to only crop when needed for
713
+ post-processing.
714
+ - `format` specifies the output format: `Html` or `Markdown`
715
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
716
+ - `Auto`: Process content automatically
717
+ - `LLM`: Use large language models for processing
718
+ - `Ignore`: Exclude segments from final output
719
+ - `description` enables LLM-generated descriptions for segments
720
+
721
+ **Deprecated fields (for backwards compatibility):**
722
+
723
+ - `llm` - **DEPRECATED**: Use `description` instead
724
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
725
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
726
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
727
+ """
728
+
729
+ title: Optional[ConfigurationSegmentProcessingTitle] = FieldInfo(alias="Title", default=None)
730
+ """Controls the processing and generation for the segment.
731
+
732
+ - `crop_image` controls whether to crop the file's images to the segment's
733
+ bounding box. The cropped image will be stored in the segment's `image` field.
734
+ Use `All` to always crop, or `Auto` to only crop when needed for
735
+ post-processing.
736
+ - `format` specifies the output format: `Html` or `Markdown`
737
+ - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
738
+ - `Auto`: Process content automatically
739
+ - `LLM`: Use large language models for processing
740
+ - `Ignore`: Exclude segments from final output
741
+ - `description` enables LLM-generated descriptions for segments
742
+
743
+ **Deprecated fields (for backwards compatibility):**
744
+
745
+ - `llm` - **DEPRECATED**: Use `description` instead
746
+ - `embed_sources` - **DEPRECATED**: Embed field is auto-populated
747
+ - `html` - **DEPRECATED**: Use `format: Html` and `strategy` instead
748
+ - `markdown` - **DEPRECATED**: Use `format: Markdown` and `strategy` instead
749
+ """
750
+
751
+
752
class ConfigurationClientVersionManualSDK(BaseModel):
    """Client-version variant that refers to the manually-maintained SDK."""

    manual_sdk: str = FieldInfo(alias="ManualSdk")
    """The current, manually-maintained SDK."""
755
+
756
+
757
class ConfigurationClientVersionGeneratedSDK(BaseModel):
    """Client-version variant that refers to the auto-generated SDK."""

    generated_sdk: str = FieldInfo(alias="GeneratedSdk")
    """The future, auto-generated SDK."""
760
+
761
+
762
# A client version is the literal "Legacy", one of the two SDK variant
# models, or None.
ConfigurationClientVersion: TypeAlias = Union[
    Literal["Legacy"],
    ConfigurationClientVersionManualSDK,
    ConfigurationClientVersionGeneratedSDK,
    None,
]
765
+
766
+
767
class Configuration(BaseModel):
    """Processing configuration attached to a task."""

    chunk_processing: ConfigurationChunkProcessing
    """Settings for chunking and for the post-processing of each chunk."""

    error_handling: Literal["Fail", "Continue"]
    """How errors are handled during processing.

    - `Fail`: stop processing and fail the task as soon as any error occurs.
    - `Continue`: try to keep processing despite non-critical errors (e.g. LLM
      refusals).
    """

    llm_processing: ConfigurationLlmProcessing
    """Controls the LLM used for the task."""

    ocr_strategy: Literal["All", "Auto"]
    """The Optical Character Recognition (OCR) strategy.

    - `All`: run OCR on every page. (Latency penalty: ~0.5 seconds per page)
    - `Auto`: run OCR only on pages whose text is missing or of low quality. When
      a text layer is present, its bounding boxes are used.
    """

    segment_processing: ConfigurationSegmentProcessing
    """Defines how each segment type is handled when generating the final output.

    Each segment uses one of three strategies. The chosen strategy controls:

    - whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`);
    - how the content is produced (rule-based vs. LLM);
    - the output format (`Html` or `Markdown`).

    Optional flags such as image **cropping**, **extended context**, and **LLM
    descriptions** further refine behaviour.

    ---

    **Default strategy per segment**

    - `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` →
      **Auto** (Markdown)
    - `Table` → **LLM** (HTML, description on)
    - `Picture` → **LLM** (Markdown, description off, cropping _All_)
    - `Formula`, `Page` → **LLM** (Markdown)
    - `PageHeader`, `PageFooter` → **Ignore** (removed from output)

    ---

    **Strategy reference**

    - **Auto** – rule-based content generation.
    - **LLM** – generate content with an LLM.
    - **Ignore** – exclude the segment entirely.
    """

    segmentation_strategy: Literal["LayoutAnalysis", "Page"]
    """The segmentation strategy.

    - `LayoutAnalysis`: analyzes pages for layout elements (e.g., `Table`,
      `Picture`, `Formula`, etc.) using bounding boxes. Gives fine-grained
      segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
    - `Page`: treats each page as a single segment. Faster, but without layout
      element detection and with only simple chunking.
    """

    client_version: Optional[ConfigurationClientVersion] = None
    """Client version, used for backwards-compatible processing."""

    expires_in: Optional[int] = None
    """
    Number of seconds until the task is deleted. Expired tasks can **not** be
    updated, polled or accessed via the web interface.
    """

    high_resolution: Optional[bool] = None
    """Whether high-resolution images are used for cropping and post-processing."""

    input_file_url: Optional[str] = None
    """Presigned URL of the input file."""

    pipeline: Optional[Literal["Azure", "Chunkr"]] = None
    """The pipeline, either `Azure` or `Chunkr`."""

    target_chunk_length: Optional[int] = None
    """Target number of words in each chunk.

    If 0, each chunk will contain a single segment.
    """
848
+
849
+
850
class OutputChunkSegmentBbox(BaseModel):
    """Bounding box of a segment: `left`/`top` coordinates plus `width`/`height`."""

    height: float
    """Height of the bounding box."""

    left: float
    """Left coordinate of the bounding box."""

    top: float
    """Top coordinate of the bounding box."""

    width: float
    """Width of the bounding box."""
862
+
863
+
864
class OutputChunkSegmentOcrBbox(BaseModel):
    """Bounding box of an OCR result: `left`/`top` coordinates plus `width`/`height`."""

    height: float
    """Height of the bounding box."""

    left: float
    """Left coordinate of the bounding box."""

    top: float
    """Top coordinate of the bounding box."""

    width: float
    """Width of the bounding box."""
876
+
877
+
878
class OutputChunkSegmentOcr(BaseModel):
    """A single OCR result of a segment: its box, recognized text and confidence."""

    bbox: OutputChunkSegmentOcrBbox
    """Bounding box of the item (the same shape is used for chunks, segments and OCR results)."""

    text: str
    """Text recognized by OCR."""

    confidence: Optional[float] = None
    """Confidence score of the recognized text."""
887
+
888
+
889
class OutputChunkSegmentSSCellStyle(BaseModel):
    """Styling of a spreadsheet cell: alignment, colors, font and weight."""

    align: Optional[Literal["Left", "Center", "Right", "Justify"]] = None
    """Alignment of the cell content."""

    bg_color: Optional[str] = None
    """Background color of the cell (e.g., "#FFFFFF" or "#DAE3F3")."""

    font_face: Optional[str] = None
    """Font face/family of the cell (e.g., "Arial", "Daytona")."""

    is_bold: Optional[bool] = None
    """Whether the cell content is bold."""

    text_color: Optional[str] = None
    """Text color of the cell (e.g., "#000000" or "red")."""

    valign: Optional[Literal["Top", "Middle", "Bottom", "Baseline"]] = None
    """Vertical alignment of the cell content."""
907
+
908
+
909
class OutputChunkSegmentSSCell(BaseModel):
    """A spreadsheet cell belonging to a segment."""

    cell_id: str
    """ID of the cell."""

    range: str
    """Range of the cell."""

    text: str
    """Text content of the cell."""

    formula: Optional[str] = None
    """Formula of the cell."""

    hyperlink: Optional[str] = None
    """Hyperlink URL, if the cell contains a link (e.g., "https://www.chunkr.ai")."""

    style: Optional[OutputChunkSegmentSSCellStyle] = None
    """Styling information for the cell, including colors, fonts, and formatting."""

    value: Optional[str] = None
    """The computed/evaluated value of the cell.

    This is the actual result after any formulas have been evaluated, as opposed
    to the raw text content. For a cell with a formula it is the calculated
    result; for a cell with static content it is typically the same as the
    `text` field.

    Example: `text` might show "3.14" (formatted to 2 decimal places) while
    `value` could be "3.141592653589793" (full precision).
    """
938
+
939
+
940
class OutputChunkSegmentSSHeaderBbox(BaseModel):
    """Bounding box of a segment's header: `left`/`top` coordinates plus `width`/`height`."""

    height: float
    """Height of the bounding box."""

    left: float
    """Left coordinate of the bounding box."""

    top: float
    """Top coordinate of the bounding box."""

    width: float
    """Width of the bounding box."""
952
+
953
+
954
class OutputChunkSegmentSSHeaderOcrBbox(BaseModel):
    """Bounding box of a header OCR result: `left`/`top` coordinates plus `width`/`height`."""

    height: float
    """Height of the bounding box."""

    left: float
    """Left coordinate of the bounding box."""

    top: float
    """Top coordinate of the bounding box."""

    width: float
    """Width of the bounding box."""
966
+
967
+
968
class OutputChunkSegmentSSHeaderOcr(BaseModel):
    """A single OCR result of a segment's header: its box, recognized text and confidence."""

    bbox: OutputChunkSegmentSSHeaderOcrBbox
    """Bounding box of the item (the same shape is used for chunks, segments and OCR results)."""

    text: str
    """Text recognized by OCR."""

    confidence: Optional[float] = None
    """Confidence score of the recognized text."""
977
+
978
+
979
class OutputChunkSegment(BaseModel):
    """A segment of a document: its location, type, and generated content."""

    bbox: OutputChunkSegmentBbox
    """Bounding box of the item (the same shape is used for chunks, segments and OCR results)."""

    page_height: float
    """Height of the page/sheet that contains the segment."""

    page_number: int
    """Page number/sheet number of the segment."""

    page_width: float
    """Width of the page/sheet that contains the segment."""

    segment_id: str
    """Unique identifier of the segment."""

    segment_type: Literal[
        "Caption",
        "Footnote",
        "Formula",
        "ListItem",
        "Page",
        "PageFooter",
        "PageHeader",
        "Picture",
        "SectionHeader",
        "Table",
        "Text",
        "Title",
    ]
    """
    All the possible types of a segment. Note: different configurations produce
    different types. Please refer to the documentation for more information.
    """

    confidence: Optional[float] = None
    """Confidence score of the layout analysis model."""

    content: Optional[str] = None
    """
    Content of the segment; either HTML or Markdown, depending on the format
    chosen.
    """

    description: Optional[str] = None
    """Description of the segment, generated by the LLM."""

    embed: Optional[str] = None
    """Embeddable content of the segment."""

    html: Optional[str] = None
    """HTML representation of the segment."""

    image: Optional[str] = None
    """Presigned URL to the image of the segment."""

    llm: Optional[str] = None
    """LLM representation of the segment."""

    markdown: Optional[str] = None
    """Markdown representation of the segment."""

    ocr: Optional[List[OutputChunkSegmentOcr]] = None
    """OCR results for the segment."""

    segment_length: Optional[int] = None
    """Length of the segment in tokens."""

    ss_cells: Optional[List[OutputChunkSegmentSSCell]] = None
    """Cells of the segment. Only used for Spreadsheets."""

    ss_header_bbox: Optional[OutputChunkSegmentSSHeaderBbox] = None
    """Bounding box of the segment's header, if found.

    Only used for Spreadsheets.
    """

    ss_header_ocr: Optional[List[OutputChunkSegmentSSHeaderOcr]] = None
    """OCR results of the segment's header, if found. Only used for Spreadsheets."""

    ss_header_range: Optional[str] = None
    """
    Header range of the segment, if found. The header can overlap with the
    `segment.range` if the table contains the header; if the header is located
    in a different sheet, the header range will not overlap with the
    `segment.range`. Only used for Spreadsheets.
    """

    ss_header_text: Optional[str] = None
    """Text content of the segment's header, if found.

    Only used for Spreadsheets.
    """

    ss_range: Optional[str] = None
    """Range of the segment in Excel notation (e.g., A1:B5).

    Only used for Spreadsheets.
    """

    ss_sheet_name: Optional[str] = None
    """Name of the sheet that contains the segment. Only used for Spreadsheets."""

    text: Optional[str] = None
    """Text content of the segment. Calculated from the OCR results."""
1084
+
1085
+
1086
class OutputChunk(BaseModel):
    """A chunk of a document, made up of one or more segments."""

    chunk_length: int
    """Total number of tokens in the chunk's `embed` field.

    Calculated by the `tokenizer`.
    """

    segments: List[OutputChunkSegment]
    """
    The document segments that form this chunk. When `target_chunk_length` > 0,
    this holds the maximum number of segments that fit within that length
    (segments remain intact). Otherwise, it holds exactly one segment.
    """

    chunk_id: Optional[str] = None
    """Unique identifier of the chunk."""

    content: Optional[str] = None
    """The content of the chunk.

    This text is generated by combining the `content` field of each segment.
    It can be provided as context to the LLM.
    """

    embed: Optional[str] = None
    """Suggested text to embed for the chunk.

    This text is generated by combining the `embed` field of each segment.
    """
1116
+
1117
+
1118
class OutputPage(BaseModel):
    """Image and metadata for one page/sheet of the processed file."""

    image: str
    """The presigned URL of the page/sheet image."""

    # The three fields below previously all carried the copy-pasted text
    # "The number of pages in the file."; they describe a single page/sheet,
    # mirroring the same-named fields on `OutputChunkSegment`.
    page_height: float
    """Height of the page/sheet."""

    page_number: int
    """Page number/sheet number of this page."""

    page_width: float
    """Width of the page/sheet."""

    dpi: Optional[float] = None
    """DPI of the page/sheet. All cropped images are scaled to this DPI."""

    ss_sheet_name: Optional[str] = None
    """The name of the sheet containing the page. Only used for Spreadsheets."""
1136
+
1137
+
1138
class Output(BaseModel):
    """The processed results of a document analysis task."""

    chunks: List[OutputChunk]
    """The document's chunks; each chunk contains one or more segments."""

    file_name: Optional[str] = None
    """Name of the file."""

    mime_type: Optional[str] = None
    """MIME type of the file."""

    page_count: Optional[int] = None
    """Number of pages in the file."""

    pages: Optional[List[OutputPage]] = None
    """Pages of the file, with the image and metadata of each page."""

    pdf_url: Optional[str] = None
    """Presigned URL of the PDF file."""
1156
+
1157
+
1158
class Task(BaseModel):
    """A task: its configuration, status, timestamps and, when available, its output."""

    configuration: Configuration

    created_at: datetime
    """Date and time at which the task was created and queued."""

    message: str
    """Message describing the task's status or any errors that occurred."""

    status: Literal["Starting", "Processing", "Succeeded", "Failed", "Cancelled"]
    """Status of the task."""

    task_id: str
    """Unique identifier of the task."""

    expires_at: Optional[datetime] = None
    """Date and time at which the task will expire."""

    finished_at: Optional[datetime] = None
    """Date and time at which the task finished."""

    output: Optional[Output] = None
    """The processed results of a document analysis task."""

    started_at: Optional[datetime] = None
    """Date and time at which the task was started."""

    task_url: Optional[str] = None
    """Presigned URL of the task."""