chunkr-ai 0.1.0__py3-none-any.whl → 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. chunkr_ai/__init__.py +89 -2
  2. chunkr_ai/_base_client.py +1995 -0
  3. chunkr_ai/_client.py +402 -0
  4. chunkr_ai/_compat.py +219 -0
  5. chunkr_ai/_constants.py +14 -0
  6. chunkr_ai/_exceptions.py +108 -0
  7. chunkr_ai/_files.py +123 -0
  8. chunkr_ai/_models.py +829 -0
  9. chunkr_ai/_qs.py +150 -0
  10. chunkr_ai/_resource.py +43 -0
  11. chunkr_ai/_response.py +830 -0
  12. chunkr_ai/_streaming.py +333 -0
  13. chunkr_ai/_types.py +219 -0
  14. chunkr_ai/_utils/__init__.py +57 -0
  15. chunkr_ai/_utils/_logs.py +25 -0
  16. chunkr_ai/_utils/_proxy.py +65 -0
  17. chunkr_ai/_utils/_reflection.py +42 -0
  18. chunkr_ai/_utils/_resources_proxy.py +24 -0
  19. chunkr_ai/_utils/_streams.py +12 -0
  20. chunkr_ai/_utils/_sync.py +86 -0
  21. chunkr_ai/_utils/_transform.py +447 -0
  22. chunkr_ai/_utils/_typing.py +151 -0
  23. chunkr_ai/_utils/_utils.py +422 -0
  24. chunkr_ai/_version.py +4 -0
  25. chunkr_ai/lib/.keep +4 -0
  26. chunkr_ai/pagination.py +71 -0
  27. chunkr_ai/resources/__init__.py +33 -0
  28. chunkr_ai/resources/health.py +136 -0
  29. chunkr_ai/resources/task.py +1166 -0
  30. chunkr_ai/types/__init__.py +27 -0
  31. chunkr_ai/types/auto_generation_config.py +39 -0
  32. chunkr_ai/types/auto_generation_config_param.py +39 -0
  33. chunkr_ai/types/bounding_box.py +19 -0
  34. chunkr_ai/types/chunk_processing.py +40 -0
  35. chunkr_ai/types/chunk_processing_param.py +42 -0
  36. chunkr_ai/types/health_check_response.py +7 -0
  37. chunkr_ai/types/ignore_generation_config.py +39 -0
  38. chunkr_ai/types/ignore_generation_config_param.py +39 -0
  39. chunkr_ai/types/llm_generation_config.py +39 -0
  40. chunkr_ai/types/llm_generation_config_param.py +39 -0
  41. chunkr_ai/types/llm_processing.py +36 -0
  42. chunkr_ai/types/llm_processing_param.py +36 -0
  43. chunkr_ai/types/picture_generation_config.py +39 -0
  44. chunkr_ai/types/picture_generation_config_param.py +39 -0
  45. chunkr_ai/types/segment_processing.py +280 -0
  46. chunkr_ai/types/segment_processing_param.py +281 -0
  47. chunkr_ai/types/table_generation_config.py +39 -0
  48. chunkr_ai/types/table_generation_config_param.py +39 -0
  49. chunkr_ai/types/task.py +379 -0
  50. chunkr_ai/types/task_get_params.py +18 -0
  51. chunkr_ai/types/task_list_params.py +37 -0
  52. chunkr_ai/types/task_parse_params.py +90 -0
  53. chunkr_ai/types/task_update_params.py +90 -0
  54. chunkr_ai-0.1.0a1.dist-info/METADATA +504 -0
  55. chunkr_ai-0.1.0a1.dist-info/RECORD +58 -0
  56. {chunkr_ai-0.1.0.dist-info → chunkr_ai-0.1.0a1.dist-info}/WHEEL +1 -2
  57. chunkr_ai-0.1.0a1.dist-info/licenses/LICENSE +201 -0
  58. chunkr_ai/api/auth.py +0 -13
  59. chunkr_ai/api/chunkr.py +0 -103
  60. chunkr_ai/api/chunkr_base.py +0 -185
  61. chunkr_ai/api/configuration.py +0 -313
  62. chunkr_ai/api/decorators.py +0 -101
  63. chunkr_ai/api/misc.py +0 -139
  64. chunkr_ai/api/protocol.py +0 -14
  65. chunkr_ai/api/task_response.py +0 -208
  66. chunkr_ai/models.py +0 -55
  67. chunkr_ai-0.1.0.dist-info/METADATA +0 -268
  68. chunkr_ai-0.1.0.dist-info/RECORD +0 -16
  69. chunkr_ai-0.1.0.dist-info/licenses/LICENSE +0 -21
  70. chunkr_ai-0.1.0.dist-info/top_level.txt +0 -1
  71. /chunkr_ai/{api/__init__.py → py.typed} +0 -0
@@ -0,0 +1,379 @@
1
+ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
+
3
+ from typing import List, Optional
4
+ from datetime import datetime
5
+ from typing_extensions import Literal
6
+
7
+ from .._models import BaseModel
8
+ from .bounding_box import BoundingBox
9
+ from .llm_processing import LlmProcessing
10
+ from .chunk_processing import ChunkProcessing
11
+ from .segment_processing import SegmentProcessing
12
+
13
+ __all__ = [
14
+ "Task",
15
+ "Configuration",
16
+ "Output",
17
+ "OutputChunk",
18
+ "OutputChunkSegment",
19
+ "OutputChunkSegmentOcr",
20
+ "OutputChunkSegmentSSCell",
21
+ "OutputChunkSegmentSSCellStyle",
22
+ "OutputChunkSegmentSSHeaderOcr",
23
+ "OutputPage",
24
+ ]
25
+
26
+
27
+ class Configuration(BaseModel):
28
+ chunk_processing: ChunkProcessing
29
+ """Controls the setting for the chunking and post-processing of each chunk."""
30
+
31
+ error_handling: Literal["Fail", "Continue"]
32
+ """Controls how errors are handled during processing:
33
+
34
+ - `Fail`: Stops processing and fails the task when any error occurs
35
+ - `Continue`: Attempts to continue processing despite non-critical errors (e.g.
36
+ LLM refusals etc.)
37
+ """
38
+
39
+ llm_processing: LlmProcessing
40
+ """Controls the LLM used for the task."""
41
+
42
+ ocr_strategy: Literal["All", "Auto"]
43
+ """Controls the Optical Character Recognition (OCR) strategy.
44
+
45
+ - `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
46
+ - `Auto`: Selectively applies OCR only to pages with missing or low-quality
47
+ text. When text layer is present the bounding boxes from the text layer are
48
+ used.
49
+ """
50
+
51
+ segment_processing: SegmentProcessing
52
+ """Defines how each segment type is handled when generating the final output.
53
+
54
+ Each segment uses one of three strategies. The chosen strategy controls: •
55
+ Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`). • How the
56
+ content is produced (rule-based vs. LLM). • The output format (`Html` or
57
+ `Markdown`).
58
+
59
+ Optional flags such as image **cropping**, **extended context**, and **LLM
60
+ descriptions** further refine behaviour.
61
+
62
+ ---
63
+
64
+ **Default strategy per segment** • `Title`, `SectionHeader`, `Text`, `ListItem`,
65
+ `Caption`, `Footnote` → **Auto** (Markdown) • `Table` → **LLM** (HTML,
66
+ description on) • `Picture` → **LLM** (Markdown, description on, cropping _All_)
67
+ • `Formula`, `Page` → **LLM** (Markdown) • `PageHeader`, `PageFooter` →
68
+ **Ignore** (removed from output)
69
+
70
+ ---
71
+
72
+ **Strategy reference** • **Auto** – rule-based content generation. • **LLM** –
73
+ generate content with an LLM. • **Ignore** – exclude the segment entirely.
74
+ """
75
+
76
+ segmentation_strategy: Literal["LayoutAnalysis", "Page"]
77
+ """Controls the segmentation strategy:
78
+
79
+ - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
80
+ `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
81
+ segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
82
+ - `Page`: Treats each page as a single segment. Faster processing, but without
83
+ layout element detection and only simple chunking.
84
+ """
85
+
86
+ expires_in: Optional[int] = None
87
+ """
88
+ The number of seconds until the task is deleted. Expired tasks can **not** be
89
+ updated, polled or accessed via web interface.
90
+ """
91
+
92
+ high_resolution: Optional[bool] = None
93
+ """Whether to use high-resolution images for cropping and post-processing."""
94
+
95
+ input_file_url: Optional[str] = None
96
+ """The presigned URL of the input file."""
97
+
98
+ pipeline: Optional[Literal["Azure", "Chunkr"]] = None
99
+
100
+ target_chunk_length: Optional[int] = None
101
+ """The target number of words in each chunk.
102
+
103
+ If 0, each chunk will contain a single segment.
104
+ """
105
+
106
+
107
+ class OutputChunkSegmentOcr(BaseModel):
108
+ bbox: BoundingBox
109
+ """Bounding box for an item. It is used for chunks, segments and OCR results."""
110
+
111
+ text: str
112
+ """The recognized text of the OCR result."""
113
+
114
+ confidence: Optional[float] = None
115
+ """The confidence score of the recognized text."""
116
+
117
+
118
+ class OutputChunkSegmentSSCellStyle(BaseModel):
119
+ align: Optional[Literal["Left", "Center", "Right", "Justify"]] = None
120
+ """Alignment of the cell content."""
121
+
122
+ bg_color: Optional[str] = None
123
+ """Background color of the cell (e.g., "#FFFFFF" or "#DAE3F3")."""
124
+
125
+ font_face: Optional[str] = None
126
+ """Font face/family of the cell (e.g., "Arial", "Daytona")."""
127
+
128
+ is_bold: Optional[bool] = None
129
+ """Whether the cell content is bold."""
130
+
131
+ text_color: Optional[str] = None
132
+ """Text color of the cell (e.g., "#000000" or "red")."""
133
+
134
+ valign: Optional[Literal["Top", "Middle", "Bottom", "Baseline"]] = None
135
+ """Vertical alignment of the cell content."""
136
+
137
+
138
+ class OutputChunkSegmentSSCell(BaseModel):
139
+ cell_id: str
140
+ """The cell ID."""
141
+
142
+ range: str
143
+ """Range of the cell."""
144
+
145
+ text: str
146
+ """Text content of the cell."""
147
+
148
+ formula: Optional[str] = None
149
+ """Formula of the cell."""
150
+
151
+ hyperlink: Optional[str] = None
152
+ """Hyperlink URL if the cell contains a link (e.g., "https://www.chunkr.ai")."""
153
+
154
+ style: Optional[OutputChunkSegmentSSCellStyle] = None
155
+ """Styling information for the cell including colors, fonts, and formatting."""
156
+
157
+ value: Optional[str] = None
158
+ """The computed/evaluated value of the cell.
159
+
160
+ This represents the actual result after evaluating any formulas, as opposed to
161
+ the raw text content. For cells with formulas, this is the calculated result;
162
+ for cells with static content, this is typically the same as the text field.
163
+
164
+ Example: text might show "3.14" (formatted to 2 decimal places) while value
165
+ could be "3.141592653589793" (full precision).
166
+ """
167
+
168
+
169
+ class OutputChunkSegmentSSHeaderOcr(BaseModel):
170
+ bbox: BoundingBox
171
+ """Bounding box for an item. It is used for chunks, segments and OCR results."""
172
+
173
+ text: str
174
+ """The recognized text of the OCR result."""
175
+
176
+ confidence: Optional[float] = None
177
+ """The confidence score of the recognized text."""
178
+
179
+
180
+ class OutputChunkSegment(BaseModel):
181
+ bbox: BoundingBox
182
+ """Bounding box for an item. It is used for chunks, segments and OCR results."""
183
+
184
+ page_height: float
185
+ """Height of the page/sheet containing the segment."""
186
+
187
+ page_number: int
188
+ """Page number/Sheet number of the segment."""
189
+
190
+ page_width: float
191
+ """Width of the page/sheet containing the segment."""
192
+
193
+ segment_id: str
194
+ """Unique identifier for the segment."""
195
+
196
+ segment_type: Literal[
197
+ "Caption",
198
+ "Footnote",
199
+ "Formula",
200
+ "ListItem",
201
+ "Page",
202
+ "PageFooter",
203
+ "PageHeader",
204
+ "Picture",
205
+ "SectionHeader",
206
+ "Table",
207
+ "Text",
208
+ "Title",
209
+ ]
210
+ """
211
+ All the possible types for a segment. Note: Different configurations will
212
+ produce different types. Please refer to the documentation for more information.
213
+ """
214
+
215
+ confidence: Optional[float] = None
216
+ """Confidence score of the layout analysis model"""
217
+
218
+ content: Optional[str] = None
219
+ """
220
+ Content of the segment, will be either HTML or Markdown, depending on format
221
+ chosen.
222
+ """
223
+
224
+ description: Optional[str] = None
225
+ """Description of the segment, generated by the LLM."""
226
+
227
+ embed: Optional[str] = None
228
+ """Embeddable content of the segment."""
229
+
230
+ html: Optional[str] = None
231
+ """HTML representation of the segment."""
232
+
233
+ image: Optional[str] = None
234
+ """Presigned URL to the image of the segment."""
235
+
236
+ llm: Optional[str] = None
237
+ """LLM representation of the segment."""
238
+
239
+ markdown: Optional[str] = None
240
+ """Markdown representation of the segment."""
241
+
242
+ ocr: Optional[List[OutputChunkSegmentOcr]] = None
243
+ """OCR results for the segment."""
244
+
245
+ segment_length: Optional[int] = None
246
+ """Length of the segment in tokens."""
247
+
248
+ ss_cells: Optional[List[OutputChunkSegmentSSCell]] = None
249
+ """Cells of the segment. Only used for Spreadsheets."""
250
+
251
+ ss_header_bbox: Optional[BoundingBox] = None
252
+ """Bounding box of the header of the segment, if found.
253
+
254
+ Only used for Spreadsheets.
255
+ """
256
+
257
+ ss_header_ocr: Optional[List[OutputChunkSegmentSSHeaderOcr]] = None
258
+ """OCR results of the header of the segment, if found. Only used for Spreadsheets."""
259
+
260
+ ss_header_range: Optional[str] = None
261
+ """
262
+ Header range of the segment, if found. The header can have overlap with the
263
+ `segment.range` if the table contains the header; if the header is located in a
264
+ different sheet, the header range will have no overlap with the `segment.range`.
265
+ Only used for Spreadsheets.
266
+ """
267
+
268
+ ss_header_text: Optional[str] = None
269
+ """Text content of the header of the segment, if found.
270
+
271
+ Only used for Spreadsheets.
272
+ """
273
+
274
+ ss_range: Optional[str] = None
275
+ """Range of the segment in Excel notation (e.g., A1:B5).
276
+
277
+ Only used for Spreadsheets.
278
+ """
279
+
280
+ ss_sheet_name: Optional[str] = None
281
+ """Name of the sheet containing the segment. Only used for Spreadsheets."""
282
+
283
+ text: Optional[str] = None
284
+ """Text content of the segment. Calculated from the OCR results."""
285
+
286
+
287
+ class OutputChunk(BaseModel):
288
+ chunk_length: int
289
+ """The total number of tokens in the chunk. Calculated by the `tokenizer`."""
290
+
291
+ segments: List[OutputChunkSegment]
292
+ """
293
+ Collection of document segments that form this chunk. When
294
+ `target_chunk_length` > 0, contains the maximum number of segments that fit
295
+ within that length (segments remain intact). Otherwise, contains exactly one
296
+ segment.
297
+ """
298
+
299
+ chunk_id: Optional[str] = None
300
+ """The unique identifier for the chunk."""
301
+
302
+ embed: Optional[str] = None
303
+ """Suggested text to be embedded for the chunk.
304
+
305
+ This text is generated by combining the embed content from each segment
306
+ according to the configured embed sources (HTML, Markdown, LLM, or Content). Can
307
+ be configured using `embed_sources` in the `SegmentProcessing` configuration.
308
+ """
309
+
310
+
311
+ class OutputPage(BaseModel):
312
+ image: str
313
+ """The presigned URL of the page/sheet image."""
314
+
315
+ page_height: float
316
+ """The height of the page/sheet."""
317
+
318
+ page_number: int
319
+ """The page number/sheet number."""
320
+
321
+ page_width: float
322
+ """The width of the page/sheet."""
323
+
324
+ dpi: Optional[float] = None
325
+ """DPI of the page/sheet. All cropped images are scaled to this DPI."""
326
+
327
+ ss_sheet_name: Optional[str] = None
328
+ """The name of the sheet containing the page. Only used for Spreadsheets."""
329
+
330
+
331
+ class Output(BaseModel):
332
+ chunks: List[OutputChunk]
333
+ """Collection of document chunks, where each chunk contains one or more segments"""
334
+
335
+ file_name: Optional[str] = None
336
+ """The name of the file."""
337
+
338
+ mime_type: Optional[str] = None
339
+ """The MIME type of the file."""
340
+
341
+ page_count: Optional[int] = None
342
+ """The number of pages in the file."""
343
+
344
+ pages: Optional[List[OutputPage]] = None
345
+ """The pages of the file. Includes the image and metadata for each page."""
346
+
347
+ pdf_url: Optional[str] = None
348
+ """The presigned URL of the PDF file."""
349
+
350
+
351
+ class Task(BaseModel):
352
+ configuration: Configuration
353
+
354
+ created_at: datetime
355
+ """The date and time when the task was created and queued."""
356
+
357
+ message: str
358
+ """A message describing the task's status or any errors that occurred."""
359
+
360
+ status: Literal["Starting", "Processing", "Succeeded", "Failed", "Cancelled"]
361
+ """The status of the task."""
362
+
363
+ task_id: str
364
+ """The unique identifier for the task."""
365
+
366
+ expires_at: Optional[datetime] = None
367
+ """The date and time when the task will expire."""
368
+
369
+ finished_at: Optional[datetime] = None
370
+ """The date and time when the task was finished."""
371
+
372
+ output: Optional[Output] = None
373
+ """The processed results of a document analysis task"""
374
+
375
+ started_at: Optional[datetime] = None
376
+ """The date and time when the task was started."""
377
+
378
+ task_url: Optional[str] = None
379
+ """The presigned URL of the task."""
@@ -0,0 +1,18 @@
1
+ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing_extensions import TypedDict
6
+
7
+ __all__ = ["TaskGetParams"]
8
+
9
+
10
+ class TaskGetParams(TypedDict, total=False):
11
+ base64_urls: bool
12
+ """Whether to return base64 encoded URLs.
13
+
14
+ If false, the URLs will be returned as presigned URLs.
15
+ """
16
+
17
+ include_chunks: bool
18
+ """Whether to include chunks in the output response"""
@@ -0,0 +1,37 @@
1
+ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Union
6
+ from datetime import datetime
7
+ from typing_extensions import Literal, Annotated, TypedDict
8
+
9
+ from .._utils import PropertyInfo
10
+
11
+ __all__ = ["TaskListParams"]
12
+
13
+
14
+ class TaskListParams(TypedDict, total=False):
15
+ base64_urls: bool
16
+ """Whether to return base64 encoded URLs.
17
+
18
+ If false, the URLs will be returned as presigned URLs.
19
+ """
20
+
21
+ cursor: Annotated[Union[str, datetime], PropertyInfo(format="iso8601")]
22
+ """Cursor for pagination (timestamp)"""
23
+
24
+ end: Annotated[Union[str, datetime], PropertyInfo(format="iso8601")]
25
+ """End date"""
26
+
27
+ include_chunks: bool
28
+ """Whether to include chunks in the output response"""
29
+
30
+ limit: int
31
+ """Number of tasks per page"""
32
+
33
+ sort: Literal["asc", "desc"]
34
+ """Sort order: 'asc' for ascending, 'desc' for descending (default)"""
35
+
36
+ start: Annotated[Union[str, datetime], PropertyInfo(format="iso8601")]
37
+ """Start date"""
@@ -0,0 +1,90 @@
1
+ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional
6
+ from typing_extensions import Literal, Required, TypedDict
7
+
8
+ from .llm_processing_param import LlmProcessingParam
9
+ from .chunk_processing_param import ChunkProcessingParam
10
+ from .segment_processing_param import SegmentProcessingParam
11
+
12
+ __all__ = ["TaskParseParams"]
13
+
14
+
15
+ class TaskParseParams(TypedDict, total=False):
16
+ file: Required[str]
17
+ """The file to be uploaded. Can be a URL or a base64 encoded file."""
18
+
19
+ chunk_processing: Optional[ChunkProcessingParam]
20
+ """Controls the setting for the chunking and post-processing of each chunk."""
21
+
22
+ error_handling: Optional[Literal["Fail", "Continue"]]
23
+ """Controls how errors are handled during processing:
24
+
25
+ - `Fail`: Stops processing and fails the task when any error occurs
26
+ - `Continue`: Attempts to continue processing despite non-critical errors (e.g.
27
+ LLM refusals etc.)
28
+ """
29
+
30
+ expires_in: Optional[int]
31
+ """
32
+ The number of seconds until the task is deleted. Expired tasks can **not** be
33
+ updated, polled or accessed via web interface.
34
+ """
35
+
36
+ file_name: Optional[str]
37
+ """The name of the file to be uploaded. If not set a name will be generated."""
38
+
39
+ llm_processing: Optional[LlmProcessingParam]
40
+ """Controls the LLM used for the task."""
41
+
42
+ ocr_strategy: Optional[Literal["All", "Auto"]]
43
+ """Controls the Optical Character Recognition (OCR) strategy.
44
+
45
+ - `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
46
+ - `Auto`: Selectively applies OCR only to pages with missing or low-quality
47
+ text. When text layer is present the bounding boxes from the text layer are
48
+ used.
49
+ """
50
+
51
+ pipeline: Optional[Literal["Azure", "Chunkr"]]
52
+ """
53
+ Choose the provider whose models will be used for segmentation and OCR. The
54
+ output will be unified to the Chunkr `output` format.
55
+ """
56
+
57
+ segment_processing: Optional[SegmentProcessingParam]
58
+ """Defines how each segment type is handled when generating the final output.
59
+
60
+ Each segment uses one of three strategies. The chosen strategy controls: •
61
+ Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`). • How the
62
+ content is produced (rule-based vs. LLM). • The output format (`Html` or
63
+ `Markdown`).
64
+
65
+ Optional flags such as image **cropping**, **extended context**, and **LLM
66
+ descriptions** further refine behaviour.
67
+
68
+ ---
69
+
70
+ **Default strategy per segment** • `Title`, `SectionHeader`, `Text`, `ListItem`,
71
+ `Caption`, `Footnote` → **Auto** (Markdown) • `Table` → **LLM** (HTML,
72
+ description on) • `Picture` → **LLM** (Markdown, description on, cropping _All_)
73
+ • `Formula`, `Page` → **LLM** (Markdown) • `PageHeader`, `PageFooter` →
74
+ **Ignore** (removed from output)
75
+
76
+ ---
77
+
78
+ **Strategy reference** • **Auto** – rule-based content generation. • **LLM** –
79
+ generate content with an LLM. • **Ignore** – exclude the segment entirely.
80
+ """
81
+
82
+ segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]]
83
+ """Controls the segmentation strategy:
84
+
85
+ - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
86
+ `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
87
+ segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
88
+ - `Page`: Treats each page as a single segment. Faster processing, but without
89
+ layout element detection and only simple chunking.
90
+ """
@@ -0,0 +1,90 @@
1
+ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional
6
+ from typing_extensions import Literal, TypedDict
7
+
8
+ from .llm_processing_param import LlmProcessingParam
9
+ from .chunk_processing_param import ChunkProcessingParam
10
+ from .segment_processing_param import SegmentProcessingParam
11
+
12
+ __all__ = ["TaskUpdateParams"]
13
+
14
+
15
+ class TaskUpdateParams(TypedDict, total=False):
16
+ chunk_processing: Optional[ChunkProcessingParam]
17
+ """Controls the setting for the chunking and post-processing of each chunk."""
18
+
19
+ error_handling: Optional[Literal["Fail", "Continue"]]
20
+ """Controls how errors are handled during processing:
21
+
22
+ - `Fail`: Stops processing and fails the task when any error occurs
23
+ - `Continue`: Attempts to continue processing despite non-critical errors (e.g.
24
+ LLM refusals etc.)
25
+ """
26
+
27
+ expires_in: Optional[int]
28
+ """
29
+ The number of seconds until task is deleted. Expired tasks can **not** be
30
+ updated, polled or accessed via web interface.
31
+ """
32
+
33
+ high_resolution: Optional[bool]
34
+ """Whether to use high-resolution images for cropping and post-processing.
35
+
36
+ (Latency penalty: ~7 seconds per page)
37
+ """
38
+
39
+ llm_processing: Optional[LlmProcessingParam]
40
+ """Controls the LLM used for the task."""
41
+
42
+ ocr_strategy: Optional[Literal["All", "Auto"]]
43
+ """Controls the Optical Character Recognition (OCR) strategy.
44
+
45
+ - `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
46
+ - `Auto`: Selectively applies OCR only to pages with missing or low-quality
47
+ text. When text layer is present the bounding boxes from the text layer are
48
+ used.
49
+ """
50
+
51
+ pipeline: Optional[Literal["Azure", "Chunkr"]]
52
+ """
53
+ Choose the provider whose models will be used for segmentation and OCR. The
54
+ output will be unified to the Chunkr `output` format.
55
+ """
56
+
57
+ segment_processing: Optional[SegmentProcessingParam]
58
+ """Defines how each segment type is handled when generating the final output.
59
+
60
+ Each segment uses one of three strategies. The chosen strategy controls: •
61
+ Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`). • How the
62
+ content is produced (rule-based vs. LLM). • The output format (`Html` or
63
+ `Markdown`).
64
+
65
+ Optional flags such as image **cropping**, **extended context**, and **LLM
66
+ descriptions** further refine behaviour.
67
+
68
+ ---
69
+
70
+ **Default strategy per segment** • `Title`, `SectionHeader`, `Text`, `ListItem`,
71
+ `Caption`, `Footnote` → **Auto** (Markdown) • `Table` → **LLM** (HTML,
72
+ description on) • `Picture` → **LLM** (Markdown, description on, cropping _All_)
73
+ • `Formula`, `Page` → **LLM** (Markdown) • `PageHeader`, `PageFooter` →
74
+ **Ignore** (removed from output)
75
+
76
+ ---
77
+
78
+ **Strategy reference** • **Auto** – rule-based content generation. • **LLM** –
79
+ generate content with an LLM. • **Ignore** – exclude the segment entirely.
80
+ """
81
+
82
+ segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]]
83
+ """Controls the segmentation strategy:
84
+
85
+ - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
86
+ `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
87
+ segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
88
+ - `Page`: Treats each page as a single segment. Faster processing, but without
89
+ layout element detection and only simple chunking.
90
+ """