chunkr-ai 0.1.0a1__py3-none-any.whl → 0.1.0a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunkr_ai/_client.py +2 -1
- chunkr_ai/_version.py +1 -1
- chunkr_ai/resources/task/__init__.py +33 -0
- chunkr_ai/resources/{task.py → task/parse.py} +146 -696
- chunkr_ai/resources/task/task.py +664 -0
- chunkr_ai/types/__init__.py +0 -19
- chunkr_ai/types/task/__init__.py +7 -0
- chunkr_ai/types/task/parse_create_params.py +806 -0
- chunkr_ai/types/task/parse_update_params.py +806 -0
- chunkr_ai/types/task/task.py +1186 -0
- {chunkr_ai-0.1.0a1.dist-info → chunkr_ai-0.1.0a2.dist-info}/METADATA +12 -12
- {chunkr_ai-0.1.0a1.dist-info → chunkr_ai-0.1.0a2.dist-info}/RECORD +14 -28
- chunkr_ai/types/auto_generation_config.py +0 -39
- chunkr_ai/types/auto_generation_config_param.py +0 -39
- chunkr_ai/types/bounding_box.py +0 -19
- chunkr_ai/types/chunk_processing.py +0 -40
- chunkr_ai/types/chunk_processing_param.py +0 -42
- chunkr_ai/types/ignore_generation_config.py +0 -39
- chunkr_ai/types/ignore_generation_config_param.py +0 -39
- chunkr_ai/types/llm_generation_config.py +0 -39
- chunkr_ai/types/llm_generation_config_param.py +0 -39
- chunkr_ai/types/llm_processing.py +0 -36
- chunkr_ai/types/llm_processing_param.py +0 -36
- chunkr_ai/types/picture_generation_config.py +0 -39
- chunkr_ai/types/picture_generation_config_param.py +0 -39
- chunkr_ai/types/segment_processing.py +0 -280
- chunkr_ai/types/segment_processing_param.py +0 -281
- chunkr_ai/types/table_generation_config.py +0 -39
- chunkr_ai/types/table_generation_config_param.py +0 -39
- chunkr_ai/types/task.py +0 -379
- chunkr_ai/types/task_parse_params.py +0 -90
- chunkr_ai/types/task_update_params.py +0 -90
- {chunkr_ai-0.1.0a1.dist-info → chunkr_ai-0.1.0a2.dist-info}/WHEEL +0 -0
- {chunkr_ai-0.1.0a1.dist-info → chunkr_ai-0.1.0a2.dist-info}/licenses/LICENSE +0 -0
chunkr_ai/types/task.py
DELETED
@@ -1,379 +0,0 @@
|
|
1
|
-
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
|
2
|
-
|
3
|
-
from typing import List, Optional
|
4
|
-
from datetime import datetime
|
5
|
-
from typing_extensions import Literal
|
6
|
-
|
7
|
-
from .._models import BaseModel
|
8
|
-
from .bounding_box import BoundingBox
|
9
|
-
from .llm_processing import LlmProcessing
|
10
|
-
from .chunk_processing import ChunkProcessing
|
11
|
-
from .segment_processing import SegmentProcessing
|
12
|
-
|
13
|
-
__all__ = [
|
14
|
-
"Task",
|
15
|
-
"Configuration",
|
16
|
-
"Output",
|
17
|
-
"OutputChunk",
|
18
|
-
"OutputChunkSegment",
|
19
|
-
"OutputChunkSegmentOcr",
|
20
|
-
"OutputChunkSegmentSSCell",
|
21
|
-
"OutputChunkSegmentSSCellStyle",
|
22
|
-
"OutputChunkSegmentSSHeaderOcr",
|
23
|
-
"OutputPage",
|
24
|
-
]
|
25
|
-
|
26
|
-
|
27
|
-
class Configuration(BaseModel):
|
28
|
-
chunk_processing: ChunkProcessing
|
29
|
-
"""Controls the setting for the chunking and post-processing of each chunk."""
|
30
|
-
|
31
|
-
error_handling: Literal["Fail", "Continue"]
|
32
|
-
"""Controls how errors are handled during processing:
|
33
|
-
|
34
|
-
- `Fail`: Stops processing and fails the task when any error occurs
|
35
|
-
- `Continue`: Attempts to continue processing despite non-critical errors (e.g.
|
36
|
-
LLM refusals etc.)
|
37
|
-
"""
|
38
|
-
|
39
|
-
llm_processing: LlmProcessing
|
40
|
-
"""Controls the LLM used for the task."""
|
41
|
-
|
42
|
-
ocr_strategy: Literal["All", "Auto"]
|
43
|
-
"""Controls the Optical Character Recognition (OCR) strategy.
|
44
|
-
|
45
|
-
- `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
|
46
|
-
- `Auto`: Selectively applies OCR only to pages with missing or low-quality
|
47
|
-
text. When text layer is present the bounding boxes from the text layer are
|
48
|
-
used.
|
49
|
-
"""
|
50
|
-
|
51
|
-
segment_processing: SegmentProcessing
|
52
|
-
"""Defines how each segment type is handled when generating the final output.
|
53
|
-
|
54
|
-
Each segment uses one of three strategies. The chosen strategy controls: •
|
55
|
-
Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`). • How the
|
56
|
-
content is produced (rule-based vs. LLM). • The output format (`Html` or
|
57
|
-
`Markdown`).
|
58
|
-
|
59
|
-
Optional flags such as image **cropping**, **extended context**, and **LLM
|
60
|
-
descriptions** further refine behaviour.
|
61
|
-
|
62
|
-
---
|
63
|
-
|
64
|
-
**Default strategy per segment** • `Title`, `SectionHeader`, `Text`, `ListItem`,
|
65
|
-
`Caption`, `Footnote` → **Auto** (Markdown) • `Table` → **LLM** (HTML,
|
66
|
-
description on) • `Picture` → **LLM** (Markdown, description on, cropping _All_)
|
67
|
-
• `Formula`, `Page` → **LLM** (Markdown) • `PageHeader`, `PageFooter` →
|
68
|
-
**Ignore** (removed from output)
|
69
|
-
|
70
|
-
---
|
71
|
-
|
72
|
-
**Strategy reference** • **Auto** – rule-based content generation. • **LLM** –
|
73
|
-
generate content with an LLM. • **Ignore** – exclude the segment entirely.
|
74
|
-
"""
|
75
|
-
|
76
|
-
segmentation_strategy: Literal["LayoutAnalysis", "Page"]
|
77
|
-
"""Controls the segmentation strategy:
|
78
|
-
|
79
|
-
- `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
|
80
|
-
`Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
|
81
|
-
segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
|
82
|
-
- `Page`: Treats each page as a single segment. Faster processing, but without
|
83
|
-
layout element detection and only simple chunking.
|
84
|
-
"""
|
85
|
-
|
86
|
-
expires_in: Optional[int] = None
|
87
|
-
"""
|
88
|
-
The number of seconds until task is deleted. Expired tasks can **not** be
|
89
|
-
updated, polled or accessed via web interface.
|
90
|
-
"""
|
91
|
-
|
92
|
-
high_resolution: Optional[bool] = None
|
93
|
-
"""Whether to use high-resolution images for cropping and post-processing."""
|
94
|
-
|
95
|
-
input_file_url: Optional[str] = None
|
96
|
-
"""The presigned URL of the input file."""
|
97
|
-
|
98
|
-
pipeline: Optional[Literal["Azure", "Chunkr"]] = None
|
99
|
-
|
100
|
-
target_chunk_length: Optional[int] = None
|
101
|
-
"""The target number of words in each chunk.
|
102
|
-
|
103
|
-
If 0, each chunk will contain a single segment.
|
104
|
-
"""
|
105
|
-
|
106
|
-
|
107
|
-
class OutputChunkSegmentOcr(BaseModel):
|
108
|
-
bbox: BoundingBox
|
109
|
-
"""Bounding box for an item. It is used for chunks, segments and OCR results."""
|
110
|
-
|
111
|
-
text: str
|
112
|
-
"""The recognized text of the OCR result."""
|
113
|
-
|
114
|
-
confidence: Optional[float] = None
|
115
|
-
"""The confidence score of the recognized text."""
|
116
|
-
|
117
|
-
|
118
|
-
class OutputChunkSegmentSSCellStyle(BaseModel):
|
119
|
-
align: Optional[Literal["Left", "Center", "Right", "Justify"]] = None
|
120
|
-
"""Alignment of the cell content."""
|
121
|
-
|
122
|
-
bg_color: Optional[str] = None
|
123
|
-
"""Background color of the cell (e.g., "#FFFFFF" or "#DAE3F3")."""
|
124
|
-
|
125
|
-
font_face: Optional[str] = None
|
126
|
-
"""Font face/family of the cell (e.g., "Arial", "Daytona")."""
|
127
|
-
|
128
|
-
is_bold: Optional[bool] = None
|
129
|
-
"""Whether the cell content is bold."""
|
130
|
-
|
131
|
-
text_color: Optional[str] = None
|
132
|
-
"""Text color of the cell (e.g., "#000000" or "red")."""
|
133
|
-
|
134
|
-
valign: Optional[Literal["Top", "Middle", "Bottom", "Baseline"]] = None
|
135
|
-
"""Vertical alignment of the cell content."""
|
136
|
-
|
137
|
-
|
138
|
-
class OutputChunkSegmentSSCell(BaseModel):
|
139
|
-
cell_id: str
|
140
|
-
"""The cell ID."""
|
141
|
-
|
142
|
-
range: str
|
143
|
-
"""Range of the cell."""
|
144
|
-
|
145
|
-
text: str
|
146
|
-
"""Text content of the cell."""
|
147
|
-
|
148
|
-
formula: Optional[str] = None
|
149
|
-
"""Formula of the cell."""
|
150
|
-
|
151
|
-
hyperlink: Optional[str] = None
|
152
|
-
"""Hyperlink URL if the cell contains a link (e.g., "https://www.chunkr.ai")."""
|
153
|
-
|
154
|
-
style: Optional[OutputChunkSegmentSSCellStyle] = None
|
155
|
-
"""Styling information for the cell including colors, fonts, and formatting."""
|
156
|
-
|
157
|
-
value: Optional[str] = None
|
158
|
-
"""The computed/evaluated value of the cell.
|
159
|
-
|
160
|
-
This represents the actual result after evaluating any formulas, as opposed to
|
161
|
-
the raw text content. For cells with formulas, this is the calculated result;
|
162
|
-
for cells with static content, this is typically the same as the text field.
|
163
|
-
|
164
|
-
Example: text might show "3.14" (formatted to 2 decimal places) while value
|
165
|
-
could be "3.141592653589793" (full precision).
|
166
|
-
"""
|
167
|
-
|
168
|
-
|
169
|
-
class OutputChunkSegmentSSHeaderOcr(BaseModel):
|
170
|
-
bbox: BoundingBox
|
171
|
-
"""Bounding box for an item. It is used for chunks, segments and OCR results."""
|
172
|
-
|
173
|
-
text: str
|
174
|
-
"""The recognized text of the OCR result."""
|
175
|
-
|
176
|
-
confidence: Optional[float] = None
|
177
|
-
"""The confidence score of the recognized text."""
|
178
|
-
|
179
|
-
|
180
|
-
class OutputChunkSegment(BaseModel):
|
181
|
-
bbox: BoundingBox
|
182
|
-
"""Bounding box for an item. It is used for chunks, segments and OCR results."""
|
183
|
-
|
184
|
-
page_height: float
|
185
|
-
"""Height of the page/sheet containing the segment."""
|
186
|
-
|
187
|
-
page_number: int
|
188
|
-
"""Page number/Sheet number of the segment."""
|
189
|
-
|
190
|
-
page_width: float
|
191
|
-
"""Width of the page/sheet containing the segment."""
|
192
|
-
|
193
|
-
segment_id: str
|
194
|
-
"""Unique identifier for the segment."""
|
195
|
-
|
196
|
-
segment_type: Literal[
|
197
|
-
"Caption",
|
198
|
-
"Footnote",
|
199
|
-
"Formula",
|
200
|
-
"ListItem",
|
201
|
-
"Page",
|
202
|
-
"PageFooter",
|
203
|
-
"PageHeader",
|
204
|
-
"Picture",
|
205
|
-
"SectionHeader",
|
206
|
-
"Table",
|
207
|
-
"Text",
|
208
|
-
"Title",
|
209
|
-
]
|
210
|
-
"""
|
211
|
-
All the possible types for a segment. Note: Different configurations will
|
212
|
-
produce different types. Please refer to the documentation for more information.
|
213
|
-
"""
|
214
|
-
|
215
|
-
confidence: Optional[float] = None
|
216
|
-
"""Confidence score of the layout analysis model"""
|
217
|
-
|
218
|
-
content: Optional[str] = None
|
219
|
-
"""
|
220
|
-
Content of the segment, will be either HTML or Markdown, depending on format
|
221
|
-
chosen.
|
222
|
-
"""
|
223
|
-
|
224
|
-
description: Optional[str] = None
|
225
|
-
"""Description of the segment, generated by the LLM."""
|
226
|
-
|
227
|
-
embed: Optional[str] = None
|
228
|
-
"""Embeddable content of the segment."""
|
229
|
-
|
230
|
-
html: Optional[str] = None
|
231
|
-
"""HTML representation of the segment."""
|
232
|
-
|
233
|
-
image: Optional[str] = None
|
234
|
-
"""Presigned URL to the image of the segment."""
|
235
|
-
|
236
|
-
llm: Optional[str] = None
|
237
|
-
"""LLM representation of the segment."""
|
238
|
-
|
239
|
-
markdown: Optional[str] = None
|
240
|
-
"""Markdown representation of the segment."""
|
241
|
-
|
242
|
-
ocr: Optional[List[OutputChunkSegmentOcr]] = None
|
243
|
-
"""OCR results for the segment."""
|
244
|
-
|
245
|
-
segment_length: Optional[int] = None
|
246
|
-
"""Length of the segment in tokens."""
|
247
|
-
|
248
|
-
ss_cells: Optional[List[OutputChunkSegmentSSCell]] = None
|
249
|
-
"""Cells of the segment. Only used for Spreadsheets."""
|
250
|
-
|
251
|
-
ss_header_bbox: Optional[BoundingBox] = None
|
252
|
-
"""Bounding box of the header of the segment, if found.
|
253
|
-
|
254
|
-
Only used for Spreadsheets.
|
255
|
-
"""
|
256
|
-
|
257
|
-
ss_header_ocr: Optional[List[OutputChunkSegmentSSHeaderOcr]] = None
|
258
|
-
"""OCR results of the header of the segment, if found. Only used for Spreadsheets."""
|
259
|
-
|
260
|
-
ss_header_range: Optional[str] = None
|
261
|
-
"""
|
262
|
-
Header range of the segment, if found. The header can have overlap with the
|
263
|
-
`segment.range` if the table contains the header; if the header is located in a
|
264
|
-
different sheet, the header range will have no overlap with the `segment.range`.
|
265
|
-
Only used for Spreadsheets.
|
266
|
-
"""
|
267
|
-
|
268
|
-
ss_header_text: Optional[str] = None
|
269
|
-
"""Text content of the header of the segment, if found.
|
270
|
-
|
271
|
-
Only used for Spreadsheets.
|
272
|
-
"""
|
273
|
-
|
274
|
-
ss_range: Optional[str] = None
|
275
|
-
"""Range of the segment in Excel notation (e.g., A1:B5).
|
276
|
-
|
277
|
-
Only used for Spreadsheets.
|
278
|
-
"""
|
279
|
-
|
280
|
-
ss_sheet_name: Optional[str] = None
|
281
|
-
"""Name of the sheet containing the segment. Only used for Spreadsheets."""
|
282
|
-
|
283
|
-
text: Optional[str] = None
|
284
|
-
"""Text content of the segment. Calculated by the OCR results."""
|
285
|
-
|
286
|
-
|
287
|
-
class OutputChunk(BaseModel):
|
288
|
-
chunk_length: int
|
289
|
-
"""The total number of tokens in the chunk. Calculated by the `tokenizer`."""
|
290
|
-
|
291
|
-
segments: List[OutputChunkSegment]
|
292
|
-
"""
|
293
|
-
Collection of document segments that form this chunk. When
|
294
|
-
`target_chunk_length` > 0, contains the maximum number of segments that fit
|
295
|
-
within that length (segments remain intact). Otherwise, contains exactly one
|
296
|
-
segment.
|
297
|
-
"""
|
298
|
-
|
299
|
-
chunk_id: Optional[str] = None
|
300
|
-
"""The unique identifier for the chunk."""
|
301
|
-
|
302
|
-
embed: Optional[str] = None
|
303
|
-
"""Suggested text to be embedded for the chunk.
|
304
|
-
|
305
|
-
This text is generated by combining the embed content from each segment
|
306
|
-
according to the configured embed sources (HTML, Markdown, LLM, or Content). Can
|
307
|
-
be configured using `embed_sources` in the `SegmentProcessing` configuration.
|
308
|
-
"""
|
309
|
-
|
310
|
-
|
311
|
-
class OutputPage(BaseModel):
|
312
|
-
image: str
|
313
|
-
"""The presigned URL of the page/sheet image."""
|
314
|
-
|
315
|
-
page_height: float
|
316
|
-
"""Height of the page/sheet."""
|
317
|
-
|
318
|
-
page_number: int
|
319
|
-
"""The page number/sheet number of this page."""
|
320
|
-
|
321
|
-
page_width: float
|
322
|
-
"""Width of the page/sheet."""
|
323
|
-
|
324
|
-
dpi: Optional[float] = None
|
325
|
-
"""DPI of the page/sheet. All cropped images are scaled to this DPI."""
|
326
|
-
|
327
|
-
ss_sheet_name: Optional[str] = None
|
328
|
-
"""The name of the sheet containing the page. Only used for Spreadsheets."""
|
329
|
-
|
330
|
-
|
331
|
-
class Output(BaseModel):
|
332
|
-
chunks: List[OutputChunk]
|
333
|
-
"""Collection of document chunks, where each chunk contains one or more segments"""
|
334
|
-
|
335
|
-
file_name: Optional[str] = None
|
336
|
-
"""The name of the file."""
|
337
|
-
|
338
|
-
mime_type: Optional[str] = None
|
339
|
-
"""The MIME type of the file."""
|
340
|
-
|
341
|
-
page_count: Optional[int] = None
|
342
|
-
"""The number of pages in the file."""
|
343
|
-
|
344
|
-
pages: Optional[List[OutputPage]] = None
|
345
|
-
"""The pages of the file. Includes the image and metadata for each page."""
|
346
|
-
|
347
|
-
pdf_url: Optional[str] = None
|
348
|
-
"""The presigned URL of the PDF file."""
|
349
|
-
|
350
|
-
|
351
|
-
class Task(BaseModel):
|
352
|
-
configuration: Configuration
|
353
|
-
|
354
|
-
created_at: datetime
|
355
|
-
"""The date and time when the task was created and queued."""
|
356
|
-
|
357
|
-
message: str
|
358
|
-
"""A message describing the task's status or any errors that occurred."""
|
359
|
-
|
360
|
-
status: Literal["Starting", "Processing", "Succeeded", "Failed", "Cancelled"]
|
361
|
-
"""The status of the task."""
|
362
|
-
|
363
|
-
task_id: str
|
364
|
-
"""The unique identifier for the task."""
|
365
|
-
|
366
|
-
expires_at: Optional[datetime] = None
|
367
|
-
"""The date and time when the task will expire."""
|
368
|
-
|
369
|
-
finished_at: Optional[datetime] = None
|
370
|
-
"""The date and time when the task was finished."""
|
371
|
-
|
372
|
-
output: Optional[Output] = None
|
373
|
-
"""The processed results of a document analysis task"""
|
374
|
-
|
375
|
-
started_at: Optional[datetime] = None
|
376
|
-
"""The date and time when the task was started."""
|
377
|
-
|
378
|
-
task_url: Optional[str] = None
|
379
|
-
"""The presigned URL of the task."""
|
chunkr_ai/types/task_parse_params.py
DELETED
@@ -1,90 +0,0 @@
|
|
1
|
-
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
|
2
|
-
|
3
|
-
from __future__ import annotations
|
4
|
-
|
5
|
-
from typing import Optional
|
6
|
-
from typing_extensions import Literal, Required, TypedDict
|
7
|
-
|
8
|
-
from .llm_processing_param import LlmProcessingParam
|
9
|
-
from .chunk_processing_param import ChunkProcessingParam
|
10
|
-
from .segment_processing_param import SegmentProcessingParam
|
11
|
-
|
12
|
-
__all__ = ["TaskParseParams"]
|
13
|
-
|
14
|
-
|
15
|
-
class TaskParseParams(TypedDict, total=False):
|
16
|
-
file: Required[str]
|
17
|
-
"""The file to be uploaded. Can be a URL or a base64 encoded file."""
|
18
|
-
|
19
|
-
chunk_processing: Optional[ChunkProcessingParam]
|
20
|
-
"""Controls the setting for the chunking and post-processing of each chunk."""
|
21
|
-
|
22
|
-
error_handling: Optional[Literal["Fail", "Continue"]]
|
23
|
-
"""Controls how errors are handled during processing:
|
24
|
-
|
25
|
-
- `Fail`: Stops processing and fails the task when any error occurs
|
26
|
-
- `Continue`: Attempts to continue processing despite non-critical errors (e.g.
|
27
|
-
LLM refusals etc.)
|
28
|
-
"""
|
29
|
-
|
30
|
-
expires_in: Optional[int]
|
31
|
-
"""
|
32
|
-
The number of seconds until task is deleted. Expired tasks can **not** be
|
33
|
-
updated, polled or accessed via web interface.
|
34
|
-
"""
|
35
|
-
|
36
|
-
file_name: Optional[str]
|
37
|
-
"""The name of the file to be uploaded. If not set a name will be generated."""
|
38
|
-
|
39
|
-
llm_processing: Optional[LlmProcessingParam]
|
40
|
-
"""Controls the LLM used for the task."""
|
41
|
-
|
42
|
-
ocr_strategy: Optional[Literal["All", "Auto"]]
|
43
|
-
"""Controls the Optical Character Recognition (OCR) strategy.
|
44
|
-
|
45
|
-
- `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
|
46
|
-
- `Auto`: Selectively applies OCR only to pages with missing or low-quality
|
47
|
-
text. When text layer is present the bounding boxes from the text layer are
|
48
|
-
used.
|
49
|
-
"""
|
50
|
-
|
51
|
-
pipeline: Optional[Literal["Azure", "Chunkr"]]
|
52
|
-
"""
|
53
|
-
Choose the provider whose models will be used for segmentation and OCR. The
|
54
|
-
output will be unified to the Chunkr `output` format.
|
55
|
-
"""
|
56
|
-
|
57
|
-
segment_processing: Optional[SegmentProcessingParam]
|
58
|
-
"""Defines how each segment type is handled when generating the final output.
|
59
|
-
|
60
|
-
Each segment uses one of three strategies. The chosen strategy controls: •
|
61
|
-
Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`). • How the
|
62
|
-
content is produced (rule-based vs. LLM). • The output format (`Html` or
|
63
|
-
`Markdown`).
|
64
|
-
|
65
|
-
Optional flags such as image **cropping**, **extended context**, and **LLM
|
66
|
-
descriptions** further refine behaviour.
|
67
|
-
|
68
|
-
---
|
69
|
-
|
70
|
-
**Default strategy per segment** • `Title`, `SectionHeader`, `Text`, `ListItem`,
|
71
|
-
`Caption`, `Footnote` → **Auto** (Markdown) • `Table` → **LLM** (HTML,
|
72
|
-
description on) • `Picture` → **LLM** (Markdown, description on, cropping _All_)
|
73
|
-
• `Formula`, `Page` → **LLM** (Markdown) • `PageHeader`, `PageFooter` →
|
74
|
-
**Ignore** (removed from output)
|
75
|
-
|
76
|
-
---
|
77
|
-
|
78
|
-
**Strategy reference** • **Auto** – rule-based content generation. • **LLM** –
|
79
|
-
generate content with an LLM. • **Ignore** – exclude the segment entirely.
|
80
|
-
"""
|
81
|
-
|
82
|
-
segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]]
|
83
|
-
"""Controls the segmentation strategy:
|
84
|
-
|
85
|
-
- `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
|
86
|
-
`Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
|
87
|
-
segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
|
88
|
-
- `Page`: Treats each page as a single segment. Faster processing, but without
|
89
|
-
layout element detection and only simple chunking.
|
90
|
-
"""
|
chunkr_ai/types/task_update_params.py
DELETED
@@ -1,90 +0,0 @@
|
|
1
|
-
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
|
2
|
-
|
3
|
-
from __future__ import annotations
|
4
|
-
|
5
|
-
from typing import Optional
|
6
|
-
from typing_extensions import Literal, TypedDict
|
7
|
-
|
8
|
-
from .llm_processing_param import LlmProcessingParam
|
9
|
-
from .chunk_processing_param import ChunkProcessingParam
|
10
|
-
from .segment_processing_param import SegmentProcessingParam
|
11
|
-
|
12
|
-
__all__ = ["TaskUpdateParams"]
|
13
|
-
|
14
|
-
|
15
|
-
class TaskUpdateParams(TypedDict, total=False):
|
16
|
-
chunk_processing: Optional[ChunkProcessingParam]
|
17
|
-
"""Controls the setting for the chunking and post-processing of each chunk."""
|
18
|
-
|
19
|
-
error_handling: Optional[Literal["Fail", "Continue"]]
|
20
|
-
"""Controls how errors are handled during processing:
|
21
|
-
|
22
|
-
- `Fail`: Stops processing and fails the task when any error occurs
|
23
|
-
- `Continue`: Attempts to continue processing despite non-critical errors (e.g.
|
24
|
-
LLM refusals etc.)
|
25
|
-
"""
|
26
|
-
|
27
|
-
expires_in: Optional[int]
|
28
|
-
"""
|
29
|
-
The number of seconds until task is deleted. Expired tasks can **not** be
|
30
|
-
updated, polled or accessed via web interface.
|
31
|
-
"""
|
32
|
-
|
33
|
-
high_resolution: Optional[bool]
|
34
|
-
"""Whether to use high-resolution images for cropping and post-processing.
|
35
|
-
|
36
|
-
(Latency penalty: ~7 seconds per page)
|
37
|
-
"""
|
38
|
-
|
39
|
-
llm_processing: Optional[LlmProcessingParam]
|
40
|
-
"""Controls the LLM used for the task."""
|
41
|
-
|
42
|
-
ocr_strategy: Optional[Literal["All", "Auto"]]
|
43
|
-
"""Controls the Optical Character Recognition (OCR) strategy.
|
44
|
-
|
45
|
-
- `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
|
46
|
-
- `Auto`: Selectively applies OCR only to pages with missing or low-quality
|
47
|
-
text. When text layer is present the bounding boxes from the text layer are
|
48
|
-
used.
|
49
|
-
"""
|
50
|
-
|
51
|
-
pipeline: Optional[Literal["Azure", "Chunkr"]]
|
52
|
-
"""
|
53
|
-
Choose the provider whose models will be used for segmentation and OCR. The
|
54
|
-
output will be unified to the Chunkr `output` format.
|
55
|
-
"""
|
56
|
-
|
57
|
-
segment_processing: Optional[SegmentProcessingParam]
|
58
|
-
"""Defines how each segment type is handled when generating the final output.
|
59
|
-
|
60
|
-
Each segment uses one of three strategies. The chosen strategy controls: •
|
61
|
-
Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`). • How the
|
62
|
-
content is produced (rule-based vs. LLM). • The output format (`Html` or
|
63
|
-
`Markdown`).
|
64
|
-
|
65
|
-
Optional flags such as image **cropping**, **extended context**, and **LLM
|
66
|
-
descriptions** further refine behaviour.
|
67
|
-
|
68
|
-
---
|
69
|
-
|
70
|
-
**Default strategy per segment** • `Title`, `SectionHeader`, `Text`, `ListItem`,
|
71
|
-
`Caption`, `Footnote` → **Auto** (Markdown) • `Table` → **LLM** (HTML,
|
72
|
-
description on) • `Picture` → **LLM** (Markdown, description on, cropping _All_)
|
73
|
-
• `Formula`, `Page` → **LLM** (Markdown) • `PageHeader`, `PageFooter` →
|
74
|
-
**Ignore** (removed from output)
|
75
|
-
|
76
|
-
---
|
77
|
-
|
78
|
-
**Strategy reference** • **Auto** – rule-based content generation. • **LLM** –
|
79
|
-
generate content with an LLM. • **Ignore** – exclude the segment entirely.
|
80
|
-
"""
|
81
|
-
|
82
|
-
segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]]
|
83
|
-
"""Controls the segmentation strategy:
|
84
|
-
|
85
|
-
- `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
|
86
|
-
`Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
|
87
|
-
segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
|
88
|
-
- `Page`: Treats each page as a single segment. Faster processing, but without
|
89
|
-
layout element detection and only simple chunking.
|
90
|
-
"""
|
File without changes
|
File without changes
|