chunkr-ai 0.1.0a11__py3-none-any.whl → 0.1.0a12__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- chunkr_ai/_version.py +1 -1
- chunkr_ai/resources/tasks/parse.py +0 -9
- chunkr_ai/types/__init__.py +0 -2
- chunkr_ai/types/file_info.py +3 -0
- chunkr_ai/types/ocr_result.py +6 -6
- chunkr_ai/types/parse_configuration.py +0 -4
- chunkr_ai/types/parse_configuration_param.py +0 -4
- chunkr_ai/types/segment.py +8 -5
- chunkr_ai/types/segment_processing.py +92 -2
- chunkr_ai/types/segment_processing_param.py +92 -2
- chunkr_ai/types/task_response.py +8 -2
- chunkr_ai/types/tasks/extract_create_response.py +7 -1
- chunkr_ai/types/tasks/extract_get_response.py +7 -1
- chunkr_ai/types/tasks/parse_create_params.py +0 -4
- chunkr_ai/types/tasks/parse_create_response.py +6 -0
- chunkr_ai/types/tasks/parse_get_response.py +6 -0
- chunkr_ai/types/version_info.py +1 -1
- {chunkr_ai-0.1.0a11.dist-info → chunkr_ai-0.1.0a12.dist-info}/METADATA +1 -1
- {chunkr_ai-0.1.0a11.dist-info → chunkr_ai-0.1.0a12.dist-info}/RECORD +21 -23
- chunkr_ai/types/llm_processing.py +0 -36
- chunkr_ai/types/llm_processing_param.py +0 -36
- {chunkr_ai-0.1.0a11.dist-info → chunkr_ai-0.1.0a12.dist-info}/WHEEL +0 -0
- {chunkr_ai-0.1.0a11.dist-info → chunkr_ai-0.1.0a12.dist-info}/licenses/LICENSE +0 -0
chunkr_ai/_version.py
CHANGED

chunkr_ai/resources/tasks/parse.py
CHANGED
@@ -19,7 +19,6 @@ from ..._response import (
 )
 from ...types.tasks import parse_get_params, parse_create_params
 from ..._base_client import make_request_options
-from ...types.llm_processing_param import LlmProcessingParam
 from ...types.chunk_processing_param import ChunkProcessingParam
 from ...types.segment_processing_param import SegmentProcessingParam
 from ...types.tasks.parse_get_response import ParseGetResponse
@@ -56,7 +55,6 @@ class ParseResource(SyncAPIResource):
         error_handling: Literal["Fail", "Continue"] | Omit = omit,
         expires_in: Optional[int] | Omit = omit,
         file_name: Optional[str] | Omit = omit,
-        llm_processing: LlmProcessingParam | Omit = omit,
         ocr_strategy: Literal["All", "Auto"] | Omit = omit,
         pipeline: Literal["Azure", "Chunkr"] | Omit = omit,
         segment_processing: Optional[SegmentProcessingParam] | Omit = omit,
@@ -99,8 +97,6 @@ class ParseResource(SyncAPIResource):
 
           file_name: The name of the file to be parsed. If not set a name will be generated.
 
-          llm_processing: Controls the LLM used for the task.
-
           ocr_strategy: Controls the Optical Character Recognition (OCR) strategy.
 
           - `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
@@ -150,7 +146,6 @@ class ParseResource(SyncAPIResource):
                     "error_handling": error_handling,
                     "expires_in": expires_in,
                     "file_name": file_name,
-                    "llm_processing": llm_processing,
                     "ocr_strategy": ocr_strategy,
                     "pipeline": pipeline,
                     "segment_processing": segment_processing,
@@ -256,7 +251,6 @@ class AsyncParseResource(AsyncAPIResource):
         error_handling: Literal["Fail", "Continue"] | Omit = omit,
         expires_in: Optional[int] | Omit = omit,
         file_name: Optional[str] | Omit = omit,
-        llm_processing: LlmProcessingParam | Omit = omit,
         ocr_strategy: Literal["All", "Auto"] | Omit = omit,
         pipeline: Literal["Azure", "Chunkr"] | Omit = omit,
         segment_processing: Optional[SegmentProcessingParam] | Omit = omit,
@@ -299,8 +293,6 @@ class AsyncParseResource(AsyncAPIResource):
 
          file_name: The name of the file to be parsed. If not set a name will be generated.
 
-          llm_processing: Controls the LLM used for the task.
-
          ocr_strategy: Controls the Optical Character Recognition (OCR) strategy.
 
          - `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
@@ -350,7 +342,6 @@ class AsyncParseResource(AsyncAPIResource):
                     "error_handling": error_handling,
                     "expires_in": expires_in,
                     "file_name": file_name,
-                    "llm_processing": llm_processing,
                     "ocr_strategy": ocr_strategy,
                     "pipeline": pipeline,
                     "segment_processing": segment_processing,
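The practical impact of this file's change is that `llm_processing` is no longer a keyword argument on `tasks.parse.create()`, sync or async. A minimal migration sketch; the `Chunkr` client class and the `file=` argument are assumptions based on the SDK's usual layout, and only the removal of `llm_processing` is taken from this diff:

```python
# Hedged sketch: `Chunkr` and `file=` are assumed, not shown in this diff.
from chunkr_ai import Chunkr

client = Chunkr()

# 0.1.0a11 accepted an llm_processing mapping:
#   client.tasks.parse.create(file=..., llm_processing={"temperature": 0.0})
# 0.1.0a12 removes the parameter, so callers still passing it should expect a
# TypeError from the now keyword-only signature; simply drop the argument.
task = client.tasks.parse.create(
    file="https://example.com/report.pdf",
    ocr_strategy="Auto",        # still supported: "All" or "Auto"
    error_handling="Continue",  # still supported: "Fail" or "Continue"
)
```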
chunkr_ai/types/__init__.py
CHANGED
@@ -15,7 +15,6 @@ from .ocr_result import OcrResult as OcrResult
 from .bounding_box import BoundingBox as BoundingBox
 from .version_info import VersionInfo as VersionInfo
 from .task_response import TaskResponse as TaskResponse
-from .llm_processing import LlmProcessing as LlmProcessing
 from .file_url_params import FileURLParams as FileURLParams
 from .task_get_params import TaskGetParams as TaskGetParams
 from .chunk_processing import ChunkProcessing as ChunkProcessing
@@ -26,7 +25,6 @@ from .file_create_params import FileCreateParams as FileCreateParams
 from .segment_processing import SegmentProcessing as SegmentProcessing
 from .files_list_response import FilesListResponse as FilesListResponse
 from .parse_configuration import ParseConfiguration as ParseConfiguration
-from .llm_processing_param import LlmProcessingParam as LlmProcessingParam
 from .unwrap_webhook_event import UnwrapWebhookEvent as UnwrapWebhookEvent
 from .webhook_url_response import WebhookURLResponse as WebhookURLResponse
 from .extract_configuration import ExtractConfiguration as ExtractConfiguration
chunkr_ai/types/file_info.py
CHANGED
chunkr_ai/types/ocr_result.py
CHANGED
@@ -15,14 +15,14 @@ class OcrResult(BaseModel):
     text: str
     """The recognized text of the OCR result."""
 
-    cell_ref: Optional[str] = None
-    """
-    Excel-style cell reference (e.g., "A1" or "A1:B2") when OCR originates from a
-    spreadsheet cell
-    """
-
     confidence: Optional[float] = None
     """The confidence score of the recognized text."""
 
     ocr_id: Optional[str] = None
     """The unique identifier for the OCR result."""
+
+    ss_cell_ref: Optional[str] = None
+    """
+    Excel-style cell reference (e.g., "A1" or "A1:B2") when OCR originates from a
+    spreadsheet cell
+    """
chunkr_ai/types/parse_configuration.py
CHANGED
@@ -4,7 +4,6 @@ from typing import Optional
 from typing_extensions import Literal
 
 from .._models import BaseModel
-from .llm_processing import LlmProcessing
 from .chunk_processing import ChunkProcessing
 from .segment_processing import SegmentProcessing
 
@@ -23,9 +22,6 @@ class ParseConfiguration(BaseModel):
     LLM refusals etc.)
     """
 
-    llm_processing: Optional[LlmProcessing] = None
-    """Controls the LLM used for the task."""
-
     ocr_strategy: Optional[Literal["All", "Auto"]] = None
     """Controls the Optical Character Recognition (OCR) strategy.
 

chunkr_ai/types/parse_configuration_param.py
CHANGED
@@ -5,7 +5,6 @@ from __future__ import annotations
 from typing import Optional
 from typing_extensions import Literal, TypedDict
 
-from .llm_processing_param import LlmProcessingParam
 from .chunk_processing_param import ChunkProcessingParam
 from .segment_processing_param import SegmentProcessingParam
 
@@ -24,9 +23,6 @@ class ParseConfigurationParam(TypedDict, total=False):
     LLM refusals etc.)
     """
 
-    llm_processing: LlmProcessingParam
-    """Controls the LLM used for the task."""
-
     ocr_strategy: Literal["All", "Auto"]
     """Controls the Optical Character Recognition (OCR) strategy.
 
chunkr_ai/types/segment.py
CHANGED
@@ -31,20 +31,23 @@ class Segment(BaseModel):
         "Caption",
         "Footnote",
         "Formula",
+        "FormRegion",
+        "GraphicalItem",
+        "Legend",
+        "LineNumber",
         "ListItem",
         "Page",
         "PageFooter",
         "PageHeader",
+        "PageNumber",
         "Picture",
-        "SectionHeader",
         "Table",
         "Text",
         "Title",
+        "Unknown",
+        "SectionHeader",
     ]
-    """
-    All the possible types for a segment. Note: Different configurations will
-    produce different types. Please refer to the documentation for more information.
-    """
+    """All the possible types for a segment."""
 
     confidence: Optional[float] = None
     """Confidence score of the layout analysis model"""
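Code that matches exhaustively on segment types should account for the six literals this release adds: `FormRegion`, `GraphicalItem`, `Legend`, `LineNumber`, `PageNumber`, and `Unknown`. A small sketch; it assumes `Segment` is re-exported from `chunkr_ai.types` like the other models and that the Literal above is the model's `segment_type` field:

```python
from chunkr_ai.types import Segment  # assumed re-export, as with OcrResult

NEW_SEGMENT_TYPES = {
    "FormRegion", "GraphicalItem", "Legend",
    "LineNumber", "PageNumber", "Unknown",
}

def warn_on_new_types(segments: list[Segment]) -> None:
    # Dispatch tables written against 0.1.0a11 will have no branches for
    # these values, so flag them rather than silently dropping content.
    for i, seg in enumerate(segments):
        if seg.segment_type in NEW_SEGMENT_TYPES:
            print(f"segment #{i}: unhandled new type {seg.segment_type}")
```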
chunkr_ai/types/segment_processing.py
CHANGED
@@ -47,6 +47,24 @@ class SegmentProcessing(BaseModel):
     - `extended_context` uses the full page image as context for LLM generation.
     """
 
+    form_region: Optional[GenerationConfig] = FieldInfo(alias="FormRegion", default=None)
+    """Controls the processing and generation for the segment.
+
+    - `crop_image` controls whether to crop the file's images to the segment's
+      bounding box. The cropped image will be stored in the segment's `image` field.
+      Use `All` to always crop, or `Auto` to only crop when needed for
+      post-processing.
+    - `format` specifies the output format: `Html` or `Markdown`
+    - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
+      - `Auto`: Process content automatically
+      - `LLM`: Use large language models for processing
+      - `Ignore`: Exclude segments from final output
+    - `description` enables LLM-generated descriptions for segments. **Note:** This
+      uses chunkr's own VLM models and is not configurable via LLM processing
+      configuration.
+    - `extended_context` uses the full page image as context for LLM generation.
+    """
+
     formula: Optional[GenerationConfig] = FieldInfo(alias="Formula", default=None)
     """Controls the processing and generation for the segment.
 
@@ -65,6 +83,60 @@ class SegmentProcessing(BaseModel):
     - `extended_context` uses the full page image as context for LLM generation.
     """
 
+    graphical_item: Optional[GenerationConfig] = FieldInfo(alias="GraphicalItem", default=None)
+    """Controls the processing and generation for the segment.
+
+    - `crop_image` controls whether to crop the file's images to the segment's
+      bounding box. The cropped image will be stored in the segment's `image` field.
+      Use `All` to always crop, or `Auto` to only crop when needed for
+      post-processing.
+    - `format` specifies the output format: `Html` or `Markdown`
+    - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
+      - `Auto`: Process content automatically
+      - `LLM`: Use large language models for processing
+      - `Ignore`: Exclude segments from final output
+    - `description` enables LLM-generated descriptions for segments. **Note:** This
+      uses chunkr's own VLM models and is not configurable via LLM processing
+      configuration.
+    - `extended_context` uses the full page image as context for LLM generation.
+    """
+
+    legend: Optional[GenerationConfig] = FieldInfo(alias="Legend", default=None)
+    """Controls the processing and generation for the segment.
+
+    - `crop_image` controls whether to crop the file's images to the segment's
+      bounding box. The cropped image will be stored in the segment's `image` field.
+      Use `All` to always crop, or `Auto` to only crop when needed for
+      post-processing.
+    - `format` specifies the output format: `Html` or `Markdown`
+    - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
+      - `Auto`: Process content automatically
+      - `LLM`: Use large language models for processing
+      - `Ignore`: Exclude segments from final output
+    - `description` enables LLM-generated descriptions for segments. **Note:** This
+      uses chunkr's own VLM models and is not configurable via LLM processing
+      configuration.
+    - `extended_context` uses the full page image as context for LLM generation.
+    """
+
+    line_number: Optional[GenerationConfig] = FieldInfo(alias="LineNumber", default=None)
+    """Controls the processing and generation for the segment.
+
+    - `crop_image` controls whether to crop the file's images to the segment's
+      bounding box. The cropped image will be stored in the segment's `image` field.
+      Use `All` to always crop, or `Auto` to only crop when needed for
+      post-processing.
+    - `format` specifies the output format: `Html` or `Markdown`
+    - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
+      - `Auto`: Process content automatically
+      - `LLM`: Use large language models for processing
+      - `Ignore`: Exclude segments from final output
+    - `description` enables LLM-generated descriptions for segments. **Note:** This
+      uses chunkr's own VLM models and is not configurable via LLM processing
+      configuration.
+    - `extended_context` uses the full page image as context for LLM generation.
+    """
+
     list_item: Optional[GenerationConfig] = FieldInfo(alias="ListItem", default=None)
     """Controls the processing and generation for the segment.
 
@@ -137,7 +209,7 @@ class SegmentProcessing(BaseModel):
     - `extended_context` uses the full page image as context for LLM generation.
     """
 
-
+    page_number: Optional[GenerationConfig] = FieldInfo(alias="PageNumber", default=None)
     """Controls the processing and generation for the segment.
 
     - `crop_image` controls whether to crop the file's images to the segment's
@@ -155,7 +227,7 @@ class SegmentProcessing(BaseModel):
     - `extended_context` uses the full page image as context for LLM generation.
     """
 
-
+    picture: Optional[GenerationConfig] = FieldInfo(alias="Picture", default=None)
     """Controls the processing and generation for the segment.
 
    - `crop_image` controls whether to crop the file's images to the segment's
@@ -226,3 +298,21 @@ class SegmentProcessing(BaseModel):
       configuration.
     - `extended_context` uses the full page image as context for LLM generation.
     """
+
+    unknown: Optional[GenerationConfig] = FieldInfo(alias="Unknown", default=None)
+    """Controls the processing and generation for the segment.
+
+    - `crop_image` controls whether to crop the file's images to the segment's
+      bounding box. The cropped image will be stored in the segment's `image` field.
+      Use `All` to always crop, or `Auto` to only crop when needed for
+      post-processing.
+    - `format` specifies the output format: `Html` or `Markdown`
+    - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
+      - `Auto`: Process content automatically
+      - `LLM`: Use large language models for processing
+      - `Ignore`: Exclude segments from final output
+    - `description` enables LLM-generated descriptions for segments. **Note:** This
+      uses chunkr's own VLM models and is not configurable via LLM processing
+      configuration.
+    - `extended_context` uses the full page image as context for LLM generation.
+    """

chunkr_ai/types/segment_processing_param.py
CHANGED
@@ -48,6 +48,24 @@ class SegmentProcessingParam(TypedDict, total=False):
     - `extended_context` uses the full page image as context for LLM generation.
     """
 
+    form_region: Annotated[Optional[GenerationConfigParam], PropertyInfo(alias="FormRegion")]
+    """Controls the processing and generation for the segment.
+
+    - `crop_image` controls whether to crop the file's images to the segment's
+      bounding box. The cropped image will be stored in the segment's `image` field.
+      Use `All` to always crop, or `Auto` to only crop when needed for
+      post-processing.
+    - `format` specifies the output format: `Html` or `Markdown`
+    - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
+      - `Auto`: Process content automatically
+      - `LLM`: Use large language models for processing
+      - `Ignore`: Exclude segments from final output
+    - `description` enables LLM-generated descriptions for segments. **Note:** This
+      uses chunkr's own VLM models and is not configurable via LLM processing
+      configuration.
+    - `extended_context` uses the full page image as context for LLM generation.
+    """
+
     formula: Annotated[Optional[GenerationConfigParam], PropertyInfo(alias="Formula")]
     """Controls the processing and generation for the segment.
 
@@ -66,6 +84,60 @@ class SegmentProcessingParam(TypedDict, total=False):
     - `extended_context` uses the full page image as context for LLM generation.
     """
 
+    graphical_item: Annotated[Optional[GenerationConfigParam], PropertyInfo(alias="GraphicalItem")]
+    """Controls the processing and generation for the segment.
+
+    - `crop_image` controls whether to crop the file's images to the segment's
+      bounding box. The cropped image will be stored in the segment's `image` field.
+      Use `All` to always crop, or `Auto` to only crop when needed for
+      post-processing.
+    - `format` specifies the output format: `Html` or `Markdown`
+    - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
+      - `Auto`: Process content automatically
+      - `LLM`: Use large language models for processing
+      - `Ignore`: Exclude segments from final output
+    - `description` enables LLM-generated descriptions for segments. **Note:** This
+      uses chunkr's own VLM models and is not configurable via LLM processing
+      configuration.
+    - `extended_context` uses the full page image as context for LLM generation.
+    """
+
+    legend: Annotated[Optional[GenerationConfigParam], PropertyInfo(alias="Legend")]
+    """Controls the processing and generation for the segment.
+
+    - `crop_image` controls whether to crop the file's images to the segment's
+      bounding box. The cropped image will be stored in the segment's `image` field.
+      Use `All` to always crop, or `Auto` to only crop when needed for
+      post-processing.
+    - `format` specifies the output format: `Html` or `Markdown`
+    - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
+      - `Auto`: Process content automatically
+      - `LLM`: Use large language models for processing
+      - `Ignore`: Exclude segments from final output
+    - `description` enables LLM-generated descriptions for segments. **Note:** This
+      uses chunkr's own VLM models and is not configurable via LLM processing
+      configuration.
+    - `extended_context` uses the full page image as context for LLM generation.
+    """
+
+    line_number: Annotated[Optional[GenerationConfigParam], PropertyInfo(alias="LineNumber")]
+    """Controls the processing and generation for the segment.
+
+    - `crop_image` controls whether to crop the file's images to the segment's
+      bounding box. The cropped image will be stored in the segment's `image` field.
+      Use `All` to always crop, or `Auto` to only crop when needed for
+      post-processing.
+    - `format` specifies the output format: `Html` or `Markdown`
+    - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
+      - `Auto`: Process content automatically
+      - `LLM`: Use large language models for processing
+      - `Ignore`: Exclude segments from final output
+    - `description` enables LLM-generated descriptions for segments. **Note:** This
+      uses chunkr's own VLM models and is not configurable via LLM processing
+      configuration.
+    - `extended_context` uses the full page image as context for LLM generation.
+    """
+
     list_item: Annotated[Optional[GenerationConfigParam], PropertyInfo(alias="ListItem")]
     """Controls the processing and generation for the segment.
 
@@ -138,7 +210,7 @@ class SegmentProcessingParam(TypedDict, total=False):
     - `extended_context` uses the full page image as context for LLM generation.
     """
 
-
+    page_number: Annotated[Optional[GenerationConfigParam], PropertyInfo(alias="PageNumber")]
     """Controls the processing and generation for the segment.
 
     - `crop_image` controls whether to crop the file's images to the segment's
@@ -156,7 +228,7 @@ class SegmentProcessingParam(TypedDict, total=False):
     - `extended_context` uses the full page image as context for LLM generation.
     """
 
-
+    picture: Annotated[Optional[GenerationConfigParam], PropertyInfo(alias="Picture")]
     """Controls the processing and generation for the segment.
 
     - `crop_image` controls whether to crop the file's images to the segment's
@@ -227,3 +299,21 @@ class SegmentProcessingParam(TypedDict, total=False):
       configuration.
     - `extended_context` uses the full page image as context for LLM generation.
     """
+
+    unknown: Annotated[Optional[GenerationConfigParam], PropertyInfo(alias="Unknown")]
+    """Controls the processing and generation for the segment.
+
+    - `crop_image` controls whether to crop the file's images to the segment's
+      bounding box. The cropped image will be stored in the segment's `image` field.
+      Use `All` to always crop, or `Auto` to only crop when needed for
+      post-processing.
+    - `format` specifies the output format: `Html` or `Markdown`
+    - `strategy` determines how the content is generated: `Auto`, `LLM`, or `Ignore`
+      - `Auto`: Process content automatically
+      - `LLM`: Use large language models for processing
+      - `Ignore`: Exclude segments from final output
+    - `description` enables LLM-generated descriptions for segments. **Note:** This
+      uses chunkr's own VLM models and is not configurable via LLM processing
+      configuration.
+    - `extended_context` uses the full page image as context for LLM generation.
+    """
chunkr_ai/types/task_response.py
CHANGED
@@ -20,6 +20,12 @@ Output: TypeAlias = Union[ParseOutputResponse, ExtractOutputResponse, None]
 
 
 class TaskResponse(BaseModel):
+    completed: bool
+    """True when the task reaches a terminal state i.e.
+
+    `status` is `Succeeded` or `Failed` or `Cancelled`
+    """
+
     configuration: Configuration
     """
     Unified configuration type that can represent either parse or extract
@@ -58,8 +64,8 @@ class TaskResponse(BaseModel):
     output: Optional[Output] = None
     """Unified output type that can represent either parse or extract results"""
 
-
-    """The ID of the source task that was used for the task"""
+    parse_task_id: Optional[str] = None
+    """The ID of the source `parse` task that was used for the task"""
 
     started_at: Optional[datetime] = None
     """The date and time when the task was started."""

chunkr_ai/types/tasks/extract_create_response.py
CHANGED
@@ -14,6 +14,12 @@ __all__ = ["ExtractCreateResponse"]
 
 
 class ExtractCreateResponse(BaseModel):
+    completed: bool
+    """True when the task reaches a terminal state i.e.
+
+    `status` is `Succeeded` or `Failed` or `Cancelled`
+    """
+
     configuration: ExtractConfiguration
 
     created_at: datetime
@@ -57,7 +63,7 @@ class ExtractCreateResponse(BaseModel):
     for that field.
     """
 
-
+    parse_task_id: Optional[str] = None
     """The ID of the source `parse` task that was used for extraction"""
 
     started_at: Optional[datetime] = None

chunkr_ai/types/tasks/extract_get_response.py
CHANGED
@@ -14,6 +14,12 @@ __all__ = ["ExtractGetResponse"]
 
 
 class ExtractGetResponse(BaseModel):
+    completed: bool
+    """True when the task reaches a terminal state i.e.
+
+    `status` is `Succeeded` or `Failed` or `Cancelled`
+    """
+
     configuration: ExtractConfiguration
 
     created_at: datetime
@@ -57,7 +63,7 @@ class ExtractGetResponse(BaseModel):
     for that field.
     """
 
-
+    parse_task_id: Optional[str] = None
     """The ID of the source `parse` task that was used for extraction"""
 
     started_at: Optional[datetime] = None

chunkr_ai/types/tasks/parse_create_params.py
CHANGED
@@ -5,7 +5,6 @@ from __future__ import annotations
 from typing import Optional
 from typing_extensions import Literal, Required, TypedDict
 
-from ..llm_processing_param import LlmProcessingParam
 from ..chunk_processing_param import ChunkProcessingParam
 from ..segment_processing_param import SegmentProcessingParam
 
@@ -42,9 +41,6 @@ class ParseCreateParams(TypedDict, total=False):
     file_name: Optional[str]
     """The name of the file to be parsed. If not set a name will be generated."""
 
-    llm_processing: LlmProcessingParam
-    """Controls the LLM used for the task."""
-
     ocr_strategy: Literal["All", "Auto"]
     """Controls the Optical Character Recognition (OCR) strategy.
 

chunkr_ai/types/tasks/parse_create_response.py
CHANGED
@@ -14,6 +14,12 @@ __all__ = ["ParseCreateResponse"]
 
 
 class ParseCreateResponse(BaseModel):
+    completed: bool
+    """True when the task reaches a terminal state i.e.
+
+    `status` is `Succeeded` or `Failed` or `Cancelled`
+    """
+
     configuration: ParseConfiguration
 
     created_at: datetime

chunkr_ai/types/tasks/parse_get_response.py
CHANGED
@@ -14,6 +14,12 @@ __all__ = ["ParseGetResponse"]
 
 
 class ParseGetResponse(BaseModel):
+    completed: bool
+    """True when the task reaches a terminal state i.e.
+
+    `status` is `Succeeded` or `Failed` or `Cancelled`
+    """
+
     configuration: ParseConfiguration
 
     created_at: datetime
chunkr_ai/types/version_info.py
CHANGED
@@ -20,7 +20,7 @@ class ClientVersionGeneratedSDK(BaseModel):
     """Version of the auto-generated SDK"""
 
 
-ClientVersion: TypeAlias = Union[Literal["Legacy"], ClientVersionManualSDK, ClientVersionGeneratedSDK]
+ClientVersion: TypeAlias = Union[Literal["Legacy", "Unspecified"], ClientVersionManualSDK, ClientVersionGeneratedSDK]
 
 
 class VersionInfo(BaseModel):
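The `ClientVersion` union now also admits the `"Unspecified"` literal, so string comparisons written against 0.1.0a11 need a second branch. A hedged sketch; the `client_version` attribute name is an assumption, not shown in this hunk:

```python
from chunkr_ai.types import VersionInfo

def describe_client(info: VersionInfo) -> str:
    v = info.client_version  # assumed field name on VersionInfo
    if v in ("Legacy", "Unspecified"):  # "Unspecified" is new in 0.1.0a12
        return v
    # Otherwise v is a ClientVersionManualSDK / ClientVersionGeneratedSDK model.
    return getattr(v, "version", repr(v))
```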

{chunkr_ai-0.1.0a11.dist-info → chunkr_ai-0.1.0a12.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: chunkr-ai
-Version: 0.1.0a11
+Version: 0.1.0a12
 Summary: The official Python library for the chunkr API
 Project-URL: Homepage, https://github.com/lumina-ai-inc/chunkr-python
 Project-URL: Repository, https://github.com/lumina-ai-inc/chunkr-python

{chunkr_ai-0.1.0a11.dist-info → chunkr_ai-0.1.0a12.dist-info}/RECORD
CHANGED
@@ -11,7 +11,7 @@ chunkr_ai/_resource.py,sha256=f5tiwjxcKdbeMor8idoHtMFTUhqD9yc2xXtq5rqeLLk,1100
 chunkr_ai/_response.py,sha256=xXNpF53hiYARmAW7npKuxQ5UHAEjgAzm7ME_L3eIstY,28800
 chunkr_ai/_streaming.py,sha256=ZmyrVWk7-AWkLAATR55WgNxnyFzYmaqJt2LthA_PTqQ,10100
 chunkr_ai/_types.py,sha256=nzD_EEP9CVutLcSeuKLO6Mpn5cd_R0Vo0dEM7AWa7yY,7239
-chunkr_ai/_version.py,sha256=
+chunkr_ai/_version.py,sha256=qDyA1DMpmqGoQPNBAzGD_frtGPh6Bee1q-GXz3_l60c,170
 chunkr_ai/pagination.py,sha256=bT-ErcJ80YlKBV6tWq2s9uqg-wv7o66SKe_AgUAGrKc,3533
 chunkr_ai/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 chunkr_ai/_utils/__init__.py,sha256=7fch0GT9zpNnErbciSpUNa-SjTxxjY6kxHxKMOM4AGs,2305
@@ -33,9 +33,9 @@ chunkr_ai/resources/health.py,sha256=sLA4PSAf-4JK1Lrqb0TApQ0Hc5Q8fZzbKQXzA3bNEdQ
 chunkr_ai/resources/webhooks.py,sha256=RhteI3ahE2rGSDEMUZH0HDBTOQqWS8sZ5D00ErKWnpE,7006
 chunkr_ai/resources/tasks/__init__.py,sha256=wDCnwtnpTfiaLg7NBxoLZYh2TtOw44_DSqtJa_TjmXU,1439
 chunkr_ai/resources/tasks/extract.py,sha256=LQJTmYItqUu60G0hYfdw_nyMLARyxqzjVO6ETIn8hDo,14980
-chunkr_ai/resources/tasks/parse.py,sha256=
+chunkr_ai/resources/tasks/parse.py,sha256=hbFEFu-tU6RWktA0Tv6KP0HyeLdS62kO4UoQfbSsAJc,18963
 chunkr_ai/resources/tasks/tasks.py,sha256=W9bXpfgE56MkM5buBYg5-dcPYGFo_CzgVkr1kOOpXtQ,23582
-chunkr_ai/types/__init__.py,sha256=
+chunkr_ai/types/__init__.py,sha256=gO4mkpo_tYfe1PGSc0Uzlc8rZr9PTmHshGZFzmG98mM,2365
 chunkr_ai/types/bounding_box.py,sha256=JDZlhJJl4lg6RYGf8VpC46soQfQ10-K8YwHHA6XBFkM,431
 chunkr_ai/types/cell.py,sha256=D-S_XAzmOJs0Lo2RgY7T3h_ChdhSrRrI8IN4qo2sGOU,1143
 chunkr_ai/types/cell_style.py,sha256=VqSz6pZ7rjmHYrq_S63IOFPaWvXjWhNUIGc3V4UlF6U,873
@@ -47,7 +47,7 @@ chunkr_ai/types/extract_configuration.py,sha256=OCs3SnuS8qXWB926o8Gv1Y2AuNszplGm
 chunkr_ai/types/extract_output_response.py,sha256=kfkHbeEVl3x3t-7u4h4Cd4wC5KbrOjz4-joP5RV1WyA,1272
 chunkr_ai/types/file.py,sha256=kOxR0g-3A-qOxz2cjuTcq0wFMqPoph9uQuLYQ56zb-c,718
 chunkr_ai/types/file_create_params.py,sha256=_1Dr3FlO9BOv6gzhCN4g46_otCBqEdLe0mnxpdaRPaE,468
-chunkr_ai/types/file_info.py,sha256=
+chunkr_ai/types/file_info.py,sha256=y5kVR3mPeiwsPBMc3IBAigQbkVRpGQN09IpDPdvCe7E,611
 chunkr_ai/types/file_list_params.py,sha256=oJGTf88aAxBhNfmQDbxGT63b95HdSbMXUubKjXM22_U,822
 chunkr_ai/types/file_url.py,sha256=L434WnOXkNmt59dJiaAgT1_3pN3BIsxm2q14zHQK6xY,365
 chunkr_ai/types/file_url_params.py,sha256=ZHfKiy_6B25StdDemulavGcsPggNNMKLWf6KN7xfPTY,413
@@ -55,34 +55,32 @@ chunkr_ai/types/files_list_response.py,sha256=ggSRWhTzZWjcDXxStyCzrYICXXB5TqnL2j
 chunkr_ai/types/generation_config.py,sha256=9gfwdd228x29jC1egxq3IreKwgkGZCjSWHCXIkzQwqE,958
 chunkr_ai/types/generation_config_param.py,sha256=9E0Mhee-NInwOzjXmq3gpd8G5drsPBpzFs0AA2ywTc0,960
 chunkr_ai/types/health_check_response.py,sha256=6Zn5YYHCQf2RgMjDlf39mtiTPqfaBfC9Vv599U_rKCI,200
-chunkr_ai/types/llm_processing.py,sha256=
-chunkr_ai/types/llm_processing_param.py,sha256=CSnW4-5-32Pzoo-G7G3p_NUvljtCkNguj1dHVc2Y4cA,1135
-chunkr_ai/types/ocr_result.py,sha256=EdIvpuccQ_8A8ml7yVCOEOfBoewgwTBzVJZ_les9udM,740
+chunkr_ai/types/ocr_result.py,sha256=W3piXLotfmZ40FJrJYMuS72shaVYLsKeN6jCf26uZGI,743
 chunkr_ai/types/page.py,sha256=ADdGJisS-GxBD_wdu3q1pmikgJ7twFsP0choDEXw9ro,690
-chunkr_ai/types/parse_configuration.py,sha256=
-chunkr_ai/types/parse_configuration_param.py,sha256=
+chunkr_ai/types/parse_configuration.py,sha256=WcUUk7ai0sHTeWUQYHyDn1ZjYqT7dzLqRWjGx5zFfsg,2427
+chunkr_ai/types/parse_configuration_param.py,sha256=dl884XkCnloSS9YMk8UnUm7Z963HiAzNy5qgtCSIPH8,2405
 chunkr_ai/types/parse_output_response.py,sha256=KfRFY5PnchJfEWr4jy3Dd-3AWeImGE5BP_NMFC5I6_c,947
-chunkr_ai/types/segment.py,sha256=
-chunkr_ai/types/segment_processing.py,sha256=
-chunkr_ai/types/segment_processing_param.py,sha256=
+chunkr_ai/types/segment.py,sha256=KsGNynGQq7s55EHuPLY7glmvXunv2Wszhx5FhkhkN5U,3093
+chunkr_ai/types/segment_processing.py,sha256=uyNbxp7DsgTgBHoS9ELoyW8j-aTBbOshxYrb-TQ990E,17049
+chunkr_ai/types/segment_processing_param.py,sha256=a1Zk8NiaeFNSbHw5P8Usq-17mqENuZdCOQCn8nNN0o0,17199
 chunkr_ai/types/task_extract_updated_webhook_event.py,sha256=YYHDQEs4wg2bDgGXgHUgX_CwSLFxePJZrT5OV4J6Mhk,640
 chunkr_ai/types/task_get_params.py,sha256=Nx2luhebcoaiuRln4KP4FarWvBPd1OYi__efi56zHPM,460
 chunkr_ai/types/task_list_params.py,sha256=NySdOH1mIhZAJvcHr9xm2PeODsCO05lJMsrAiGGBKNE,1275
 chunkr_ai/types/task_parse_updated_webhook_event.py,sha256=3NsfEpJr_bfFB3Y66elraSxk0FS76c60BLUmhqmU9Vc,636
-chunkr_ai/types/task_response.py,sha256=
+chunkr_ai/types/task_response.py,sha256=RgyRaZK8TKjVfMSvsI10LYAv61QY2-195Tg3op8yCGo,2339
 chunkr_ai/types/unwrap_webhook_event.py,sha256=G23CLp0__AMlfM2xE11ugnDxN82uiG0Xru0p-pI9VHQ,442
-chunkr_ai/types/version_info.py,sha256=
+chunkr_ai/types/version_info.py,sha256=MVSU2Z9ATehyc1IgVExczvcP_yH7wYc1UV_BwXeF0UA,917
 chunkr_ai/types/webhook_url_response.py,sha256=q7VsWGOLqVfA_ctdcrbynQJVbfCGh1rHlXZsDc-9Sus,205
 chunkr_ai/types/tasks/__init__.py,sha256=AEF_lM5YdEvz8_7fcX0HHnVvsXdC8Hcsb2Cs-LzRBK4,711
 chunkr_ai/types/tasks/extract_create_params.py,sha256=IV5TrFqJAGFf4w_uH3hqWlbEySlAOC_2QzwKZ-3oM6o,1376
-chunkr_ai/types/tasks/extract_create_response.py,sha256=
+chunkr_ai/types/tasks/extract_create_response.py,sha256=goc8x-L3W0hJowb4PvXigc2o_p15JL0a2ESo9Geg9xc,2287
 chunkr_ai/types/tasks/extract_get_params.py,sha256=AsJvXHvdDnIcVOvTK9gCeiMFk4wckuv19IXIJcqpqso,466
-chunkr_ai/types/tasks/extract_get_response.py,sha256=
-chunkr_ai/types/tasks/parse_create_params.py,sha256=
-chunkr_ai/types/tasks/parse_create_response.py,sha256=
+chunkr_ai/types/tasks/extract_get_response.py,sha256=sQgDLTwpE2w-xVcuXO7NAF_kjhLXv_0swdBJQRGQNSI,2281
+chunkr_ai/types/tasks/parse_create_params.py,sha256=8ctOPP2QT-q_8zN8Fl8ene74ZGOUnR6EAA9XcvA_0p4,2957
+chunkr_ai/types/tasks/parse_create_response.py,sha256=l9hj6qKs76-qKzjBtVeo9lRe9wRuYltsh2GFNPyfEDM,1820
 chunkr_ai/types/tasks/parse_get_params.py,sha256=Ca0C91k6ajNTMhtUkFMulgC6g8_wI7YLVGxsWiupiVA,462
-chunkr_ai/types/tasks/parse_get_response.py,sha256=
-chunkr_ai-0.1.0a11.dist-info/METADATA,sha256=
-chunkr_ai-0.1.0a11.dist-info/WHEEL,sha256=
-chunkr_ai-0.1.0a11.dist-info/licenses/LICENSE,sha256=
-chunkr_ai-0.1.0a11.dist-info/RECORD,,
+chunkr_ai/types/tasks/parse_get_response.py,sha256=2IoZeN8BAxQEtxRq6CCA_d9nWPvCQbp71zMzaaKPlug,1814
+chunkr_ai-0.1.0a12.dist-info/METADATA,sha256=HUM4LJTDsqX9zgI_0QgVqHbZbHlpxwENau7AVKmVslg,16493
+chunkr_ai-0.1.0a12.dist-info/WHEEL,sha256=C2FUgwZgiLbznR-k0b_5k3Ai_1aASOXDss3lzCUsUug,87
+chunkr_ai-0.1.0a12.dist-info/licenses/LICENSE,sha256=3FDRL-L-DFkrFy8yJpb1Nxhuztm0PB2kawcCgK5utFg,11336
+chunkr_ai-0.1.0a12.dist-info/RECORD,,

chunkr_ai/types/llm_processing.py
DELETED
@@ -1,36 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from typing import Union, Optional
-from typing_extensions import Literal, TypeAlias
-
-from pydantic import Field as FieldInfo
-
-from .._models import BaseModel
-
-__all__ = ["LlmProcessing", "FallbackStrategy", "FallbackStrategyModel"]
-
-
-class FallbackStrategyModel(BaseModel):
-    model: str = FieldInfo(alias="Model")
-    """Use a specific model as fallback"""
-
-
-FallbackStrategy: TypeAlias = Union[Literal["None", "Default"], FallbackStrategyModel]
-
-
-class LlmProcessing(BaseModel):
-    fallback_strategy: Optional[FallbackStrategy] = None
-    """The fallback strategy to use for the LLMs in the task."""
-
-    llm_model_id: Optional[str] = None
-    """The ID of the model to use for the task.
-
-    If not provided, the default model will be used. Please check the documentation
-    for the model you want to use.
-    """
-
-    max_completion_tokens: Optional[int] = None
-    """The maximum number of tokens to generate."""
-
-    temperature: Optional[float] = None
-    """The temperature to use for the LLM."""

chunkr_ai/types/llm_processing_param.py
DELETED
@@ -1,36 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from __future__ import annotations
-
-from typing import Union, Optional
-from typing_extensions import Literal, Required, Annotated, TypeAlias, TypedDict
-
-from .._utils import PropertyInfo
-
-__all__ = ["LlmProcessingParam", "FallbackStrategy", "FallbackStrategyModel"]
-
-
-class FallbackStrategyModel(TypedDict, total=False):
-    model: Required[Annotated[str, PropertyInfo(alias="Model")]]
-    """Use a specific model as fallback"""
-
-
-FallbackStrategy: TypeAlias = Union[Literal["None", "Default"], FallbackStrategyModel]
-
-
-class LlmProcessingParam(TypedDict, total=False):
-    fallback_strategy: FallbackStrategy
-    """The fallback strategy to use for the LLMs in the task."""
-
-    llm_model_id: Optional[str]
-    """The ID of the model to use for the task.
-
-    If not provided, the default model will be used. Please check the documentation
-    for the model you want to use.
-    """
-
-    max_completion_tokens: Optional[int]
-    """The maximum number of tokens to generate."""
-
-    temperature: float
-    """The temperature to use for the LLM."""
{chunkr_ai-0.1.0a11.dist-info → chunkr_ai-0.1.0a12.dist-info}/WHEEL
File without changes

{chunkr_ai-0.1.0a11.dist-info → chunkr_ai-0.1.0a12.dist-info}/licenses/LICENSE
File without changes