chunkr-ai 0.0.44.tar.gz → 0.0.46.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {chunkr_ai-0.0.44/src/chunkr_ai.egg-info → chunkr_ai-0.0.46}/PKG-INFO +1 -1
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/pyproject.toml +1 -1
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai/api/configuration.py +103 -9
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai/models.py +6 -0
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46/src/chunkr_ai.egg-info}/PKG-INFO +1 -1
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/tests/test_chunkr.py +142 -1
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/LICENSE +0 -0
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/README.md +0 -0
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/setup.cfg +0 -0
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai/__init__.py +0 -0
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai/api/__init__.py +0 -0
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai/api/auth.py +0 -0
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai/api/chunkr.py +0 -0
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai/api/chunkr_base.py +0 -0
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai/api/decorators.py +0 -0
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai/api/misc.py +0 -0
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai/api/protocol.py +0 -0
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai/api/task_response.py +0 -0
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai.egg-info/SOURCES.txt +0 -0
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai.egg-info/dependency_links.txt +0 -0
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai.egg-info/requires.txt +0 -0
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai.egg-info/top_level.txt +0 -0
pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "chunkr-ai"
-version = "0.0.44"
+version = "0.0.46"
 authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
 description = "Python client for Chunkr: open source document intelligence"
 readme = "README.md"

src/chunkr_ai/api/configuration.py

@@ -1,7 +1,7 @@
 from pydantic import BaseModel, Field, ConfigDict
 from enum import Enum
 from typing import Any, List, Optional, Union
-from pydantic import field_validator
+from pydantic import field_validator, field_serializer
 
 class GenerationStrategy(str, Enum):
     LLM = "LLM"

@@ -65,11 +65,7 @@ class TokenizerType(BaseModel):
             return f"string:{self.string_value}"
         return ""
 
-    model_config = ConfigDict(
-        json_encoders={
-            'TokenizerType': lambda v: v.model_dump()
-        }
-    )
+    model_config = ConfigDict()
 
     def model_dump(self, **kwargs):
         if self.enum_value is not None:

@@ -85,10 +81,13 @@ class ChunkProcessing(BaseModel):
 
     model_config = ConfigDict(
         arbitrary_types_allowed=True,
-        json_encoders={
-            TokenizerType: lambda v: v.model_dump()
-        }
     )
+
+    @field_serializer('tokenizer')
+    def serialize_tokenizer(self, tokenizer: Optional[TokenizerType], _info):
+        if tokenizer is None:
+            return None
+        return tokenizer.model_dump()
 
     @field_validator('tokenizer', mode='before')
     def validate_tokenizer(cls, v):

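Context for the two hunks above: Pydantic v2 deprecates the `json_encoders` config key, and `@field_serializer` is its supported replacement, which is why both `json_encoders` blocks are dropped in 0.0.46. A minimal standalone sketch of the same pattern (the `Token`/`Doc` names here are illustrative, not from the package):

    from typing import Optional
    from pydantic import BaseModel, field_serializer

    class Token(BaseModel):
        name: str

    class Doc(BaseModel):
        tokenizer: Optional[Token] = None

        # Replaces the v1-style json_encoders={Token: lambda v: v.model_dump()}
        @field_serializer('tokenizer')
        def serialize_tokenizer(self, tokenizer: Optional[Token], _info):
            return None if tokenizer is None else tokenizer.model_dump()

    print(Doc(tokenizer=Token(name="cl100k")).model_dump_json())
    # -> {"tokenizer":{"name":"cl100k"}}
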
@@ -126,6 +125,99 @@ class SegmentationStrategy(str, Enum):
     LAYOUT_ANALYSIS = "LayoutAnalysis"
     PAGE = "Page"
 
+class ErrorHandlingStrategy(str, Enum):
+    FAIL = "Fail"
+    CONTINUE = "Continue"
+
+class FallbackStrategy(BaseModel):
+    type: str
+    model_id: Optional[str] = None
+
+    @classmethod
+    def none(cls) -> "FallbackStrategy":
+        return cls(type="None")
+
+    @classmethod
+    def default(cls) -> "FallbackStrategy":
+        return cls(type="Default")
+
+    @classmethod
+    def model(cls, model_id: str) -> "FallbackStrategy":
+        return cls(type="Model", model_id=model_id)
+
+    def __str__(self) -> str:
+        if self.type == "Model":
+            return f"Model({self.model_id})"
+        return self.type
+
+    def model_dump(self, **kwargs):
+        if self.type == "Model":
+            return {"Model": self.model_id}
+        return self.type
+
+    @field_validator('type')
+    def validate_type(cls, v):
+        if v not in ["None", "Default", "Model"]:
+            raise ValueError(f"Invalid fallback strategy: {v}")
+        return v
+
+    model_config = ConfigDict()
+
+    @classmethod
+    def model_validate(cls, obj):
+        # Handle string values like "None" or "Default"
+        if isinstance(obj, str):
+            if obj in ["None", "Default"]:
+                return cls(type=obj)
+            # Try to parse as Enum value if it's not a direct match
+            try:
+                return cls(type=obj)
+            except ValueError:
+                pass  # Let it fall through to normal validation
+
+        # Handle dictionary format like {"Model": "model-id"}
+        elif isinstance(obj, dict) and len(obj) == 1:
+            if "Model" in obj:
+                return cls(type="Model", model_id=obj["Model"])
+
+        # Fall back to normal validation
+        return super().model_validate(obj)
+
+class LlmProcessing(BaseModel):
+    model_id: Optional[str] = None
+    fallback_strategy: FallbackStrategy = Field(default_factory=FallbackStrategy.default)
+    max_completion_tokens: Optional[int] = None
+    temperature: float = 0.0
+
+    model_config = ConfigDict()
+
+    @field_serializer('fallback_strategy')
+    def serialize_fallback_strategy(self, fallback_strategy: FallbackStrategy, _info):
+        return fallback_strategy.model_dump()
+
+    @field_validator('fallback_strategy', mode='before')
+    def validate_fallback_strategy(cls, v):
+        if isinstance(v, str):
+            if v == "None":
+                return FallbackStrategy.none()
+            elif v == "Default":
+                return FallbackStrategy.default()
+            # Try to parse as a model ID if it's not None or Default
+            try:
+                return FallbackStrategy.model(v)
+            except ValueError:
+                pass  # Let it fall through to normal validation
+        # Handle dictionary format like {"Model": "model-id"}
+        elif isinstance(v, dict) and len(v) == 1:
+            if "Model" in v:
+                return FallbackStrategy.model(v["Model"])
+            elif "None" in v or v.get("None") is None:
+                return FallbackStrategy.none()
+            elif "Default" in v or v.get("Default") is None:
+                return FallbackStrategy.default()
+
+        return v
+
 class BoundingBox(BaseModel):
     left: float
     top: float

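Taken together, `FallbackStrategy.model_dump` and the `before`-mode validator on `LlmProcessing` round-trip the API's tagged representation: bare strings for "None"/"Default" and a single-key dict for "Model". A quick sketch using only the constructors defined above (it mirrors the assertions in `test_fallback_strategy_serialization` further down):

    from chunkr_ai.models import FallbackStrategy

    assert FallbackStrategy.none().model_dump() == "None"
    assert FallbackStrategy.default().model_dump() == "Default"
    assert FallbackStrategy.model("gpt-4.1").model_dump() == {"Model": "gpt-4.1"}

    # Values coming back from the server are accepted in the same shapes:
    #   "None" / "Default"   -> FallbackStrategy.none() / .default()
    #   {"Model": "gpt-4.1"} -> FallbackStrategy.model("gpt-4.1")
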
@@ -189,11 +281,13 @@ class Pipeline(str, Enum):
 class Configuration(BaseModel):
     chunk_processing: Optional[ChunkProcessing] = None
     expires_in: Optional[int] = None
+    error_handling: Optional[ErrorHandlingStrategy] = None
     high_resolution: Optional[bool] = None
     ocr_strategy: Optional[OcrStrategy] = None
     segment_processing: Optional[SegmentProcessing] = None
     segmentation_strategy: Optional[SegmentationStrategy] = None
     pipeline: Optional[Pipeline] = None
+    llm_processing: Optional[LlmProcessing] = None
 
 class OutputConfiguration(Configuration):
     input_file_url: Optional[str] = None

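The two new `Configuration` fields compose with the existing ones. A sketch of a combined upload call, modeled on the tests below; `client` and `sample_path` stand in for a configured Chunkr client and a local file:

    from chunkr_ai.models import (
        Configuration,
        ErrorHandlingStrategy,
        FallbackStrategy,
        LlmProcessing,
    )

    config = Configuration(
        # presumably: continue past per-segment errors rather than failing the task
        error_handling=ErrorHandlingStrategy.CONTINUE,
        llm_processing=LlmProcessing(
            model_id="gemini-pro-2.5",
            fallback_strategy=FallbackStrategy.model("claude-3.7-sonnet"),
            max_completion_tokens=2000,
            temperature=0.7,
        ),
    )
    # response = await client.upload(sample_path, config)
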
src/chunkr_ai/models.py

@@ -5,8 +5,11 @@ from .api.configuration import (
     Configuration,
     CroppingStrategy,
     EmbedSource,
+    ErrorHandlingStrategy,
+    FallbackStrategy,
     GenerationStrategy,
     GenerationConfig,
+    LlmProcessing,
     Model,
     OCRResult,
     OcrStrategy,

@@ -29,8 +32,11 @@ __all__ = [
     "Configuration",
     "CroppingStrategy",
     "EmbedSource",
+    "ErrorHandlingStrategy",
+    "FallbackStrategy",
     "GenerationConfig",
     "GenerationStrategy",
+    "LlmProcessing",
     "Model",
     "OCRResult",
     "OcrStrategy",

tests/test_chunkr.py

@@ -16,7 +16,10 @@ from chunkr_ai.models import (
     ChunkProcessing,
     TaskResponse,
     EmbedSource,
+    ErrorHandlingStrategy,
     Tokenizer,
+    LlmProcessing,
+    FallbackStrategy,
 )
 
 @pytest.fixture

@@ -120,6 +123,39 @@ def xlm_roberta_with_html_content_config():
         ),
     )
 
+@pytest.fixture
+def none_fallback_config():
+    return Configuration(
+        llm_processing=LlmProcessing(
+            model_id="gemini-pro-2.5",
+            fallback_strategy=FallbackStrategy.none(),
+            max_completion_tokens=500,
+            temperature=0.2
+        ),
+    )
+
+@pytest.fixture
+def default_fallback_config():
+    return Configuration(
+        llm_processing=LlmProcessing(
+            model_id="gemini-pro-2.5",
+            fallback_strategy=FallbackStrategy.default(),
+            max_completion_tokens=1000,
+            temperature=0.5
+        ),
+    )
+
+@pytest.fixture
+def model_fallback_config():
+    return Configuration(
+        llm_processing=LlmProcessing(
+            model_id="gemini-pro-2.5",
+            fallback_strategy=FallbackStrategy.model("claude-3.7-sonnet"),
+            max_completion_tokens=2000,
+            temperature=0.7
+        ),
+    )
+
 @pytest.mark.asyncio
 async def test_send_file_path(client, sample_path):
     response = await client.upload(sample_path)

@@ -442,4 +478,109 @@ async def test_embed_sources_with_different_tokenizer(client, sample_path, xlm_r
     response = await client.upload(sample_path, xlm_roberta_with_html_content_config)
     assert response.task_id is not None
     assert response.status == "Succeeded"
-    assert response.output is not None
+    assert response.output is not None
+
+@pytest.mark.asyncio
+async def test_error_handling_continue(client, sample_path):
+    response = await client.upload(sample_path, Configuration(error_handling=ErrorHandlingStrategy.CONTINUE))
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+
+@pytest.mark.asyncio
+async def test_llm_processing_none_fallback(client, sample_path, none_fallback_config):
+    response = await client.upload(sample_path, none_fallback_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+    assert response.configuration.llm_processing is not None
+    assert response.configuration.llm_processing.model_id == "gemini-pro-2.5"
+    assert str(response.configuration.llm_processing.fallback_strategy) == "None"
+    assert response.configuration.llm_processing.max_completion_tokens == 500
+    assert response.configuration.llm_processing.temperature == 0.2
+
+@pytest.mark.asyncio
+async def test_llm_processing_default_fallback(client, sample_path, default_fallback_config):
+    response = await client.upload(sample_path, default_fallback_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+    assert response.configuration.llm_processing is not None
+    assert response.configuration.llm_processing.model_id == "gemini-pro-2.5"
+    # The service may resolve Default to an actual model
+    assert response.configuration.llm_processing.fallback_strategy is not None
+    assert response.configuration.llm_processing.max_completion_tokens == 1000
+    assert response.configuration.llm_processing.temperature == 0.5
+
+@pytest.mark.asyncio
+async def test_llm_processing_model_fallback(client, sample_path, model_fallback_config):
+    response = await client.upload(sample_path, model_fallback_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+    assert response.configuration.llm_processing is not None
+    assert response.configuration.llm_processing.model_id == "gemini-pro-2.5"
+    assert str(response.configuration.llm_processing.fallback_strategy) == "Model(claude-3.7-sonnet)"
+    assert response.configuration.llm_processing.max_completion_tokens == 2000
+    assert response.configuration.llm_processing.temperature == 0.7
+
+@pytest.mark.asyncio
+async def test_llm_custom_model(client, sample_path):
+    config = Configuration(
+        llm_processing=LlmProcessing(
+            model_id="claude-3.7-sonnet",  # Using a model from models.yaml
+            fallback_strategy=FallbackStrategy.none(),
+            max_completion_tokens=1500,
+            temperature=0.3
+        ),
+    )
+    response = await client.upload(sample_path, config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+    assert response.configuration.llm_processing is not None
+    assert response.configuration.llm_processing.model_id == "claude-3.7-sonnet"
+
+@pytest.mark.asyncio
+async def test_fallback_strategy_serialization():
+    # Test that FallbackStrategy objects serialize correctly
+    none_strategy = FallbackStrategy.none()
+    default_strategy = FallbackStrategy.default()
+    model_strategy = FallbackStrategy.model("gpt-4.1")
+
+    assert none_strategy.model_dump() == "None"
+    assert default_strategy.model_dump() == "Default"
+    assert model_strategy.model_dump() == {"Model": "gpt-4.1"}
+
+    # Test string representation
+    assert str(none_strategy) == "None"
+    assert str(default_strategy) == "Default"
+    assert str(model_strategy) == "Model(gpt-4.1)"
+
+@pytest.mark.asyncio
+async def test_combined_config_with_llm_and_other_settings(client, sample_path):
+    # Test combining LLM settings with other configuration options
+    config = Configuration(
+        llm_processing=LlmProcessing(
+            model_id="qwen-2.5-vl-7b-instruct",
+            fallback_strategy=FallbackStrategy.model("gemini-flash-2.0"),
+            temperature=0.4
+        ),
+        segmentation_strategy=SegmentationStrategy.PAGE,
+        segment_processing=SegmentProcessing(
+            page=GenerationConfig(
+                html=GenerationStrategy.LLM,
+                markdown=GenerationStrategy.LLM
+            )
+        ),
+        chunk_processing=ChunkProcessing(target_length=1024)
+    )
+
+    response = await client.upload(sample_path, config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+    assert response.configuration.llm_processing is not None
+    assert response.configuration.llm_processing.model_id == "qwen-2.5-vl-7b-instruct"
+    assert response.configuration.segmentation_strategy == SegmentationStrategy.PAGE
+    assert response.configuration.chunk_processing.target_length == 1024

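For reference, the serializer hooks above mean the `model_fallback_config` payload should serialize along these lines (expected output sketched from the serializers, not captured from the API):

    LlmProcessing(
        model_id="gemini-pro-2.5",
        fallback_strategy=FallbackStrategy.model("claude-3.7-sonnet"),
        max_completion_tokens=2000,
        temperature=0.7,
    ).model_dump()
    # -> {'model_id': 'gemini-pro-2.5',
    #     'fallback_strategy': {'Model': 'claude-3.7-sonnet'},
    #     'max_completion_tokens': 2000,
    #     'temperature': 0.7}
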