chunkr-ai 0.0.45__tar.gz → 0.0.46__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {chunkr_ai-0.0.45/src/chunkr_ai.egg-info → chunkr_ai-0.0.46}/PKG-INFO +1 -1
- {chunkr_ai-0.0.45 → chunkr_ai-0.0.46}/pyproject.toml +1 -1
- {chunkr_ai-0.0.45 → chunkr_ai-0.0.46}/src/chunkr_ai/api/configuration.py +98 -9
- {chunkr_ai-0.0.45 → chunkr_ai-0.0.46}/src/chunkr_ai/models.py +4 -0
- {chunkr_ai-0.0.45 → chunkr_ai-0.0.46/src/chunkr_ai.egg-info}/PKG-INFO +1 -1
- {chunkr_ai-0.0.45 → chunkr_ai-0.0.46}/tests/test_chunkr.py +133 -0
- {chunkr_ai-0.0.45 → chunkr_ai-0.0.46}/LICENSE +0 -0
- {chunkr_ai-0.0.45 → chunkr_ai-0.0.46}/README.md +0 -0
- {chunkr_ai-0.0.45 → chunkr_ai-0.0.46}/setup.cfg +0 -0
- {chunkr_ai-0.0.45 → chunkr_ai-0.0.46}/src/chunkr_ai/__init__.py +0 -0
- {chunkr_ai-0.0.45 → chunkr_ai-0.0.46}/src/chunkr_ai/api/__init__.py +0 -0
- {chunkr_ai-0.0.45 → chunkr_ai-0.0.46}/src/chunkr_ai/api/auth.py +0 -0
- {chunkr_ai-0.0.45 → chunkr_ai-0.0.46}/src/chunkr_ai/api/chunkr.py +0 -0
- {chunkr_ai-0.0.45 → chunkr_ai-0.0.46}/src/chunkr_ai/api/chunkr_base.py +0 -0
- {chunkr_ai-0.0.45 → chunkr_ai-0.0.46}/src/chunkr_ai/api/decorators.py +0 -0
- {chunkr_ai-0.0.45 → chunkr_ai-0.0.46}/src/chunkr_ai/api/misc.py +0 -0
- {chunkr_ai-0.0.45 → chunkr_ai-0.0.46}/src/chunkr_ai/api/protocol.py +0 -0
- {chunkr_ai-0.0.45 → chunkr_ai-0.0.46}/src/chunkr_ai/api/task_response.py +0 -0
- {chunkr_ai-0.0.45 → chunkr_ai-0.0.46}/src/chunkr_ai.egg-info/SOURCES.txt +0 -0
- {chunkr_ai-0.0.45 → chunkr_ai-0.0.46}/src/chunkr_ai.egg-info/dependency_links.txt +0 -0
- {chunkr_ai-0.0.45 → chunkr_ai-0.0.46}/src/chunkr_ai.egg-info/requires.txt +0 -0
- {chunkr_ai-0.0.45 → chunkr_ai-0.0.46}/src/chunkr_ai.egg-info/top_level.txt +0 -0
{chunkr_ai-0.0.45 → chunkr_ai-0.0.46}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "chunkr-ai"
-version = "0.0.45"
+version = "0.0.46"
 authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
 description = "Python client for Chunkr: open source document intelligence"
 readme = "README.md"

{chunkr_ai-0.0.45 → chunkr_ai-0.0.46}/src/chunkr_ai/api/configuration.py

@@ -1,7 +1,7 @@
 from pydantic import BaseModel, Field, ConfigDict
 from enum import Enum
 from typing import Any, List, Optional, Union
-from pydantic import field_validator
+from pydantic import field_validator, field_serializer
 
 class GenerationStrategy(str, Enum):
     LLM = "LLM"

@@ -65,11 +65,7 @@ class TokenizerType(BaseModel):
             return f"string:{self.string_value}"
         return ""
 
-    model_config = ConfigDict(
-        json_encoders={
-            'TokenizerType': lambda v: v.model_dump()
-        }
-    )
+    model_config = ConfigDict()
 
     def model_dump(self, **kwargs):
         if self.enum_value is not None:

@@ -85,10 +81,13 @@ class ChunkProcessing(BaseModel):
 
     model_config = ConfigDict(
         arbitrary_types_allowed=True,
-        json_encoders={
-            TokenizerType: lambda v: v.model_dump()
-        }
     )
+
+    @field_serializer('tokenizer')
+    def serialize_tokenizer(self, tokenizer: Optional[TokenizerType], _info):
+        if tokenizer is None:
+            return None
+        return tokenizer.model_dump()
 
     @field_validator('tokenizer', mode='before')
     def validate_tokenizer(cls, v):

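The two hunks above migrate TokenizerType and ChunkProcessing off pydantic v1's deprecated `json_encoders` config and onto a v2 `field_serializer`. A minimal, self-contained sketch of that pattern, using generic hypothetical names (`Inner`, `Outer`) rather than the package's own classes:

```python
from typing import Optional
from pydantic import BaseModel, ConfigDict, field_serializer

class Inner(BaseModel):
    value: str

class Outer(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)
    inner: Optional[Inner] = None

    # Replaces json_encoders={Inner: lambda v: ...}: the serializer is called
    # whenever Outer is dumped, including model_dump() and model_dump_json().
    @field_serializer('inner')
    def serialize_inner(self, inner: Optional[Inner], _info):
        if inner is None:
            return None
        return inner.value  # custom shape instead of the default nested dict

print(Outer(inner=Inner(value="word")).model_dump())  # {'inner': 'word'}
```
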
@@ -130,6 +129,95 @@ class ErrorHandlingStrategy(str, Enum):
     FAIL = "Fail"
     CONTINUE = "Continue"
 
+class FallbackStrategy(BaseModel):
+    type: str
+    model_id: Optional[str] = None
+
+    @classmethod
+    def none(cls) -> "FallbackStrategy":
+        return cls(type="None")
+
+    @classmethod
+    def default(cls) -> "FallbackStrategy":
+        return cls(type="Default")
+
+    @classmethod
+    def model(cls, model_id: str) -> "FallbackStrategy":
+        return cls(type="Model", model_id=model_id)
+
+    def __str__(self) -> str:
+        if self.type == "Model":
+            return f"Model({self.model_id})"
+        return self.type
+
+    def model_dump(self, **kwargs):
+        if self.type == "Model":
+            return {"Model": self.model_id}
+        return self.type
+
+    @field_validator('type')
+    def validate_type(cls, v):
+        if v not in ["None", "Default", "Model"]:
+            raise ValueError(f"Invalid fallback strategy: {v}")
+        return v
+
+    model_config = ConfigDict()
+
+    @classmethod
+    def model_validate(cls, obj):
+        # Handle string values like "None" or "Default"
+        if isinstance(obj, str):
+            if obj in ["None", "Default"]:
+                return cls(type=obj)
+            # Try to parse as Enum value if it's not a direct match
+            try:
+                return cls(type=obj)
+            except ValueError:
+                pass  # Let it fall through to normal validation
+
+        # Handle dictionary format like {"Model": "model-id"}
+        elif isinstance(obj, dict) and len(obj) == 1:
+            if "Model" in obj:
+                return cls(type="Model", model_id=obj["Model"])
+
+        # Fall back to normal validation
+        return super().model_validate(obj)
+
+class LlmProcessing(BaseModel):
+    model_id: Optional[str] = None
+    fallback_strategy: FallbackStrategy = Field(default_factory=FallbackStrategy.default)
+    max_completion_tokens: Optional[int] = None
+    temperature: float = 0.0
+
+    model_config = ConfigDict()
+
+    @field_serializer('fallback_strategy')
+    def serialize_fallback_strategy(self, fallback_strategy: FallbackStrategy, _info):
+        return fallback_strategy.model_dump()
+
+    @field_validator('fallback_strategy', mode='before')
+    def validate_fallback_strategy(cls, v):
+        if isinstance(v, str):
+            if v == "None":
+                return FallbackStrategy.none()
+            elif v == "Default":
+                return FallbackStrategy.default()
+            # Try to parse as a model ID if it's not None or Default
+            try:
+                return FallbackStrategy.model(v)
+            except ValueError:
+                pass  # Let it fall through to normal validation
+        # Handle dictionary format like {"Model": "model-id"}
+        elif isinstance(v, dict) and len(v) == 1:
+            if "Model" in v:
+                return FallbackStrategy.model(v["Model"])
+            elif "None" in v or v.get("None") is None:
+                return FallbackStrategy.none()
+            elif "Default" in v or v.get("Default") is None:
+                return FallbackStrategy.default()
+
+        return v
+
 class BoundingBox(BaseModel):
     left: float
     top: float

@@ -199,6 +287,7 @@ class Configuration(BaseModel):
     segment_processing: Optional[SegmentProcessing] = None
     segmentation_strategy: Optional[SegmentationStrategy] = None
     pipeline: Optional[Pipeline] = None
+    llm_processing: Optional[LlmProcessing] = None
 
 class OutputConfiguration(Configuration):
     input_file_url: Optional[str] = None

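Taken together, the configuration.py changes add an `LlmProcessing` model whose `FallbackStrategy` serializes to either a bare string ("None"/"Default") or a `{"Model": "<id>"}` mapping, and wire it into `Configuration`. A short usage sketch against the classes as they appear in this diff; the model IDs are the illustrative ones used in the tests and may or may not be valid for a given deployment:

```python
from chunkr_ai.api.configuration import Configuration, FallbackStrategy, LlmProcessing

config = Configuration(
    llm_processing=LlmProcessing(
        model_id="gemini-pro-2.5",                                   # illustrative model ID
        fallback_strategy=FallbackStrategy.model("claude-3.7-sonnet"),
        max_completion_tokens=500,
        temperature=0.2,
    )
)

# FallbackStrategy serializes to a string or a {"Model": ...} mapping.
print(config.llm_processing.fallback_strategy.model_dump())  # {'Model': 'claude-3.7-sonnet'}
print(FallbackStrategy.default().model_dump())                # 'Default'

# Strings and mappings are also accepted on input via the mode='before' validator.
print(LlmProcessing(fallback_strategy="None").fallback_strategy)                 # None
print(LlmProcessing(fallback_strategy={"Model": "gpt-4.1"}).fallback_strategy)   # Model(gpt-4.1)
```
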
{chunkr_ai-0.0.45 → chunkr_ai-0.0.46}/src/chunkr_ai/models.py

@@ -6,8 +6,10 @@ from .api.configuration import (
     CroppingStrategy,
     EmbedSource,
     ErrorHandlingStrategy,
+    FallbackStrategy,
    GenerationStrategy,
     GenerationConfig,
+    LlmProcessing,
     Model,
     OCRResult,
     OcrStrategy,

@@ -31,8 +33,10 @@ __all__ = [
     "CroppingStrategy",
     "EmbedSource",
     "ErrorHandlingStrategy",
+    "FallbackStrategy",
     "GenerationConfig",
     "GenerationStrategy",
+    "LlmProcessing",
     "Model",
     "OCRResult",
     "OcrStrategy",

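With these re-exports, the new types are importable from the package's top-level models module. A quick sanity check, assuming the 0.0.46 release is installed:

```python
from chunkr_ai.models import FallbackStrategy, LlmProcessing

print(FallbackStrategy.model("gpt-4.1"))   # Model(gpt-4.1)
print(LlmProcessing().fallback_strategy)   # Default (from the default_factory)
```
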
{chunkr_ai-0.0.45 → chunkr_ai-0.0.46}/tests/test_chunkr.py

@@ -18,6 +18,8 @@ from chunkr_ai.models import (
     EmbedSource,
     ErrorHandlingStrategy,
     Tokenizer,
+    LlmProcessing,
+    FallbackStrategy,
 )
 
 @pytest.fixture

@@ -121,6 +123,39 @@ def xlm_roberta_with_html_content_config():
         ),
     )
 
+@pytest.fixture
+def none_fallback_config():
+    return Configuration(
+        llm_processing=LlmProcessing(
+            model_id="gemini-pro-2.5",
+            fallback_strategy=FallbackStrategy.none(),
+            max_completion_tokens=500,
+            temperature=0.2
+        ),
+    )
+
+@pytest.fixture
+def default_fallback_config():
+    return Configuration(
+        llm_processing=LlmProcessing(
+            model_id="gemini-pro-2.5",
+            fallback_strategy=FallbackStrategy.default(),
+            max_completion_tokens=1000,
+            temperature=0.5
+        ),
+    )
+
+@pytest.fixture
+def model_fallback_config():
+    return Configuration(
+        llm_processing=LlmProcessing(
+            model_id="gemini-pro-2.5",
+            fallback_strategy=FallbackStrategy.model("claude-3.7-sonnet"),
+            max_completion_tokens=2000,
+            temperature=0.7
+        ),
+    )
+
 @pytest.mark.asyncio
 async def test_send_file_path(client, sample_path):
     response = await client.upload(sample_path)

@@ -451,3 +486,101 @@ async def test_error_handling_continue(client, sample_path):
     assert response.task_id is not None
     assert response.status == "Succeeded"
     assert response.output is not None
+
+@pytest.mark.asyncio
+async def test_llm_processing_none_fallback(client, sample_path, none_fallback_config):
+    response = await client.upload(sample_path, none_fallback_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+    assert response.configuration.llm_processing is not None
+    assert response.configuration.llm_processing.model_id == "gemini-pro-2.5"
+    assert str(response.configuration.llm_processing.fallback_strategy) == "None"
+    assert response.configuration.llm_processing.max_completion_tokens == 500
+    assert response.configuration.llm_processing.temperature == 0.2
+
+@pytest.mark.asyncio
+async def test_llm_processing_default_fallback(client, sample_path, default_fallback_config):
+    response = await client.upload(sample_path, default_fallback_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+    assert response.configuration.llm_processing is not None
+    assert response.configuration.llm_processing.model_id == "gemini-pro-2.5"
+    # The service may resolve Default to an actual model
+    assert response.configuration.llm_processing.fallback_strategy is not None
+    assert response.configuration.llm_processing.max_completion_tokens == 1000
+    assert response.configuration.llm_processing.temperature == 0.5
+
+@pytest.mark.asyncio
+async def test_llm_processing_model_fallback(client, sample_path, model_fallback_config):
+    response = await client.upload(sample_path, model_fallback_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+    assert response.configuration.llm_processing is not None
+    assert response.configuration.llm_processing.model_id == "gemini-pro-2.5"
+    assert str(response.configuration.llm_processing.fallback_strategy) == "Model(claude-3.7-sonnet)"
+    assert response.configuration.llm_processing.max_completion_tokens == 2000
+    assert response.configuration.llm_processing.temperature == 0.7
+
+@pytest.mark.asyncio
+async def test_llm_custom_model(client, sample_path):
+    config = Configuration(
+        llm_processing=LlmProcessing(
+            model_id="claude-3.7-sonnet",  # Using a model from models.yaml
+            fallback_strategy=FallbackStrategy.none(),
+            max_completion_tokens=1500,
+            temperature=0.3
+        ),
+    )
+    response = await client.upload(sample_path, config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+    assert response.configuration.llm_processing is not None
+    assert response.configuration.llm_processing.model_id == "claude-3.7-sonnet"
+
+@pytest.mark.asyncio
+async def test_fallback_strategy_serialization():
+    # Test that FallbackStrategy objects serialize correctly
+    none_strategy = FallbackStrategy.none()
+    default_strategy = FallbackStrategy.default()
+    model_strategy = FallbackStrategy.model("gpt-4.1")
+
+    assert none_strategy.model_dump() == "None"
+    assert default_strategy.model_dump() == "Default"
+    assert model_strategy.model_dump() == {"Model": "gpt-4.1"}
+
+    # Test string representation
+    assert str(none_strategy) == "None"
+    assert str(default_strategy) == "Default"
+    assert str(model_strategy) == "Model(gpt-4.1)"
+
+@pytest.mark.asyncio
+async def test_combined_config_with_llm_and_other_settings(client, sample_path):
+    # Test combining LLM settings with other configuration options
+    config = Configuration(
+        llm_processing=LlmProcessing(
+            model_id="qwen-2.5-vl-7b-instruct",
+            fallback_strategy=FallbackStrategy.model("gemini-flash-2.0"),
+            temperature=0.4
+        ),
+        segmentation_strategy=SegmentationStrategy.PAGE,
+        segment_processing=SegmentProcessing(
+            page=GenerationConfig(
+                html=GenerationStrategy.LLM,
+                markdown=GenerationStrategy.LLM
+            )
+        ),
+        chunk_processing=ChunkProcessing(target_length=1024)
+    )
+
+    response = await client.upload(sample_path, config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+    assert response.configuration.llm_processing is not None
+    assert response.configuration.llm_processing.model_id == "qwen-2.5-vl-7b-instruct"
+    assert response.configuration.segmentation_strategy == SegmentationStrategy.PAGE
+    assert response.configuration.chunk_processing.target_length == 1024

All other files listed above are unchanged between 0.0.45 and 0.0.46.