chunkr-ai 0.0.44__tar.gz → 0.0.46__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22)
  1. {chunkr_ai-0.0.44/src/chunkr_ai.egg-info → chunkr_ai-0.0.46}/PKG-INFO +1 -1
  2. {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/pyproject.toml +1 -1
  3. {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai/api/configuration.py +103 -9
  4. {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai/models.py +6 -0
  5. {chunkr_ai-0.0.44 → chunkr_ai-0.0.46/src/chunkr_ai.egg-info}/PKG-INFO +1 -1
  6. {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/tests/test_chunkr.py +142 -1
  7. {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/LICENSE +0 -0
  8. {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/README.md +0 -0
  9. {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/setup.cfg +0 -0
  10. {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai/__init__.py +0 -0
  11. {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai/api/__init__.py +0 -0
  12. {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai/api/auth.py +0 -0
  13. {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai/api/chunkr.py +0 -0
  14. {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai/api/chunkr_base.py +0 -0
  15. {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai/api/decorators.py +0 -0
  16. {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai/api/misc.py +0 -0
  17. {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai/api/protocol.py +0 -0
  18. {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai/api/task_response.py +0 -0
  19. {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai.egg-info/SOURCES.txt +0 -0
  20. {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai.egg-info/dependency_links.txt +0 -0
  21. {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai.egg-info/requires.txt +0 -0
  22. {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: chunkr-ai
-Version: 0.0.44
+Version: 0.0.46
 Summary: Python client for Chunkr: open source document intelligence
 Author-email: Ishaan Kapoor <ishaan@lumina.sh>
 License: MIT License
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "chunkr-ai"
-version = "0.0.44"
+version = "0.0.46"
 authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
 description = "Python client for Chunkr: open source document intelligence"
 readme = "README.md"
@@ -1,7 +1,7 @@
 from pydantic import BaseModel, Field, ConfigDict
 from enum import Enum
 from typing import Any, List, Optional, Union
-from pydantic import field_validator
+from pydantic import field_validator, field_serializer
 
 class GenerationStrategy(str, Enum):
     LLM = "LLM"
@@ -65,11 +65,7 @@ class TokenizerType(BaseModel):
             return f"string:{self.string_value}"
         return ""
 
-    model_config = ConfigDict(
-        json_encoders={
-            'TokenizerType': lambda v: v.model_dump()
-        }
-    )
+    model_config = ConfigDict()
 
     def model_dump(self, **kwargs):
         if self.enum_value is not None:
@@ -85,10 +81,13 @@ class ChunkProcessing(BaseModel):
 
     model_config = ConfigDict(
         arbitrary_types_allowed=True,
-        json_encoders={
-            TokenizerType: lambda v: v.model_dump()
-        }
     )
+
+    @field_serializer('tokenizer')
+    def serialize_tokenizer(self, tokenizer: Optional[TokenizerType], _info):
+        if tokenizer is None:
+            return None
+        return tokenizer.model_dump()
 
     @field_validator('tokenizer', mode='before')
     def validate_tokenizer(cls, v):
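For context on the two hunks above: Pydantic v2 deprecates the v1-style json_encoders config, and this release swaps it for a field_serializer on the tokenizer field. A minimal, self-contained sketch of the same pattern (the Tok and Chunking names are stand-ins for illustration, not classes from the package):

    # Sketch only: same tokenizer-to-dict behaviour, expressed with Pydantic v2's
    # field_serializer instead of the deprecated json_encoders.
    from typing import Optional
    from pydantic import BaseModel, ConfigDict, field_serializer

    class Tok(BaseModel):          # stand-in for TokenizerType
        name: str

    class Chunking(BaseModel):     # stand-in for ChunkProcessing
        model_config = ConfigDict(arbitrary_types_allowed=True)
        tokenizer: Optional[Tok] = None

        @field_serializer('tokenizer')
        def serialize_tokenizer(self, tokenizer: Optional[Tok], _info):
            # Runs during model_dump()/model_dump_json(), unlike json_encoders
            return None if tokenizer is None else tokenizer.model_dump()

    print(Chunking(tokenizer=Tok(name="cl100k")).model_dump())
    # {'tokenizer': {'name': 'cl100k'}}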
@@ -126,6 +125,99 @@ class SegmentationStrategy(str, Enum):
     LAYOUT_ANALYSIS = "LayoutAnalysis"
     PAGE = "Page"
 
+class ErrorHandlingStrategy(str, Enum):
+    FAIL = "Fail"
+    CONTINUE = "Continue"
+
+class FallbackStrategy(BaseModel):
+    type: str
+    model_id: Optional[str] = None
+
+    @classmethod
+    def none(cls) -> "FallbackStrategy":
+        return cls(type="None")
+
+    @classmethod
+    def default(cls) -> "FallbackStrategy":
+        return cls(type="Default")
+
+    @classmethod
+    def model(cls, model_id: str) -> "FallbackStrategy":
+        return cls(type="Model", model_id=model_id)
+
+    def __str__(self) -> str:
+        if self.type == "Model":
+            return f"Model({self.model_id})"
+        return self.type
+
+    def model_dump(self, **kwargs):
+        if self.type == "Model":
+            return {"Model": self.model_id}
+        return self.type
+
+    @field_validator('type')
+    def validate_type(cls, v):
+        if v not in ["None", "Default", "Model"]:
+            raise ValueError(f"Invalid fallback strategy: {v}")
+        return v
+
+    model_config = ConfigDict()
+
+    @classmethod
+    def model_validate(cls, obj):
+        # Handle string values like "None" or "Default"
+        if isinstance(obj, str):
+            if obj in ["None", "Default"]:
+                return cls(type=obj)
+            # Try to parse as Enum value if it's not a direct match
+            try:
+                return cls(type=obj)
+            except ValueError:
+                pass  # Let it fall through to normal validation
+
+        # Handle dictionary format like {"Model": "model-id"}
+        elif isinstance(obj, dict) and len(obj) == 1:
+            if "Model" in obj:
+                return cls(type="Model", model_id=obj["Model"])
+
+        # Fall back to normal validation
+        return super().model_validate(obj)
+
+class LlmProcessing(BaseModel):
+    model_id: Optional[str] = None
+    fallback_strategy: FallbackStrategy = Field(default_factory=FallbackStrategy.default)
+    max_completion_tokens: Optional[int] = None
+    temperature: float = 0.0
+
+    model_config = ConfigDict()
+
+    @field_serializer('fallback_strategy')
+    def serialize_fallback_strategy(self, fallback_strategy: FallbackStrategy, _info):
+        return fallback_strategy.model_dump()
+
+    @field_validator('fallback_strategy', mode='before')
+    def validate_fallback_strategy(cls, v):
+        if isinstance(v, str):
+            if v == "None":
+                return FallbackStrategy.none()
+            elif v == "Default":
+                return FallbackStrategy.default()
+            # Try to parse as a model ID if it's not None or Default
+            try:
+                return FallbackStrategy.model(v)
+            except ValueError:
+                pass  # Let it fall through to normal validation
+        # Handle dictionary format like {"Model": "model-id"}
+        elif isinstance(v, dict) and len(v) == 1:
+            if "Model" in v:
+                return FallbackStrategy.model(v["Model"])
+            elif "None" in v or v.get("None") is None:
+                return FallbackStrategy.none()
+            elif "Default" in v or v.get("Default") is None:
+                return FallbackStrategy.default()
+
+        return v
+
 class BoundingBox(BaseModel):
     left: float
     top: float
@@ -189,11 +281,13 @@ class Pipeline(str, Enum):
 class Configuration(BaseModel):
     chunk_processing: Optional[ChunkProcessing] = None
     expires_in: Optional[int] = None
+    error_handling: Optional[ErrorHandlingStrategy] = None
     high_resolution: Optional[bool] = None
     ocr_strategy: Optional[OcrStrategy] = None
     segment_processing: Optional[SegmentProcessing] = None
     segmentation_strategy: Optional[SegmentationStrategy] = None
     pipeline: Optional[Pipeline] = None
+    llm_processing: Optional[LlmProcessing] = None
 
 class OutputConfiguration(Configuration):
     input_file_url: Optional[str] = None
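Taken together, the configuration.py changes add two new knobs to Configuration: error_handling and llm_processing. A hedged usage sketch follows; the field names and model ids come from the diff and the new tests, while the Chunkr client construction follows the package README and may differ in your setup:

    from chunkr_ai import Chunkr
    from chunkr_ai.models import (
        Configuration,
        ErrorHandlingStrategy,
        FallbackStrategy,
        LlmProcessing,
    )

    config = Configuration(
        error_handling=ErrorHandlingStrategy.CONTINUE,   # the other option is ErrorHandlingStrategy.FAIL
        llm_processing=LlmProcessing(
            model_id="gemini-pro-2.5",                   # example id taken from the new tests
            fallback_strategy=FallbackStrategy.model("claude-3.7-sonnet"),
            max_completion_tokens=1000,
            temperature=0.2,
        ),
    )

    chunkr = Chunkr()                                    # assumes CHUNKR_API_KEY is set in the environment
    task = chunkr.upload("document.pdf", config)         # also awaitable, as in the async tests below
    print(task.status)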
@@ -5,8 +5,11 @@ from .api.configuration import (
     Configuration,
     CroppingStrategy,
     EmbedSource,
+    ErrorHandlingStrategy,
+    FallbackStrategy,
     GenerationStrategy,
     GenerationConfig,
+    LlmProcessing,
     Model,
     OCRResult,
     OcrStrategy,
@@ -29,8 +32,11 @@ __all__ = [
     "Configuration",
     "CroppingStrategy",
     "EmbedSource",
+    "ErrorHandlingStrategy",
+    "FallbackStrategy",
     "GenerationConfig",
     "GenerationStrategy",
+    "LlmProcessing",
     "Model",
     "OCRResult",
     "OcrStrategy",
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: chunkr-ai
-Version: 0.0.44
+Version: 0.0.46
 Summary: Python client for Chunkr: open source document intelligence
 Author-email: Ishaan Kapoor <ishaan@lumina.sh>
 License: MIT License
@@ -16,7 +16,10 @@ from chunkr_ai.models import (
     ChunkProcessing,
     TaskResponse,
     EmbedSource,
+    ErrorHandlingStrategy,
     Tokenizer,
+    LlmProcessing,
+    FallbackStrategy,
 )
 
 @pytest.fixture
@@ -120,6 +123,39 @@ def xlm_roberta_with_html_content_config():
         ),
     )
 
+@pytest.fixture
+def none_fallback_config():
+    return Configuration(
+        llm_processing=LlmProcessing(
+            model_id="gemini-pro-2.5",
+            fallback_strategy=FallbackStrategy.none(),
+            max_completion_tokens=500,
+            temperature=0.2
+        ),
+    )
+
+@pytest.fixture
+def default_fallback_config():
+    return Configuration(
+        llm_processing=LlmProcessing(
+            model_id="gemini-pro-2.5",
+            fallback_strategy=FallbackStrategy.default(),
+            max_completion_tokens=1000,
+            temperature=0.5
+        ),
+    )
+
+@pytest.fixture
+def model_fallback_config():
+    return Configuration(
+        llm_processing=LlmProcessing(
+            model_id="gemini-pro-2.5",
+            fallback_strategy=FallbackStrategy.model("claude-3.7-sonnet"),
+            max_completion_tokens=2000,
+            temperature=0.7
+        ),
+    )
+
 @pytest.mark.asyncio
 async def test_send_file_path(client, sample_path):
     response = await client.upload(sample_path)
@@ -442,4 +478,109 @@ async def test_embed_sources_with_different_tokenizer(client, sample_path, xlm_r
     response = await client.upload(sample_path, xlm_roberta_with_html_content_config)
     assert response.task_id is not None
     assert response.status == "Succeeded"
-    assert response.output is not None
+    assert response.output is not None
+
+@pytest.mark.asyncio
+async def test_error_handling_continue(client, sample_path):
+    response = await client.upload(sample_path, Configuration(error_handling=ErrorHandlingStrategy.CONTINUE))
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+
+@pytest.mark.asyncio
+async def test_llm_processing_none_fallback(client, sample_path, none_fallback_config):
+    response = await client.upload(sample_path, none_fallback_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+    assert response.configuration.llm_processing is not None
+    assert response.configuration.llm_processing.model_id == "gemini-pro-2.5"
+    assert str(response.configuration.llm_processing.fallback_strategy) == "None"
+    assert response.configuration.llm_processing.max_completion_tokens == 500
+    assert response.configuration.llm_processing.temperature == 0.2
+
+@pytest.mark.asyncio
+async def test_llm_processing_default_fallback(client, sample_path, default_fallback_config):
+    response = await client.upload(sample_path, default_fallback_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+    assert response.configuration.llm_processing is not None
+    assert response.configuration.llm_processing.model_id == "gemini-pro-2.5"
+    # The service may resolve Default to an actual model
+    assert response.configuration.llm_processing.fallback_strategy is not None
+    assert response.configuration.llm_processing.max_completion_tokens == 1000
+    assert response.configuration.llm_processing.temperature == 0.5
+
+@pytest.mark.asyncio
+async def test_llm_processing_model_fallback(client, sample_path, model_fallback_config):
+    response = await client.upload(sample_path, model_fallback_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+    assert response.configuration.llm_processing is not None
+    assert response.configuration.llm_processing.model_id == "gemini-pro-2.5"
+    assert str(response.configuration.llm_processing.fallback_strategy) == "Model(claude-3.7-sonnet)"
+    assert response.configuration.llm_processing.max_completion_tokens == 2000
+    assert response.configuration.llm_processing.temperature == 0.7
+
+@pytest.mark.asyncio
+async def test_llm_custom_model(client, sample_path):
+    config = Configuration(
+        llm_processing=LlmProcessing(
+            model_id="claude-3.7-sonnet",  # Using a model from models.yaml
+            fallback_strategy=FallbackStrategy.none(),
+            max_completion_tokens=1500,
+            temperature=0.3
+        ),
+    )
+    response = await client.upload(sample_path, config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+    assert response.configuration.llm_processing is not None
+    assert response.configuration.llm_processing.model_id == "claude-3.7-sonnet"
+
+@pytest.mark.asyncio
+async def test_fallback_strategy_serialization():
+    # Test that FallbackStrategy objects serialize correctly
+    none_strategy = FallbackStrategy.none()
+    default_strategy = FallbackStrategy.default()
+    model_strategy = FallbackStrategy.model("gpt-4.1")
+
+    assert none_strategy.model_dump() == "None"
+    assert default_strategy.model_dump() == "Default"
+    assert model_strategy.model_dump() == {"Model": "gpt-4.1"}
+
+    # Test string representation
+    assert str(none_strategy) == "None"
+    assert str(default_strategy) == "Default"
+    assert str(model_strategy) == "Model(gpt-4.1)"
+
+@pytest.mark.asyncio
+async def test_combined_config_with_llm_and_other_settings(client, sample_path):
+    # Test combining LLM settings with other configuration options
+    config = Configuration(
+        llm_processing=LlmProcessing(
+            model_id="qwen-2.5-vl-7b-instruct",
+            fallback_strategy=FallbackStrategy.model("gemini-flash-2.0"),
+            temperature=0.4
+        ),
+        segmentation_strategy=SegmentationStrategy.PAGE,
+        segment_processing=SegmentProcessing(
+            page=GenerationConfig(
+                html=GenerationStrategy.LLM,
+                markdown=GenerationStrategy.LLM
+            )
+        ),
+        chunk_processing=ChunkProcessing(target_length=1024)
+    )
+
+    response = await client.upload(sample_path, config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+    assert response.configuration.llm_processing is not None
+    assert response.configuration.llm_processing.model_id == "qwen-2.5-vl-7b-instruct"
+    assert response.configuration.segmentation_strategy == SegmentationStrategy.PAGE
+    assert response.configuration.chunk_processing.target_length == 1024
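For reference, the dumped form an LlmProcessing config takes once the new serializers run. The values below follow directly from the model_dump logic added in configuration.py above; the exact payload the client puts on the wire is an assumption:

    from chunkr_ai.models import FallbackStrategy, LlmProcessing

    llm = LlmProcessing(
        model_id="gemini-pro-2.5",
        fallback_strategy=FallbackStrategy.model("claude-3.7-sonnet"),
        max_completion_tokens=500,
        temperature=0.2,
    )

    # The field_serializer flattens the nested FallbackStrategy into the
    # "None" / "Default" / {"Model": "<id>"} forms shown in the tests above.
    assert llm.model_dump() == {
        "model_id": "gemini-pro-2.5",
        "fallback_strategy": {"Model": "claude-3.7-sonnet"},
        "max_completion_tokens": 500,
        "temperature": 0.2,
    }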