chunkr-ai 0.0.44.tar.gz → 0.0.46.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {chunkr_ai-0.0.44/src/chunkr_ai.egg-info → chunkr_ai-0.0.46}/PKG-INFO +1 -1
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/pyproject.toml +1 -1
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai/api/configuration.py +103 -9
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai/models.py +6 -0
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46/src/chunkr_ai.egg-info}/PKG-INFO +1 -1
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/tests/test_chunkr.py +142 -1
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/LICENSE +0 -0
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/README.md +0 -0
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/setup.cfg +0 -0
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai/__init__.py +0 -0
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai/api/__init__.py +0 -0
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai/api/auth.py +0 -0
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai/api/chunkr.py +0 -0
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai/api/chunkr_base.py +0 -0
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai/api/decorators.py +0 -0
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai/api/misc.py +0 -0
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai/api/protocol.py +0 -0
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai/api/task_response.py +0 -0
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai.egg-info/SOURCES.txt +0 -0
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai.egg-info/dependency_links.txt +0 -0
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai.egg-info/requires.txt +0 -0
- {chunkr_ai-0.0.44 → chunkr_ai-0.0.46}/src/chunkr_ai.egg-info/top_level.txt +0 -0
pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "chunkr-ai"
-version = "0.0.44"
+version = "0.0.46"
 authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
 description = "Python client for Chunkr: open source document intelligence"
 readme = "README.md"

src/chunkr_ai/api/configuration.py

@@ -1,7 +1,7 @@
 from pydantic import BaseModel, Field, ConfigDict
 from enum import Enum
 from typing import Any, List, Optional, Union
-from pydantic import field_validator
+from pydantic import field_validator, field_serializer
 
 class GenerationStrategy(str, Enum):
     LLM = "LLM"

@@ -65,11 +65,7 @@ class TokenizerType(BaseModel):
             return f"string:{self.string_value}"
         return ""
 
-    model_config = ConfigDict(
-        json_encoders={
-            'TokenizerType': lambda v: v.model_dump()
-        }
-    )
+    model_config = ConfigDict()
 
     def model_dump(self, **kwargs):
         if self.enum_value is not None:

@@ -85,10 +81,13 @@ class ChunkProcessing(BaseModel):
 
     model_config = ConfigDict(
         arbitrary_types_allowed=True,
-        json_encoders={
-            TokenizerType: lambda v: v.model_dump()
-        }
     )
+
+    @field_serializer('tokenizer')
+    def serialize_tokenizer(self, tokenizer: Optional[TokenizerType], _info):
+        if tokenizer is None:
+            return None
+        return tokenizer.model_dump()
 
     @field_validator('tokenizer', mode='before')
     def validate_tokenizer(cls, v):

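Context for the two hunks above: Pydantic v2 deprecates the `json_encoders` config key, and `@field_serializer` is its supported replacement, which is why both `json_encoders` blocks are dropped in 0.0.46. A minimal standalone sketch of the same pattern (the `Token`/`Doc` names here are illustrative, not from the package):

    from typing import Optional
    from pydantic import BaseModel, field_serializer

    class Token(BaseModel):
        name: str

    class Doc(BaseModel):
        tokenizer: Optional[Token] = None

        # Replaces the v1-style json_encoders={Token: lambda v: v.model_dump()}
        @field_serializer('tokenizer')
        def serialize_tokenizer(self, tokenizer: Optional[Token], _info):
            return None if tokenizer is None else tokenizer.model_dump()

    print(Doc(tokenizer=Token(name="cl100k")).model_dump_json())
    # -> {"tokenizer":{"name":"cl100k"}}
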
@@ -126,6 +125,99 @@ class SegmentationStrategy(str, Enum):
     LAYOUT_ANALYSIS = "LayoutAnalysis"
     PAGE = "Page"
 
+class ErrorHandlingStrategy(str, Enum):
+    FAIL = "Fail"
+    CONTINUE = "Continue"
+
+class FallbackStrategy(BaseModel):
+    type: str
+    model_id: Optional[str] = None
+
+    @classmethod
+    def none(cls) -> "FallbackStrategy":
+        return cls(type="None")
+
+    @classmethod
+    def default(cls) -> "FallbackStrategy":
+        return cls(type="Default")
+
+    @classmethod
+    def model(cls, model_id: str) -> "FallbackStrategy":
+        return cls(type="Model", model_id=model_id)
+
+    def __str__(self) -> str:
+        if self.type == "Model":
+            return f"Model({self.model_id})"
+        return self.type
+
+    def model_dump(self, **kwargs):
+        if self.type == "Model":
+            return {"Model": self.model_id}
+        return self.type
+
+    @field_validator('type')
+    def validate_type(cls, v):
+        if v not in ["None", "Default", "Model"]:
+            raise ValueError(f"Invalid fallback strategy: {v}")
+        return v
+
+    model_config = ConfigDict()
+
+    @classmethod
+    def model_validate(cls, obj):
+        # Handle string values like "None" or "Default"
+        if isinstance(obj, str):
+            if obj in ["None", "Default"]:
+                return cls(type=obj)
+            # Try to parse as Enum value if it's not a direct match
+            try:
+                return cls(type=obj)
+            except ValueError:
+                pass  # Let it fall through to normal validation
+
+        # Handle dictionary format like {"Model": "model-id"}
+        elif isinstance(obj, dict) and len(obj) == 1:
+            if "Model" in obj:
+                return cls(type="Model", model_id=obj["Model"])
+
+        # Fall back to normal validation
+        return super().model_validate(obj)
+
+class LlmProcessing(BaseModel):
+    model_id: Optional[str] = None
+    fallback_strategy: FallbackStrategy = Field(default_factory=FallbackStrategy.default)
+    max_completion_tokens: Optional[int] = None
+    temperature: float = 0.0
+
+    model_config = ConfigDict()
+
+    @field_serializer('fallback_strategy')
+    def serialize_fallback_strategy(self, fallback_strategy: FallbackStrategy, _info):
+        return fallback_strategy.model_dump()
+
+    @field_validator('fallback_strategy', mode='before')
+    def validate_fallback_strategy(cls, v):
+        if isinstance(v, str):
+            if v == "None":
+                return FallbackStrategy.none()
+            elif v == "Default":
+                return FallbackStrategy.default()
+            # Try to parse as a model ID if it's not None or Default
+            try:
+                return FallbackStrategy.model(v)
+            except ValueError:
+                pass  # Let it fall through to normal validation
+        # Handle dictionary format like {"Model": "model-id"}
+        elif isinstance(v, dict) and len(v) == 1:
+            if "Model" in v:
+                return FallbackStrategy.model(v["Model"])
+            elif "None" in v or v.get("None") is None:
+                return FallbackStrategy.none()
+            elif "Default" in v or v.get("Default") is None:
+                return FallbackStrategy.default()
+
+        return v
+
 class BoundingBox(BaseModel):
     left: float
     top: float

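Taken together, `FallbackStrategy.model_dump` and the `before`-mode validator on `LlmProcessing` round-trip the API's tagged representation: bare strings for "None"/"Default" and a single-key dict for "Model". A quick sketch using only the constructors defined above (it mirrors the assertions in `test_fallback_strategy_serialization` further down):

    from chunkr_ai.models import FallbackStrategy

    assert FallbackStrategy.none().model_dump() == "None"
    assert FallbackStrategy.default().model_dump() == "Default"
    assert FallbackStrategy.model("gpt-4.1").model_dump() == {"Model": "gpt-4.1"}

    # Values coming back from the server are accepted in the same shapes:
    #   "None" / "Default"   -> FallbackStrategy.none() / .default()
    #   {"Model": "gpt-4.1"} -> FallbackStrategy.model("gpt-4.1")
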
@@ -189,11 +281,13 @@ class Pipeline(str, Enum):
 class Configuration(BaseModel):
     chunk_processing: Optional[ChunkProcessing] = None
     expires_in: Optional[int] = None
+    error_handling: Optional[ErrorHandlingStrategy] = None
     high_resolution: Optional[bool] = None
     ocr_strategy: Optional[OcrStrategy] = None
     segment_processing: Optional[SegmentProcessing] = None
     segmentation_strategy: Optional[SegmentationStrategy] = None
     pipeline: Optional[Pipeline] = None
+    llm_processing: Optional[LlmProcessing] = None
 
 class OutputConfiguration(Configuration):
     input_file_url: Optional[str] = None

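The two new `Configuration` fields compose with the existing ones. A sketch of a combined upload call, modeled on the tests below; `client` and `sample_path` stand in for a configured Chunkr client and a local file:

    from chunkr_ai.models import (
        Configuration,
        ErrorHandlingStrategy,
        FallbackStrategy,
        LlmProcessing,
    )

    config = Configuration(
        # presumably: continue past per-segment errors rather than failing the task
        error_handling=ErrorHandlingStrategy.CONTINUE,
        llm_processing=LlmProcessing(
            model_id="gemini-pro-2.5",
            fallback_strategy=FallbackStrategy.model("claude-3.7-sonnet"),
            max_completion_tokens=2000,
            temperature=0.7,
        ),
    )
    # response = await client.upload(sample_path, config)
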
src/chunkr_ai/models.py

@@ -5,8 +5,11 @@ from .api.configuration import (
     Configuration,
     CroppingStrategy,
     EmbedSource,
+    ErrorHandlingStrategy,
+    FallbackStrategy,
     GenerationStrategy,
     GenerationConfig,
+    LlmProcessing,
     Model,
     OCRResult,
     OcrStrategy,

@@ -29,8 +32,11 @@ __all__ = [
     "Configuration",
     "CroppingStrategy",
     "EmbedSource",
+    "ErrorHandlingStrategy",
+    "FallbackStrategy",
     "GenerationConfig",
     "GenerationStrategy",
+    "LlmProcessing",
     "Model",
     "OCRResult",
     "OcrStrategy",

tests/test_chunkr.py

@@ -16,7 +16,10 @@ from chunkr_ai.models import (
     ChunkProcessing,
     TaskResponse,
     EmbedSource,
+    ErrorHandlingStrategy,
     Tokenizer,
+    LlmProcessing,
+    FallbackStrategy,
 )
 
 @pytest.fixture

@@ -120,6 +123,39 @@ def xlm_roberta_with_html_content_config():
         ),
     )
 
+@pytest.fixture
+def none_fallback_config():
+    return Configuration(
+        llm_processing=LlmProcessing(
+            model_id="gemini-pro-2.5",
+            fallback_strategy=FallbackStrategy.none(),
+            max_completion_tokens=500,
+            temperature=0.2
+        ),
+    )
+
+@pytest.fixture
+def default_fallback_config():
+    return Configuration(
+        llm_processing=LlmProcessing(
+            model_id="gemini-pro-2.5",
+            fallback_strategy=FallbackStrategy.default(),
+            max_completion_tokens=1000,
+            temperature=0.5
+        ),
+    )
+
+@pytest.fixture
+def model_fallback_config():
+    return Configuration(
+        llm_processing=LlmProcessing(
+            model_id="gemini-pro-2.5",
+            fallback_strategy=FallbackStrategy.model("claude-3.7-sonnet"),
+            max_completion_tokens=2000,
+            temperature=0.7
+        ),
+    )
+
 @pytest.mark.asyncio
 async def test_send_file_path(client, sample_path):
     response = await client.upload(sample_path)

@@ -442,4 +478,109 @@ async def test_embed_sources_with_different_tokenizer(client, sample_path, xlm_r
     response = await client.upload(sample_path, xlm_roberta_with_html_content_config)
     assert response.task_id is not None
     assert response.status == "Succeeded"
-    assert response.output is not None
+    assert response.output is not None
+
+@pytest.mark.asyncio
+async def test_error_handling_continue(client, sample_path):
+    response = await client.upload(sample_path, Configuration(error_handling=ErrorHandlingStrategy.CONTINUE))
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+
+@pytest.mark.asyncio
+async def test_llm_processing_none_fallback(client, sample_path, none_fallback_config):
+    response = await client.upload(sample_path, none_fallback_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+    assert response.configuration.llm_processing is not None
+    assert response.configuration.llm_processing.model_id == "gemini-pro-2.5"
+    assert str(response.configuration.llm_processing.fallback_strategy) == "None"
+    assert response.configuration.llm_processing.max_completion_tokens == 500
+    assert response.configuration.llm_processing.temperature == 0.2
+
+@pytest.mark.asyncio
+async def test_llm_processing_default_fallback(client, sample_path, default_fallback_config):
+    response = await client.upload(sample_path, default_fallback_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+    assert response.configuration.llm_processing is not None
+    assert response.configuration.llm_processing.model_id == "gemini-pro-2.5"
+    # The service may resolve Default to an actual model
+    assert response.configuration.llm_processing.fallback_strategy is not None
+    assert response.configuration.llm_processing.max_completion_tokens == 1000
+    assert response.configuration.llm_processing.temperature == 0.5
+
+@pytest.mark.asyncio
+async def test_llm_processing_model_fallback(client, sample_path, model_fallback_config):
+    response = await client.upload(sample_path, model_fallback_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+    assert response.configuration.llm_processing is not None
+    assert response.configuration.llm_processing.model_id == "gemini-pro-2.5"
+    assert str(response.configuration.llm_processing.fallback_strategy) == "Model(claude-3.7-sonnet)"
+    assert response.configuration.llm_processing.max_completion_tokens == 2000
+    assert response.configuration.llm_processing.temperature == 0.7
+
+@pytest.mark.asyncio
+async def test_llm_custom_model(client, sample_path):
+    config = Configuration(
+        llm_processing=LlmProcessing(
+            model_id="claude-3.7-sonnet",  # Using a model from models.yaml
+            fallback_strategy=FallbackStrategy.none(),
+            max_completion_tokens=1500,
+            temperature=0.3
+        ),
+    )
+    response = await client.upload(sample_path, config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+    assert response.configuration.llm_processing is not None
+    assert response.configuration.llm_processing.model_id == "claude-3.7-sonnet"
+
+@pytest.mark.asyncio
+async def test_fallback_strategy_serialization():
+    # Test that FallbackStrategy objects serialize correctly
+    none_strategy = FallbackStrategy.none()
+    default_strategy = FallbackStrategy.default()
+    model_strategy = FallbackStrategy.model("gpt-4.1")
+
+    assert none_strategy.model_dump() == "None"
+    assert default_strategy.model_dump() == "Default"
+    assert model_strategy.model_dump() == {"Model": "gpt-4.1"}
+
+    # Test string representation
+    assert str(none_strategy) == "None"
+    assert str(default_strategy) == "Default"
+    assert str(model_strategy) == "Model(gpt-4.1)"
+
+@pytest.mark.asyncio
+async def test_combined_config_with_llm_and_other_settings(client, sample_path):
+    # Test combining LLM settings with other configuration options
+    config = Configuration(
+        llm_processing=LlmProcessing(
+            model_id="qwen-2.5-vl-7b-instruct",
+            fallback_strategy=FallbackStrategy.model("gemini-flash-2.0"),
+            temperature=0.4
+        ),
+        segmentation_strategy=SegmentationStrategy.PAGE,
+        segment_processing=SegmentProcessing(
+            page=GenerationConfig(
+                html=GenerationStrategy.LLM,
+                markdown=GenerationStrategy.LLM
+            )
+        ),
+        chunk_processing=ChunkProcessing(target_length=1024)
+    )
+
+    response = await client.upload(sample_path, config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+    assert response.configuration.llm_processing is not None
+    assert response.configuration.llm_processing.model_id == "qwen-2.5-vl-7b-instruct"
+    assert response.configuration.segmentation_strategy == SegmentationStrategy.PAGE
+    assert response.configuration.chunk_processing.target_length == 1024

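For reference, the serializer hooks above mean the `model_fallback_config` payload should serialize along these lines (expected output sketched from the serializers, not captured from the API):

    LlmProcessing(
        model_id="gemini-pro-2.5",
        fallback_strategy=FallbackStrategy.model("claude-3.7-sonnet"),
        max_completion_tokens=2000,
        temperature=0.7,
    ).model_dump()
    # -> {'model_id': 'gemini-pro-2.5',
    #     'fallback_strategy': {'Model': 'claude-3.7-sonnet'},
    #     'max_completion_tokens': 2000,
    #     'temperature': 0.7}
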