chunkr-ai 0.0.41__tar.gz → 0.0.44__tar.gz

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Files changed (22)
  1. {chunkr_ai-0.0.41/src/chunkr_ai.egg-info → chunkr_ai-0.0.44}/PKG-INFO +3 -2
  2. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/pyproject.toml +1 -1
  3. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/api/configuration.py +84 -2
  4. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/api/task_response.py +7 -2
  5. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/models.py +6 -1
  6. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44/src/chunkr_ai.egg-info}/PKG-INFO +3 -2
  7. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/tests/test_chunkr.py +154 -1
  8. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/LICENSE +0 -0
  9. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/README.md +0 -0
  10. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/setup.cfg +0 -0
  11. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/__init__.py +0 -0
  12. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/api/__init__.py +0 -0
  13. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/api/auth.py +0 -0
  14. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/api/chunkr.py +0 -0
  15. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/api/chunkr_base.py +0 -0
  16. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/api/decorators.py +0 -0
  17. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/api/misc.py +0 -0
  18. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/api/protocol.py +0 -0
  19. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai.egg-info/SOURCES.txt +0 -0
  20. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai.egg-info/dependency_links.txt +0 -0
  21. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai.egg-info/requires.txt +0 -0
  22. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai.egg-info/top_level.txt +0 -0
{chunkr_ai-0.0.41/src/chunkr_ai.egg-info → chunkr_ai-0.0.44}/PKG-INFO
@@ -1,6 +1,6 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: chunkr-ai
-Version: 0.0.41
+Version: 0.0.44
 Summary: Python client for Chunkr: open source document intelligence
 Author-email: Ishaan Kapoor <ishaan@lumina.sh>
 License: MIT License
@@ -37,6 +37,7 @@ Requires-Dist: pytest>=7.0.0; extra == "test"
 Requires-Dist: pytest-xdist>=3.0.0; extra == "test"
 Requires-Dist: pytest-asyncio>=0.21.0; extra == "test"
 Requires-Dist: ruff>=0.9.3; extra == "test"
+Dynamic: license-file
 
 # Chunkr Python Client
 
{chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "chunkr-ai"
-version = "0.0.41"
+version = "0.0.44"
 authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
 description = "Python client for Chunkr: open source document intelligence"
 readme = "README.md"
{chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/api/configuration.py
@@ -1,6 +1,7 @@
 from pydantic import BaseModel, Field, ConfigDict
 from enum import Enum
-from typing import Any, List, Optional
+from typing import Any, List, Optional, Union
+from pydantic import field_validator
 
 class GenerationStrategy(str, Enum):
     LLM = "LLM"
@@ -10,11 +11,18 @@ class CroppingStrategy(str, Enum):
     ALL = "All"
     AUTO = "Auto"
 
+class EmbedSource(str, Enum):
+    HTML = "HTML"
+    MARKDOWN = "Markdown"
+    LLM = "LLM"
+    CONTENT = "Content"
+
 class GenerationConfig(BaseModel):
     html: Optional[GenerationStrategy] = None
     llm: Optional[str] = None
     markdown: Optional[GenerationStrategy] = None
     crop_image: Optional[CroppingStrategy] = None
+    embed_sources: Optional[List[EmbedSource]] = Field(default_factory=lambda: [EmbedSource.MARKDOWN])
 
 class SegmentProcessing(BaseModel):
     model_config = ConfigDict(populate_by_name=True, alias_generator=str.title)
@@ -32,9 +40,83 @@ class SegmentProcessing(BaseModel):
     text: Optional[GenerationConfig] = Field(default=None, alias="Text")
     title: Optional[GenerationConfig] = Field(default=None, alias="Title")
 
+class Tokenizer(str, Enum):
+    WORD = "Word"
+    CL100K_BASE = "Cl100kBase"
+    XLM_ROBERTA_BASE = "XlmRobertaBase"
+    BERT_BASE_UNCASED = "BertBaseUncased"
+
+class TokenizerType(BaseModel):
+    enum_value: Optional[Tokenizer] = None
+    string_value: Optional[str] = None
+
+    @classmethod
+    def from_enum(cls, enum_value: Tokenizer) -> "TokenizerType":
+        return cls(enum_value=enum_value)
+
+    @classmethod
+    def from_string(cls, string_value: str) -> "TokenizerType":
+        return cls(string_value=string_value)
+
+    def __str__(self) -> str:
+        if self.enum_value is not None:
+            return f"enum:{self.enum_value.value}"
+        elif self.string_value is not None:
+            return f"string:{self.string_value}"
+        return ""
+
+    model_config = ConfigDict(
+        json_encoders={
+            'TokenizerType': lambda v: v.model_dump()
+        }
+    )
+
+    def model_dump(self, **kwargs):
+        if self.enum_value is not None:
+            return {"Enum": self.enum_value.value}
+        elif self.string_value is not None:
+            return {"String": self.string_value}
+        return {}
+
 class ChunkProcessing(BaseModel):
-    ignore_headers_and_footers: Optional[bool] = None
+    ignore_headers_and_footers: Optional[bool] = True
     target_length: Optional[int] = None
+    tokenizer: Optional[Union[TokenizerType, Tokenizer, str]] = None
+
+    model_config = ConfigDict(
+        arbitrary_types_allowed=True,
+        json_encoders={
+            TokenizerType: lambda v: v.model_dump()
+        }
+    )
+
+    @field_validator('tokenizer', mode='before')
+    def validate_tokenizer(cls, v):
+        if v is None:
+            return None
+
+        if isinstance(v, TokenizerType):
+            return v
+
+        if isinstance(v, Tokenizer):
+            return TokenizerType(enum_value=v)
+
+        if isinstance(v, dict):
+            if "Enum" in v:
+                try:
+                    return TokenizerType(enum_value=Tokenizer(v["Enum"]))
+                except ValueError:
+                    return TokenizerType(string_value=v["Enum"])
+            elif "String" in v:
+                return TokenizerType(string_value=v["String"])
+
+        if isinstance(v, str):
+            try:
+                return TokenizerType(enum_value=Tokenizer(v))
+            except ValueError:
+                return TokenizerType(string_value=v)
+
+        raise ValueError(f"Cannot convert {v} to TokenizerType")
 
 class OcrStrategy(str, Enum):
     ALL = "All"
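In practice, the validator above means ChunkProcessing.tokenizer accepts a Tokenizer enum member, a plain string (for example a Hugging Face tokenizer id), an existing TokenizerType, or the already-serialized dict form, and normalizes all of them to a TokenizerType whose model_dump() produces the {"Enum": ...} / {"String": ...} shape sent to the API. A minimal sketch of that behavior, based only on the code in this diff:

from chunkr_ai.models import ChunkProcessing, Tokenizer

# An enum member is wrapped as TokenizerType(enum_value=...)
cp = ChunkProcessing(tokenizer=Tokenizer.CL100K_BASE)
print(cp.tokenizer.model_dump())  # {'Enum': 'Cl100kBase'}

# A string matching a Tokenizer value is coerced to the enum form
cp = ChunkProcessing(tokenizer="Word")
print(cp.tokenizer.model_dump())  # {'Enum': 'Word'}

# Any other string (e.g. a Hugging Face tokenizer id) is kept as a custom string tokenizer
cp = ChunkProcessing(tokenizer="Qwen/Qwen-tokenizer")
print(cp.tokenizer.model_dump())  # {'String': 'Qwen/Qwen-tokenizer'}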
{chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/api/task_response.py
@@ -4,6 +4,7 @@ from pydantic import BaseModel, PrivateAttr
 import asyncio
 import json
 import os
+import httpx
 
 from .configuration import Configuration, OutputConfiguration, OutputResponse, Status
 from .protocol import ChunkrClientProtocol
@@ -51,8 +52,12 @@ class TaskResponse(BaseModel, Generic[T]):
             )
             r.raise_for_status()
             return r.json()
-        except (ConnectionError, TimeoutError, OSError) as e:
-            print(f"Connection error while polling the task: {str(e)}, retrying...")
+        except (ConnectionError, TimeoutError, OSError,
+                httpx.ReadTimeout, httpx.ConnectTimeout,
+                httpx.WriteTimeout, httpx.PoolTimeout,
+                httpx.ConnectError, httpx.ReadError,
+                httpx.NetworkError) as e:
+            print(f"Connection error while polling the task: {str(e)}\nretrying...")
             await asyncio.sleep(0.5)
             return await self._poll_request()
         except Exception as e:
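The polling loop now retries on httpx's transport-level errors as well as the built-in ConnectionError/TimeoutError/OSError. A simplified, standalone sketch of the same retry pattern (the real client retries by re-calling _poll_request; poll_once here is a hypothetical stand-in for one GET of the task URL):

import asyncio
import httpx

# Exceptions treated as transient, mirroring the tuple in the diff above
RETRYABLE = (
    ConnectionError, TimeoutError, OSError,
    httpx.ReadTimeout, httpx.ConnectTimeout,
    httpx.WriteTimeout, httpx.PoolTimeout,
    httpx.ConnectError, httpx.ReadError,
    httpx.NetworkError,
)

async def poll_with_retry(poll_once):
    """poll_once: hypothetical coroutine that performs one GET and returns the parsed JSON."""
    while True:
        try:
            return await poll_once()
        except RETRYABLE as e:
            # Transient network failure: wait briefly, then poll again
            print(f"Connection error while polling the task: {e}\nretrying...")
            await asyncio.sleep(0.5)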
{chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/models.py
@@ -4,6 +4,7 @@ from .api.configuration import (
     ChunkProcessing,
     Configuration,
     CroppingStrategy,
+    EmbedSource,
     GenerationStrategy,
     GenerationConfig,
     Model,
@@ -16,6 +17,8 @@ from .api.configuration import (
     SegmentationStrategy,
     Status,
     Pipeline,
+    Tokenizer,
+    TokenizerType,
 )
 from .api.task_response import TaskResponse
 
@@ -25,6 +28,7 @@ __all__ = [
     "ChunkProcessing",
     "Configuration",
     "CroppingStrategy",
+    "EmbedSource",
     "GenerationConfig",
     "GenerationStrategy",
     "Model",
@@ -38,5 +42,6 @@ __all__ = [
     "Status",
     "TaskResponse",
     "Pipeline",
+    "Tokenizer",
+    "TokenizerType",
 ]
-
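The added names are re-exported through chunkr_ai.models, so (assuming a normal install of 0.0.44) they can be imported alongside the existing configuration classes:

from chunkr_ai.models import ChunkProcessing, EmbedSource, GenerationConfig, Tokenizer, TokenizerType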
{chunkr_ai-0.0.41 → chunkr_ai-0.0.44/src/chunkr_ai.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: chunkr-ai
-Version: 0.0.41
+Version: 0.0.44
 Summary: Python client for Chunkr: open source document intelligence
 Author-email: Ishaan Kapoor <ishaan@lumina.sh>
 License: MIT License
@@ -37,6 +37,7 @@ Requires-Dist: pytest>=7.0.0; extra == "test"
 Requires-Dist: pytest-xdist>=3.0.0; extra == "test"
 Requires-Dist: pytest-asyncio>=0.21.0; extra == "test"
 Requires-Dist: ruff>=0.9.3; extra == "test"
+Dynamic: license-file
 
 # Chunkr Python Client
 
{chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/tests/test_chunkr.py
@@ -15,6 +15,8 @@ from chunkr_ai.models import (
     SegmentProcessing,
     ChunkProcessing,
     TaskResponse,
+    EmbedSource,
+    Tokenizer,
 )
 
 @pytest.fixture
@@ -34,6 +36,90 @@ def client():
     client = Chunkr()
     yield client
 
+@pytest.fixture
+def markdown_embed_config():
+    return Configuration(
+        segment_processing=SegmentProcessing(
+            page=GenerationConfig(
+                html=GenerationStrategy.LLM,
+                markdown=GenerationStrategy.LLM,
+                embed_sources=[EmbedSource.MARKDOWN]
+            )
+        ),
+    )
+
+@pytest.fixture
+def html_embed_config():
+    return Configuration(
+        segment_processing=SegmentProcessing(
+            page=GenerationConfig(
+                html=GenerationStrategy.LLM,
+                markdown=GenerationStrategy.LLM,
+                embed_sources=[EmbedSource.HTML]
+            )
+        ),
+    )
+
+@pytest.fixture
+def multiple_embed_config():
+    return Configuration(
+        segment_processing=SegmentProcessing(
+            page=GenerationConfig(
+                html=GenerationStrategy.LLM,
+                markdown=GenerationStrategy.LLM,
+                llm="Generate a summary of this content",
+                embed_sources=[EmbedSource.MARKDOWN, EmbedSource.LLM, EmbedSource.HTML]
+            )
+        ),
+    )
+
+@pytest.fixture
+def word_tokenizer_string_config():
+    return Configuration(
+        chunk_processing=ChunkProcessing(
+            tokenizer="Word"
+        ),
+    )
+
+@pytest.fixture
+def word_tokenizer_config():
+    return Configuration(
+        chunk_processing=ChunkProcessing(
+            tokenizer=Tokenizer.WORD
+        ),
+    )
+
+@pytest.fixture
+def cl100k_tokenizer_config():
+    return Configuration(
+        chunk_processing=ChunkProcessing(
+            tokenizer=Tokenizer.CL100K_BASE
+        ),
+    )
+
+@pytest.fixture
+def custom_tokenizer_config():
+    return Configuration(
+        chunk_processing=ChunkProcessing(
+            tokenizer="Qwen/Qwen-tokenizer"
+        ),
+    )
+
+@pytest.fixture
+def xlm_roberta_with_html_content_config():
+    return Configuration(
+        chunk_processing=ChunkProcessing(
+            tokenizer=Tokenizer.XLM_ROBERTA_BASE
+        ),
+        segment_processing=SegmentProcessing(
+            page=GenerationConfig(
+                html=GenerationStrategy.LLM,
+                markdown=GenerationStrategy.LLM,
+                embed_sources=[EmbedSource.HTML, EmbedSource.CONTENT]
+            )
+        ),
+    )
+
 @pytest.mark.asyncio
 async def test_send_file_path(client, sample_path):
     response = await client.upload(sample_path)
@@ -241,6 +327,15 @@ async def test_send_base64_file(client, sample_path):
     assert response.status == "Succeeded"
     assert response.output is not None
 
+@pytest.mark.asyncio
+async def test_send_base64_file_with_data_url(client, sample_path):
+    with open(sample_path, "rb") as f:
+        base64_content = base64.b64encode(f.read()).decode('utf-8')
+    response = await client.upload(f"data:application/pdf;base64,{base64_content}")
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+
 @pytest.mark.asyncio
 async def test_send_base64_file_with_filename(client, sample_path):
     # Read file and convert to base64
@@ -289,4 +384,62 @@ async def test_output_files_with_dirs(client, sample_path, tmp_path):
     assert html_file.exists()
     assert md_file.exists()
     assert content_file.exists()
-    assert json_file.exists()
+    assert json_file.exists()
+
+@pytest.mark.asyncio
+async def test_embed_sources_markdown_only(client, sample_path, markdown_embed_config):
+    response = await client.upload(sample_path, markdown_embed_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+    # Check the first chunk to verify embed exists
+    if response.output.chunks:
+        chunk = response.output.chunks[0]
+        assert chunk.embed is not None
+
+@pytest.mark.asyncio
+async def test_embed_sources_html_only(client, sample_path, html_embed_config):
+    response = await client.upload(sample_path, html_embed_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+
+@pytest.mark.asyncio
+async def test_embed_sources_multiple(client, sample_path, multiple_embed_config):
+    response = await client.upload(sample_path, multiple_embed_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+
+@pytest.mark.asyncio
+async def test_tokenizer_word(client, sample_path, word_tokenizer_config):
+    response = await client.upload(sample_path, word_tokenizer_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+    if response.output.chunks:
+        for chunk in response.output.chunks:
+            # Word tokenizer should result in chunks with length close to target
+            assert chunk.chunk_length > 0
+            assert chunk.chunk_length <= 600  # Allow some flexibility
+
+@pytest.mark.asyncio
+async def test_tokenizer_cl100k(client, sample_path, cl100k_tokenizer_config):
+    response = await client.upload(sample_path, cl100k_tokenizer_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+
+@pytest.mark.asyncio
+async def test_tokenizer_custom_string(client, sample_path, custom_tokenizer_config):
+    response = await client.upload(sample_path, custom_tokenizer_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+
+@pytest.mark.asyncio
+async def test_embed_sources_with_different_tokenizer(client, sample_path, xlm_roberta_with_html_content_config):
+    response = await client.upload(sample_path, xlm_roberta_with_html_content_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
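Taken together, the new fixtures above suggest the following end-to-end usage. This is a hedged sketch mirroring the test configurations, assuming the top-level Chunkr client import used elsewhere in the test suite; the PDF path is a placeholder:

import asyncio
from chunkr_ai import Chunkr  # assumed top-level client import, as used by the test suite
from chunkr_ai.models import (
    ChunkProcessing,
    Configuration,
    EmbedSource,
    GenerationConfig,
    GenerationStrategy,
    SegmentProcessing,
    Tokenizer,
)

async def main():
    chunkr = Chunkr()  # constructed with no arguments, like the client fixture above
    config = Configuration(
        chunk_processing=ChunkProcessing(tokenizer=Tokenizer.XLM_ROBERTA_BASE),
        segment_processing=SegmentProcessing(
            page=GenerationConfig(
                html=GenerationStrategy.LLM,
                markdown=GenerationStrategy.LLM,
                embed_sources=[EmbedSource.HTML, EmbedSource.CONTENT],
            )
        ),
    )
    task = await chunkr.upload("path/to/document.pdf", config)  # placeholder path
    print(task.status)

asyncio.run(main())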