PyPI - chunkr-ai - Versions diffs - 0.3.1__tar.gz → 0.3.3__tar.gz - Mend

chunkr-ai 0.3.1__tar.gz → 0.3.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

{chunkr_ai-0.3.1/src/chunkr_ai.egg-info → chunkr_ai-0.3.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: chunkr-ai
-Version: 0.3.1
+Version: 0.3.3
 Summary: Python client for Chunkr: open source document intelligence
 Author-email: Ishaan Kapoor <ishaan@lumina.sh>
 License: MIT License

{chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/pyproject.toml RENAMED Viewed

@@ -4,12 +4,12 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "chunkr-ai"
-version = "0.3.1"
-authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
+version = "0.3.3"
+authors = [{ "name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh" }]
 description = "Python client for Chunkr: open source document intelligence"
 readme = "README.md"
-license = {"file" = "LICENSE"}
-urls = {Homepage = "https://chunkr.ai"}
+license = { "file" = "LICENSE" }
+urls = { Homepage = "https://chunkr.ai" }
 dependencies = [
     "httpx>=0.25.0",
     "matplotlib>=3.10.3",
@@ -27,3 +27,5 @@ test = [
     "ruff>=0.9.3",
 ]
+[dependency-groups]
+dev = ["mypy>=1.17.1"]

chunkr_ai-0.3.3/src/chunkr_ai/__init__.py ADDED Viewed

@@ -0,0 +1,14 @@
+from .api.chunkr import Chunkr
+import tomllib
+from pathlib import Path
+# Read version from pyproject.toml
+try:
+    pyproject_path = Path(__file__).parent.parent.parent / "pyproject.toml"
+    with open(pyproject_path, "rb") as f:
+        pyproject_data = tomllib.load(f)
+    __version__ = pyproject_data["project"]["version"]
+except Exception:
+    __version__ = "unknown"
+__all__ = ["Chunkr", "__version__"]

chunkr_ai-0.3.3/src/chunkr_ai/api/auth.py ADDED Viewed

@@ -0,0 +1,42 @@
+import platform
+import sys
+import tomllib
+from pathlib import Path
+def _find_pyproject_toml(start_path: Path) -> Path | None:
+    """Search for pyproject.toml in current and parent directories."""
+    for parent in [start_path, *start_path.parents]:
+        candidate = parent / "pyproject.toml"
+        if candidate.is_file():
+            return candidate
+    return None
+# Read version from pyproject.toml
+try:
+    pyproject_path = _find_pyproject_toml(Path(__file__).resolve().parent)
+    if pyproject_path is not None:
+        with open(pyproject_path, "rb") as f:
+            pyproject_data = tomllib.load(f)
+        __version__ = pyproject_data["project"]["version"]
+    else:
+        __version__ = "unknown"
+except Exception:
+    __version__ = "unknown"
+class HeadersMixin:
+    """Mixin class for handling authorization headers"""
+    _api_key: str = ""
+    def get_api_key(self) -> str:
+        """Get the API key"""
+        if not hasattr(self, "_api_key") or not self._api_key:
+            raise ValueError("API key not set")
+        return self._api_key
+    def _headers(self) -> dict:
+        """Generate authorization headers and version information"""
+        user_agent = f"chunkr-ai/{__version__} (Python/{sys.version.split()[0]}; {platform.system()}/{platform.release()})"
+        return {
+            "Authorization": self.get_api_key(),
+            "User-Agent": user_agent
+        }

{chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/src/chunkr_ai/api/chunkr.py RENAMED Viewed

@@ -42,7 +42,7 @@ class Chunkr(ChunkrBase):
         data = await prepare_upload_data(file, filename, config)
         assert self._client is not None
         r = await self._client.post(
-            f"{self.url}/api/v1/task/parse", json=data, headers=self._headers()
+            f"{self.url}/task/parse", json=data, headers=self._headers()
         )
         r.raise_for_status()
         return TaskResponse(**r.json()).with_client(cast(ChunkrClientProtocol, self), True, False)
@@ -55,7 +55,7 @@ class Chunkr(ChunkrBase):
         data = await prepare_upload_data(None, None, config)
         assert self._client is not None
         r = await self._client.patch(
-            f"{self.url}/api/v1/task/{task_id}/parse",
+            f"{self.url}/task/{task_id}/parse",
             json=data,
             headers=self._headers(),
         )
@@ -71,7 +71,7 @@ class Chunkr(ChunkrBase):
         }
         assert self._client is not None
         r = await self._client.get(
-            f"{self.url}/api/v1/task/{task_id}",
+            f"{self.url}/task/{task_id}",
             params=params,
             headers=self._headers()
         )
@@ -83,7 +83,7 @@ class Chunkr(ChunkrBase):
     async def delete_task(self, task_id: str) -> None:
         assert self._client is not None
         r = await self._client.delete(
-            f"{self.url}/api/v1/task/{task_id}", headers=self._headers()
+            f"{self.url}/task/{task_id}", headers=self._headers()
         )
         r.raise_for_status()
@@ -92,7 +92,7 @@ class Chunkr(ChunkrBase):
     async def cancel_task(self, task_id: str) -> None:
         assert self._client is not None
         r = await self._client.get(
-            f"{self.url}/api/v1/task/{task_id}/cancel", headers=self._headers()
+            f"{self.url}/task/{task_id}/cancel", headers=self._headers()
         )
         r.raise_for_status()

{chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/src/chunkr_ai/api/configuration.py RENAMED Viewed

@@ -1,6 +1,6 @@
 from pydantic import BaseModel, Field, ConfigDict
 from enum import Enum
-from typing import Any, List, Optional, Union
+from typing import List, Optional, Union
 from pydantic import field_validator, field_serializer
 class CroppingStrategy(str, Enum):
@@ -20,15 +20,17 @@ class EmbedSource(str, Enum):
 class GenerationStrategy(str, Enum):
     LLM = "LLM"
     AUTO = "Auto"
+    IGNORE = "Ignore"
 class GenerationConfig(BaseModel):
     format: Optional[SegmentFormat] = None
     strategy: Optional[GenerationStrategy] = None
-    llm: Optional[str] = None
     crop_image: Optional[CroppingStrategy] = None
-    embed_sources: Optional[List[EmbedSource]] = None
     extended_context: Optional[bool] = None
+    description: Optional[bool] = None
     # Deprecated fields for backwards compatibility
+    llm: Optional[str] = None # Deprecated
+    embed_sources: Optional[List[EmbedSource]] = None # Deprecated
     html: Optional[GenerationStrategy] = None  # Deprecated: Use format=SegmentFormat.HTML and strategy instead
     markdown: Optional[GenerationStrategy] = None  # Deprecated: Use format=SegmentFormat.MARKDOWN and strategy instead
@@ -83,7 +85,7 @@ class TokenizerType(BaseModel):
         return {}
 class ChunkProcessing(BaseModel):
-    ignore_headers_and_footers: Optional[bool] = True
+    ignore_headers_and_footers: Optional[bool] = None # Deprecated
     target_length: Optional[int] = None
     tokenizer: Optional[Union[TokenizerType, Tokenizer, str]] = None
@@ -286,6 +288,7 @@ class Page(BaseModel):
     page_height: float
     page_width: float
     ss_sheet_name: Optional[str] = None
+    dpi: Optional[float] = None
 class Segment(BaseModel):
     bbox: BoundingBox
@@ -303,6 +306,8 @@ class Segment(BaseModel):
     confidence: Optional[float]
     text: str = ""
     segment_length: Optional[int] = None
+    embed: Optional[str] = None
+    description: Optional[str] = None
     # Spreadsheet-specific fields
     ss_cells: Optional[List[Cell]] = None
     ss_header_bbox: Optional[BoundingBox] = None
@@ -317,6 +322,7 @@ class Chunk(BaseModel):
     chunk_length: int
     segments: List[Segment]
     embed: Optional[str] = None
+    content: Optional[str] = None
 class OutputResponse(BaseModel):
     chunks: List[Chunk]
@@ -347,10 +353,6 @@ class Configuration(BaseModel):
 class OutputConfiguration(Configuration):
     input_file_url: Optional[str] = None
-    # Deprecated
-    json_schema: Optional[Any] = None
-    model: Optional[Model] = None
-    target_chunk_length: Optional[int] = None
 class Status(str, Enum):
     STARTING = "Starting"

{chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/src/chunkr_ai/api/decorators.py RENAMED Viewed

@@ -2,10 +2,12 @@ import asyncio
 import functools
 import httpx
 import nest_asyncio
-from typing import Callable, Any, TypeVar, Awaitable, Union, overload
-try:
+from typing import Callable, Any, TypeVar, Awaitable, Union
+import sys
+if sys.version_info >= (3, 10):
     from typing import ParamSpec
-except ImportError:
+else:
     from typing_extensions import ParamSpec
 T = TypeVar('T')

{chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/src/chunkr_ai/api/misc.py RENAMED Viewed

@@ -3,7 +3,7 @@ import base64
 import io
 from pathlib import Path
 from PIL import Image
-from typing import Union, Tuple, BinaryIO, Optional, Any
+from typing import Union, Tuple, BinaryIO, Optional
 async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image, bytes, bytearray, memoryview]) -> Tuple[Optional[str], str]:
     """Convert various file types into a tuple of (filename, file content).
@@ -39,7 +39,7 @@ async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image, bytes, byte
             base64.b64decode(potential_base64)
             # If we get here, it was a valid base64 string in bytes form
             return None, potential_base64
-        except:
+        except Exception:
             # Not a base64 string in bytes form, encode it as base64
             base64_str = base64.b64encode(file_bytes).decode()
             return None, base64_str
@@ -66,14 +66,14 @@ async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image, bytes, byte
                     # Just test if it's valid base64, don't store the result
                     base64.b64decode(file)
                     return None, file
-                except:
+                except Exception:
                     raise ValueError(f"File not found: {file} and it's not a valid base64 string")
         except Exception as e:
             # If string can't be converted to Path or decoded as base64, it might still be a base64 string
             try:
-                base64.b64decode(file)
-                return None, file
-            except:
+                base64.b64decode(str(file))
+                return None, str(file)
+            except Exception:
                 raise ValueError(f"Unable to process file: {e}")
     # Handle file paths - convert to base64

{chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/src/chunkr_ai/api/task_response.py RENAMED Viewed

@@ -1,5 +1,5 @@
 from datetime import datetime
-from typing import Optional, cast, Awaitable, Union
+from typing import Optional, cast, Union
 from pydantic import BaseModel, PrivateAttr
 import asyncio
 import json

{chunkr_ai-0.3.1 → chunkr_ai-0.3.3/src/chunkr_ai.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: chunkr-ai
-Version: 0.3.1
+Version: 0.3.3
 Summary: Python client for Chunkr: open source document intelligence
 Author-email: Ishaan Kapoor <ishaan@lumina.sh>
 License: MIT License

{chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/tests/test_chunkr.py RENAMED Viewed

@@ -680,6 +680,74 @@ class TestNewFields:
         for segment in segments_with_length:
             assert segment.segment_length > 0
+    @pytest.mark.asyncio
+    async def test_segment_embed_field(self, client, sample_path):
+        """Test that Segment objects include the embed field"""
+        response = await client.upload(sample_path)
+        assert response.task_id is not None
+        assert response.status == "Succeeded"
+        assert response.output is not None
+        assert len(response.output.chunks) > 0
+        segment = response.output.chunks[0].segments[0]
+        # embed field should be accessible
+        assert hasattr(segment, 'embed')
+        # embed can be None or a string
+        assert segment.embed is None or isinstance(segment.embed, str)
+    @pytest.mark.asyncio
+    async def test_segment_description_field(self, client, sample_path):
+        """Test that Segment objects include the description field"""
+        response = await client.upload(sample_path)
+        assert response.task_id is not None
+        assert response.status == "Succeeded"
+        assert response.output is not None
+        assert len(response.output.chunks) > 0
+        segment = response.output.chunks[0].segments[0]
+        # description field should be accessible
+        assert hasattr(segment, 'description')
+        # description can be None or a string
+        assert segment.description is None or isinstance(segment.description, str)
+    @pytest.mark.asyncio
+    async def test_chunk_content_field(self, client, sample_path):
+        """Test that Chunk objects include the content field"""
+        response = await client.upload(sample_path)
+        assert response.task_id is not None
+        assert response.status == "Succeeded"
+        assert response.output is not None
+        assert len(response.output.chunks) > 0
+        chunk = response.output.chunks[0]
+        # content field should be accessible
+        assert hasattr(chunk, 'content')
+        # content can be None or a string
+        assert chunk.content is None or isinstance(chunk.content, str)
+    @pytest.mark.asyncio
+    async def test_page_dpi_field_type(self, client, sample_path):
+        """Test that Page objects have dpi field as float type"""
+        response = await client.upload(sample_path)
+        assert response.task_id is not None
+        assert response.status == "Succeeded"
+        assert response.output is not None
+        # pages should be accessible (might be None for some configurations)
+        assert hasattr(response.output, 'pages')
+        # If pages exist, validate dpi type
+        if response.output.pages:
+            assert len(response.output.pages) > 0
+            page = response.output.pages[0]
+            assert hasattr(page, 'dpi')
+            # dpi can be None or a float/int (numbers)
+            if page.dpi is not None:
+                assert isinstance(page.dpi, (int, float))
     @pytest.mark.asyncio
     async def test_backwards_compatibility_preserved(self, client, sample_path):
         """Test that all existing fields still work after adding new ones"""
@@ -716,4 +784,10 @@ class TestNewFields:
         assert segment.segment_id is not None
         assert segment.segment_type is not None
         assert segment.confidence is not None or segment.confidence is None
-        assert segment.text is not None or segment.text == ""
+        assert segment.text is not None or segment.text == ""
+        # New fields added for server compatibility
+        assert hasattr(segment, 'embed')
+        assert hasattr(segment, 'description')
+        # Chunk should include new content field
+        assert hasattr(chunk, 'content')

{chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/tests/test_excel.py RENAMED Viewed

@@ -14,12 +14,6 @@ from chunkr_ai.models import (
     Tokenizer,
     OcrStrategy,
     SegmentationStrategy,
-    Cell,
-    CellStyle,
-    Alignment,
-    VerticalAlignment,
-    Page,
-    Segment,
     SegmentType,
 )
@@ -316,6 +310,17 @@ class TestExcelEmbedding:
         for chunk in chunks_with_embed:
             assert len(chunk.embed.strip()) > 0, "Empty embed content found"
+    @pytest.mark.asyncio
+    async def test_chunks_have_content_field(self, client, excel_sample_path, excel_config):
+        """Test that chunks include the new content field for Excel data"""
+        response = await client.upload(excel_sample_path, excel_config)
+        # All chunks should have the content field accessible
+        for chunk in response.output.chunks:
+            assert hasattr(chunk, 'content'), "Chunk missing content field"
+            # content can be None or a string
+            assert chunk.content is None or isinstance(chunk.content, str)
     @pytest.mark.asyncio
     async def test_segment_length_calculation(self, client, excel_sample_path, excel_config):
         """Test that segments have length calculations"""

{chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/tests/test_pages.py RENAMED Viewed

@@ -4,16 +4,8 @@ from pathlib import Path
 from chunkr_ai import Chunkr
 from chunkr_ai.models import (
     Configuration,
-    ChunkProcessing,
-    SegmentProcessing,
-    GenerationConfig,
-    SegmentFormat,
-    GenerationStrategy,
-    EmbedSource,
-    Tokenizer,
     OcrStrategy,
     SegmentationStrategy,
-    Page,
 )

chunkr_ai-0.3.1/src/chunkr_ai/__init__.py DELETED Viewed

@@ -1,3 +0,0 @@
-from .api.chunkr import Chunkr
-__all__ = ["Chunkr"]

chunkr_ai-0.3.1/src/chunkr_ai/api/auth.py DELETED Viewed

@@ -1,13 +0,0 @@
-class HeadersMixin:
-    """Mixin class for handling authorization headers"""
-    _api_key: str = ""
-    def get_api_key(self) -> str:
-        """Get the API key"""
-        if not hasattr(self, "_api_key") or not self._api_key:
-            raise ValueError("API key not set")
-        return self._api_key
-    def _headers(self) -> dict:
-        """Generate authorization headers"""
-        return {"Authorization": self.get_api_key()}

{chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/LICENSE RENAMED Viewed

File without changes

{chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/README.md RENAMED Viewed

File without changes

{chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/setup.cfg RENAMED Viewed

File without changes

{chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/src/chunkr_ai/api/__init__.py RENAMED Viewed

File without changes

{chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/src/chunkr_ai/api/chunkr_base.py RENAMED Viewed

File without changes

{chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/src/chunkr_ai/api/protocol.py RENAMED Viewed

File without changes

{chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/src/chunkr_ai/models.py RENAMED Viewed

File without changes

{chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/src/chunkr_ai.egg-info/SOURCES.txt RENAMED Viewed

File without changes

{chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/src/chunkr_ai.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/src/chunkr_ai.egg-info/requires.txt RENAMED Viewed

File without changes

{chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/src/chunkr_ai.egg-info/top_level.txt RENAMED Viewed

File without changes

{chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/tests/test_file_handling.py RENAMED Viewed

File without changes

chunkr-ai 0.3.1__tar.gz → 0.3.3__tar.gz

chunkr-ai 0.3.1__tar.gz → 0.3.3__tar.gz