PyPI - chunkr-ai - Versions diffs - 0.0.36__tar.gz → 0.0.38__tar.gz - Mend

chunkr-ai 0.0.36.tar.gz → 0.0.38.tar.gz

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (23) hide show

{chunkr_ai-0.0.36/src/chunkr_ai.egg-info → chunkr_ai-0.0.38}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: chunkr-ai
-Version: 0.0.36
+Version: 0.0.38
 Summary: Python client for Chunkr: open source document intelligence
 Author-email: Ishaan Kapoor <ishaan@lumina.sh>
 License: MIT License

{chunkr_ai-0.0.36 → chunkr_ai-0.0.38}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "chunkr-ai"
-version = "0.0.36"
+version = "0.0.38"
 authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
 description = "Python client for Chunkr: open source document intelligence"
 readme = "README.md"

{chunkr_ai-0.0.36 → chunkr_ai-0.0.38}/src/chunkr_ai/api/chunkr.py RENAMED Viewed

@@ -1,6 +1,6 @@
 from pathlib import Path
 from PIL import Image
-from typing import Union, BinaryIO
+from typing import Union, BinaryIO, Optional
 from .configuration import Configuration
 from .decorators import anywhere, ensure_client, retry_on_429
@@ -17,8 +17,9 @@ class Chunkr(ChunkrBase):
         self,
         file: Union[str, Path, BinaryIO, Image.Image],
         config: Configuration = None,
+        filename: Optional[str] = None,
     ) -> TaskResponse:
-        task = await self.create_task(file, config)
+        task = await self.create_task(file, config, filename)
         return await task.poll()
     @anywhere()
@@ -34,10 +35,12 @@ class Chunkr(ChunkrBase):
         self,
         file: Union[str, Path, BinaryIO, Image.Image],
         config: Configuration = None,
+        filename: Optional[str] = None,
     ) -> TaskResponse:
-        files = await prepare_upload_data(file, config, self._client)
+        """Create a new task with the given file and configuration."""
+        data = await prepare_upload_data(file, filename, config)
         r = await self._client.post(
-            f"{self.url}/api/v1/task", files=files, headers=self._headers()
+            f"{self.url}/api/v1/task/parse", json=data, headers=self._headers()
         )
         r.raise_for_status()
         return TaskResponse(**r.json()).with_client(self, True, False)
@@ -46,10 +49,11 @@ class Chunkr(ChunkrBase):
     @ensure_client()
     @retry_on_429()
     async def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
-        files = await prepare_upload_data(None, config, self._client)
+        """Update an existing task with new configuration."""
+        data = await prepare_upload_data(None, None, config)
         r = await self._client.patch(
-            f"{self.url}/api/v1/task/{task_id}",
-            files=files,
+            f"{self.url}/api/v1/task/{task_id}/parse",
+            json=data,
             headers=self._headers(),
         )
         r.raise_for_status()

{chunkr_ai-0.0.36 → chunkr_ai-0.0.38}/src/chunkr_ai/api/chunkr_base.py RENAMED Viewed

@@ -7,8 +7,7 @@ import httpx
 import os
 from pathlib import Path
 from PIL import Image
-from typing import BinaryIO, Union
+from typing import BinaryIO, Union, Optional
 class ChunkrBase(HeadersMixin):
     """Base class with shared functionality for Chunkr API clients.
@@ -20,7 +19,7 @@ class ChunkrBase(HeadersMixin):
     """
     def __init__(self, url: str = None, api_key: str = None, raise_on_failure: bool = False):
-        load_dotenv()
+        load_dotenv(override=True)
         self.url = url or os.getenv("CHUNKR_URL") or "https://api.chunkr.ai"
         self._api_key = api_key or os.getenv("CHUNKR_API_KEY")
         self.raise_on_failure = raise_on_failure
@@ -38,13 +37,15 @@ class ChunkrBase(HeadersMixin):
         self,
         file: Union[str, Path, BinaryIO, Image.Image],
         config: Configuration = None,
+        filename: Optional[str] = None,
     ) -> TaskResponse:
         """Upload a file and wait for processing to complete.
         Args:
             file: The file to upload.
             config: Configuration options for processing. Optional.
+            filename: The filename to use for the file. Optional.
         Examples:
         ```python
         # Upload from file path
@@ -58,7 +59,7 @@ class ChunkrBase(HeadersMixin):
         await chunkr.upload("https://example.com/document.pdf")
         # Upload from base64 string (must include MIME type header)
-        await chunkr.upload("data:application/pdf;base64,JVBERi0...")
+        await chunkr.upload("data:application/pdf;base64,JVBERi0...", filename="document.pdf")
         # Upload an image
         from PIL import Image
@@ -90,13 +91,14 @@ class ChunkrBase(HeadersMixin):
         self,
         file: Union[str, Path, BinaryIO, Image.Image],
         config: Configuration = None,
+        filename: Optional[str] = None,
     ) -> TaskResponse:
         """Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`.
         Args:
             file: The file to upload.
             config: Configuration options for processing. Optional.
+            filename: The filename to use for the file. Optional.
         Examples:
         ```
         # Upload from file path
@@ -110,7 +112,7 @@ class ChunkrBase(HeadersMixin):
         task = await chunkr.create_task("https://example.com/document.pdf")
         # Upload from base64 string (must include MIME type header)
-        task = await chunkr.create_task("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
+        task = await chunkr.create_task("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...", filename="document.pdf")
         # Upload an image
         from PIL import Image

{chunkr_ai-0.0.36 → chunkr_ai-0.0.38}/src/chunkr_ai/api/configuration.py RENAMED Viewed

@@ -19,20 +19,18 @@ class GenerationConfig(BaseModel):
 class SegmentProcessing(BaseModel):
     model_config = ConfigDict(populate_by_name=True, alias_generator=str.title)
-    title: Optional[GenerationConfig] = Field(default=None, alias="Title")
-    section_header: Optional[GenerationConfig] = Field(
-        default=None, alias="SectionHeader"
-    )
-    text: Optional[GenerationConfig] = Field(default=None, alias="Text")
-    list_item: Optional[GenerationConfig] = Field(default=None, alias="ListItem")
-    table: Optional[GenerationConfig] = Field(default=None, alias="Table")
-    picture: Optional[GenerationConfig] = Field(default=None, alias="Picture")
     caption: Optional[GenerationConfig] = Field(default=None, alias="Caption")
-    formula: Optional[GenerationConfig] = Field(default=None, alias="Formula")
     footnote: Optional[GenerationConfig] = Field(default=None, alias="Footnote")
-    page_header: Optional[GenerationConfig] = Field(default=None, alias="PageHeader")
-    page_footer: Optional[GenerationConfig] = Field(default=None, alias="PageFooter")
+    formula: Optional[GenerationConfig] = Field(default=None, alias="Formula")
+    list_item: Optional[GenerationConfig] = Field(default=None, alias="ListItem")
     page: Optional[GenerationConfig] = Field(default=None, alias="Page")
+    page_footer: Optional[GenerationConfig] = Field(default=None, alias="PageFooter")
+    page_header: Optional[GenerationConfig] = Field(default=None, alias="PageHeader")
+    picture: Optional[GenerationConfig] = Field(default=None, alias="Picture")
+    section_header: Optional[GenerationConfig] = Field(default=None, alias="SectionHeader")
+    table: Optional[GenerationConfig] = Field(default=None, alias="Table")
+    text: Optional[GenerationConfig] = Field(default=None, alias="Text")
+    title: Optional[GenerationConfig] = Field(default=None, alias="Title")
 class ChunkProcessing(BaseModel):
     ignore_headers_and_footers: Optional[bool] = None
@@ -84,11 +82,13 @@ class Segment(BaseModel):
     page_width: float
     segment_id: str
     segment_type: SegmentType
+    confidence: Optional[float]
 class Chunk(BaseModel):
     chunk_id: str
     chunk_length: int
     segments: List[Segment]
+    embed: Optional[str] = None
 class OutputResponse(BaseModel):
     chunks: List[Chunk]

chunkr_ai-0.0.38/src/chunkr_ai/api/misc.py ADDED Viewed

@@ -0,0 +1,103 @@
+from .configuration import Configuration
+import base64
+import io
+from pathlib import Path
+from PIL import Image
+from typing import Union, Tuple, BinaryIO, Optional
+async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[Optional[str], str]:
+    """Convert various file types into a tuple of (filename, file content).
+    Args:
+        file: Input file, can be:
+            - URL string starting with http:// or https://
+            - Base64 string
+            - Local file path (will be converted to base64)
+            - Opened binary file (will be converted to base64)
+            - PIL/Pillow Image object (will be converted to base64)
+    Returns:
+        Tuple[Optional[str], str]: (filename, content) where content is either a URL or base64 string
+        The filename may be None for URLs, base64 strings, and PIL Images
+    Raises:
+        FileNotFoundError: If the file path doesn't exist
+        TypeError: If the file type is not supported
+        ValueError: If the URL is invalid or unreachable
+        ValueError: If the MIME type is unsupported
+    """
+    # Handle strings
+    if isinstance(file, str):
+        if file.startswith(('http://', 'https://')):
+            return None, file
+        try:
+            base64.b64decode(file)
+            return None, file
+        except:
+            try:
+                file = Path(file)
+            except:
+                raise ValueError("File must be a valid path, URL, or base64 string")
+    # Handle file paths - convert to base64
+    if isinstance(file, Path):
+        path = Path(file).resolve()
+        if not path.exists():
+            raise FileNotFoundError(f"File not found: {file}")
+        with open(path, "rb") as f:
+            file_content = f.read()
+            file_ext = path.suffix.lower().lstrip('.')
+            if not file_ext:
+                raise ValueError("File must have an extension")
+            base64_str = base64.b64encode(file_content).decode()
+            return path.name, base64_str
+    # Handle PIL Images - convert to base64
+    if isinstance(file, Image.Image):
+        img_byte_arr = io.BytesIO()
+        format = file.format or "PNG"
+        file.save(img_byte_arr, format=format)
+        img_byte_arr.seek(0)
+        base64_str = base64.b64encode(img_byte_arr.getvalue()).decode()
+        return None, base64_str
+    # Handle file-like objects - convert to base64
+    if hasattr(file, "read") and hasattr(file, "seek"):
+        file.seek(0)
+        file_content = file.read()
+        name = getattr(file, "name", "document")
+        file_ext = Path(name).suffix.lower().lstrip('.')
+        if not file_ext:
+            raise ValueError("File must have an extension")
+        base64_str = base64.b64encode(file_content).decode()
+        return Path(name).name, base64_str
+    raise TypeError(f"Unsupported file type: {type(file)}")
+async def prepare_upload_data(
+    file: Optional[Union[str, Path, BinaryIO, Image.Image]] = None,
+    filename: Optional[str] = None,
+    config: Optional[Configuration] = None,
+) -> dict:
+    """Prepare data dictionary for upload.
+    Args:
+        file: The file to upload
+        config: Optional configuration settings
+        client: HTTP client for downloading remote files
+    Returns:
+        dict: JSON-serializable data dictionary ready for upload
+    """
+    data = {}
+    if file:
+        processed_filename, processed_file = await prepare_file(file)
+        data["file"] = processed_file
+        data["file_name"] = filename or processed_filename
+    if config:
+        data.update(config.model_dump(mode="json", exclude_none=True))
+    return data

{chunkr_ai-0.0.36 → chunkr_ai-0.0.38}/src/chunkr_ai/api/task_response.py RENAMED Viewed

@@ -74,9 +74,11 @@ class TaskResponse(BaseModel, Generic[T]):
     @retry_on_429()
     async def update(self, config: Configuration) -> T:
         """Update the task configuration."""
-        f = await prepare_upload_data(None, config, self._client._client)
+        data = await prepare_upload_data(None, None, config)
         r = await self._client._client.patch(
-            self.task_url, files=f, headers=self._client._headers()
+            f"{self.task_url}/parse",
+            json=data,
+            headers=self._client._headers()
         )
         r.raise_for_status()
         updated = TaskResponse(**r.json()).with_client(self._client)
@@ -142,7 +144,7 @@ class TaskResponse(BaseModel, Generic[T]):
         Args:
             output_file (str, optional): Path to save the markdown content. Defaults to None.
         """
-        content = self._get_content("markdown")
+        content = self._get_content("markdown", separator="\n\n")
         self._write_to_file(content, output_file)
         return content
@@ -166,7 +168,7 @@ class TaskResponse(BaseModel, Generic[T]):
         self._write_to_file(data, output_file, is_json=True)
         return data
-    def _get_content(self, t: str) -> str:
+    def _get_content(self, t: str, separator: str = "\n") -> str:
         if not self.output:
             return ""
         parts = []
@@ -175,4 +177,4 @@ class TaskResponse(BaseModel, Generic[T]):
                 v = getattr(s, t)
                 if v:
                     parts.append(v)
-        return "\n".join(parts)
+        return separator.join(parts)

{chunkr_ai-0.0.36 → chunkr_ai-0.0.38/src/chunkr_ai.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: chunkr-ai
-Version: 0.0.36
+Version: 0.0.38
 Summary: Python client for Chunkr: open source document intelligence
 Author-email: Ishaan Kapoor <ishaan@lumina.sh>
 License: MIT License

{chunkr_ai-0.0.36 → chunkr_ai-0.0.38}/tests/test_chunkr.py RENAMED Viewed

@@ -2,6 +2,7 @@ import pytest
 from pathlib import Path
 from PIL import Image
 import asyncio
+import base64
 from chunkr_ai import Chunkr
 from chunkr_ai.models import (
@@ -68,6 +69,7 @@ async def test_send_pil_image(client, sample_image):
     assert response.task_id is not None
     assert response.status == "Succeeded"
     assert response.output is not None
+    assert response.output is not None
 @pytest.mark.asyncio
 async def test_ocr_auto(client, sample_path):
@@ -222,15 +224,41 @@ async def test_task_operations_after_client_close(client, sample_path):
     result = await task.poll()
     assert result.status == "Succeeded"
+@pytest.mark.asyncio
+async def test_send_base64_file(client, sample_path):
+    # Read file and convert to base64
+    with open(sample_path, "rb") as f:
+        base64_content = base64.b64encode(f.read()).decode('utf-8')
+    response = await client.upload(base64_content)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+@pytest.mark.asyncio
+async def test_send_base64_file_with_filename(client, sample_path):
+    # Read file and convert to base64
+    with open(sample_path, "rb") as f:
+        base64_content = base64.b64encode(f.read()).decode('utf-8')
+    response = await client.upload(base64_content, filename="test.pdf")
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
 @pytest.mark.asyncio
 async def test_output_files_no_dir(client, sample_path, tmp_path):
-    await client.upload(sample_path)
+    task = await client.upload(sample_path)
     html_file = tmp_path / "output.html"
     md_file = tmp_path / "output.md"
     content_file = tmp_path / "output.txt"
     json_file = tmp_path / "output.json"
+    task.html(html_file)
+    task.markdown(md_file)
+    task.content(content_file)
+    task.json(json_file)
     assert html_file.exists()
     assert md_file.exists()
     assert content_file.exists()
@@ -238,13 +266,18 @@ async def test_output_files_no_dir(client, sample_path, tmp_path):
 @pytest.mark.asyncio
 async def test_output_files_with_dirs(client, sample_path, tmp_path):
-    await client.upload(sample_path)
+    task = await client.upload(sample_path)
     nested_dir = tmp_path / "nested" / "output" / "dir"
     html_file = nested_dir / "output.html"
     md_file = nested_dir / "output.md"
     content_file = nested_dir / "output.txt"
     json_file = nested_dir / "output.json"
+    task.html(html_file)
+    task.markdown(md_file)
+    task.content(content_file)
+    task.json(json_file)
     assert html_file.exists()
     assert md_file.exists()

chunkr_ai-0.0.36/src/chunkr_ai/api/misc.py DELETED Viewed

@@ -1,155 +0,0 @@
-from .configuration import Configuration
-import io
-import json
-from pathlib import Path
-from PIL import Image
-import httpx
-from typing import Union, Tuple, BinaryIO, Optional
-async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image], client: httpx.AsyncClient = None) -> Tuple[str, BinaryIO]:
-    """Convert various file types into a tuple of (filename, file-like object).
-        Args:
-            file: Input file, can be:
-                - String or Path to a file
-                - URL string starting with http:// or https://
-                - Base64 string
-                - Opened binary file (mode='rb')
-                - PIL/Pillow Image object
-        Returns:
-            Tuple[str, BinaryIO]: (filename, file-like object) ready for upload
-        Raises:
-            FileNotFoundError: If the file path doesn't exist
-            TypeError: If the file type is not supported
-            ValueError: If the URL is invalid or unreachable
-            ValueError: If the MIME type is unsupported
-    """
-    # Handle URLs
-    if isinstance(file, str) and (
-        file.startswith("http://") or file.startswith("https://")
-    ):
-        if not client:
-            raise ValueError("Client must be provided to download files from URLs")
-        response = await client.get(file)
-        response.raise_for_status()
-        # Try to get filename from Content-Disposition header first
-        filename = None
-        content_disposition = response.headers.get("Content-Disposition")
-        if content_disposition and "filename=" in content_disposition:
-            filename = content_disposition.split("filename=")[-1].strip("\"'")
-        # If no Content-Disposition, try to get clean filename from URL path
-        if not filename:
-            from urllib.parse import urlparse, unquote
-            parsed_url = urlparse(file)
-            path = unquote(parsed_url.path)
-            filename = Path(path).name if path else None
-        # Fallback to default name if we couldn't extract one
-        filename = filename or "downloaded_file"
-        # Sanitize filename: remove invalid characters and limit length
-        import re
-        filename = re.sub(
-            r'[<>:"/\\|?*%]', "_", filename
-        )  # Replace invalid chars with underscore
-        filename = re.sub(r"\s+", "_", filename)  # Replace whitespace with underscore
-        filename = filename.strip("._")  # Remove leading/trailing dots and underscores
-        filename = filename[:255]  # Limit length to 255 characters
-        file_obj = io.BytesIO(response.content)
-        return filename, file_obj
-    # Handle base64 strings
-    if isinstance(file, str) and "," in file and ";base64," in file:
-        try:
-            # Split header and data
-            header, base64_data = file.split(",", 1)
-            import base64
-            file_bytes = base64.b64decode(base64_data)
-            file_obj = io.BytesIO(file_bytes)
-            # Try to determine format from header
-            format = "bin"
-            mime_type = header.split(":")[-1].split(";")[0].lower()
-            # Map MIME types to file extensions
-            mime_to_ext = {
-                "application/pdf": "pdf",
-                "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
-                "application/msword": "doc",
-                "application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
-                "application/vnd.ms-powerpoint": "ppt",
-                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
-                "application/vnd.ms-excel": "xls",
-                "image/jpeg": "jpg",
-                "image/png": "png",
-                "image/jpg": "jpg",
-            }
-            if mime_type in mime_to_ext:
-                format = mime_to_ext[mime_type]
-            else:
-                raise ValueError(f"Unsupported MIME type: {mime_type}")
-            return f"file.{format}", file_obj
-        except Exception as e:
-            raise ValueError(f"Invalid base64 string: {str(e)}")
-    # Handle file paths
-    if isinstance(file, (str, Path)):
-        path = Path(file).resolve()
-        if not path.exists():
-            raise FileNotFoundError(f"File not found: {file}")
-        return path.name, open(path, "rb")
-    # Handle PIL Images
-    if isinstance(file, Image.Image):
-        img_byte_arr = io.BytesIO()
-        format = file.format or "PNG"
-        file.save(img_byte_arr, format=format)
-        img_byte_arr.seek(0)
-        return f"image.{format.lower()}", img_byte_arr
-    # Handle file-like objects
-    if hasattr(file, "read") and hasattr(file, "seek"):
-        # Try to get the filename from the file object if possible
-        name = (
-            getattr(file, "name", "document") if hasattr(file, "name") else "document"
-        )
-        return Path(name).name, file
-    raise TypeError(f"Unsupported file type: {type(file)}")
-async def prepare_upload_data(
-    file: Optional[Union[str, Path, BinaryIO, Image.Image]] = None,
-    config: Optional[Configuration] = None,
-    client: httpx.AsyncClient = None,
-) -> dict:
-    """Prepare files and data dictionaries for upload.
-    Args:
-        file: The file to upload
-        config: Optional configuration settings
-    Returns:
-        dict: (files dict) ready for upload
-    """
-    files = {}
-    if file:
-        filename, file_obj = await prepare_file(file, client)
-        files = {"file": (filename, file_obj)}
-    if config:
-        config_dict = config.model_dump(mode="json", exclude_none=True)
-        for key, value in config_dict.items():
-            files[key] = (None, json.dumps(value), "application/json")
-    return files