PyPI - chunkr-ai - Version diff - 0.0.35.tar.gz → 0.0.37.tar.gz - Mend

chunkr-ai 0.0.35.tar.gz → 0.0.37.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

{chunkr_ai-0.0.35/src/chunkr_ai.egg-info → chunkr_ai-0.0.37}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: chunkr-ai
-Version: 0.0.35
+Version: 0.0.37
 Summary: Python client for Chunkr: open source document intelligence
 Author-email: Ishaan Kapoor <ishaan@lumina.sh>
 License: MIT License

{chunkr_ai-0.0.35 → chunkr_ai-0.0.37}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "chunkr-ai"
-version = "0.0.35"
+version = "0.0.37"
 authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
 description = "Python client for Chunkr: open source document intelligence"
 readme = "README.md"

{chunkr_ai-0.0.35 → chunkr_ai-0.0.37}/src/chunkr_ai/api/chunkr.py RENAMED Viewed

@@ -1,6 +1,6 @@
 from pathlib import Path
 from PIL import Image
-from typing import Union, BinaryIO
+from typing import Union, BinaryIO, Optional
 from .configuration import Configuration
 from .decorators import anywhere, ensure_client, retry_on_429
@@ -17,8 +17,9 @@ class Chunkr(ChunkrBase):
         self,
         file: Union[str, Path, BinaryIO, Image.Image],
         config: Configuration = None,
+        filename: Optional[str] = None,
     ) -> TaskResponse:
-        task = await self.create_task(file, config)
+        task = await self.create_task(file, config, filename)
         return await task.poll()
     @anywhere()
@@ -34,10 +35,12 @@ class Chunkr(ChunkrBase):
         self,
         file: Union[str, Path, BinaryIO, Image.Image],
         config: Configuration = None,
+        filename: Optional[str] = None,
     ) -> TaskResponse:
-        files = await prepare_upload_data(file, config, self._client)
+        """Create a new task with the given file and configuration."""
+        data = await prepare_upload_data(file, filename, config)
         r = await self._client.post(
-            f"{self.url}/api/v1/task", files=files, headers=self._headers()
+            f"{self.url}/api/v1/task/parse", json=data, headers=self._headers()
         )
         r.raise_for_status()
         return TaskResponse(**r.json()).with_client(self, True, False)
@@ -46,10 +49,11 @@ class Chunkr(ChunkrBase):
     @ensure_client()
     @retry_on_429()
     async def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
-        files = await prepare_upload_data(None, config, self._client)
+        """Update an existing task with new configuration."""
+        data = await prepare_upload_data(None, None, config)
         r = await self._client.patch(
-            f"{self.url}/api/v1/task/{task_id}",
-            files=files,
+            f"{self.url}/api/v1/task/{task_id}/parse",
+            json=data,
             headers=self._headers(),
         )
         r.raise_for_status()

{chunkr_ai-0.0.35 → chunkr_ai-0.0.37}/src/chunkr_ai/api/chunkr_base.py RENAMED Viewed

@@ -7,8 +7,7 @@ import httpx
 import os
 from pathlib import Path
 from PIL import Image
-from typing import BinaryIO, Union
+from typing import BinaryIO, Union, Optional
 class ChunkrBase(HeadersMixin):
     """Base class with shared functionality for Chunkr API clients.
@@ -20,7 +19,7 @@ class ChunkrBase(HeadersMixin):
     """
     def __init__(self, url: str = None, api_key: str = None, raise_on_failure: bool = False):
-        load_dotenv()
+        load_dotenv(override=True)
         self.url = url or os.getenv("CHUNKR_URL") or "https://api.chunkr.ai"
         self._api_key = api_key or os.getenv("CHUNKR_API_KEY")
         self.raise_on_failure = raise_on_failure
@@ -38,13 +37,15 @@ class ChunkrBase(HeadersMixin):
         self,
         file: Union[str, Path, BinaryIO, Image.Image],
         config: Configuration = None,
+        filename: Optional[str] = None,
     ) -> TaskResponse:
         """Upload a file and wait for processing to complete.
         Args:
             file: The file to upload.
             config: Configuration options for processing. Optional.
+            filename: The filename to use for the file. Optional.
         Examples:
         ```python
         # Upload from file path
@@ -58,7 +59,7 @@ class ChunkrBase(HeadersMixin):
         await chunkr.upload("https://example.com/document.pdf")
         # Upload from base64 string (must include MIME type header)
-        await chunkr.upload("data:application/pdf;base64,JVBERi0...")
+        await chunkr.upload("data:application/pdf;base64,JVBERi0...", filename="document.pdf")
         # Upload an image
         from PIL import Image
@@ -90,13 +91,14 @@ class ChunkrBase(HeadersMixin):
         self,
         file: Union[str, Path, BinaryIO, Image.Image],
         config: Configuration = None,
+        filename: Optional[str] = None,
     ) -> TaskResponse:
         """Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`.
         Args:
             file: The file to upload.
             config: Configuration options for processing. Optional.
+            filename: The filename to use for the file. Optional.
         Examples:
         ```
         # Upload from file path
@@ -110,7 +112,7 @@ class ChunkrBase(HeadersMixin):
         task = await chunkr.create_task("https://example.com/document.pdf")
         # Upload from base64 string (must include MIME type header)
-        task = await chunkr.create_task("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
+        task = await chunkr.create_task("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...", filename="document.pdf")
         # Upload an image
         from PIL import Image

chunkr_ai-0.0.37/src/chunkr_ai/api/misc.py ADDED Viewed

@@ -0,0 +1,103 @@
+from .configuration import Configuration
+import base64
+import io
+from pathlib import Path
+from PIL import Image
+from typing import Union, Tuple, BinaryIO, Optional
+async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[Optional[str], str]:
+    """Convert various file types into a tuple of (filename, file content).
+    Args:
+        file: Input file, can be:
+            - URL string starting with http:// or https://
+            - Base64 string
+            - Local file path (will be converted to base64)
+            - Opened binary file (will be converted to base64)
+            - PIL/Pillow Image object (will be converted to base64)
+    Returns:
+        Tuple[Optional[str], str]: (filename, content) where content is either a URL or base64 string
+        The filename may be None for URLs, base64 strings, and PIL Images
+    Raises:
+        FileNotFoundError: If the file path doesn't exist
+        TypeError: If the file type is not supported
+        ValueError: If the URL is invalid or unreachable
+        ValueError: If the MIME type is unsupported
+    """
+    # Handle strings
+    if isinstance(file, str):
+        if file.startswith(('http://', 'https://')):
+            return None, file
+        try:
+            base64.b64decode(file)
+            return None, file
+        except:
+            try:
+                file = Path(file)
+            except:
+                raise ValueError("File must be a valid path, URL, or base64 string")
+    # Handle file paths - convert to base64
+    if isinstance(file, Path):
+        path = Path(file).resolve()
+        if not path.exists():
+            raise FileNotFoundError(f"File not found: {file}")
+        with open(path, "rb") as f:
+            file_content = f.read()
+            file_ext = path.suffix.lower().lstrip('.')
+            if not file_ext:
+                raise ValueError("File must have an extension")
+            base64_str = base64.b64encode(file_content).decode()
+            return path.name, base64_str
+    # Handle PIL Images - convert to base64
+    if isinstance(file, Image.Image):
+        img_byte_arr = io.BytesIO()
+        format = file.format or "PNG"
+        file.save(img_byte_arr, format=format)
+        img_byte_arr.seek(0)
+        base64_str = base64.b64encode(img_byte_arr.getvalue()).decode()
+        return None, base64_str
+    # Handle file-like objects - convert to base64
+    if hasattr(file, "read") and hasattr(file, "seek"):
+        file.seek(0)
+        file_content = file.read()
+        name = getattr(file, "name", "document")
+        file_ext = Path(name).suffix.lower().lstrip('.')
+        if not file_ext:
+            raise ValueError("File must have an extension")
+        base64_str = base64.b64encode(file_content).decode()
+        return Path(name).name, base64_str
+    raise TypeError(f"Unsupported file type: {type(file)}")
+async def prepare_upload_data(
+    file: Optional[Union[str, Path, BinaryIO, Image.Image]] = None,
+    filename: Optional[str] = None,
+    config: Optional[Configuration] = None,
+) -> dict:
+    """Prepare data dictionary for upload.
+    Args:
+        file: The file to upload
+        config: Optional configuration settings
+        client: HTTP client for downloading remote files
+    Returns:
+        dict: JSON-serializable data dictionary ready for upload
+    """
+    data = {}
+    if file:
+        processed_filename, processed_file = await prepare_file(file)
+        data["file"] = processed_file
+        data["file_name"] = filename or processed_filename
+    if config:
+        data.update(config.model_dump(mode="json", exclude_none=True))
+    return data

{chunkr_ai-0.0.35 → chunkr_ai-0.0.37}/src/chunkr_ai/api/task_response.py RENAMED Viewed

@@ -74,9 +74,11 @@ class TaskResponse(BaseModel, Generic[T]):
     @retry_on_429()
     async def update(self, config: Configuration) -> T:
         """Update the task configuration."""
-        f = await prepare_upload_data(None, config, self._client._client)
+        data = await prepare_upload_data(None, None, config)
         r = await self._client._client.patch(
-            self.task_url, files=f, headers=self._client._headers()
+            f"{self.task_url}/parse",
+            json=data,
+            headers=self._client._headers()
         )
         r.raise_for_status()
         updated = TaskResponse(**r.json()).with_client(self._client)
@@ -103,6 +105,29 @@ class TaskResponse(BaseModel, Generic[T]):
         r.raise_for_status()
         return await self.poll()
+    def _write_to_file(self, content: str | dict, output_file: str, is_json: bool = False) -> None:
+        """Helper method to write content to a file
+        Args:
+            content: Content to write (string or dict for JSON)
+            output_file: Path to save the content
+            is_json: Whether the content should be written as JSON
+        """
+        class DateTimeEncoder(json.JSONEncoder):
+            def default(self, obj):
+                if isinstance(obj, datetime):
+                    return obj.isoformat()
+                return super().default(obj)
+        if output_file:
+            directory = os.path.dirname(output_file)
+            if directory:
+                os.makedirs(directory, exist_ok=True)
+            with open(output_file, "w", encoding="utf-8") as f:
+                if is_json:
+                    json.dump(content, f, cls=DateTimeEncoder, indent=2)
+                else:
+                    f.write(content)
     def html(self, output_file: str = None) -> str:
         """Get the full HTML of the task
@@ -110,10 +135,7 @@ class TaskResponse(BaseModel, Generic[T]):
             output_file (str, optional): Path to save the HTML content. Defaults to None.
         """
         content = self._get_content("html")
-        if output_file:
-            os.makedirs(os.path.dirname(output_file), exist_ok=True)
-            with open(output_file, "w", encoding="utf-8") as f:
-                f.write(content)
+        self._write_to_file(content, output_file)
         return content
     def markdown(self, output_file: str = None) -> str:
@@ -123,10 +145,7 @@ class TaskResponse(BaseModel, Generic[T]):
             output_file (str, optional): Path to save the markdown content. Defaults to None.
         """
         content = self._get_content("markdown")
-        if output_file:
-            os.makedirs(os.path.dirname(output_file), exist_ok=True)
-            with open(output_file, "w", encoding="utf-8") as f:
-                f.write(content)
+        self._write_to_file(content, output_file)
         return content
     def content(self, output_file: str = None) -> str:
@@ -136,10 +155,7 @@ class TaskResponse(BaseModel, Generic[T]):
             output_file (str, optional): Path to save the content. Defaults to None.
         """
         content = self._get_content("content")
-        if output_file:
-            os.makedirs(os.path.dirname(output_file), exist_ok=True)
-            with open(output_file, "w", encoding="utf-8") as f:
-                f.write(content)
+        self._write_to_file(content, output_file)
         return content
     def json(self, output_file: str = None) -> dict:
@@ -148,17 +164,8 @@ class TaskResponse(BaseModel, Generic[T]):
         Args:
             output_file (str, optional): Path to save the task data as JSON. Defaults to None.
         """
-        class DateTimeEncoder(json.JSONEncoder):
-            def default(self, obj):
-                if isinstance(obj, datetime):
-                    return obj.isoformat()
-                return super().default(obj)
         data = self.model_dump()
-        if output_file:
-            os.makedirs(os.path.dirname(output_file), exist_ok=True)
-            with open(output_file, "w", encoding="utf-8") as f:
-                json.dump(data, f, cls=DateTimeEncoder, indent=2)
+        self._write_to_file(data, output_file, is_json=True)
         return data
     def _get_content(self, t: str) -> str:

{chunkr_ai-0.0.35 → chunkr_ai-0.0.37/src/chunkr_ai.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: chunkr-ai
-Version: 0.0.35
+Version: 0.0.37
 Summary: Python client for Chunkr: open source document intelligence
 Author-email: Ishaan Kapoor <ishaan@lumina.sh>
 License: MIT License

{chunkr_ai-0.0.35 → chunkr_ai-0.0.37}/tests/test_chunkr.py RENAMED Viewed

@@ -2,6 +2,7 @@ import pytest
 from pathlib import Path
 from PIL import Image
 import asyncio
+import base64
 from chunkr_ai import Chunkr
 from chunkr_ai.models import (
@@ -67,6 +68,8 @@ async def test_send_pil_image(client, sample_image):
     response = await client.upload(sample_image)
     assert response.task_id is not None
     assert response.status == "Succeeded"
+    assert response.output is not None
+    assert response.output is not None
 @pytest.mark.asyncio
 async def test_ocr_auto(client, sample_path):
@@ -220,3 +223,53 @@ async def test_task_operations_after_client_close(client, sample_path):
     await client.close()
     result = await task.poll()
     assert result.status == "Succeeded"
+@pytest.mark.asyncio
+async def test_send_base64_file(client, sample_path):
+    # Read file and convert to base64
+    with open(sample_path, "rb") as f:
+        base64_content = base64.b64encode(f.read()).decode('utf-8')
+    response = await client.upload(base64_content)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+@pytest.mark.asyncio
+async def test_send_base64_file_with_filename(client, sample_path):
+    # Read file and convert to base64
+    with open(sample_path, "rb") as f:
+        base64_content = base64.b64encode(f.read()).decode('utf-8')
+    response = await client.upload(base64_content, filename="test.pdf")
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+@pytest.mark.asyncio
+async def test_output_files_no_dir(client, sample_path, tmp_path):
+    await client.upload(sample_path)
+    html_file = tmp_path / "output.html"
+    md_file = tmp_path / "output.md"
+    content_file = tmp_path / "output.txt"
+    json_file = tmp_path / "output.json"
+    assert html_file.exists()
+    assert md_file.exists()
+    assert content_file.exists()
+    assert json_file.exists()
+@pytest.mark.asyncio
+async def test_output_files_with_dirs(client, sample_path, tmp_path):
+    await client.upload(sample_path)
+    nested_dir = tmp_path / "nested" / "output" / "dir"
+    html_file = nested_dir / "output.html"
+    md_file = nested_dir / "output.md"
+    content_file = nested_dir / "output.txt"
+    json_file = nested_dir / "output.json"
+    assert html_file.exists()
+    assert md_file.exists()
+    assert content_file.exists()
+    assert json_file.exists()

chunkr_ai-0.0.35/src/chunkr_ai/api/misc.py DELETED Viewed

@@ -1,155 +0,0 @@
-from .configuration import Configuration
-import io
-import json
-from pathlib import Path
-from PIL import Image
-import httpx
-from typing import Union, Tuple, BinaryIO, Optional
-async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image], client: httpx.AsyncClient = None) -> Tuple[str, BinaryIO]:
-    """Convert various file types into a tuple of (filename, file-like object).
-        Args:
-            file: Input file, can be:
-                - String or Path to a file
-                - URL string starting with http:// or https://
-                - Base64 string
-                - Opened binary file (mode='rb')
-                - PIL/Pillow Image object
-        Returns:
-            Tuple[str, BinaryIO]: (filename, file-like object) ready for upload
-        Raises:
-            FileNotFoundError: If the file path doesn't exist
-            TypeError: If the file type is not supported
-            ValueError: If the URL is invalid or unreachable
-            ValueError: If the MIME type is unsupported
-    """
-    # Handle URLs
-    if isinstance(file, str) and (
-        file.startswith("http://") or file.startswith("https://")
-    ):
-        if not client:
-            raise ValueError("Client must be provided to download files from URLs")
-        response = await client.get(file)
-        response.raise_for_status()
-        # Try to get filename from Content-Disposition header first
-        filename = None
-        content_disposition = response.headers.get("Content-Disposition")
-        if content_disposition and "filename=" in content_disposition:
-            filename = content_disposition.split("filename=")[-1].strip("\"'")
-        # If no Content-Disposition, try to get clean filename from URL path
-        if not filename:
-            from urllib.parse import urlparse, unquote
-            parsed_url = urlparse(file)
-            path = unquote(parsed_url.path)
-            filename = Path(path).name if path else None
-        # Fallback to default name if we couldn't extract one
-        filename = filename or "downloaded_file"
-        # Sanitize filename: remove invalid characters and limit length
-        import re
-        filename = re.sub(
-            r'[<>:"/\\|?*%]', "_", filename
-        )  # Replace invalid chars with underscore
-        filename = re.sub(r"\s+", "_", filename)  # Replace whitespace with underscore
-        filename = filename.strip("._")  # Remove leading/trailing dots and underscores
-        filename = filename[:255]  # Limit length to 255 characters
-        file_obj = io.BytesIO(response.content)
-        return filename, file_obj
-    # Handle base64 strings
-    if isinstance(file, str) and "," in file and ";base64," in file:
-        try:
-            # Split header and data
-            header, base64_data = file.split(",", 1)
-            import base64
-            file_bytes = base64.b64decode(base64_data)
-            file_obj = io.BytesIO(file_bytes)
-            # Try to determine format from header
-            format = "bin"
-            mime_type = header.split(":")[-1].split(";")[0].lower()
-            # Map MIME types to file extensions
-            mime_to_ext = {
-                "application/pdf": "pdf",
-                "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
-                "application/msword": "doc",
-                "application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
-                "application/vnd.ms-powerpoint": "ppt",
-                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
-                "application/vnd.ms-excel": "xls",
-                "image/jpeg": "jpg",
-                "image/png": "png",
-                "image/jpg": "jpg",
-            }
-            if mime_type in mime_to_ext:
-                format = mime_to_ext[mime_type]
-            else:
-                raise ValueError(f"Unsupported MIME type: {mime_type}")
-            return f"file.{format}", file_obj
-        except Exception as e:
-            raise ValueError(f"Invalid base64 string: {str(e)}")
-    # Handle file paths
-    if isinstance(file, (str, Path)):
-        path = Path(file).resolve()
-        if not path.exists():
-            raise FileNotFoundError(f"File not found: {file}")
-        return path.name, open(path, "rb")
-    # Handle PIL Images
-    if isinstance(file, Image.Image):
-        img_byte_arr = io.BytesIO()
-        format = file.format or "PNG"
-        file.save(img_byte_arr, format=format)
-        img_byte_arr.seek(0)
-        return f"image.{format.lower()}", img_byte_arr
-    # Handle file-like objects
-    if hasattr(file, "read") and hasattr(file, "seek"):
-        # Try to get the filename from the file object if possible
-        name = (
-            getattr(file, "name", "document") if hasattr(file, "name") else "document"
-        )
-        return Path(name).name, file
-    raise TypeError(f"Unsupported file type: {type(file)}")
-async def prepare_upload_data(
-    file: Optional[Union[str, Path, BinaryIO, Image.Image]] = None,
-    config: Optional[Configuration] = None,
-    client: httpx.AsyncClient = None,
-) -> dict:
-    """Prepare files and data dictionaries for upload.
-    Args:
-        file: The file to upload
-        config: Optional configuration settings
-    Returns:
-        dict: (files dict) ready for upload
-    """
-    files = {}
-    if file:
-        filename, file_obj = await prepare_file(file, client)
-        files = {"file": (filename, file_obj)}
-    if config:
-        config_dict = config.model_dump(mode="json", exclude_none=True)
-        for key, value in config_dict.items():
-            files[key] = (None, json.dumps(value), "application/json")
-    return files