PyPI - chunkr-ai - Versions diffs - 0.0.46__tar.gz → 0.0.48__tar.gz - Mend

chunkr-ai 0.0.46.tar.gz → 0.0.48.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

{chunkr_ai-0.0.46/src/chunkr_ai.egg-info → chunkr_ai-0.0.48}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: chunkr-ai
-Version: 0.0.46
+Version: 0.0.48
 Summary: Python client for Chunkr: open source document intelligence
 Author-email: Ishaan Kapoor <ishaan@lumina.sh>
 License: MIT License

{chunkr_ai-0.0.46 → chunkr_ai-0.0.48}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "chunkr-ai"
-version = "0.0.46"
+version = "0.0.48"
 authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
 description = "Python client for Chunkr: open source document intelligence"
 readme = "README.md"

{chunkr_ai-0.0.46 → chunkr_ai-0.0.48}/src/chunkr_ai/api/auth.py RENAMED Viewed

@@ -1,5 +1,6 @@
 class HeadersMixin:
     """Mixin class for handling authorization headers"""
+    _api_key: str = ""
     def get_api_key(self) -> str:
         """Get the API key"""

{chunkr_ai-0.0.46 → chunkr_ai-0.0.48}/src/chunkr_ai/api/chunkr.py RENAMED Viewed

@@ -1,12 +1,13 @@
 from pathlib import Path
 from PIL import Image
-from typing import Union, BinaryIO, Optional
+from typing import Union, BinaryIO, Optional, cast, Awaitable
 from .configuration import Configuration
 from .decorators import anywhere, ensure_client, retry_on_429
 from .misc import prepare_upload_data
 from .task_response import TaskResponse
 from .chunkr_base import ChunkrBase
+from .protocol import ChunkrClientProtocol
 class Chunkr(ChunkrBase):
     """Chunkr API client that works in both sync and async contexts"""
@@ -15,49 +16,51 @@ class Chunkr(ChunkrBase):
     @ensure_client()
     async def upload(
         self,
-        file: Union[str, Path, BinaryIO, Image.Image],
-        config: Configuration = None,
+        file: Union[str, Path, BinaryIO, Image.Image, bytes, bytearray, memoryview],
+        config: Optional[Configuration] = None,
         filename: Optional[str] = None,
     ) -> TaskResponse:
-        task = await self.create_task(file, config, filename)
-        return await task.poll()
+        task = await cast(Awaitable[TaskResponse], self.create_task(file, config, filename))
+        return await cast(Awaitable[TaskResponse], task.poll())
     @anywhere()
     @ensure_client()
     async def update(self, task_id: str, config: Configuration) -> TaskResponse:
-        task = await self.update_task(task_id, config)
-        return await task.poll()
+        task = await cast(Awaitable[TaskResponse], self.update_task(task_id, config))
+        return await cast(Awaitable[TaskResponse], task.poll())
     @anywhere()
     @ensure_client()
     @retry_on_429()
     async def create_task(
         self,
-        file: Union[str, Path, BinaryIO, Image.Image],
-        config: Configuration = None,
+        file: Union[str, Path, BinaryIO, Image.Image, bytes, bytearray, memoryview],
+        config: Optional[Configuration] = None,
         filename: Optional[str] = None,
     ) -> TaskResponse:
         """Create a new task with the given file and configuration."""
         data = await prepare_upload_data(file, filename, config)
+        assert self._client is not None
         r = await self._client.post(
             f"{self.url}/api/v1/task/parse", json=data, headers=self._headers()
         )
         r.raise_for_status()
-        return TaskResponse(**r.json()).with_client(self, True, False)
+        return TaskResponse(**r.json()).with_client(cast(ChunkrClientProtocol, self), True, False)
     @anywhere()
     @ensure_client()
     @retry_on_429()
-    async def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
+    async def update_task(self, task_id: str, config: Optional[Configuration] = None) -> TaskResponse:
         """Update an existing task with new configuration."""
         data = await prepare_upload_data(None, None, config)
+        assert self._client is not None
         r = await self._client.patch(
             f"{self.url}/api/v1/task/{task_id}/parse",
             json=data,
             headers=self._headers(),
         )
         r.raise_for_status()
-        return TaskResponse(**r.json()).with_client(self, True, False)
+        return TaskResponse(**r.json()).with_client(cast(ChunkrClientProtocol, self), True, False)
     @anywhere()
     @ensure_client()
@@ -66,17 +69,19 @@ class Chunkr(ChunkrBase):
             "base64_urls": str(base64_urls).lower(),
             "include_chunks": str(include_chunks).lower()
         }
+        assert self._client is not None
         r = await self._client.get(
             f"{self.url}/api/v1/task/{task_id}",
             params=params,
             headers=self._headers()
         )
         r.raise_for_status()
-        return TaskResponse(**r.json()).with_client(self, include_chunks, base64_urls)
+        return TaskResponse(**r.json()).with_client(cast(ChunkrClientProtocol, self), include_chunks, base64_urls)
     @anywhere()
     @ensure_client()
     async def delete_task(self, task_id: str) -> None:
+        assert self._client is not None
         r = await self._client.delete(
             f"{self.url}/api/v1/task/{task_id}", headers=self._headers()
         )
@@ -85,6 +90,7 @@ class Chunkr(ChunkrBase):
     @anywhere()
     @ensure_client()
     async def cancel_task(self, task_id: str) -> None:
+        assert self._client is not None
         r = await self._client.get(
             f"{self.url}/api/v1/task/{task_id}/cancel", headers=self._headers()
         )

{chunkr_ai-0.0.46 → chunkr_ai-0.0.48}/src/chunkr_ai/api/chunkr_base.py RENAMED Viewed

@@ -18,17 +18,23 @@ class ChunkrBase(HeadersMixin):
         raise_on_failure: Whether to raise an exception if the task fails. Defaults to False.
     """
-    def __init__(self, url: str = None, api_key: str = None, raise_on_failure: bool = False):
+    url: str
+    _api_key: str
+    raise_on_failure: bool
+    _client: Optional[httpx.AsyncClient]
+    def __init__(self, url: Optional[str] = None, api_key: Optional[str] = None, raise_on_failure: bool = False):
         load_dotenv(override=True)
         self.url = url or os.getenv("CHUNKR_URL") or "https://api.chunkr.ai"
-        self._api_key = api_key or os.getenv("CHUNKR_API_KEY")
+        _api_key = api_key or os.getenv("CHUNKR_API_KEY")
         self.raise_on_failure = raise_on_failure
-        if not self._api_key:
+        if not _api_key:
             raise ValueError(
                 "API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai"
             )
+        self._api_key = _api_key
         self.url = self.url.rstrip("/")
         self._client = httpx.AsyncClient()
@@ -36,7 +42,7 @@ class ChunkrBase(HeadersMixin):
     def upload(
         self,
         file: Union[str, Path, BinaryIO, Image.Image],
-        config: Configuration = None,
+        config: Optional[Configuration] = None,
         filename: Optional[str] = None,
     ) -> TaskResponse:
         """Upload a file and wait for processing to complete.
@@ -90,7 +96,7 @@ class ChunkrBase(HeadersMixin):
     def create_task(
         self,
         file: Union[str, Path, BinaryIO, Image.Image],
-        config: Configuration = None,
+        config: Optional[Configuration] = None,
         filename: Optional[str] = None,
     ) -> TaskResponse:
         """Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`.
@@ -127,7 +133,7 @@ class ChunkrBase(HeadersMixin):
     @abstractmethod
     def update_task(
-        self, task_id: str, config: Configuration
+        self, task_id: str, config: Optional[Configuration] = None
     ) -> TaskResponse:
         """Update a task by its ID and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`.

{chunkr_ai-0.0.46 → chunkr_ai-0.0.48}/src/chunkr_ai/api/decorators.py RENAMED Viewed

@@ -13,10 +13,7 @@ P = ParamSpec('P')
 _sync_loop = None
-@overload
-def anywhere() -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Union[Awaitable[T], T]]]: ...
-def anywhere():
+def anywhere() -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Union[Awaitable[T], T]]]:
     """Decorator that allows an async function to run anywhere - sync or async context."""
     def decorator(async_func: Callable[P, Awaitable[T]]) -> Callable[P, Union[Awaitable[T], T]]:
         @functools.wraps(async_func)
@@ -42,22 +39,22 @@ def anywhere():
         return wrapper
     return decorator
-def ensure_client() -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Awaitable[T]]]:
+def ensure_client() -> Callable[[Callable[..., Awaitable[T]]], Callable[..., Awaitable[T]]]:
     """Decorator that ensures a valid httpx.AsyncClient exists before executing the method"""
-    def decorator(async_func: Callable[P, Awaitable[T]]) -> Callable[P, Awaitable[T]]:
+    def decorator(async_func: Callable[..., Awaitable[T]]) -> Callable[..., Awaitable[T]]:
         @functools.wraps(async_func)
-        async def wrapper(self: Any, *args: P.args, **kwargs: P.kwargs) -> T:
+        async def wrapper(self: Any, *args: Any, **kwargs: Any) -> T:
             if not self._client or self._client.is_closed:
                 self._client = httpx.AsyncClient()
             return await async_func(self, *args, **kwargs)
         return wrapper
     return decorator
-def require_task() -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Awaitable[T]]]:
+def require_task() -> Callable[[Callable[..., Awaitable[T]]], Callable[..., Awaitable[T]]]:
     """Decorator that ensures task has required attributes and valid client before execution"""
-    def decorator(async_func: Callable[P, Awaitable[T]]) -> Callable[P, Awaitable[T]]:
+    def decorator(async_func: Callable[..., Awaitable[T]]) -> Callable[..., Awaitable[T]]:
         @functools.wraps(async_func)
-        async def wrapper(self: Any, *args: P.args, **kwargs: P.kwargs) -> T:
+        async def wrapper(self: Any, *args: Any, **kwargs: Any) -> T:
             if not self.task_url:
                 raise ValueError("Task URL not found")
             if not self._client:

{chunkr_ai-0.0.46 → chunkr_ai-0.0.48}/src/chunkr_ai/api/misc.py RENAMED Viewed

@@ -3,9 +3,9 @@ import base64
 import io
 from pathlib import Path
 from PIL import Image
-from typing import Union, Tuple, BinaryIO, Optional
+from typing import Union, Tuple, BinaryIO, Optional, Any
-async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[Optional[str], str]:
+async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image, bytes, bytearray, memoryview]) -> Tuple[Optional[str], str]:
     """Convert various file types into a tuple of (filename, file content).
     Args:
@@ -15,6 +15,7 @@ async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[O
             - Local file path (will be converted to base64)
             - Opened binary file (will be converted to base64)
             - PIL/Pillow Image object (will be converted to base64)
+            - Bytes object (will be converted to base64)
     Returns:
         Tuple[Optional[str], str]: (filename, content) where content is either a URL or base64 string
@@ -26,18 +27,54 @@ async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[O
         ValueError: If the URL is invalid or unreachable
         ValueError: If the MIME type is unsupported
     """
-    # Handle strings
+    # Handle bytes-like objects
+    if isinstance(file, (bytes, bytearray, memoryview)):
+        # Convert to bytes first if it's not already
+        file_bytes = bytes(file)
+        # Check if this might be an already-encoded base64 string in bytes form
+        try:
+            # Try to decode the bytes to a string and see if it's valid base64
+            potential_base64 = file_bytes.decode('utf-8', errors='strict')
+            base64.b64decode(potential_base64)
+            # If we get here, it was a valid base64 string in bytes form
+            return None, potential_base64
+        except:
+            # Not a base64 string in bytes form, encode it as base64
+            base64_str = base64.b64encode(file_bytes).decode()
+            return None, base64_str
+    # Handle strings - urls or paths or base64
     if isinstance(file, str):
+        # Handle URLs
         if file.startswith(('http://', 'https://')):
             return None, file
-        try:
-            base64.b64decode(file)
+        # Handle data URLs
+        if file.startswith('data:'):
             return None, file
-        except:
+        # Try to handle as a file path
+        try:
+            path = Path(file)
+            if path.exists():
+                # It's a valid file path, convert to Path object and continue processing
+                file = path
+            else:
+                # If not a valid file path, try treating as base64
+                try:
+                    # Just test if it's valid base64, don't store the result
+                    base64.b64decode(file)
+                    return None, file
+                except:
+                    raise ValueError(f"File not found: {file} and it's not a valid base64 string")
+        except Exception as e:
+            # If string can't be converted to Path or decoded as base64, it might still be a base64 string
             try:
-                file = Path(file)
+                base64.b64decode(file)
+                return None, file
             except:
-                raise ValueError("File must be a valid path, URL, or base64 string")
+                raise ValueError(f"Unable to process file: {e}")
     # Handle file paths - convert to base64
     if isinstance(file, Path):
@@ -67,17 +104,16 @@ async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[O
         file.seek(0)
         file_content = file.read()
         name = getattr(file, "name", "document")
-        file_ext = Path(name).suffix.lower().lstrip('.')
-        if not file_ext:
-            raise ValueError("File must have an extension")
+        if not name or not isinstance(name, str):
+            name = None
         base64_str = base64.b64encode(file_content).decode()
-        return Path(name).name, base64_str
+        return name, base64_str
     raise TypeError(f"Unsupported file type: {type(file)}")
 async def prepare_upload_data(
-    file: Optional[Union[str, Path, BinaryIO, Image.Image]] = None,
+    file: Optional[Union[str, Path, BinaryIO, Image.Image, bytes, bytearray, memoryview]] = None,
     filename: Optional[str] = None,
     config: Optional[Configuration] = None,
 ) -> dict:
@@ -85,8 +121,8 @@ async def prepare_upload_data(
     Args:
         file: The file to upload
+        filename: Optional filename to use (overrides any filename from the file)
         config: Optional configuration settings
-        client: HTTP client for downloading remote files
     Returns:
         dict: JSON-serializable data dictionary ready for upload

{chunkr_ai-0.0.46 → chunkr_ai-0.0.48}/src/chunkr_ai/api/task_response.py RENAMED Viewed

@@ -1,5 +1,5 @@
 from datetime import datetime
-from typing import TypeVar, Optional, Generic
+from typing import Optional, cast, Awaitable, Union
 from pydantic import BaseModel, PrivateAttr
 import asyncio
 import json
@@ -11,9 +11,7 @@ from .protocol import ChunkrClientProtocol
 from .misc import prepare_upload_data
 from .decorators import anywhere, require_task, retry_on_429
-T = TypeVar("T", bound="TaskResponse")
-class TaskResponse(BaseModel, Generic[T]):
+class TaskResponse(BaseModel):
     configuration: OutputConfiguration
     created_at: datetime
     expires_at: Optional[datetime] = None
@@ -28,13 +26,13 @@ class TaskResponse(BaseModel, Generic[T]):
     _base64_urls: bool = False
     _client: Optional[ChunkrClientProtocol] = PrivateAttr(default=None)
-    def with_client(self, client: ChunkrClientProtocol, include_chunks: bool = False, base64_urls: bool = False) -> T:
+    def with_client(self, client: ChunkrClientProtocol, include_chunks: bool = False, base64_urls: bool = False) -> "TaskResponse":
         self._client = client
         self.include_chunks = include_chunks
         self._base64_urls = base64_urls
         return self
-    def _check_status(self) -> Optional[T]:
+    def _check_status(self) -> Optional["TaskResponse"]:
         """Helper method to check task status and handle completion/failure"""
         if self.status == "Failed":
             if getattr(self._client, 'raise_on_failure', True):
@@ -47,6 +45,11 @@ class TaskResponse(BaseModel, Generic[T]):
     @require_task()
     async def _poll_request(self) -> dict:
         try:
+            if not self._client:
+                raise ValueError("Chunkr client protocol is not initialized")
+            if not self._client._client or self._client._client.is_closed:
+                raise ValueError("httpx client is not open")
+            assert self.task_url is not None
             r = await self._client._client.get(
                 self.task_url, headers=self._client._headers()
             )
@@ -64,10 +67,12 @@ class TaskResponse(BaseModel, Generic[T]):
             raise e
     @anywhere()
-    async def poll(self) -> T:
+    async def poll(self) -> "TaskResponse":
         """Poll the task for completion."""
         while True:
             j = await self._poll_request()
+            if not self._client:
+                raise ValueError("Chunkr client protocol is not initialized")
             updated = TaskResponse(**j).with_client(self._client)
             self.__dict__.update(updated.__dict__)
             if res := self._check_status():
@@ -77,9 +82,14 @@ class TaskResponse(BaseModel, Generic[T]):
     @anywhere()
     @require_task()
     @retry_on_429()
-    async def update(self, config: Configuration) -> T:
+    async def update(self, config: Configuration) -> "TaskResponse":
         """Update the task configuration."""
         data = await prepare_upload_data(None, None, config)
+        if not self._client:
+            raise ValueError("Chunkr client protocol is not initialized")
+        if not self._client._client or self._client._client.is_closed:
+            raise ValueError("httpx client is not open")
+        assert self.task_url is not None
         r = await self._client._client.patch(
             f"{self.task_url}/parse",
             json=data,
@@ -88,12 +98,17 @@ class TaskResponse(BaseModel, Generic[T]):
         r.raise_for_status()
         updated = TaskResponse(**r.json()).with_client(self._client)
         self.__dict__.update(updated.__dict__)
-        return await self.poll()
+        return cast(TaskResponse, self.poll())
     @anywhere()
     @require_task()
-    async def delete(self) -> T:
+    async def delete(self) -> "TaskResponse":
         """Delete the task."""
+        if not self._client:
+            raise ValueError("Chunkr client protocol is not initialized")
+        if not self._client._client or self._client._client.is_closed:
+            raise ValueError("httpx client is not open")
+        assert self.task_url is not None
         r = await self._client._client.delete(
             self.task_url, headers=self._client._headers()
         )
@@ -102,15 +117,20 @@ class TaskResponse(BaseModel, Generic[T]):
     @anywhere()
     @require_task()
-    async def cancel(self) -> T:
+    async def cancel(self) -> "TaskResponse":
         """Cancel the task."""
+        if not self._client:
+            raise ValueError("Chunkr client protocol is not initialized")
+        if not self._client._client or self._client._client.is_closed:
+            raise ValueError("httpx client is not open")
+        assert self.task_url is not None
         r = await self._client._client.get(
             f"{self.task_url}/cancel", headers=self._client._headers()
         )
         r.raise_for_status()
-        return await self.poll()
+        return cast(TaskResponse, self.poll())
-    def _write_to_file(self, content: str | dict, output_file: str, is_json: bool = False) -> None:
+    def _write_to_file(self, content: Union[str, dict], output_file: Optional[str], is_json: bool = False) -> None:
         """Helper method to write content to a file
         Args:
@@ -131,9 +151,12 @@ class TaskResponse(BaseModel, Generic[T]):
                 if is_json:
                     json.dump(content, f, cls=DateTimeEncoder, indent=2)
                 else:
-                    f.write(content)
+                    if isinstance(content, str):
+                        f.write(content)
+                    else:
+                        raise ValueError("Content is not a string")
-    def html(self, output_file: str = None) -> str:
+    def html(self, output_file: Optional[str] = None) -> str:
         """Get the full HTML of the task
         Args:
@@ -143,7 +166,7 @@ class TaskResponse(BaseModel, Generic[T]):
         self._write_to_file(content, output_file)
         return content
-    def markdown(self, output_file: str = None) -> str:
+    def markdown(self, output_file: Optional[str] = None) -> str:
         """Get the full markdown of the task
         Args:
@@ -153,7 +176,7 @@ class TaskResponse(BaseModel, Generic[T]):
         self._write_to_file(content, output_file)
         return content
-    def content(self, output_file: str = None) -> str:
+    def content(self, output_file: Optional[str] = None) -> str:
         """Get the full content of the task
         Args:
@@ -163,7 +186,7 @@ class TaskResponse(BaseModel, Generic[T]):
         self._write_to_file(content, output_file)
         return content
-    def json(self, output_file: str = None) -> dict:
+    def json(self, output_file: Optional[str] = None) -> dict:
         """Get the full task data as JSON
         Args:

{chunkr_ai-0.0.46 → chunkr_ai-0.0.48/src/chunkr_ai.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: chunkr-ai
-Version: 0.0.46
+Version: 0.0.48
 Summary: Python client for Chunkr: open source document intelligence
 Author-email: Ishaan Kapoor <ishaan@lumina.sh>
 License: MIT License

{chunkr_ai-0.0.46 → chunkr_ai-0.0.48}/src/chunkr_ai.egg-info/SOURCES.txt RENAMED Viewed

@@ -17,4 +17,5 @@ src/chunkr_ai/api/decorators.py
 src/chunkr_ai/api/misc.py
 src/chunkr_ai/api/protocol.py
 src/chunkr_ai/api/task_response.py
-tests/test_chunkr.py
+tests/test_chunkr.py
+tests/test_file_handling.py

{chunkr_ai-0.0.46 → chunkr_ai-0.0.48}/tests/test_chunkr.py RENAMED Viewed

@@ -3,6 +3,9 @@ from pathlib import Path
 from PIL import Image
 import asyncio
 import base64
+import io
+import tempfile
+from typing import Awaitable
 from chunkr_ai import Chunkr
 from chunkr_ai.models import (
@@ -26,6 +29,14 @@ from chunkr_ai.models import (
 def sample_path():
     return Path("tests/files/test.pdf")
+@pytest.fixture
+def sample_absolute_path_str():
+    return "tests/files/test.pdf"
+@pytest.fixture
+def sample_relative_path_str():
+    return "./tests/files/test.pdf"
 @pytest.fixture
 def sample_image():
     return Image.open("tests/files/test.jpg")
@@ -43,7 +54,7 @@ def client():
 def markdown_embed_config():
     return Configuration(
         segment_processing=SegmentProcessing(
-            page=GenerationConfig(
+            Page=GenerationConfig(
                 html=GenerationStrategy.LLM,
                 markdown=GenerationStrategy.LLM,
                 embed_sources=[EmbedSource.MARKDOWN]
@@ -55,7 +66,7 @@ def markdown_embed_config():
 def html_embed_config():
     return Configuration(
         segment_processing=SegmentProcessing(
-            page=GenerationConfig(
+            Page=GenerationConfig(
                 html=GenerationStrategy.LLM,
                 markdown=GenerationStrategy.LLM,
                 embed_sources=[EmbedSource.HTML]
@@ -67,7 +78,7 @@ def html_embed_config():
 def multiple_embed_config():
     return Configuration(
         segment_processing=SegmentProcessing(
-            page=GenerationConfig(
+            Page=GenerationConfig(
                 html=GenerationStrategy.LLM,
                 markdown=GenerationStrategy.LLM,
                 llm="Generate a summary of this content",
@@ -115,7 +126,7 @@ def xlm_roberta_with_html_content_config():
             tokenizer=Tokenizer.XLM_ROBERTA_BASE
         ),
         segment_processing=SegmentProcessing(
-            page=GenerationConfig(
+            Page=GenerationConfig(
                 html=GenerationStrategy.LLM,
                 markdown=GenerationStrategy.LLM,
                 embed_sources=[EmbedSource.HTML, EmbedSource.CONTENT]
@@ -156,43 +167,6 @@ def model_fallback_config():
         ),
     )
-@pytest.mark.asyncio
-async def test_send_file_path(client, sample_path):
-    response = await client.upload(sample_path)
-    assert response.task_id is not None
-    assert response.status == "Succeeded"
-    assert response.output is not None
-@pytest.mark.asyncio
-async def test_send_file_url(client, sample_url):
-    response = await client.upload(sample_url)
-    assert response.task_id is not None
-    assert response.status == "Succeeded"
-    assert response.output is not None
-@pytest.mark.asyncio
-async def test_send_file_path_str(client, sample_path):
-    response = await client.upload(str(sample_path))
-    assert response.task_id is not None
-    assert response.status == "Succeeded"
-    assert response.output is not None
-@pytest.mark.asyncio
-async def test_send_opened_file(client, sample_path):
-    with open(sample_path, "rb") as f:
-        response = await client.upload(f)
-    assert response.task_id is not None
-    assert response.status == "Succeeded"
-    assert response.output is not None
-@pytest.mark.asyncio
-async def test_send_pil_image(client, sample_image):
-    response = await client.upload(sample_image)
-    assert response.task_id is not None
-    assert response.status == "Succeeded"
-    assert response.output is not None
-    assert response.output is not None
 @pytest.mark.asyncio
 async def test_ocr_auto(client, sample_path):
     response = await client.upload(sample_path, Configuration(ocr_strategy=OcrStrategy.AUTO))
@@ -240,7 +214,7 @@ async def test_page_llm_html(client, sample_path):
         Configuration(
             segmentation_strategy=SegmentationStrategy.PAGE,
             segment_processing=SegmentProcessing(
-                page=GenerationConfig(html=GenerationStrategy.LLM)
+                Page=GenerationConfig(html=GenerationStrategy.LLM)
             ),
         ),
     )
@@ -253,7 +227,7 @@ async def test_page_llm(client, sample_path):
     configuration = Configuration(
         segmentation_strategy=SegmentationStrategy.PAGE,
         segment_processing=SegmentProcessing(
-            page=GenerationConfig(
+            Page=GenerationConfig(
                 html=GenerationStrategy.LLM, markdown=GenerationStrategy.LLM
             )
         ),
@@ -291,7 +265,7 @@ async def test_cancel_task(client, sample_path):
 @pytest.mark.asyncio
 async def test_cancel_task_direct(client, sample_path):
     task = await client.create_task(sample_path)
-    assert isinstance(task, TaskResponse)
+    assert isinstance(task, Awaitable) and isinstance(task, TaskResponse)
     assert task.status == "Starting"
     await task.cancel()
     assert task.status == "Cancelled"
@@ -332,7 +306,7 @@ async def test_pipeline_type_azure(client, sample_path):
     assert response.output is not None
 @pytest.mark.asyncio
-async def test_pipeline_type_azure(client, sample_path):
+async def test_pipeline_type_chunkr(client, sample_path):
     response = await client.upload(sample_path, Configuration(pipeline=Pipeline.CHUNKR))
     assert response.task_id is not None
     assert response.status == "Succeeded"
@@ -353,36 +327,6 @@ async def test_task_operations_after_client_close(client, sample_path):
     result = await task.poll()
     assert result.status == "Succeeded"
-@pytest.mark.asyncio
-async def test_send_base64_file(client, sample_path):
-    # Read file and convert to base64
-    with open(sample_path, "rb") as f:
-        base64_content = base64.b64encode(f.read()).decode('utf-8')
-    response = await client.upload(base64_content)
-    assert response.task_id is not None
-    assert response.status == "Succeeded"
-    assert response.output is not None
-@pytest.mark.asyncio
-async def test_send_base64_file_with_data_url(client, sample_path):
-    with open(sample_path, "rb") as f:
-        base64_content = base64.b64encode(f.read()).decode('utf-8')
-    response = await client.upload(f"data:application/pdf;base64,{base64_content}")
-    assert response.task_id is not None
-    assert response.status == "Succeeded"
-    assert response.output is not None
-@pytest.mark.asyncio
-async def test_send_base64_file_with_filename(client, sample_path):
-    # Read file and convert to base64
-    with open(sample_path, "rb") as f:
-        base64_content = base64.b64encode(f.read()).decode('utf-8')
-    response = await client.upload(base64_content, filename="test.pdf")
-    assert response.task_id is not None
-    assert response.status == "Succeeded"
-    assert response.output is not None
 @pytest.mark.asyncio
 async def test_output_files_no_dir(client, sample_path, tmp_path):
     task = await client.upload(sample_path)
@@ -422,6 +366,35 @@ async def test_output_files_with_dirs(client, sample_path, tmp_path):
     assert content_file.exists()
     assert json_file.exists()
+@pytest.mark.asyncio
+async def test_combined_config_with_llm_and_other_settings(client, sample_path):
+    # Test combining LLM settings with other configuration options
+    config = Configuration(
+        llm_processing=LlmProcessing(
+            model_id="qwen-2.5-vl-7b-instruct",
+            fallback_strategy=FallbackStrategy.model("gemini-flash-2.0"),
+            temperature=0.4
+        ),
+        segmentation_strategy=SegmentationStrategy.PAGE,
+        segment_processing=SegmentProcessing(
+            Page=GenerationConfig(
+                html=GenerationStrategy.LLM,
+                markdown=GenerationStrategy.LLM
+            )
+        ),
+        chunk_processing=ChunkProcessing(target_length=1024)
+    )
+    response = await client.upload(sample_path, config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+    assert response.configuration.llm_processing is not None
+    assert response.configuration.llm_processing.model_id == "qwen-2.5-vl-7b-instruct"
+    assert response.configuration.segmentation_strategy == SegmentationStrategy.PAGE
+    assert response.configuration.chunk_processing.target_length == 1024
 @pytest.mark.asyncio
 async def test_embed_sources_markdown_only(client, sample_path, markdown_embed_config):
     response = await client.upload(sample_path, markdown_embed_config)
@@ -555,32 +528,4 @@ async def test_fallback_strategy_serialization():
     # Test string representation
     assert str(none_strategy) == "None"
     assert str(default_strategy) == "Default"
-    assert str(model_strategy) == "Model(gpt-4.1)"
-@pytest.mark.asyncio
-async def test_combined_config_with_llm_and_other_settings(client, sample_path):
-    # Test combining LLM settings with other configuration options
-    config = Configuration(
-        llm_processing=LlmProcessing(
-            model_id="qwen-2.5-vl-7b-instruct",
-            fallback_strategy=FallbackStrategy.model("gemini-flash-2.0"),
-            temperature=0.4
-        ),
-        segmentation_strategy=SegmentationStrategy.PAGE,
-        segment_processing=SegmentProcessing(
-            page=GenerationConfig(
-                html=GenerationStrategy.LLM,
-                markdown=GenerationStrategy.LLM
-            )
-        ),
-        chunk_processing=ChunkProcessing(target_length=1024)
-    )
-    response = await client.upload(sample_path, config)
-    assert response.task_id is not None
-    assert response.status == "Succeeded"
-    assert response.output is not None
-    assert response.configuration.llm_processing is not None
-    assert response.configuration.llm_processing.model_id == "qwen-2.5-vl-7b-instruct"
-    assert response.configuration.segmentation_strategy == SegmentationStrategy.PAGE
-    assert response.configuration.chunk_processing.target_length == 1024
+    assert str(model_strategy) == "Model(gpt-4.1)"

chunkr_ai-0.0.48/tests/test_file_handling.py ADDED Viewed

@@ -0,0 +1,362 @@
+import pytest
+from pathlib import Path
+from PIL import Image
+import base64
+import io
+import tempfile
+from chunkr_ai import Chunkr
+@pytest.fixture
+def sample_path():
+    return Path("tests/files/test.pdf")
+@pytest.fixture
+def sample_url():
+    return "https://chunkr-web.s3.us-east-1.amazonaws.com/landing_page/input/science.pdf"
+@pytest.fixture
+def sample_image():
+    return Image.open("tests/files/test.jpg")
+@pytest.fixture
+def client():
+    client = Chunkr()
+    yield client
+@pytest.mark.asyncio
+async def test_send_file_path(client, sample_path):
+    response = await client.upload(sample_path)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+@pytest.mark.asyncio
+async def test_send_file_path_str(client, sample_path):
+    response = await client.upload(str(sample_path))
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+@pytest.mark.asyncio
+async def test_send_file_relative_path_str(client):
+    response = await client.upload("./tests/files/test.pdf")
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+@pytest.mark.asyncio
+async def test_send_file_url(client, sample_url):
+    response = await client.upload(sample_url)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+@pytest.mark.asyncio
+async def test_send_opened_file(client, sample_path):
+    with open(sample_path, "rb") as f:
+        response = await client.upload(f)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+@pytest.mark.asyncio
+async def test_send_pil_image(client, sample_image):
+    response = await client.upload(sample_image)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+    assert response.output is not None
+@pytest.mark.asyncio
+async def test_send_base64_file(client, sample_path):
+    # Read file and convert to base64
+    with open(sample_path, "rb") as f:
+        base64_content = base64.b64encode(f.read())
+    response = await client.upload(base64_content)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+@pytest.mark.asyncio
+async def test_send_base64_file_w_decode(client, sample_path):
+    # Read file and convert to base64
+    with open(sample_path, "rb") as f:
+        base64_content = base64.b64encode(f.read()).decode()
+    response = await client.upload(base64_content)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+@pytest.mark.asyncio
+async def test_send_base64_file_with_data_url(client, sample_path):
+    with open(sample_path, "rb") as f:
+        base64_content = base64.b64encode(f.read()).decode('utf-8')
+    response = await client.upload(f"data:application/pdf;base64,{base64_content}")
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+@pytest.mark.asyncio
+async def test_send_base64_file_with_filename(client, sample_path):
+    # Read file and convert to base64
+    with open(sample_path, "rb") as f:
+        base64_content = base64.b64encode(f.read()).decode('utf-8')
+    response = await client.upload(base64_content, filename="test.pdf")
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+@pytest.mark.asyncio
+async def test_file_like_no_name_attribute(client, sample_path):
+    # Create a file-like object without a name attribute
+    class NamelessBuffer:
+        def __init__(self, content):
+            self.buffer = io.BytesIO(content)
+        def read(self):
+            return self.buffer.read()
+        def seek(self, pos):
+            return self.buffer.seek(pos)
+    with open(sample_path, "rb") as f:
+        content = f.read()
+    nameless_buffer = NamelessBuffer(content)
+    response = await client.upload(nameless_buffer, filename="test.pdf")
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+@pytest.mark.asyncio
+async def test_file_like_none_name(client, sample_path):
+    # Create a file-like object with None as name
+    class NoneNameBuffer:
+        def __init__(self, content):
+            self.buffer = io.BytesIO(content)
+            self.name = None
+        def read(self):
+            return self.buffer.read()
+        def seek(self, pos):
+            return self.buffer.seek(pos)
+    with open(sample_path, "rb") as f:
+        content = f.read()
+    none_name_buffer = NoneNameBuffer(content)
+    response = await client.upload(none_name_buffer, filename="test.pdf")
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+@pytest.mark.asyncio
+async def test_file_like_no_extension(client, sample_path):
+    # Create a file-like object with a name but no extension
+    class NoExtensionBuffer:
+        def __init__(self, content):
+            self.buffer = io.BytesIO(content)
+            self.name = "test_document"
+        def read(self):
+            return self.buffer.read()
+        def seek(self, pos):
+            return self.buffer.seek(pos)
+    with open(sample_path, "rb") as f:
+        content = f.read()
+    no_ext_buffer = NoExtensionBuffer(content)
+    response = await client.upload(no_ext_buffer, filename="test.pdf")
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+@pytest.mark.asyncio
+async def test_spooled_temporary_file(client, sample_path):
+    # Test with SpooledTemporaryFile which is what the user is using
+    with open(sample_path, "rb") as f:
+        content = f.read()
+    temp_file = tempfile.SpooledTemporaryFile()
+    temp_file.write(content)
+    temp_file.seek(0)
+    response = await client.upload(temp_file, filename="test.pdf")
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+@pytest.mark.asyncio
+async def test_send_bytearray(client, sample_path):
+    # Read file and convert to bytearray
+    with open(sample_path, "rb") as f:
+        content = bytearray(f.read())
+    response = await client.upload(content, filename="test.pdf")
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+@pytest.mark.asyncio
+async def test_send_memoryview(client, sample_path):
+    # Read file and convert to memoryview
+    with open(sample_path, "rb") as f:
+        content_bytes = f.read()
+        content = memoryview(content_bytes)
+    response = await client.upload(content, filename="test.pdf")
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+@pytest.mark.asyncio
+async def test_with_explicit_filename_pdf(client, sample_path):
+    response = await client.upload(sample_path, filename="custom_name.pdf")
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+@pytest.mark.asyncio
+async def test_with_explicit_filename_image(client, sample_image):
+    response = await client.upload(sample_image, filename="custom_image.jpg")
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+@pytest.mark.asyncio
+async def test_with_special_character_filename(client, sample_path):
+    response = await client.upload(sample_path, filename="test file (1)&%$#@!.pdf")
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+@pytest.mark.asyncio
+async def test_filename_with_non_matching_extension(client, sample_path):
+    # Test providing a filename with a different extension than the actual file
+    response = await client.upload(sample_path, filename="document.docx")
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+@pytest.mark.asyncio
+async def test_bytes_with_explicit_filename(client, sample_path):
+    with open(sample_path, "rb") as f:
+        content = f.read()
+    # For bytes objects, filename is required to know the file type
+    response = await client.upload(content, filename="document.pdf")
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+@pytest.mark.asyncio
+async def test_bytearray_with_explicit_filename(client, sample_path):
+    with open(sample_path, "rb") as f:
+        content = bytearray(f.read())
+    response = await client.upload(content, filename="document.pdf")
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+@pytest.mark.asyncio
+async def test_memoryview_with_explicit_filename(client, sample_path):
+    with open(sample_path, "rb") as f:
+        content_bytes = f.read()
+        content = memoryview(content_bytes)
+    response = await client.upload(content, filename="document.pdf")
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+@pytest.mark.asyncio
+async def test_unicode_filename(client, sample_path):
+    # Test with a filename containing Unicode characters
+    response = await client.upload(sample_path, filename="测试文件.pdf")
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+@pytest.mark.asyncio
+async def test_very_long_filename(client, sample_path):
+    # Test with an extremely long filename
+    long_name = "a" * 200 + ".pdf"  # 200 character filename
+    response = await client.upload(sample_path, filename=long_name)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+@pytest.mark.asyncio
+async def test_filename_without_extension(client, sample_path):
+    # Test with a filename that has no extension
+    with open(sample_path, "rb") as f:
+        content = f.read()
+    # This test verifies that the system uses the provided filename even without extension
+    response = await client.upload(content, filename="document_without_extension")
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+@pytest.mark.asyncio
+async def test_custom_file_like_with_filename(client, sample_path):
+    # A more complex file-like object implementation
+    class CustomFileWrapper:
+        def __init__(self, content):
+            self.buffer = io.BytesIO(content)
+            self.position = 0
+            self.name = "original_name.txt"  # Should be overridden by explicit filename
+        def read(self, size=-1):
+            return self.buffer.read(size)
+        def seek(self, position, whence=0):
+            return self.buffer.seek(position, whence)
+        def tell(self):
+            return self.buffer.tell()
+        def close(self):
+            self.buffer.close()
+    with open(sample_path, "rb") as f:
+        content = f.read()
+    custom_file = CustomFileWrapper(content)
+    response = await client.upload(custom_file, filename="custom_wrapper.pdf")
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+@pytest.mark.asyncio
+async def test_seek_at_nonzero_position(client, sample_path):
+    # Test with a file-like object that's not at position 0
+    with open(sample_path, "rb") as f:
+        content = f.read()
+    buffer = io.BytesIO(content)
+    buffer.seek(100)  # Move position to 100
+    response = await client.upload(buffer, filename="seek_test.pdf")
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+@pytest.mark.asyncio
+async def test_reused_file_object(client, sample_path):
+    # Test that a file object can be reused after being processed
+    with open(sample_path, "rb") as f:
+        response1 = await client.upload(f, filename="first_use.pdf")
+        f.seek(0)  # Reset position
+        response2 = await client.upload(f, filename="second_use.pdf")
+    assert response1.task_id is not None
+    assert response1.status == "Succeeded"
+    assert response2.task_id is not None
+    assert response2.status == "Succeeded"