chunkr-ai 0.0.24__py3-none-any.whl → 0.0.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
chunkr_ai/api/chunkr.py CHANGED
@@ -2,8 +2,8 @@ from pathlib import Path
  from PIL import Image
  from typing import Union, BinaryIO

- from .config import Configuration
- from .decorators import anywhere, ensure_client
+ from .configuration import Configuration
+ from .decorators import anywhere, ensure_client, retry_on_429
  from .misc import prepare_upload_data
  from .task_response import TaskResponse
  from .chunkr_base import ChunkrBase
@@ -29,6 +29,7 @@ class Chunkr(ChunkrBase):

      @anywhere()
      @ensure_client()
+     @retry_on_429()
      async def create_task(
          self,
          file: Union[str, Path, BinaryIO, Image.Image],
@@ -39,10 +40,11 @@ class Chunkr(ChunkrBase):
              f"{self.url}/api/v1/task", files=files, headers=self._headers()
          )
          r.raise_for_status()
-         return TaskResponse(**r.json()).with_client(self)
+         return TaskResponse(**r.json()).with_client(self, True, False)

      @anywhere()
      @ensure_client()
+     @retry_on_429()
      async def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
          files = await prepare_upload_data(None, config, self._client)
          r = await self._client.patch(
@@ -51,16 +53,22 @@
              headers=self._headers(),
          )
          r.raise_for_status()
-         return TaskResponse(**r.json()).with_client(self)
+         return TaskResponse(**r.json()).with_client(self, True, False)

      @anywhere()
      @ensure_client()
-     async def get_task(self, task_id: str) -> TaskResponse:
+     async def get_task(self, task_id: str, include_chunks: bool = True, base64_urls: bool = False) -> TaskResponse:
+         params = {
+             "base64_urls": str(base64_urls).lower(),
+             "include_chunks": str(include_chunks).lower()
+         }
          r = await self._client.get(
-             f"{self.url}/api/v1/task/{task_id}", headers=self._headers()
+             f"{self.url}/api/v1/task/{task_id}",
+             params=params,
+             headers=self._headers()
          )
          r.raise_for_status()
-         return TaskResponse(**r.json()).with_client(self)
+         return TaskResponse(**r.json()).with_client(self, include_chunks, base64_urls)

      @anywhere()
      @ensure_client()
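
The chunkr.py changes above wrap create_task and update_task with @retry_on_429(), and get_task now forwards include_chunks and base64_urls as query parameters and records them on the returned TaskResponse. A minimal usage sketch against this 0.0.26 surface (file name and task handling are illustrative; assumes credentials are configured the same way as in earlier releases and that the synchronous call style provided by @anywhere() still applies):

from chunkr_ai import Chunkr

chunkr = Chunkr()  # assumption: API key resolved from the usual environment/config

# create_task/update_task now retry transparently on HTTP 429 before raising.
task = chunkr.create_task("document.pdf")  # illustrative input file

# get_task accepts include_chunks/base64_urls; the defaults shown in the diff are
# include_chunks=True and base64_urls=False (i.e. presigned URLs).
slim = chunkr.get_task(task.task_id, include_chunks=False)
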
chunkr_ai/api/chunkr_base.py CHANGED
@@ -1,4 +1,4 @@
- from .config import Configuration
+ from .configuration import Configuration
  from .task_response import TaskResponse
  from .auth import HeadersMixin
  from abc import abstractmethod
@@ -139,11 +139,13 @@ class ChunkrBase(HeadersMixin):
          pass

      @abstractmethod
-     def get_task(self, task_id: str) -> TaskResponse:
+     def get_task(self, task_id: str, include_chunks: bool = True, base64_urls: bool = False) -> TaskResponse:
          """Get a task response by its ID.

          Args:
              task_id: The ID of the task to get
+             include_chunks: Whether to include chunks in the output response. Defaults to True.
+             base64_urls: Whether to return base64 encoded URLs. If false, the URLs will be returned as presigned URLs. Defaults to False.

          Returns:
              TaskResponse: The task response
chunkr_ai/api/config.py → chunkr_ai/api/configuration.py RENAMED
@@ -1,6 +1,6 @@
- from pydantic import BaseModel, Field, model_validator, ConfigDict
+ from pydantic import BaseModel, Field, ConfigDict
  from enum import Enum
- from typing import Optional, List, Dict, Union, Type
+ from typing import Any, List, Optional

  class GenerationStrategy(str, Enum):
      LLM = "LLM"
@@ -37,16 +37,6 @@ class SegmentProcessing(BaseModel):
  class ChunkProcessing(BaseModel):
      target_length: Optional[int] = None

- class Property(BaseModel):
-     name: str
-     prop_type: str
-     description: Optional[str] = None
-     default: Optional[str] = None
-
- class JsonSchema(BaseModel):
-     title: str
-     properties: List[Property]
-
  class OcrStrategy(str, Enum):
      ALL = "All"
      AUTO = "Auto"
@@ -98,9 +88,6 @@ class Chunk(BaseModel):
      chunk_length: int
      segments: List[Segment]

- class ExtractedJson(BaseModel):
-     data: Dict
-
  class OutputResponse(BaseModel):
      chunks: List[Chunk]
      file_name: Optional[str]
@@ -118,7 +105,6 @@ class Configuration(BaseModel):
      chunk_processing: Optional[ChunkProcessing] = None
      expires_in: Optional[int] = None
      high_resolution: Optional[bool] = None
-     model: Optional[Model] = None
      ocr_strategy: Optional[OcrStrategy] = None
      segment_processing: Optional[SegmentProcessing] = None
      segmentation_strategy: Optional[SegmentationStrategy] = None
@@ -126,16 +112,10 @@

  class OutputConfiguration(Configuration):
      input_file_url: Optional[str] = None
-     json_schema: Optional[Union[JsonSchema, Type[BaseModel], BaseModel]] = None
-
-     @model_validator(mode="before")
-     def map_deprecated_fields(cls, values: Dict) -> Dict:
-         if isinstance(values, dict) and "target_chunk_length" in values:
-             target_length = values.pop("target_chunk_length")
-             if target_length is not None:
-                 values["chunk_processing"] = values.get("chunk_processing", {}) or {}
-                 values["chunk_processing"]["target_length"] = target_length
-         return values
+     # Deprecated
+     json_schema: Optional[Any] = None
+     model: Optional[Model] = None
+     target_chunk_length: Optional[int] = None

  class Status(str, Enum):
      STARTING = "Starting"
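
The configuration module above drops the JsonSchema, Property and ExtractedJson models along with the target_chunk_length validator; json_schema, model and target_chunk_length survive only as deprecated pass-through fields on OutputConfiguration. A hedged sketch of building a task configuration with the fields that remain on Configuration (assumes these names are still re-exported by chunkr_ai.models, as the models.py diff below suggests; values are illustrative):

from chunkr_ai.models import ChunkProcessing, Configuration, OcrStrategy

config = Configuration(
    chunk_processing=ChunkProcessing(target_length=512),  # replaces the old target_chunk_length
    ocr_strategy=OcrStrategy.AUTO,
    high_resolution=True,
    expires_in=3600,  # seconds, illustrative value
)
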
chunkr_ai/api/decorators.py CHANGED
@@ -59,4 +59,34 @@ def require_task() -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Awaitable[T]]]:
              self._client._client = httpx.AsyncClient()
              return await async_func(self, *args, **kwargs)
          return wrapper
+     return decorator
+
+ def retry_on_429(max_retries: int = 10, initial_delay: float = 0.5) -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Awaitable[T]]]:
+     """Decorator that retries the request when encountering 429 Too Many Requests errors.
+
+     Args:
+         max_retries: Maximum number of retry attempts (default: 10)
+         initial_delay: Delay in seconds between retries when no Retry-After header is sent (default: 0.5)
+     """
+     def decorator(async_func: Callable[P, Awaitable[T]]) -> Callable[P, Awaitable[T]]:
+         @functools.wraps(async_func)
+         async def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
+             retries = 0
+             while True:
+                 try:
+                     return await async_func(*args, **kwargs)
+                 except httpx.HTTPStatusError as e:
+                     if e.response.status_code != 429 or retries >= max_retries:
+                         raise
+                     retries += 1
+                     delay = initial_delay
+                     # Use Retry-After header if available
+                     retry_after = e.response.headers.get('Retry-After')
+                     if retry_after:
+                         try:
+                             delay = float(retry_after)
+                         except (ValueError, TypeError):
+                             pass
+                     await asyncio.sleep(delay)
+         return wrapper
      return decorator
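
The new retry_on_429 decorator only reacts to httpx.HTTPStatusError with status 429, so the wrapped coroutine must call raise_for_status(); it then sleeps for the server's Retry-After value when present, otherwise for the fixed initial_delay, up to max_retries attempts. A standalone sketch of applying it outside the client (function name and URL are illustrative, not part of the package):

import asyncio
import httpx
from chunkr_ai.api.decorators import retry_on_429

@retry_on_429(max_retries=5, initial_delay=0.5)
async def fetch(client: httpx.AsyncClient, url: str) -> str:
    r = await client.get(url)
    r.raise_for_status()  # a 429 surfaces as HTTPStatusError here, triggering the retry loop
    return r.text

async def main() -> None:
    async with httpx.AsyncClient() as client:
        body = await fetch(client, "https://example.com/resource")  # illustrative URL
        print(len(body))

asyncio.run(main())
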
chunkr_ai/api/misc.py CHANGED
@@ -1,4 +1,4 @@
- from .config import Configuration
+ from .configuration import Configuration
  import io
  import json
  from pathlib import Path
chunkr_ai/api/task_response.py CHANGED
@@ -2,11 +2,12 @@ from datetime import datetime
  from typing import TypeVar, Optional, Generic
  from pydantic import BaseModel, PrivateAttr
  import asyncio
+ import json

- from .config import Configuration, OutputConfiguration, OutputResponse, Status
+ from .configuration import Configuration, OutputConfiguration, OutputResponse, Status
  from .protocol import ChunkrClientProtocol
  from .misc import prepare_upload_data
- from .decorators import anywhere, require_task
+ from .decorators import anywhere, require_task, retry_on_429

  T = TypeVar("T", bound="TaskResponse")

@@ -21,10 +22,14 @@ class TaskResponse(BaseModel, Generic[T]):
      status: Status
      task_id: str
      task_url: Optional[str] = None
+     include_chunks: bool = False
+     _base64_urls: bool = False
      _client: Optional[ChunkrClientProtocol] = PrivateAttr(default=None)

-     def with_client(self, client: ChunkrClientProtocol) -> T:
+     def with_client(self, client: ChunkrClientProtocol, include_chunks: bool = False, base64_urls: bool = False) -> T:
          self._client = client
+         self.include_chunks = include_chunks
+         self._base64_urls = base64_urls
          return self

      def _check_status(self) -> Optional[T]:
@@ -45,11 +50,12 @@ class TaskResponse(BaseModel, Generic[T]):
                  )
                  r.raise_for_status()
                  return r.json()
-             except (ConnectionError, TimeoutError) as _:
-                 print("Connection error while polling the task, retrying...")
+             except (ConnectionError, TimeoutError, OSError) as e:
+                 print(f"Connection error while polling the task: {str(e)}, retrying...")
                  await asyncio.sleep(0.5)
-             except Exception:
-                 raise
+                 return await self._poll_request()
+             except Exception as e:
+                 raise e

      @anywhere()
      async def poll(self) -> T:
@@ -64,6 +70,7 @@ class TaskResponse(BaseModel, Generic[T]):

      @anywhere()
      @require_task()
+     @retry_on_429()
      async def update(self, config: Configuration) -> T:
          """Update the task configuration."""
          f = await prepare_upload_data(None, config, self._client._client)
@@ -95,17 +102,59 @@ class TaskResponse(BaseModel, Generic[T]):
          r.raise_for_status()
          return await self.poll()

-     def html(self) -> str:
-         """Get the full HTML of the task"""
-         return self._get_content("html")
+     def html(self, output_file: str = None) -> str:
+         """Get the full HTML of the task
+
+         Args:
+             output_file (str, optional): Path to save the HTML content. Defaults to None.
+         """
+         content = self._get_content("html")
+         if output_file:
+             with open(output_file, "w", encoding="utf-8") as f:
+                 f.write(content)
+         return content

-     def markdown(self) -> str:
-         """Get the full markdown of the task"""
-         return self._get_content("markdown")
+     def markdown(self, output_file: str = None) -> str:
+         """Get the full markdown of the task
+
+         Args:
+             output_file (str, optional): Path to save the markdown content. Defaults to None.
+         """
+         content = self._get_content("markdown")
+         if output_file:
+             with open(output_file, "w", encoding="utf-8") as f:
+                 f.write(content)
+         return content

-     def content(self) -> str:
-         """Get the full content of the task"""
-         return self._get_content("content")
+     def content(self, output_file: str = None) -> str:
+         """Get the full content of the task
+
+         Args:
+             output_file (str, optional): Path to save the content. Defaults to None.
+         """
+         content = self._get_content("content")
+         if output_file:
+             with open(output_file, "w", encoding="utf-8") as f:
+                 f.write(content)
+         return content
+
+     def json(self, output_file: str = None) -> dict:
+         """Get the full task data as JSON
+
+         Args:
+             output_file (str, optional): Path to save the task data as JSON. Defaults to None.
+         """
+         class DateTimeEncoder(json.JSONEncoder):
+             def default(self, obj):
+                 if isinstance(obj, datetime):
+                     return obj.isoformat()
+                 return super().default(obj)
+
+         data = self.model_dump()
+         if output_file:
+             with open(output_file, "w", encoding="utf-8") as f:
+                 json.dump(data, f, cls=DateTimeEncoder, indent=2)
+         return data

      def _get_content(self, t: str) -> str:
          if not self.output:
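
With the task_response.py changes above, html(), markdown() and content() can optionally write their result to a file, and the new json() method returns the full task as a dict (datetimes ISO-formatted) and can likewise persist it. A hedged sketch of the call pattern, continuing from a client and completed task as in the earlier sketch (file names are illustrative):

task = chunkr.get_task(task_id)             # task_id from an earlier create_task call

task.html(output_file="output.html")        # writes the HTML and also returns it
md = task.markdown()                        # no output_file: just returns the markdown string
data = task.json(output_file="task.json")   # dict of the whole task, datetimes ISO-formatted
print(data["status"])
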
chunkr_ai/models.py CHANGED
@@ -1,4 +1,4 @@
- from .api.config import (
+ from .api.configuration import (
      BoundingBox,
      Chunk,
      ChunkProcessing,
chunkr_ai-0.0.24.dist-info/METADATA → chunkr_ai-0.0.26.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: chunkr-ai
- Version: 0.0.24
+ Version: 0.0.26
  Summary: Python client for Chunkr: open source document intelligence
  Author-email: Ishaan Kapoor <ishaan@lumina.sh>
  License: MIT License
chunkr_ai-0.0.26.dist-info/RECORD ADDED
@@ -0,0 +1,16 @@
+ chunkr_ai/__init__.py,sha256=6KpYv2lmD6S5z2kc9pqwuLP5VDHmOuu2qDZArUIhb1s,53
+ chunkr_ai/models.py,sha256=tOI7ylkhyeFfCLMisk96EPsH4UEcjBx1Mcisxc_AYXI,757
+ chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ chunkr_ai/api/auth.py,sha256=hlv0GiUmlsbFO1wLL9sslqOnsBSoBqkL_6Mk2SDvxgE,413
+ chunkr_ai/api/chunkr.py,sha256=VnbuAPlWLqyf8xCCU_kpdybgjVPTwZLarDQoD3uozY0,3065
+ chunkr_ai/api/chunkr_base.py,sha256=giW56fL7xxJphdOTpIH52dXxpNt7OdP8pNiPSqbNjGM,5835
+ chunkr_ai/api/configuration.py,sha256=0wnrKlUIO7opvV963Gr_S8tlAjpo_IkNmbTi1_FwEug,3751
+ chunkr_ai/api/decorators.py,sha256=HSq3vcxOeUJkaWaf7HOvCyg9dWkVo8cG5BrU-jhbhmc,4053
+ chunkr_ai/api/misc.py,sha256=5PBI6pvOXr0x-3WieSKLrC8MA0iGPa-IG-5FEZ3vnr0,5724
+ chunkr_ai/api/protocol.py,sha256=LjPrYSq52m1afIlAo0yVGXlGZxPRh8J6g7S4PAit3Zo,388
+ chunkr_ai/api/task_response.py,sha256=lYzR3Oa6HwLmW5Plo5AF4Ky3UMXHU9zcUMRYOHb7Gwg,5805
+ chunkr_ai-0.0.26.dist-info/LICENSE,sha256=w3R12yNDyZpMiy2lxy_hvNbsldC75ww79sF0u11rkho,1069
+ chunkr_ai-0.0.26.dist-info/METADATA,sha256=LcIn-LIE_RsPawnkh26NyU2EGicKOQ1Qf1KsAu0dPuw,6996
+ chunkr_ai-0.0.26.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ chunkr_ai-0.0.26.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
+ chunkr_ai-0.0.26.dist-info/RECORD,,
chunkr_ai/api/api.py DELETED
File without changes
chunkr_ai-0.0.24.dist-info/RECORD DELETED
@@ -1,17 +0,0 @@
- chunkr_ai/__init__.py,sha256=6KpYv2lmD6S5z2kc9pqwuLP5VDHmOuu2qDZArUIhb1s,53
- chunkr_ai/models.py,sha256=MK8FPbWDj1ynvSHaYuslKCPybxLuAlrsVIM3Eym3kKI,750
- chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- chunkr_ai/api/api.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- chunkr_ai/api/auth.py,sha256=hlv0GiUmlsbFO1wLL9sslqOnsBSoBqkL_6Mk2SDvxgE,413
- chunkr_ai/api/chunkr.py,sha256=XTXJFs0xjYY3w3N4fSQcxtJFBtNfzFYYkh6nDlFz4cY,2714
- chunkr_ai/api/chunkr_base.py,sha256=4SXA-gdZd1w2zZeeMdy4xog0NKOrKjmo6IMvSl9KSBg,5538
- chunkr_ai/api/config.py,sha256=NmPTsDvcjkvNx0gNzDTz-oFG5rQC7jm-H70O_crJCw8,4478
- chunkr_ai/api/decorators.py,sha256=y_Z9z0O2XXiX9z6jWDwdbCPdQyMLnjE0pGkJjHQEv_Q,2652
- chunkr_ai/api/misc.py,sha256=5Q2K713VPwf3S2519KTzjT9PKhTEBgBMk1d8NNnmpZ0,5717
- chunkr_ai/api/protocol.py,sha256=LjPrYSq52m1afIlAo0yVGXlGZxPRh8J6g7S4PAit3Zo,388
- chunkr_ai/api/task_response.py,sha256=hcHsBgX-2C5Px5Bu0IKk33K_AkqHSEM1Wu2zkcPh9to,3935
- chunkr_ai-0.0.24.dist-info/LICENSE,sha256=w3R12yNDyZpMiy2lxy_hvNbsldC75ww79sF0u11rkho,1069
- chunkr_ai-0.0.24.dist-info/METADATA,sha256=JyDI8EkFaJQQ7vIo2osHxXmeuNqhQ0UWjgUMHSFIYow,6996
- chunkr_ai-0.0.24.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
- chunkr_ai-0.0.24.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
- chunkr_ai-0.0.24.dist-info/RECORD,,