PyPI - chunkr-ai - Versions diffs - 0.0.2__py3-none-any.whl → 0.0.4__py3-none-any.whl - Mend

chunkr-ai 0.0.2py3-none-any.whl → 0.0.4py3-none-any.whl

Files changed (17) hide show

chunkr_ai/api/__init__.py +0 -0
chunkr_ai/api/auth.py +0 -2
chunkr_ai/api/base.py +173 -0
chunkr_ai/api/chunkr.py +69 -86
chunkr_ai/api/chunkr_async.py +93 -27
chunkr_ai/api/config.py +131 -0
chunkr_ai/api/protocol.py +19 -0
chunkr_ai/api/task.py +131 -0
chunkr_ai/models.py +48 -0
chunkr_ai-0.0.4.dist-info/METADATA +204 -0
chunkr_ai-0.0.4.dist-info/RECORD +17 -0
chunkr_ai/api/models.py +0 -231
chunkr_ai-0.0.2.dist-info/METADATA +0 -16
chunkr_ai-0.0.2.dist-info/RECORD +0 -12
{chunkr_ai-0.0.2.dist-info → chunkr_ai-0.0.4.dist-info}/LICENSE +0 -0
{chunkr_ai-0.0.2.dist-info → chunkr_ai-0.0.4.dist-info}/WHEEL +0 -0
{chunkr_ai-0.0.2.dist-info → chunkr_ai-0.0.4.dist-info}/top_level.txt +0 -0

chunkr_ai/api/__init__.py ADDED Viewed

File without changes

chunkr_ai/api/auth.py CHANGED Viewed

@@ -1,5 +1,3 @@
-from typing import Optional
 class HeadersMixin:
     """Mixin class for handling authorization headers"""

chunkr_ai/api/base.py ADDED Viewed

@@ -0,0 +1,173 @@
+from .config import Configuration
+from .task import TaskResponse
+from .auth import HeadersMixin
+from abc import abstractmethod
+from dotenv import load_dotenv
+import io
+import json
+import os
+from pathlib import Path
+from PIL import Image
+import requests
+from typing import BinaryIO, Tuple, Union
+class ChunkrBase(HeadersMixin):
+    """Base class with shared functionality for Chunkr API clients."""
+    def __init__(self, url: str = None, api_key: str = None):
+        """Resolve the API base URL and API key for this client.
+        Args:
+            url: Base URL of the Chunkr API. Falls back to the CHUNKR_URL environment variable, then to https://api.chunkr.ai.
+            api_key: API key. Falls back to the CHUNKR_API_KEY environment variable.
+        Raises:
+            ValueError: If no API key is passed and CHUNKR_API_KEY is not set.
+        """
+        # Load a .env file (if present) before the environment variables are read below.
+        load_dotenv()
+        self.url = (
+            url or
+            os.getenv('CHUNKR_URL') or
+            'https://api.chunkr.ai'
+        )
+        self._api_key = (
+            api_key or
+            os.getenv('CHUNKR_API_KEY')
+        )
+        if not self._api_key:
+            raise ValueError("API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai")
+        # Strip trailing slashes so endpoint paths can be appended as f"{self.url}/api/...".
+        self.url = self.url.rstrip("/")
+    def _prepare_file(
+        self,
+        file: Union[str, Path, BinaryIO, Image.Image]
+    ) -> Tuple[str, BinaryIO]:
+        """Convert various file types into a tuple of (filename, file-like object).
+        Args:
+            file: Input file, can be:
+                - String or Path to a file
+                - URL string starting with http:// or https://
+                - Base64 data URI string (must contain ";base64,")
+                - Opened binary file (mode='rb')
+                - PIL/Pillow Image object
+        Returns:
+            Tuple[str, BinaryIO]: (filename, file-like object) ready for upload.
+            For file paths the handle is opened here and is not closed by this method.
+        Raises:
+            FileNotFoundError: If the file path doesn't exist
+            TypeError: If the file type is not supported
+            requests.exceptions.RequestException: If the URL cannot be downloaded
+            ValueError: If the base64 payload cannot be decoded
+            ValueError: If the MIME type is unsupported
+        """
+        # Handle URLs
+        if isinstance(file, str) and file.startswith(('http://', 'https://')):
+            from urllib.parse import unquote, urlparse
+            response = requests.get(file)
+            response.raise_for_status()
+            # Use only the URL path for the name, so query strings, fragments
+            # and bare host names never end up in the uploaded filename.
+            filename = Path(unquote(urlparse(file).path)).name or 'downloaded_file'
+            return filename, io.BytesIO(response.content)
+        # Handle base64 strings
+        if isinstance(file, str) and ';base64,' in file:
+            import base64
+            import binascii
+            # Split header and data
+            header, base64_data = file.split(',', 1)
+            mime_type = header.split(':')[-1].split(';')[0].lower()
+            # Map MIME types to file extensions
+            mime_to_ext = {
+                'application/pdf': 'pdf',
+                'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
+                'application/msword': 'doc',
+                'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
+                'application/vnd.ms-powerpoint': 'ppt',
+                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
+                'application/vnd.ms-excel': 'xls',
+                'image/jpeg': 'jpg',
+                'image/png': 'png',
+                'image/jpg': 'jpg'
+            }
+            # Checked outside the decode handler so this error is not reported
+            # as an invalid base64 string.
+            if mime_type not in mime_to_ext:
+                raise ValueError(f"Unsupported MIME type: {mime_type}")
+            try:
+                file_bytes = base64.b64decode(base64_data)
+            except (binascii.Error, ValueError) as e:
+                raise ValueError(f"Invalid base64 string: {str(e)}") from e
+            return f"file.{mime_to_ext[mime_type]}", io.BytesIO(file_bytes)
+        # Handle file paths
+        if isinstance(file, (str, Path)):
+            path = Path(file).resolve()
+            if not path.exists():
+                raise FileNotFoundError(f"File not found: {file}")
+            return path.name, open(path, 'rb')
+        # Handle PIL Images
+        if isinstance(file, Image.Image):
+            img_byte_arr = io.BytesIO()
+            image_format = file.format or 'PNG'
+            file.save(img_byte_arr, format=image_format)
+            img_byte_arr.seek(0)
+            return f"image.{image_format.lower()}", img_byte_arr
+        # Handle file-like objects
+        if hasattr(file, 'read') and hasattr(file, 'seek'):
+            # Try to get the filename from the file object if possible. The
+            # attribute may be missing or not path-like (e.g. an int when the
+            # file was opened from a file descriptor).
+            name = getattr(file, 'name', None)
+            if not isinstance(name, (str, os.PathLike)):
+                name = 'document'
+            return Path(name).name or 'document', file
+        raise TypeError(f"Unsupported file type: {type(file)}")
+    def _prepare_upload_data(
+        self,
+        file: Union[str, Path, BinaryIO, Image.Image],
+        config: Configuration = None
+    ) -> Tuple[dict, dict]:
+        """Build the multipart ``files`` and form ``data`` dictionaries for an upload.
+        Args:
+            file: The file to upload
+            config: Optional configuration settings
+        Returns:
+            Tuple[dict, dict]: (files dict, data dict) ready for upload. Config values
+            that dump to a dict are added to ``files`` as JSON parts; all other
+            config values go into ``data``.
+        """
+        upload_name, stream = self._prepare_file(file)
+        files = {"file": (upload_name, stream)}
+        data = {}
+        if not config:
+            return files, data
+        # exclude_none keeps unset options out of the request entirely.
+        settings = config.model_dump(mode="json", exclude_none=True)
+        for field, setting in settings.items():
+            if isinstance(setting, dict):
+                files[field] = (None, json.dumps(setting), 'application/json')
+                continue
+            data[field] = setting
+        return files, data
+    @abstractmethod
+    def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
+        """Upload a file and wait for processing to complete.
+        Must be implemented by subclasses.
+        """
+        pass
+    @abstractmethod
+    def start_upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
+        """Upload a file for processing and immediately return the task response.
+        Must be implemented by subclasses.
+        """
+        pass
+    @abstractmethod
+    def get_task(self, task_id: str) -> TaskResponse:
+        """Get a task response by its ID.
+        Must be implemented by subclasses.
+        """
+        pass

chunkr_ai/api/chunkr.py CHANGED Viewed

@@ -1,125 +1,108 @@
-from .models import TaskResponse, Configuration
-from .auth import HeadersMixin
-from dotenv import load_dotenv
-import io
-import os
+from .base import ChunkrBase
+from .config import Configuration
+from .task import TaskResponse
 from pathlib import Path
 from PIL import Image
 import requests
-from typing import Union, BinaryIO, Tuple
+from typing import Union, BinaryIO
-class Chunkr(HeadersMixin):
-    """Client for interacting with the Chunkr API."""
+class Chunkr(ChunkrBase):
+    """Chunkr API client"""
     def __init__(self, url: str = None, api_key: str = None):
-        load_dotenv()
-        self.url = (
-            url or
-            os.getenv('CHUNKR_URL') or
-            'https://api.chunkr.ai'
-        )
-        self._api_key = (
-            api_key or
-            os.getenv('CHUNKR_API_KEY')
-        )
-        if not self._api_key:
-            raise ValueError("API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai")
-        self.url = self.url.rstrip("/")
+        super().__init__(url, api_key)
+        self._session = requests.Session()
-    def _prepare_file(
-        self,
-        file: Union[str, BinaryIO, Image.Image, bytes, io.BytesIO]
-    ) -> Tuple[str, BinaryIO]:
-        """Convert various file types into a tuple of (filename, file-like object).
+    def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
+        """Upload a file and wait for processing to complete.
         Args:
-            file: Input file in various formats
+            file: The file to upload.
+            config: Configuration options for processing. Optional.
-        Returns:
-            Tuple[str, BinaryIO]: Filename and file-like object ready for upload
-        """
-        if isinstance(file, str):
-            path = Path(file).resolve()
-            if not path.exists():
-                raise FileNotFoundError(f"File not found: {file}")
-            return path.name, path.open("rb")
-        elif isinstance(file, Image.Image):
-            img_byte_arr = io.BytesIO()
-            file.save(img_byte_arr, format=file.format or 'PNG')
-            img_byte_arr.seek(0)
-            return "image.png", img_byte_arr
-        elif isinstance(file, bytes):
-            return "document", io.BytesIO(file)
-        elif isinstance(file, io.BytesIO):
-            return "document", file
-        else:
-            return "document", file
-    def upload(self, file: Union[str, BinaryIO, Image.Image, bytes, io.BytesIO], config: Configuration = None) -> TaskResponse:
-        """Upload a file and wait for processing to complete.
+        Examples:
+        ```
+        # Upload from file path
+        chunkr.upload("document.pdf")
-        The file can be one of:
-        - str: Path to a file on disk
-        - BinaryIO: A file-like object (e.g., opened with 'rb' mode)
-        - Image.Image: A PIL/Pillow Image object
-        - bytes: Raw binary data
-        - io.BytesIO: A binary stream in memory
+        # Upload from URL
+        chunkr.upload("https://example.com/document.pdf")
-        Args:
-            file: The file to upload.
-            config:
-                Configuration options for processing. Optional.
+        # Upload from base64 string (must include MIME type header)
+        chunkr.upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
+        # Upload from opened file
+        with open("document.pdf", "rb") as f:
+            chunkr.upload(f)
+        # Upload an image
+        from PIL import Image
+        img = Image.open("photo.jpg")
+        chunkr.upload(img)
+        ```
         Returns:
             TaskResponse: The completed task response
         """
-        return self.start_upload(file, config).poll()
+        task = self.start_upload(file, config)
+        return task.poll()
-    def start_upload(self, file: Union[str, BinaryIO, Image.Image, bytes, io.BytesIO], config: Configuration = None) -> TaskResponse:
+    def start_upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
         """Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`
-        The file can be one of:
-        - str: Path to a file on disk
-        - BinaryIO: A file-like object (e.g., opened with 'rb' mode)
-        - Image.Image: A PIL/Pillow Image object
-        - bytes: Raw binary data
-        - io.BytesIO: A binary stream in memory
         Args:
             file: The file to upload.
-            config (Configuration, optional): Configuration options for processing
+            config: Configuration options for processing. Optional.
+        Examples:
+        ```
+        # Upload from file path
+        task = chunkr.start_upload("document.pdf")
+        # Upload from opened file
+        with open("document.pdf", "rb") as f:
+            task = chunkr.start_upload(f)
+        # Upload from URL
+        task = chunkr.start_upload("https://example.com/document.pdf")
+        # Upload from base64 string (must include MIME type header)
+        task = chunkr.start_upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
+        # Upload an image
+        from PIL import Image
+        img = Image.open("photo.jpg")
+        task = chunkr.start_upload(img)
+        # Wait for the task to complete - this can be done when needed
+        task.poll()
+        ```
         Returns:
             TaskResponse: The initial task response
-        Raises:
-            requests.exceptions.HTTPError: If the API request fails
         """
-        url = f"{self.url}/api/v1/task"
-        filename, file_obj = self._prepare_file(file)
-        files = {"file": (filename, file_obj)}
-        r = requests.post(
-            url,
-            files=files,
-            json=config.dict() if config else {},
+        files, data = self._prepare_upload_data(file, config)
+        r = self._session.post(
+            f"{self.url}/api/v1/task",
+            files=files,
+            data=data,
             headers=self._headers()
         )
         r.raise_for_status()
-        return TaskResponse(**r.json()).with_api_key(self._api_key)
+        return TaskResponse(**r.json()).with_client(self)
     def get_task(self, task_id: str) -> TaskResponse:
         """Get a task response by its ID.
         Args:
-            task_id (str): The ID of the task to get
+            task_id: The ID of the task to get
         Returns:
             TaskResponse: The task response
         """
-        url = f"{self.url}/api/v1/task/{task_id}"
-        r = requests.get(url, headers=self._headers())
+        r = self._session.get(
+            f"{self.url}/api/v1/task/{task_id}",
+            headers=self._headers()
+        )
         r.raise_for_status()
-        return TaskResponse(**r.json()).with_api_key(self._api_key)
+        return TaskResponse(**r.json()).with_client(self)

chunkr_ai/api/chunkr_async.py CHANGED Viewed

@@ -1,39 +1,105 @@
-from .chunkr import Chunkr
-from .models import TaskResponse, Configuration
+from .base import ChunkrBase
+from .task import TaskResponse
+from .config import Configuration
 import httpx
-import io
+from pathlib import Path
 from PIL import Image
 from typing import Union, BinaryIO
-class ChunkrAsync(Chunkr):
-    """Async client for interacting with the Chunkr API.
+class ChunkrAsync(ChunkrBase):
+    """Asynchronous Chunkr API client"""
-    This class inherits from the Chunkr class but works with async HTTP requests.
-    """
+    def __init__(self, url: str = None, api_key: str = None):
+        super().__init__(url, api_key)
+        self._client = httpx.AsyncClient()
-    async def upload(self, file: Union[str, BinaryIO, Image.Image, bytes, io.BytesIO], config: Configuration = None) -> TaskResponse:
+    async def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
+        """Upload a file and wait for processing to complete.
+        Args:
+            file: The file to upload.
+            config: Configuration options for processing. Optional.
+        Examples:
+        ```python
+        # Upload from file path
+        await chunkr.upload("document.pdf")
+        # Upload from opened file
+        with open("document.pdf", "rb") as f:
+            await chunkr.upload(f)
+        # Upload from URL
+        await chunkr.upload("https://example.com/document.pdf")
+        # Upload from base64 string (must include MIME type header)
+        await chunkr.upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
+        # Upload an image
+        from PIL import Image
+        img = Image.open("photo.jpg")
+        await chunkr.upload(img)
+        ```
+        Returns:
+            TaskResponse: The completed task response
+        """
         task = await self.start_upload(file, config)
         return await task.poll_async()
-    async def start_upload(self, file: Union[str, BinaryIO, Image.Image, bytes, io.BytesIO], config: Configuration = None) -> TaskResponse:
-        url = f"{self.url}/api/v1/task"
-        filename, file_obj = self._prepare_file(file)
-        async with httpx.AsyncClient() as client:
-            files = {"file": (filename, file_obj)}
-            r = await client.post(
-                url,
-                files=files,
-                json=config.dict() if config else {},
-                headers=self._headers()
-            )
-            r.raise_for_status()
-            return TaskResponse(**r.json()).with_api_key(self._api_key)
+    async def start_upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
+        """Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll_async()`.
+        Args:
+            file: The file to upload.
+            config: Configuration options for processing. Optional.
+        Examples:
+        ```
+        # Upload from file path
+        task = await chunkr.start_upload("document.pdf")
+        # Upload from opened file
+        with open("document.pdf", "rb") as f:
+            task = await chunkr.start_upload(f)
+        # Upload from URL
+        task = await chunkr.start_upload("https://example.com/document.pdf")
+        # Upload from base64 string (must include MIME type header)
+        task = await chunkr.start_upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
+        # Upload an image
+        from PIL import Image
+        img = Image.open("photo.jpg")
+        task = await chunkr.start_upload(img)
+        # Wait for the task to complete - this can be done when needed
+        await task.poll_async()
+        ```
+        Returns:
+            TaskResponse: The initial task response
+        Raises:
+            httpx.HTTPStatusError: If the API responds with an error status
+        """
+        files, data = self._prepare_upload_data(file, config)
+        # Send the prepared form fields alongside the multipart files. httpx
+        # ignores a `json=` body whenever `files=` is given, which would
+        # silently drop these configuration values.
+        r = await self._client.post(
+            f"{self.url}/api/v1/task",
+            files=files,
+            data=data,
+            headers=self._headers()
+        )
+        r.raise_for_status()
+        return TaskResponse(**r.json()).with_client(self)
     async def get_task(self, task_id: str) -> TaskResponse:
-        url = f"{self.url}/api/v1/task/{task_id}"
-        async with httpx.AsyncClient() as client:
-            r = await client.get(url, headers=self._headers())
-            r.raise_for_status()
-            return TaskResponse(**r.json()).with_api_key(self._api_key)
+        r = await self._client.get(
+            f"{self.url}/api/v1/task/{task_id}",
+            headers=self._headers()
+        )
+        r.raise_for_status()
+        return TaskResponse(**r.json()).with_client(self)
+    async def __aenter__(self):
+        return self
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        await self._client.aclose()

chunkr_ai/api/config.py ADDED Viewed

@@ -0,0 +1,131 @@
+from pydantic import BaseModel, Field, model_validator
+from enum import Enum
+from typing import Optional, List, Dict
+class GenerationStrategy(str, Enum):
+    LLM = "LLM"
+    AUTO = "Auto"
+class CroppingStrategy(str, Enum):
+    ALL = "All"
+    AUTO = "Auto"
+class LlmConfig(BaseModel):
+    model: str
+    prompt: str
+    temperature: float = 0.0
+class GenerationConfig(BaseModel):
+    html: Optional[GenerationStrategy] = None
+    llm: Optional[LlmConfig] = None
+    markdown: Optional[GenerationStrategy] = None
+    crop_image: Optional[CroppingStrategy] = None
+class SegmentProcessing(BaseModel):
+    title: Optional[GenerationConfig] = None
+    section_header: Optional[GenerationConfig] = None
+    text: Optional[GenerationConfig] = None
+    list_item: Optional[GenerationConfig] = None
+    table: Optional[GenerationConfig] = None
+    picture: Optional[GenerationConfig] = None
+    caption: Optional[GenerationConfig] = None
+    formula: Optional[GenerationConfig] = None
+    footnote: Optional[GenerationConfig] = None
+    page_header: Optional[GenerationConfig] = None
+    page_footer: Optional[GenerationConfig] = None
+    page: Optional[GenerationConfig] = None
+class ChunkProcessing(BaseModel):
+    target_length: Optional[int] = None
+class Property(BaseModel):
+    name: str
+    title: Optional[str] = None
+    prop_type: str
+    description: Optional[str] = None
+    default: Optional[str] = None
+class JsonSchema(BaseModel):
+    title: str
+    properties: List[Property]
+class OcrStrategy(str, Enum):
+    ALL = "All"
+    AUTO = "Auto"
+class SegmentationStrategy(str, Enum):
+    LAYOUT_ANALYSIS = "LayoutAnalysis"
+    PAGE = "Page"
+class BoundingBox(BaseModel):
+    left: float
+    top: float
+    width: float
+    height: float
+class OCRResult(BaseModel):
+    bbox: BoundingBox
+    text: str
+    confidence: Optional[float]
+class SegmentType(str, Enum):
+    CAPTION = "Caption"
+    FOOTNOTE = "Footnote"
+    FORMULA = "Formula"
+    LIST_ITEM = "ListItem"
+    PAGE = "Page"
+    PAGE_FOOTER = "PageFooter"
+    PAGE_HEADER = "PageHeader"
+    PICTURE = "Picture"
+    SECTION_HEADER = "SectionHeader"
+    TABLE = "Table"
+    TEXT = "Text"
+    TITLE = "Title"
+class Segment(BaseModel):
+    bbox: BoundingBox
+    content: str
+    page_height: float
+    html: Optional[str]
+    image: Optional[str]
+    markdown: Optional[str]
+    ocr: List[OCRResult]
+    page_number: int
+    page_width: float
+    segment_id: str
+    segment_type: SegmentType
+class Chunk(BaseModel):
+    chunk_id: str
+    chunk_length: int
+    segments: List[Segment]
+class ExtractedJson(BaseModel):
+    data: Dict
+class OutputResponse(BaseModel):
+    chunks: List[Chunk] = []
+    extracted_json: Optional[ExtractedJson]
+class Model(str, Enum):
+    FAST = "Fast"
+    HIGH_QUALITY = "HighQuality"
+class Configuration(BaseModel):
+    """Processing options for an upload; every field is optional and defaults to None."""
+    chunk_processing: Optional[ChunkProcessing] = Field(default=None)
+    expires_in: Optional[int] = Field(default=None)
+    high_resolution: Optional[bool] = Field(default=None)
+    json_schema: Optional[JsonSchema] = Field(default=None)
+    model: Optional[Model] = Field(default=None)
+    ocr_strategy: Optional[OcrStrategy] = Field(default=None)
+    segment_processing: Optional[SegmentProcessing] = Field(default=None)
+    segmentation_strategy: Optional[SegmentationStrategy] = Field(default=None)
+    @model_validator(mode='before')
+    @classmethod
+    def map_deprecated_fields(cls, values: Dict) -> Dict:
+        """Map the deprecated ``target_chunk_length`` key onto ``chunk_processing.target_length``.
+        Non-dict input and input without the deprecated key are returned unchanged.
+        """
+        if not isinstance(values, dict) or "target_chunk_length" not in values:
+            return values
+        # Work on a copy so validation never modifies the caller's dict.
+        values = dict(values)
+        target_length = values.pop("target_chunk_length")
+        if target_length is not None:
+            chunk_processing = values.get("chunk_processing") or {}
+            # An already-built model cannot take item assignment; convert it first.
+            if isinstance(chunk_processing, ChunkProcessing):
+                chunk_processing = chunk_processing.model_dump()
+            values["chunk_processing"] = {**chunk_processing, "target_length": target_length}
+        return values

chunkr_ai/api/protocol.py ADDED Viewed

@@ -0,0 +1,19 @@
+from typing import runtime_checkable, Protocol
+from requests import Session
+from httpx import AsyncClient
+@runtime_checkable
+class ChunkrClientProtocol(Protocol):
+    """Protocol defining the interface for Chunkr clients"""
+    url: str
+    _api_key: str
+    _session: Session
+    _client: AsyncClient
+    def get_api_key(self) -> str:
+        """Get the API key"""
+        ...
+    def _headers(self) -> dict:
+        """Return headers required for API requests"""
+        ...

chunkr_ai/api/task.py ADDED Viewed

@@ -0,0 +1,131 @@
+from .protocol import ChunkrClientProtocol
+from .config import Configuration, OutputResponse
+import asyncio
+from datetime import datetime
+from enum import Enum
+from pydantic import BaseModel, PrivateAttr
+import time
+from typing import Optional, Union
+class Status(str, Enum):
+    STARTING = "Starting"
+    PROCESSING = "Processing"
+    SUCCEEDED = "Succeeded"
+    FAILED = "Failed"
+class TaskResponse(BaseModel):
+    configuration: Configuration
+    created_at: datetime
+    expires_at: Optional[datetime]
+    file_name: Optional[str]
+    finished_at: Optional[datetime]
+    input_file_url: Optional[str]
+    message: str
+    output: Optional[OutputResponse]
+    page_count: Optional[int]
+    pdf_url: Optional[str]
+    status: Status
+    task_id: str
+    task_url: Optional[str]
+    _client: Optional[Union[ChunkrClientProtocol]] = PrivateAttr(default=None)
+    def with_client(self, client: Union[ChunkrClientProtocol]) -> 'TaskResponse':
+        self._client = client
+        return self
+    def _poll_request_sync(self) -> dict:
+        """Fetch the current task state, retrying on connection problems (synchronous).
+        Returns:
+            dict: The decoded JSON body of the task endpoint.
+        Raises:
+            ValueError: If this response has no task URL.
+            requests.exceptions.HTTPError: If the API responds with an error status.
+        """
+        import requests
+        if not self.task_url:
+            raise ValueError("Task URL not found in response")
+        while True:
+            try:
+                r = self._client._session.get(self.task_url, headers=self._client._headers())
+                r.raise_for_status()
+                return r.json()
+            # requests raises its own ConnectionError/Timeout types, which do not
+            # derive from the builtins, so both families must be listed for the
+            # retry to actually trigger.
+            except (
+                ConnectionError,
+                TimeoutError,
+                requests.exceptions.ConnectionError,
+                requests.exceptions.Timeout,
+            ):
+                print("Connection error while polling the task, retrying...")
+                time.sleep(0.5)
+    async def _poll_request_async(self) -> dict:
+        """Fetch the current task state, retrying on connection problems (asynchronous).
+        Returns:
+            dict: The decoded JSON body of the task endpoint.
+        Raises:
+            ValueError: If this response has no task URL.
+            httpx.HTTPStatusError: If the API responds with an error status.
+        """
+        import httpx
+        if not self.task_url:
+            raise ValueError("Task URL not found in response")
+        while True:
+            try:
+                r = await self._client._client.get(self.task_url, headers=self._client._headers())
+                # Only the request itself is awaitable; raise_for_status() and
+                # json() are regular methods on the httpx response.
+                r.raise_for_status()
+                return r.json()
+            # httpx network/timeout errors do not derive from the builtin
+            # ConnectionError/TimeoutError, so list them explicitly.
+            except (
+                ConnectionError,
+                TimeoutError,
+                httpx.NetworkError,
+                httpx.TimeoutException,
+            ):
+                print("Connection error while polling the task, retrying...")
+                await asyncio.sleep(0.5)
+    def _check_status(self) -> Optional['TaskResponse']:
+        """Return this task once it has finished, or None while it is still running.
+        Raises:
+            ValueError: If the task status is Failed; the message is the task's ``message``.
+        """
+        if self.status == Status.FAILED:
+            raise ValueError(self.message)
+        still_running = self.status in (Status.STARTING, Status.PROCESSING)
+        return None if still_running else self
+    def poll(self) -> 'TaskResponse':
+        """Poll the task every 0.5 seconds until it leaves the Starting/Processing states.
+        Returns:
+            TaskResponse: This instance, updated in place with the latest response.
+        Raises:
+            ValueError: If the task reports a Failed status or has no task URL.
+        """
+        while True:
+            response = self._poll_request_sync()
+            # Validate before copying so datetimes, enums and nested models
+            # (e.g. output) are parsed instead of being stored as raw JSON values.
+            self.__dict__.update(TaskResponse(**response).__dict__)
+            if result := self._check_status():
+                return result
+            time.sleep(0.5)
+    async def poll_async(self) -> 'TaskResponse':
+        """Poll the task every 0.5 seconds, without blocking the event loop, until it leaves the Starting/Processing states.
+        Returns:
+            TaskResponse: This instance, updated in place with the latest response.
+        Raises:
+            ValueError: If the task reports a Failed status or has no task URL.
+        """
+        while True:
+            response = await self._poll_request_async()
+            # Validate before copying so datetimes, enums and nested models
+            # (e.g. output) are parsed instead of being stored as raw JSON values.
+            self.__dict__.update(TaskResponse(**response).__dict__)
+            if result := self._check_status():
+                return result
+            await asyncio.sleep(0.5)
+    def _get_content(self, content_type: str) -> str:
+        """Join one attribute of every segment into a single newline-separated string.
+        Args:
+            content_type: Name of the segment attribute to read, e.g. "html", "markdown" or "content".
+        Returns:
+            str: The non-empty values in chunk/segment order, or "" when there is no output.
+        """
+        if not self.output:
+            return ""
+        rendered = (
+            getattr(segment, content_type)
+            for chunk in self.output.chunks
+            for segment in chunk.segments
+        )
+        return "\n".join(text for text in rendered if text)
+    def html(self) -> str:
+        """Get full HTML for the task"""
+        return self._get_content("html")
+    def markdown(self) -> str:
+        """Get full markdown for the task"""
+        return self._get_content("markdown")
+    def content(self) -> str:
+        """Get full text for the task"""
+        return self._get_content("content")
+class TaskPayload(BaseModel):
+    current_configuration: Configuration
+    file_name: str
+    image_folder_location: str
+    input_location: str
+    output_location: str
+    pdf_location: str
+    previous_configuration: Optional[Configuration]
+    task_id: str
+    user_id: str

chunkr_ai/models.py ADDED Viewed

@@ -0,0 +1,48 @@
+from .api.config import (
+    BoundingBox,
+    Chunk,
+    ChunkProcessing,
+    Configuration,
+    CroppingStrategy,
+    ExtractedJson,
+    GenerationStrategy,
+    GenerationConfig,
+    JsonSchema,
+    LlmConfig,
+    Model,
+    OCRResult,
+    OcrStrategy,
+    OutputResponse,
+    Property,
+    Segment,
+    SegmentProcessing,
+    SegmentType,
+    SegmentationStrategy,
+)
+from .api.task import TaskResponse, TaskPayload, Status
+__all__ = [
+    'BoundingBox',
+    'Chunk',
+    'ChunkProcessing',
+    'Configuration',
+    'CroppingStrategy',
+    'ExtractedJson',
+    'GenerationConfig',
+    'GenerationStrategy',
+    'JsonSchema',
+    'LlmConfig',
+    'Model',
+    'OCRResult',
+    'OcrStrategy',
+    'OutputResponse',
+    'Property',
+    'Segment',
+    'SegmentProcessing',
+    'SegmentType',
+    'SegmentationStrategy',
+    'Status',
+    'TaskPayload',
+    'TaskResponse'
+]

chunkr_ai-0.0.4.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,204 @@
+Metadata-Version: 2.2
+Name: chunkr-ai
+Version: 0.0.4
+Summary: Python client for Chunkr: open source document intelligence
+Author-email: Ishaan Kapoor <ishaan@lumina.sh>
+Project-URL: Homepage, https://chunkr.ai
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: httpx>=0.28.1
+Requires-Dist: pillow>=11.1.0
+Requires-Dist: pydantic>=2.10.4
+Requires-Dist: python-dotenv>=1.0.1
+Requires-Dist: requests>=2.32.3
+Provides-Extra: test
+Requires-Dist: pytest>=8.3.4; extra == "test"
+Requires-Dist: pytest-xdist>=3.6.1; extra == "test"
+# Chunkr Python Client
+This package provides a simple interface for interacting with the Chunkr API.
+## Getting Started
+You can get an API key from [Chunkr](https://chunkr.ai) or deploy your own Chunkr instance. For self-hosted deployment options, check out our [deployment guide](https://github.com/lumina-ai-inc/chunkr/tree/main?tab=readme-ov-file#self-hosted-deployment-options).
+For more information about the API and its capabilities, visit the [Chunkr API docs](https://docs.chunkr.ai).
+## Installation
+```bash
+pip install chunkr-ai
+```
+## Usage
+We provide two clients: `Chunkr` for synchronous operations and `ChunkrAsync` for asynchronous operations.
+### Synchronous Usage
+```python
+from chunkr_ai import Chunkr
+# Initialize client
+chunkr = Chunkr()
+# Upload a file and wait for processing
+task = chunkr.upload("document.pdf")
+# Print the response
+print(task)
+# Get output from task
+output = task.output
+# If you want to upload without waiting for processing
+task = chunkr.start_upload("document.pdf")
+# ... do other things ...
+task.poll()  # Check status when needed
+```
+### Asynchronous Usage
+```python
+from chunkr_ai import ChunkrAsync
+async def process_document():
+    # Initialize client
+    chunkr = ChunkrAsync()
+    # Upload a file and wait for processing
+    task = await chunkr.upload("document.pdf")
+    # Print the response
+    print(task)
+    # Get output from task
+    output = task.output
+    # If you want to upload without waiting for processing
+    task = await chunkr.start_upload("document.pdf")
+    # ... do other things ...
+    await task.poll_async()  # Check status when needed
+```
+### Additional Features
+Both clients support various input types:
+```python
+# Upload from file path
+chunkr.upload("document.pdf")
+# Upload from opened file
+with open("document.pdf", "rb") as f:
+    chunkr.upload(f)
+# Upload from URL
+chunkr.upload("https://example.com/document.pdf")
+# Upload from base64 string
+chunkr.upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
+# Upload an image
+from PIL import Image
+img = Image.open("photo.jpg")
+chunkr.upload(img)
+```
+### Configuration
+You can customize the processing behavior by passing a `Configuration` object:
+```python
+from chunkr_ai.models import Configuration, OcrStrategy, SegmentationStrategy, GenerationStrategy
+# Basic configuration
+config = Configuration(
+    ocr_strategy=OcrStrategy.AUTO,
+    segmentation_strategy=SegmentationStrategy.LAYOUT_ANALYSIS,
+    high_resolution=True,
+    expires_in=3600,  # seconds
+)
+# Upload with configuration
+task = chunkr.upload("document.pdf", config)
+```
+#### Available Configuration Examples
+- **Chunk Processing**
+  ```python
+  from chunkr_ai.models import ChunkProcessing
+  config = Configuration(
+      chunk_processing=ChunkProcessing(target_length=1024)
+  )
+  ```
+- **Expires In**
+  ```python
+  config = Configuration(expires_in=3600)
+  ```
+- **High Resolution**
+  ```python
+  config = Configuration(high_resolution=True)
+  ```
+- **JSON Schema**
+  ```python
+  from chunkr_ai.models import JsonSchema, Property
+  config = Configuration(json_schema=JsonSchema(
+      title="Sales Data",
+      properties=[
+          Property(name="Person with highest sales", prop_type="string", description="The person with the highest sales"),
+          Property(name="Person with lowest sales", prop_type="string", description="The person with the lowest sales"),
+      ]
+  ))
+  ```
+- **OCR Strategy**
+  ```python
+  config = Configuration(ocr_strategy=OcrStrategy.AUTO)
+  ```
+- **Segment Processing**
+  ```python
+  from chunkr_ai.models import SegmentProcessing, GenerationConfig, GenerationStrategy
+  config = Configuration(
+      segment_processing=SegmentProcessing(
+          page=GenerationConfig(
+              html=GenerationStrategy.LLM,
+              markdown=GenerationStrategy.LLM
+          )
+      )
+  )
+  ```
+- **Segmentation Strategy**
+  ```python
+  config = Configuration(
+      segmentation_strategy=SegmentationStrategy.LAYOUT_ANALYSIS  # or SegmentationStrategy.PAGE
+  )
+  ```
+## Environment setup
+You can provide your API key and URL in several ways:
+1. Environment variables: `CHUNKR_API_KEY` and `CHUNKR_URL`
+2. A `.env` file that defines the same `CHUNKR_API_KEY` and `CHUNKR_URL` variables
+3. Direct initialization:
+```python
+chunkr = Chunkr(
+    api_key="your-api-key",
+    url="https://api.chunkr.ai"
+)
+```
+## Run tests
+```bash
+# Install dependencies
+uv pip install -e ".[test]"
+# Run tests
+uv run pytest
+```

chunkr_ai-0.0.4.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,17 @@
+chunkr_ai/__init__.py,sha256=eXygrEhGxxIHXNYIlHF2eied8rGsx2RphgR8Wo4lRyo,110
+chunkr_ai/main.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+chunkr_ai/models.py,sha256=d-B4vfgZClJOoHdPaH3vagwUc4qxeQSmUxab77DKYtQ,874
+chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+chunkr_ai/api/api.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+chunkr_ai/api/auth.py,sha256=iSd5Jek2BFaHGw9HY-RrqgwP56BHFU0xbSuJS4fU6AA,425
+chunkr_ai/api/base.py,sha256=WDHx8tU0fl9_-yvYTKL-U0uaxHv-8_bRfiw9Xkl-mWM,6499
+chunkr_ai/api/chunkr.py,sha256=LkBFzGB_T0y3fnBeIn_nwQW6Mb7eZO-iTlzWrmWBoko,3450
+chunkr_ai/api/chunkr_async.py,sha256=B9deRVoe4h3Csh_jEuQxuxQ-DKSuZPdwkanFTyfHmeM,3603
+chunkr_ai/api/config.py,sha256=K0s1giImciPksu-bO9gzRwUaK2Vo1nxNKQkXlRQ2cb8,3785
+chunkr_ai/api/protocol.py,sha256=XKS9RmtvBpJItYhPg18qlOCKpaSHdOuQTRSUxAdUz2g,479
+chunkr_ai/api/task.py,sha256=ALU-rYlObbitlM1MKEFeSz_IBUpzb9736Iqu9huWg7c,4392
+chunkr_ai-0.0.4.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+chunkr_ai-0.0.4.dist-info/METADATA,sha256=7k2zij-F7_Kcs6nFCJMKQW382gFpOOLAnZoOOXFrKFs,4913
+chunkr_ai-0.0.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+chunkr_ai-0.0.4.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
+chunkr_ai-0.0.4.dist-info/RECORD,,

chunkr_ai/api/models.py DELETED Viewed

@@ -1,231 +0,0 @@
-from .auth import HeadersMixin
-import asyncio
-from datetime import datetime
-from enum import Enum
-import httpx
-from pydantic import BaseModel, Field, PrivateAttr
-import requests
-import time
-from typing import Optional, List, Dict, Union
-class GenerationStrategy(str, Enum):
-    LLM = "LLM"
-    AUTO = "Auto"
-class CroppingStrategy(str, Enum):
-    ALL = "All"
-    AUTO = "Auto"
-class LlmConfig(BaseModel):
-    model: str
-    prompt: str
-    temperature: float = 0.0
-class AutoGenerationConfig(BaseModel):
-    html: GenerationStrategy = GenerationStrategy.AUTO
-    llm: Optional[LlmConfig] = None
-    markdown: GenerationStrategy = GenerationStrategy.AUTO
-    crop_image: CroppingStrategy = CroppingStrategy.ALL
-class LlmGenerationConfig(BaseModel):
-    html: GenerationStrategy = GenerationStrategy.LLM
-    llm: Optional[LlmConfig] = None
-    markdown: GenerationStrategy = GenerationStrategy.LLM
-    crop_image: CroppingStrategy = CroppingStrategy.ALL
-class SegmentProcessing(BaseModel):
-    title: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
-    section_header: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
-    text: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
-    list_item: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
-    table: LlmGenerationConfig = Field(default_factory=LlmGenerationConfig)
-    picture: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
-    caption: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
-    formula: LlmGenerationConfig = Field(default_factory=LlmGenerationConfig)
-    footnote: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
-    page_header: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
-    page_footer: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
-    page: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
-class ChunkProcessing(BaseModel):
-    target_length: int = 512
-class Property(BaseModel):
-    name: str
-    title: Optional[str]
-    prop_type: str
-    description: Optional[str]
-    default: Optional[str]
-class JsonSchema(BaseModel):
-    title: str
-    properties: List[Property]
-    schema_type: Optional[str]
-class OcrStrategy(str, Enum):
-    ALL = "All"
-    AUTO = "Auto"
-class SegmentationStrategy(str, Enum):
-    LAYOUT_ANALYSIS = "LayoutAnalysis"
-    PAGE = "Page"
-class BoundingBox(BaseModel):
-    left: float
-    top: float
-    width: float
-    height: float
-class OCRResult(BaseModel):
-    bbox: BoundingBox
-    text: str
-    confidence: Optional[float]
-class SegmentType(str, Enum):
-    CAPTION = "Caption"
-    FOOTNOTE = "Footnote"
-    FORMULA = "Formula"
-    LIST_ITEM = "ListItem"
-    PAGE = "Page"
-    PAGE_FOOTER = "PageFooter"
-    PAGE_HEADER = "PageHeader"
-    PICTURE = "Picture"
-    SECTION_HEADER = "SectionHeader"
-    TABLE = "Table"
-    TEXT = "Text"
-    TITLE = "Title"
-class Segment(BaseModel):
-    bbox: BoundingBox
-    content: str
-    page_height: float
-    html: Optional[str]
-    image: Optional[str]
-    markdown: Optional[str]
-    ocr: List[OCRResult]
-    page_number: int
-    page_width: float
-    segment_id: str
-    segment_type: SegmentType
-class Chunk(BaseModel):
-    chunk_id: str
-    chunk_length: int
-    segments: List[Segment]
-class ExtractedJson(BaseModel):
-    data: Dict
-class OutputResponse(BaseModel):
-    chunks: List[Chunk] = []
-    extracted_json: Optional[ExtractedJson]
-class Model(str, Enum):
-    FAST = "Fast"
-    HIGH_QUALITY = "HighQuality"
-class Configuration(BaseModel):
-    chunk_processing: ChunkProcessing = Field(default_factory=ChunkProcessing)
-    expires_in: Optional[int] = None
-    high_resolution: bool = False
-    json_schema: Optional[JsonSchema] = None
-    model: Optional[Model] = Field(None, deprecated=True)
-    ocr_strategy: OcrStrategy = OcrStrategy.AUTO
-    segment_processing: SegmentProcessing = Field(default_factory=SegmentProcessing)
-    segmentation_strategy: SegmentationStrategy = SegmentationStrategy.LAYOUT_ANALYSIS
-    target_chunk_length: Optional[int] = Field(None, deprecated=True)
-class Status(str, Enum):
-    STARTING = "Starting"
-    PROCESSING = "Processing"
-    SUCCEEDED = "Succeeded"
-    FAILED = "Failed"
-class TaskResponse(BaseModel, HeadersMixin):
-    configuration: Configuration
-    created_at: datetime
-    expires_at: Optional[datetime]
-    file_name: Optional[str]
-    finished_at: Optional[datetime]
-    input_file_url: Optional[str]
-    message: str
-    output: Optional[OutputResponse]
-    page_count: Optional[int]
-    pdf_url: Optional[str]
-    status: Status
-    task_id: str
-    task_url: Optional[str]
-    _api_key: Optional[str] = PrivateAttr(default=None)
-    def with_api_key(self, api_key: str) -> 'TaskResponse':
-        """Helper function to set api key on a TaskResponse after creation"""
-        self._api_key = api_key
-        return self
-    def poll(self) -> 'TaskResponse':
-        """Poll the task for completion"""
-        if not self.task_url:
-            raise ValueError("Task URL not found in response")
-        while True:
-            r = requests.get(self.task_url, headers=self._headers())
-            r.raise_for_status()
-            self.__dict__.update(r.json())
-            if self.status == "Failed":
-                raise ValueError(self.message)
-            if self.status not in ("Starting", "Processing"):
-                return self
-            time.sleep(0.5)
-    async def poll_async(self) -> 'TaskResponse':
-        """Async poll the task for completion"""
-        if not self.task_url:
-            raise ValueError("Task URL not found in response")
-        async with httpx.AsyncClient() as client:
-            while True:
-                r = await client.get(self.task_url, headers=self._headers())
-                r.raise_for_status()
-                self.__dict__.update(r.json())
-                if self.status == "Failed":
-                    raise ValueError(self.message)
-                if self.status not in ("Starting", "Processing"):
-                    return self
-                await asyncio.sleep(0.5)
-    def _get_content(self, content_type: str) -> str:
-        """Helper method to get either HTML, Markdown, or raw content."""
-        if not self.output:
-            return ""
-        parts = []
-        for c in self.output.chunks:
-            for s in c.segments:
-                content = getattr(s, content_type)
-                if content:
-                    parts.append(content)
-        return "\n".join(parts)
-    def html(self) -> str:
-        """Get full HTML for the task"""
-        return self._get_content("html")
-    def markdown(self) -> str:
-        """Get full markdown for the task"""
-        return self._get_content("markdown")
-    def content(self) -> str:
-        """Get full text for the task"""
-        return self._get_content("content")
-class TaskPayload(BaseModel):
-    current_configuration: Configuration
-    file_name: str
-    image_folder_location: str
-    input_location: str
-    output_location: str
-    pdf_location: str
-    previous_configuration: Optional[Configuration]
-    task_id: str
-    user_id: str

chunkr_ai-0.0.2.dist-info/METADATA DELETED Viewed

@@ -1,16 +0,0 @@
-Metadata-Version: 2.2
-Name: chunkr-ai
-Version: 0.0.2
-Summary: Python client for chunkr: open source document intelligence
-Author-email: Ishaan Kapoor <ishaan@lumina.sh>
-Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: build>=1.2.2.post1
-Requires-Dist: httpx>=0.28.1
-Requires-Dist: pillow>=11.1.0
-Requires-Dist: pydantic>=2.10.4
-Requires-Dist: python-dotenv>=1.0.1
-Requires-Dist: requests>=2.32.3
-Requires-Dist: twine>=6.0.1
-Provides-Extra: test
-Requires-Dist: pytest>=8.3.4; extra == "test"

chunkr_ai-0.0.2.dist-info/RECORD DELETED Viewed

@@ -1,12 +0,0 @@
-chunkr_ai/__init__.py,sha256=eXygrEhGxxIHXNYIlHF2eied8rGsx2RphgR8Wo4lRyo,110
-chunkr_ai/main.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-chunkr_ai/api/api.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-chunkr_ai/api/auth.py,sha256=U25WiNQBsrAWYAntuds0zSMvB4gUpAwGoSa5wnQ2LRQ,454
-chunkr_ai/api/chunkr.py,sha256=UqFoK8ytCsW1I5F0nM4OD6I4zigy-UHzGuMDtpvMSmE,4454
-chunkr_ai/api/chunkr_async.py,sha256=Kfh7_DEon6QTPe-XJops8l9R6rp0zIfJKeh9ZEGFQao,1529
-chunkr_ai/api/models.py,sha256=vAVeRHgdSO4SDl009R2Vz75WtuXAwkUZW8ZsVXk9yBA,7221
-chunkr_ai-0.0.2.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-chunkr_ai-0.0.2.dist-info/METADATA,sha256=ZK6gdzkukxMEVr1WxodLZ9dZNHar32C00ST1LG9mFl8,519
-chunkr_ai-0.0.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-chunkr_ai-0.0.2.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
-chunkr_ai-0.0.2.dist-info/RECORD,,

{chunkr_ai-0.0.2.dist-info → chunkr_ai-0.0.4.dist-info}/LICENSE RENAMED Viewed

File without changes

{chunkr_ai-0.0.2.dist-info → chunkr_ai-0.0.4.dist-info}/WHEEL RENAMED Viewed

File without changes

{chunkr_ai-0.0.2.dist-info → chunkr_ai-0.0.4.dist-info}/top_level.txt RENAMED Viewed

File without changes

chunkr-ai 0.0.2__py3-none-any.whl → 0.0.4__py3-none-any.whl

chunkr-ai 0.0.2py3-none-any.whl → 0.0.4py3-none-any.whl