PyPI - chunkr-ai - Versions diffs - 0.0.12__py3-none-any.whl → 0.0.15__py3-none-any.whl - Mend

chunkr-ai 0.0.12__py3-none-any.whl → 0.0.15__py3-none-any.whl

Files changed (21) hide show

chunkr_ai/__init__.py +1 -1
chunkr_ai/api/api.py +0 -0
chunkr_ai/api/auth.py +4 -4
chunkr_ai/api/base.py +183 -0
chunkr_ai/api/chunkr.py +31 -118
chunkr_ai/api/chunkr_async.py +99 -123
chunkr_ai/api/chunkr_base.py +112 -37
chunkr_ai/api/config.py +38 -14
chunkr_ai/api/misc.py +51 -44
chunkr_ai/api/protocol.py +6 -4
chunkr_ai/api/schema.py +66 -58
chunkr_ai/api/task.py +23 -18
chunkr_ai/api/task_async.py +27 -8
chunkr_ai/api/task_base.py +6 -6
chunkr_ai/models.py +21 -22
{chunkr_ai-0.0.12.dist-info → chunkr_ai-0.0.15.dist-info}/METADATA +2 -3
chunkr_ai-0.0.15.dist-info/RECORD +21 -0
chunkr_ai-0.0.12.dist-info/RECORD +0 -19
{chunkr_ai-0.0.12.dist-info → chunkr_ai-0.0.15.dist-info}/LICENSE +0 -0
{chunkr_ai-0.0.12.dist-info → chunkr_ai-0.0.15.dist-info}/WHEEL +0 -0
{chunkr_ai-0.0.12.dist-info → chunkr_ai-0.0.15.dist-info}/top_level.txt +0 -0

chunkr_ai/api/chunkr_base.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from .config import Configuration
 from .task import TaskResponse
+from .task_async import TaskResponseAsync
 from .auth import HeadersMixin
 from abc import abstractmethod
 from dotenv import load_dotenv
@@ -8,78 +9,152 @@ from pathlib import Path
 from PIL import Image
 from typing import BinaryIO, Union
 class ChunkrBase(HeadersMixin):
     """Base class with shared functionality for Chunkr API clients."""
     def __init__(self, url: str = None, api_key: str = None):
         load_dotenv()
-        self.url = (
-            url or
-            os.getenv('CHUNKR_URL') or
-            'https://api.chunkr.ai'
-        )
-        self._api_key = (
-            api_key or
-            os.getenv('CHUNKR_API_KEY')
-        )
+        self.url = url or os.getenv("CHUNKR_URL") or "https://api.chunkr.ai"
+        self._api_key = api_key or os.getenv("CHUNKR_API_KEY")
         if not self._api_key:
-            raise ValueError("API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai")
+            raise ValueError(
+                "API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai"
+            )
         self.url = self.url.rstrip("/")
     @abstractmethod
-    def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
+    def upload(
+        self,
+        file: Union[str, Path, BinaryIO, Image.Image],
+        config: Configuration = None,
+    ) -> Union[TaskResponse, TaskResponseAsync]:
         """Upload a file and wait for processing to complete.
-        Must be implemented by subclasses.
+        Args:
+            file: The file to upload.
+            config: Configuration options for processing. Optional.
+        Examples:
+        ```python
+        # Upload from file path
+        await chunkr.upload("document.pdf")
+        # Upload from opened file
+        with open("document.pdf", "rb") as f:
+            await chunkr.upload(f)
+        # Upload from URL
+        await chunkr.upload("https://example.com/document.pdf")
+        # Upload from base64 string (must include MIME type header)
+        await chunkr.upload("data:application/pdf;base64,JVBERi0...")
+        # Upload an image
+        from PIL import Image
+        img = Image.open("photo.jpg")
+        await chunkr.upload(img)
+        ```
+        Returns:
+            TaskResponse: The completed task response
         """
         pass
     @abstractmethod
-    def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
-        """Update a task by its ID.
-        Must be implemented by subclasses.
+    def update(
+        self, task_id: str, config: Configuration
+    ) -> Union[TaskResponse, TaskResponseAsync]:
+        """Update a task by its ID and wait for processing to complete.
+        Args:
+            task_id: The ID of the task to update
+            config: Configuration options for processing. Optional.
+        Returns:
+            TaskResponse: The updated task response
         """
         pass
     @abstractmethod
-    def create_task(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
-        """Upload a file for processing and immediately return the task response.
-        Must be implemented by subclasses.
+    def create_task(
+        self,
+        file: Union[str, Path, BinaryIO, Image.Image],
+        config: Configuration = None,
+    ) -> Union[TaskResponse, TaskResponseAsync]:
+        """Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`.
+        Args:
+            file: The file to upload.
+            config: Configuration options for processing. Optional.
+        Examples:
+        ```
+        # Upload from file path
+        task = await chunkr.create_task("document.pdf")
+        # Upload from opened file
+        with open("document.pdf", "rb") as f:
+            task = await chunkr.create_task(f)
+        # Upload from URL
+        task = await chunkr.create_task("https://example.com/document.pdf")
+        # Upload from base64 string (must include MIME type header)
+        task = await chunkr.create_task("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
+        # Upload an image
+        from PIL import Image
+        img = Image.open("photo.jpg")
+        task = await chunkr.create_task(img)
+        # Wait for the task to complete - this can be done when needed
+        await task.poll()
+        ```
         """
         pass
     @abstractmethod
-    def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
-        """Update a task by its ID.
-        Must be implemented by subclasses.
+    def update_task(
+        self, task_id: str, config: Configuration
+    ) -> Union[TaskResponse, TaskResponseAsync]:
+        """Update a task by its ID and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`.
+        Args:
+            task_id: The ID of the task to update
+            config: Configuration options for processing. Optional.
+        Returns:
+            TaskResponse: The updated task response
         """
         pass
     @abstractmethod
-    def get_task(self, task_id: str) -> TaskResponse:
+    def get_task(self, task_id: str) -> Union[TaskResponse, TaskResponseAsync]:
         """Get a task response by its ID.
-        Must be implemented by subclasses.
+        Args:
+            task_id: The ID of the task to get
+        Returns:
+            TaskResponse: The task response
         """
         pass
     @abstractmethod
     def delete_task(self, task_id: str) -> None:
         """Delete a task by its ID.
-        Must be implemented by subclasses.
+        Args:
+            task_id: The ID of the task to delete
         """
         pass
     @abstractmethod
     def cancel_task(self, task_id: str) -> None:
         """Cancel a task by its ID.
-        Must be implemented by subclasses.
+        Args:
+            task_id: The ID of the task to cancel
         """
         pass

chunkr_ai/api/config.py CHANGED Viewed

@@ -3,28 +3,31 @@ from enum import Enum
 from typing import Optional, List, Dict, Union, Type
 from .schema import from_pydantic
 class GenerationStrategy(str, Enum):
     LLM = "LLM"
     AUTO = "Auto"
 class CroppingStrategy(str, Enum):
-    ALL = "All"
+    ALL = "All"
     AUTO = "Auto"
 class GenerationConfig(BaseModel):
     html: Optional[GenerationStrategy] = None
     llm: Optional[str] = None
     markdown: Optional[GenerationStrategy] = None
     crop_image: Optional[CroppingStrategy] = None
 class SegmentProcessing(BaseModel):
-    model_config = ConfigDict(
-        populate_by_name=True,
-        alias_generator=str.title
-    )
+    model_config = ConfigDict(populate_by_name=True, alias_generator=str.title)
     title: Optional[GenerationConfig] = Field(default=None, alias="Title")
-    section_header: Optional[GenerationConfig] = Field(default=None, alias="SectionHeader")
+    section_header: Optional[GenerationConfig] = Field(
+        default=None, alias="SectionHeader"
+    )
     text: Optional[GenerationConfig] = Field(default=None, alias="Text")
     list_item: Optional[GenerationConfig] = Field(default=None, alias="ListItem")
     table: Optional[GenerationConfig] = Field(default=None, alias="Table")
@@ -36,38 +39,46 @@ class SegmentProcessing(BaseModel):
     page_footer: Optional[GenerationConfig] = Field(default=None, alias="PageFooter")
     page: Optional[GenerationConfig] = Field(default=None, alias="Page")
 class ChunkProcessing(BaseModel):
     target_length: Optional[int] = None
 class Property(BaseModel):
     name: str
     prop_type: str
     description: Optional[str] = None
     default: Optional[str] = None
 class JsonSchema(BaseModel):
     title: str
     properties: List[Property]
 class OcrStrategy(str, Enum):
     ALL = "All"
     AUTO = "Auto"
 class SegmentationStrategy(str, Enum):
     LAYOUT_ANALYSIS = "LayoutAnalysis"
     PAGE = "Page"
 class BoundingBox(BaseModel):
     left: float
     top: float
     width: float
     height: float
 class OCRResult(BaseModel):
     bbox: BoundingBox
     text: str
     confidence: Optional[float]
 class SegmentType(str, Enum):
     CAPTION = "Caption"
     FOOTNOTE = "Footnote"
@@ -82,6 +93,7 @@ class SegmentType(str, Enum):
     TEXT = "Text"
     TITLE = "Title"
 class Segment(BaseModel):
     bbox: BoundingBox
     content: str
@@ -95,33 +107,40 @@ class Segment(BaseModel):
     segment_id: str
     segment_type: SegmentType
 class Chunk(BaseModel):
     chunk_id: str
     chunk_length: int
     segments: List[Segment]
 class ExtractedJson(BaseModel):
     data: Dict
 class OutputResponse(BaseModel):
     chunks: List[Chunk]
     extracted_json: Optional[ExtractedJson] = Field(default=None)
 class Model(str, Enum):
     FAST = "Fast"
     HIGH_QUALITY = "HighQuality"
 class Configuration(BaseModel):
     chunk_processing: Optional[ChunkProcessing] = Field(default=None)
     expires_in: Optional[int] = Field(default=None)
     high_resolution: Optional[bool] = Field(default=None)
-    json_schema: Optional[Union[JsonSchema, Type[BaseModel], BaseModel]] = Field(default=None)
+    json_schema: Optional[Union[JsonSchema, Type[BaseModel], BaseModel]] = Field(
+        default=None
+    )
     model: Optional[Model] = Field(default=None)
     ocr_strategy: Optional[OcrStrategy] = Field(default=None)
     segment_processing: Optional[SegmentProcessing] = Field(default=None)
     segmentation_strategy: Optional[SegmentationStrategy] = Field(default=None)
-    @model_validator(mode='before')
+    @model_validator(mode="before")
     def map_deprecated_fields(cls, values: Dict) -> Dict:
         if isinstance(values, dict) and "target_chunk_length" in values:
             target_length = values.pop("target_chunk_length")
@@ -130,13 +149,18 @@ class Configuration(BaseModel):
                 values["chunk_processing"]["target_length"] = target_length
         return values
-    @model_validator(mode='after')
-    def convert_json_schema(self) -> 'Configuration':
-        if self.json_schema is not None and not isinstance(self.json_schema, JsonSchema):
-            if isinstance(self.json_schema, (BaseModel, type)) and issubclass(getattr(self.json_schema, '__class__', type), BaseModel):
+    @model_validator(mode="after")
+    def convert_json_schema(self) -> "Configuration":
+        if self.json_schema is not None and not isinstance(
+            self.json_schema, JsonSchema
+        ):
+            if isinstance(self.json_schema, (BaseModel, type)) and issubclass(
+                getattr(self.json_schema, "__class__", type), BaseModel
+            ):
                 self.json_schema = JsonSchema(**from_pydantic(self.json_schema))
         return self
 class Status(str, Enum):
     STARTING = "Starting"
     PROCESSING = "Processing"

chunkr_ai/api/misc.py CHANGED Viewed

@@ -6,68 +6,74 @@ from PIL import Image
 import requests
 from typing import Union, Tuple, BinaryIO, Optional
-def prepare_file(
-    file: Union[str, Path, BinaryIO, Image.Image]
-) -> Tuple[str, BinaryIO]:
+def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[str, BinaryIO]:
     """Convert various file types into a tuple of (filename, file-like object)."""
     # Handle URLs
-    if isinstance(file, str) and (file.startswith('http://') or file.startswith('https://')):
+    if isinstance(file, str) and (
+        file.startswith("http://") or file.startswith("https://")
+    ):
         response = requests.get(file)
         response.raise_for_status()
         # Try to get filename from Content-Disposition header first
         filename = None
-        content_disposition = response.headers.get('Content-Disposition')
-        if content_disposition and 'filename=' in content_disposition:
-            filename = content_disposition.split('filename=')[-1].strip('"\'')
+        content_disposition = response.headers.get("Content-Disposition")
+        if content_disposition and "filename=" in content_disposition:
+            filename = content_disposition.split("filename=")[-1].strip("\"'")
         # If no Content-Disposition, try to get clean filename from URL path
         if not filename:
             from urllib.parse import urlparse, unquote
             parsed_url = urlparse(file)
             path = unquote(parsed_url.path)
             filename = Path(path).name if path else None
         # Fallback to default name if we couldn't extract one
-        filename = filename or 'downloaded_file'
+        filename = filename or "downloaded_file"
         # Sanitize filename: remove invalid characters and limit length
         import re
-        filename = re.sub(r'[<>:"/\\|?*%]', '_', filename)  # Replace invalid chars with underscore
-        filename = re.sub(r'\s+', '_', filename)            # Replace whitespace with underscore
-        filename = filename.strip('._')                     # Remove leading/trailing dots and underscores
-        filename = filename[:255]                           # Limit length to 255 characters
+        filename = re.sub(
+            r'[<>:"/\\|?*%]', "_", filename
+        )  # Replace invalid chars with underscore
+        filename = re.sub(r"\s+", "_", filename)  # Replace whitespace with underscore
+        filename = filename.strip("._")  # Remove leading/trailing dots and underscores
+        filename = filename[:255]  # Limit length to 255 characters
         file_obj = io.BytesIO(response.content)
         return filename, file_obj
     # Handle base64 strings
-    if isinstance(file, str) and ',' in file and ';base64,' in file:
+    if isinstance(file, str) and "," in file and ";base64," in file:
         try:
             # Split header and data
-            header, base64_data = file.split(',', 1)
+            header, base64_data = file.split(",", 1)
             import base64
             file_bytes = base64.b64decode(base64_data)
             file_obj = io.BytesIO(file_bytes)
             # Try to determine format from header
-            format = 'bin'
-            mime_type = header.split(':')[-1].split(';')[0].lower()
+            format = "bin"
+            mime_type = header.split(":")[-1].split(";")[0].lower()
             # Map MIME types to file extensions
             mime_to_ext = {
-                'application/pdf': 'pdf',
-                'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
-                'application/msword': 'doc',
-                'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
-                'application/vnd.ms-powerpoint': 'ppt',
-                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
-                'application/vnd.ms-excel': 'xls',
-                'image/jpeg': 'jpg',
-                'image/png': 'png',
-                'image/jpg': 'jpg'
+                "application/pdf": "pdf",
+                "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
+                "application/msword": "doc",
+                "application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
+                "application/vnd.ms-powerpoint": "ppt",
+                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
+                "application/vnd.ms-excel": "xls",
+                "image/jpeg": "jpg",
+                "image/png": "png",
+                "image/jpg": "jpg",
             }
             if mime_type in mime_to_ext:
                 format = mime_to_ext[mime_type]
             else:
@@ -82,36 +88,37 @@ def prepare_file(
         path = Path(file).resolve()
         if not path.exists():
             raise FileNotFoundError(f"File not found: {file}")
-        return path.name, open(path, 'rb')
+        return path.name, open(path, "rb")
     # Handle PIL Images
     if isinstance(file, Image.Image):
         img_byte_arr = io.BytesIO()
-        format = file.format or 'PNG'
+        format = file.format or "PNG"
         file.save(img_byte_arr, format=format)
         img_byte_arr.seek(0)
         return f"image.{format.lower()}", img_byte_arr
     # Handle file-like objects
-    if hasattr(file, 'read') and hasattr(file, 'seek'):
+    if hasattr(file, "read") and hasattr(file, "seek"):
         # Try to get the filename from the file object if possible
-        name = getattr(file, 'name', 'document') if hasattr(file, 'name') else 'document'
+        name = (
+            getattr(file, "name", "document") if hasattr(file, "name") else "document"
+        )
         return Path(name).name, file
     raise TypeError(f"Unsupported file type: {type(file)}")
 def prepare_upload_data(
     file: Optional[Union[str, Path, BinaryIO, Image.Image]] = None,
-    config: Optional[Configuration] = None
+    config: Optional[Configuration] = None,
 ) -> dict:
     """Prepare files and data dictionaries for upload.
     Args:
         file: The file to upload
         config: Optional configuration settings
     Returns:
         dict: (files dict) ready for upload
     """
@@ -123,6 +130,6 @@ def prepare_upload_data(
     if config:
         config_dict = config.model_dump(mode="json", exclude_none=True)
         for key, value in config_dict.items():
-            files[key] = (None, json.dumps(value), 'application/json')
+            files[key] = (None, json.dumps(value), "application/json")
     return files

chunkr_ai/api/protocol.py CHANGED Viewed

@@ -1,14 +1,16 @@
-from typing import runtime_checkable, Protocol
+from typing import Optional, runtime_checkable, Protocol
 from requests import Session
 from httpx import AsyncClient
 @runtime_checkable
 class ChunkrClientProtocol(Protocol):
     """Protocol defining the interface for Chunkr clients"""
     url: str
     _api_key: str
-    _session: Session
-    _client: AsyncClient
+    _session: Optional[Session] = None
+    _client: Optional[AsyncClient] = None
     def get_api_key(self) -> str:
         """Get the API key"""
@@ -16,4 +18,4 @@ class ChunkrClientProtocol(Protocol):
     def _headers(self) -> dict:
         """Return headers required for API requests"""
-        ...
+        ...

chunkr-ai 0.0.12__py3-none-any.whl → 0.0.15__py3-none-any.whl

chunkr-ai 0.0.12__py3-none-any.whl → 0.0.15__py3-none-any.whl