PyPI - chunkr-ai - Versions diffs - 0.0.14__py3-none-any.whl → 0.0.16__py3-none-any.whl - Mend

chunkr-ai 0.0.14__py3-none-any.whl → 0.0.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20)

chunkr_ai/__init__.py +1 -1
chunkr_ai/api/auth.py +4 -4
chunkr_ai/api/base.py +58 -48
chunkr_ai/api/chunkr.py +21 -20
chunkr_ai/api/chunkr_async.py +26 -20
chunkr_ai/api/chunkr_base.py +34 -27
chunkr_ai/api/config.py +41 -14
chunkr_ai/api/misc.py +52 -44
chunkr_ai/api/protocol.py +5 -3
chunkr_ai/api/schema.py +66 -58
chunkr_ai/api/task.py +13 -16
chunkr_ai/api/task_async.py +16 -7
chunkr_ai/api/task_base.py +4 -1
chunkr_ai/models.py +23 -22
{chunkr_ai-0.0.14.dist-info → chunkr_ai-0.0.16.dist-info}/METADATA +1 -1
chunkr_ai-0.0.16.dist-info/RECORD +21 -0
chunkr_ai-0.0.14.dist-info/RECORD +0 -21
{chunkr_ai-0.0.14.dist-info → chunkr_ai-0.0.16.dist-info}/LICENSE +0 -0
{chunkr_ai-0.0.14.dist-info → chunkr_ai-0.0.16.dist-info}/WHEEL +0 -0
{chunkr_ai-0.0.14.dist-info → chunkr_ai-0.0.16.dist-info}/top_level.txt +0 -0

chunkr_ai/api/config.py CHANGED Viewed

@@ -3,28 +3,31 @@ from enum import Enum
 from typing import Optional, List, Dict, Union, Type
 from .schema import from_pydantic
 class GenerationStrategy(str, Enum):
     LLM = "LLM"
     AUTO = "Auto"
 class CroppingStrategy(str, Enum):
-    ALL = "All"
+    ALL = "All"
     AUTO = "Auto"
 class GenerationConfig(BaseModel):
     html: Optional[GenerationStrategy] = None
     llm: Optional[str] = None
     markdown: Optional[GenerationStrategy] = None
     crop_image: Optional[CroppingStrategy] = None
 class SegmentProcessing(BaseModel):
-    model_config = ConfigDict(
-        populate_by_name=True,
-        alias_generator=str.title
-    )
+    model_config = ConfigDict(populate_by_name=True, alias_generator=str.title)
     title: Optional[GenerationConfig] = Field(default=None, alias="Title")
-    section_header: Optional[GenerationConfig] = Field(default=None, alias="SectionHeader")
+    section_header: Optional[GenerationConfig] = Field(
+        default=None, alias="SectionHeader"
+    )
     text: Optional[GenerationConfig] = Field(default=None, alias="Text")
     list_item: Optional[GenerationConfig] = Field(default=None, alias="ListItem")
     table: Optional[GenerationConfig] = Field(default=None, alias="Table")
@@ -36,38 +39,46 @@ class SegmentProcessing(BaseModel):
     page_footer: Optional[GenerationConfig] = Field(default=None, alias="PageFooter")
     page: Optional[GenerationConfig] = Field(default=None, alias="Page")
 class ChunkProcessing(BaseModel):
     target_length: Optional[int] = None
 class Property(BaseModel):
     name: str
     prop_type: str
     description: Optional[str] = None
     default: Optional[str] = None
 class JsonSchema(BaseModel):
     title: str
     properties: List[Property]
 class OcrStrategy(str, Enum):
     ALL = "All"
     AUTO = "Auto"
 class SegmentationStrategy(str, Enum):
     LAYOUT_ANALYSIS = "LayoutAnalysis"
     PAGE = "Page"
 class BoundingBox(BaseModel):
     left: float
     top: float
     width: float
     height: float
 class OCRResult(BaseModel):
     bbox: BoundingBox
     text: str
     confidence: Optional[float]
 class SegmentType(str, Enum):
     CAPTION = "Caption"
     FOOTNOTE = "Footnote"
@@ -82,6 +93,7 @@ class SegmentType(str, Enum):
     TEXT = "Text"
     TITLE = "Title"
 class Segment(BaseModel):
     bbox: BoundingBox
     content: str
@@ -95,33 +107,43 @@ class Segment(BaseModel):
     segment_id: str
     segment_type: SegmentType
 class Chunk(BaseModel):
     chunk_id: str
     chunk_length: int
     segments: List[Segment]
 class ExtractedJson(BaseModel):
     data: Dict
 class OutputResponse(BaseModel):
     chunks: List[Chunk]
     extracted_json: Optional[ExtractedJson] = Field(default=None)
 class Model(str, Enum):
     FAST = "Fast"
     HIGH_QUALITY = "HighQuality"
+class PipelineType(str, Enum):
+    AZURE = "Azure"
 class Configuration(BaseModel):
     chunk_processing: Optional[ChunkProcessing] = Field(default=None)
     expires_in: Optional[int] = Field(default=None)
     high_resolution: Optional[bool] = Field(default=None)
-    json_schema: Optional[Union[JsonSchema, Type[BaseModel], BaseModel]] = Field(default=None)
+    json_schema: Optional[Union[JsonSchema, Type[BaseModel], BaseModel]] = Field(
+        default=None
+    )
     model: Optional[Model] = Field(default=None)
     ocr_strategy: Optional[OcrStrategy] = Field(default=None)
     segment_processing: Optional[SegmentProcessing] = Field(default=None)
     segmentation_strategy: Optional[SegmentationStrategy] = Field(default=None)
+    pipeline: Optional[PipelineType] = Field(default=None)
-    @model_validator(mode='before')
+    @model_validator(mode="before")
     def map_deprecated_fields(cls, values: Dict) -> Dict:
         if isinstance(values, dict) and "target_chunk_length" in values:
             target_length = values.pop("target_chunk_length")
@@ -130,13 +152,18 @@ class Configuration(BaseModel):
                 values["chunk_processing"]["target_length"] = target_length
         return values
-    @model_validator(mode='after')
-    def convert_json_schema(self) -> 'Configuration':
-        if self.json_schema is not None and not isinstance(self.json_schema, JsonSchema):
-            if isinstance(self.json_schema, (BaseModel, type)) and issubclass(getattr(self.json_schema, '__class__', type), BaseModel):
+    @model_validator(mode="after")
+    def convert_json_schema(self) -> "Configuration":
+        if self.json_schema is not None and not isinstance(
+            self.json_schema, JsonSchema
+        ):
+            if isinstance(self.json_schema, (BaseModel, type)) and issubclass(
+                getattr(self.json_schema, "__class__", type), BaseModel
+            ):
                 self.json_schema = JsonSchema(**from_pydantic(self.json_schema))
         return self
 class Status(str, Enum):
     STARTING = "Starting"
     PROCESSING = "Processing"

chunkr_ai/api/misc.py CHANGED Viewed

@@ -3,71 +3,78 @@ import io
 import json
 from pathlib import Path
 from PIL import Image
+from pydantic import BaseModel
 import requests
 from typing import Union, Tuple, BinaryIO, Optional
-def prepare_file(
-    file: Union[str, Path, BinaryIO, Image.Image]
-) -> Tuple[str, BinaryIO]:
+def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[str, BinaryIO]:
     """Convert various file types into a tuple of (filename, file-like object)."""
     # Handle URLs
-    if isinstance(file, str) and (file.startswith('http://') or file.startswith('https://')):
+    if isinstance(file, str) and (
+        file.startswith("http://") or file.startswith("https://")
+    ):
         response = requests.get(file)
         response.raise_for_status()
         # Try to get filename from Content-Disposition header first
         filename = None
-        content_disposition = response.headers.get('Content-Disposition')
-        if content_disposition and 'filename=' in content_disposition:
-            filename = content_disposition.split('filename=')[-1].strip('"\'')
+        content_disposition = response.headers.get("Content-Disposition")
+        if content_disposition and "filename=" in content_disposition:
+            filename = content_disposition.split("filename=")[-1].strip("\"'")
         # If no Content-Disposition, try to get clean filename from URL path
         if not filename:
             from urllib.parse import urlparse, unquote
             parsed_url = urlparse(file)
             path = unquote(parsed_url.path)
             filename = Path(path).name if path else None
         # Fallback to default name if we couldn't extract one
-        filename = filename or 'downloaded_file'
+        filename = filename or "downloaded_file"
         # Sanitize filename: remove invalid characters and limit length
         import re
-        filename = re.sub(r'[<>:"/\\|?*%]', '_', filename)  # Replace invalid chars with underscore
-        filename = re.sub(r'\s+', '_', filename)            # Replace whitespace with underscore
-        filename = filename.strip('._')                     # Remove leading/trailing dots and underscores
-        filename = filename[:255]                           # Limit length to 255 characters
+        filename = re.sub(
+            r'[<>:"/\\|?*%]', "_", filename
+        )  # Replace invalid chars with underscore
+        filename = re.sub(r"\s+", "_", filename)  # Replace whitespace with underscore
+        filename = filename.strip("._")  # Remove leading/trailing dots and underscores
+        filename = filename[:255]  # Limit length to 255 characters
         file_obj = io.BytesIO(response.content)
         return filename, file_obj
     # Handle base64 strings
-    if isinstance(file, str) and ',' in file and ';base64,' in file:
+    if isinstance(file, str) and "," in file and ";base64," in file:
         try:
             # Split header and data
-            header, base64_data = file.split(',', 1)
+            header, base64_data = file.split(",", 1)
             import base64
             file_bytes = base64.b64decode(base64_data)
             file_obj = io.BytesIO(file_bytes)
             # Try to determine format from header
-            format = 'bin'
-            mime_type = header.split(':')[-1].split(';')[0].lower()
+            format = "bin"
+            mime_type = header.split(":")[-1].split(";")[0].lower()
             # Map MIME types to file extensions
             mime_to_ext = {
-                'application/pdf': 'pdf',
-                'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
-                'application/msword': 'doc',
-                'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
-                'application/vnd.ms-powerpoint': 'ppt',
-                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
-                'application/vnd.ms-excel': 'xls',
-                'image/jpeg': 'jpg',
-                'image/png': 'png',
-                'image/jpg': 'jpg'
+                "application/pdf": "pdf",
+                "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
+                "application/msword": "doc",
+                "application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
+                "application/vnd.ms-powerpoint": "ppt",
+                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
+                "application/vnd.ms-excel": "xls",
+                "image/jpeg": "jpg",
+                "image/png": "png",
+                "image/jpg": "jpg",
             }
             if mime_type in mime_to_ext:
                 format = mime_to_ext[mime_type]
             else:
@@ -82,36 +89,37 @@ def prepare_file(
         path = Path(file).resolve()
         if not path.exists():
             raise FileNotFoundError(f"File not found: {file}")
-        return path.name, open(path, 'rb')
+        return path.name, open(path, "rb")
     # Handle PIL Images
     if isinstance(file, Image.Image):
         img_byte_arr = io.BytesIO()
-        format = file.format or 'PNG'
+        format = file.format or "PNG"
         file.save(img_byte_arr, format=format)
         img_byte_arr.seek(0)
         return f"image.{format.lower()}", img_byte_arr
     # Handle file-like objects
-    if hasattr(file, 'read') and hasattr(file, 'seek'):
+    if hasattr(file, "read") and hasattr(file, "seek"):
         # Try to get the filename from the file object if possible
-        name = getattr(file, 'name', 'document') if hasattr(file, 'name') else 'document'
+        name = (
+            getattr(file, "name", "document") if hasattr(file, "name") else "document"
+        )
         return Path(name).name, file
     raise TypeError(f"Unsupported file type: {type(file)}")
 def prepare_upload_data(
     file: Optional[Union[str, Path, BinaryIO, Image.Image]] = None,
-    config: Optional[Configuration] = None
+    config: Optional[Configuration] = None,
 ) -> dict:
     """Prepare files and data dictionaries for upload.
     Args:
         file: The file to upload
         config: Optional configuration settings
     Returns:
         dict: (files dict) ready for upload
     """
@@ -123,6 +131,6 @@ def prepare_upload_data(
     if config:
         config_dict = config.model_dump(mode="json", exclude_none=True)
         for key, value in config_dict.items():
-            files[key] = (None, json.dumps(value), 'application/json')
+            files[key] = (None, json.dumps(value), "application/json")
     return files

chunkr_ai/api/protocol.py CHANGED Viewed

@@ -1,14 +1,16 @@
 from typing import Optional, runtime_checkable, Protocol
 from requests import Session
-from aiohttp import ClientSession
+from httpx import AsyncClient
 @runtime_checkable
 class ChunkrClientProtocol(Protocol):
     """Protocol defining the interface for Chunkr clients"""
     url: str
     _api_key: str
     _session: Optional[Session] = None
-    _client: Optional[ClientSession] = None
+    _client: Optional[AsyncClient] = None
     def get_api_key(self) -> str:
         """Get the API key"""
@@ -16,4 +18,4 @@ class ChunkrClientProtocol(Protocol):
     def _headers(self) -> dict:
         """Return headers required for API requests"""
-        ...
+        ...

chunkr_ai/api/schema.py CHANGED Viewed

@@ -2,17 +2,22 @@ from pydantic import BaseModel
 from typing import Optional, List, Union, Type
 import json
 class Property(BaseModel):
     name: str
     prop_type: str
     description: Optional[str] = None
     default: Optional[str] = None
 class JsonSchema(BaseModel):
     title: str
     properties: List[Property]
-def from_pydantic(pydantic: Union[BaseModel, Type[BaseModel]], current_depth: int = 0) -> dict:
+def from_pydantic(
+    pydantic: Union[BaseModel, Type[BaseModel]], current_depth: int = 0
+) -> dict:
     """Convert a Pydantic model to a Chunk json schema."""
     MAX_DEPTH = 5
     model = pydantic if isinstance(pydantic, type) else pydantic.__class__
@@ -21,108 +26,111 @@ def from_pydantic(pydantic: Union[BaseModel, Type[BaseModel]], current_depth: in
     def get_enum_description(details: dict) -> str:
         """Get description including enum values if they exist"""
-        description = details.get('description', '')
+        description = details.get("description", "")
         # First check if this is a direct enum
-        if 'enum' in details:
-            enum_values = details['enum']
-            enum_str = '\nAllowed values:\n' + '\n'.join(f'- {val}' for val in enum_values)
+        if "enum" in details:
+            enum_values = details["enum"]
+            enum_str = "\nAllowed values:\n" + "\n".join(
+                f"- {val}" for val in enum_values
+            )
             return f"{description}{enum_str}"
         # Then check if it's a reference to an enum
-        if '$ref' in details:
-            ref_schema = resolve_ref(details['$ref'], schema.get('$defs', {}))
-            if 'enum' in ref_schema:
-                enum_values = ref_schema['enum']
-                enum_str = '\nAllowed values:\n' + '\n'.join(f'- {val}' for val in enum_values)
+        if "$ref" in details:
+            ref_schema = resolve_ref(details["$ref"], schema.get("$defs", {}))
+            if "enum" in ref_schema:
+                enum_values = ref_schema["enum"]
+                enum_str = "\nAllowed values:\n" + "\n".join(
+                    f"- {val}" for val in enum_values
+                )
                 return f"{description}{enum_str}"
         return description
     def resolve_ref(ref: str, definitions: dict) -> dict:
         """Resolve a $ref reference to its actual schema"""
-        if not ref.startswith('#/$defs/'):
+        if not ref.startswith("#/$defs/"):
             return {}
-        ref_name = ref[len('#/$defs/'):]
+        ref_name = ref[len("#/$defs/") :]
         return definitions.get(ref_name, {})
     def get_nested_schema(field_schema: dict, depth: int) -> dict:
         if depth >= MAX_DEPTH:
             return {}
         # If there's a $ref, resolve it first
-        if '$ref' in field_schema:
-            field_schema = resolve_ref(field_schema['$ref'], schema.get('$defs', {}))
+        if "$ref" in field_schema:
+            field_schema = resolve_ref(field_schema["$ref"], schema.get("$defs", {}))
         nested_props = {}
-        if field_schema.get('type') == 'object':
-            for name, details in field_schema.get('properties', {}).items():
-                if details.get('type') == 'object' or '$ref' in details:
+        if field_schema.get("type") == "object":
+            for name, details in field_schema.get("properties", {}).items():
+                if details.get("type") == "object" or "$ref" in details:
                     ref_schema = details
-                    if '$ref' in details:
-                        ref_schema = resolve_ref(details['$ref'], schema.get('$defs', {}))
+                    if "$ref" in details:
+                        ref_schema = resolve_ref(
+                            details["$ref"], schema.get("$defs", {})
+                        )
                     nested_schema = get_nested_schema(ref_schema, depth + 1)
                     nested_props[name] = {
-                        'type': 'object',
-                        'description': get_enum_description(details),
-                        'properties': nested_schema
+                        "type": "object",
+                        "description": get_enum_description(details),
+                        "properties": nested_schema,
                     }
                 else:
                     nested_props[name] = {
-                        'type': details.get('type', 'string'),
-                        'description': get_enum_description(details)
+                        "type": details.get("type", "string"),
+                        "description": get_enum_description(details),
                     }
         return nested_props
-    for name, details in schema.get('properties', {}).items():
+    for name, details in schema.get("properties", {}).items():
         # Handle arrays
-        if details.get('type') == 'array':
-            items = details.get('items', {})
-            if '$ref' in items:
-                items = resolve_ref(items['$ref'], schema.get('$defs', {}))
+        if details.get("type") == "array":
+            items = details.get("items", {})
+            if "$ref" in items:
+                items = resolve_ref(items["$ref"], schema.get("$defs", {}))
             # Get nested schema for array items
             item_schema = get_nested_schema(items, current_depth)
             description = get_enum_description(details)
             if item_schema:
                 description = f"{description}\nList items schema:\n{json.dumps(item_schema, indent=2)}"
-            prop = Property(
-                name=name,
-                prop_type='list',
-                description=description
-            )
+            prop = Property(name=name, prop_type="list", description=description)
         # Handle objects and references
-        elif details.get('type') == 'object' or '$ref' in details:
-            prop_type = 'object'
+        elif details.get("type") == "object" or "$ref" in details:
+            prop_type = "object"
             ref_schema = details
-            if '$ref' in details:
-                ref_schema = resolve_ref(details['$ref'], schema.get('$defs', {}))
+            if "$ref" in details:
+                ref_schema = resolve_ref(details["$ref"], schema.get("$defs", {}))
             nested_schema = get_nested_schema(ref_schema, current_depth)
             prop = Property(
                 name=name,
                 prop_type=prop_type,
                 description=get_enum_description(details),
-                properties=nested_schema
+                properties=nested_schema,
             )
         # Handle primitive types
         else:
             prop = Property(
                 name=name,
-                prop_type=details.get('type', 'string'),
+                prop_type=details.get("type", "string"),
                 description=get_enum_description(details),
-                default=str(details.get('default')) if details.get('default') is not None else None
+                default=str(details.get("default"))
+                if details.get("default") is not None
+                else None,
             )
         properties.append(prop)
     json_schema = JsonSchema(
-        title=schema.get('title', model.__name__),
-        properties=properties
+        title=schema.get("title", model.__name__), properties=properties
     )
-    return json_schema.model_dump(mode="json", exclude_none=True)
+    return json_schema.model_dump(mode="json", exclude_none=True)

chunkr_ai/api/task.py CHANGED Viewed

@@ -3,24 +3,27 @@ from .misc import prepare_upload_data
 from .task_base import TaskBase
 import time
 class TaskResponse(TaskBase):
     def _poll_request(self) -> dict:
         while True:
             try:
                 if not self.task_url:
-                   raise ValueError("Task URL not found in response")
+                    raise ValueError("Task URL not found in response")
                 if not self._client._session:
                     raise ValueError("Client session not found")
-                r = self._client._session.get(self.task_url, headers=self._client._headers())
+                r = self._client._session.get(
+                    self.task_url, headers=self._client._headers()
+                )
                 r.raise_for_status()
                 return r.json()
             except (ConnectionError, TimeoutError) as _:
                 print("Connection error while polling the task, retrying...")
                 time.sleep(0.5)
-            except Exception as e:
+            except Exception:
                 raise
-    def poll(self) -> 'TaskResponse':
+    def poll(self) -> "TaskResponse":
         while True:
             response = self._poll_request()
             updated_task = TaskResponse(**response).with_client(self._client)
@@ -28,31 +31,28 @@ class TaskResponse(TaskBase):
             if result := self._check_status():
                 return result
             time.sleep(0.5)
-    def update(self, config: Configuration) -> 'TaskResponse':
+    def update(self, config: Configuration) -> "TaskResponse":
         if not self.task_url:
             raise ValueError("Task URL not found")
         if not self._client._session:
             raise ValueError("Client session not found")
         files = prepare_upload_data(None, config)
         r = self._client._session.patch(
-            self.task_url,
-            files=files,
-            headers=self._client._headers()
+            self.task_url, files=files, headers=self._client._headers()
         )
         r.raise_for_status()
         updated = TaskResponse(**r.json()).with_client(self._client)
         self.__dict__.update(updated.__dict__)
         return self.poll()
     def cancel(self):
         if not self.task_url:
             raise ValueError("Task URL not found")
         if not self._client._session:
             raise ValueError("Client session not found")
         r = self._client._session.get(
-            f"{self.task_url}/cancel",
-            headers=self._client._headers()
+            f"{self.task_url}/cancel", headers=self._client._headers()
         )
         r.raise_for_status()
         self.poll()
@@ -62,8 +62,5 @@ class TaskResponse(TaskBase):
             raise ValueError("Task URL not found")
         if not self._client._session:
             raise ValueError("Client session not found")
-        r = self._client._session.delete(
-            self.task_url,
-            headers=self._client._headers()
-        )
+        r = self._client._session.delete(self.task_url, headers=self._client._headers())
         r.raise_for_status()

chunkr-ai 0.0.14__py3-none-any.whl → 0.0.16__py3-none-any.whl

chunkr-ai 0.0.14__py3-none-any.whl → 0.0.16__py3-none-any.whl