PyPI - chunkr-ai - Versions diffs - 0.0.9__tar.gz → 0.0.11__tar.gz - Mend

chunkr-ai 0.0.9 tar.gz → 0.0.11 tar.gz

Files changed (27) hide show

{chunkr_ai-0.0.9/src/chunkr_ai.egg-info → chunkr_ai-0.0.11}/PKG-INFO RENAMED Viewed

@@ -1,12 +1,12 @@
 Metadata-Version: 2.2
 Name: chunkr-ai
-Version: 0.0.9
+Version: 0.0.11
 Summary: Python client for Chunkr: open source document intelligence
 Author-email: Ishaan Kapoor <ishaan@lumina.sh>
 Project-URL: Homepage, https://chunkr.ai
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: httpx>=0.24.0
+Requires-Dist: httpx>=0.25.0
 Requires-Dist: pillow>=10.0.0
 Requires-Dist: pydantic>=2.0.0
 Requires-Dist: pytest-asyncio>=0.21.0

{chunkr_ai-0.0.9 → chunkr_ai-0.0.11}/pyproject.toml RENAMED Viewed

@@ -4,14 +4,14 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "chunkr-ai"
-version = "0.0.9"
+version = "0.0.11"
 authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
 description = "Python client for Chunkr: open source document intelligence"
 readme = "README.md"
 license = {"file" = "LICENSE"}
 urls = {Homepage = "https://chunkr.ai"}
 dependencies = [
-    "httpx>=0.24.0",
+    "httpx>=0.25.0",
     "pillow>=10.0.0",
     "pydantic>=2.0.0",
     "pytest-asyncio>=0.21.0",

{chunkr_ai-0.0.9 → chunkr_ai-0.0.11}/src/chunkr_ai/api/chunkr.py RENAMED Viewed

@@ -1,4 +1,4 @@
-from .base import ChunkrBase
+from .chunkr_base import ChunkrBase
 from .config import Configuration
 from .task import TaskResponse
 from pathlib import Path
@@ -163,5 +163,3 @@ class Chunkr(ChunkrBase):
             headers=self._headers()
         )
         r.raise_for_status()

{chunkr_ai-0.0.9 → chunkr_ai-0.0.11}/src/chunkr_ai/api/chunkr_async.py RENAMED Viewed

@@ -1,4 +1,4 @@
-from .base import ChunkrBase
+from .chunkr_base import ChunkrBase
 from .task import TaskResponse
 from .config import Configuration
 import httpx

{chunkr_ai-0.0.9 → chunkr_ai-0.0.11}/src/chunkr_ai/api/config.py RENAMED Viewed

@@ -1,6 +1,7 @@
 from pydantic import BaseModel, Field, model_validator, ConfigDict
 from enum import Enum
-from typing import Optional, List, Dict
+from typing import Optional, List, Dict, Union, Type
+from .schema import from_pydantic
 class GenerationStrategy(str, Enum):
     LLM = "LLM"
@@ -40,7 +41,6 @@ class ChunkProcessing(BaseModel):
 class Property(BaseModel):
     name: str
-    title: Optional[str] = None
     prop_type: str
     description: Optional[str] = None
     default: Optional[str] = None
@@ -115,7 +115,7 @@ class Configuration(BaseModel):
     chunk_processing: Optional[ChunkProcessing] = Field(default=None)
     expires_in: Optional[int] = Field(default=None)
     high_resolution: Optional[bool] = Field(default=None)
-    json_schema: Optional[JsonSchema] = Field(default=None)
+    json_schema: Optional[Union[JsonSchema, Type[BaseModel], BaseModel]] = Field(default=None)
     model: Optional[Model] = Field(default=None)
     ocr_strategy: Optional[OcrStrategy] = Field(default=None)
     segment_processing: Optional[SegmentProcessing] = Field(default=None)
@@ -129,3 +129,17 @@ class Configuration(BaseModel):
                 values["chunk_processing"] = values.get("chunk_processing", {}) or {}
                 values["chunk_processing"]["target_length"] = target_length
         return values
+    @model_validator(mode='after')
+    def convert_json_schema(self) -> 'Configuration':
+        if self.json_schema is not None and not isinstance(self.json_schema, JsonSchema):
+            if isinstance(self.json_schema, (BaseModel, type)) and issubclass(getattr(self.json_schema, '__class__', type), BaseModel):
+                self.json_schema = JsonSchema(**from_pydantic(self.json_schema))
+        return self
+class Status(str, Enum):
+    STARTING = "Starting"
+    PROCESSING = "Processing"
+    SUCCEEDED = "Succeeded"
+    FAILED = "Failed"
+    CANCELLED = "Cancelled"

{chunkr_ai-0.0.9 → chunkr_ai-0.0.11}/src/chunkr_ai/api/misc.py RENAMED Viewed

@@ -1,11 +1,10 @@
+from .config import Configuration
 import io
 import json
 from pathlib import Path
 from PIL import Image
 import requests
 from typing import Union, Tuple, BinaryIO, Optional
-from .config import Configuration
 def prepare_file(
     file: Union[str, Path, BinaryIO, Image.Image]
@@ -15,8 +14,31 @@ def prepare_file(
     if isinstance(file, str) and (file.startswith('http://') or file.startswith('https://')):
         response = requests.get(file)
         response.raise_for_status()
+        # Try to get filename from Content-Disposition header first
+        filename = None
+        content_disposition = response.headers.get('Content-Disposition')
+        if content_disposition and 'filename=' in content_disposition:
+            filename = content_disposition.split('filename=')[-1].strip('"\'')
+        # If no Content-Disposition, try to get clean filename from URL path
+        if not filename:
+            from urllib.parse import urlparse, unquote
+            parsed_url = urlparse(file)
+            path = unquote(parsed_url.path)
+            filename = Path(path).name if path else None
+        # Fallback to default name if we couldn't extract one
+        filename = filename or 'downloaded_file'
+        # Sanitize filename: remove invalid characters and limit length
+        import re
+        filename = re.sub(r'[<>:"/\\|?*%]', '_', filename)  # Replace invalid chars with underscore
+        filename = re.sub(r'\s+', '_', filename)            # Replace whitespace with underscore
+        filename = filename.strip('._')                     # Remove leading/trailing dots and underscores
+        filename = filename[:255]                           # Limit length to 255 characters
         file_obj = io.BytesIO(response.content)
-        filename = Path(file.split('/')[-1]).name or 'downloaded_file'
         return filename, file_obj
     # Handle base64 strings

chunkr_ai-0.0.11/src/chunkr_ai/api/schema.py ADDED Viewed

@@ -0,0 +1,128 @@
+from pydantic import BaseModel
+from typing import Optional, List, Union, Type
+import json
+class Property(BaseModel):
+    name: str
+    prop_type: str
+    description: Optional[str] = None
+    default: Optional[str] = None
+class JsonSchema(BaseModel):
+    title: str
+    properties: List[Property]
+def from_pydantic(pydantic: Union[BaseModel, Type[BaseModel]], current_depth: int = 0) -> dict:
+    """Convert a Pydantic model to a Chunk json schema."""
+    MAX_DEPTH = 5
+    model = pydantic if isinstance(pydantic, type) else pydantic.__class__
+    schema = model.model_json_schema()
+    properties = []
+    def get_enum_description(details: dict) -> str:
+        """Get description including enum values if they exist"""
+        description = details.get('description', '')
+        # First check if this is a direct enum
+        if 'enum' in details:
+            enum_values = details['enum']
+            enum_str = '\nAllowed values:\n' + '\n'.join(f'- {val}' for val in enum_values)
+            return f"{description}{enum_str}"
+        # Then check if it's a reference to an enum
+        if '$ref' in details:
+            ref_schema = resolve_ref(details['$ref'], schema.get('$defs', {}))
+            if 'enum' in ref_schema:
+                enum_values = ref_schema['enum']
+                enum_str = '\nAllowed values:\n' + '\n'.join(f'- {val}' for val in enum_values)
+                return f"{description}{enum_str}"
+        return description
+    def resolve_ref(ref: str, definitions: dict) -> dict:
+        """Resolve a $ref reference to its actual schema"""
+        if not ref.startswith('#/$defs/'):
+            return {}
+        ref_name = ref[len('#/$defs/'):]
+        return definitions.get(ref_name, {})
+    def get_nested_schema(field_schema: dict, depth: int) -> dict:
+        if depth >= MAX_DEPTH:
+            return {}
+        # If there's a $ref, resolve it first
+        if '$ref' in field_schema:
+            field_schema = resolve_ref(field_schema['$ref'], schema.get('$defs', {}))
+        nested_props = {}
+        if field_schema.get('type') == 'object':
+            for name, details in field_schema.get('properties', {}).items():
+                if details.get('type') == 'object' or '$ref' in details:
+                    ref_schema = details
+                    if '$ref' in details:
+                        ref_schema = resolve_ref(details['$ref'], schema.get('$defs', {}))
+                    nested_schema = get_nested_schema(ref_schema, depth + 1)
+                    nested_props[name] = {
+                        'type': 'object',
+                        'description': get_enum_description(details),
+                        'properties': nested_schema
+                    }
+                else:
+                    nested_props[name] = {
+                        'type': details.get('type', 'string'),
+                        'description': get_enum_description(details)
+                    }
+        return nested_props
+    for name, details in schema.get('properties', {}).items():
+        # Handle arrays
+        if details.get('type') == 'array':
+            items = details.get('items', {})
+            if '$ref' in items:
+                items = resolve_ref(items['$ref'], schema.get('$defs', {}))
+            # Get nested schema for array items
+            item_schema = get_nested_schema(items, current_depth)
+            description = get_enum_description(details)
+            if item_schema:
+                description = f"{description}\nList items schema:\n{json.dumps(item_schema, indent=2)}"
+            prop = Property(
+                name=name,
+                prop_type='list',
+                description=description
+            )
+        # Handle objects and references
+        elif details.get('type') == 'object' or '$ref' in details:
+            prop_type = 'object'
+            ref_schema = details
+            if '$ref' in details:
+                ref_schema = resolve_ref(details['$ref'], schema.get('$defs', {}))
+            nested_schema = get_nested_schema(ref_schema, current_depth)
+            prop = Property(
+                name=name,
+                prop_type=prop_type,
+                description=get_enum_description(details),
+                properties=nested_schema
+            )
+        # Handle primitive types
+        else:
+            prop = Property(
+                name=name,
+                prop_type=details.get('type', 'string'),
+                description=get_enum_description(details),
+                default=str(details.get('default')) if details.get('default') is not None else None
+            )
+        properties.append(prop)
+    json_schema = JsonSchema(
+        title=schema.get('title', model.__name__),
+        properties=properties
+    )
+    return json_schema.model_dump(mode="json", exclude_none=True)

{chunkr_ai-0.0.9 → chunkr_ai-0.0.11}/src/chunkr_ai/api/task.py RENAMED Viewed

@@ -1,20 +1,12 @@
 from .protocol import ChunkrClientProtocol
-from .config import Configuration, OutputResponse
+from .config import Configuration, OutputResponse, Status
 from .misc import prepare_upload_data
 import asyncio
 from datetime import datetime
-from enum import Enum
 from pydantic import BaseModel, PrivateAttr
 import time
 from typing import Optional, Union
-class Status(str, Enum):
-    STARTING = "Starting"
-    PROCESSING = "Processing"
-    SUCCEEDED = "Succeeded"
-    FAILED = "Failed"
-    CANCELLED = "Cancelled"
 class TaskResponse(BaseModel):
     configuration: Configuration
     created_at: datetime

{chunkr_ai-0.0.9 → chunkr_ai-0.0.11}/src/chunkr_ai/api/task_async.py RENAMED Viewed

@@ -1,20 +1,12 @@
 import asyncio
 from pydantic import BaseModel, PrivateAttr
 from datetime import datetime
-from enum import Enum
 from typing import Optional, Union
 from .task_base import TaskBase
 from .protocol import ChunkrClientProtocol
-from .config import Configuration, OutputResponse
+from .config import Configuration, OutputResponse, Status
 from .misc import prepare_upload_data
-class Status(str, Enum):
-    STARTING = "Starting"
-    PROCESSING = "Processing"
-    SUCCEEDED = "Succeeded"
-    FAILED = "Failed"
-    CANCELLED = "Cancelled"
 class TaskResponseAsync(BaseModel, TaskBase):
     configuration: Configuration
     created_at: datetime

{chunkr_ai-0.0.9 → chunkr_ai-0.0.11}/src/chunkr_ai/models.py RENAMED Viewed

@@ -17,9 +17,11 @@ from .api.config import (
     SegmentProcessing,
     SegmentType,
     SegmentationStrategy,
+    Status,
 )
-from .api.task import TaskResponse, Status
+from .api.task import TaskResponse
+from .api.task_async import TaskResponseAsync
 __all__ = [
     'BoundingBox',
@@ -42,5 +44,6 @@ __all__ = [
     'SegmentType',
     'SegmentationStrategy',
     'Status',
-    'TaskResponse'
+    'TaskResponse',
+    'TaskResponseAsync',
 ]

{chunkr_ai-0.0.9 → chunkr_ai-0.0.11/src/chunkr_ai.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,12 +1,12 @@
 Metadata-Version: 2.2
 Name: chunkr-ai
-Version: 0.0.9
+Version: 0.0.11
 Summary: Python client for Chunkr: open source document intelligence
 Author-email: Ishaan Kapoor <ishaan@lumina.sh>
 Project-URL: Homepage, https://chunkr.ai
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: httpx>=0.24.0
+Requires-Dist: httpx>=0.25.0
 Requires-Dist: pillow>=10.0.0
 Requires-Dist: pydantic>=2.0.0
 Requires-Dist: pytest-asyncio>=0.21.0

{chunkr_ai-0.0.9 → chunkr_ai-0.0.11}/src/chunkr_ai.egg-info/SOURCES.txt RENAMED Viewed

@@ -2,7 +2,6 @@ LICENSE
 README.md
 pyproject.toml
 src/chunkr_ai/__init__.py
-src/chunkr_ai/main.py
 src/chunkr_ai/models.py
 src/chunkr_ai.egg-info/PKG-INFO
 src/chunkr_ai.egg-info/SOURCES.txt
@@ -10,14 +9,14 @@ src/chunkr_ai.egg-info/dependency_links.txt
 src/chunkr_ai.egg-info/requires.txt
 src/chunkr_ai.egg-info/top_level.txt
 src/chunkr_ai/api/__init__.py
-src/chunkr_ai/api/api.py
 src/chunkr_ai/api/auth.py
-src/chunkr_ai/api/base.py
 src/chunkr_ai/api/chunkr.py
 src/chunkr_ai/api/chunkr_async.py
+src/chunkr_ai/api/chunkr_base.py
 src/chunkr_ai/api/config.py
 src/chunkr_ai/api/misc.py
 src/chunkr_ai/api/protocol.py
+src/chunkr_ai/api/schema.py
 src/chunkr_ai/api/task.py
 src/chunkr_ai/api/task_async.py
 src/chunkr_ai/api/task_base.py

{chunkr_ai-0.0.9 → chunkr_ai-0.0.11}/src/chunkr_ai.egg-info/requires.txt RENAMED Viewed

@@ -1,4 +1,4 @@
-httpx>=0.24.0
+httpx>=0.25.0
 pillow>=10.0.0
 pydantic>=2.0.0
 pytest-asyncio>=0.21.0

chunkr_ai-0.0.9/src/chunkr_ai/api/api.py DELETED Viewed

File without changes

chunkr_ai-0.0.9/src/chunkr_ai/main.py DELETED Viewed

@@ -1,12 +0,0 @@
-from chunkr_ai.api.chunkr import Chunkr
-from chunkr_ai.models import Configuration
-from chunkr_ai.api.config import SegmentationStrategy, ChunkProcessing
-if __name__ == "__main__":
-    chunkr = Chunkr()
-    task = chunkr.update_task("556b4fe5-e3f7-48dc-9f56-0fb7fbacdb87", Configuration(
-        chunk_processing=ChunkProcessing(
-            target_length=1000
-        )
-    ))
-    print(task)