PyPI - chunkr-ai - Versions diffs - 0.0.9__py3-none-any.whl → 0.0.11__py3-none-any.whl - Mend

chunkr-ai 0.0.9-py3-none-any.whl → 0.0.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of chunkr-ai might be problematic. Click here for more details.

Files changed (17)

chunkr_ai/api/chunkr.py +1 -3
chunkr_ai/api/chunkr_async.py +1 -1
chunkr_ai/api/config.py +17 -3
chunkr_ai/api/misc.py +25 -3
chunkr_ai/api/schema.py +128 -0
chunkr_ai/api/task.py +1 -9
chunkr_ai/api/task_async.py +1 -9
chunkr_ai/models.py +5 -2
{chunkr_ai-0.0.9.dist-info → chunkr_ai-0.0.11.dist-info}/METADATA +2 -2
chunkr_ai-0.0.11.dist-info/RECORD +19 -0
chunkr_ai/api/api.py +0 -0
chunkr_ai/main.py +0 -12
chunkr_ai-0.0.9.dist-info/RECORD +0 -20
chunkr_ai/api/{base.py → chunkr_base.py} +0 -0
{chunkr_ai-0.0.9.dist-info → chunkr_ai-0.0.11.dist-info}/LICENSE +0 -0
{chunkr_ai-0.0.9.dist-info → chunkr_ai-0.0.11.dist-info}/WHEEL +0 -0
{chunkr_ai-0.0.9.dist-info → chunkr_ai-0.0.11.dist-info}/top_level.txt +0 -0

chunkr_ai/api/chunkr.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from .base import ChunkrBase
+from .chunkr_base import ChunkrBase
 from .config import Configuration
 from .task import TaskResponse
 from pathlib import Path
@@ -163,5 +163,3 @@ class Chunkr(ChunkrBase):
             headers=self._headers()
         )
         r.raise_for_status()

chunkr_ai/api/chunkr_async.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from .base import ChunkrBase
+from .chunkr_base import ChunkrBase
 from .task import TaskResponse
 from .config import Configuration
 import httpx

chunkr_ai/api/config.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from pydantic import BaseModel, Field, model_validator, ConfigDict
 from enum import Enum
-from typing import Optional, List, Dict
+from typing import Optional, List, Dict, Union, Type
+from .schema import from_pydantic
 class GenerationStrategy(str, Enum):
     LLM = "LLM"
@@ -40,7 +41,6 @@ class ChunkProcessing(BaseModel):
 class Property(BaseModel):
     name: str
-    title: Optional[str] = None
     prop_type: str
     description: Optional[str] = None
     default: Optional[str] = None
@@ -115,7 +115,7 @@ class Configuration(BaseModel):
     chunk_processing: Optional[ChunkProcessing] = Field(default=None)
     expires_in: Optional[int] = Field(default=None)
     high_resolution: Optional[bool] = Field(default=None)
-    json_schema: Optional[JsonSchema] = Field(default=None)
+    json_schema: Optional[Union[JsonSchema, Type[BaseModel], BaseModel]] = Field(default=None)
     model: Optional[Model] = Field(default=None)
     ocr_strategy: Optional[OcrStrategy] = Field(default=None)
     segment_processing: Optional[SegmentProcessing] = Field(default=None)
@@ -129,3 +129,17 @@ class Configuration(BaseModel):
                 values["chunk_processing"] = values.get("chunk_processing", {}) or {}
                 values["chunk_processing"]["target_length"] = target_length
         return values
+    @model_validator(mode='after')
+    def convert_json_schema(self) -> 'Configuration':
+        if self.json_schema is not None and not isinstance(self.json_schema, JsonSchema):
+            if isinstance(self.json_schema, (BaseModel, type)) and issubclass(getattr(self.json_schema, '__class__', type), BaseModel):
+                self.json_schema = JsonSchema(**from_pydantic(self.json_schema))
+        return self
+class Status(str, Enum):
+    STARTING = "Starting"
+    PROCESSING = "Processing"
+    SUCCEEDED = "Succeeded"
+    FAILED = "Failed"
+    CANCELLED = "Cancelled"

chunkr_ai/api/misc.py CHANGED Viewed

@@ -1,11 +1,10 @@
+from .config import Configuration
 import io
 import json
 from pathlib import Path
 from PIL import Image
 import requests
 from typing import Union, Tuple, BinaryIO, Optional
-from .config import Configuration
 def prepare_file(
     file: Union[str, Path, BinaryIO, Image.Image]
@@ -15,8 +14,31 @@ def prepare_file(
     if isinstance(file, str) and (file.startswith('http://') or file.startswith('https://')):
         response = requests.get(file)
         response.raise_for_status()
+        # Try to get filename from Content-Disposition header first
+        filename = None
+        content_disposition = response.headers.get('Content-Disposition')
+        if content_disposition and 'filename=' in content_disposition:
+            filename = content_disposition.split('filename=')[-1].strip('"\'')
+        # If no Content-Disposition, try to get clean filename from URL path
+        if not filename:
+            from urllib.parse import urlparse, unquote
+            parsed_url = urlparse(file)
+            path = unquote(parsed_url.path)
+            filename = Path(path).name if path else None
+        # Fallback to default name if we couldn't extract one
+        filename = filename or 'downloaded_file'
+        # Sanitize filename: remove invalid characters and limit length
+        import re
+        filename = re.sub(r'[<>:"/\\|?*%]', '_', filename)  # Replace invalid chars with underscore
+        filename = re.sub(r'\s+', '_', filename)            # Replace whitespace with underscore
+        filename = filename.strip('._')                     # Remove leading/trailing dots and underscores
+        filename = filename[:255]                           # Limit length to 255 characters
         file_obj = io.BytesIO(response.content)
-        filename = Path(file.split('/')[-1]).name or 'downloaded_file'
         return filename, file_obj
     # Handle base64 strings

chunkr_ai/api/schema.py ADDED Viewed

@@ -0,0 +1,128 @@
+from pydantic import BaseModel
+from typing import Optional, List, Union, Type
+import json
+class Property(BaseModel):
+    name: str
+    prop_type: str
+    description: Optional[str] = None
+    default: Optional[str] = None
+class JsonSchema(BaseModel):
+    title: str
+    properties: List[Property]
+def from_pydantic(pydantic: Union[BaseModel, Type[BaseModel]], current_depth: int = 0) -> dict:
+    """Convert a Pydantic model to a Chunk json schema."""
+    MAX_DEPTH = 5
+    model = pydantic if isinstance(pydantic, type) else pydantic.__class__
+    schema = model.model_json_schema()
+    properties = []
+    def get_enum_description(details: dict) -> str:
+        """Get description including enum values if they exist"""
+        description = details.get('description', '')
+        # First check if this is a direct enum
+        if 'enum' in details:
+            enum_values = details['enum']
+            enum_str = '\nAllowed values:\n' + '\n'.join(f'- {val}' for val in enum_values)
+            return f"{description}{enum_str}"
+        # Then check if it's a reference to an enum
+        if '$ref' in details:
+            ref_schema = resolve_ref(details['$ref'], schema.get('$defs', {}))
+            if 'enum' in ref_schema:
+                enum_values = ref_schema['enum']
+                enum_str = '\nAllowed values:\n' + '\n'.join(f'- {val}' for val in enum_values)
+                return f"{description}{enum_str}"
+        return description
+    def resolve_ref(ref: str, definitions: dict) -> dict:
+        """Resolve a $ref reference to its actual schema"""
+        if not ref.startswith('#/$defs/'):
+            return {}
+        ref_name = ref[len('#/$defs/'):]
+        return definitions.get(ref_name, {})
+    def get_nested_schema(field_schema: dict, depth: int) -> dict:
+        if depth >= MAX_DEPTH:
+            return {}
+        # If there's a $ref, resolve it first
+        if '$ref' in field_schema:
+            field_schema = resolve_ref(field_schema['$ref'], schema.get('$defs', {}))
+        nested_props = {}
+        if field_schema.get('type') == 'object':
+            for name, details in field_schema.get('properties', {}).items():
+                if details.get('type') == 'object' or '$ref' in details:
+                    ref_schema = details
+                    if '$ref' in details:
+                        ref_schema = resolve_ref(details['$ref'], schema.get('$defs', {}))
+                    nested_schema = get_nested_schema(ref_schema, depth + 1)
+                    nested_props[name] = {
+                        'type': 'object',
+                        'description': get_enum_description(details),
+                        'properties': nested_schema
+                    }
+                else:
+                    nested_props[name] = {
+                        'type': details.get('type', 'string'),
+                        'description': get_enum_description(details)
+                    }
+        return nested_props
+    for name, details in schema.get('properties', {}).items():
+        # Handle arrays
+        if details.get('type') == 'array':
+            items = details.get('items', {})
+            if '$ref' in items:
+                items = resolve_ref(items['$ref'], schema.get('$defs', {}))
+            # Get nested schema for array items
+            item_schema = get_nested_schema(items, current_depth)
+            description = get_enum_description(details)
+            if item_schema:
+                description = f"{description}\nList items schema:\n{json.dumps(item_schema, indent=2)}"
+            prop = Property(
+                name=name,
+                prop_type='list',
+                description=description
+            )
+        # Handle objects and references
+        elif details.get('type') == 'object' or '$ref' in details:
+            prop_type = 'object'
+            ref_schema = details
+            if '$ref' in details:
+                ref_schema = resolve_ref(details['$ref'], schema.get('$defs', {}))
+            nested_schema = get_nested_schema(ref_schema, current_depth)
+            prop = Property(
+                name=name,
+                prop_type=prop_type,
+                description=get_enum_description(details),
+                properties=nested_schema
+            )
+        # Handle primitive types
+        else:
+            prop = Property(
+                name=name,
+                prop_type=details.get('type', 'string'),
+                description=get_enum_description(details),
+                default=str(details.get('default')) if details.get('default') is not None else None
+            )
+        properties.append(prop)
+    json_schema = JsonSchema(
+        title=schema.get('title', model.__name__),
+        properties=properties
+    )
+    return json_schema.model_dump(mode="json", exclude_none=True)

chunkr_ai/api/task.py CHANGED Viewed

@@ -1,20 +1,12 @@
 from .protocol import ChunkrClientProtocol
-from .config import Configuration, OutputResponse
+from .config import Configuration, OutputResponse, Status
 from .misc import prepare_upload_data
 import asyncio
 from datetime import datetime
-from enum import Enum
 from pydantic import BaseModel, PrivateAttr
 import time
 from typing import Optional, Union
-class Status(str, Enum):
-    STARTING = "Starting"
-    PROCESSING = "Processing"
-    SUCCEEDED = "Succeeded"
-    FAILED = "Failed"
-    CANCELLED = "Cancelled"
 class TaskResponse(BaseModel):
     configuration: Configuration
     created_at: datetime

chunkr_ai/api/task_async.py CHANGED Viewed

@@ -1,20 +1,12 @@
 import asyncio
 from pydantic import BaseModel, PrivateAttr
 from datetime import datetime
-from enum import Enum
 from typing import Optional, Union
 from .task_base import TaskBase
 from .protocol import ChunkrClientProtocol
-from .config import Configuration, OutputResponse
+from .config import Configuration, OutputResponse, Status
 from .misc import prepare_upload_data
-class Status(str, Enum):
-    STARTING = "Starting"
-    PROCESSING = "Processing"
-    SUCCEEDED = "Succeeded"
-    FAILED = "Failed"
-    CANCELLED = "Cancelled"
 class TaskResponseAsync(BaseModel, TaskBase):
     configuration: Configuration
     created_at: datetime

chunkr_ai/models.py CHANGED Viewed

@@ -17,9 +17,11 @@ from .api.config import (
     SegmentProcessing,
     SegmentType,
     SegmentationStrategy,
+    Status,
 )
-from .api.task import TaskResponse, Status
+from .api.task import TaskResponse
+from .api.task_async import TaskResponseAsync
 __all__ = [
     'BoundingBox',
@@ -42,5 +44,6 @@ __all__ = [
     'SegmentType',
     'SegmentationStrategy',
     'Status',
-    'TaskResponse'
+    'TaskResponse',
+    'TaskResponseAsync',
 ]

{chunkr_ai-0.0.9.dist-info → chunkr_ai-0.0.11.dist-info}/METADATA RENAMED Viewed

@@ -1,12 +1,12 @@
 Metadata-Version: 2.2
 Name: chunkr-ai
-Version: 0.0.9
+Version: 0.0.11
 Summary: Python client for Chunkr: open source document intelligence
 Author-email: Ishaan Kapoor <ishaan@lumina.sh>
 Project-URL: Homepage, https://chunkr.ai
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: httpx>=0.24.0
+Requires-Dist: httpx>=0.25.0
 Requires-Dist: pillow>=10.0.0
 Requires-Dist: pydantic>=2.0.0
 Requires-Dist: pytest-asyncio>=0.21.0

chunkr_ai-0.0.11.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,19 @@
+chunkr_ai/__init__.py,sha256=eXygrEhGxxIHXNYIlHF2eied8rGsx2RphgR8Wo4lRyo,110
+chunkr_ai/models.py,sha256=-dbwtTHTcGhH3LXUdVUPkobbPoeFNXRizeAW8BCGSkE,903
+chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+chunkr_ai/api/auth.py,sha256=iSd5Jek2BFaHGw9HY-RrqgwP56BHFU0xbSuJS4fU6AA,425
+chunkr_ai/api/chunkr.py,sha256=0qpV9b1hOpDhA9EuKkXW9X_laUmw5NY3ZYq0cUOTbww,5190
+chunkr_ai/api/chunkr_async.py,sha256=ZkLBrn4cqzu3sqMfS8cfZZgSvpdyQuWZP95lfGxuHx0,4900
+chunkr_ai/api/chunkr_base.py,sha256=IYO0pmoL02GchIggj6_Q5nvtAUoOvYAAvT7VLFU6scY,2506
+chunkr_ai/api/config.py,sha256=y6wZz01ihRJ_5_cK_JklFWn397yll7jfXntd8bBBa5s,4861
+chunkr_ai/api/misc.py,sha256=9vnfrbJ7sFlZqwEIQ4NTMb5rhPOmETT7e1jR-b42PXM,4977
+chunkr_ai/api/protocol.py,sha256=XKS9RmtvBpJItYhPg18qlOCKpaSHdOuQTRSUxAdUz2g,479
+chunkr_ai/api/schema.py,sha256=OeLOhBRXeRBgEImg0Q6O9Z10ojT6aSEVvwnDR8UeENo,4971
+chunkr_ai/api/task.py,sha256=Z5Da_Ijvih5rBz5ry98oAYNcJEDbQhhDWBQ35nHCRK4,5881
+chunkr_ai/api/task_async.py,sha256=o7tXvViIrdcrdclxaGzxrgIv-n-W8-twQ7XsDLXfXhM,3659
+chunkr_ai/api/task_base.py,sha256=Tkk7dhIeB3ic5M9g_b-MVRdNv4XQTvajpaUy8JylQ8A,526
+chunkr_ai-0.0.11.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+chunkr_ai-0.0.11.dist-info/METADATA,sha256=s8UeXDnBDVG_1RN5colcJCGhwrICRy9VMQWmTUKVRJc,4845
+chunkr_ai-0.0.11.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+chunkr_ai-0.0.11.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
+chunkr_ai-0.0.11.dist-info/RECORD,,

chunkr_ai/api/api.py DELETED Viewed

File without changes

chunkr_ai/main.py DELETED Viewed

@@ -1,12 +0,0 @@
-from chunkr_ai.api.chunkr import Chunkr
-from chunkr_ai.models import Configuration
-from chunkr_ai.api.config import SegmentationStrategy, ChunkProcessing
-if __name__ == "__main__":
-    chunkr = Chunkr()
-    task = chunkr.update_task("556b4fe5-e3f7-48dc-9f56-0fb7fbacdb87", Configuration(
-        chunk_processing=ChunkProcessing(
-            target_length=1000
-        )
-    ))
-    print(task)

chunkr_ai-0.0.9.dist-info/RECORD DELETED Viewed

@@ -1,20 +0,0 @@
-chunkr_ai/__init__.py,sha256=eXygrEhGxxIHXNYIlHF2eied8rGsx2RphgR8Wo4lRyo,110
-chunkr_ai/main.py,sha256=_MT1lcnNiXjVW9ZkZYl28SB_f6M9g_IOgZxvhodTzAo,394
-chunkr_ai/models.py,sha256=T8_F-Y1US21ZJVzLIaroqp-Hd0_ZFbdkbEOxr63-PNE,827
-chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-chunkr_ai/api/api.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-chunkr_ai/api/auth.py,sha256=iSd5Jek2BFaHGw9HY-RrqgwP56BHFU0xbSuJS4fU6AA,425
-chunkr_ai/api/base.py,sha256=IYO0pmoL02GchIggj6_Q5nvtAUoOvYAAvT7VLFU6scY,2506
-chunkr_ai/api/chunkr.py,sha256=PmrK37HbK2T1KUPitKnt4wZqIujL61Jo12qW9DEpNMI,5186
-chunkr_ai/api/chunkr_async.py,sha256=2yYyAO9-j2xKQYH0fJb2S6gL26hgbtL4QyqlG9l0QBY,4893
-chunkr_ai/api/config.py,sha256=XIqXZ_8q7U_BEmY5wyIC9mbQGZBw1956EN9yhC4svD0,4235
-chunkr_ai/api/misc.py,sha256=tScsUUcrqeVh_bZv1YlbmjGkQSTDQN8NyKxoNwAG6XA,3792
-chunkr_ai/api/protocol.py,sha256=XKS9RmtvBpJItYhPg18qlOCKpaSHdOuQTRSUxAdUz2g,479
-chunkr_ai/api/task.py,sha256=EB6RK8ms7EaNj57tNJZoNgNMHGWKXFhkQ1WC7gk5ht4,6059
-chunkr_ai/api/task_async.py,sha256=Dd-Fenie0Q6GxXce7OlXvuQ14NQ58F_0b9P7AGKWyYA,3833
-chunkr_ai/api/task_base.py,sha256=Tkk7dhIeB3ic5M9g_b-MVRdNv4XQTvajpaUy8JylQ8A,526
-chunkr_ai-0.0.9.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-chunkr_ai-0.0.9.dist-info/METADATA,sha256=XFGPjuDARO1VYvdcyMOHhxZK1FYjEr0_ySI0Ni6tWMc,4844
-chunkr_ai-0.0.9.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-chunkr_ai-0.0.9.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
-chunkr_ai-0.0.9.dist-info/RECORD,,

chunkr_ai/api/{base.py → chunkr_base.py} RENAMED Viewed

File without changes

{chunkr_ai-0.0.9.dist-info → chunkr_ai-0.0.11.dist-info}/LICENSE RENAMED Viewed

File without changes

{chunkr_ai-0.0.9.dist-info → chunkr_ai-0.0.11.dist-info}/WHEEL RENAMED Viewed

File without changes

{chunkr_ai-0.0.9.dist-info → chunkr_ai-0.0.11.dist-info}/top_level.txt RENAMED Viewed

File without changes

chunkr-ai 0.0.9__py3-none-any.whl → 0.0.11__py3-none-any.whl

Potentially problematic release.

chunkr-ai 0.0.9-py3-none-any.whl → 0.0.11-py3-none-any.whl