chunkr-ai 0.0.9__py3-none-any.whl → 0.0.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
chunkr_ai/api/chunkr.py CHANGED
@@ -1,4 +1,4 @@
1
- from .base import ChunkrBase
1
+ from .chunkr_base import ChunkrBase
2
2
  from .config import Configuration
3
3
  from .task import TaskResponse
4
4
  from pathlib import Path
@@ -163,5 +163,3 @@ class Chunkr(ChunkrBase):
163
163
  headers=self._headers()
164
164
  )
165
165
  r.raise_for_status()
166
-
167
-
@@ -1,4 +1,4 @@
1
- from .base import ChunkrBase
1
+ from .chunkr_base import ChunkrBase
2
2
  from .task import TaskResponse
3
3
  from .config import Configuration
4
4
  import httpx
chunkr_ai/api/config.py CHANGED
@@ -1,6 +1,7 @@
1
1
  from pydantic import BaseModel, Field, model_validator, ConfigDict
2
2
  from enum import Enum
3
- from typing import Optional, List, Dict
3
+ from typing import Optional, List, Dict, Union, Type
4
+ from .schema import from_pydantic
4
5
 
5
6
  class GenerationStrategy(str, Enum):
6
7
  LLM = "LLM"
@@ -40,7 +41,6 @@ class ChunkProcessing(BaseModel):
40
41
 
41
42
  class Property(BaseModel):
42
43
  name: str
43
- title: Optional[str] = None
44
44
  prop_type: str
45
45
  description: Optional[str] = None
46
46
  default: Optional[str] = None
@@ -115,7 +115,7 @@ class Configuration(BaseModel):
115
115
  chunk_processing: Optional[ChunkProcessing] = Field(default=None)
116
116
  expires_in: Optional[int] = Field(default=None)
117
117
  high_resolution: Optional[bool] = Field(default=None)
118
- json_schema: Optional[JsonSchema] = Field(default=None)
118
+ json_schema: Optional[Union[JsonSchema, Type[BaseModel], BaseModel]] = Field(default=None)
119
119
  model: Optional[Model] = Field(default=None)
120
120
  ocr_strategy: Optional[OcrStrategy] = Field(default=None)
121
121
  segment_processing: Optional[SegmentProcessing] = Field(default=None)
@@ -129,3 +129,17 @@ class Configuration(BaseModel):
129
129
  values["chunk_processing"] = values.get("chunk_processing", {}) or {}
130
130
  values["chunk_processing"]["target_length"] = target_length
131
131
  return values
132
+
133
@model_validator(mode='after')
def convert_json_schema(self) -> 'Configuration':
    """Normalize ``json_schema`` to a ``JsonSchema`` once the model is built.

    ``json_schema`` may be given as a ``JsonSchema``, as a pydantic model
    class, or as a pydantic model instance. A ``JsonSchema`` (or ``None``)
    is left untouched; a model class or instance is converted with
    ``from_pydantic`` and the result stored back on ``self``.

    Returns:
        The same ``Configuration`` instance, as required by an
        ``mode='after'`` validator.
    """
    candidate = self.json_schema
    if candidate is None or isinstance(candidate, JsonSchema):
        return self

    # A model *class* must be tested with issubclass() on the class itself:
    # its ``__class__`` is the metaclass, which is never a BaseModel
    # subclass, so checking ``__class__`` would silently skip model classes.
    is_model_class = isinstance(candidate, type) and issubclass(candidate, BaseModel)
    if is_model_class or isinstance(candidate, BaseModel):
        self.json_schema = JsonSchema(**from_pydantic(candidate))
    return self
139
+
140
+ class Status(str, Enum):
141
+ STARTING = "Starting"
142
+ PROCESSING = "Processing"
143
+ SUCCEEDED = "Succeeded"
144
+ FAILED = "Failed"
145
+ CANCELLED = "Cancelled"
chunkr_ai/api/misc.py CHANGED
@@ -1,11 +1,10 @@
1
+ from .config import Configuration
1
2
  import io
2
3
  import json
3
4
  from pathlib import Path
4
5
  from PIL import Image
5
6
  import requests
6
7
  from typing import Union, Tuple, BinaryIO, Optional
7
- from .config import Configuration
8
-
9
8
 
10
9
  def prepare_file(
11
10
  file: Union[str, Path, BinaryIO, Image.Image]
@@ -15,8 +14,31 @@ def prepare_file(
15
14
  if isinstance(file, str) and (file.startswith('http://') or file.startswith('https://')):
16
15
  response = requests.get(file)
17
16
  response.raise_for_status()
17
+
18
+ # Try to get filename from Content-Disposition header first
19
+ filename = None
20
+ content_disposition = response.headers.get('Content-Disposition')
21
+ if content_disposition and 'filename=' in content_disposition:
22
+ filename = content_disposition.split('filename=')[-1].strip('"\'')
23
+
24
+ # If no Content-Disposition, try to get clean filename from URL path
25
+ if not filename:
26
+ from urllib.parse import urlparse, unquote
27
+ parsed_url = urlparse(file)
28
+ path = unquote(parsed_url.path)
29
+ filename = Path(path).name if path else None
30
+
31
+ # Fallback to default name if we couldn't extract one
32
+ filename = filename or 'downloaded_file'
33
+
34
+ # Sanitize filename: remove invalid characters and limit length
35
+ import re
36
+ filename = re.sub(r'[<>:"/\\|?*%]', '_', filename) # Replace invalid chars with underscore
37
+ filename = re.sub(r'\s+', '_', filename) # Replace whitespace with underscore
38
+ filename = filename.strip('._') # Remove leading/trailing dots and underscores
39
+ filename = filename[:255] # Limit length to 255 characters
40
+
18
41
  file_obj = io.BytesIO(response.content)
19
- filename = Path(file.split('/')[-1]).name or 'downloaded_file'
20
42
  return filename, file_obj
21
43
 
22
44
  # Handle base64 strings
@@ -0,0 +1,128 @@
1
+ from pydantic import BaseModel
2
+ from typing import Optional, List, Union, Type
3
+ import json
4
+
5
+ class Property(BaseModel):
6
+ name: str
7
+ prop_type: str
8
+ description: Optional[str] = None
9
+ default: Optional[str] = None
10
+
11
+ class JsonSchema(BaseModel):
12
+ title: str
13
+ properties: List[Property]
14
+
15
def from_pydantic(pydantic: Union[BaseModel, Type[BaseModel]], current_depth: int = 0) -> dict:
    """Convert a Pydantic model to a Chunkr JSON schema dict.

    The model's own JSON schema (``model_json_schema()``) is flattened into a
    list of ``Property`` entries, one per top-level field:

    * arrays become ``prop_type='list'``; the schema of their items, when it
      is an object, is appended to the description as JSON;
    * objects (inline or via ``$ref``) become ``prop_type='object'``; their
      nested schema is appended to the description as JSON, because
      ``Property`` has no field that could carry it;
    * a ``$ref`` that resolves to an enum is reported with the enum's own
      primitive type, with the allowed values listed in the description;
    * everything else is treated as a primitive, defaulting to ``'string'``
      when the field schema has no ``type`` key.

    Args:
        pydantic: A pydantic model class or an instance of one.
        current_depth: Starting depth used when expanding nested objects;
            expansion stops once the depth reaches ``MAX_DEPTH``.

    Returns:
        ``JsonSchema.model_dump(mode="json", exclude_none=True)`` for the
        converted schema, i.e. a dict with ``title`` and ``properties``.
    """
    MAX_DEPTH = 5
    model = pydantic if isinstance(pydantic, type) else pydantic.__class__
    schema = model.model_json_schema()
    definitions = schema.get('$defs', {})
    properties = []

    def resolve_ref(ref: str, defs: dict) -> dict:
        """Resolve a local ``#/$defs/...`` reference; unknown refs give ``{}``."""
        prefix = '#/$defs/'
        if not ref.startswith(prefix):
            return {}
        return defs.get(ref[len(prefix):], {})

    def get_enum_description(details: dict) -> str:
        """Return the field description, extended with enum values if any."""
        description = details.get('description', '')

        # Enum values may sit on the field itself or behind a $ref.
        enum_values = None
        if 'enum' in details:
            enum_values = details['enum']
        elif '$ref' in details:
            enum_values = resolve_ref(details['$ref'], definitions).get('enum')

        if enum_values is None:
            return description
        enum_str = '\nAllowed values:\n' + '\n'.join(f'- {val}' for val in enum_values)
        return f"{description}{enum_str}"

    def get_nested_schema(field_schema: dict, depth: int) -> dict:
        """Expand an object schema into ``{name: {type, description, ...}}``."""
        # The depth cap keeps self-referencing models from recursing forever.
        if depth >= MAX_DEPTH:
            return {}

        if '$ref' in field_schema:
            field_schema = resolve_ref(field_schema['$ref'], definitions)

        nested_props = {}
        if field_schema.get('type') != 'object':
            return nested_props

        for child_name, child in field_schema.get('properties', {}).items():
            if child.get('type') == 'object' or '$ref' in child:
                target = child
                if '$ref' in child:
                    target = resolve_ref(child['$ref'], definitions)
                nested_props[child_name] = {
                    'type': 'object',
                    'description': get_enum_description(child),
                    'properties': get_nested_schema(target, depth + 1),
                }
            else:
                nested_props[child_name] = {
                    'type': child.get('type', 'string'),
                    'description': get_enum_description(child),
                }
        return nested_props

    def default_of(details: dict) -> Optional[str]:
        """Return the field default as a string, or None when absent."""
        value = details.get('default')
        return str(value) if value is not None else None

    for name, details in schema.get('properties', {}).items():
        description = get_enum_description(details)
        default = None

        if details.get('type') == 'array':
            prop_type = 'list'
            items = details.get('items', {})
            if '$ref' in items:
                items = resolve_ref(items['$ref'], definitions)
            item_schema = get_nested_schema(items, current_depth)
            if item_schema:
                description = f"{description}\nList items schema:\n{json.dumps(item_schema, indent=2)}"

        elif details.get('type') == 'object' or '$ref' in details:
            target = details
            if '$ref' in details:
                target = resolve_ref(details['$ref'], definitions)

            if 'enum' in target:
                # A referenced enum is a primitive value, not an object.
                prop_type = target.get('type', 'string')
                default = default_of(details)
            else:
                prop_type = 'object'
                nested_schema = get_nested_schema(target, current_depth)
                # Property declares no `properties` field, so passing the
                # nested schema as a keyword would be dropped; embed it in
                # the description, as is done for list items.
                if nested_schema:
                    description = f"{description}\nObject schema:\n{json.dumps(nested_schema, indent=2)}"

        else:
            prop_type = details.get('type', 'string')
            default = default_of(details)

        properties.append(
            Property(
                name=name,
                prop_type=prop_type,
                description=description,
                default=default,
            )
        )

    json_schema = JsonSchema(
        title=schema.get('title', model.__name__),
        properties=properties,
    )

    return json_schema.model_dump(mode="json", exclude_none=True)
chunkr_ai/api/task.py CHANGED
@@ -1,20 +1,12 @@
1
1
  from .protocol import ChunkrClientProtocol
2
- from .config import Configuration, OutputResponse
2
+ from .config import Configuration, OutputResponse, Status
3
3
  from .misc import prepare_upload_data
4
4
  import asyncio
5
5
  from datetime import datetime
6
- from enum import Enum
7
6
  from pydantic import BaseModel, PrivateAttr
8
7
  import time
9
8
  from typing import Optional, Union
10
9
 
11
- class Status(str, Enum):
12
- STARTING = "Starting"
13
- PROCESSING = "Processing"
14
- SUCCEEDED = "Succeeded"
15
- FAILED = "Failed"
16
- CANCELLED = "Cancelled"
17
-
18
10
  class TaskResponse(BaseModel):
19
11
  configuration: Configuration
20
12
  created_at: datetime
@@ -1,20 +1,12 @@
1
1
  import asyncio
2
2
  from pydantic import BaseModel, PrivateAttr
3
3
  from datetime import datetime
4
- from enum import Enum
5
4
  from typing import Optional, Union
6
5
  from .task_base import TaskBase
7
6
  from .protocol import ChunkrClientProtocol
8
- from .config import Configuration, OutputResponse
7
+ from .config import Configuration, OutputResponse, Status
9
8
  from .misc import prepare_upload_data
10
9
 
11
- class Status(str, Enum):
12
- STARTING = "Starting"
13
- PROCESSING = "Processing"
14
- SUCCEEDED = "Succeeded"
15
- FAILED = "Failed"
16
- CANCELLED = "Cancelled"
17
-
18
10
  class TaskResponseAsync(BaseModel, TaskBase):
19
11
  configuration: Configuration
20
12
  created_at: datetime
chunkr_ai/models.py CHANGED
@@ -17,9 +17,11 @@ from .api.config import (
17
17
  SegmentProcessing,
18
18
  SegmentType,
19
19
  SegmentationStrategy,
20
+ Status,
20
21
  )
21
22
 
22
- from .api.task import TaskResponse, Status
23
+ from .api.task import TaskResponse
24
+ from .api.task_async import TaskResponseAsync
23
25
 
24
26
  __all__ = [
25
27
  'BoundingBox',
@@ -42,5 +44,6 @@ __all__ = [
42
44
  'SegmentType',
43
45
  'SegmentationStrategy',
44
46
  'Status',
45
- 'TaskResponse'
47
+ 'TaskResponse',
48
+ 'TaskResponseAsync',
46
49
  ]
@@ -1,12 +1,12 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: chunkr-ai
3
- Version: 0.0.9
3
+ Version: 0.0.11
4
4
  Summary: Python client for Chunkr: open source document intelligence
5
5
  Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
6
  Project-URL: Homepage, https://chunkr.ai
7
7
  Description-Content-Type: text/markdown
8
8
  License-File: LICENSE
9
- Requires-Dist: httpx>=0.24.0
9
+ Requires-Dist: httpx>=0.25.0
10
10
  Requires-Dist: pillow>=10.0.0
11
11
  Requires-Dist: pydantic>=2.0.0
12
12
  Requires-Dist: pytest-asyncio>=0.21.0
@@ -0,0 +1,19 @@
1
+ chunkr_ai/__init__.py,sha256=eXygrEhGxxIHXNYIlHF2eied8rGsx2RphgR8Wo4lRyo,110
2
+ chunkr_ai/models.py,sha256=-dbwtTHTcGhH3LXUdVUPkobbPoeFNXRizeAW8BCGSkE,903
3
+ chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ chunkr_ai/api/auth.py,sha256=iSd5Jek2BFaHGw9HY-RrqgwP56BHFU0xbSuJS4fU6AA,425
5
+ chunkr_ai/api/chunkr.py,sha256=0qpV9b1hOpDhA9EuKkXW9X_laUmw5NY3ZYq0cUOTbww,5190
6
+ chunkr_ai/api/chunkr_async.py,sha256=ZkLBrn4cqzu3sqMfS8cfZZgSvpdyQuWZP95lfGxuHx0,4900
7
+ chunkr_ai/api/chunkr_base.py,sha256=IYO0pmoL02GchIggj6_Q5nvtAUoOvYAAvT7VLFU6scY,2506
8
+ chunkr_ai/api/config.py,sha256=y6wZz01ihRJ_5_cK_JklFWn397yll7jfXntd8bBBa5s,4861
9
+ chunkr_ai/api/misc.py,sha256=9vnfrbJ7sFlZqwEIQ4NTMb5rhPOmETT7e1jR-b42PXM,4977
10
+ chunkr_ai/api/protocol.py,sha256=XKS9RmtvBpJItYhPg18qlOCKpaSHdOuQTRSUxAdUz2g,479
11
+ chunkr_ai/api/schema.py,sha256=OeLOhBRXeRBgEImg0Q6O9Z10ojT6aSEVvwnDR8UeENo,4971
12
+ chunkr_ai/api/task.py,sha256=Z5Da_Ijvih5rBz5ry98oAYNcJEDbQhhDWBQ35nHCRK4,5881
13
+ chunkr_ai/api/task_async.py,sha256=o7tXvViIrdcrdclxaGzxrgIv-n-W8-twQ7XsDLXfXhM,3659
14
+ chunkr_ai/api/task_base.py,sha256=Tkk7dhIeB3ic5M9g_b-MVRdNv4XQTvajpaUy8JylQ8A,526
15
+ chunkr_ai-0.0.11.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
+ chunkr_ai-0.0.11.dist-info/METADATA,sha256=s8UeXDnBDVG_1RN5colcJCGhwrICRy9VMQWmTUKVRJc,4845
17
+ chunkr_ai-0.0.11.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
18
+ chunkr_ai-0.0.11.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
19
+ chunkr_ai-0.0.11.dist-info/RECORD,,
chunkr_ai/api/api.py DELETED
File without changes
chunkr_ai/main.py DELETED
@@ -1,12 +0,0 @@
1
- from chunkr_ai.api.chunkr import Chunkr
2
- from chunkr_ai.models import Configuration
3
- from chunkr_ai.api.config import SegmentationStrategy, ChunkProcessing
4
-
5
- if __name__ == "__main__":
6
- chunkr = Chunkr()
7
- task = chunkr.update_task("556b4fe5-e3f7-48dc-9f56-0fb7fbacdb87", Configuration(
8
- chunk_processing=ChunkProcessing(
9
- target_length=1000
10
- )
11
- ))
12
- print(task)
@@ -1,20 +0,0 @@
1
- chunkr_ai/__init__.py,sha256=eXygrEhGxxIHXNYIlHF2eied8rGsx2RphgR8Wo4lRyo,110
2
- chunkr_ai/main.py,sha256=_MT1lcnNiXjVW9ZkZYl28SB_f6M9g_IOgZxvhodTzAo,394
3
- chunkr_ai/models.py,sha256=T8_F-Y1US21ZJVzLIaroqp-Hd0_ZFbdkbEOxr63-PNE,827
4
- chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- chunkr_ai/api/api.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- chunkr_ai/api/auth.py,sha256=iSd5Jek2BFaHGw9HY-RrqgwP56BHFU0xbSuJS4fU6AA,425
7
- chunkr_ai/api/base.py,sha256=IYO0pmoL02GchIggj6_Q5nvtAUoOvYAAvT7VLFU6scY,2506
8
- chunkr_ai/api/chunkr.py,sha256=PmrK37HbK2T1KUPitKnt4wZqIujL61Jo12qW9DEpNMI,5186
9
- chunkr_ai/api/chunkr_async.py,sha256=2yYyAO9-j2xKQYH0fJb2S6gL26hgbtL4QyqlG9l0QBY,4893
10
- chunkr_ai/api/config.py,sha256=XIqXZ_8q7U_BEmY5wyIC9mbQGZBw1956EN9yhC4svD0,4235
11
- chunkr_ai/api/misc.py,sha256=tScsUUcrqeVh_bZv1YlbmjGkQSTDQN8NyKxoNwAG6XA,3792
12
- chunkr_ai/api/protocol.py,sha256=XKS9RmtvBpJItYhPg18qlOCKpaSHdOuQTRSUxAdUz2g,479
13
- chunkr_ai/api/task.py,sha256=EB6RK8ms7EaNj57tNJZoNgNMHGWKXFhkQ1WC7gk5ht4,6059
14
- chunkr_ai/api/task_async.py,sha256=Dd-Fenie0Q6GxXce7OlXvuQ14NQ58F_0b9P7AGKWyYA,3833
15
- chunkr_ai/api/task_base.py,sha256=Tkk7dhIeB3ic5M9g_b-MVRdNv4XQTvajpaUy8JylQ8A,526
16
- chunkr_ai-0.0.9.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
- chunkr_ai-0.0.9.dist-info/METADATA,sha256=XFGPjuDARO1VYvdcyMOHhxZK1FYjEr0_ySI0Ni6tWMc,4844
18
- chunkr_ai-0.0.9.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
19
- chunkr_ai-0.0.9.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
20
- chunkr_ai-0.0.9.dist-info/RECORD,,
File without changes