chunkr-ai 0.0.9__tar.gz → 0.0.11__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
Files changed (27) hide show
  1. {chunkr_ai-0.0.9/src/chunkr_ai.egg-info → chunkr_ai-0.0.11}/PKG-INFO +2 -2
  2. {chunkr_ai-0.0.9 → chunkr_ai-0.0.11}/pyproject.toml +2 -2
  3. {chunkr_ai-0.0.9 → chunkr_ai-0.0.11}/src/chunkr_ai/api/chunkr.py +1 -3
  4. {chunkr_ai-0.0.9 → chunkr_ai-0.0.11}/src/chunkr_ai/api/chunkr_async.py +1 -1
  5. {chunkr_ai-0.0.9 → chunkr_ai-0.0.11}/src/chunkr_ai/api/config.py +17 -3
  6. {chunkr_ai-0.0.9 → chunkr_ai-0.0.11}/src/chunkr_ai/api/misc.py +25 -3
  7. chunkr_ai-0.0.11/src/chunkr_ai/api/schema.py +128 -0
  8. {chunkr_ai-0.0.9 → chunkr_ai-0.0.11}/src/chunkr_ai/api/task.py +1 -9
  9. {chunkr_ai-0.0.9 → chunkr_ai-0.0.11}/src/chunkr_ai/api/task_async.py +1 -9
  10. {chunkr_ai-0.0.9 → chunkr_ai-0.0.11}/src/chunkr_ai/models.py +5 -2
  11. {chunkr_ai-0.0.9 → chunkr_ai-0.0.11/src/chunkr_ai.egg-info}/PKG-INFO +2 -2
  12. {chunkr_ai-0.0.9 → chunkr_ai-0.0.11}/src/chunkr_ai.egg-info/SOURCES.txt +2 -3
  13. {chunkr_ai-0.0.9 → chunkr_ai-0.0.11}/src/chunkr_ai.egg-info/requires.txt +1 -1
  14. chunkr_ai-0.0.9/src/chunkr_ai/api/api.py +0 -0
  15. chunkr_ai-0.0.9/src/chunkr_ai/main.py +0 -12
  16. {chunkr_ai-0.0.9 → chunkr_ai-0.0.11}/LICENSE +0 -0
  17. {chunkr_ai-0.0.9 → chunkr_ai-0.0.11}/README.md +0 -0
  18. {chunkr_ai-0.0.9 → chunkr_ai-0.0.11}/setup.cfg +0 -0
  19. {chunkr_ai-0.0.9 → chunkr_ai-0.0.11}/src/chunkr_ai/__init__.py +0 -0
  20. {chunkr_ai-0.0.9 → chunkr_ai-0.0.11}/src/chunkr_ai/api/__init__.py +0 -0
  21. {chunkr_ai-0.0.9 → chunkr_ai-0.0.11}/src/chunkr_ai/api/auth.py +0 -0
  22. /chunkr_ai-0.0.9/src/chunkr_ai/api/base.py → /chunkr_ai-0.0.11/src/chunkr_ai/api/chunkr_base.py +0 -0
  23. {chunkr_ai-0.0.9 → chunkr_ai-0.0.11}/src/chunkr_ai/api/protocol.py +0 -0
  24. {chunkr_ai-0.0.9 → chunkr_ai-0.0.11}/src/chunkr_ai/api/task_base.py +0 -0
  25. {chunkr_ai-0.0.9 → chunkr_ai-0.0.11}/src/chunkr_ai.egg-info/dependency_links.txt +0 -0
  26. {chunkr_ai-0.0.9 → chunkr_ai-0.0.11}/src/chunkr_ai.egg-info/top_level.txt +0 -0
  27. {chunkr_ai-0.0.9 → chunkr_ai-0.0.11}/tests/test_chunkr.py +0 -0
@@ -1,12 +1,12 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: chunkr-ai
3
- Version: 0.0.9
3
+ Version: 0.0.11
4
4
  Summary: Python client for Chunkr: open source document intelligence
5
5
  Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
6
  Project-URL: Homepage, https://chunkr.ai
7
7
  Description-Content-Type: text/markdown
8
8
  License-File: LICENSE
9
- Requires-Dist: httpx>=0.24.0
9
+ Requires-Dist: httpx>=0.25.0
10
10
  Requires-Dist: pillow>=10.0.0
11
11
  Requires-Dist: pydantic>=2.0.0
12
12
  Requires-Dist: pytest-asyncio>=0.21.0
@@ -4,14 +4,14 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "chunkr-ai"
7
- version = "0.0.9"
7
+ version = "0.0.11"
8
8
  authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
9
9
  description = "Python client for Chunkr: open source document intelligence"
10
10
  readme = "README.md"
11
11
  license = {"file" = "LICENSE"}
12
12
  urls = {Homepage = "https://chunkr.ai"}
13
13
  dependencies = [
14
- "httpx>=0.24.0",
14
+ "httpx>=0.25.0",
15
15
  "pillow>=10.0.0",
16
16
  "pydantic>=2.0.0",
17
17
  "pytest-asyncio>=0.21.0",
@@ -1,4 +1,4 @@
1
- from .base import ChunkrBase
1
+ from .chunkr_base import ChunkrBase
2
2
  from .config import Configuration
3
3
  from .task import TaskResponse
4
4
  from pathlib import Path
@@ -163,5 +163,3 @@ class Chunkr(ChunkrBase):
163
163
  headers=self._headers()
164
164
  )
165
165
  r.raise_for_status()
166
-
167
-
@@ -1,4 +1,4 @@
1
- from .base import ChunkrBase
1
+ from .chunkr_base import ChunkrBase
2
2
  from .task import TaskResponse
3
3
  from .config import Configuration
4
4
  import httpx
@@ -1,6 +1,7 @@
1
1
  from pydantic import BaseModel, Field, model_validator, ConfigDict
2
2
  from enum import Enum
3
- from typing import Optional, List, Dict
3
+ from typing import Optional, List, Dict, Union, Type
4
+ from .schema import from_pydantic
4
5
 
5
6
  class GenerationStrategy(str, Enum):
6
7
  LLM = "LLM"
@@ -40,7 +41,6 @@ class ChunkProcessing(BaseModel):
40
41
 
41
42
  class Property(BaseModel):
42
43
  name: str
43
- title: Optional[str] = None
44
44
  prop_type: str
45
45
  description: Optional[str] = None
46
46
  default: Optional[str] = None
@@ -115,7 +115,7 @@ class Configuration(BaseModel):
115
115
  chunk_processing: Optional[ChunkProcessing] = Field(default=None)
116
116
  expires_in: Optional[int] = Field(default=None)
117
117
  high_resolution: Optional[bool] = Field(default=None)
118
- json_schema: Optional[JsonSchema] = Field(default=None)
118
+ json_schema: Optional[Union[JsonSchema, Type[BaseModel], BaseModel]] = Field(default=None)
119
119
  model: Optional[Model] = Field(default=None)
120
120
  ocr_strategy: Optional[OcrStrategy] = Field(default=None)
121
121
  segment_processing: Optional[SegmentProcessing] = Field(default=None)
@@ -129,3 +129,17 @@ class Configuration(BaseModel):
129
129
  values["chunk_processing"] = values.get("chunk_processing", {}) or {}
130
130
  values["chunk_processing"]["target_length"] = target_length
131
131
  return values
132
+
133
+ @model_validator(mode='after')
134
+ def convert_json_schema(self) -> 'Configuration':
135
+ if self.json_schema is not None and not isinstance(self.json_schema, JsonSchema):
136
+ if isinstance(self.json_schema, (BaseModel, type)) and issubclass(getattr(self.json_schema, '__class__', type), BaseModel):
137
+ self.json_schema = JsonSchema(**from_pydantic(self.json_schema))
138
+ return self
139
+
140
+ class Status(str, Enum):
141
+ STARTING = "Starting"
142
+ PROCESSING = "Processing"
143
+ SUCCEEDED = "Succeeded"
144
+ FAILED = "Failed"
145
+ CANCELLED = "Cancelled"
@@ -1,11 +1,10 @@
1
+ from .config import Configuration
1
2
  import io
2
3
  import json
3
4
  from pathlib import Path
4
5
  from PIL import Image
5
6
  import requests
6
7
  from typing import Union, Tuple, BinaryIO, Optional
7
- from .config import Configuration
8
-
9
8
 
10
9
  def prepare_file(
11
10
  file: Union[str, Path, BinaryIO, Image.Image]
@@ -15,8 +14,31 @@ def prepare_file(
15
14
  if isinstance(file, str) and (file.startswith('http://') or file.startswith('https://')):
16
15
  response = requests.get(file)
17
16
  response.raise_for_status()
17
+
18
+ # Try to get filename from Content-Disposition header first
19
+ filename = None
20
+ content_disposition = response.headers.get('Content-Disposition')
21
+ if content_disposition and 'filename=' in content_disposition:
22
+ filename = content_disposition.split('filename=')[-1].strip('"\'')
23
+
24
+ # If no Content-Disposition, try to get clean filename from URL path
25
+ if not filename:
26
+ from urllib.parse import urlparse, unquote
27
+ parsed_url = urlparse(file)
28
+ path = unquote(parsed_url.path)
29
+ filename = Path(path).name if path else None
30
+
31
+ # Fallback to default name if we couldn't extract one
32
+ filename = filename or 'downloaded_file'
33
+
34
+ # Sanitize filename: remove invalid characters and limit length
35
+ import re
36
+ filename = re.sub(r'[<>:"/\\|?*%]', '_', filename) # Replace invalid chars with underscore
37
+ filename = re.sub(r'\s+', '_', filename) # Replace whitespace with underscore
38
+ filename = filename.strip('._') # Remove leading/trailing dots and underscores
39
+ filename = filename[:255] # Limit length to 255 characters
40
+
18
41
  file_obj = io.BytesIO(response.content)
19
- filename = Path(file.split('/')[-1]).name or 'downloaded_file'
20
42
  return filename, file_obj
21
43
 
22
44
  # Handle base64 strings
@@ -0,0 +1,128 @@
1
+ from pydantic import BaseModel
2
+ from typing import Optional, List, Union, Type
3
+ import json
4
+
5
+ class Property(BaseModel):
6
+ name: str
7
+ prop_type: str
8
+ description: Optional[str] = None
9
+ default: Optional[str] = None
10
+
11
+ class JsonSchema(BaseModel):
12
+ title: str
13
+ properties: List[Property]
14
+
15
+ def from_pydantic(pydantic: Union[BaseModel, Type[BaseModel]], current_depth: int = 0) -> dict:
16
+ """Convert a Pydantic model to a Chunk json schema."""
17
+ MAX_DEPTH = 5
18
+ model = pydantic if isinstance(pydantic, type) else pydantic.__class__
19
+ schema = model.model_json_schema()
20
+ properties = []
21
+
22
+ def get_enum_description(details: dict) -> str:
23
+ """Get description including enum values if they exist"""
24
+ description = details.get('description', '')
25
+
26
+ # First check if this is a direct enum
27
+ if 'enum' in details:
28
+ enum_values = details['enum']
29
+ enum_str = '\nAllowed values:\n' + '\n'.join(f'- {val}' for val in enum_values)
30
+ return f"{description}{enum_str}"
31
+
32
+ # Then check if it's a reference to an enum
33
+ if '$ref' in details:
34
+ ref_schema = resolve_ref(details['$ref'], schema.get('$defs', {}))
35
+ if 'enum' in ref_schema:
36
+ enum_values = ref_schema['enum']
37
+ enum_str = '\nAllowed values:\n' + '\n'.join(f'- {val}' for val in enum_values)
38
+ return f"{description}{enum_str}"
39
+
40
+ return description
41
+
42
+ def resolve_ref(ref: str, definitions: dict) -> dict:
43
+ """Resolve a $ref reference to its actual schema"""
44
+ if not ref.startswith('#/$defs/'):
45
+ return {}
46
+ ref_name = ref[len('#/$defs/'):]
47
+ return definitions.get(ref_name, {})
48
+
49
+ def get_nested_schema(field_schema: dict, depth: int) -> dict:
50
+ if depth >= MAX_DEPTH:
51
+ return {}
52
+
53
+ # If there's a $ref, resolve it first
54
+ if '$ref' in field_schema:
55
+ field_schema = resolve_ref(field_schema['$ref'], schema.get('$defs', {}))
56
+
57
+ nested_props = {}
58
+ if field_schema.get('type') == 'object':
59
+ for name, details in field_schema.get('properties', {}).items():
60
+ if details.get('type') == 'object' or '$ref' in details:
61
+ ref_schema = details
62
+ if '$ref' in details:
63
+ ref_schema = resolve_ref(details['$ref'], schema.get('$defs', {}))
64
+ nested_schema = get_nested_schema(ref_schema, depth + 1)
65
+ nested_props[name] = {
66
+ 'type': 'object',
67
+ 'description': get_enum_description(details),
68
+ 'properties': nested_schema
69
+ }
70
+ else:
71
+ nested_props[name] = {
72
+ 'type': details.get('type', 'string'),
73
+ 'description': get_enum_description(details)
74
+ }
75
+ return nested_props
76
+
77
+ for name, details in schema.get('properties', {}).items():
78
+ # Handle arrays
79
+ if details.get('type') == 'array':
80
+ items = details.get('items', {})
81
+ if '$ref' in items:
82
+ items = resolve_ref(items['$ref'], schema.get('$defs', {}))
83
+
84
+ # Get nested schema for array items
85
+ item_schema = get_nested_schema(items, current_depth)
86
+ description = get_enum_description(details)
87
+
88
+ if item_schema:
89
+ description = f"{description}\nList items schema:\n{json.dumps(item_schema, indent=2)}"
90
+
91
+ prop = Property(
92
+ name=name,
93
+ prop_type='list',
94
+ description=description
95
+ )
96
+ # Handle objects and references
97
+ elif details.get('type') == 'object' or '$ref' in details:
98
+ prop_type = 'object'
99
+ ref_schema = details
100
+ if '$ref' in details:
101
+ ref_schema = resolve_ref(details['$ref'], schema.get('$defs', {}))
102
+
103
+ nested_schema = get_nested_schema(ref_schema, current_depth)
104
+
105
+ prop = Property(
106
+ name=name,
107
+ prop_type=prop_type,
108
+ description=get_enum_description(details),
109
+ properties=nested_schema
110
+ )
111
+
112
+ # Handle primitive types
113
+ else:
114
+ prop = Property(
115
+ name=name,
116
+ prop_type=details.get('type', 'string'),
117
+ description=get_enum_description(details),
118
+ default=str(details.get('default')) if details.get('default') is not None else None
119
+ )
120
+
121
+ properties.append(prop)
122
+
123
+ json_schema = JsonSchema(
124
+ title=schema.get('title', model.__name__),
125
+ properties=properties
126
+ )
127
+
128
+ return json_schema.model_dump(mode="json", exclude_none=True)
@@ -1,20 +1,12 @@
1
1
  from .protocol import ChunkrClientProtocol
2
- from .config import Configuration, OutputResponse
2
+ from .config import Configuration, OutputResponse, Status
3
3
  from .misc import prepare_upload_data
4
4
  import asyncio
5
5
  from datetime import datetime
6
- from enum import Enum
7
6
  from pydantic import BaseModel, PrivateAttr
8
7
  import time
9
8
  from typing import Optional, Union
10
9
 
11
- class Status(str, Enum):
12
- STARTING = "Starting"
13
- PROCESSING = "Processing"
14
- SUCCEEDED = "Succeeded"
15
- FAILED = "Failed"
16
- CANCELLED = "Cancelled"
17
-
18
10
  class TaskResponse(BaseModel):
19
11
  configuration: Configuration
20
12
  created_at: datetime
@@ -1,20 +1,12 @@
1
1
  import asyncio
2
2
  from pydantic import BaseModel, PrivateAttr
3
3
  from datetime import datetime
4
- from enum import Enum
5
4
  from typing import Optional, Union
6
5
  from .task_base import TaskBase
7
6
  from .protocol import ChunkrClientProtocol
8
- from .config import Configuration, OutputResponse
7
+ from .config import Configuration, OutputResponse, Status
9
8
  from .misc import prepare_upload_data
10
9
 
11
- class Status(str, Enum):
12
- STARTING = "Starting"
13
- PROCESSING = "Processing"
14
- SUCCEEDED = "Succeeded"
15
- FAILED = "Failed"
16
- CANCELLED = "Cancelled"
17
-
18
10
  class TaskResponseAsync(BaseModel, TaskBase):
19
11
  configuration: Configuration
20
12
  created_at: datetime
@@ -17,9 +17,11 @@ from .api.config import (
17
17
  SegmentProcessing,
18
18
  SegmentType,
19
19
  SegmentationStrategy,
20
+ Status,
20
21
  )
21
22
 
22
- from .api.task import TaskResponse, Status
23
+ from .api.task import TaskResponse
24
+ from .api.task_async import TaskResponseAsync
23
25
 
24
26
  __all__ = [
25
27
  'BoundingBox',
@@ -42,5 +44,6 @@ __all__ = [
42
44
  'SegmentType',
43
45
  'SegmentationStrategy',
44
46
  'Status',
45
- 'TaskResponse'
47
+ 'TaskResponse',
48
+ 'TaskResponseAsync',
46
49
  ]
@@ -1,12 +1,12 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: chunkr-ai
3
- Version: 0.0.9
3
+ Version: 0.0.11
4
4
  Summary: Python client for Chunkr: open source document intelligence
5
5
  Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
6
  Project-URL: Homepage, https://chunkr.ai
7
7
  Description-Content-Type: text/markdown
8
8
  License-File: LICENSE
9
- Requires-Dist: httpx>=0.24.0
9
+ Requires-Dist: httpx>=0.25.0
10
10
  Requires-Dist: pillow>=10.0.0
11
11
  Requires-Dist: pydantic>=2.0.0
12
12
  Requires-Dist: pytest-asyncio>=0.21.0
@@ -2,7 +2,6 @@ LICENSE
2
2
  README.md
3
3
  pyproject.toml
4
4
  src/chunkr_ai/__init__.py
5
- src/chunkr_ai/main.py
6
5
  src/chunkr_ai/models.py
7
6
  src/chunkr_ai.egg-info/PKG-INFO
8
7
  src/chunkr_ai.egg-info/SOURCES.txt
@@ -10,14 +9,14 @@ src/chunkr_ai.egg-info/dependency_links.txt
10
9
  src/chunkr_ai.egg-info/requires.txt
11
10
  src/chunkr_ai.egg-info/top_level.txt
12
11
  src/chunkr_ai/api/__init__.py
13
- src/chunkr_ai/api/api.py
14
12
  src/chunkr_ai/api/auth.py
15
- src/chunkr_ai/api/base.py
16
13
  src/chunkr_ai/api/chunkr.py
17
14
  src/chunkr_ai/api/chunkr_async.py
15
+ src/chunkr_ai/api/chunkr_base.py
18
16
  src/chunkr_ai/api/config.py
19
17
  src/chunkr_ai/api/misc.py
20
18
  src/chunkr_ai/api/protocol.py
19
+ src/chunkr_ai/api/schema.py
21
20
  src/chunkr_ai/api/task.py
22
21
  src/chunkr_ai/api/task_async.py
23
22
  src/chunkr_ai/api/task_base.py
@@ -1,4 +1,4 @@
1
- httpx>=0.24.0
1
+ httpx>=0.25.0
2
2
  pillow>=10.0.0
3
3
  pydantic>=2.0.0
4
4
  pytest-asyncio>=0.21.0
File without changes
@@ -1,12 +0,0 @@
1
- from chunkr_ai.api.chunkr import Chunkr
2
- from chunkr_ai.models import Configuration
3
- from chunkr_ai.api.config import SegmentationStrategy, ChunkProcessing
4
-
5
- if __name__ == "__main__":
6
- chunkr = Chunkr()
7
- task = chunkr.update_task("556b4fe5-e3f7-48dc-9f56-0fb7fbacdb87", Configuration(
8
- chunk_processing=ChunkProcessing(
9
- target_length=1000
10
- )
11
- ))
12
- print(task)
File without changes
File without changes
File without changes