chunkr-ai 0.0.8__py3-none-any.whl → 0.0.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunkr_ai/api/chunkr.py +1 -3
- chunkr_ai/api/chunkr_async.py +1 -1
- chunkr_ai/api/config.py +0 -1
- chunkr_ai/api/misc.py +56 -3
- chunkr_ai/api/task_async.py +111 -0
- chunkr_ai/api/task_base.py +31 -0
- {chunkr_ai-0.0.8.dist-info → chunkr_ai-0.0.10.dist-info}/METADATA +9 -9
- chunkr_ai-0.0.10.dist-info/RECORD +19 -0
- chunkr_ai/api/api.py +0 -0
- chunkr_ai-0.0.8.dist-info/RECORD +0 -18
- chunkr_ai/api/{base.py → chunkr_base.py} +0 -0
- {chunkr_ai-0.0.8.dist-info → chunkr_ai-0.0.10.dist-info}/LICENSE +0 -0
- {chunkr_ai-0.0.8.dist-info → chunkr_ai-0.0.10.dist-info}/WHEEL +0 -0
- {chunkr_ai-0.0.8.dist-info → chunkr_ai-0.0.10.dist-info}/top_level.txt +0 -0
chunkr_ai/api/chunkr.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
from .base import ChunkrBase
|
1
|
+
from .chunkr_base import ChunkrBase
|
2
2
|
from .config import Configuration
|
3
3
|
from .task import TaskResponse
|
4
4
|
from pathlib import Path
|
@@ -163,5 +163,3 @@ class Chunkr(ChunkrBase):
|
|
163
163
|
headers=self._headers()
|
164
164
|
)
|
165
165
|
r.raise_for_status()
|
166
|
-
|
167
|
-
|
chunkr_ai/api/chunkr_async.py
CHANGED
chunkr_ai/api/config.py
CHANGED
chunkr_ai/api/misc.py
CHANGED
@@ -1,11 +1,11 @@
|
|
1
|
+
from .config import Configuration, Property, JsonSchema
|
1
2
|
import io
|
2
3
|
import json
|
3
4
|
from pathlib import Path
|
4
5
|
from PIL import Image
|
5
6
|
import requests
|
6
7
|
from typing import Union, Tuple, BinaryIO, Optional
|
7
|
-
from
|
8
|
-
|
8
|
+
from pydantic import BaseModel
|
9
9
|
|
10
10
|
def prepare_file(
|
11
11
|
file: Union[str, Path, BinaryIO, Image.Image]
|
@@ -15,8 +15,31 @@ def prepare_file(
|
|
15
15
|
if isinstance(file, str) and (file.startswith('http://') or file.startswith('https://')):
|
16
16
|
response = requests.get(file)
|
17
17
|
response.raise_for_status()
|
18
|
+
|
19
|
+
# Try to get filename from Content-Disposition header first
|
20
|
+
filename = None
|
21
|
+
content_disposition = response.headers.get('Content-Disposition')
|
22
|
+
if content_disposition and 'filename=' in content_disposition:
|
23
|
+
filename = content_disposition.split('filename=')[-1].strip('"\'')
|
24
|
+
|
25
|
+
# If no Content-Disposition, try to get clean filename from URL path
|
26
|
+
if not filename:
|
27
|
+
from urllib.parse import urlparse, unquote
|
28
|
+
parsed_url = urlparse(file)
|
29
|
+
path = unquote(parsed_url.path)
|
30
|
+
filename = Path(path).name if path else None
|
31
|
+
|
32
|
+
# Fallback to default name if we couldn't extract one
|
33
|
+
filename = filename or 'downloaded_file'
|
34
|
+
|
35
|
+
# Sanitize filename: remove invalid characters and limit length
|
36
|
+
import re
|
37
|
+
filename = re.sub(r'[<>:"/\\|?*%]', '_', filename) # Replace invalid chars with underscore
|
38
|
+
filename = re.sub(r'\s+', '_', filename) # Replace whitespace with underscore
|
39
|
+
filename = filename.strip('._') # Remove leading/trailing dots and underscores
|
40
|
+
filename = filename[:255] # Limit length to 255 characters
|
41
|
+
|
18
42
|
file_obj = io.BytesIO(response.content)
|
19
|
-
filename = Path(file.split('/')[-1]).name or 'downloaded_file'
|
20
43
|
return filename, file_obj
|
21
44
|
|
22
45
|
# Handle base64 strings
|
@@ -104,3 +127,33 @@ def prepare_upload_data(
|
|
104
127
|
files[key] = (None, json.dumps(value), 'application/json')
|
105
128
|
|
106
129
|
return files
|
130
|
+
|
131
|
+
def from_pydantic(pydantic: BaseModel) -> dict:
|
132
|
+
"""Convert a Pydantic model to a Chunk json schema.
|
133
|
+
|
134
|
+
Args:
|
135
|
+
pydantic: A Pydantic BaseModel class or instance
|
136
|
+
|
137
|
+
Returns:
|
138
|
+
dict: A JSON schema compatible with Chunk's format
|
139
|
+
"""
|
140
|
+
model = pydantic if isinstance(pydantic, type) else pydantic.__class__
|
141
|
+
schema = model.model_json_schema()
|
142
|
+
print(schema)
|
143
|
+
properties = []
|
144
|
+
for name, details in schema.get('properties', {}).items():
|
145
|
+
prop = Property(
|
146
|
+
name=name,
|
147
|
+
title=details.get('title'),
|
148
|
+
prop_type=details.get('type', 'string'),
|
149
|
+
description=details.get('description'),
|
150
|
+
default=str(details.get('default')) if details.get('default') is not None else None
|
151
|
+
)
|
152
|
+
properties.append(prop)
|
153
|
+
|
154
|
+
json_schema = JsonSchema(
|
155
|
+
title=schema.get('title', model.__name__),
|
156
|
+
properties=properties
|
157
|
+
)
|
158
|
+
|
159
|
+
return json_schema.model_dump(mode="json", exclude_none=True)
|
@@ -0,0 +1,111 @@
|
|
1
|
+
import asyncio
|
2
|
+
from pydantic import BaseModel, PrivateAttr
|
3
|
+
from datetime import datetime
|
4
|
+
from enum import Enum
|
5
|
+
from typing import Optional, Union
|
6
|
+
from .task_base import TaskBase
|
7
|
+
from .protocol import ChunkrClientProtocol
|
8
|
+
from .config import Configuration, OutputResponse
|
9
|
+
from .misc import prepare_upload_data
|
10
|
+
|
11
|
+
class Status(str, Enum):
|
12
|
+
STARTING = "Starting"
|
13
|
+
PROCESSING = "Processing"
|
14
|
+
SUCCEEDED = "Succeeded"
|
15
|
+
FAILED = "Failed"
|
16
|
+
CANCELLED = "Cancelled"
|
17
|
+
|
18
|
+
class TaskResponseAsync(BaseModel, TaskBase):
|
19
|
+
configuration: Configuration
|
20
|
+
created_at: datetime
|
21
|
+
expires_at: Optional[datetime]
|
22
|
+
file_name: Optional[str]
|
23
|
+
finished_at: Optional[datetime]
|
24
|
+
input_file_url: Optional[str]
|
25
|
+
message: str
|
26
|
+
output: Optional[OutputResponse]
|
27
|
+
page_count: Optional[int]
|
28
|
+
pdf_url: Optional[str]
|
29
|
+
started_at: Optional[datetime]
|
30
|
+
status: Status
|
31
|
+
task_id: str
|
32
|
+
task_url: Optional[str]
|
33
|
+
_client: Optional[Union[ChunkrClientProtocol]] = PrivateAttr(default=None)
|
34
|
+
|
35
|
+
def with_client(self, client: Union[ChunkrClientProtocol]) -> 'TaskResponseAsync':
|
36
|
+
self._client = client
|
37
|
+
return self
|
38
|
+
|
39
|
+
async def poll(self) -> 'TaskResponseAsync':
|
40
|
+
while True:
|
41
|
+
j = await self._poll_request()
|
42
|
+
updated = TaskResponseAsync(**j).with_client(self._client)
|
43
|
+
self.__dict__.update(updated.__dict__)
|
44
|
+
if res := self._check_status():
|
45
|
+
return res
|
46
|
+
await asyncio.sleep(0.5)
|
47
|
+
|
48
|
+
async def _poll_request(self) -> dict:
|
49
|
+
if not self.task_url:
|
50
|
+
raise ValueError("Task URL not found")
|
51
|
+
while True:
|
52
|
+
try:
|
53
|
+
r = await self._client._client.get(self.task_url, headers=self._client._headers())
|
54
|
+
r.raise_for_status()
|
55
|
+
return r.json()
|
56
|
+
except Exception as e:
|
57
|
+
if self.status == Status.FAILED:
|
58
|
+
raise ValueError(self.message) from e
|
59
|
+
await asyncio.sleep(0.5)
|
60
|
+
|
61
|
+
def _check_status(self) -> Optional['TaskResponseAsync']:
|
62
|
+
if self.status == Status.FAILED:
|
63
|
+
raise ValueError(f"Task failed: {self.message}")
|
64
|
+
if self.status == Status.CANCELLED:
|
65
|
+
return self
|
66
|
+
if self.status not in [Status.STARTING, Status.PROCESSING]:
|
67
|
+
return self
|
68
|
+
return None
|
69
|
+
|
70
|
+
async def update(self, config: Configuration) -> 'TaskResponseAsync':
|
71
|
+
if not self.task_url:
|
72
|
+
raise ValueError("Task URL not found")
|
73
|
+
f = prepare_upload_data(None, config)
|
74
|
+
r = await self._client._client.patch(self.task_url, files=f, headers=self._client._headers())
|
75
|
+
r.raise_for_status()
|
76
|
+
updated = TaskResponseAsync(**r.json()).with_client(self._client)
|
77
|
+
self.__dict__.update(updated.__dict__)
|
78
|
+
return await self.poll()
|
79
|
+
|
80
|
+
async def cancel(self):
|
81
|
+
if not self.task_url:
|
82
|
+
raise ValueError("Task URL not found")
|
83
|
+
r = await self._client._client.get(f"{self.task_url}/cancel", headers=self._client._headers())
|
84
|
+
r.raise_for_status()
|
85
|
+
return await self.poll()
|
86
|
+
|
87
|
+
async def delete(self):
|
88
|
+
r = await self._client._client.delete(self.task_url, headers=self._client._headers())
|
89
|
+
r.raise_for_status()
|
90
|
+
|
91
|
+
def html(self) -> str:
|
92
|
+
return self._get_content("html")
|
93
|
+
|
94
|
+
def markdown(self) -> str:
|
95
|
+
return self._get_content("markdown")
|
96
|
+
|
97
|
+
def content(self) -> str:
|
98
|
+
return self._get_content("content")
|
99
|
+
|
100
|
+
def _get_content(self, t: str) -> str:
|
101
|
+
if not self.output:
|
102
|
+
return ""
|
103
|
+
parts = []
|
104
|
+
for c in self.output.chunks:
|
105
|
+
for s in c.segments:
|
106
|
+
v = getattr(s, t)
|
107
|
+
if v:
|
108
|
+
parts.append(v)
|
109
|
+
return "\n".join(parts)
|
110
|
+
|
111
|
+
# Satisfying TaskBase abstract methods with stubs
|
@@ -0,0 +1,31 @@
|
|
1
|
+
from abc import ABC, abstractmethod
|
2
|
+
from .config import Configuration
|
3
|
+
|
4
|
+
class TaskBase(ABC):
|
5
|
+
@abstractmethod
|
6
|
+
def poll(self):
|
7
|
+
pass
|
8
|
+
|
9
|
+
@abstractmethod
|
10
|
+
def update(self, config: Configuration):
|
11
|
+
pass
|
12
|
+
|
13
|
+
@abstractmethod
|
14
|
+
def cancel(self):
|
15
|
+
pass
|
16
|
+
|
17
|
+
@abstractmethod
|
18
|
+
def delete(self):
|
19
|
+
pass
|
20
|
+
|
21
|
+
@abstractmethod
|
22
|
+
def html(self) -> str:
|
23
|
+
pass
|
24
|
+
|
25
|
+
@abstractmethod
|
26
|
+
def markdown(self) -> str:
|
27
|
+
pass
|
28
|
+
|
29
|
+
@abstractmethod
|
30
|
+
def content(self) -> str:
|
31
|
+
pass
|
@@ -1,20 +1,20 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: chunkr-ai
|
3
|
-
Version: 0.0.8
|
3
|
+
Version: 0.0.10
|
4
4
|
Summary: Python client for Chunkr: open source document intelligence
|
5
5
|
Author-email: Ishaan Kapoor <ishaan@lumina.sh>
|
6
6
|
Project-URL: Homepage, https://chunkr.ai
|
7
7
|
Description-Content-Type: text/markdown
|
8
8
|
License-File: LICENSE
|
9
|
-
Requires-Dist: httpx>=0.
|
10
|
-
Requires-Dist: pillow>=
|
11
|
-
Requires-Dist: pydantic>=2.
|
12
|
-
Requires-Dist: pytest-asyncio>=0.
|
13
|
-
Requires-Dist: python-dotenv>=
|
14
|
-
Requires-Dist: requests>=2.
|
9
|
+
Requires-Dist: httpx>=0.25.0
|
10
|
+
Requires-Dist: pillow>=10.0.0
|
11
|
+
Requires-Dist: pydantic>=2.0.0
|
12
|
+
Requires-Dist: pytest-asyncio>=0.21.0
|
13
|
+
Requires-Dist: python-dotenv>=0.19.0
|
14
|
+
Requires-Dist: requests>=2.28.0
|
15
15
|
Provides-Extra: test
|
16
|
-
Requires-Dist: pytest>=
|
17
|
-
Requires-Dist: pytest-xdist>=3.
|
16
|
+
Requires-Dist: pytest>=7.0.0; extra == "test"
|
17
|
+
Requires-Dist: pytest-xdist>=3.0.0; extra == "test"
|
18
18
|
|
19
19
|
# Chunkr Python Client
|
20
20
|
|
@@ -0,0 +1,19 @@
|
|
1
|
+
chunkr_ai/__init__.py,sha256=eXygrEhGxxIHXNYIlHF2eied8rGsx2RphgR8Wo4lRyo,110
|
2
|
+
chunkr_ai/main.py,sha256=_MT1lcnNiXjVW9ZkZYl28SB_f6M9g_IOgZxvhodTzAo,394
|
3
|
+
chunkr_ai/models.py,sha256=T8_F-Y1US21ZJVzLIaroqp-Hd0_ZFbdkbEOxr63-PNE,827
|
4
|
+
chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
+
chunkr_ai/api/auth.py,sha256=iSd5Jek2BFaHGw9HY-RrqgwP56BHFU0xbSuJS4fU6AA,425
|
6
|
+
chunkr_ai/api/chunkr.py,sha256=0qpV9b1hOpDhA9EuKkXW9X_laUmw5NY3ZYq0cUOTbww,5190
|
7
|
+
chunkr_ai/api/chunkr_async.py,sha256=ZkLBrn4cqzu3sqMfS8cfZZgSvpdyQuWZP95lfGxuHx0,4900
|
8
|
+
chunkr_ai/api/chunkr_base.py,sha256=IYO0pmoL02GchIggj6_Q5nvtAUoOvYAAvT7VLFU6scY,2506
|
9
|
+
chunkr_ai/api/config.py,sha256=eu7a28UjlNaM3QRrzElRTQXqMPBynAvlusVSIAMNXUY,4203
|
10
|
+
chunkr_ai/api/misc.py,sha256=DiY-BV5nPMDVKAiHTcND8w-8mSB-dENxrOhxnkyEoRA,6034
|
11
|
+
chunkr_ai/api/protocol.py,sha256=XKS9RmtvBpJItYhPg18qlOCKpaSHdOuQTRSUxAdUz2g,479
|
12
|
+
chunkr_ai/api/task.py,sha256=EB6RK8ms7EaNj57tNJZoNgNMHGWKXFhkQ1WC7gk5ht4,6059
|
13
|
+
chunkr_ai/api/task_async.py,sha256=Dd-Fenie0Q6GxXce7OlXvuQ14NQ58F_0b9P7AGKWyYA,3833
|
14
|
+
chunkr_ai/api/task_base.py,sha256=Tkk7dhIeB3ic5M9g_b-MVRdNv4XQTvajpaUy8JylQ8A,526
|
15
|
+
chunkr_ai-0.0.10.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
16
|
+
chunkr_ai-0.0.10.dist-info/METADATA,sha256=W8PCDpT4hN5tpn_9fyVrjEbd0abG0ReP5reG4_9Glp8,4845
|
17
|
+
chunkr_ai-0.0.10.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
18
|
+
chunkr_ai-0.0.10.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
|
19
|
+
chunkr_ai-0.0.10.dist-info/RECORD,,
|
chunkr_ai/api/api.py
DELETED
File without changes
|
chunkr_ai-0.0.8.dist-info/RECORD
DELETED
@@ -1,18 +0,0 @@
|
|
1
|
-
chunkr_ai/__init__.py,sha256=eXygrEhGxxIHXNYIlHF2eied8rGsx2RphgR8Wo4lRyo,110
|
2
|
-
chunkr_ai/main.py,sha256=_MT1lcnNiXjVW9ZkZYl28SB_f6M9g_IOgZxvhodTzAo,394
|
3
|
-
chunkr_ai/models.py,sha256=T8_F-Y1US21ZJVzLIaroqp-Hd0_ZFbdkbEOxr63-PNE,827
|
4
|
-
chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
-
chunkr_ai/api/api.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
-
chunkr_ai/api/auth.py,sha256=iSd5Jek2BFaHGw9HY-RrqgwP56BHFU0xbSuJS4fU6AA,425
|
7
|
-
chunkr_ai/api/base.py,sha256=IYO0pmoL02GchIggj6_Q5nvtAUoOvYAAvT7VLFU6scY,2506
|
8
|
-
chunkr_ai/api/chunkr.py,sha256=PmrK37HbK2T1KUPitKnt4wZqIujL61Jo12qW9DEpNMI,5186
|
9
|
-
chunkr_ai/api/chunkr_async.py,sha256=2yYyAO9-j2xKQYH0fJb2S6gL26hgbtL4QyqlG9l0QBY,4893
|
10
|
-
chunkr_ai/api/config.py,sha256=XIqXZ_8q7U_BEmY5wyIC9mbQGZBw1956EN9yhC4svD0,4235
|
11
|
-
chunkr_ai/api/misc.py,sha256=tScsUUcrqeVh_bZv1YlbmjGkQSTDQN8NyKxoNwAG6XA,3792
|
12
|
-
chunkr_ai/api/protocol.py,sha256=XKS9RmtvBpJItYhPg18qlOCKpaSHdOuQTRSUxAdUz2g,479
|
13
|
-
chunkr_ai/api/task.py,sha256=EB6RK8ms7EaNj57tNJZoNgNMHGWKXFhkQ1WC7gk5ht4,6059
|
14
|
-
chunkr_ai-0.0.8.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
15
|
-
chunkr_ai-0.0.8.dist-info/METADATA,sha256=tL3OZfFIRsgfIKoDYWAS89bZw48_0C8cdqHJ6_GrT7A,4844
|
16
|
-
chunkr_ai-0.0.8.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
17
|
-
chunkr_ai-0.0.8.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
|
18
|
-
chunkr_ai-0.0.8.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|