chunkr-ai 0.0.11__py3-none-any.whl → 0.0.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunkr_ai/api/config.py +1 -1
- chunkr_ai/api/task.py +15 -122
- chunkr_ai/api/task_async.py +19 -72
- chunkr_ai/api/task_base.py +63 -11
- {chunkr_ai-0.0.11.dist-info → chunkr_ai-0.0.12.dist-info}/METADATA +2 -1
- {chunkr_ai-0.0.11.dist-info → chunkr_ai-0.0.12.dist-info}/RECORD +9 -9
- {chunkr_ai-0.0.11.dist-info → chunkr_ai-0.0.12.dist-info}/LICENSE +0 -0
- {chunkr_ai-0.0.11.dist-info → chunkr_ai-0.0.12.dist-info}/WHEEL +0 -0
- {chunkr_ai-0.0.11.dist-info → chunkr_ai-0.0.12.dist-info}/top_level.txt +0 -0
chunkr_ai/api/config.py
CHANGED
chunkr_ai/api/task.py
CHANGED
@@ -1,38 +1,10 @@
|
|
1
|
-
from .protocol import ChunkrClientProtocol
|
2
|
-
from .config import Configuration, OutputResponse, Status
|
1
|
+
from .config import Configuration
|
3
2
|
from .misc import prepare_upload_data
|
4
|
-
import asyncio
|
5
|
-
from datetime import datetime
|
6
|
-
from pydantic import BaseModel, PrivateAttr
|
3
|
+
from .task_base import TaskBase
|
7
4
|
import time
|
8
|
-
from typing import Optional, Union
|
9
|
-
|
10
|
-
class TaskResponse(BaseModel):
|
11
|
-
configuration: Configuration
|
12
|
-
created_at: datetime
|
13
|
-
expires_at: Optional[datetime] = None
|
14
|
-
file_name: Optional[str] = None
|
15
|
-
finished_at: Optional[datetime] = None
|
16
|
-
input_file_url: Optional[str] = None
|
17
|
-
message: str
|
18
|
-
output: Optional[OutputResponse] = None
|
19
|
-
page_count: Optional[int] = None
|
20
|
-
pdf_url: Optional[str] = None
|
21
|
-
started_at: Optional[datetime] = None
|
22
|
-
status: Status
|
23
|
-
task_id: str
|
24
|
-
task_url: Optional[str] = None
|
25
|
-
_client: Optional[Union[ChunkrClientProtocol]] = PrivateAttr(default=None)
|
26
|
-
|
27
|
-
def with_client(self, client: Union[ChunkrClientProtocol]) -> 'TaskResponse':
|
28
|
-
self._client = client
|
29
|
-
return self
|
30
|
-
|
31
|
-
def _poll_request_sync(self) -> dict:
|
32
|
-
"""Helper method to make polling request with retry logic (synchronous)"""
|
33
|
-
if not self.task_url:
|
34
|
-
raise ValueError("Task URL not found in response")
|
35
5
|
|
6
|
+
class TaskResponse(TaskBase):
|
7
|
+
def _poll_request(self) -> dict:
|
36
8
|
while True:
|
37
9
|
try:
|
38
10
|
r = self._client._session.get(self.task_url, headers=self._client._headers())
|
@@ -44,68 +16,20 @@ class TaskResponse(BaseModel):
|
|
44
16
|
except Exception as e:
|
45
17
|
raise
|
46
18
|
|
47
|
-
|
48
|
-
"""Helper method to make polling request with retry logic (asynchronous)"""
|
19
|
+
def poll(self) -> 'TaskResponse':
|
49
20
|
if not self.task_url:
|
50
21
|
raise ValueError("Task URL not found in response")
|
51
|
-
|
52
|
-
while True:
|
53
|
-
try:
|
54
|
-
r = await self._client._client.get(self.task_url, headers=self._client._headers())
|
55
|
-
r.raise_for_status()
|
56
|
-
response = r.json()
|
57
|
-
return response
|
58
|
-
except (ConnectionError, TimeoutError) as _:
|
59
|
-
print("Connection error while polling the task, retrying...")
|
60
|
-
await asyncio.sleep(0.5)
|
61
|
-
except Exception as e:
|
62
|
-
raise
|
63
|
-
|
64
|
-
def _check_status(self) -> Optional['TaskResponse']:
|
65
|
-
"""Helper method to check task status and handle completion/failure"""
|
66
|
-
if self.status == "Failed":
|
67
|
-
raise ValueError(self.message)
|
68
|
-
if self.status not in ("Starting", "Processing"):
|
69
|
-
return self
|
70
|
-
return None
|
71
|
-
|
72
|
-
def poll(self) -> 'TaskResponse':
|
73
|
-
"""Poll the task for completion."""
|
74
22
|
while True:
|
75
23
|
response = self._poll_request_sync()
|
76
24
|
updated_task = TaskResponse(**response).with_client(self._client)
|
77
25
|
self.__dict__.update(updated_task.__dict__)
|
78
|
-
|
79
26
|
if result := self._check_status():
|
80
27
|
return result
|
81
|
-
|
82
28
|
time.sleep(0.5)
|
83
|
-
|
84
|
-
async def poll_async(self) -> 'TaskResponse':
|
85
|
-
"""Poll the task for completion asynchronously."""
|
86
|
-
while True:
|
87
|
-
response = await self._poll_request_async()
|
88
|
-
updated_task = TaskResponse(**response).with_client(self._client)
|
89
|
-
self.__dict__.update(updated_task.__dict__)
|
90
|
-
|
91
|
-
if result := self._check_status():
|
92
|
-
return result
|
93
|
-
|
94
|
-
await asyncio.sleep(0.5)
|
95
|
-
|
96
|
-
def _get_content(self, content_type: str) -> str:
|
97
|
-
"""Helper method to get either HTML, Markdown, or raw content."""
|
98
|
-
if not self.output:
|
99
|
-
return ""
|
100
|
-
parts = []
|
101
|
-
for c in self.output.chunks:
|
102
|
-
for s in c.segments:
|
103
|
-
content = getattr(s, content_type)
|
104
|
-
if content:
|
105
|
-
parts.append(content)
|
106
|
-
return "\n".join(parts)
|
107
29
|
|
108
30
|
def update(self, config: Configuration) -> 'TaskResponse':
|
31
|
+
if not self.task_url:
|
32
|
+
raise ValueError("Task URL not found")
|
109
33
|
files = prepare_upload_data(None, config)
|
110
34
|
r = self._client._session.patch(
|
111
35
|
f"{self.task_url}",
|
@@ -113,56 +37,25 @@ class TaskResponse(BaseModel):
|
|
113
37
|
headers=self._client._headers()
|
114
38
|
)
|
115
39
|
r.raise_for_status()
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
files = prepare_upload_data(None, config)
|
120
|
-
r = await self._client._client.patch(
|
121
|
-
f"{self.task_url}",
|
122
|
-
files=files,
|
123
|
-
headers=self._client._headers()
|
124
|
-
)
|
125
|
-
r.raise_for_status()
|
126
|
-
return TaskResponse(**r.json()).with_client(self._client)
|
40
|
+
updated = TaskResponse(**r.json()).with_client(self._client)
|
41
|
+
self.__dict__.update(updated.__dict__)
|
42
|
+
return self.poll()
|
127
43
|
|
128
44
|
def cancel(self):
|
45
|
+
if not self.task_url:
|
46
|
+
raise ValueError("Task URL not found")
|
129
47
|
r = self._client._session.get(
|
130
48
|
f"{self.task_url}/cancel",
|
131
49
|
headers=self._client._headers()
|
132
50
|
)
|
133
51
|
r.raise_for_status()
|
134
52
|
self.poll()
|
135
|
-
|
136
|
-
async def cancel_async(self):
|
137
|
-
r = await self._client._client.get(
|
138
|
-
f"{self.task_url}/cancel",
|
139
|
-
headers=self._client._headers()
|
140
|
-
)
|
141
|
-
r.raise_for_status()
|
142
|
-
await self.poll_async()
|
143
53
|
|
144
54
|
def delete(self):
|
55
|
+
if not self.task_url:
|
56
|
+
raise ValueError("Task URL not found")
|
145
57
|
r = self._client._session.delete(
|
146
|
-
|
58
|
+
self.task_url,
|
147
59
|
headers=self._client._headers()
|
148
60
|
)
|
149
61
|
r.raise_for_status()
|
150
|
-
|
151
|
-
async def delete_async(self):
|
152
|
-
r = await self._client._client.delete(
|
153
|
-
f"{self.task_url}",
|
154
|
-
headers=self._client._headers()
|
155
|
-
)
|
156
|
-
r.raise_for_status()
|
157
|
-
|
158
|
-
def html(self) -> str:
|
159
|
-
"""Get full HTML for the task"""
|
160
|
-
return self._get_content("html")
|
161
|
-
|
162
|
-
def markdown(self) -> str:
|
163
|
-
"""Get full markdown for the task"""
|
164
|
-
return self._get_content("markdown")
|
165
|
-
|
166
|
-
def content(self) -> str:
|
167
|
-
"""Get full text for the task"""
|
168
|
-
return self._get_content("content")
|
chunkr_ai/api/task_async.py
CHANGED
@@ -1,34 +1,23 @@
|
|
1
|
-
import asyncio
|
2
|
-
from pydantic import BaseModel, PrivateAttr
|
3
|
-
from datetime import datetime
|
4
|
-
from typing import Optional, Union
|
5
|
-
from .task_base import TaskBase
|
6
|
-
from .protocol import ChunkrClientProtocol
|
7
|
-
from .config import Configuration, OutputResponse, Status
|
1
|
+
from .config import Configuration
|
8
2
|
from .misc import prepare_upload_data
|
3
|
+
from .task_base import TaskBase
|
4
|
+
import asyncio
|
9
5
|
|
10
|
-
class TaskResponseAsync(BaseModel, TaskBase):
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
started_at: Optional[datetime]
|
22
|
-
status: Status
|
23
|
-
task_id: str
|
24
|
-
task_url: Optional[str]
|
25
|
-
_client: Optional[Union[ChunkrClientProtocol]] = PrivateAttr(default=None)
|
26
|
-
|
27
|
-
def with_client(self, client: Union[ChunkrClientProtocol]) -> 'TaskResponseAsync':
|
28
|
-
self._client = client
|
29
|
-
return self
|
6
|
+
class TaskResponseAsync(TaskBase):
|
7
|
+
async def _poll_request(self) -> dict:
|
8
|
+
try:
|
9
|
+
r = await self._client._client.get(self.task_url, headers=self._client._headers())
|
10
|
+
r.raise_for_status()
|
11
|
+
return r.json()
|
12
|
+
except (ConnectionError, TimeoutError) as _:
|
13
|
+
print("Connection error while polling the task, retrying...")
|
14
|
+
await asyncio.sleep(0.5)
|
15
|
+
except Exception as e:
|
16
|
+
raise
|
30
17
|
|
31
18
|
async def poll(self) -> 'TaskResponseAsync':
|
19
|
+
if not self.task_url:
|
20
|
+
raise ValueError("Task URL not found")
|
32
21
|
while True:
|
33
22
|
j = await self._poll_request()
|
34
23
|
updated = TaskResponseAsync(**j).with_client(self._client)
|
@@ -37,28 +26,6 @@ class TaskResponseAsync(BaseModel, TaskBase):
|
|
37
26
|
return res
|
38
27
|
await asyncio.sleep(0.5)
|
39
28
|
|
40
|
-
async def _poll_request(self) -> dict:
|
41
|
-
if not self.task_url:
|
42
|
-
raise ValueError("Task URL not found")
|
43
|
-
while True:
|
44
|
-
try:
|
45
|
-
r = await self._client._client.get(self.task_url, headers=self._client._headers())
|
46
|
-
r.raise_for_status()
|
47
|
-
return r.json()
|
48
|
-
except Exception as e:
|
49
|
-
if self.status == Status.FAILED:
|
50
|
-
raise ValueError(self.message) from e
|
51
|
-
await asyncio.sleep(0.5)
|
52
|
-
|
53
|
-
def _check_status(self) -> Optional['TaskResponseAsync']:
|
54
|
-
if self.status == Status.FAILED:
|
55
|
-
raise ValueError(f"Task failed: {self.message}")
|
56
|
-
if self.status == Status.CANCELLED:
|
57
|
-
return self
|
58
|
-
if self.status not in [Status.STARTING, Status.PROCESSING]:
|
59
|
-
return self
|
60
|
-
return None
|
61
|
-
|
62
29
|
async def update(self, config: Configuration) -> 'TaskResponseAsync':
|
63
30
|
if not self.task_url:
|
64
31
|
raise ValueError("Task URL not found")
|
@@ -77,27 +44,7 @@ class TaskResponseAsync(BaseModel, TaskBase):
|
|
77
44
|
return await self.poll()
|
78
45
|
|
79
46
|
async def delete(self):
|
47
|
+
if not self.task_url:
|
48
|
+
raise ValueError("Task URL not found")
|
80
49
|
r = await self._client._client.delete(self.task_url, headers=self._client._headers())
|
81
|
-
r.raise_for_status()
|
82
|
-
|
83
|
-
def html(self) -> str:
|
84
|
-
return self._get_content("html")
|
85
|
-
|
86
|
-
def markdown(self) -> str:
|
87
|
-
return self._get_content("markdown")
|
88
|
-
|
89
|
-
def content(self) -> str:
|
90
|
-
return self._get_content("content")
|
91
|
-
|
92
|
-
def _get_content(self, t: str) -> str:
|
93
|
-
if not self.output:
|
94
|
-
return ""
|
95
|
-
parts = []
|
96
|
-
for c in self.output.chunks:
|
97
|
-
for s in c.segments:
|
98
|
-
v = getattr(s, t)
|
99
|
-
if v:
|
100
|
-
parts.append(v)
|
101
|
-
return "\n".join(parts)
|
102
|
-
|
103
|
-
# Satisfying TaskBase abstract methods with stubs
|
50
|
+
r.raise_for_status()
|
chunkr_ai/api/task_base.py
CHANGED
@@ -1,31 +1,83 @@
|
|
1
|
-
from abc import ABC, abstractmethod
|
2
1
|
from .config import Configuration
|
2
|
+
from .protocol import ChunkrClientProtocol
|
3
|
+
from ..models import Status, OutputResponse
|
4
|
+
from abc import ABC, abstractmethod
|
5
|
+
from typing import TypeVar, Optional, Generic, Union
|
6
|
+
from pydantic import BaseModel, PrivateAttr
|
7
|
+
from datetime import datetime
|
8
|
+
|
9
|
+
T = TypeVar('T', bound='TaskBase')
|
10
|
+
|
11
|
+
class TaskBase(BaseModel, ABC, Generic[T]):
|
12
|
+
configuration: Configuration
|
13
|
+
created_at: datetime
|
14
|
+
expires_at: Optional[datetime]
|
15
|
+
file_name: Optional[str]
|
16
|
+
finished_at: Optional[datetime]
|
17
|
+
input_file_url: Optional[str]
|
18
|
+
message: str
|
19
|
+
output: Optional[OutputResponse]
|
20
|
+
page_count: Optional[int]
|
21
|
+
pdf_url: Optional[str]
|
22
|
+
started_at: Optional[datetime]
|
23
|
+
status: Status
|
24
|
+
task_id: str
|
25
|
+
task_url: Optional[str]
|
26
|
+
_client: Optional[Union[ChunkrClientProtocol]] = PrivateAttr(default=None)
|
3
27
|
|
4
|
-
class TaskBase(ABC):
|
5
28
|
@abstractmethod
|
6
|
-
def
|
29
|
+
def _poll_request(self) -> dict:
|
30
|
+
"""Helper method to make polling request with retry logic (synchronous)"""
|
7
31
|
pass
|
8
32
|
|
9
33
|
@abstractmethod
|
10
|
-
def
|
34
|
+
def poll(self) -> T:
|
35
|
+
"""Poll the task for completion."""
|
11
36
|
pass
|
12
37
|
|
13
38
|
@abstractmethod
|
14
|
-
def
|
39
|
+
def update(self, config: Configuration) -> T:
|
40
|
+
"""Update the task configuration."""
|
15
41
|
pass
|
16
42
|
|
17
43
|
@abstractmethod
|
18
|
-
def
|
44
|
+
def cancel(self) -> T:
|
45
|
+
"""Cancel the task."""
|
19
46
|
pass
|
20
47
|
|
21
48
|
@abstractmethod
|
22
|
-
def
|
49
|
+
def delete(self) -> T:
|
50
|
+
"""Delete the task."""
|
23
51
|
pass
|
24
52
|
|
25
|
-
|
53
|
+
def with_client(self, client: Union[ChunkrClientProtocol]) -> T:
|
54
|
+
self._client = client
|
55
|
+
return self
|
56
|
+
|
57
|
+
def _check_status(self) -> Optional[T]:
|
58
|
+
"""Helper method to check task status and handle completion/failure"""
|
59
|
+
if self.status == "Failed":
|
60
|
+
raise ValueError(self.message)
|
61
|
+
if self.status not in ("Starting", "Processing"):
|
62
|
+
return self
|
63
|
+
return None
|
64
|
+
|
65
|
+
def html(self) -> str:
|
66
|
+
return self._get_content("html")
|
67
|
+
|
26
68
|
def markdown(self) -> str:
|
27
|
-
|
69
|
+
return self._get_content("markdown")
|
28
70
|
|
29
|
-
@abstractmethod
|
30
71
|
def content(self) -> str:
|
31
|
-
|
72
|
+
return self._get_content("content")
|
73
|
+
|
74
|
+
def _get_content(self, t: str) -> str:
|
75
|
+
if not self.output:
|
76
|
+
return ""
|
77
|
+
parts = []
|
78
|
+
for c in self.output.chunks:
|
79
|
+
for s in c.segments:
|
80
|
+
v = getattr(s, t)
|
81
|
+
if v:
|
82
|
+
parts.append(v)
|
83
|
+
return "\n".join(parts)
|
@@ -1,12 +1,13 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: chunkr-ai
|
3
|
-
Version: 0.0.11
|
3
|
+
Version: 0.0.12
|
4
4
|
Summary: Python client for Chunkr: open source document intelligence
|
5
5
|
Author-email: Ishaan Kapoor <ishaan@lumina.sh>
|
6
6
|
Project-URL: Homepage, https://chunkr.ai
|
7
7
|
Description-Content-Type: text/markdown
|
8
8
|
License-File: LICENSE
|
9
9
|
Requires-Dist: httpx>=0.25.0
|
10
|
+
Requires-Dist: httpx>=0.25.0
|
10
11
|
Requires-Dist: pillow>=10.0.0
|
11
12
|
Requires-Dist: pydantic>=2.0.0
|
12
13
|
Requires-Dist: pytest-asyncio>=0.21.0
|
@@ -5,15 +5,15 @@ chunkr_ai/api/auth.py,sha256=iSd5Jek2BFaHGw9HY-RrqgwP56BHFU0xbSuJS4fU6AA,425
|
|
5
5
|
chunkr_ai/api/chunkr.py,sha256=0qpV9b1hOpDhA9EuKkXW9X_laUmw5NY3ZYq0cUOTbww,5190
|
6
6
|
chunkr_ai/api/chunkr_async.py,sha256=ZkLBrn4cqzu3sqMfS8cfZZgSvpdyQuWZP95lfGxuHx0,4900
|
7
7
|
chunkr_ai/api/chunkr_base.py,sha256=IYO0pmoL02GchIggj6_Q5nvtAUoOvYAAvT7VLFU6scY,2506
|
8
|
-
chunkr_ai/api/config.py,sha256=
|
8
|
+
chunkr_ai/api/config.py,sha256=joTn7jiOlJXTwwza-jHauLV-39CMzaxZVGB9JBm8Cok,4862
|
9
9
|
chunkr_ai/api/misc.py,sha256=9vnfrbJ7sFlZqwEIQ4NTMb5rhPOmETT7e1jR-b42PXM,4977
|
10
10
|
chunkr_ai/api/protocol.py,sha256=XKS9RmtvBpJItYhPg18qlOCKpaSHdOuQTRSUxAdUz2g,479
|
11
11
|
chunkr_ai/api/schema.py,sha256=OeLOhBRXeRBgEImg0Q6O9Z10ojT6aSEVvwnDR8UeENo,4971
|
12
|
-
chunkr_ai/api/task.py,sha256=
|
13
|
-
chunkr_ai/api/task_async.py,sha256=
|
14
|
-
chunkr_ai/api/task_base.py,sha256=
|
15
|
-
chunkr_ai-0.0.
|
16
|
-
chunkr_ai-0.0.
|
17
|
-
chunkr_ai-0.0.
|
18
|
-
chunkr_ai-0.0.
|
19
|
-
chunkr_ai-0.0.
|
12
|
+
chunkr_ai/api/task.py,sha256=4insrdGEVzBHs1ejZvde8bbEetVzgJELa47UjhfBqCA,2116
|
13
|
+
chunkr_ai/api/task_async.py,sha256=LqS-LL-mCOgfGsgvuSXhKkSEUM6MMro-EZHl_ZedQQk,1998
|
14
|
+
chunkr_ai/api/task_base.py,sha256=iS5UVIDEPIiDoWrn21Oh_dQurkd_hvKQ8ng32j6sGoA,2369
|
15
|
+
chunkr_ai-0.0.12.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
16
|
+
chunkr_ai-0.0.12.dist-info/METADATA,sha256=dfo9myRizW2A5W0H6FpIoBzHa4QxmEe3lsedPYhwjXM,4874
|
17
|
+
chunkr_ai-0.0.12.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
18
|
+
chunkr_ai-0.0.12.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
|
19
|
+
chunkr_ai-0.0.12.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|