chunkr-ai 0.0.7__py3-none-any.whl → 0.0.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunkr_ai/api/config.py +5 -5
- chunkr_ai/api/task.py +9 -9
- chunkr_ai/api/task_async.py +111 -0
- chunkr_ai/api/task_base.py +31 -0
- {chunkr_ai-0.0.7.dist-info → chunkr_ai-0.0.9.dist-info}/METADATA +9 -9
- {chunkr_ai-0.0.7.dist-info → chunkr_ai-0.0.9.dist-info}/RECORD +9 -7
- {chunkr_ai-0.0.7.dist-info → chunkr_ai-0.0.9.dist-info}/LICENSE +0 -0
- {chunkr_ai-0.0.7.dist-info → chunkr_ai-0.0.9.dist-info}/WHEEL +0 -0
- {chunkr_ai-0.0.7.dist-info → chunkr_ai-0.0.9.dist-info}/top_level.txt +0 -0
chunkr_ai/api/config.py
CHANGED
@@ -86,9 +86,9 @@ class Segment(BaseModel):
|
|
86
86
|
bbox: BoundingBox
|
87
87
|
content: str
|
88
88
|
page_height: float
|
89
|
-
html: Optional[str]
|
90
|
-
image: Optional[str]
|
91
|
-
markdown: Optional[str]
|
89
|
+
html: Optional[str] = None
|
90
|
+
image: Optional[str] = None
|
91
|
+
markdown: Optional[str] = None
|
92
92
|
ocr: List[OCRResult]
|
93
93
|
page_number: int
|
94
94
|
page_width: float
|
@@ -104,8 +104,8 @@ class ExtractedJson(BaseModel):
|
|
104
104
|
data: Dict
|
105
105
|
|
106
106
|
class OutputResponse(BaseModel):
|
107
|
-
chunks: List[Chunk]
|
108
|
-
extracted_json: Optional[ExtractedJson]
|
107
|
+
chunks: List[Chunk]
|
108
|
+
extracted_json: Optional[ExtractedJson] = Field(default=None)
|
109
109
|
|
110
110
|
class Model(str, Enum):
|
111
111
|
FAST = "Fast"
|
chunkr_ai/api/task.py
CHANGED
@@ -18,18 +18,18 @@ class Status(str, Enum):
|
|
18
18
|
class TaskResponse(BaseModel):
|
19
19
|
configuration: Configuration
|
20
20
|
created_at: datetime
|
21
|
-
expires_at: Optional[datetime]
|
22
|
-
file_name: Optional[str]
|
23
|
-
finished_at: Optional[datetime]
|
24
|
-
input_file_url: Optional[str]
|
21
|
+
expires_at: Optional[datetime] = None
|
22
|
+
file_name: Optional[str] = None
|
23
|
+
finished_at: Optional[datetime] = None
|
24
|
+
input_file_url: Optional[str] = None
|
25
25
|
message: str
|
26
|
-
output: Optional[OutputResponse]
|
27
|
-
page_count: Optional[int]
|
28
|
-
pdf_url: Optional[str]
|
29
|
-
started_at: Optional[datetime]
|
26
|
+
output: Optional[OutputResponse] = None
|
27
|
+
page_count: Optional[int] = None
|
28
|
+
pdf_url: Optional[str] = None
|
29
|
+
started_at: Optional[datetime] = None
|
30
30
|
status: Status
|
31
31
|
task_id: str
|
32
|
-
task_url: Optional[str]
|
32
|
+
task_url: Optional[str] = None
|
33
33
|
_client: Optional[Union[ChunkrClientProtocol]] = PrivateAttr(default=None)
|
34
34
|
|
35
35
|
def with_client(self, client: Union[ChunkrClientProtocol]) -> 'TaskResponse':
|
@@ -0,0 +1,111 @@
|
|
1
|
+
import asyncio
|
2
|
+
from pydantic import BaseModel, PrivateAttr
|
3
|
+
from datetime import datetime
|
4
|
+
from enum import Enum
|
5
|
+
from typing import Optional, Union
|
6
|
+
from .task_base import TaskBase
|
7
|
+
from .protocol import ChunkrClientProtocol
|
8
|
+
from .config import Configuration, OutputResponse
|
9
|
+
from .misc import prepare_upload_data
|
10
|
+
|
11
|
+
class Status(str, Enum):
|
12
|
+
STARTING = "Starting"
|
13
|
+
PROCESSING = "Processing"
|
14
|
+
SUCCEEDED = "Succeeded"
|
15
|
+
FAILED = "Failed"
|
16
|
+
CANCELLED = "Cancelled"
|
17
|
+
|
18
|
+
class TaskResponseAsync(BaseModel, TaskBase):
|
19
|
+
configuration: Configuration
|
20
|
+
created_at: datetime
|
21
|
+
expires_at: Optional[datetime]
|
22
|
+
file_name: Optional[str]
|
23
|
+
finished_at: Optional[datetime]
|
24
|
+
input_file_url: Optional[str]
|
25
|
+
message: str
|
26
|
+
output: Optional[OutputResponse]
|
27
|
+
page_count: Optional[int]
|
28
|
+
pdf_url: Optional[str]
|
29
|
+
started_at: Optional[datetime]
|
30
|
+
status: Status
|
31
|
+
task_id: str
|
32
|
+
task_url: Optional[str]
|
33
|
+
_client: Optional[Union[ChunkrClientProtocol]] = PrivateAttr(default=None)
|
34
|
+
|
35
|
+
def with_client(self, client: Union[ChunkrClientProtocol]) -> 'TaskResponseAsync':
|
36
|
+
self._client = client
|
37
|
+
return self
|
38
|
+
|
39
|
+
async def poll(self) -> 'TaskResponseAsync':
|
40
|
+
while True:
|
41
|
+
j = await self._poll_request()
|
42
|
+
updated = TaskResponseAsync(**j).with_client(self._client)
|
43
|
+
self.__dict__.update(updated.__dict__)
|
44
|
+
if res := self._check_status():
|
45
|
+
return res
|
46
|
+
await asyncio.sleep(0.5)
|
47
|
+
|
48
|
+
async def _poll_request(self) -> dict:
|
49
|
+
if not self.task_url:
|
50
|
+
raise ValueError("Task URL not found")
|
51
|
+
while True:
|
52
|
+
try:
|
53
|
+
r = await self._client._client.get(self.task_url, headers=self._client._headers())
|
54
|
+
r.raise_for_status()
|
55
|
+
return r.json()
|
56
|
+
except Exception as e:
|
57
|
+
if self.status == Status.FAILED:
|
58
|
+
raise ValueError(self.message) from e
|
59
|
+
await asyncio.sleep(0.5)
|
60
|
+
|
61
|
+
def _check_status(self) -> Optional['TaskResponseAsync']:
|
62
|
+
if self.status == Status.FAILED:
|
63
|
+
raise ValueError(f"Task failed: {self.message}")
|
64
|
+
if self.status == Status.CANCELLED:
|
65
|
+
return self
|
66
|
+
if self.status not in [Status.STARTING, Status.PROCESSING]:
|
67
|
+
return self
|
68
|
+
return None
|
69
|
+
|
70
|
+
async def update(self, config: Configuration) -> 'TaskResponseAsync':
|
71
|
+
if not self.task_url:
|
72
|
+
raise ValueError("Task URL not found")
|
73
|
+
f = prepare_upload_data(None, config)
|
74
|
+
r = await self._client._client.patch(self.task_url, files=f, headers=self._client._headers())
|
75
|
+
r.raise_for_status()
|
76
|
+
updated = TaskResponseAsync(**r.json()).with_client(self._client)
|
77
|
+
self.__dict__.update(updated.__dict__)
|
78
|
+
return await self.poll()
|
79
|
+
|
80
|
+
async def cancel(self):
|
81
|
+
if not self.task_url:
|
82
|
+
raise ValueError("Task URL not found")
|
83
|
+
r = await self._client._client.get(f"{self.task_url}/cancel", headers=self._client._headers())
|
84
|
+
r.raise_for_status()
|
85
|
+
return await self.poll()
|
86
|
+
|
87
|
+
async def delete(self):
|
88
|
+
r = await self._client._client.delete(self.task_url, headers=self._client._headers())
|
89
|
+
r.raise_for_status()
|
90
|
+
|
91
|
+
def html(self) -> str:
|
92
|
+
return self._get_content("html")
|
93
|
+
|
94
|
+
def markdown(self) -> str:
|
95
|
+
return self._get_content("markdown")
|
96
|
+
|
97
|
+
def content(self) -> str:
|
98
|
+
return self._get_content("content")
|
99
|
+
|
100
|
+
def _get_content(self, t: str) -> str:
|
101
|
+
if not self.output:
|
102
|
+
return ""
|
103
|
+
parts = []
|
104
|
+
for c in self.output.chunks:
|
105
|
+
for s in c.segments:
|
106
|
+
v = getattr(s, t)
|
107
|
+
if v:
|
108
|
+
parts.append(v)
|
109
|
+
return "\n".join(parts)
|
110
|
+
|
111
|
+
# Satisfying TaskBase abstract methods with stubs
|
@@ -0,0 +1,31 @@
|
|
1
|
+
from abc import ABC, abstractmethod
|
2
|
+
from .config import Configuration
|
3
|
+
|
4
|
+
class TaskBase(ABC):
|
5
|
+
@abstractmethod
|
6
|
+
def poll(self):
|
7
|
+
pass
|
8
|
+
|
9
|
+
@abstractmethod
|
10
|
+
def update(self, config: Configuration):
|
11
|
+
pass
|
12
|
+
|
13
|
+
@abstractmethod
|
14
|
+
def cancel(self):
|
15
|
+
pass
|
16
|
+
|
17
|
+
@abstractmethod
|
18
|
+
def delete(self):
|
19
|
+
pass
|
20
|
+
|
21
|
+
@abstractmethod
|
22
|
+
def html(self) -> str:
|
23
|
+
pass
|
24
|
+
|
25
|
+
@abstractmethod
|
26
|
+
def markdown(self) -> str:
|
27
|
+
pass
|
28
|
+
|
29
|
+
@abstractmethod
|
30
|
+
def content(self) -> str:
|
31
|
+
pass
|
@@ -1,20 +1,20 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: chunkr-ai
|
3
|
-
Version: 0.0.7
|
3
|
+
Version: 0.0.9
|
4
4
|
Summary: Python client for Chunkr: open source document intelligence
|
5
5
|
Author-email: Ishaan Kapoor <ishaan@lumina.sh>
|
6
6
|
Project-URL: Homepage, https://chunkr.ai
|
7
7
|
Description-Content-Type: text/markdown
|
8
8
|
License-File: LICENSE
|
9
|
-
Requires-Dist: httpx>=0.
|
10
|
-
Requires-Dist: pillow>=
|
11
|
-
Requires-Dist: pydantic>=2.
|
12
|
-
Requires-Dist: pytest-asyncio>=0.
|
13
|
-
Requires-Dist: python-dotenv>=
|
14
|
-
Requires-Dist: requests>=2.
|
9
|
+
Requires-Dist: httpx>=0.24.0
|
10
|
+
Requires-Dist: pillow>=10.0.0
|
11
|
+
Requires-Dist: pydantic>=2.0.0
|
12
|
+
Requires-Dist: pytest-asyncio>=0.21.0
|
13
|
+
Requires-Dist: python-dotenv>=0.19.0
|
14
|
+
Requires-Dist: requests>=2.28.0
|
15
15
|
Provides-Extra: test
|
16
|
-
Requires-Dist: pytest>=
|
17
|
-
Requires-Dist: pytest-xdist>=3.
|
16
|
+
Requires-Dist: pytest>=7.0.0; extra == "test"
|
17
|
+
Requires-Dist: pytest-xdist>=3.0.0; extra == "test"
|
18
18
|
|
19
19
|
# Chunkr Python Client
|
20
20
|
|
@@ -7,12 +7,14 @@ chunkr_ai/api/auth.py,sha256=iSd5Jek2BFaHGw9HY-RrqgwP56BHFU0xbSuJS4fU6AA,425
|
|
7
7
|
chunkr_ai/api/base.py,sha256=IYO0pmoL02GchIggj6_Q5nvtAUoOvYAAvT7VLFU6scY,2506
|
8
8
|
chunkr_ai/api/chunkr.py,sha256=PmrK37HbK2T1KUPitKnt4wZqIujL61Jo12qW9DEpNMI,5186
|
9
9
|
chunkr_ai/api/chunkr_async.py,sha256=2yYyAO9-j2xKQYH0fJb2S6gL26hgbtL4QyqlG9l0QBY,4893
|
10
|
-
chunkr_ai/api/config.py,sha256=
|
10
|
+
chunkr_ai/api/config.py,sha256=XIqXZ_8q7U_BEmY5wyIC9mbQGZBw1956EN9yhC4svD0,4235
|
11
11
|
chunkr_ai/api/misc.py,sha256=tScsUUcrqeVh_bZv1YlbmjGkQSTDQN8NyKxoNwAG6XA,3792
|
12
12
|
chunkr_ai/api/protocol.py,sha256=XKS9RmtvBpJItYhPg18qlOCKpaSHdOuQTRSUxAdUz2g,479
|
13
|
-
chunkr_ai/api/task.py,sha256=
|
14
|
-
chunkr_ai
|
15
|
-
chunkr_ai
|
16
|
-
chunkr_ai-0.0.
|
17
|
-
chunkr_ai-0.0.
|
18
|
-
chunkr_ai-0.0.
|
13
|
+
chunkr_ai/api/task.py,sha256=EB6RK8ms7EaNj57tNJZoNgNMHGWKXFhkQ1WC7gk5ht4,6059
|
14
|
+
chunkr_ai/api/task_async.py,sha256=Dd-Fenie0Q6GxXce7OlXvuQ14NQ58F_0b9P7AGKWyYA,3833
|
15
|
+
chunkr_ai/api/task_base.py,sha256=Tkk7dhIeB3ic5M9g_b-MVRdNv4XQTvajpaUy8JylQ8A,526
|
16
|
+
chunkr_ai-0.0.9.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
17
|
+
chunkr_ai-0.0.9.dist-info/METADATA,sha256=XFGPjuDARO1VYvdcyMOHhxZK1FYjEr0_ySI0Ni6tWMc,4844
|
18
|
+
chunkr_ai-0.0.9.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
19
|
+
chunkr_ai-0.0.9.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
|
20
|
+
chunkr_ai-0.0.9.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|