chunkr-ai 0.0.17__py3-none-any.whl → 0.0.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunkr_ai/__init__.py +1 -2
- chunkr_ai/api/chunkr.py +46 -39
- chunkr_ai/api/chunkr_base.py +142 -8
- chunkr_ai/api/config.py +18 -45
- chunkr_ai/api/decorators.py +58 -0
- chunkr_ai/api/misc.py +0 -2
- chunkr_ai/api/protocol.py +0 -2
- chunkr_ai/api/task_response.py +119 -0
- chunkr_ai/models.py +3 -12
- {chunkr_ai-0.0.17.dist-info → chunkr_ai-0.0.19.dist-info}/METADATA +89 -40
- chunkr_ai-0.0.19.dist-info/RECORD +17 -0
- chunkr_ai/api/base.py +0 -183
- chunkr_ai/api/chunkr_async.py +0 -120
- chunkr_ai/api/schema.py +0 -136
- chunkr_ai/api/task.py +0 -66
- chunkr_ai/api/task_async.py +0 -69
- chunkr_ai/api/task_base.py +0 -85
- chunkr_ai-0.0.17.dist-info/RECORD +0 -21
- {chunkr_ai-0.0.17.dist-info → chunkr_ai-0.0.19.dist-info}/LICENSE +0 -0
- {chunkr_ai-0.0.17.dist-info → chunkr_ai-0.0.19.dist-info}/WHEEL +0 -0
- {chunkr_ai-0.0.17.dist-info → chunkr_ai-0.0.19.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,119 @@
|
|
1
|
+
from datetime import datetime
|
2
|
+
from typing import TypeVar, Optional, Generic
|
3
|
+
from pydantic import BaseModel, PrivateAttr
|
4
|
+
import asyncio
|
5
|
+
|
6
|
+
from .config import Configuration, OutputConfiguration, OutputResponse, Status
|
7
|
+
from .protocol import ChunkrClientProtocol
|
8
|
+
from .misc import prepare_upload_data
|
9
|
+
from .decorators import anywhere, require_task
|
10
|
+
|
11
|
+
T = TypeVar("T", bound="TaskResponse")
|
12
|
+
|
13
|
+
class TaskResponse(BaseModel, Generic[T]):
    """A task record returned by the API, plus helpers that act on that task.

    The public fields are populated from the JSON payload of a task endpoint.
    A client must be attached with :meth:`with_client` before any method that
    performs a request (``poll``, ``update``, ``delete``, ``cancel``) is used.

    NOTE(review): ``anywhere`` and ``require_task`` are defined in
    ``.decorators`` and are not visible here; presumably ``anywhere`` lets the
    coroutine methods be called from synchronous code as well - confirm.
    """

    configuration: OutputConfiguration
    created_at: datetime
    expires_at: Optional[datetime]
    finished_at: Optional[datetime]
    message: str
    output: Optional[OutputResponse]
    started_at: Optional[datetime]
    status: Status
    task_id: str
    task_url: Optional[str]
    # Client used for follow-up requests; set by with_client(), not part of
    # the validated model fields.
    _client: Optional[ChunkrClientProtocol] = PrivateAttr(default=None)

    def with_client(self, client: ChunkrClientProtocol) -> T:
        """Attach ``client`` to this task and return the task for chaining."""
        self._client = client
        return self

    def _check_status(self) -> Optional[T]:
        """Helper method to check task status and handle completion/failure.

        Returns:
            ``self`` once the status is neither "Starting" nor "Processing",
            otherwise ``None`` (the task is still in progress).

        Raises:
            ValueError: If the status is "Failed"; the text is ``self.message``.
        """
        if self.status == "Failed":
            raise ValueError(self.message)
        if self.status not in ("Starting", "Processing"):
            return self
        return None

    async def _poll_request(self) -> dict:
        """Fetch the current task payload, retrying on connection errors.

        Returns:
            The decoded JSON body of a successful GET on ``self.task_url``.

        Raises:
            ValueError: If the attached client has no underlying HTTP client.
            Exception: Anything raised by the request or ``raise_for_status``
                other than ``ConnectionError``/``TimeoutError`` propagates.
        """
        # Loop until a payload is obtained. Previously a connection error fell
        # through and returned None, which made poll() fail with a TypeError
        # when expanding the payload instead of retrying as the message says.
        while True:
            try:
                if not self._client._client:
                    raise ValueError("Client not found")
                r = await self._client._client.get(
                    self.task_url, headers=self._client._headers()
                )
                r.raise_for_status()
                return r.json()
            except (ConnectionError, TimeoutError):
                # NOTE(review): if the HTTP library raises its own exception
                # hierarchy for transport failures, these built-in types may
                # never match - confirm which exceptions should be retried.
                print("Connection error while polling the task, retrying...")
                await asyncio.sleep(0.5)

    @anywhere()
    @require_task()
    async def poll(self) -> T:
        """Poll the task for completion.

        Refreshes this object in place from the server every 0.5 seconds until
        :meth:`_check_status` reports a terminal state.

        Returns:
            This task, updated with the latest server state.

        Raises:
            ValueError: If the task ends in the "Failed" state.
        """
        while True:
            j = await self._poll_request()
            updated = TaskResponse(**j).with_client(self._client)
            # Copy the refreshed field values onto this instance so existing
            # references to the task observe the new state.
            self.__dict__.update(updated.__dict__)
            if res := self._check_status():
                return res
            await asyncio.sleep(0.5)

    @anywhere()
    @require_task()
    async def update(self, config: Configuration) -> T:
        """Update the task configuration.

        Sends ``config`` with a PATCH request to ``self.task_url``, refreshes
        this object from the response and then polls until completion.
        """
        f = prepare_upload_data(None, config)
        r = await self._client._client.patch(
            self.task_url, files=f, headers=self._client._headers()
        )
        r.raise_for_status()
        updated = TaskResponse(**r.json()).with_client(self._client)
        self.__dict__.update(updated.__dict__)
        return await self.poll()

    @anywhere()
    @require_task()
    async def delete(self) -> T:
        """Delete the task.

        Issues a DELETE request to ``self.task_url`` and returns this object
        unchanged (its fields are not refreshed afterwards).
        """
        r = await self._client._client.delete(
            self.task_url, headers=self._client._headers()
        )
        r.raise_for_status()
        return self

    @anywhere()
    @require_task()
    async def cancel(self) -> T:
        """Cancel the task.

        Issues a GET request to ``{task_url}/cancel`` and then polls so the
        returned object reflects the resulting state.
        """
        r = await self._client._client.get(
            f"{self.task_url}/cancel", headers=self._client._headers()
        )
        r.raise_for_status()
        return await self.poll()

    def html(self) -> str:
        """Get the full HTML of the task"""
        return self._get_content("html")

    def markdown(self) -> str:
        """Get the full markdown of the task"""
        return self._get_content("markdown")

    def content(self) -> str:
        """Get the full content of the task"""
        return self._get_content("content")

    def _get_content(self, t: str) -> str:
        """Join the ``t`` attribute of every segment, one value per line.

        Args:
            t: Name of the segment attribute to collect (the public helpers
                pass "html", "markdown" or "content").

        Returns:
            The truthy values in chunk/segment order joined with newlines, or
            an empty string when the task has no output.
        """
        if not self.output:
            return ""
        parts = []
        for c in self.output.chunks:
            for s in c.segments:
                v = getattr(s, t)
                if v:
                    parts.append(v)
        return "\n".join(parts)
|
chunkr_ai/models.py
CHANGED
@@ -4,25 +4,20 @@ from .api.config import (
|
|
4
4
|
ChunkProcessing,
|
5
5
|
Configuration,
|
6
6
|
CroppingStrategy,
|
7
|
-
ExtractedJson,
|
8
7
|
GenerationStrategy,
|
9
8
|
GenerationConfig,
|
10
|
-
JsonSchema,
|
11
9
|
Model,
|
12
10
|
OCRResult,
|
13
11
|
OcrStrategy,
|
14
12
|
OutputResponse,
|
15
|
-
PipelineType,
|
16
|
-
Property,
|
17
13
|
Segment,
|
18
14
|
SegmentProcessing,
|
19
15
|
SegmentType,
|
20
16
|
SegmentationStrategy,
|
21
17
|
Status,
|
18
|
+
Pipeline,
|
22
19
|
)
|
23
|
-
|
24
|
-
from .api.task import TaskResponse
|
25
|
-
from .api.task_async import TaskResponseAsync
|
20
|
+
from .api.task_response import TaskResponse
|
26
21
|
|
27
22
|
__all__ = [
|
28
23
|
"BoundingBox",
|
@@ -30,21 +25,17 @@ __all__ = [
|
|
30
25
|
"ChunkProcessing",
|
31
26
|
"Configuration",
|
32
27
|
"CroppingStrategy",
|
33
|
-
"ExtractedJson",
|
34
28
|
"GenerationConfig",
|
35
29
|
"GenerationStrategy",
|
36
|
-
"JsonSchema",
|
37
30
|
"Model",
|
38
31
|
"OCRResult",
|
39
32
|
"OcrStrategy",
|
40
33
|
"OutputResponse",
|
41
|
-
"PipelineType",
|
42
|
-
"Property",
|
43
34
|
"Segment",
|
44
35
|
"SegmentProcessing",
|
45
36
|
"SegmentType",
|
46
37
|
"SegmentationStrategy",
|
47
38
|
"Status",
|
48
39
|
"TaskResponse",
|
49
|
-
"
|
40
|
+
"Pipeline",
|
50
41
|
]
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: chunkr-ai
|
3
|
-
Version: 0.0.17
|
3
|
+
Version: 0.0.19
|
4
4
|
Summary: Python client for Chunkr: open source document intelligence
|
5
5
|
Author-email: Ishaan Kapoor <ishaan@lumina.sh>
|
6
6
|
Project-URL: Homepage, https://chunkr.ai
|
@@ -11,7 +11,6 @@ Requires-Dist: pillow>=10.0.0
|
|
11
11
|
Requires-Dist: pydantic>=2.0.0
|
12
12
|
Requires-Dist: pytest-asyncio>=0.21.0
|
13
13
|
Requires-Dist: python-dotenv>=0.19.0
|
14
|
-
Requires-Dist: requests>=2.28.0
|
15
14
|
Provides-Extra: test
|
16
15
|
Requires-Dist: pytest>=7.0.0; extra == "test"
|
17
16
|
Requires-Dist: pytest-xdist>=3.0.0; extra == "test"
|
@@ -34,7 +33,7 @@ pip install chunkr-ai
|
|
34
33
|
|
35
34
|
## Usage
|
36
35
|
|
37
|
-
|
36
|
+
The `Chunkr` client works seamlessly in both synchronous and asynchronous contexts.
|
38
37
|
|
39
38
|
### Synchronous Usage
|
40
39
|
|
@@ -46,62 +45,86 @@ chunkr = Chunkr()
|
|
46
45
|
|
47
46
|
# Upload a file and wait for processing
|
48
47
|
task = chunkr.upload("document.pdf")
|
48
|
+
print(task.task_id)
|
49
49
|
|
50
|
-
#
|
51
|
-
|
50
|
+
# Create task without waiting
|
51
|
+
task = chunkr.create_task("document.pdf")
|
52
|
+
result = task.poll() # Check status when needed
|
52
53
|
|
53
|
-
#
|
54
|
-
|
55
|
-
|
56
|
-
# If you want to upload without waiting for processing
|
57
|
-
task = chunkr.start_upload("document.pdf")
|
58
|
-
# ... do other things ...
|
59
|
-
task.poll() # Check status when needed
|
54
|
+
# Clean up when done
|
55
|
+
chunkr.close()
|
60
56
|
```
|
61
57
|
|
62
58
|
### Asynchronous Usage
|
63
59
|
|
64
60
|
```python
|
65
|
-
from chunkr_ai import ChunkrAsync
|
61
|
+
from chunkr_ai import Chunkr
|
62
|
+
import asyncio
|
66
63
|
|
67
64
|
async def process_document():
|
68
65
|
# Initialize client
|
69
|
-
chunkr = ChunkrAsync()
|
66
|
+
chunkr = Chunkr()
|
67
|
+
|
68
|
+
try:
|
69
|
+
# Upload a file and wait for processing
|
70
|
+
task = await chunkr.upload("document.pdf")
|
71
|
+
print(task.task_id)
|
72
|
+
|
73
|
+
# Create task without waiting
|
74
|
+
task = await chunkr.create_task("document.pdf")
|
75
|
+
result = await task.poll() # Check status when needed
|
76
|
+
finally:
|
77
|
+
await chunkr.close()
|
70
78
|
|
71
|
-
|
72
|
-
|
79
|
+
# Run the async function
|
80
|
+
asyncio.run(process_document())
|
81
|
+
```
|
73
82
|
|
74
|
-
|
75
|
-
print(task)
|
83
|
+
### Concurrent Processing
|
76
84
|
|
77
|
-
|
78
|
-
output = task.output
|
85
|
+
The client supports both async concurrency and multiprocessing:
|
79
86
|
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
87
|
+
```python
|
88
|
+
# Async concurrency
|
89
|
+
async def process_multiple():
|
90
|
+
chunkr = Chunkr()
|
91
|
+
try:
|
92
|
+
tasks = [
|
93
|
+
chunkr.upload("doc1.pdf"),
|
94
|
+
chunkr.upload("doc2.pdf"),
|
95
|
+
chunkr.upload("doc3.pdf")
|
96
|
+
]
|
97
|
+
results = await asyncio.gather(*tasks)
|
98
|
+
finally:
|
99
|
+
await chunkr.close()
|
100
|
+
|
101
|
+
# Multiprocessing
|
102
|
+
from multiprocessing import Pool
|
103
|
+
|
104
|
+
def process_file(path):
|
105
|
+
chunkr = Chunkr()
|
106
|
+
try:
|
107
|
+
return chunkr.upload(path)
|
108
|
+
finally:
|
109
|
+
chunkr.close()
|
110
|
+
|
111
|
+
with Pool(processes=3) as pool:
|
112
|
+
results = pool.map(process_file, ["doc1.pdf", "doc2.pdf", "doc3.pdf"])
|
84
113
|
```
|
85
114
|
|
86
|
-
###
|
115
|
+
### Input Types
|
87
116
|
|
88
|
-
|
117
|
+
The client supports various input types:
|
89
118
|
|
90
119
|
```python
|
91
|
-
#
|
120
|
+
# File path
|
92
121
|
chunkr.upload("document.pdf")
|
93
122
|
|
94
|
-
#
|
123
|
+
# Opened file
|
95
124
|
with open("document.pdf", "rb") as f:
|
96
125
|
chunkr.upload(f)
|
97
126
|
|
98
|
-
#
|
99
|
-
chunkr.upload("https://example.com/document.pdf")
|
100
|
-
|
101
|
-
# Upload from base64 string
|
102
|
-
chunkr.upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
|
103
|
-
|
104
|
-
# Upload an image
|
127
|
+
# PIL Image
|
105
128
|
from PIL import Image
|
106
129
|
img = Image.open("photo.jpg")
|
107
130
|
chunkr.upload(img)
|
@@ -112,9 +135,13 @@ chunkr.upload(img)
|
|
112
135
|
You can customize the processing behavior by passing a `Configuration` object:
|
113
136
|
|
114
137
|
```python
|
115
|
-
from chunkr_ai.models import
|
138
|
+
from chunkr_ai.models import (
|
139
|
+
Configuration,
|
140
|
+
OcrStrategy,
|
141
|
+
SegmentationStrategy,
|
142
|
+
GenerationStrategy
|
143
|
+
)
|
116
144
|
|
117
|
-
# Basic configuration
|
118
145
|
config = Configuration(
|
119
146
|
ocr_strategy=OcrStrategy.AUTO,
|
120
147
|
segmentation_strategy=SegmentationStrategy.LAYOUT_ANALYSIS,
|
@@ -122,8 +149,9 @@ config = Configuration(
|
|
122
149
|
expires_in=3600, # seconds
|
123
150
|
)
|
124
151
|
|
125
|
-
#
|
126
|
-
task = chunkr.upload("document.pdf", config)
|
152
|
+
# Works in both sync and async contexts
|
153
|
+
task = chunkr.upload("document.pdf", config) # sync
|
154
|
+
task = await chunkr.upload("document.pdf", config) # async
|
127
155
|
```
|
128
156
|
|
129
157
|
#### Available Configuration Examples
|
@@ -181,7 +209,7 @@ task = chunkr.upload("document.pdf", config)
|
|
181
209
|
)
|
182
210
|
```
|
183
211
|
|
184
|
-
## Environment
|
212
|
+
## Environment Setup
|
185
213
|
|
186
214
|
You can provide your API key and URL in several ways:
|
187
215
|
1. Environment variables: `CHUNKR_API_KEY` and `CHUNKR_URL`
|
@@ -193,3 +221,24 @@ chunkr = Chunkr(
|
|
193
221
|
url="https://api.chunkr.ai"
|
194
222
|
)
|
195
223
|
```
|
224
|
+
|
225
|
+
## Resource Management
|
226
|
+
|
227
|
+
It's recommended to properly close the client when you're done:
|
228
|
+
|
229
|
+
```python
|
230
|
+
# Sync context
|
231
|
+
chunkr = Chunkr()
|
232
|
+
try:
|
233
|
+
result = chunkr.upload("document.pdf")
|
234
|
+
finally:
|
235
|
+
chunkr.close()
|
236
|
+
|
237
|
+
# Async context
|
238
|
+
async def process():
|
239
|
+
chunkr = Chunkr()
|
240
|
+
try:
|
241
|
+
result = await chunkr.upload("document.pdf")
|
242
|
+
finally:
|
243
|
+
await chunkr.close()
|
244
|
+
```
|
@@ -0,0 +1,17 @@
|
|
1
|
+
chunkr_ai/__init__.py,sha256=6KpYv2lmD6S5z2kc9pqwuLP5VDHmOuu2qDZArUIhb1s,53
|
2
|
+
chunkr_ai/models.py,sha256=sEsnoJaL6wz-4R-cYg2WNl6Wmj4Ad_F8B0QuK9t2sZ8,749
|
3
|
+
chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
|
+
chunkr_ai/api/api.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
+
chunkr_ai/api/auth.py,sha256=hlv0GiUmlsbFO1wLL9sslqOnsBSoBqkL_6Mk2SDvxgE,413
|
6
|
+
chunkr_ai/api/chunkr.py,sha256=V56SP8qs7J2QKRCRM9NGlyA1TtDTdFmGYZWbwbFTK_I,2674
|
7
|
+
chunkr_ai/api/chunkr_base.py,sha256=TDqEwCCfgshggi_Mzv76FhPj5z21QP8EVj7siczvfao,9826
|
8
|
+
chunkr_ai/api/config.py,sha256=NmPTsDvcjkvNx0gNzDTz-oFG5rQC7jm-H70O_crJCw8,4478
|
9
|
+
chunkr_ai/api/decorators.py,sha256=PzaTaPBXUMHoSLz6P0sL5JXANFSJff2vjvESKNiOGQY,2566
|
10
|
+
chunkr_ai/api/misc.py,sha256=wUG4SpfEEo7NcVK47gmw42dRy9zT5F9S2DtVC4T4ERs,4877
|
11
|
+
chunkr_ai/api/protocol.py,sha256=Nt8aWr4ouVwCvoLqVI5vnXJhT2cvxt0sQC-svUk2G5w,458
|
12
|
+
chunkr_ai/api/task_response.py,sha256=I0_XJ6WYYu_TwbaSF95wqRPaOm2PhgMKnarxjAx-BZI,3857
|
13
|
+
chunkr_ai-0.0.19.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
14
|
+
chunkr_ai-0.0.19.dist-info/METADATA,sha256=azezU0CsGwjHNZet5YMKkY8G6yDVWeclXphs3ESTpSw,5696
|
15
|
+
chunkr_ai-0.0.19.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
16
|
+
chunkr_ai-0.0.19.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
|
17
|
+
chunkr_ai-0.0.19.dist-info/RECORD,,
|
chunkr_ai/api/base.py
DELETED
@@ -1,183 +0,0 @@
|
|
1
|
-
from .config import Configuration
|
2
|
-
from .task import TaskResponse
|
3
|
-
from .auth import HeadersMixin
|
4
|
-
from abc import abstractmethod
|
5
|
-
from dotenv import load_dotenv
|
6
|
-
import io
|
7
|
-
import json
|
8
|
-
import os
|
9
|
-
from pathlib import Path
|
10
|
-
from PIL import Image
|
11
|
-
import requests
|
12
|
-
from typing import BinaryIO, Tuple, Union
|
13
|
-
|
14
|
-
|
15
|
-
class ChunkrBase(HeadersMixin):
    """Base class with shared functionality for Chunkr API clients."""

    def __init__(self, url: str = None, api_key: str = None):
        """Resolve the API URL and key from arguments or the environment.

        Args:
            url: Base URL of the API. Falls back to the ``CHUNKR_URL``
                environment variable, then to ``https://api.chunkr.ai``.
            api_key: API key. Falls back to ``CHUNKR_API_KEY``.

        Raises:
            ValueError: If no API key can be found.
        """
        # Load a .env file (if any) before reading the environment variables.
        load_dotenv()
        self.url = url or os.getenv("CHUNKR_URL") or "https://api.chunkr.ai"
        self._api_key = api_key or os.getenv("CHUNKR_API_KEY")
        if not self._api_key:
            raise ValueError(
                "API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai"
            )

        # Normalise so endpoint paths can be appended with a single "/".
        self.url = self.url.rstrip("/")

    def _prepare_file(
        self, file: Union[str, Path, BinaryIO, Image.Image]
    ) -> Tuple[str, BinaryIO]:
        """Convert various file types into a tuple of (filename, file-like object).

        Args:
            file: Input file, can be:
                - String or Path to a file
                - URL string starting with http:// or https://
                - Base64 data URI string (``data:<mime>;base64,<data>``)
                - Opened binary file (mode='rb')
                - PIL/Pillow Image object

        Returns:
            Tuple[str, BinaryIO]: (filename, file-like object) ready for upload.
            For a file path the returned object is a newly opened handle that
            the caller is responsible for closing.

        Raises:
            FileNotFoundError: If the file path doesn't exist
            TypeError: If the file type is not supported
            ValueError: If the base64 payload is invalid
            ValueError: If the MIME type is unsupported
            requests.RequestException: If downloading a URL fails (raised by
                ``requests.get`` / ``raise_for_status``)
        """
        # Handle URLs
        if isinstance(file, str) and file.startswith(("http://", "https://")):
            response = requests.get(file)
            response.raise_for_status()
            filename = Path(file.split("/")[-1]).name or "downloaded_file"
            return filename, io.BytesIO(response.content)

        # Handle base64 strings (";base64," already implies a comma is present)
        if isinstance(file, str) and ";base64," in file:
            import base64

            # Split header and data
            header, base64_data = file.split(",", 1)
            mime_type = header.split(":")[-1].split(";")[0].lower()

            # Map MIME types to file extensions
            mime_to_ext = {
                "application/pdf": "pdf",
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
                "application/msword": "doc",
                "application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
                "application/vnd.ms-powerpoint": "ppt",
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
                "application/vnd.ms-excel": "xls",
                "image/jpeg": "jpg",
                "image/png": "png",
                "image/jpg": "jpg",
            }

            # Checked outside the decode handler so this error is no longer
            # mislabelled as an "Invalid base64 string".
            if mime_type not in mime_to_ext:
                raise ValueError(f"Unsupported MIME type: {mime_type}")

            try:
                file_bytes = base64.b64decode(base64_data)
            except ValueError as e:  # binascii.Error is a ValueError subclass
                raise ValueError(f"Invalid base64 string: {str(e)}") from e

            return f"file.{mime_to_ext[mime_type]}", io.BytesIO(file_bytes)

        # Handle file paths
        if isinstance(file, (str, Path)):
            path = Path(file).resolve()
            if not path.exists():
                raise FileNotFoundError(f"File not found: {file}")
            return path.name, open(path, "rb")

        # Handle PIL Images
        if isinstance(file, Image.Image):
            img_byte_arr = io.BytesIO()
            image_format = file.format or "PNG"
            file.save(img_byte_arr, format=image_format)
            # Rewind so the upload reads from the start of the buffer.
            img_byte_arr.seek(0)
            return f"image.{image_format.lower()}", img_byte_arr

        # Handle file-like objects
        if hasattr(file, "read") and hasattr(file, "seek"):
            # Try to get the filename from the file object if possible. The
            # name may be missing or not path-like (e.g. an integer file
            # descriptor), in which case a generic name is used.
            name = getattr(file, "name", None)
            if not isinstance(name, (str, os.PathLike)):
                name = "document"
            return Path(name).name or "document", file

        raise TypeError(f"Unsupported file type: {type(file)}")

    def _prepare_upload_data(
        self,
        file: Union[str, Path, BinaryIO, Image.Image],
        config: Configuration = None,
    ) -> Tuple[dict, dict]:
        """Prepare files and data dictionaries for upload.

        Args:
            file: The file to upload
            config: Optional configuration settings

        Returns:
            Tuple[dict, dict]: (files dict, data dict) ready for upload.
            Dict-valued configuration entries are JSON-encoded into the files
            dict; every other entry goes into the data dict.
        """
        filename, file_obj = self._prepare_file(file)
        files = {"file": (filename, file_obj)}
        data = {}

        if config:
            config_dict = config.model_dump(mode="json", exclude_none=True)
            for key, value in config_dict.items():
                if isinstance(value, dict):
                    # Nested settings are sent as their own JSON form part.
                    files[key] = (None, json.dumps(value), "application/json")
                else:
                    data[key] = value

        return files, data

    # NOTE(review): @abstractmethod is only enforced when the class uses
    # ABCMeta; whether HeadersMixin provides that is not visible here.
    @abstractmethod
    def upload(
        self,
        file: Union[str, Path, BinaryIO, Image.Image],
        config: Configuration = None,
    ) -> TaskResponse:
        """Upload a file and wait for processing to complete.

        Must be implemented by subclasses.
        """
        pass

    @abstractmethod
    def start_upload(
        self,
        file: Union[str, Path, BinaryIO, Image.Image],
        config: Configuration = None,
    ) -> TaskResponse:
        """Upload a file for processing and immediately return the task response.

        Must be implemented by subclasses.
        """
        pass

    @abstractmethod
    def get_task(self, task_id: str) -> TaskResponse:
        """Get a task response by its ID.

        Must be implemented by subclasses.
        """
        pass
|
chunkr_ai/api/chunkr_async.py
DELETED
@@ -1,120 +0,0 @@
|
|
1
|
-
from .chunkr_base import ChunkrBase
|
2
|
-
from .config import Configuration
|
3
|
-
from .misc import prepare_upload_data
|
4
|
-
from .task_async import TaskResponseAsync
|
5
|
-
import httpx
|
6
|
-
from pathlib import Path
|
7
|
-
from PIL import Image
|
8
|
-
from typing import Union, BinaryIO
|
9
|
-
|
10
|
-
|
11
|
-
class ChunkrAsync(ChunkrBase):
    """Asynchronous Chunkr API client

    Every request method makes sure an open ``httpx.AsyncClient`` exists
    before sending, and closes that client if the operation raises, before
    re-raising the original exception.
    """

    def __init__(self, url: str = None, api_key: str = None):
        """Initialise the base client and create the HTTP client."""
        super().__init__(url, api_key)
        self._client = httpx.AsyncClient()

    def _ensure_client(self) -> httpx.AsyncClient:
        """Return an open HTTP client, creating a new one if it is missing or closed."""
        if not self._client or self._client.is_closed:
            self._client = httpx.AsyncClient()
        return self._client

    async def upload(
        self,
        file: Union[str, Path, BinaryIO, Image.Image],
        config: Configuration = None,
    ) -> TaskResponseAsync:
        """Create a task for ``file`` and poll it before returning."""
        self._ensure_client()
        try:
            task = await self.create_task(file, config)
            return await task.poll()
        except Exception:
            await self._client.aclose()
            raise

    async def update(self, task_id: str, config: Configuration) -> TaskResponseAsync:
        """Update the configuration of task ``task_id`` and poll it before returning."""
        self._ensure_client()
        try:
            task = await self.update_task(task_id, config)
            return await task.poll()
        except Exception:
            await self._client.aclose()
            raise

    async def create_task(
        self,
        file: Union[str, Path, BinaryIO, Image.Image],
        config: Configuration = None,
    ) -> TaskResponseAsync:
        """POST ``file`` (and optional ``config``) to ``/api/v1/task`` without polling."""
        client = self._ensure_client()
        try:
            files = prepare_upload_data(file, config)
            r = await client.post(
                f"{self.url}/api/v1/task", files=files, headers=self._headers()
            )
            r.raise_for_status()
            return TaskResponseAsync(**r.json()).with_client(self)
        except Exception:
            await self._client.aclose()
            raise

    async def update_task(
        self, task_id: str, config: Configuration
    ) -> TaskResponseAsync:
        """PATCH ``config`` to ``/api/v1/task/{task_id}`` without polling."""
        client = self._ensure_client()
        try:
            files = prepare_upload_data(None, config)
            r = await client.patch(
                f"{self.url}/api/v1/task/{task_id}",
                files=files,
                headers=self._headers(),
            )

            r.raise_for_status()
            return TaskResponseAsync(**r.json()).with_client(self)
        except Exception:
            await self._client.aclose()
            raise

    async def get_task(self, task_id: str) -> TaskResponseAsync:
        """GET ``/api/v1/task/{task_id}`` and return it with this client attached."""
        client = self._ensure_client()
        try:
            r = await client.get(
                f"{self.url}/api/v1/task/{task_id}", headers=self._headers()
            )
            r.raise_for_status()
            return TaskResponseAsync(**r.json()).with_client(self)
        except Exception:
            await self._client.aclose()
            raise

    async def delete_task(self, task_id: str) -> None:
        """Send DELETE to ``/api/v1/task/{task_id}``."""
        client = self._ensure_client()
        try:
            r = await client.delete(
                f"{self.url}/api/v1/task/{task_id}", headers=self._headers()
            )
            r.raise_for_status()
        except Exception:
            await self._client.aclose()
            raise

    async def cancel_task(self, task_id: str) -> None:
        """Send GET to ``/api/v1/task/{task_id}/cancel``."""
        client = self._ensure_client()
        try:
            r = await client.get(
                f"{self.url}/api/v1/task/{task_id}/cancel", headers=self._headers()
            )
            r.raise_for_status()
        except Exception:
            await self._client.aclose()
            raise

    async def __aenter__(self):
        """Enter the async context manager; returns this client."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Close the HTTP client on context exit, if one exists."""
        # Guarded like the request methods, which all treat a missing client
        # as a valid state.
        if self._client:
            await self._client.aclose()
|