chunkr-ai 0.0.12__tar.gz → 0.0.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {chunkr_ai-0.0.12/src/chunkr_ai.egg-info → chunkr_ai-0.0.14}/PKG-INFO +2 -3
- {chunkr_ai-0.0.12 → chunkr_ai-0.0.14}/README.md +1 -1
- {chunkr_ai-0.0.12 → chunkr_ai-0.0.14}/pyproject.toml +1 -2
- chunkr_ai-0.0.14/src/chunkr_ai/api/api.py +0 -0
- chunkr_ai-0.0.14/src/chunkr_ai/api/base.py +173 -0
- chunkr_ai-0.0.14/src/chunkr_ai/api/chunkr.py +77 -0
- chunkr_ai-0.0.14/src/chunkr_ai/api/chunkr_async.py +114 -0
- chunkr_ai-0.0.12/src/chunkr_ai/api/chunkr.py → chunkr_ai-0.0.14/src/chunkr_ai/api/chunkr_base.py +63 -75
- {chunkr_ai-0.0.12 → chunkr_ai-0.0.14}/src/chunkr_ai/api/protocol.py +4 -4
- {chunkr_ai-0.0.12 → chunkr_ai-0.0.14}/src/chunkr_ai/api/task.py +12 -4
- {chunkr_ai-0.0.12 → chunkr_ai-0.0.14}/src/chunkr_ai/api/task_async.py +11 -1
- {chunkr_ai-0.0.12 → chunkr_ai-0.0.14}/src/chunkr_ai/api/task_base.py +4 -5
- {chunkr_ai-0.0.12 → chunkr_ai-0.0.14/src/chunkr_ai.egg-info}/PKG-INFO +2 -3
- {chunkr_ai-0.0.12 → chunkr_ai-0.0.14}/src/chunkr_ai.egg-info/SOURCES.txt +2 -0
- {chunkr_ai-0.0.12 → chunkr_ai-0.0.14}/src/chunkr_ai.egg-info/requires.txt +0 -1
- {chunkr_ai-0.0.12 → chunkr_ai-0.0.14}/tests/test_chunkr.py +15 -25
- chunkr_ai-0.0.12/src/chunkr_ai/api/chunkr_async.py +0 -144
- chunkr_ai-0.0.12/src/chunkr_ai/api/chunkr_base.py +0 -85
- {chunkr_ai-0.0.12 → chunkr_ai-0.0.14}/LICENSE +0 -0
- {chunkr_ai-0.0.12 → chunkr_ai-0.0.14}/setup.cfg +0 -0
- {chunkr_ai-0.0.12 → chunkr_ai-0.0.14}/src/chunkr_ai/__init__.py +0 -0
- {chunkr_ai-0.0.12 → chunkr_ai-0.0.14}/src/chunkr_ai/api/__init__.py +0 -0
- {chunkr_ai-0.0.12 → chunkr_ai-0.0.14}/src/chunkr_ai/api/auth.py +0 -0
- {chunkr_ai-0.0.12 → chunkr_ai-0.0.14}/src/chunkr_ai/api/config.py +0 -0
- {chunkr_ai-0.0.12 → chunkr_ai-0.0.14}/src/chunkr_ai/api/misc.py +0 -0
- {chunkr_ai-0.0.12 → chunkr_ai-0.0.14}/src/chunkr_ai/api/schema.py +0 -0
- {chunkr_ai-0.0.12 → chunkr_ai-0.0.14}/src/chunkr_ai/models.py +0 -0
- {chunkr_ai-0.0.12 → chunkr_ai-0.0.14}/src/chunkr_ai.egg-info/dependency_links.txt +0 -0
- {chunkr_ai-0.0.12 → chunkr_ai-0.0.14}/src/chunkr_ai.egg-info/top_level.txt +0 -0
@@ -1,13 +1,12 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: chunkr-ai
|
3
|
-
Version: 0.0.12
|
3
|
+
Version: 0.0.14
|
4
4
|
Summary: Python client for Chunkr: open source document intelligence
|
5
5
|
Author-email: Ishaan Kapoor <ishaan@lumina.sh>
|
6
6
|
Project-URL: Homepage, https://chunkr.ai
|
7
7
|
Description-Content-Type: text/markdown
|
8
8
|
License-File: LICENSE
|
9
9
|
Requires-Dist: httpx>=0.25.0
|
10
|
-
Requires-Dist: httpx>=0.25.0
|
11
10
|
Requires-Dist: pillow>=10.0.0
|
12
11
|
Requires-Dist: pydantic>=2.0.0
|
13
12
|
Requires-Dist: pytest-asyncio>=0.21.0
|
@@ -81,7 +80,7 @@ async def process_document():
|
|
81
80
|
# If you want to upload without waiting for processing
|
82
81
|
task = await chunkr.start_upload("document.pdf")
|
83
82
|
# ... do other things ...
|
84
|
-
await task.
|
83
|
+
await task.poll() # Check status when needed
|
85
84
|
```
|
86
85
|
|
87
86
|
### Additional Features
|
@@ -62,7 +62,7 @@ async def process_document():
|
|
62
62
|
# If you want to upload without waiting for processing
|
63
63
|
task = await chunkr.start_upload("document.pdf")
|
64
64
|
# ... do other things ...
|
65
|
-
await task.
|
65
|
+
await task.poll() # Check status when needed
|
66
66
|
```
|
67
67
|
|
68
68
|
### Additional Features
|
@@ -4,14 +4,13 @@ build-backend = "setuptools.build_meta"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "chunkr-ai"
|
7
|
-
version = "0.0.12"
|
7
|
+
version = "0.0.14"
|
8
8
|
authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
|
9
9
|
description = "Python client for Chunkr: open source document intelligence"
|
10
10
|
readme = "README.md"
|
11
11
|
license = {"file" = "LICENSE"}
|
12
12
|
urls = {Homepage = "https://chunkr.ai"}
|
13
13
|
dependencies = [
|
14
|
-
"httpx>=0.25.0",
|
15
14
|
"httpx>=0.25.0",
|
16
15
|
"pillow>=10.0.0",
|
17
16
|
"pydantic>=2.0.0",
|
File without changes
|
@@ -0,0 +1,173 @@
|
|
1
|
+
from .config import Configuration
|
2
|
+
from .task import TaskResponse
|
3
|
+
from .auth import HeadersMixin
|
4
|
+
from abc import abstractmethod
|
5
|
+
from dotenv import load_dotenv
|
6
|
+
import io
|
7
|
+
import json
|
8
|
+
import os
|
9
|
+
from pathlib import Path
|
10
|
+
from PIL import Image
|
11
|
+
import requests
|
12
|
+
from typing import BinaryIO, Tuple, Union
|
13
|
+
|
14
|
+
class ChunkrBase(HeadersMixin):
|
15
|
+
"""Base class with shared functionality for Chunkr API clients."""
|
16
|
+
|
17
|
+
def __init__(self, url: str = None, api_key: str = None):
|
18
|
+
load_dotenv()
|
19
|
+
self.url = (
|
20
|
+
url or
|
21
|
+
os.getenv('CHUNKR_URL') or
|
22
|
+
'https://api.chunkr.ai'
|
23
|
+
)
|
24
|
+
self._api_key = (
|
25
|
+
api_key or
|
26
|
+
os.getenv('CHUNKR_API_KEY')
|
27
|
+
)
|
28
|
+
if not self._api_key:
|
29
|
+
raise ValueError("API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai")
|
30
|
+
|
31
|
+
self.url = self.url.rstrip("/")
|
32
|
+
|
33
|
+
def _prepare_file(
|
34
|
+
self,
|
35
|
+
file: Union[str, Path, BinaryIO, Image.Image]
|
36
|
+
) -> Tuple[str, BinaryIO]:
|
37
|
+
"""Convert various file types into a tuple of (filename, file-like object).
|
38
|
+
|
39
|
+
Args:
|
40
|
+
file: Input file, can be:
|
41
|
+
- String or Path to a file
|
42
|
+
- URL string starting with http:// or https://
|
43
|
+
- Base64 string
|
44
|
+
- Opened binary file (mode='rb')
|
45
|
+
- PIL/Pillow Image object
|
46
|
+
|
47
|
+
Returns:
|
48
|
+
Tuple[str, BinaryIO]: (filename, file-like object) ready for upload
|
49
|
+
|
50
|
+
Raises:
|
51
|
+
FileNotFoundError: If the file path doesn't exist
|
52
|
+
TypeError: If the file type is not supported
|
53
|
+
ValueError: If the URL is invalid or unreachable
|
54
|
+
ValueError: If the MIME type is unsupported
|
55
|
+
"""
|
56
|
+
# Handle URLs
|
57
|
+
if isinstance(file, str) and (file.startswith('http://') or file.startswith('https://')):
|
58
|
+
response = requests.get(file)
|
59
|
+
response.raise_for_status()
|
60
|
+
file_obj = io.BytesIO(response.content)
|
61
|
+
filename = Path(file.split('/')[-1]).name or 'downloaded_file'
|
62
|
+
return filename, file_obj
|
63
|
+
|
64
|
+
# Handle base64 strings
|
65
|
+
if isinstance(file, str) and ',' in file and ';base64,' in file:
|
66
|
+
try:
|
67
|
+
# Split header and data
|
68
|
+
header, base64_data = file.split(',', 1)
|
69
|
+
import base64
|
70
|
+
file_bytes = base64.b64decode(base64_data)
|
71
|
+
file_obj = io.BytesIO(file_bytes)
|
72
|
+
|
73
|
+
# Try to determine format from header
|
74
|
+
format = 'bin'
|
75
|
+
mime_type = header.split(':')[-1].split(';')[0].lower()
|
76
|
+
|
77
|
+
# Map MIME types to file extensions
|
78
|
+
mime_to_ext = {
|
79
|
+
'application/pdf': 'pdf',
|
80
|
+
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
|
81
|
+
'application/msword': 'doc',
|
82
|
+
'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
|
83
|
+
'application/vnd.ms-powerpoint': 'ppt',
|
84
|
+
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
|
85
|
+
'application/vnd.ms-excel': 'xls',
|
86
|
+
'image/jpeg': 'jpg',
|
87
|
+
'image/png': 'png',
|
88
|
+
'image/jpg': 'jpg'
|
89
|
+
}
|
90
|
+
|
91
|
+
if mime_type in mime_to_ext:
|
92
|
+
format = mime_to_ext[mime_type]
|
93
|
+
else:
|
94
|
+
raise ValueError(f"Unsupported MIME type: {mime_type}")
|
95
|
+
|
96
|
+
return f"file.{format}", file_obj
|
97
|
+
except Exception as e:
|
98
|
+
raise ValueError(f"Invalid base64 string: {str(e)}")
|
99
|
+
|
100
|
+
# Handle file paths
|
101
|
+
if isinstance(file, (str, Path)):
|
102
|
+
path = Path(file).resolve()
|
103
|
+
if not path.exists():
|
104
|
+
raise FileNotFoundError(f"File not found: {file}")
|
105
|
+
return path.name, open(path, 'rb')
|
106
|
+
|
107
|
+
# Handle PIL Images
|
108
|
+
if isinstance(file, Image.Image):
|
109
|
+
img_byte_arr = io.BytesIO()
|
110
|
+
format = file.format or 'PNG'
|
111
|
+
file.save(img_byte_arr, format=format)
|
112
|
+
img_byte_arr.seek(0)
|
113
|
+
return f"image.{format.lower()}", img_byte_arr
|
114
|
+
|
115
|
+
# Handle file-like objects
|
116
|
+
if hasattr(file, 'read') and hasattr(file, 'seek'):
|
117
|
+
# Try to get the filename from the file object if possible
|
118
|
+
name = getattr(file, 'name', 'document') if hasattr(file, 'name') else 'document'
|
119
|
+
return Path(name).name, file
|
120
|
+
|
121
|
+
raise TypeError(f"Unsupported file type: {type(file)}")
|
122
|
+
|
123
|
+
def _prepare_upload_data(
|
124
|
+
self,
|
125
|
+
file: Union[str, Path, BinaryIO, Image.Image],
|
126
|
+
config: Configuration = None
|
127
|
+
) -> Tuple[dict, dict]:
|
128
|
+
"""Prepare files and data dictionaries for upload.
|
129
|
+
|
130
|
+
Args:
|
131
|
+
file: The file to upload
|
132
|
+
config: Optional configuration settings
|
133
|
+
|
134
|
+
Returns:
|
135
|
+
Tuple[dict, dict]: (files dict, data dict) ready for upload
|
136
|
+
"""
|
137
|
+
filename, file_obj = self._prepare_file(file)
|
138
|
+
files = {"file": (filename, file_obj)}
|
139
|
+
data = {}
|
140
|
+
|
141
|
+
if config:
|
142
|
+
config_dict = config.model_dump(mode="json", exclude_none=True)
|
143
|
+
for key, value in config_dict.items():
|
144
|
+
if isinstance(value, dict):
|
145
|
+
files[key] = (None, json.dumps(value), 'application/json')
|
146
|
+
else:
|
147
|
+
data[key] = value
|
148
|
+
|
149
|
+
return files, data
|
150
|
+
|
151
|
+
@abstractmethod
|
152
|
+
def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
|
153
|
+
"""Upload a file and wait for processing to complete.
|
154
|
+
|
155
|
+
Must be implemented by subclasses.
|
156
|
+
"""
|
157
|
+
pass
|
158
|
+
|
159
|
+
@abstractmethod
|
160
|
+
def start_upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
|
161
|
+
"""Upload a file for processing and immediately return the task response.
|
162
|
+
|
163
|
+
Must be implemented by subclasses.
|
164
|
+
"""
|
165
|
+
pass
|
166
|
+
|
167
|
+
@abstractmethod
|
168
|
+
def get_task(self, task_id: str) -> TaskResponse:
|
169
|
+
"""Get a task response by its ID.
|
170
|
+
|
171
|
+
Must be implemented by subclasses.
|
172
|
+
"""
|
173
|
+
pass
|
@@ -0,0 +1,77 @@
|
|
1
|
+
from .chunkr_base import ChunkrBase
|
2
|
+
from .config import Configuration
|
3
|
+
from .task import TaskResponse
|
4
|
+
from pathlib import Path
|
5
|
+
from PIL import Image
|
6
|
+
import requests
|
7
|
+
from typing import Union, BinaryIO
|
8
|
+
from .misc import prepare_upload_data
|
9
|
+
|
10
|
+
class Chunkr(ChunkrBase):
|
11
|
+
"""Chunkr API client"""
|
12
|
+
|
13
|
+
def __init__(self, url: str = None, api_key: str = None):
|
14
|
+
super().__init__(url, api_key)
|
15
|
+
self._session = requests.Session()
|
16
|
+
|
17
|
+
def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
|
18
|
+
task = self.create_task(file, config)
|
19
|
+
return task.poll()
|
20
|
+
|
21
|
+
def update(self, task_id: str, config: Configuration) -> TaskResponse:
|
22
|
+
task = self.update_task(task_id, config)
|
23
|
+
return task.poll()
|
24
|
+
|
25
|
+
def create_task(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
|
26
|
+
files= prepare_upload_data(file, config)
|
27
|
+
if not self._session:
|
28
|
+
raise ValueError("Session not found")
|
29
|
+
r = self._session.post(
|
30
|
+
f"{self.url}/api/v1/task",
|
31
|
+
files=files,
|
32
|
+
headers=self._headers()
|
33
|
+
)
|
34
|
+
r.raise_for_status()
|
35
|
+
return TaskResponse(**r.json()).with_client(self)
|
36
|
+
|
37
|
+
def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
|
38
|
+
files = prepare_upload_data(None, config)
|
39
|
+
if not self._session:
|
40
|
+
raise ValueError("Session not found")
|
41
|
+
r = self._session.patch(
|
42
|
+
f"{self.url}/api/v1/task/{task_id}",
|
43
|
+
files=files,
|
44
|
+
headers=self._headers()
|
45
|
+
)
|
46
|
+
|
47
|
+
r.raise_for_status()
|
48
|
+
return TaskResponse(**r.json()).with_client(self)
|
49
|
+
|
50
|
+
def get_task(self, task_id: str) -> TaskResponse:
|
51
|
+
if not self._session:
|
52
|
+
raise ValueError("Session not found")
|
53
|
+
r = self._session.get(
|
54
|
+
f"{self.url}/api/v1/task/{task_id}",
|
55
|
+
headers=self._headers()
|
56
|
+
)
|
57
|
+
r.raise_for_status()
|
58
|
+
return TaskResponse(**r.json()).with_client(self)
|
59
|
+
|
60
|
+
|
61
|
+
def delete_task(self, task_id: str) -> None:
|
62
|
+
if not self._session:
|
63
|
+
raise ValueError("Session not found")
|
64
|
+
r = self._session.delete(
|
65
|
+
f"{self.url}/api/v1/task/{task_id}",
|
66
|
+
headers=self._headers()
|
67
|
+
)
|
68
|
+
r.raise_for_status()
|
69
|
+
|
70
|
+
def cancel_task(self, task_id: str) -> None:
|
71
|
+
if not self._session:
|
72
|
+
raise ValueError("Session not found")
|
73
|
+
r = self._session.get(
|
74
|
+
f"{self.url}/api/v1/task/{task_id}/cancel",
|
75
|
+
headers=self._headers()
|
76
|
+
)
|
77
|
+
r.raise_for_status()
|
@@ -0,0 +1,114 @@
|
|
1
|
+
from .chunkr_base import ChunkrBase
|
2
|
+
from .config import Configuration
|
3
|
+
from .misc import prepare_upload_data
|
4
|
+
from .task_async import TaskResponseAsync
|
5
|
+
import httpx
|
6
|
+
from pathlib import Path
|
7
|
+
from PIL import Image
|
8
|
+
from typing import Union, BinaryIO
|
9
|
+
|
10
|
+
class ChunkrAsync(ChunkrBase):
|
11
|
+
"""Asynchronous Chunkr API client"""
|
12
|
+
|
13
|
+
def __init__(self, url: str = None, api_key: str = None):
|
14
|
+
super().__init__(url, api_key)
|
15
|
+
self._client = httpx.AsyncClient()
|
16
|
+
|
17
|
+
async def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponseAsync:
|
18
|
+
if not self._client or self._client.is_closed:
|
19
|
+
self._client = httpx.AsyncClient()
|
20
|
+
try:
|
21
|
+
task = await self.create_task(file, config)
|
22
|
+
return await task.poll()
|
23
|
+
except Exception as e:
|
24
|
+
await self._client.aclose()
|
25
|
+
raise e
|
26
|
+
|
27
|
+
async def update(self, task_id: str, config: Configuration) -> TaskResponseAsync:
|
28
|
+
if not self._client or self._client.is_closed:
|
29
|
+
self._client = httpx.AsyncClient()
|
30
|
+
try:
|
31
|
+
task = await self.update_task(task_id, config)
|
32
|
+
return await task.poll()
|
33
|
+
except Exception as e:
|
34
|
+
await self._client.aclose()
|
35
|
+
raise e
|
36
|
+
|
37
|
+
async def create_task(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponseAsync:
|
38
|
+
if not self._client or self._client.is_closed:
|
39
|
+
self._client = httpx.AsyncClient()
|
40
|
+
try:
|
41
|
+
files = prepare_upload_data(file, config)
|
42
|
+
r = await self._client.post(
|
43
|
+
f"{self.url}/api/v1/task",
|
44
|
+
files=files,
|
45
|
+
headers=self._headers()
|
46
|
+
)
|
47
|
+
r.raise_for_status()
|
48
|
+
return TaskResponseAsync(**r.json()).with_client(self)
|
49
|
+
except Exception as e:
|
50
|
+
await self._client.aclose()
|
51
|
+
raise e
|
52
|
+
|
53
|
+
async def update_task(self, task_id: str, config: Configuration) -> TaskResponseAsync:
|
54
|
+
if not self._client or self._client.is_closed:
|
55
|
+
self._client = httpx.AsyncClient()
|
56
|
+
try:
|
57
|
+
files = prepare_upload_data(None, config)
|
58
|
+
r = await self._client.patch(
|
59
|
+
f"{self.url}/api/v1/task/{task_id}",
|
60
|
+
files=files,
|
61
|
+
headers=self._headers()
|
62
|
+
)
|
63
|
+
|
64
|
+
r.raise_for_status()
|
65
|
+
return TaskResponseAsync(**r.json()).with_client(self)
|
66
|
+
except Exception as e:
|
67
|
+
await self._client.aclose()
|
68
|
+
raise e
|
69
|
+
|
70
|
+
async def get_task(self, task_id: str) -> TaskResponseAsync:
|
71
|
+
if not self._client or self._client.is_closed:
|
72
|
+
self._client = httpx.AsyncClient()
|
73
|
+
try:
|
74
|
+
r = await self._client.get(
|
75
|
+
f"{self.url}/api/v1/task/{task_id}",
|
76
|
+
headers=self._headers()
|
77
|
+
)
|
78
|
+
r.raise_for_status()
|
79
|
+
return TaskResponseAsync(**r.json()).with_client(self)
|
80
|
+
except Exception as e:
|
81
|
+
await self._client.aclose()
|
82
|
+
raise e
|
83
|
+
|
84
|
+
async def delete_task(self, task_id: str) -> None:
|
85
|
+
if not self._client or self._client.is_closed:
|
86
|
+
self._client = httpx.AsyncClient()
|
87
|
+
try:
|
88
|
+
r = await self._client.delete(
|
89
|
+
f"{self.url}/api/v1/task/{task_id}",
|
90
|
+
headers=self._headers()
|
91
|
+
)
|
92
|
+
r.raise_for_status()
|
93
|
+
except Exception as e:
|
94
|
+
await self._client.aclose()
|
95
|
+
raise e
|
96
|
+
|
97
|
+
async def cancel_task(self, task_id: str) -> None:
|
98
|
+
if not self._client or self._client.is_closed:
|
99
|
+
self._client = httpx.AsyncClient()
|
100
|
+
try:
|
101
|
+
r = await self._client.get(
|
102
|
+
f"{self.url}/api/v1/task/{task_id}/cancel",
|
103
|
+
headers=self._headers()
|
104
|
+
)
|
105
|
+
r.raise_for_status()
|
106
|
+
except Exception as e:
|
107
|
+
await self._client.aclose()
|
108
|
+
raise e
|
109
|
+
|
110
|
+
async def __aenter__(self):
|
111
|
+
return self
|
112
|
+
|
113
|
+
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
114
|
+
await self._client.aclose()
|
chunkr_ai-0.0.12/src/chunkr_ai/api/chunkr.py → chunkr_ai-0.0.14/src/chunkr_ai/api/chunkr_base.py
RENAMED
@@ -1,20 +1,35 @@
|
|
1
|
-
from .chunkr_base import ChunkrBase
|
2
1
|
from .config import Configuration
|
3
2
|
from .task import TaskResponse
|
3
|
+
from .task_async import TaskResponseAsync
|
4
|
+
from .auth import HeadersMixin
|
5
|
+
from abc import abstractmethod
|
6
|
+
from dotenv import load_dotenv
|
7
|
+
import os
|
4
8
|
from pathlib import Path
|
5
9
|
from PIL import Image
|
6
|
-
import requests
|
7
|
-
from typing import Union, BinaryIO
|
8
|
-
from .misc import prepare_upload_data
|
10
|
+
from typing import BinaryIO, Union
|
9
11
|
|
10
|
-
class Chunkr(ChunkrBase):
|
11
|
-
"""Chunkr API
|
12
|
+
class ChunkrBase(HeadersMixin):
|
13
|
+
"""Base class with shared functionality for Chunkr API clients."""
|
12
14
|
|
13
15
|
def __init__(self, url: str = None, api_key: str = None):
|
14
|
-
|
15
|
-
self.
|
16
|
+
load_dotenv()
|
17
|
+
self.url = (
|
18
|
+
url or
|
19
|
+
os.getenv('CHUNKR_URL') or
|
20
|
+
'https://api.chunkr.ai'
|
21
|
+
)
|
22
|
+
self._api_key = (
|
23
|
+
api_key or
|
24
|
+
os.getenv('CHUNKR_API_KEY')
|
25
|
+
)
|
26
|
+
if not self._api_key:
|
27
|
+
raise ValueError("API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai")
|
28
|
+
|
29
|
+
self.url = self.url.rstrip("/")
|
16
30
|
|
17
|
-
|
31
|
+
@abstractmethod
|
32
|
+
def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> Union[TaskResponse, TaskResponseAsync]:
|
18
33
|
"""Upload a file and wait for processing to complete.
|
19
34
|
|
20
35
|
Args:
|
@@ -22,32 +37,32 @@ class Chunkr(ChunkrBase):
|
|
22
37
|
config: Configuration options for processing. Optional.
|
23
38
|
|
24
39
|
Examples:
|
25
|
-
```
|
40
|
+
```python
|
26
41
|
# Upload from file path
|
27
|
-
chunkr.upload("document.pdf")
|
42
|
+
await chunkr.upload("document.pdf")
|
28
43
|
|
44
|
+
# Upload from opened file
|
45
|
+
with open("document.pdf", "rb") as f:
|
46
|
+
await chunkr.upload(f)
|
47
|
+
|
29
48
|
# Upload from URL
|
30
|
-
chunkr.upload("https://example.com/document.pdf")
|
49
|
+
await chunkr.upload("https://example.com/document.pdf")
|
31
50
|
|
32
51
|
# Upload from base64 string (must include MIME type header)
|
33
|
-
chunkr.upload("data:application/pdf;base64,
|
34
|
-
|
35
|
-
# Upload from opened file
|
36
|
-
with open("document.pdf", "rb") as f:
|
37
|
-
chunkr.upload(f)
|
52
|
+
await chunkr.upload("data:application/pdf;base64,JVBERi0...")
|
38
53
|
|
39
54
|
# Upload an image
|
40
55
|
from PIL import Image
|
41
56
|
img = Image.open("photo.jpg")
|
42
|
-
chunkr.upload(img)
|
57
|
+
await chunkr.upload(img)
|
43
58
|
```
|
44
59
|
Returns:
|
45
60
|
TaskResponse: The completed task response
|
46
61
|
"""
|
47
|
-
|
48
|
-
return task.poll()
|
62
|
+
pass
|
49
63
|
|
50
|
-
|
64
|
+
@abstractmethod
|
65
|
+
def update(self, task_id: str, config: Configuration) -> Union[TaskResponse, TaskResponseAsync]:
|
51
66
|
"""Update a task by its ID and wait for processing to complete.
|
52
67
|
|
53
68
|
Args:
|
@@ -57,11 +72,11 @@ class Chunkr(ChunkrBase):
|
|
57
72
|
Returns:
|
58
73
|
TaskResponse: The updated task response
|
59
74
|
"""
|
60
|
-
|
61
|
-
return task.poll()
|
75
|
+
pass
|
62
76
|
|
63
|
-
|
64
|
-
|
77
|
+
@abstractmethod
|
78
|
+
def create_task(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> Union[TaskResponse, TaskResponseAsync]:
|
79
|
+
"""Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`.
|
65
80
|
|
66
81
|
Args:
|
67
82
|
file: The file to upload.
|
@@ -70,60 +85,44 @@ class Chunkr(ChunkrBase):
|
|
70
85
|
Examples:
|
71
86
|
```
|
72
87
|
# Upload from file path
|
73
|
-
task = chunkr.
|
88
|
+
task = await chunkr.create_task("document.pdf")
|
74
89
|
|
75
90
|
# Upload from opened file
|
76
91
|
with open("document.pdf", "rb") as f:
|
77
|
-
task = chunkr.
|
78
|
-
|
92
|
+
task = await chunkr.create_task(f)
|
93
|
+
|
79
94
|
# Upload from URL
|
80
|
-
task = chunkr.
|
95
|
+
task = await chunkr.create_task("https://example.com/document.pdf")
|
81
96
|
|
82
97
|
# Upload from base64 string (must include MIME type header)
|
83
|
-
task = chunkr.
|
98
|
+
task = await chunkr.create_task("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
|
84
99
|
|
85
100
|
# Upload an image
|
86
101
|
from PIL import Image
|
87
102
|
img = Image.open("photo.jpg")
|
88
|
-
task = chunkr.
|
103
|
+
task = await chunkr.create_task(img)
|
89
104
|
|
90
105
|
# Wait for the task to complete - this can be done when needed
|
91
|
-
task.poll()
|
106
|
+
await task.poll()
|
92
107
|
```
|
93
|
-
|
94
|
-
Returns:
|
95
|
-
TaskResponse: The initial task response
|
96
108
|
"""
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
)
|
103
|
-
r.raise_for_status()
|
104
|
-
return TaskResponse(**r.json()).with_client(self)
|
105
|
-
|
106
|
-
def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
|
107
|
-
"""Update a task by its ID.
|
109
|
+
pass
|
110
|
+
|
111
|
+
@abstractmethod
|
112
|
+
def update_task(self, task_id: str, config: Configuration) -> Union[TaskResponse, TaskResponseAsync]:
|
113
|
+
"""Update a task by its ID and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`.
|
108
114
|
|
109
115
|
Args:
|
110
116
|
task_id: The ID of the task to update
|
111
|
-
config:
|
117
|
+
config: Configuration options for processing. Optional.
|
112
118
|
|
113
119
|
Returns:
|
114
120
|
TaskResponse: The updated task response
|
115
121
|
"""
|
116
|
-
|
117
|
-
r = self._session.patch(
|
118
|
-
f"{self.url}/api/v1/task/{task_id}",
|
119
|
-
files=files,
|
120
|
-
headers=self._headers()
|
121
|
-
)
|
122
|
-
|
123
|
-
r.raise_for_status()
|
124
|
-
return TaskResponse(**r.json()).with_client(self)
|
122
|
+
pass
|
125
123
|
|
126
|
-
|
124
|
+
@abstractmethod
|
125
|
+
def get_task(self, task_id: str) -> Union[TaskResponse, TaskResponseAsync]:
|
127
126
|
"""Get a task response by its ID.
|
128
127
|
|
129
128
|
Args:
|
@@ -132,34 +131,23 @@ class Chunkr(ChunkrBase):
|
|
132
131
|
Returns:
|
133
132
|
TaskResponse: The task response
|
134
133
|
"""
|
135
|
-
|
136
|
-
f"{self.url}/api/v1/task/{task_id}",
|
137
|
-
headers=self._headers()
|
138
|
-
)
|
139
|
-
r.raise_for_status()
|
140
|
-
return TaskResponse(**r.json()).with_client(self)
|
141
|
-
|
134
|
+
pass
|
142
135
|
|
136
|
+
@abstractmethod
|
143
137
|
def delete_task(self, task_id: str) -> None:
|
144
138
|
"""Delete a task by its ID.
|
145
139
|
|
146
140
|
Args:
|
147
141
|
task_id: The ID of the task to delete
|
148
142
|
"""
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
)
|
153
|
-
r.raise_for_status()
|
154
|
-
|
143
|
+
pass
|
144
|
+
|
145
|
+
@abstractmethod
|
155
146
|
def cancel_task(self, task_id: str) -> None:
|
156
147
|
"""Cancel a task by its ID.
|
157
148
|
|
158
149
|
Args:
|
159
150
|
task_id: The ID of the task to cancel
|
160
151
|
"""
|
161
|
-
|
162
|
-
|
163
|
-
headers=self._headers()
|
164
|
-
)
|
165
|
-
r.raise_for_status()
|
152
|
+
pass
|
153
|
+
|
@@ -1,14 +1,14 @@
|
|
1
|
-
from typing import runtime_checkable, Protocol
|
1
|
+
from typing import Optional, runtime_checkable, Protocol
|
2
2
|
from requests import Session
|
3
|
-
from
|
3
|
+
from aiohttp import ClientSession
|
4
4
|
|
5
5
|
@runtime_checkable
|
6
6
|
class ChunkrClientProtocol(Protocol):
|
7
7
|
"""Protocol defining the interface for Chunkr clients"""
|
8
8
|
url: str
|
9
9
|
_api_key: str
|
10
|
-
_session: Session
|
11
|
-
_client:
|
10
|
+
_session: Optional[Session] = None
|
11
|
+
_client: Optional[ClientSession] = None
|
12
12
|
|
13
13
|
def get_api_key(self) -> str:
|
14
14
|
"""Get the API key"""
|
@@ -7,6 +7,10 @@ class TaskResponse(TaskBase):
|
|
7
7
|
def _poll_request(self) -> dict:
|
8
8
|
while True:
|
9
9
|
try:
|
10
|
+
if not self.task_url:
|
11
|
+
raise ValueError("Task URL not found in response")
|
12
|
+
if not self._client._session:
|
13
|
+
raise ValueError("Client session not found")
|
10
14
|
r = self._client._session.get(self.task_url, headers=self._client._headers())
|
11
15
|
r.raise_for_status()
|
12
16
|
return r.json()
|
@@ -17,10 +21,8 @@ class TaskResponse(TaskBase):
|
|
17
21
|
raise
|
18
22
|
|
19
23
|
def poll(self) -> 'TaskResponse':
|
20
|
-
if not self.task_url:
|
21
|
-
raise ValueError("Task URL not found in response")
|
22
24
|
while True:
|
23
|
-
response = self.
|
25
|
+
response = self._poll_request()
|
24
26
|
updated_task = TaskResponse(**response).with_client(self._client)
|
25
27
|
self.__dict__.update(updated_task.__dict__)
|
26
28
|
if result := self._check_status():
|
@@ -30,9 +32,11 @@ class TaskResponse(TaskBase):
|
|
30
32
|
def update(self, config: Configuration) -> 'TaskResponse':
|
31
33
|
if not self.task_url:
|
32
34
|
raise ValueError("Task URL not found")
|
35
|
+
if not self._client._session:
|
36
|
+
raise ValueError("Client session not found")
|
33
37
|
files = prepare_upload_data(None, config)
|
34
38
|
r = self._client._session.patch(
|
35
|
-
|
39
|
+
self.task_url,
|
36
40
|
files=files,
|
37
41
|
headers=self._client._headers()
|
38
42
|
)
|
@@ -44,6 +48,8 @@ class TaskResponse(TaskBase):
|
|
44
48
|
def cancel(self):
|
45
49
|
if not self.task_url:
|
46
50
|
raise ValueError("Task URL not found")
|
51
|
+
if not self._client._session:
|
52
|
+
raise ValueError("Client session not found")
|
47
53
|
r = self._client._session.get(
|
48
54
|
f"{self.task_url}/cancel",
|
49
55
|
headers=self._client._headers()
|
@@ -54,6 +60,8 @@ class TaskResponse(TaskBase):
|
|
54
60
|
def delete(self):
|
55
61
|
if not self.task_url:
|
56
62
|
raise ValueError("Task URL not found")
|
63
|
+
if not self._client._session:
|
64
|
+
raise ValueError("Client session not found")
|
57
65
|
r = self._client._session.delete(
|
58
66
|
self.task_url,
|
59
67
|
headers=self._client._headers()
|
@@ -6,6 +6,8 @@ import asyncio
|
|
6
6
|
class TaskResponseAsync(TaskBase):
|
7
7
|
async def _poll_request(self) -> dict:
|
8
8
|
try:
|
9
|
+
if not self._client._client:
|
10
|
+
raise ValueError("Client not found")
|
9
11
|
r = await self._client._client.get(self.task_url, headers=self._client._headers())
|
10
12
|
r.raise_for_status()
|
11
13
|
return r.json()
|
@@ -18,6 +20,8 @@ class TaskResponseAsync(TaskBase):
|
|
18
20
|
async def poll(self) -> 'TaskResponseAsync':
|
19
21
|
if not self.task_url:
|
20
22
|
raise ValueError("Task URL not found")
|
23
|
+
if not self._client._client:
|
24
|
+
raise ValueError("Client not found")
|
21
25
|
while True:
|
22
26
|
j = await self._poll_request()
|
23
27
|
updated = TaskResponseAsync(**j).with_client(self._client)
|
@@ -29,6 +33,8 @@ class TaskResponseAsync(TaskBase):
|
|
29
33
|
async def update(self, config: Configuration) -> 'TaskResponseAsync':
|
30
34
|
if not self.task_url:
|
31
35
|
raise ValueError("Task URL not found")
|
36
|
+
if not self._client._client:
|
37
|
+
raise ValueError("Client not found")
|
32
38
|
f = prepare_upload_data(None, config)
|
33
39
|
r = await self._client._client.patch(self.task_url, files=f, headers=self._client._headers())
|
34
40
|
r.raise_for_status()
|
@@ -39,6 +45,8 @@ class TaskResponseAsync(TaskBase):
|
|
39
45
|
async def cancel(self):
|
40
46
|
if not self.task_url:
|
41
47
|
raise ValueError("Task URL not found")
|
48
|
+
if not self._client._client:
|
49
|
+
raise ValueError("Client not found")
|
42
50
|
r = await self._client._client.get(f"{self.task_url}/cancel", headers=self._client._headers())
|
43
51
|
r.raise_for_status()
|
44
52
|
return await self.poll()
|
@@ -46,5 +54,7 @@ class TaskResponseAsync(TaskBase):
|
|
46
54
|
async def delete(self):
|
47
55
|
if not self.task_url:
|
48
56
|
raise ValueError("Task URL not found")
|
57
|
+
if not self._client._client:
|
58
|
+
raise ValueError("Client not found")
|
49
59
|
r = await self._client._client.delete(self.task_url, headers=self._client._headers())
|
50
|
-
r.raise_for_status()
|
60
|
+
r.raise_for_status()
|
@@ -1,8 +1,7 @@
|
|
1
|
-
from .config import Configuration
|
1
|
+
from .config import Configuration, Status, OutputResponse
|
2
2
|
from .protocol import ChunkrClientProtocol
|
3
|
-
from ..models import Status, OutputResponse
|
4
3
|
from abc import ABC, abstractmethod
|
5
|
-
from typing import TypeVar, Optional, Generic
|
4
|
+
from typing import TypeVar, Optional, Generic
|
6
5
|
from pydantic import BaseModel, PrivateAttr
|
7
6
|
from datetime import datetime
|
8
7
|
|
@@ -23,7 +22,7 @@ class TaskBase(BaseModel, ABC, Generic[T]):
|
|
23
22
|
status: Status
|
24
23
|
task_id: str
|
25
24
|
task_url: Optional[str]
|
26
|
-
_client: Optional[
|
25
|
+
_client: Optional[ChunkrClientProtocol] = PrivateAttr(default=None)
|
27
26
|
|
28
27
|
@abstractmethod
|
29
28
|
def _poll_request(self) -> dict:
|
@@ -50,7 +49,7 @@ class TaskBase(BaseModel, ABC, Generic[T]):
|
|
50
49
|
"""Delete the task."""
|
51
50
|
pass
|
52
51
|
|
53
|
-
def with_client(self, client:
|
52
|
+
def with_client(self, client: ChunkrClientProtocol) -> T:
|
54
53
|
self._client = client
|
55
54
|
return self
|
56
55
|
|
@@ -1,13 +1,12 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: chunkr-ai
|
3
|
-
Version: 0.0.12
|
3
|
+
Version: 0.0.14
|
4
4
|
Summary: Python client for Chunkr: open source document intelligence
|
5
5
|
Author-email: Ishaan Kapoor <ishaan@lumina.sh>
|
6
6
|
Project-URL: Homepage, https://chunkr.ai
|
7
7
|
Description-Content-Type: text/markdown
|
8
8
|
License-File: LICENSE
|
9
9
|
Requires-Dist: httpx>=0.25.0
|
10
|
-
Requires-Dist: httpx>=0.25.0
|
11
10
|
Requires-Dist: pillow>=10.0.0
|
12
11
|
Requires-Dist: pydantic>=2.0.0
|
13
12
|
Requires-Dist: pytest-asyncio>=0.21.0
|
@@ -81,7 +80,7 @@ async def process_document():
|
|
81
80
|
# If you want to upload without waiting for processing
|
82
81
|
task = await chunkr.start_upload("document.pdf")
|
83
82
|
# ... do other things ...
|
84
|
-
await task.
|
83
|
+
await task.poll() # Check status when needed
|
85
84
|
```
|
86
85
|
|
87
86
|
### Additional Features
|
@@ -9,7 +9,9 @@ src/chunkr_ai.egg-info/dependency_links.txt
|
|
9
9
|
src/chunkr_ai.egg-info/requires.txt
|
10
10
|
src/chunkr_ai.egg-info/top_level.txt
|
11
11
|
src/chunkr_ai/api/__init__.py
|
12
|
+
src/chunkr_ai/api/api.py
|
12
13
|
src/chunkr_ai/api/auth.py
|
14
|
+
src/chunkr_ai/api/base.py
|
13
15
|
src/chunkr_ai/api/chunkr.py
|
14
16
|
src/chunkr_ai/api/chunkr_async.py
|
15
17
|
src/chunkr_ai/api/chunkr_base.py
|
@@ -38,7 +38,7 @@ async def test_send_file_path(chunkr_client, sample_path):
|
|
38
38
|
client_type, client = chunkr_client
|
39
39
|
response = await client.upload(sample_path) if client_type == "async" else client.upload(sample_path)
|
40
40
|
|
41
|
-
|
41
|
+
|
42
42
|
assert response.task_id is not None
|
43
43
|
assert response.status == "Succeeded"
|
44
44
|
assert response.output is not None
|
@@ -48,7 +48,7 @@ async def test_send_file_path_str(chunkr_client, sample_path):
|
|
48
48
|
client_type, client = chunkr_client
|
49
49
|
response = await client.upload(str(sample_path)) if client_type == "async" else client.upload(str(sample_path))
|
50
50
|
|
51
|
-
|
51
|
+
|
52
52
|
assert response.task_id is not None
|
53
53
|
assert response.status == "Succeeded"
|
54
54
|
assert response.output is not None
|
@@ -59,7 +59,7 @@ async def test_send_opened_file(chunkr_client, sample_path):
|
|
59
59
|
with open(sample_path, 'rb') as f:
|
60
60
|
response = await client.upload(f) if client_type == "async" else client.upload(f)
|
61
61
|
|
62
|
-
|
62
|
+
|
63
63
|
assert response.task_id is not None
|
64
64
|
assert response.status == "Succeeded"
|
65
65
|
assert response.output is not None
|
@@ -69,7 +69,6 @@ async def test_send_pil_image(chunkr_client, sample_image):
|
|
69
69
|
client_type, client = chunkr_client
|
70
70
|
response = await client.upload(sample_image) if client_type == "async" else client.upload(sample_image)
|
71
71
|
|
72
|
-
assert isinstance(response, TaskResponse)
|
73
72
|
assert response.task_id is not None
|
74
73
|
assert response.status == "Succeeded"
|
75
74
|
|
@@ -82,7 +81,6 @@ async def test_ocr_auto(chunkr_client, sample_path):
|
|
82
81
|
ocr_strategy=OcrStrategy.AUTO
|
83
82
|
))
|
84
83
|
|
85
|
-
assert isinstance(response, TaskResponse)
|
86
84
|
assert response.task_id is not None
|
87
85
|
assert response.status == "Succeeded"
|
88
86
|
assert response.output is not None
|
@@ -96,7 +94,7 @@ async def test_expires_in(chunkr_client, sample_path):
|
|
96
94
|
expires_in=10
|
97
95
|
))
|
98
96
|
|
99
|
-
|
97
|
+
|
100
98
|
assert response.task_id is not None
|
101
99
|
assert response.status == "Succeeded"
|
102
100
|
assert response.output is not None
|
@@ -114,7 +112,7 @@ async def test_chunk_processing(chunkr_client, sample_path):
|
|
114
112
|
)
|
115
113
|
))
|
116
114
|
|
117
|
-
|
115
|
+
|
118
116
|
assert response.task_id is not None
|
119
117
|
assert response.status == "Succeeded"
|
120
118
|
assert response.output is not None
|
@@ -128,7 +126,6 @@ async def test_segmentation_strategy_page(chunkr_client, sample_path):
|
|
128
126
|
segmentation_strategy=SegmentationStrategy.PAGE
|
129
127
|
))
|
130
128
|
|
131
|
-
assert isinstance(response, TaskResponse)
|
132
129
|
assert response.task_id is not None
|
133
130
|
assert response.status == "Succeeded"
|
134
131
|
assert response.output is not None
|
@@ -152,7 +149,7 @@ async def test_page_llm_html(chunkr_client, sample_path):
|
|
152
149
|
)
|
153
150
|
))
|
154
151
|
|
155
|
-
|
152
|
+
|
156
153
|
assert response.task_id is not None
|
157
154
|
assert response.status == "Succeeded"
|
158
155
|
assert response.output is not None
|
@@ -160,15 +157,7 @@ async def test_page_llm_html(chunkr_client, sample_path):
|
|
160
157
|
@pytest.mark.asyncio
|
161
158
|
async def test_page_llm(chunkr_client, sample_path):
|
162
159
|
client_type, client = chunkr_client
|
163
|
-
|
164
|
-
segmentation_strategy=SegmentationStrategy.PAGE,
|
165
|
-
segment_processing=SegmentProcessing(
|
166
|
-
page=GenerationConfig(
|
167
|
-
html=GenerationStrategy.LLM,
|
168
|
-
markdown=GenerationStrategy.LLM
|
169
|
-
)
|
170
|
-
)
|
171
|
-
)) if client_type == "async" else client.upload(sample_path, Configuration(
|
160
|
+
configuration = Configuration(
|
172
161
|
segmentation_strategy=SegmentationStrategy.PAGE,
|
173
162
|
segment_processing=SegmentProcessing(
|
174
163
|
page=GenerationConfig(
|
@@ -176,9 +165,10 @@ async def test_page_llm(chunkr_client, sample_path):
|
|
176
165
|
markdown=GenerationStrategy.LLM
|
177
166
|
)
|
178
167
|
)
|
179
|
-
)
|
168
|
+
)
|
169
|
+
|
170
|
+
response = await client.upload(sample_path, configuration) if client_type == "async" else client.upload(sample_path, configuration)
|
180
171
|
|
181
|
-
assert isinstance(response, TaskResponse)
|
182
172
|
assert response.task_id is not None
|
183
173
|
assert response.status == "Succeeded"
|
184
174
|
assert response.output is not None
|
@@ -204,16 +194,16 @@ async def test_json_schema(chunkr_client, sample_path):
|
|
204
194
|
)
|
205
195
|
))
|
206
196
|
|
207
|
-
assert isinstance(response, TaskResponse)
|
208
197
|
assert response.task_id is not None
|
209
198
|
if response.status != "Succeeded":
|
210
199
|
raise ValueError(f"Task failed with message: {response.message}")
|
211
200
|
assert response.output is not None
|
201
|
+
|
212
202
|
@pytest.mark.asyncio
|
213
203
|
async def test_delete_task(chunkr_client, sample_path):
|
214
204
|
client_type, client = chunkr_client
|
215
205
|
response = await client.upload(sample_path) if client_type == "async" else client.upload(sample_path)
|
216
|
-
|
206
|
+
|
217
207
|
assert response.task_id is not None
|
218
208
|
assert response.status == "Succeeded"
|
219
209
|
assert response.output is not None
|
@@ -249,14 +239,14 @@ async def test_delete_task_direct(chunkr_client, sample_path):
|
|
249
239
|
async def test_cancel_task(chunkr_client, sample_path):
|
250
240
|
client_type, client = chunkr_client
|
251
241
|
response = await client.create_task(sample_path) if client_type == "async" else client.create_task(sample_path)
|
252
|
-
|
242
|
+
|
253
243
|
assert response.task_id is not None
|
254
244
|
assert response.status == "Starting"
|
255
245
|
|
256
246
|
if client_type == "async":
|
257
247
|
await client.cancel_task(response.task_id)
|
258
248
|
assert (await client.get_task(response.task_id)).status == "Cancelled"
|
259
|
-
await response.
|
249
|
+
await response.poll()
|
260
250
|
else:
|
261
251
|
client.cancel_task(response.task_id)
|
262
252
|
assert client.get_task(response.task_id).status == "Cancelled"
|
@@ -290,7 +280,7 @@ async def test_update_task(chunkr_client, sample_path):
|
|
290
280
|
segmentation_strategy=SegmentationStrategy.PAGE,
|
291
281
|
)
|
292
282
|
response = await client.upload(sample_path, original_config) if client_type == "async" else client.upload(sample_path, original_config)
|
293
|
-
|
283
|
+
|
294
284
|
assert response.task_id is not None
|
295
285
|
assert response.status == "Succeeded"
|
296
286
|
assert response.output is not None
|
@@ -1,144 +0,0 @@
|
|
1
|
-
from .chunkr_base import ChunkrBase
|
2
|
-
from .task import TaskResponse
|
3
|
-
from .config import Configuration
|
4
|
-
import httpx
|
5
|
-
from pathlib import Path
|
6
|
-
from PIL import Image
|
7
|
-
from typing import Union, BinaryIO
|
8
|
-
from .misc import prepare_upload_data
|
9
|
-
|
10
|
-
class ChunkrAsync(ChunkrBase):
|
11
|
-
"""Asynchronous Chunkr API client"""
|
12
|
-
|
13
|
-
def __init__(self, url: str = None, api_key: str = None):
|
14
|
-
super().__init__(url, api_key)
|
15
|
-
self._client = httpx.AsyncClient()
|
16
|
-
|
17
|
-
async def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
|
18
|
-
"""Upload a file and wait for processing to complete.
|
19
|
-
|
20
|
-
Args:
|
21
|
-
file: The file to upload.
|
22
|
-
config: Configuration options for processing. Optional.
|
23
|
-
|
24
|
-
Examples:
|
25
|
-
```python
|
26
|
-
# Upload from file path
|
27
|
-
await chunkr.upload("document.pdf")
|
28
|
-
|
29
|
-
# Upload from opened file
|
30
|
-
with open("document.pdf", "rb") as f:
|
31
|
-
await chunkr.upload(f)
|
32
|
-
|
33
|
-
# Upload from URL
|
34
|
-
await chunkr.upload("https://example.com/document.pdf")
|
35
|
-
|
36
|
-
# Upload from base64 string (must include MIME type header)
|
37
|
-
await chunkr.upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
|
38
|
-
|
39
|
-
# Upload an image
|
40
|
-
from PIL import Image
|
41
|
-
img = Image.open("photo.jpg")
|
42
|
-
await chunkr.upload(img)
|
43
|
-
```
|
44
|
-
Returns:
|
45
|
-
TaskResponse: The completed task response
|
46
|
-
"""
|
47
|
-
task = await self.create_task(file, config)
|
48
|
-
return await task.poll_async()
|
49
|
-
|
50
|
-
async def update(self, task_id: str, config: Configuration) -> TaskResponse:
|
51
|
-
"""Update a task by its ID and wait for processing to complete.
|
52
|
-
|
53
|
-
Args:
|
54
|
-
task_id: The ID of the task to update
|
55
|
-
config: Configuration options for processing. Optional.
|
56
|
-
|
57
|
-
Returns:
|
58
|
-
TaskResponse: The updated task response
|
59
|
-
"""
|
60
|
-
task = await self.update_task(task_id, config)
|
61
|
-
return await task.poll_async()
|
62
|
-
|
63
|
-
async def create_task(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
|
64
|
-
"""Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll_async()`.
|
65
|
-
|
66
|
-
Args:
|
67
|
-
file: The file to upload.
|
68
|
-
config: Configuration options for processing. Optional.
|
69
|
-
|
70
|
-
Examples:
|
71
|
-
```
|
72
|
-
# Upload from file path
|
73
|
-
task = await chunkr.start_upload("document.pdf")
|
74
|
-
|
75
|
-
# Upload from opened file
|
76
|
-
with open("document.pdf", "rb") as f:
|
77
|
-
task = await chunkr.start_upload(f)
|
78
|
-
|
79
|
-
# Upload from URL
|
80
|
-
task = await chunkr.start_upload("https://example.com/document.pdf")
|
81
|
-
|
82
|
-
# Upload from base64 string (must include MIME type header)
|
83
|
-
task = await chunkr.start_upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
|
84
|
-
|
85
|
-
# Upload an image
|
86
|
-
from PIL import Image
|
87
|
-
img = Image.open("photo.jpg")
|
88
|
-
task = await chunkr.start_upload(img)
|
89
|
-
|
90
|
-
# Wait for the task to complete - this can be done when needed
|
91
|
-
await task.poll_async()
|
92
|
-
```
|
93
|
-
|
94
|
-
Returns:
|
95
|
-
TaskResponse: The initial task response
|
96
|
-
"""
|
97
|
-
files = prepare_upload_data(file, config)
|
98
|
-
r = await self._client.post(
|
99
|
-
f"{self.url}/api/v1/task",
|
100
|
-
files=files,
|
101
|
-
headers=self._headers()
|
102
|
-
)
|
103
|
-
r.raise_for_status()
|
104
|
-
return TaskResponse(**r.json()).with_client(self)
|
105
|
-
|
106
|
-
async def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
|
107
|
-
files = prepare_upload_data(None, config)
|
108
|
-
r = await self._client.patch(
|
109
|
-
f"{self.url}/api/v1/task/{task_id}",
|
110
|
-
files=files,
|
111
|
-
headers=self._headers()
|
112
|
-
)
|
113
|
-
|
114
|
-
r.raise_for_status()
|
115
|
-
return TaskResponse(**r.json()).with_client(self)
|
116
|
-
|
117
|
-
async def get_task(self, task_id: str) -> TaskResponse:
|
118
|
-
r = await self._client.get(
|
119
|
-
f"{self.url}/api/v1/task/{task_id}",
|
120
|
-
headers=self._headers()
|
121
|
-
)
|
122
|
-
r.raise_for_status()
|
123
|
-
return TaskResponse(**r.json()).with_client(self)
|
124
|
-
|
125
|
-
async def delete_task(self, task_id: str) -> None:
|
126
|
-
r = await self._client.delete(
|
127
|
-
f"{self.url}/api/v1/task/{task_id}",
|
128
|
-
headers=self._headers()
|
129
|
-
)
|
130
|
-
r.raise_for_status()
|
131
|
-
|
132
|
-
async def cancel_task(self, task_id: str) -> None:
|
133
|
-
r = await self._client.get(
|
134
|
-
f"{self.url}/api/v1/task/{task_id}/cancel",
|
135
|
-
headers=self._headers()
|
136
|
-
)
|
137
|
-
r.raise_for_status()
|
138
|
-
|
139
|
-
|
140
|
-
async def __aenter__(self):
|
141
|
-
return self
|
142
|
-
|
143
|
-
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
144
|
-
await self._client.aclose()
|
@@ -1,85 +0,0 @@
|
|
1
|
-
from .config import Configuration
|
2
|
-
from .task import TaskResponse
|
3
|
-
from .auth import HeadersMixin
|
4
|
-
from abc import abstractmethod
|
5
|
-
from dotenv import load_dotenv
|
6
|
-
import os
|
7
|
-
from pathlib import Path
|
8
|
-
from PIL import Image
|
9
|
-
from typing import BinaryIO, Union
|
10
|
-
|
11
|
-
class ChunkrBase(HeadersMixin):
|
12
|
-
"""Base class with shared functionality for Chunkr API clients."""
|
13
|
-
|
14
|
-
def __init__(self, url: str = None, api_key: str = None):
|
15
|
-
load_dotenv()
|
16
|
-
self.url = (
|
17
|
-
url or
|
18
|
-
os.getenv('CHUNKR_URL') or
|
19
|
-
'https://api.chunkr.ai'
|
20
|
-
)
|
21
|
-
self._api_key = (
|
22
|
-
api_key or
|
23
|
-
os.getenv('CHUNKR_API_KEY')
|
24
|
-
)
|
25
|
-
if not self._api_key:
|
26
|
-
raise ValueError("API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai")
|
27
|
-
|
28
|
-
self.url = self.url.rstrip("/")
|
29
|
-
|
30
|
-
@abstractmethod
|
31
|
-
def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
|
32
|
-
"""Upload a file and wait for processing to complete.
|
33
|
-
|
34
|
-
Must be implemented by subclasses.
|
35
|
-
"""
|
36
|
-
pass
|
37
|
-
|
38
|
-
@abstractmethod
|
39
|
-
def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
|
40
|
-
"""Update a task by its ID.
|
41
|
-
|
42
|
-
Must be implemented by subclasses.
|
43
|
-
"""
|
44
|
-
pass
|
45
|
-
|
46
|
-
@abstractmethod
|
47
|
-
def create_task(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
|
48
|
-
"""Upload a file for processing and immediately return the task response.
|
49
|
-
|
50
|
-
Must be implemented by subclasses.
|
51
|
-
"""
|
52
|
-
pass
|
53
|
-
|
54
|
-
@abstractmethod
|
55
|
-
def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
|
56
|
-
"""Update a task by its ID.
|
57
|
-
|
58
|
-
Must be implemented by subclasses.
|
59
|
-
"""
|
60
|
-
pass
|
61
|
-
|
62
|
-
@abstractmethod
|
63
|
-
def get_task(self, task_id: str) -> TaskResponse:
|
64
|
-
"""Get a task response by its ID.
|
65
|
-
|
66
|
-
Must be implemented by subclasses.
|
67
|
-
"""
|
68
|
-
pass
|
69
|
-
|
70
|
-
@abstractmethod
|
71
|
-
def delete_task(self, task_id: str) -> None:
|
72
|
-
"""Delete a task by its ID.
|
73
|
-
|
74
|
-
Must be implemented by subclasses.
|
75
|
-
"""
|
76
|
-
pass
|
77
|
-
|
78
|
-
@abstractmethod
|
79
|
-
def cancel_task(self, task_id: str) -> None:
|
80
|
-
"""Cancel a task by its ID.
|
81
|
-
|
82
|
-
Must be implemented by subclasses.
|
83
|
-
"""
|
84
|
-
pass
|
85
|
-
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|