chunkr-ai 0.0.22__py3-none-any.whl → 0.0.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunkr_ai/api/chunkr.py +2 -2
- chunkr_ai/api/chunkr_base.py +12 -131
- chunkr_ai/api/misc.py +27 -6
- chunkr_ai/api/protocol.py +2 -7
- chunkr_ai/api/task_response.py +5 -5
- {chunkr_ai-0.0.22.dist-info → chunkr_ai-0.0.24.dist-info}/METADATA +2 -1
- chunkr_ai-0.0.24.dist-info/RECORD +17 -0
- chunkr_ai-0.0.22.dist-info/RECORD +0 -17
- {chunkr_ai-0.0.22.dist-info → chunkr_ai-0.0.24.dist-info}/LICENSE +0 -0
- {chunkr_ai-0.0.22.dist-info → chunkr_ai-0.0.24.dist-info}/WHEEL +0 -0
- {chunkr_ai-0.0.22.dist-info → chunkr_ai-0.0.24.dist-info}/top_level.txt +0 -0
chunkr_ai/api/chunkr.py
CHANGED
@@ -34,7 +34,7 @@ class Chunkr(ChunkrBase):
|
|
34
34
|
file: Union[str, Path, BinaryIO, Image.Image],
|
35
35
|
config: Configuration = None,
|
36
36
|
) -> TaskResponse:
|
37
|
-
files = prepare_upload_data(file, config)
|
37
|
+
files = await prepare_upload_data(file, config, self._client)
|
38
38
|
r = await self._client.post(
|
39
39
|
f"{self.url}/api/v1/task", files=files, headers=self._headers()
|
40
40
|
)
|
@@ -44,7 +44,7 @@ class Chunkr(ChunkrBase):
|
|
44
44
|
@anywhere()
|
45
45
|
@ensure_client()
|
46
46
|
async def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
|
47
|
-
files = prepare_upload_data(None, config)
|
47
|
+
files = await prepare_upload_data(None, config, self._client)
|
48
48
|
r = await self._client.patch(
|
49
49
|
f"{self.url}/api/v1/task/{task_id}",
|
50
50
|
files=files,
|
chunkr_ai/api/chunkr_base.py
CHANGED
@@ -4,22 +4,27 @@ from .auth import HeadersMixin
|
|
4
4
|
from abc import abstractmethod
|
5
5
|
from dotenv import load_dotenv
|
6
6
|
import httpx
|
7
|
-
import io
|
8
|
-
import json
|
9
7
|
import os
|
10
8
|
from pathlib import Path
|
11
9
|
from PIL import Image
|
12
|
-
import requests
|
13
|
-
from typing import BinaryIO, Tuple, Union
|
10
|
+
from typing import BinaryIO, Union
|
14
11
|
|
15
12
|
|
16
13
|
class ChunkrBase(HeadersMixin):
|
17
|
-
"""Base class with shared functionality for Chunkr API clients.
|
18
|
-
|
19
|
-
|
14
|
+
"""Base class with shared functionality for Chunkr API clients.
|
15
|
+
|
16
|
+
Args:
|
17
|
+
url: The base URL of the Chunkr API. Defaults to the value of the CHUNKR_URL environment variable, or "https://api.chunkr.ai" if not set.
|
18
|
+
api_key: The API key to use for authentication. Defaults to the value of the CHUNKR_API_KEY environment variable, or None if not set.
|
19
|
+
raise_on_failure: Whether to raise an exception if the task fails. Defaults to False.
|
20
|
+
"""
|
21
|
+
|
22
|
+
def __init__(self, url: str = None, api_key: str = None, raise_on_failure: bool = False):
|
20
23
|
load_dotenv()
|
21
24
|
self.url = url or os.getenv("CHUNKR_URL") or "https://api.chunkr.ai"
|
22
25
|
self._api_key = api_key or os.getenv("CHUNKR_API_KEY")
|
26
|
+
self.raise_on_failure = raise_on_failure
|
27
|
+
|
23
28
|
if not self._api_key:
|
24
29
|
raise ValueError(
|
25
30
|
"API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai"
|
@@ -28,130 +33,6 @@ class ChunkrBase(HeadersMixin):
|
|
28
33
|
self.url = self.url.rstrip("/")
|
29
34
|
self._client = httpx.AsyncClient()
|
30
35
|
|
31
|
-
def _prepare_file(
|
32
|
-
self, file: Union[str, Path, BinaryIO, Image.Image]
|
33
|
-
) -> Tuple[str, BinaryIO]:
|
34
|
-
"""Convert various file types into a tuple of (filename, file-like object).
|
35
|
-
|
36
|
-
Args:
|
37
|
-
file: Input file, can be:
|
38
|
-
- String or Path to a file
|
39
|
-
- URL string starting with http:// or https://
|
40
|
-
- Base64 string
|
41
|
-
- Opened binary file (mode='rb')
|
42
|
-
- PIL/Pillow Image object
|
43
|
-
|
44
|
-
Returns:
|
45
|
-
Tuple[str, BinaryIO]: (filename, file-like object) ready for upload
|
46
|
-
|
47
|
-
Raises:
|
48
|
-
FileNotFoundError: If the file path doesn't exist
|
49
|
-
TypeError: If the file type is not supported
|
50
|
-
ValueError: If the URL is invalid or unreachable
|
51
|
-
ValueError: If the MIME type is unsupported
|
52
|
-
"""
|
53
|
-
# Handle URLs
|
54
|
-
if isinstance(file, str) and (
|
55
|
-
file.startswith("http://") or file.startswith("https://")
|
56
|
-
):
|
57
|
-
response = requests.get(file)
|
58
|
-
response.raise_for_status()
|
59
|
-
file_obj = io.BytesIO(response.content)
|
60
|
-
filename = Path(file.split("/")[-1]).name or "downloaded_file"
|
61
|
-
return filename, file_obj
|
62
|
-
|
63
|
-
# Handle base64 strings
|
64
|
-
if isinstance(file, str) and "," in file and ";base64," in file:
|
65
|
-
try:
|
66
|
-
# Split header and data
|
67
|
-
header, base64_data = file.split(",", 1)
|
68
|
-
import base64
|
69
|
-
|
70
|
-
file_bytes = base64.b64decode(base64_data)
|
71
|
-
file_obj = io.BytesIO(file_bytes)
|
72
|
-
|
73
|
-
# Try to determine format from header
|
74
|
-
format = "bin"
|
75
|
-
mime_type = header.split(":")[-1].split(";")[0].lower()
|
76
|
-
|
77
|
-
# Map MIME types to file extensions
|
78
|
-
mime_to_ext = {
|
79
|
-
"application/pdf": "pdf",
|
80
|
-
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
|
81
|
-
"application/msword": "doc",
|
82
|
-
"application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
|
83
|
-
"application/vnd.ms-powerpoint": "ppt",
|
84
|
-
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
|
85
|
-
"application/vnd.ms-excel": "xls",
|
86
|
-
"image/jpeg": "jpg",
|
87
|
-
"image/png": "png",
|
88
|
-
"image/jpg": "jpg",
|
89
|
-
}
|
90
|
-
|
91
|
-
if mime_type in mime_to_ext:
|
92
|
-
format = mime_to_ext[mime_type]
|
93
|
-
else:
|
94
|
-
raise ValueError(f"Unsupported MIME type: {mime_type}")
|
95
|
-
|
96
|
-
return f"file.{format}", file_obj
|
97
|
-
except Exception as e:
|
98
|
-
raise ValueError(f"Invalid base64 string: {str(e)}")
|
99
|
-
|
100
|
-
# Handle file paths
|
101
|
-
if isinstance(file, (str, Path)):
|
102
|
-
path = Path(file).resolve()
|
103
|
-
if not path.exists():
|
104
|
-
raise FileNotFoundError(f"File not found: {file}")
|
105
|
-
return path.name, open(path, "rb")
|
106
|
-
|
107
|
-
# Handle PIL Images
|
108
|
-
if isinstance(file, Image.Image):
|
109
|
-
img_byte_arr = io.BytesIO()
|
110
|
-
format = file.format or "PNG"
|
111
|
-
file.save(img_byte_arr, format=format)
|
112
|
-
img_byte_arr.seek(0)
|
113
|
-
return f"image.{format.lower()}", img_byte_arr
|
114
|
-
|
115
|
-
# Handle file-like objects
|
116
|
-
if hasattr(file, "read") and hasattr(file, "seek"):
|
117
|
-
# Try to get the filename from the file object if possible
|
118
|
-
name = (
|
119
|
-
getattr(file, "name", "document")
|
120
|
-
if hasattr(file, "name")
|
121
|
-
else "document"
|
122
|
-
)
|
123
|
-
return Path(name).name, file
|
124
|
-
|
125
|
-
raise TypeError(f"Unsupported file type: {type(file)}")
|
126
|
-
|
127
|
-
def _prepare_upload_data(
|
128
|
-
self,
|
129
|
-
file: Union[str, Path, BinaryIO, Image.Image],
|
130
|
-
config: Configuration = None,
|
131
|
-
) -> Tuple[dict, dict]:
|
132
|
-
"""Prepare files and data dictionaries for upload.
|
133
|
-
|
134
|
-
Args:
|
135
|
-
file: The file to upload
|
136
|
-
config: Optional configuration settings
|
137
|
-
|
138
|
-
Returns:
|
139
|
-
Tuple[dict, dict]: (files dict, data dict) ready for upload
|
140
|
-
"""
|
141
|
-
filename, file_obj = self._prepare_file(file)
|
142
|
-
files = {"file": (filename, file_obj)}
|
143
|
-
data = {}
|
144
|
-
|
145
|
-
if config:
|
146
|
-
config_dict = config.model_dump(mode="json", exclude_none=True)
|
147
|
-
for key, value in config_dict.items():
|
148
|
-
if isinstance(value, dict):
|
149
|
-
files[key] = (None, json.dumps(value), "application/json")
|
150
|
-
else:
|
151
|
-
data[key] = value
|
152
|
-
|
153
|
-
return files, data
|
154
|
-
|
155
36
|
@abstractmethod
|
156
37
|
def upload(
|
157
38
|
self,
|
chunkr_ai/api/misc.py
CHANGED
@@ -3,16 +3,36 @@ import io
|
|
3
3
|
import json
|
4
4
|
from pathlib import Path
|
5
5
|
from PIL import Image
|
6
|
-
import requests
|
6
|
+
import httpx
|
7
7
|
from typing import Union, Tuple, BinaryIO, Optional
|
8
8
|
|
9
|
-
def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[str, BinaryIO]:
|
10
|
-
"""Convert various file types into a tuple of (filename, file-like object).
|
9
|
+
async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image], client: httpx.AsyncClient = None) -> Tuple[str, BinaryIO]:
|
10
|
+
"""Convert various file types into a tuple of (filename, file-like object).
|
11
|
+
|
12
|
+
Args:
|
13
|
+
file: Input file, can be:
|
14
|
+
- String or Path to a file
|
15
|
+
- URL string starting with http:// or https://
|
16
|
+
- Base64 string
|
17
|
+
- Opened binary file (mode='rb')
|
18
|
+
- PIL/Pillow Image object
|
19
|
+
|
20
|
+
Returns:
|
21
|
+
Tuple[str, BinaryIO]: (filename, file-like object) ready for upload
|
22
|
+
|
23
|
+
Raises:
|
24
|
+
FileNotFoundError: If the file path doesn't exist
|
25
|
+
TypeError: If the file type is not supported
|
26
|
+
ValueError: If the URL is invalid or unreachable
|
27
|
+
ValueError: If the MIME type is unsupported
|
28
|
+
"""
|
11
29
|
# Handle URLs
|
12
30
|
if isinstance(file, str) and (
|
13
31
|
file.startswith("http://") or file.startswith("https://")
|
14
32
|
):
|
15
|
-
|
33
|
+
if not client:
|
34
|
+
raise ValueError("Client must be provided to download files from URLs")
|
35
|
+
response = client.get(file)
|
16
36
|
response.raise_for_status()
|
17
37
|
|
18
38
|
# Try to get filename from Content-Disposition header first
|
@@ -108,9 +128,10 @@ def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[str, Bi
|
|
108
128
|
raise TypeError(f"Unsupported file type: {type(file)}")
|
109
129
|
|
110
130
|
|
111
|
-
def prepare_upload_data(
|
131
|
+
async def prepare_upload_data(
|
112
132
|
file: Optional[Union[str, Path, BinaryIO, Image.Image]] = None,
|
113
133
|
config: Optional[Configuration] = None,
|
134
|
+
client: httpx.AsyncClient = None,
|
114
135
|
) -> dict:
|
115
136
|
"""Prepare files and data dictionaries for upload.
|
116
137
|
|
@@ -123,7 +144,7 @@ def prepare_upload_data(
|
|
123
144
|
"""
|
124
145
|
files = {}
|
125
146
|
if file:
|
126
|
-
filename, file_obj = prepare_file(file)
|
147
|
+
filename, file_obj = await prepare_file(file, client)
|
127
148
|
files = {"file": (filename, file_obj)}
|
128
149
|
|
129
150
|
if config:
|
chunkr_ai/api/protocol.py
CHANGED
@@ -5,15 +5,10 @@ from httpx import AsyncClient
|
|
5
5
|
@runtime_checkable
|
6
6
|
class ChunkrClientProtocol(Protocol):
|
7
7
|
"""Protocol defining the interface for Chunkr clients"""
|
8
|
-
|
9
|
-
|
10
|
-
_api_key: str
|
8
|
+
|
9
|
+
raise_on_failure: bool = True
|
11
10
|
_client: Optional[AsyncClient] = None
|
12
11
|
|
13
|
-
def get_api_key(self) -> str:
|
14
|
-
"""Get the API key"""
|
15
|
-
...
|
16
|
-
|
17
12
|
def _headers(self) -> dict:
|
18
13
|
"""Return headers required for API requests"""
|
19
14
|
...
|
chunkr_ai/api/task_response.py
CHANGED
@@ -30,15 +30,16 @@ class TaskResponse(BaseModel, Generic[T]):
|
|
30
30
|
def _check_status(self) -> Optional[T]:
|
31
31
|
"""Helper method to check task status and handle completion/failure"""
|
32
32
|
if self.status == "Failed":
|
33
|
-
|
33
|
+
if getattr(self._client, 'raise_on_failure', True):
|
34
|
+
raise ValueError(self.message)
|
35
|
+
return self
|
34
36
|
if self.status not in ("Starting", "Processing"):
|
35
37
|
return self
|
36
38
|
return None
|
37
39
|
|
40
|
+
@require_task()
|
38
41
|
async def _poll_request(self) -> dict:
|
39
42
|
try:
|
40
|
-
if not self._client._client:
|
41
|
-
raise ValueError("Client not found")
|
42
43
|
r = await self._client._client.get(
|
43
44
|
self.task_url, headers=self._client._headers()
|
44
45
|
)
|
@@ -51,7 +52,6 @@ class TaskResponse(BaseModel, Generic[T]):
|
|
51
52
|
raise
|
52
53
|
|
53
54
|
@anywhere()
|
54
|
-
@require_task()
|
55
55
|
async def poll(self) -> T:
|
56
56
|
"""Poll the task for completion."""
|
57
57
|
while True:
|
@@ -66,7 +66,7 @@ class TaskResponse(BaseModel, Generic[T]):
|
|
66
66
|
@require_task()
|
67
67
|
async def update(self, config: Configuration) -> T:
|
68
68
|
"""Update the task configuration."""
|
69
|
-
f = prepare_upload_data(None, config)
|
69
|
+
f = await prepare_upload_data(None, config, self._client._client)
|
70
70
|
r = await self._client._client.patch(
|
71
71
|
self.task_url, files=f, headers=self._client._headers()
|
72
72
|
)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: chunkr-ai
|
3
|
-
Version: 0.0.22
|
3
|
+
Version: 0.0.24
|
4
4
|
Summary: Python client for Chunkr: open source document intelligence
|
5
5
|
Author-email: Ishaan Kapoor <ishaan@lumina.sh>
|
6
6
|
License: MIT License
|
@@ -35,6 +35,7 @@ Provides-Extra: test
|
|
35
35
|
Requires-Dist: pytest>=7.0.0; extra == "test"
|
36
36
|
Requires-Dist: pytest-xdist>=3.0.0; extra == "test"
|
37
37
|
Requires-Dist: pytest-asyncio>=0.21.0; extra == "test"
|
38
|
+
Requires-Dist: ruff>=0.9.3; extra == "test"
|
38
39
|
|
39
40
|
# Chunkr Python Client
|
40
41
|
|
@@ -0,0 +1,17 @@
|
|
1
|
+
chunkr_ai/__init__.py,sha256=6KpYv2lmD6S5z2kc9pqwuLP5VDHmOuu2qDZArUIhb1s,53
|
2
|
+
chunkr_ai/models.py,sha256=MK8FPbWDj1ynvSHaYuslKCPybxLuAlrsVIM3Eym3kKI,750
|
3
|
+
chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
|
+
chunkr_ai/api/api.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
+
chunkr_ai/api/auth.py,sha256=hlv0GiUmlsbFO1wLL9sslqOnsBSoBqkL_6Mk2SDvxgE,413
|
6
|
+
chunkr_ai/api/chunkr.py,sha256=XTXJFs0xjYY3w3N4fSQcxtJFBtNfzFYYkh6nDlFz4cY,2714
|
7
|
+
chunkr_ai/api/chunkr_base.py,sha256=4SXA-gdZd1w2zZeeMdy4xog0NKOrKjmo6IMvSl9KSBg,5538
|
8
|
+
chunkr_ai/api/config.py,sha256=NmPTsDvcjkvNx0gNzDTz-oFG5rQC7jm-H70O_crJCw8,4478
|
9
|
+
chunkr_ai/api/decorators.py,sha256=y_Z9z0O2XXiX9z6jWDwdbCPdQyMLnjE0pGkJjHQEv_Q,2652
|
10
|
+
chunkr_ai/api/misc.py,sha256=5Q2K713VPwf3S2519KTzjT9PKhTEBgBMk1d8NNnmpZ0,5717
|
11
|
+
chunkr_ai/api/protocol.py,sha256=LjPrYSq52m1afIlAo0yVGXlGZxPRh8J6g7S4PAit3Zo,388
|
12
|
+
chunkr_ai/api/task_response.py,sha256=hcHsBgX-2C5Px5Bu0IKk33K_AkqHSEM1Wu2zkcPh9to,3935
|
13
|
+
chunkr_ai-0.0.24.dist-info/LICENSE,sha256=w3R12yNDyZpMiy2lxy_hvNbsldC75ww79sF0u11rkho,1069
|
14
|
+
chunkr_ai-0.0.24.dist-info/METADATA,sha256=JyDI8EkFaJQQ7vIo2osHxXmeuNqhQ0UWjgUMHSFIYow,6996
|
15
|
+
chunkr_ai-0.0.24.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
16
|
+
chunkr_ai-0.0.24.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
|
17
|
+
chunkr_ai-0.0.24.dist-info/RECORD,,
|
@@ -1,17 +0,0 @@
|
|
1
|
-
chunkr_ai/__init__.py,sha256=6KpYv2lmD6S5z2kc9pqwuLP5VDHmOuu2qDZArUIhb1s,53
|
2
|
-
chunkr_ai/models.py,sha256=MK8FPbWDj1ynvSHaYuslKCPybxLuAlrsVIM3Eym3kKI,750
|
3
|
-
chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
|
-
chunkr_ai/api/api.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
-
chunkr_ai/api/auth.py,sha256=hlv0GiUmlsbFO1wLL9sslqOnsBSoBqkL_6Mk2SDvxgE,413
|
6
|
-
chunkr_ai/api/chunkr.py,sha256=ZjDVUeR1KPiLLHbj7AJipt-prBTw7a_GvjJ0nd7JV7c,2674
|
7
|
-
chunkr_ai/api/chunkr_base.py,sha256=TDqEwCCfgshggi_Mzv76FhPj5z21QP8EVj7siczvfao,9826
|
8
|
-
chunkr_ai/api/config.py,sha256=NmPTsDvcjkvNx0gNzDTz-oFG5rQC7jm-H70O_crJCw8,4478
|
9
|
-
chunkr_ai/api/decorators.py,sha256=y_Z9z0O2XXiX9z6jWDwdbCPdQyMLnjE0pGkJjHQEv_Q,2652
|
10
|
-
chunkr_ai/api/misc.py,sha256=wUG4SpfEEo7NcVK47gmw42dRy9zT5F9S2DtVC4T4ERs,4877
|
11
|
-
chunkr_ai/api/protocol.py,sha256=Nt8aWr4ouVwCvoLqVI5vnXJhT2cvxt0sQC-svUk2G5w,458
|
12
|
-
chunkr_ai/api/task_response.py,sha256=yjlUOADqf0O9X-yF7lbZuv39ttvitZHQpbGy3DtrJ80,3909
|
13
|
-
chunkr_ai-0.0.22.dist-info/LICENSE,sha256=w3R12yNDyZpMiy2lxy_hvNbsldC75ww79sF0u11rkho,1069
|
14
|
-
chunkr_ai-0.0.22.dist-info/METADATA,sha256=ro8wcqBnNGLkIhh7XrbSxdl4zaWXhA0d5SUgBQteAec,6952
|
15
|
-
chunkr_ai-0.0.22.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
16
|
-
chunkr_ai-0.0.22.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
|
17
|
-
chunkr_ai-0.0.22.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|