chunkr-ai 0.0.36__py3-none-any.whl → 0.0.37__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunkr_ai/api/chunkr.py +11 -7
- chunkr_ai/api/chunkr_base.py +9 -7
- chunkr_ai/api/misc.py +66 -118
- chunkr_ai/api/task_response.py +4 -2
- {chunkr_ai-0.0.36.dist-info → chunkr_ai-0.0.37.dist-info}/METADATA +1 -1
- chunkr_ai-0.0.37.dist-info/RECORD +16 -0
- chunkr_ai-0.0.36.dist-info/RECORD +0 -16
- {chunkr_ai-0.0.36.dist-info → chunkr_ai-0.0.37.dist-info}/LICENSE +0 -0
- {chunkr_ai-0.0.36.dist-info → chunkr_ai-0.0.37.dist-info}/WHEEL +0 -0
- {chunkr_ai-0.0.36.dist-info → chunkr_ai-0.0.37.dist-info}/top_level.txt +0 -0
chunkr_ai/api/chunkr.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
from pathlib import Path
|
2
2
|
from PIL import Image
|
3
|
-
from typing import Union, BinaryIO
|
3
|
+
from typing import Union, BinaryIO, Optional
|
4
4
|
|
5
5
|
from .configuration import Configuration
|
6
6
|
from .decorators import anywhere, ensure_client, retry_on_429
|
@@ -17,8 +17,9 @@ class Chunkr(ChunkrBase):
|
|
17
17
|
self,
|
18
18
|
file: Union[str, Path, BinaryIO, Image.Image],
|
19
19
|
config: Configuration = None,
|
20
|
+
filename: Optional[str] = None,
|
20
21
|
) -> TaskResponse:
|
21
|
-
task = await self.create_task(file, config)
|
22
|
+
task = await self.create_task(file, config, filename)
|
22
23
|
return await task.poll()
|
23
24
|
|
24
25
|
@anywhere()
|
@@ -34,10 +35,12 @@ class Chunkr(ChunkrBase):
|
|
34
35
|
self,
|
35
36
|
file: Union[str, Path, BinaryIO, Image.Image],
|
36
37
|
config: Configuration = None,
|
38
|
+
filename: Optional[str] = None,
|
37
39
|
) -> TaskResponse:
|
38
|
-
|
40
|
+
"""Create a new task with the given file and configuration."""
|
41
|
+
data = await prepare_upload_data(file, filename, config)
|
39
42
|
r = await self._client.post(
|
40
|
-
f"{self.url}/api/v1/task",
|
43
|
+
f"{self.url}/api/v1/task/parse", json=data, headers=self._headers()
|
41
44
|
)
|
42
45
|
r.raise_for_status()
|
43
46
|
return TaskResponse(**r.json()).with_client(self, True, False)
|
@@ -46,10 +49,11 @@ class Chunkr(ChunkrBase):
|
|
46
49
|
@ensure_client()
|
47
50
|
@retry_on_429()
|
48
51
|
async def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
|
49
|
-
|
52
|
+
"""Update an existing task with new configuration."""
|
53
|
+
data = await prepare_upload_data(None, None, config)
|
50
54
|
r = await self._client.patch(
|
51
|
-
f"{self.url}/api/v1/task/{task_id}",
|
52
|
-
|
55
|
+
f"{self.url}/api/v1/task/{task_id}/parse",
|
56
|
+
json=data,
|
53
57
|
headers=self._headers(),
|
54
58
|
)
|
55
59
|
r.raise_for_status()
|
chunkr_ai/api/chunkr_base.py
CHANGED
@@ -7,8 +7,7 @@ import httpx
|
|
7
7
|
import os
|
8
8
|
from pathlib import Path
|
9
9
|
from PIL import Image
|
10
|
-
from typing import BinaryIO, Union
|
11
|
-
|
10
|
+
from typing import BinaryIO, Union, Optional
|
12
11
|
|
13
12
|
class ChunkrBase(HeadersMixin):
|
14
13
|
"""Base class with shared functionality for Chunkr API clients.
|
@@ -20,7 +19,7 @@ class ChunkrBase(HeadersMixin):
|
|
20
19
|
"""
|
21
20
|
|
22
21
|
def __init__(self, url: str = None, api_key: str = None, raise_on_failure: bool = False):
|
23
|
-
load_dotenv()
|
22
|
+
load_dotenv(override=True)
|
24
23
|
self.url = url or os.getenv("CHUNKR_URL") or "https://api.chunkr.ai"
|
25
24
|
self._api_key = api_key or os.getenv("CHUNKR_API_KEY")
|
26
25
|
self.raise_on_failure = raise_on_failure
|
@@ -38,13 +37,15 @@ class ChunkrBase(HeadersMixin):
|
|
38
37
|
self,
|
39
38
|
file: Union[str, Path, BinaryIO, Image.Image],
|
40
39
|
config: Configuration = None,
|
40
|
+
filename: Optional[str] = None,
|
41
41
|
) -> TaskResponse:
|
42
42
|
"""Upload a file and wait for processing to complete.
|
43
43
|
|
44
44
|
Args:
|
45
45
|
file: The file to upload.
|
46
46
|
config: Configuration options for processing. Optional.
|
47
|
-
|
47
|
+
filename: The filename to use for the file. Optional.
|
48
|
+
|
48
49
|
Examples:
|
49
50
|
```python
|
50
51
|
# Upload from file path
|
@@ -58,7 +59,7 @@ class ChunkrBase(HeadersMixin):
|
|
58
59
|
await chunkr.upload("https://example.com/document.pdf")
|
59
60
|
|
60
61
|
# Upload from base64 string (must include MIME type header)
|
61
|
-
await chunkr.upload("data:application/pdf;base64,JVBERi0...")
|
62
|
+
await chunkr.upload("data:application/pdf;base64,JVBERi0...", filename="document.pdf")
|
62
63
|
|
63
64
|
# Upload an image
|
64
65
|
from PIL import Image
|
@@ -90,13 +91,14 @@ class ChunkrBase(HeadersMixin):
|
|
90
91
|
self,
|
91
92
|
file: Union[str, Path, BinaryIO, Image.Image],
|
92
93
|
config: Configuration = None,
|
94
|
+
filename: Optional[str] = None,
|
93
95
|
) -> TaskResponse:
|
94
96
|
"""Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`.
|
95
97
|
|
96
98
|
Args:
|
97
99
|
file: The file to upload.
|
98
100
|
config: Configuration options for processing. Optional.
|
99
|
-
|
101
|
+
filename: The filename to use for the file. Optional.
|
100
102
|
Examples:
|
101
103
|
```
|
102
104
|
# Upload from file path
|
@@ -110,7 +112,7 @@ class ChunkrBase(HeadersMixin):
|
|
110
112
|
task = await chunkr.create_task("https://example.com/document.pdf")
|
111
113
|
|
112
114
|
# Upload from base64 string (must include MIME type header)
|
113
|
-
task = await chunkr.create_task("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
|
115
|
+
task = await chunkr.create_task("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...", filename="document.pdf")
|
114
116
|
|
115
117
|
# Upload an image
|
116
118
|
from PIL import Image
|
chunkr_ai/api/misc.py
CHANGED
@@ -1,155 +1,103 @@
|
|
1
1
|
from .configuration import Configuration
|
2
|
+
import base64
|
2
3
|
import io
|
3
|
-
import json
|
4
4
|
from pathlib import Path
|
5
5
|
from PIL import Image
|
6
|
-
import httpx
|
7
6
|
from typing import Union, Tuple, BinaryIO, Optional
|
8
7
|
|
9
|
-
async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]
|
10
|
-
"""Convert various file types into a tuple of (filename, file
|
8
|
+
async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[Optional[str], str]:
|
9
|
+
"""Convert various file types into a tuple of (filename, file content).
|
11
10
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
Returns:
|
21
|
-
Tuple[str, BinaryIO]: (filename, file-like object) ready for upload
|
22
|
-
|
23
|
-
Raises:
|
24
|
-
FileNotFoundError: If the file path doesn't exist
|
25
|
-
TypeError: If the file type is not supported
|
26
|
-
ValueError: If the URL is invalid or unreachable
|
27
|
-
ValueError: If the MIME type is unsupported
|
28
|
-
"""
|
29
|
-
# Handle URLs
|
30
|
-
if isinstance(file, str) and (
|
31
|
-
file.startswith("http://") or file.startswith("https://")
|
32
|
-
):
|
33
|
-
if not client:
|
34
|
-
raise ValueError("Client must be provided to download files from URLs")
|
35
|
-
response = await client.get(file)
|
36
|
-
response.raise_for_status()
|
37
|
-
|
38
|
-
# Try to get filename from Content-Disposition header first
|
39
|
-
filename = None
|
40
|
-
content_disposition = response.headers.get("Content-Disposition")
|
41
|
-
if content_disposition and "filename=" in content_disposition:
|
42
|
-
filename = content_disposition.split("filename=")[-1].strip("\"'")
|
43
|
-
|
44
|
-
# If no Content-Disposition, try to get clean filename from URL path
|
45
|
-
if not filename:
|
46
|
-
from urllib.parse import urlparse, unquote
|
47
|
-
|
48
|
-
parsed_url = urlparse(file)
|
49
|
-
path = unquote(parsed_url.path)
|
50
|
-
filename = Path(path).name if path else None
|
51
|
-
|
52
|
-
# Fallback to default name if we couldn't extract one
|
53
|
-
filename = filename or "downloaded_file"
|
54
|
-
|
55
|
-
# Sanitize filename: remove invalid characters and limit length
|
56
|
-
import re
|
57
|
-
|
58
|
-
filename = re.sub(
|
59
|
-
r'[<>:"/\\|?*%]', "_", filename
|
60
|
-
) # Replace invalid chars with underscore
|
61
|
-
filename = re.sub(r"\s+", "_", filename) # Replace whitespace with underscore
|
62
|
-
filename = filename.strip("._") # Remove leading/trailing dots and underscores
|
63
|
-
filename = filename[:255] # Limit length to 255 characters
|
64
|
-
|
65
|
-
file_obj = io.BytesIO(response.content)
|
66
|
-
return filename, file_obj
|
11
|
+
Args:
|
12
|
+
file: Input file, can be:
|
13
|
+
- URL string starting with http:// or https://
|
14
|
+
- Base64 string
|
15
|
+
- Local file path (will be converted to base64)
|
16
|
+
- Opened binary file (will be converted to base64)
|
17
|
+
- PIL/Pillow Image object (will be converted to base64)
|
67
18
|
|
68
|
-
|
69
|
-
|
19
|
+
Returns:
|
20
|
+
Tuple[Optional[str], str]: (filename, content) where content is either a URL or base64 string
|
21
|
+
The filename may be None for URLs, base64 strings, and PIL Images
|
22
|
+
|
23
|
+
Raises:
|
24
|
+
FileNotFoundError: If the file path doesn't exist
|
25
|
+
TypeError: If the file type is not supported
|
26
|
+
ValueError: If the URL is invalid or unreachable
|
27
|
+
ValueError: If the MIME type is unsupported
|
28
|
+
"""
|
29
|
+
# Handle strings
|
30
|
+
if isinstance(file, str):
|
31
|
+
if file.startswith(('http://', 'https://')):
|
32
|
+
return None, file
|
70
33
|
try:
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
# Map MIME types to file extensions
|
83
|
-
mime_to_ext = {
|
84
|
-
"application/pdf": "pdf",
|
85
|
-
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
|
86
|
-
"application/msword": "doc",
|
87
|
-
"application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
|
88
|
-
"application/vnd.ms-powerpoint": "ppt",
|
89
|
-
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
|
90
|
-
"application/vnd.ms-excel": "xls",
|
91
|
-
"image/jpeg": "jpg",
|
92
|
-
"image/png": "png",
|
93
|
-
"image/jpg": "jpg",
|
94
|
-
}
|
95
|
-
|
96
|
-
if mime_type in mime_to_ext:
|
97
|
-
format = mime_to_ext[mime_type]
|
98
|
-
else:
|
99
|
-
raise ValueError(f"Unsupported MIME type: {mime_type}")
|
100
|
-
|
101
|
-
return f"file.{format}", file_obj
|
102
|
-
except Exception as e:
|
103
|
-
raise ValueError(f"Invalid base64 string: {str(e)}")
|
104
|
-
|
105
|
-
# Handle file paths
|
106
|
-
if isinstance(file, (str, Path)):
|
34
|
+
base64.b64decode(file)
|
35
|
+
return None, file
|
36
|
+
except:
|
37
|
+
try:
|
38
|
+
file = Path(file)
|
39
|
+
except:
|
40
|
+
raise ValueError("File must be a valid path, URL, or base64 string")
|
41
|
+
|
42
|
+
# Handle file paths - convert to base64
|
43
|
+
if isinstance(file, Path):
|
107
44
|
path = Path(file).resolve()
|
108
45
|
if not path.exists():
|
109
46
|
raise FileNotFoundError(f"File not found: {file}")
|
110
|
-
|
111
|
-
|
112
|
-
|
47
|
+
|
48
|
+
with open(path, "rb") as f:
|
49
|
+
file_content = f.read()
|
50
|
+
file_ext = path.suffix.lower().lstrip('.')
|
51
|
+
if not file_ext:
|
52
|
+
raise ValueError("File must have an extension")
|
53
|
+
base64_str = base64.b64encode(file_content).decode()
|
54
|
+
return path.name, base64_str
|
55
|
+
|
56
|
+
# Handle PIL Images - convert to base64
|
113
57
|
if isinstance(file, Image.Image):
|
114
58
|
img_byte_arr = io.BytesIO()
|
115
59
|
format = file.format or "PNG"
|
116
60
|
file.save(img_byte_arr, format=format)
|
117
61
|
img_byte_arr.seek(0)
|
118
|
-
|
62
|
+
base64_str = base64.b64encode(img_byte_arr.getvalue()).decode()
|
63
|
+
return None, base64_str
|
119
64
|
|
120
|
-
# Handle file-like objects
|
65
|
+
# Handle file-like objects - convert to base64
|
121
66
|
if hasattr(file, "read") and hasattr(file, "seek"):
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
)
|
126
|
-
|
67
|
+
file.seek(0)
|
68
|
+
file_content = file.read()
|
69
|
+
name = getattr(file, "name", "document")
|
70
|
+
file_ext = Path(name).suffix.lower().lstrip('.')
|
71
|
+
if not file_ext:
|
72
|
+
raise ValueError("File must have an extension")
|
73
|
+
base64_str = base64.b64encode(file_content).decode()
|
74
|
+
return Path(name).name, base64_str
|
127
75
|
|
128
76
|
raise TypeError(f"Unsupported file type: {type(file)}")
|
129
77
|
|
130
78
|
|
131
79
|
async def prepare_upload_data(
|
132
80
|
file: Optional[Union[str, Path, BinaryIO, Image.Image]] = None,
|
81
|
+
filename: Optional[str] = None,
|
133
82
|
config: Optional[Configuration] = None,
|
134
|
-
client: httpx.AsyncClient = None,
|
135
83
|
) -> dict:
|
136
|
-
"""Prepare
|
84
|
+
"""Prepare data dictionary for upload.
|
137
85
|
|
138
86
|
Args:
|
139
87
|
file: The file to upload
|
140
88
|
config: Optional configuration settings
|
89
|
+
client: HTTP client for downloading remote files
|
141
90
|
|
142
91
|
Returns:
|
143
|
-
dict:
|
92
|
+
dict: JSON-serializable data dictionary ready for upload
|
144
93
|
"""
|
145
|
-
|
94
|
+
data = {}
|
146
95
|
if file:
|
147
|
-
|
148
|
-
|
96
|
+
processed_filename, processed_file = await prepare_file(file)
|
97
|
+
data["file"] = processed_file
|
98
|
+
data["file_name"] = filename or processed_filename
|
149
99
|
|
150
100
|
if config:
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
return files
|
101
|
+
data.update(config.model_dump(mode="json", exclude_none=True))
|
102
|
+
|
103
|
+
return data
|
chunkr_ai/api/task_response.py
CHANGED
@@ -74,9 +74,11 @@ class TaskResponse(BaseModel, Generic[T]):
|
|
74
74
|
@retry_on_429()
|
75
75
|
async def update(self, config: Configuration) -> T:
|
76
76
|
"""Update the task configuration."""
|
77
|
-
|
77
|
+
data = await prepare_upload_data(None, None, config)
|
78
78
|
r = await self._client._client.patch(
|
79
|
-
self.task_url,
|
79
|
+
f"{self.task_url}/parse",
|
80
|
+
json=data,
|
81
|
+
headers=self._client._headers()
|
80
82
|
)
|
81
83
|
r.raise_for_status()
|
82
84
|
updated = TaskResponse(**r.json()).with_client(self._client)
|
@@ -0,0 +1,16 @@
|
|
1
|
+
chunkr_ai/__init__.py,sha256=6KpYv2lmD6S5z2kc9pqwuLP5VDHmOuu2qDZArUIhb1s,53
|
2
|
+
chunkr_ai/models.py,sha256=tOI7ylkhyeFfCLMisk96EPsH4UEcjBx1Mcisxc_AYXI,757
|
3
|
+
chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
|
+
chunkr_ai/api/auth.py,sha256=hlv0GiUmlsbFO1wLL9sslqOnsBSoBqkL_6Mk2SDvxgE,413
|
5
|
+
chunkr_ai/api/chunkr.py,sha256=BzwcKNCuLfVR-HzgY8tKStsW4pIDVVjBgnEqPLyUUMM,3292
|
6
|
+
chunkr_ai/api/chunkr_base.py,sha256=FDl0Ew8eOY4hur5FFqPENZiq9YQy0G3XWEqcKPeCO-U,6130
|
7
|
+
chunkr_ai/api/configuration.py,sha256=2Bfw_c8eQVijb0EvsexiuRbF1pZUspYFBMuZ-ErJHvs,3835
|
8
|
+
chunkr_ai/api/decorators.py,sha256=VJX4qGBIL00K2zY8bh5KAMWv7SltJ38TvPJH06FnFss,4415
|
9
|
+
chunkr_ai/api/misc.py,sha256=QN-2YWQ8e3VvvK63Ua-e8jsx6gxVxkO88Z96yWOofu0,3653
|
10
|
+
chunkr_ai/api/protocol.py,sha256=LjPrYSq52m1afIlAo0yVGXlGZxPRh8J6g7S4PAit3Zo,388
|
11
|
+
chunkr_ai/api/task_response.py,sha256=E1H5Cmn9GSYHX60f3Iz8hcTItPv6DpBEEO2vP2vcKDM,6282
|
12
|
+
chunkr_ai-0.0.37.dist-info/LICENSE,sha256=w3R12yNDyZpMiy2lxy_hvNbsldC75ww79sF0u11rkho,1069
|
13
|
+
chunkr_ai-0.0.37.dist-info/METADATA,sha256=eKMu3F3yYMO2v1i-PAgUKEz1LyU_ManhLOpZqY7RiPo,7031
|
14
|
+
chunkr_ai-0.0.37.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
15
|
+
chunkr_ai-0.0.37.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
|
16
|
+
chunkr_ai-0.0.37.dist-info/RECORD,,
|
@@ -1,16 +0,0 @@
|
|
1
|
-
chunkr_ai/__init__.py,sha256=6KpYv2lmD6S5z2kc9pqwuLP5VDHmOuu2qDZArUIhb1s,53
|
2
|
-
chunkr_ai/models.py,sha256=tOI7ylkhyeFfCLMisk96EPsH4UEcjBx1Mcisxc_AYXI,757
|
3
|
-
chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
|
-
chunkr_ai/api/auth.py,sha256=hlv0GiUmlsbFO1wLL9sslqOnsBSoBqkL_6Mk2SDvxgE,413
|
5
|
-
chunkr_ai/api/chunkr.py,sha256=VnbuAPlWLqyf8xCCU_kpdybgjVPTwZLarDQoD3uozY0,3065
|
6
|
-
chunkr_ai/api/chunkr_base.py,sha256=giW56fL7xxJphdOTpIH52dXxpNt7OdP8pNiPSqbNjGM,5835
|
7
|
-
chunkr_ai/api/configuration.py,sha256=2Bfw_c8eQVijb0EvsexiuRbF1pZUspYFBMuZ-ErJHvs,3835
|
8
|
-
chunkr_ai/api/decorators.py,sha256=VJX4qGBIL00K2zY8bh5KAMWv7SltJ38TvPJH06FnFss,4415
|
9
|
-
chunkr_ai/api/misc.py,sha256=gTL8UG_R6bunQdKSXwm_SpyIyTmLprzdX3re_X-mMto,5730
|
10
|
-
chunkr_ai/api/protocol.py,sha256=LjPrYSq52m1afIlAo0yVGXlGZxPRh8J6g7S4PAit3Zo,388
|
11
|
-
chunkr_ai/api/task_response.py,sha256=FC4OQUv4fltUij5OtFRlWRE9LxzRJGgBhh0olfHJBBg,6258
|
12
|
-
chunkr_ai-0.0.36.dist-info/LICENSE,sha256=w3R12yNDyZpMiy2lxy_hvNbsldC75ww79sF0u11rkho,1069
|
13
|
-
chunkr_ai-0.0.36.dist-info/METADATA,sha256=1hamwWrDvj0DirX84MAAbZs_yqSxmzGVR7mK7521HK0,7031
|
14
|
-
chunkr_ai-0.0.36.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
15
|
-
chunkr_ai-0.0.36.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
|
16
|
-
chunkr_ai-0.0.36.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|