chunkr-ai 0.0.22__tar.gz → 0.0.23__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {chunkr_ai-0.0.22/src/chunkr_ai.egg-info → chunkr_ai-0.0.23}/PKG-INFO +1 -1
- {chunkr_ai-0.0.22 → chunkr_ai-0.0.23}/pyproject.toml +1 -1
- {chunkr_ai-0.0.22 → chunkr_ai-0.0.23}/src/chunkr_ai/api/chunkr.py +2 -2
- {chunkr_ai-0.0.22 → chunkr_ai-0.0.23}/src/chunkr_ai/api/chunkr_base.py +1 -128
- {chunkr_ai-0.0.22 → chunkr_ai-0.0.23}/src/chunkr_ai/api/misc.py +27 -6
- {chunkr_ai-0.0.22 → chunkr_ai-0.0.23}/src/chunkr_ai/api/task_response.py +2 -4
- {chunkr_ai-0.0.22 → chunkr_ai-0.0.23/src/chunkr_ai.egg-info}/PKG-INFO +1 -1
- {chunkr_ai-0.0.22 → chunkr_ai-0.0.23}/LICENSE +0 -0
- {chunkr_ai-0.0.22 → chunkr_ai-0.0.23}/README.md +0 -0
- {chunkr_ai-0.0.22 → chunkr_ai-0.0.23}/setup.cfg +0 -0
- {chunkr_ai-0.0.22 → chunkr_ai-0.0.23}/src/chunkr_ai/__init__.py +0 -0
- {chunkr_ai-0.0.22 → chunkr_ai-0.0.23}/src/chunkr_ai/api/__init__.py +0 -0
- {chunkr_ai-0.0.22 → chunkr_ai-0.0.23}/src/chunkr_ai/api/api.py +0 -0
- {chunkr_ai-0.0.22 → chunkr_ai-0.0.23}/src/chunkr_ai/api/auth.py +0 -0
- {chunkr_ai-0.0.22 → chunkr_ai-0.0.23}/src/chunkr_ai/api/config.py +0 -0
- {chunkr_ai-0.0.22 → chunkr_ai-0.0.23}/src/chunkr_ai/api/decorators.py +0 -0
- {chunkr_ai-0.0.22 → chunkr_ai-0.0.23}/src/chunkr_ai/api/protocol.py +0 -0
- {chunkr_ai-0.0.22 → chunkr_ai-0.0.23}/src/chunkr_ai/models.py +0 -0
- {chunkr_ai-0.0.22 → chunkr_ai-0.0.23}/src/chunkr_ai.egg-info/SOURCES.txt +0 -0
- {chunkr_ai-0.0.22 → chunkr_ai-0.0.23}/src/chunkr_ai.egg-info/dependency_links.txt +0 -0
- {chunkr_ai-0.0.22 → chunkr_ai-0.0.23}/src/chunkr_ai.egg-info/requires.txt +0 -0
- {chunkr_ai-0.0.22 → chunkr_ai-0.0.23}/src/chunkr_ai.egg-info/top_level.txt +0 -0
- {chunkr_ai-0.0.22 → chunkr_ai-0.0.23}/tests/test_chunkr.py +0 -0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "chunkr-ai"
|
7
|
-
version = "0.0.22"
|
7
|
+
version = "0.0.23"
|
8
8
|
authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
|
9
9
|
description = "Python client for Chunkr: open source document intelligence"
|
10
10
|
readme = "README.md"
|
@@ -34,7 +34,7 @@ class Chunkr(ChunkrBase):
|
|
34
34
|
file: Union[str, Path, BinaryIO, Image.Image],
|
35
35
|
config: Configuration = None,
|
36
36
|
) -> TaskResponse:
|
37
|
-
files = prepare_upload_data(file, config)
|
37
|
+
files = await prepare_upload_data(file, config, self._client)
|
38
38
|
r = await self._client.post(
|
39
39
|
f"{self.url}/api/v1/task", files=files, headers=self._headers()
|
40
40
|
)
|
@@ -44,7 +44,7 @@ class Chunkr(ChunkrBase):
|
|
44
44
|
@anywhere()
|
45
45
|
@ensure_client()
|
46
46
|
async def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
|
47
|
-
files = prepare_upload_data(None, config)
|
47
|
+
files = await prepare_upload_data(None, config, self._client)
|
48
48
|
r = await self._client.patch(
|
49
49
|
f"{self.url}/api/v1/task/{task_id}",
|
50
50
|
files=files,
|
@@ -4,13 +4,10 @@ from .auth import HeadersMixin
|
|
4
4
|
from abc import abstractmethod
|
5
5
|
from dotenv import load_dotenv
|
6
6
|
import httpx
|
7
|
-
import io
|
8
|
-
import json
|
9
7
|
import os
|
10
8
|
from pathlib import Path
|
11
9
|
from PIL import Image
|
12
|
-
import requests
|
13
|
-
from typing import BinaryIO, Tuple, Union
|
10
|
+
from typing import BinaryIO, Union
|
14
11
|
|
15
12
|
|
16
13
|
class ChunkrBase(HeadersMixin):
|
@@ -28,130 +25,6 @@ class ChunkrBase(HeadersMixin):
|
|
28
25
|
self.url = self.url.rstrip("/")
|
29
26
|
self._client = httpx.AsyncClient()
|
30
27
|
|
31
|
-
def _prepare_file(
|
32
|
-
self, file: Union[str, Path, BinaryIO, Image.Image]
|
33
|
-
) -> Tuple[str, BinaryIO]:
|
34
|
-
"""Convert various file types into a tuple of (filename, file-like object).
|
35
|
-
|
36
|
-
Args:
|
37
|
-
file: Input file, can be:
|
38
|
-
- String or Path to a file
|
39
|
-
- URL string starting with http:// or https://
|
40
|
-
- Base64 string
|
41
|
-
- Opened binary file (mode='rb')
|
42
|
-
- PIL/Pillow Image object
|
43
|
-
|
44
|
-
Returns:
|
45
|
-
Tuple[str, BinaryIO]: (filename, file-like object) ready for upload
|
46
|
-
|
47
|
-
Raises:
|
48
|
-
FileNotFoundError: If the file path doesn't exist
|
49
|
-
TypeError: If the file type is not supported
|
50
|
-
ValueError: If the URL is invalid or unreachable
|
51
|
-
ValueError: If the MIME type is unsupported
|
52
|
-
"""
|
53
|
-
# Handle URLs
|
54
|
-
if isinstance(file, str) and (
|
55
|
-
file.startswith("http://") or file.startswith("https://")
|
56
|
-
):
|
57
|
-
response = requests.get(file)
|
58
|
-
response.raise_for_status()
|
59
|
-
file_obj = io.BytesIO(response.content)
|
60
|
-
filename = Path(file.split("/")[-1]).name or "downloaded_file"
|
61
|
-
return filename, file_obj
|
62
|
-
|
63
|
-
# Handle base64 strings
|
64
|
-
if isinstance(file, str) and "," in file and ";base64," in file:
|
65
|
-
try:
|
66
|
-
# Split header and data
|
67
|
-
header, base64_data = file.split(",", 1)
|
68
|
-
import base64
|
69
|
-
|
70
|
-
file_bytes = base64.b64decode(base64_data)
|
71
|
-
file_obj = io.BytesIO(file_bytes)
|
72
|
-
|
73
|
-
# Try to determine format from header
|
74
|
-
format = "bin"
|
75
|
-
mime_type = header.split(":")[-1].split(";")[0].lower()
|
76
|
-
|
77
|
-
# Map MIME types to file extensions
|
78
|
-
mime_to_ext = {
|
79
|
-
"application/pdf": "pdf",
|
80
|
-
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
|
81
|
-
"application/msword": "doc",
|
82
|
-
"application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
|
83
|
-
"application/vnd.ms-powerpoint": "ppt",
|
84
|
-
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
|
85
|
-
"application/vnd.ms-excel": "xls",
|
86
|
-
"image/jpeg": "jpg",
|
87
|
-
"image/png": "png",
|
88
|
-
"image/jpg": "jpg",
|
89
|
-
}
|
90
|
-
|
91
|
-
if mime_type in mime_to_ext:
|
92
|
-
format = mime_to_ext[mime_type]
|
93
|
-
else:
|
94
|
-
raise ValueError(f"Unsupported MIME type: {mime_type}")
|
95
|
-
|
96
|
-
return f"file.{format}", file_obj
|
97
|
-
except Exception as e:
|
98
|
-
raise ValueError(f"Invalid base64 string: {str(e)}")
|
99
|
-
|
100
|
-
# Handle file paths
|
101
|
-
if isinstance(file, (str, Path)):
|
102
|
-
path = Path(file).resolve()
|
103
|
-
if not path.exists():
|
104
|
-
raise FileNotFoundError(f"File not found: {file}")
|
105
|
-
return path.name, open(path, "rb")
|
106
|
-
|
107
|
-
# Handle PIL Images
|
108
|
-
if isinstance(file, Image.Image):
|
109
|
-
img_byte_arr = io.BytesIO()
|
110
|
-
format = file.format or "PNG"
|
111
|
-
file.save(img_byte_arr, format=format)
|
112
|
-
img_byte_arr.seek(0)
|
113
|
-
return f"image.{format.lower()}", img_byte_arr
|
114
|
-
|
115
|
-
# Handle file-like objects
|
116
|
-
if hasattr(file, "read") and hasattr(file, "seek"):
|
117
|
-
# Try to get the filename from the file object if possible
|
118
|
-
name = (
|
119
|
-
getattr(file, "name", "document")
|
120
|
-
if hasattr(file, "name")
|
121
|
-
else "document"
|
122
|
-
)
|
123
|
-
return Path(name).name, file
|
124
|
-
|
125
|
-
raise TypeError(f"Unsupported file type: {type(file)}")
|
126
|
-
|
127
|
-
def _prepare_upload_data(
|
128
|
-
self,
|
129
|
-
file: Union[str, Path, BinaryIO, Image.Image],
|
130
|
-
config: Configuration = None,
|
131
|
-
) -> Tuple[dict, dict]:
|
132
|
-
"""Prepare files and data dictionaries for upload.
|
133
|
-
|
134
|
-
Args:
|
135
|
-
file: The file to upload
|
136
|
-
config: Optional configuration settings
|
137
|
-
|
138
|
-
Returns:
|
139
|
-
Tuple[dict, dict]: (files dict, data dict) ready for upload
|
140
|
-
"""
|
141
|
-
filename, file_obj = self._prepare_file(file)
|
142
|
-
files = {"file": (filename, file_obj)}
|
143
|
-
data = {}
|
144
|
-
|
145
|
-
if config:
|
146
|
-
config_dict = config.model_dump(mode="json", exclude_none=True)
|
147
|
-
for key, value in config_dict.items():
|
148
|
-
if isinstance(value, dict):
|
149
|
-
files[key] = (None, json.dumps(value), "application/json")
|
150
|
-
else:
|
151
|
-
data[key] = value
|
152
|
-
|
153
|
-
return files, data
|
154
|
-
|
155
28
|
@abstractmethod
|
156
29
|
def upload(
|
157
30
|
self,
|
@@ -3,16 +3,36 @@ import io
|
|
3
3
|
import json
|
4
4
|
from pathlib import Path
|
5
5
|
from PIL import Image
|
6
|
-
import requests
|
6
|
+
import httpx
|
7
7
|
from typing import Union, Tuple, BinaryIO, Optional
|
8
8
|
|
9
|
-
def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[str, BinaryIO]:
|
10
|
-
"""Convert various file types into a tuple of (filename, file-like object).
|
9
|
+
async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image], client: httpx.AsyncClient = None) -> Tuple[str, BinaryIO]:
|
10
|
+
"""Convert various file types into a tuple of (filename, file-like object).
|
11
|
+
|
12
|
+
Args:
|
13
|
+
file: Input file, can be:
|
14
|
+
- String or Path to a file
|
15
|
+
- URL string starting with http:// or https://
|
16
|
+
- Base64 string
|
17
|
+
- Opened binary file (mode='rb')
|
18
|
+
- PIL/Pillow Image object
|
19
|
+
|
20
|
+
Returns:
|
21
|
+
Tuple[str, BinaryIO]: (filename, file-like object) ready for upload
|
22
|
+
|
23
|
+
Raises:
|
24
|
+
FileNotFoundError: If the file path doesn't exist
|
25
|
+
TypeError: If the file type is not supported
|
26
|
+
ValueError: If the URL is invalid or unreachable
|
27
|
+
ValueError: If the MIME type is unsupported
|
28
|
+
"""
|
11
29
|
# Handle URLs
|
12
30
|
if isinstance(file, str) and (
|
13
31
|
file.startswith("http://") or file.startswith("https://")
|
14
32
|
):
|
15
|
-
response = requests.get(file)
|
33
|
+
if not client:
|
34
|
+
raise ValueError("Client must be provided to download files from URLs")
|
35
|
+
response = client.get(file)
|
16
36
|
response.raise_for_status()
|
17
37
|
|
18
38
|
# Try to get filename from Content-Disposition header first
|
@@ -108,9 +128,10 @@ def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[str, Bi
|
|
108
128
|
raise TypeError(f"Unsupported file type: {type(file)}")
|
109
129
|
|
110
130
|
|
111
|
-
def prepare_upload_data(
|
131
|
+
async def prepare_upload_data(
|
112
132
|
file: Optional[Union[str, Path, BinaryIO, Image.Image]] = None,
|
113
133
|
config: Optional[Configuration] = None,
|
134
|
+
client: httpx.AsyncClient = None,
|
114
135
|
) -> dict:
|
115
136
|
"""Prepare files and data dictionaries for upload.
|
116
137
|
|
@@ -123,7 +144,7 @@ def prepare_upload_data(
|
|
123
144
|
"""
|
124
145
|
files = {}
|
125
146
|
if file:
|
126
|
-
filename, file_obj = prepare_file(file)
|
147
|
+
filename, file_obj = await prepare_file(file, client)
|
127
148
|
files = {"file": (filename, file_obj)}
|
128
149
|
|
129
150
|
if config:
|
@@ -35,10 +35,9 @@ class TaskResponse(BaseModel, Generic[T]):
|
|
35
35
|
return self
|
36
36
|
return None
|
37
37
|
|
38
|
+
@require_task()
|
38
39
|
async def _poll_request(self) -> dict:
|
39
40
|
try:
|
40
|
-
if not self._client._client:
|
41
|
-
raise ValueError("Client not found")
|
42
41
|
r = await self._client._client.get(
|
43
42
|
self.task_url, headers=self._client._headers()
|
44
43
|
)
|
@@ -51,7 +50,6 @@ class TaskResponse(BaseModel, Generic[T]):
|
|
51
50
|
raise
|
52
51
|
|
53
52
|
@anywhere()
|
54
|
-
@require_task()
|
55
53
|
async def poll(self) -> T:
|
56
54
|
"""Poll the task for completion."""
|
57
55
|
while True:
|
@@ -66,7 +64,7 @@ class TaskResponse(BaseModel, Generic[T]):
|
|
66
64
|
@require_task()
|
67
65
|
async def update(self, config: Configuration) -> T:
|
68
66
|
"""Update the task configuration."""
|
69
|
-
f = prepare_upload_data(None, config)
|
67
|
+
f = await prepare_upload_data(None, config, self._client._client)
|
70
68
|
r = await self._client._client.patch(
|
71
69
|
self.task_url, files=f, headers=self._client._headers()
|
72
70
|
)
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|