chunkr-ai 0.0.6__py3-none-any.whl → 0.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunkr_ai/api/base.py +34 -122
- chunkr_ai/api/chunkr.py +63 -4
- chunkr_ai/api/chunkr_async.py +43 -4
- chunkr_ai/api/config.py +24 -24
- chunkr_ai/api/misc.py +106 -0
- chunkr_ai/api/task.py +65 -11
- chunkr_ai/main.py +12 -0
- chunkr_ai/models.py +0 -1
- {chunkr_ai-0.0.6.dist-info → chunkr_ai-0.0.8.dist-info}/METADATA +1 -1
- chunkr_ai-0.0.8.dist-info/RECORD +18 -0
- chunkr_ai-0.0.6.dist-info/RECORD +0 -17
- {chunkr_ai-0.0.6.dist-info → chunkr_ai-0.0.8.dist-info}/LICENSE +0 -0
- {chunkr_ai-0.0.6.dist-info → chunkr_ai-0.0.8.dist-info}/WHEEL +0 -0
- {chunkr_ai-0.0.6.dist-info → chunkr_ai-0.0.8.dist-info}/top_level.txt +0 -0
chunkr_ai/api/base.py
CHANGED
@@ -3,13 +3,10 @@ from .task import TaskResponse
|
|
3
3
|
from .auth import HeadersMixin
|
4
4
|
from abc import abstractmethod
|
5
5
|
from dotenv import load_dotenv
|
6
|
-
import io
|
7
|
-
import json
|
8
6
|
import os
|
9
7
|
from pathlib import Path
|
10
8
|
from PIL import Image
|
11
|
-
import
|
12
|
-
from typing import BinaryIO, Tuple, Union
|
9
|
+
from typing import BinaryIO, Union
|
13
10
|
|
14
11
|
class ChunkrBase(HeadersMixin):
|
15
12
|
"""Base class with shared functionality for Chunkr API clients."""
|
@@ -30,140 +27,38 @@ class ChunkrBase(HeadersMixin):
|
|
30
27
|
|
31
28
|
self.url = self.url.rstrip("/")
|
32
29
|
|
33
|
-
|
34
|
-
|
35
|
-
file
|
36
|
-
) -> Tuple[str, BinaryIO]:
|
37
|
-
"""Convert various file types into a tuple of (filename, file-like object).
|
38
|
-
|
39
|
-
Args:
|
40
|
-
file: Input file, can be:
|
41
|
-
- String or Path to a file
|
42
|
-
- URL string starting with http:// or https://
|
43
|
-
- Base64 string
|
44
|
-
- Opened binary file (mode='rb')
|
45
|
-
- PIL/Pillow Image object
|
46
|
-
|
47
|
-
Returns:
|
48
|
-
Tuple[str, BinaryIO]: (filename, file-like object) ready for upload
|
49
|
-
|
50
|
-
Raises:
|
51
|
-
FileNotFoundError: If the file path doesn't exist
|
52
|
-
TypeError: If the file type is not supported
|
53
|
-
ValueError: If the URL is invalid or unreachable
|
54
|
-
ValueError: If the MIME type is unsupported
|
55
|
-
"""
|
56
|
-
# Handle URLs
|
57
|
-
if isinstance(file, str) and (file.startswith('http://') or file.startswith('https://')):
|
58
|
-
response = requests.get(file)
|
59
|
-
response.raise_for_status()
|
60
|
-
file_obj = io.BytesIO(response.content)
|
61
|
-
filename = Path(file.split('/')[-1]).name or 'downloaded_file'
|
62
|
-
return filename, file_obj
|
63
|
-
|
64
|
-
# Handle base64 strings
|
65
|
-
if isinstance(file, str) and ',' in file and ';base64,' in file:
|
66
|
-
try:
|
67
|
-
# Split header and data
|
68
|
-
header, base64_data = file.split(',', 1)
|
69
|
-
import base64
|
70
|
-
file_bytes = base64.b64decode(base64_data)
|
71
|
-
file_obj = io.BytesIO(file_bytes)
|
72
|
-
|
73
|
-
# Try to determine format from header
|
74
|
-
format = 'bin'
|
75
|
-
mime_type = header.split(':')[-1].split(';')[0].lower()
|
76
|
-
|
77
|
-
# Map MIME types to file extensions
|
78
|
-
mime_to_ext = {
|
79
|
-
'application/pdf': 'pdf',
|
80
|
-
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
|
81
|
-
'application/msword': 'doc',
|
82
|
-
'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
|
83
|
-
'application/vnd.ms-powerpoint': 'ppt',
|
84
|
-
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
|
85
|
-
'application/vnd.ms-excel': 'xls',
|
86
|
-
'image/jpeg': 'jpg',
|
87
|
-
'image/png': 'png',
|
88
|
-
'image/jpg': 'jpg'
|
89
|
-
}
|
90
|
-
|
91
|
-
if mime_type in mime_to_ext:
|
92
|
-
format = mime_to_ext[mime_type]
|
93
|
-
else:
|
94
|
-
raise ValueError(f"Unsupported MIME type: {mime_type}")
|
95
|
-
|
96
|
-
return f"file.{format}", file_obj
|
97
|
-
except Exception as e:
|
98
|
-
raise ValueError(f"Invalid base64 string: {str(e)}")
|
99
|
-
|
100
|
-
# Handle file paths
|
101
|
-
if isinstance(file, (str, Path)):
|
102
|
-
path = Path(file).resolve()
|
103
|
-
if not path.exists():
|
104
|
-
raise FileNotFoundError(f"File not found: {file}")
|
105
|
-
return path.name, open(path, 'rb')
|
106
|
-
|
107
|
-
# Handle PIL Images
|
108
|
-
if isinstance(file, Image.Image):
|
109
|
-
img_byte_arr = io.BytesIO()
|
110
|
-
format = file.format or 'PNG'
|
111
|
-
file.save(img_byte_arr, format=format)
|
112
|
-
img_byte_arr.seek(0)
|
113
|
-
return f"image.{format.lower()}", img_byte_arr
|
114
|
-
|
115
|
-
# Handle file-like objects
|
116
|
-
if hasattr(file, 'read') and hasattr(file, 'seek'):
|
117
|
-
# Try to get the filename from the file object if possible
|
118
|
-
name = getattr(file, 'name', 'document') if hasattr(file, 'name') else 'document'
|
119
|
-
return Path(name).name, file
|
120
|
-
|
121
|
-
raise TypeError(f"Unsupported file type: {type(file)}")
|
122
|
-
|
123
|
-
def _prepare_upload_data(
|
124
|
-
self,
|
125
|
-
file: Union[str, Path, BinaryIO, Image.Image],
|
126
|
-
config: Configuration = None
|
127
|
-
) -> Tuple[dict, dict]:
|
128
|
-
"""Prepare files and data dictionaries for upload.
|
30
|
+
@abstractmethod
|
31
|
+
def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
|
32
|
+
"""Upload a file and wait for processing to complete.
|
129
33
|
|
130
|
-
|
131
|
-
file: The file to upload
|
132
|
-
config: Optional configuration settings
|
133
|
-
|
134
|
-
Returns:
|
135
|
-
Tuple[dict, dict]: (files dict, data dict) ready for upload
|
34
|
+
Must be implemented by subclasses.
|
136
35
|
"""
|
137
|
-
|
138
|
-
files = {"file": (filename, file_obj)}
|
139
|
-
data = {}
|
140
|
-
|
141
|
-
if config:
|
142
|
-
config_dict = config.model_dump(mode="json", exclude_none=True)
|
143
|
-
for key, value in config_dict.items():
|
144
|
-
if isinstance(value, dict):
|
145
|
-
files[key] = (None, json.dumps(value), 'application/json')
|
146
|
-
else:
|
147
|
-
data[key] = value
|
148
|
-
|
149
|
-
return files, data
|
36
|
+
pass
|
150
37
|
|
151
38
|
@abstractmethod
|
152
|
-
def
|
153
|
-
"""
|
39
|
+
def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
|
40
|
+
"""Update a task by its ID.
|
154
41
|
|
155
42
|
Must be implemented by subclasses.
|
156
43
|
"""
|
157
44
|
pass
|
158
45
|
|
159
46
|
@abstractmethod
|
160
|
-
def
|
47
|
+
def create_task(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
|
161
48
|
"""Upload a file for processing and immediately return the task response.
|
162
49
|
|
163
50
|
Must be implemented by subclasses.
|
164
51
|
"""
|
165
52
|
pass
|
166
53
|
|
54
|
+
@abstractmethod
|
55
|
+
def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
|
56
|
+
"""Update a task by its ID.
|
57
|
+
|
58
|
+
Must be implemented by subclasses.
|
59
|
+
"""
|
60
|
+
pass
|
61
|
+
|
167
62
|
@abstractmethod
|
168
63
|
def get_task(self, task_id: str) -> TaskResponse:
|
169
64
|
"""Get a task response by its ID.
|
@@ -171,3 +66,20 @@ class ChunkrBase(HeadersMixin):
|
|
171
66
|
Must be implemented by subclasses.
|
172
67
|
"""
|
173
68
|
pass
|
69
|
+
|
70
|
+
@abstractmethod
|
71
|
+
def delete_task(self, task_id: str) -> None:
|
72
|
+
"""Delete a task by its ID.
|
73
|
+
|
74
|
+
Must be implemented by subclasses.
|
75
|
+
"""
|
76
|
+
pass
|
77
|
+
|
78
|
+
@abstractmethod
|
79
|
+
def cancel_task(self, task_id: str) -> None:
|
80
|
+
"""Cancel a task by its ID.
|
81
|
+
|
82
|
+
Must be implemented by subclasses.
|
83
|
+
"""
|
84
|
+
pass
|
85
|
+
|
chunkr_ai/api/chunkr.py
CHANGED
@@ -5,6 +5,7 @@ from pathlib import Path
|
|
5
5
|
from PIL import Image
|
6
6
|
import requests
|
7
7
|
from typing import Union, BinaryIO
|
8
|
+
from .misc import prepare_upload_data
|
8
9
|
|
9
10
|
class Chunkr(ChunkrBase):
|
10
11
|
"""Chunkr API client"""
|
@@ -43,10 +44,23 @@ class Chunkr(ChunkrBase):
|
|
43
44
|
Returns:
|
44
45
|
TaskResponse: The completed task response
|
45
46
|
"""
|
46
|
-
task = self.
|
47
|
+
task = self.create_task(file, config)
|
47
48
|
return task.poll()
|
49
|
+
|
50
|
+
def update(self, task_id: str, config: Configuration) -> TaskResponse:
|
51
|
+
"""Update a task by its ID and wait for processing to complete.
|
52
|
+
|
53
|
+
Args:
|
54
|
+
task_id: The ID of the task to update
|
55
|
+
config: Configuration options for processing. Optional.
|
48
56
|
|
49
|
-
|
57
|
+
Returns:
|
58
|
+
TaskResponse: The updated task response
|
59
|
+
"""
|
60
|
+
task = self.update_task(task_id, config)
|
61
|
+
return task.poll()
|
62
|
+
|
63
|
+
def create_task(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
|
50
64
|
"""Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`
|
51
65
|
|
52
66
|
Args:
|
@@ -80,16 +94,35 @@ class Chunkr(ChunkrBase):
|
|
80
94
|
Returns:
|
81
95
|
TaskResponse: The initial task response
|
82
96
|
"""
|
83
|
-
files
|
97
|
+
files= prepare_upload_data(file, config)
|
84
98
|
r = self._session.post(
|
85
99
|
f"{self.url}/api/v1/task",
|
86
100
|
files=files,
|
87
|
-
data=data,
|
88
101
|
headers=self._headers()
|
89
102
|
)
|
90
103
|
r.raise_for_status()
|
91
104
|
return TaskResponse(**r.json()).with_client(self)
|
105
|
+
|
106
|
+
def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
|
107
|
+
"""Update a task by its ID.
|
108
|
+
|
109
|
+
Args:
|
110
|
+
task_id: The ID of the task to update
|
111
|
+
config: The new configuration to use
|
92
112
|
|
113
|
+
Returns:
|
114
|
+
TaskResponse: The updated task response
|
115
|
+
"""
|
116
|
+
files = prepare_upload_data(None, config)
|
117
|
+
r = self._session.patch(
|
118
|
+
f"{self.url}/api/v1/task/{task_id}",
|
119
|
+
files=files,
|
120
|
+
headers=self._headers()
|
121
|
+
)
|
122
|
+
|
123
|
+
r.raise_for_status()
|
124
|
+
return TaskResponse(**r.json()).with_client(self)
|
125
|
+
|
93
126
|
def get_task(self, task_id: str) -> TaskResponse:
|
94
127
|
"""Get a task response by its ID.
|
95
128
|
|
@@ -106,3 +139,29 @@ class Chunkr(ChunkrBase):
|
|
106
139
|
r.raise_for_status()
|
107
140
|
return TaskResponse(**r.json()).with_client(self)
|
108
141
|
|
142
|
+
|
143
|
+
def delete_task(self, task_id: str) -> None:
|
144
|
+
"""Delete a task by its ID.
|
145
|
+
|
146
|
+
Args:
|
147
|
+
task_id: The ID of the task to delete
|
148
|
+
"""
|
149
|
+
r = self._session.delete(
|
150
|
+
f"{self.url}/api/v1/task/{task_id}",
|
151
|
+
headers=self._headers()
|
152
|
+
)
|
153
|
+
r.raise_for_status()
|
154
|
+
|
155
|
+
def cancel_task(self, task_id: str) -> None:
|
156
|
+
"""Cancel a task by its ID.
|
157
|
+
|
158
|
+
Args:
|
159
|
+
task_id: The ID of the task to cancel
|
160
|
+
"""
|
161
|
+
r = self._session.get(
|
162
|
+
f"{self.url}/api/v1/task/{task_id}/cancel",
|
163
|
+
headers=self._headers()
|
164
|
+
)
|
165
|
+
r.raise_for_status()
|
166
|
+
|
167
|
+
|
chunkr_ai/api/chunkr_async.py
CHANGED
@@ -5,6 +5,7 @@ import httpx
|
|
5
5
|
from pathlib import Path
|
6
6
|
from PIL import Image
|
7
7
|
from typing import Union, BinaryIO
|
8
|
+
from .misc import prepare_upload_data
|
8
9
|
|
9
10
|
class ChunkrAsync(ChunkrBase):
|
10
11
|
"""Asynchronous Chunkr API client"""
|
@@ -43,10 +44,23 @@ class ChunkrAsync(ChunkrBase):
|
|
43
44
|
Returns:
|
44
45
|
TaskResponse: The completed task response
|
45
46
|
"""
|
46
|
-
task = await self.
|
47
|
+
task = await self.create_task(file, config)
|
47
48
|
return await task.poll_async()
|
49
|
+
|
50
|
+
async def update(self, task_id: str, config: Configuration) -> TaskResponse:
|
51
|
+
"""Update a task by its ID and wait for processing to complete.
|
52
|
+
|
53
|
+
Args:
|
54
|
+
task_id: The ID of the task to update
|
55
|
+
config: Configuration options for processing. Optional.
|
48
56
|
|
49
|
-
|
57
|
+
Returns:
|
58
|
+
TaskResponse: The updated task response
|
59
|
+
"""
|
60
|
+
task = await self.update_task(task_id, config)
|
61
|
+
return await task.poll_async()
|
62
|
+
|
63
|
+
async def create_task(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
|
50
64
|
"""Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll_async()`.
|
51
65
|
|
52
66
|
Args:
|
@@ -80,16 +94,26 @@ class ChunkrAsync(ChunkrBase):
|
|
80
94
|
Returns:
|
81
95
|
TaskResponse: The initial task response
|
82
96
|
"""
|
83
|
-
files
|
97
|
+
files = prepare_upload_data(file, config)
|
84
98
|
r = await self._client.post(
|
85
99
|
f"{self.url}/api/v1/task",
|
86
100
|
files=files,
|
87
|
-
json=config.model_dump() if config else {},
|
88
101
|
headers=self._headers()
|
89
102
|
)
|
90
103
|
r.raise_for_status()
|
91
104
|
return TaskResponse(**r.json()).with_client(self)
|
92
105
|
|
106
|
+
async def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
|
107
|
+
files = prepare_upload_data(None, config)
|
108
|
+
r = await self._client.patch(
|
109
|
+
f"{self.url}/api/v1/task/{task_id}",
|
110
|
+
files=files,
|
111
|
+
headers=self._headers()
|
112
|
+
)
|
113
|
+
|
114
|
+
r.raise_for_status()
|
115
|
+
return TaskResponse(**r.json()).with_client(self)
|
116
|
+
|
93
117
|
async def get_task(self, task_id: str) -> TaskResponse:
|
94
118
|
r = await self._client.get(
|
95
119
|
f"{self.url}/api/v1/task/{task_id}",
|
@@ -97,7 +121,22 @@ class ChunkrAsync(ChunkrBase):
|
|
97
121
|
)
|
98
122
|
r.raise_for_status()
|
99
123
|
return TaskResponse(**r.json()).with_client(self)
|
124
|
+
|
125
|
+
async def delete_task(self, task_id: str) -> None:
|
126
|
+
r = await self._client.delete(
|
127
|
+
f"{self.url}/api/v1/task/{task_id}",
|
128
|
+
headers=self._headers()
|
129
|
+
)
|
130
|
+
r.raise_for_status()
|
131
|
+
|
132
|
+
async def cancel_task(self, task_id: str) -> None:
|
133
|
+
r = await self._client.get(
|
134
|
+
f"{self.url}/api/v1/task/{task_id}/cancel",
|
135
|
+
headers=self._headers()
|
136
|
+
)
|
137
|
+
r.raise_for_status()
|
100
138
|
|
139
|
+
|
101
140
|
async def __aenter__(self):
|
102
141
|
return self
|
103
142
|
|
chunkr_ai/api/config.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
from pydantic import BaseModel, Field, model_validator
|
1
|
+
from pydantic import BaseModel, Field, model_validator, ConfigDict
|
2
2
|
from enum import Enum
|
3
3
|
from typing import Optional, List, Dict
|
4
4
|
|
@@ -10,30 +10,30 @@ class CroppingStrategy(str, Enum):
|
|
10
10
|
ALL = "All"
|
11
11
|
AUTO = "Auto"
|
12
12
|
|
13
|
-
class LlmConfig(BaseModel):
|
14
|
-
model: str
|
15
|
-
prompt: str
|
16
|
-
temperature: float = 0.0
|
17
|
-
|
18
13
|
class GenerationConfig(BaseModel):
|
19
14
|
html: Optional[GenerationStrategy] = None
|
20
|
-
llm: Optional[
|
15
|
+
llm: Optional[str] = None
|
21
16
|
markdown: Optional[GenerationStrategy] = None
|
22
17
|
crop_image: Optional[CroppingStrategy] = None
|
23
18
|
|
24
19
|
class SegmentProcessing(BaseModel):
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
20
|
+
model_config = ConfigDict(
|
21
|
+
populate_by_name=True,
|
22
|
+
alias_generator=str.title
|
23
|
+
)
|
24
|
+
|
25
|
+
title: Optional[GenerationConfig] = Field(default=None, alias="Title")
|
26
|
+
section_header: Optional[GenerationConfig] = Field(default=None, alias="SectionHeader")
|
27
|
+
text: Optional[GenerationConfig] = Field(default=None, alias="Text")
|
28
|
+
list_item: Optional[GenerationConfig] = Field(default=None, alias="ListItem")
|
29
|
+
table: Optional[GenerationConfig] = Field(default=None, alias="Table")
|
30
|
+
picture: Optional[GenerationConfig] = Field(default=None, alias="Picture")
|
31
|
+
caption: Optional[GenerationConfig] = Field(default=None, alias="Caption")
|
32
|
+
formula: Optional[GenerationConfig] = Field(default=None, alias="Formula")
|
33
|
+
footnote: Optional[GenerationConfig] = Field(default=None, alias="Footnote")
|
34
|
+
page_header: Optional[GenerationConfig] = Field(default=None, alias="PageHeader")
|
35
|
+
page_footer: Optional[GenerationConfig] = Field(default=None, alias="PageFooter")
|
36
|
+
page: Optional[GenerationConfig] = Field(default=None, alias="Page")
|
37
37
|
|
38
38
|
class ChunkProcessing(BaseModel):
|
39
39
|
target_length: Optional[int] = None
|
@@ -86,9 +86,9 @@ class Segment(BaseModel):
|
|
86
86
|
bbox: BoundingBox
|
87
87
|
content: str
|
88
88
|
page_height: float
|
89
|
-
html: Optional[str]
|
90
|
-
image: Optional[str]
|
91
|
-
markdown: Optional[str]
|
89
|
+
html: Optional[str] = None
|
90
|
+
image: Optional[str] = None
|
91
|
+
markdown: Optional[str] = None
|
92
92
|
ocr: List[OCRResult]
|
93
93
|
page_number: int
|
94
94
|
page_width: float
|
@@ -104,8 +104,8 @@ class ExtractedJson(BaseModel):
|
|
104
104
|
data: Dict
|
105
105
|
|
106
106
|
class OutputResponse(BaseModel):
|
107
|
-
chunks: List[Chunk]
|
108
|
-
extracted_json: Optional[ExtractedJson]
|
107
|
+
chunks: List[Chunk]
|
108
|
+
extracted_json: Optional[ExtractedJson] = Field(default=None)
|
109
109
|
|
110
110
|
class Model(str, Enum):
|
111
111
|
FAST = "Fast"
|
chunkr_ai/api/misc.py
ADDED
@@ -0,0 +1,106 @@
|
|
1
|
+
import io
|
2
|
+
import json
|
3
|
+
from pathlib import Path
|
4
|
+
from PIL import Image
|
5
|
+
import requests
|
6
|
+
from typing import Union, Tuple, BinaryIO, Optional
|
7
|
+
from .config import Configuration
|
8
|
+
|
9
|
+
|
10
|
+
def prepare_file(
|
11
|
+
file: Union[str, Path, BinaryIO, Image.Image]
|
12
|
+
) -> Tuple[str, BinaryIO]:
|
13
|
+
"""Convert various file types into a tuple of (filename, file-like object)."""
|
14
|
+
# Handle URLs
|
15
|
+
if isinstance(file, str) and (file.startswith('http://') or file.startswith('https://')):
|
16
|
+
response = requests.get(file)
|
17
|
+
response.raise_for_status()
|
18
|
+
file_obj = io.BytesIO(response.content)
|
19
|
+
filename = Path(file.split('/')[-1]).name or 'downloaded_file'
|
20
|
+
return filename, file_obj
|
21
|
+
|
22
|
+
# Handle base64 strings
|
23
|
+
if isinstance(file, str) and ',' in file and ';base64,' in file:
|
24
|
+
try:
|
25
|
+
# Split header and data
|
26
|
+
header, base64_data = file.split(',', 1)
|
27
|
+
import base64
|
28
|
+
file_bytes = base64.b64decode(base64_data)
|
29
|
+
file_obj = io.BytesIO(file_bytes)
|
30
|
+
|
31
|
+
# Try to determine format from header
|
32
|
+
format = 'bin'
|
33
|
+
mime_type = header.split(':')[-1].split(';')[0].lower()
|
34
|
+
|
35
|
+
# Map MIME types to file extensions
|
36
|
+
mime_to_ext = {
|
37
|
+
'application/pdf': 'pdf',
|
38
|
+
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
|
39
|
+
'application/msword': 'doc',
|
40
|
+
'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
|
41
|
+
'application/vnd.ms-powerpoint': 'ppt',
|
42
|
+
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
|
43
|
+
'application/vnd.ms-excel': 'xls',
|
44
|
+
'image/jpeg': 'jpg',
|
45
|
+
'image/png': 'png',
|
46
|
+
'image/jpg': 'jpg'
|
47
|
+
}
|
48
|
+
|
49
|
+
if mime_type in mime_to_ext:
|
50
|
+
format = mime_to_ext[mime_type]
|
51
|
+
else:
|
52
|
+
raise ValueError(f"Unsupported MIME type: {mime_type}")
|
53
|
+
|
54
|
+
return f"file.{format}", file_obj
|
55
|
+
except Exception as e:
|
56
|
+
raise ValueError(f"Invalid base64 string: {str(e)}")
|
57
|
+
|
58
|
+
# Handle file paths
|
59
|
+
if isinstance(file, (str, Path)):
|
60
|
+
path = Path(file).resolve()
|
61
|
+
if not path.exists():
|
62
|
+
raise FileNotFoundError(f"File not found: {file}")
|
63
|
+
return path.name, open(path, 'rb')
|
64
|
+
|
65
|
+
# Handle PIL Images
|
66
|
+
if isinstance(file, Image.Image):
|
67
|
+
img_byte_arr = io.BytesIO()
|
68
|
+
format = file.format or 'PNG'
|
69
|
+
file.save(img_byte_arr, format=format)
|
70
|
+
img_byte_arr.seek(0)
|
71
|
+
return f"image.{format.lower()}", img_byte_arr
|
72
|
+
|
73
|
+
# Handle file-like objects
|
74
|
+
if hasattr(file, 'read') and hasattr(file, 'seek'):
|
75
|
+
# Try to get the filename from the file object if possible
|
76
|
+
name = getattr(file, 'name', 'document') if hasattr(file, 'name') else 'document'
|
77
|
+
return Path(name).name, file
|
78
|
+
|
79
|
+
raise TypeError(f"Unsupported file type: {type(file)}")
|
80
|
+
|
81
|
+
|
82
|
+
|
83
|
+
def prepare_upload_data(
|
84
|
+
file: Optional[Union[str, Path, BinaryIO, Image.Image]] = None,
|
85
|
+
config: Optional[Configuration] = None
|
86
|
+
) -> dict:
|
87
|
+
"""Prepare files and data dictionaries for upload.
|
88
|
+
|
89
|
+
Args:
|
90
|
+
file: The file to upload
|
91
|
+
config: Optional configuration settings
|
92
|
+
|
93
|
+
Returns:
|
94
|
+
dict: (files dict) ready for upload
|
95
|
+
"""
|
96
|
+
files = {}
|
97
|
+
if file:
|
98
|
+
filename, file_obj = prepare_file(file)
|
99
|
+
files = {"file": (filename, file_obj)}
|
100
|
+
|
101
|
+
if config:
|
102
|
+
config_dict = config.model_dump(mode="json", exclude_none=True)
|
103
|
+
for key, value in config_dict.items():
|
104
|
+
files[key] = (None, json.dumps(value), 'application/json')
|
105
|
+
|
106
|
+
return files
|
chunkr_ai/api/task.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
from .protocol import ChunkrClientProtocol
|
2
2
|
from .config import Configuration, OutputResponse
|
3
|
+
from .misc import prepare_upload_data
|
3
4
|
import asyncio
|
4
5
|
from datetime import datetime
|
5
6
|
from enum import Enum
|
@@ -12,22 +13,23 @@ class Status(str, Enum):
|
|
12
13
|
PROCESSING = "Processing"
|
13
14
|
SUCCEEDED = "Succeeded"
|
14
15
|
FAILED = "Failed"
|
16
|
+
CANCELLED = "Cancelled"
|
15
17
|
|
16
18
|
class TaskResponse(BaseModel):
|
17
19
|
configuration: Configuration
|
18
20
|
created_at: datetime
|
19
|
-
expires_at: Optional[datetime]
|
20
|
-
file_name: Optional[str]
|
21
|
-
finished_at: Optional[datetime]
|
22
|
-
input_file_url: Optional[str]
|
21
|
+
expires_at: Optional[datetime] = None
|
22
|
+
file_name: Optional[str] = None
|
23
|
+
finished_at: Optional[datetime] = None
|
24
|
+
input_file_url: Optional[str] = None
|
23
25
|
message: str
|
24
|
-
output: Optional[OutputResponse]
|
25
|
-
page_count: Optional[int]
|
26
|
-
pdf_url: Optional[str]
|
27
|
-
started_at: Optional[datetime]
|
26
|
+
output: Optional[OutputResponse] = None
|
27
|
+
page_count: Optional[int] = None
|
28
|
+
pdf_url: Optional[str] = None
|
29
|
+
started_at: Optional[datetime] = None
|
28
30
|
status: Status
|
29
31
|
task_id: str
|
30
|
-
task_url: Optional[str]
|
32
|
+
task_url: Optional[str] = None
|
31
33
|
_client: Optional[Union[ChunkrClientProtocol]] = PrivateAttr(default=None)
|
32
34
|
|
33
35
|
def with_client(self, client: Union[ChunkrClientProtocol]) -> 'TaskResponse':
|
@@ -79,7 +81,8 @@ class TaskResponse(BaseModel):
|
|
79
81
|
"""Poll the task for completion."""
|
80
82
|
while True:
|
81
83
|
response = self._poll_request_sync()
|
82
|
-
|
84
|
+
updated_task = TaskResponse(**response).with_client(self._client)
|
85
|
+
self.__dict__.update(updated_task.__dict__)
|
83
86
|
|
84
87
|
if result := self._check_status():
|
85
88
|
return result
|
@@ -90,7 +93,8 @@ class TaskResponse(BaseModel):
|
|
90
93
|
"""Poll the task for completion asynchronously."""
|
91
94
|
while True:
|
92
95
|
response = await self._poll_request_async()
|
93
|
-
|
96
|
+
updated_task = TaskResponse(**response).with_client(self._client)
|
97
|
+
self.__dict__.update(updated_task.__dict__)
|
94
98
|
|
95
99
|
if result := self._check_status():
|
96
100
|
return result
|
@@ -108,6 +112,56 @@ class TaskResponse(BaseModel):
|
|
108
112
|
if content:
|
109
113
|
parts.append(content)
|
110
114
|
return "\n".join(parts)
|
115
|
+
|
116
|
+
def update(self, config: Configuration) -> 'TaskResponse':
|
117
|
+
files = prepare_upload_data(None, config)
|
118
|
+
r = self._client._session.patch(
|
119
|
+
f"{self.task_url}",
|
120
|
+
files=files,
|
121
|
+
headers=self._client._headers()
|
122
|
+
)
|
123
|
+
r.raise_for_status()
|
124
|
+
return TaskResponse(**r.json()).with_client(self._client)
|
125
|
+
|
126
|
+
async def update_async(self, config: Configuration) -> 'TaskResponse':
|
127
|
+
files = prepare_upload_data(None, config)
|
128
|
+
r = await self._client._client.patch(
|
129
|
+
f"{self.task_url}",
|
130
|
+
files=files,
|
131
|
+
headers=self._client._headers()
|
132
|
+
)
|
133
|
+
r.raise_for_status()
|
134
|
+
return TaskResponse(**r.json()).with_client(self._client)
|
135
|
+
|
136
|
+
def cancel(self):
|
137
|
+
r = self._client._session.get(
|
138
|
+
f"{self.task_url}/cancel",
|
139
|
+
headers=self._client._headers()
|
140
|
+
)
|
141
|
+
r.raise_for_status()
|
142
|
+
self.poll()
|
143
|
+
|
144
|
+
async def cancel_async(self):
|
145
|
+
r = await self._client._client.get(
|
146
|
+
f"{self.task_url}/cancel",
|
147
|
+
headers=self._client._headers()
|
148
|
+
)
|
149
|
+
r.raise_for_status()
|
150
|
+
await self.poll_async()
|
151
|
+
|
152
|
+
def delete(self):
|
153
|
+
r = self._client._session.delete(
|
154
|
+
f"{self.task_url}",
|
155
|
+
headers=self._client._headers()
|
156
|
+
)
|
157
|
+
r.raise_for_status()
|
158
|
+
|
159
|
+
async def delete_async(self):
|
160
|
+
r = await self._client._client.delete(
|
161
|
+
f"{self.task_url}",
|
162
|
+
headers=self._client._headers()
|
163
|
+
)
|
164
|
+
r.raise_for_status()
|
111
165
|
|
112
166
|
def html(self) -> str:
|
113
167
|
"""Get full HTML for the task"""
|
chunkr_ai/main.py
CHANGED
@@ -0,0 +1,12 @@
|
|
1
|
+
from chunkr_ai.api.chunkr import Chunkr
|
2
|
+
from chunkr_ai.models import Configuration
|
3
|
+
from chunkr_ai.api.config import SegmentationStrategy, ChunkProcessing
|
4
|
+
|
5
|
+
if __name__ == "__main__":
|
6
|
+
chunkr = Chunkr()
|
7
|
+
task = chunkr.update_task("556b4fe5-e3f7-48dc-9f56-0fb7fbacdb87", Configuration(
|
8
|
+
chunk_processing=ChunkProcessing(
|
9
|
+
target_length=1000
|
10
|
+
)
|
11
|
+
))
|
12
|
+
print(task)
|
chunkr_ai/models.py
CHANGED
@@ -0,0 +1,18 @@
|
|
1
|
+
chunkr_ai/__init__.py,sha256=eXygrEhGxxIHXNYIlHF2eied8rGsx2RphgR8Wo4lRyo,110
|
2
|
+
chunkr_ai/main.py,sha256=_MT1lcnNiXjVW9ZkZYl28SB_f6M9g_IOgZxvhodTzAo,394
|
3
|
+
chunkr_ai/models.py,sha256=T8_F-Y1US21ZJVzLIaroqp-Hd0_ZFbdkbEOxr63-PNE,827
|
4
|
+
chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
+
chunkr_ai/api/api.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
+
chunkr_ai/api/auth.py,sha256=iSd5Jek2BFaHGw9HY-RrqgwP56BHFU0xbSuJS4fU6AA,425
|
7
|
+
chunkr_ai/api/base.py,sha256=IYO0pmoL02GchIggj6_Q5nvtAUoOvYAAvT7VLFU6scY,2506
|
8
|
+
chunkr_ai/api/chunkr.py,sha256=PmrK37HbK2T1KUPitKnt4wZqIujL61Jo12qW9DEpNMI,5186
|
9
|
+
chunkr_ai/api/chunkr_async.py,sha256=2yYyAO9-j2xKQYH0fJb2S6gL26hgbtL4QyqlG9l0QBY,4893
|
10
|
+
chunkr_ai/api/config.py,sha256=XIqXZ_8q7U_BEmY5wyIC9mbQGZBw1956EN9yhC4svD0,4235
|
11
|
+
chunkr_ai/api/misc.py,sha256=tScsUUcrqeVh_bZv1YlbmjGkQSTDQN8NyKxoNwAG6XA,3792
|
12
|
+
chunkr_ai/api/protocol.py,sha256=XKS9RmtvBpJItYhPg18qlOCKpaSHdOuQTRSUxAdUz2g,479
|
13
|
+
chunkr_ai/api/task.py,sha256=EB6RK8ms7EaNj57tNJZoNgNMHGWKXFhkQ1WC7gk5ht4,6059
|
14
|
+
chunkr_ai-0.0.8.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
15
|
+
chunkr_ai-0.0.8.dist-info/METADATA,sha256=tL3OZfFIRsgfIKoDYWAS89bZw48_0C8cdqHJ6_GrT7A,4844
|
16
|
+
chunkr_ai-0.0.8.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
17
|
+
chunkr_ai-0.0.8.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
|
18
|
+
chunkr_ai-0.0.8.dist-info/RECORD,,
|
chunkr_ai-0.0.6.dist-info/RECORD
DELETED
@@ -1,17 +0,0 @@
|
|
1
|
-
chunkr_ai/__init__.py,sha256=eXygrEhGxxIHXNYIlHF2eied8rGsx2RphgR8Wo4lRyo,110
|
2
|
-
chunkr_ai/main.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
-
chunkr_ai/models.py,sha256=kNeYtBO4TFvQWKFCent7tLEQjyKlVUieKNiuTt3u564,842
|
4
|
-
chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
-
chunkr_ai/api/api.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
-
chunkr_ai/api/auth.py,sha256=iSd5Jek2BFaHGw9HY-RrqgwP56BHFU0xbSuJS4fU6AA,425
|
7
|
-
chunkr_ai/api/base.py,sha256=WDHx8tU0fl9_-yvYTKL-U0uaxHv-8_bRfiw9Xkl-mWM,6499
|
8
|
-
chunkr_ai/api/chunkr.py,sha256=LkBFzGB_T0y3fnBeIn_nwQW6Mb7eZO-iTlzWrmWBoko,3450
|
9
|
-
chunkr_ai/api/chunkr_async.py,sha256=B9deRVoe4h3Csh_jEuQxuxQ-DKSuZPdwkanFTyfHmeM,3603
|
10
|
-
chunkr_ai/api/config.py,sha256=K0s1giImciPksu-bO9gzRwUaK2Vo1nxNKQkXlRQ2cb8,3785
|
11
|
-
chunkr_ai/api/protocol.py,sha256=XKS9RmtvBpJItYhPg18qlOCKpaSHdOuQTRSUxAdUz2g,479
|
12
|
-
chunkr_ai/api/task.py,sha256=_WOGRirlLEow_wS9kJB_dNYb2RvYE9nlu7Spq16AhME,4172
|
13
|
-
chunkr_ai-0.0.6.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
14
|
-
chunkr_ai-0.0.6.dist-info/METADATA,sha256=TuBBU6n1g7kdLVky2vAx94TFWZVyu8PqQ_47vi6tN5E,4844
|
15
|
-
chunkr_ai-0.0.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
16
|
-
chunkr_ai-0.0.6.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
|
17
|
-
chunkr_ai-0.0.6.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|