chunkr-ai 0.0.2__py3-none-any.whl → 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunkr_ai/api/__init__.py +0 -0
- chunkr_ai/api/auth.py +0 -2
- chunkr_ai/api/base.py +173 -0
- chunkr_ai/api/chunkr.py +69 -86
- chunkr_ai/api/chunkr_async.py +93 -27
- chunkr_ai/api/config.py +131 -0
- chunkr_ai/api/protocol.py +19 -0
- chunkr_ai/api/task.py +131 -0
- chunkr_ai/models.py +48 -0
- chunkr_ai-0.0.4.dist-info/METADATA +204 -0
- chunkr_ai-0.0.4.dist-info/RECORD +17 -0
- chunkr_ai/api/models.py +0 -231
- chunkr_ai-0.0.2.dist-info/METADATA +0 -16
- chunkr_ai-0.0.2.dist-info/RECORD +0 -12
- {chunkr_ai-0.0.2.dist-info → chunkr_ai-0.0.4.dist-info}/LICENSE +0 -0
- {chunkr_ai-0.0.2.dist-info → chunkr_ai-0.0.4.dist-info}/WHEEL +0 -0
- {chunkr_ai-0.0.2.dist-info → chunkr_ai-0.0.4.dist-info}/top_level.txt +0 -0
File without changes
|
chunkr_ai/api/auth.py
CHANGED
chunkr_ai/api/base.py
ADDED
@@ -0,0 +1,173 @@
|
|
1
|
+
from .config import Configuration
|
2
|
+
from .task import TaskResponse
|
3
|
+
from .auth import HeadersMixin
|
4
|
+
from abc import abstractmethod
|
5
|
+
from dotenv import load_dotenv
|
6
|
+
import io
|
7
|
+
import json
|
8
|
+
import os
|
9
|
+
from pathlib import Path
|
10
|
+
from PIL import Image
|
11
|
+
import requests
|
12
|
+
from typing import BinaryIO, Tuple, Union
|
13
|
+
|
14
|
+
class ChunkrBase(HeadersMixin):
|
15
|
+
"""Base class with shared functionality for Chunkr API clients."""
|
16
|
+
|
17
|
+
def __init__(self, url: str = None, api_key: str = None):
    """Resolve the API endpoint and API key for this client.

    ``load_dotenv()`` is called first, so values from a ``.env`` file are
    visible through ``os.getenv``.

    Args:
        url: Base URL of the API. Falls back to the ``CHUNKR_URL``
            environment variable, then to ``https://api.chunkr.ai``.
        api_key: API key. Falls back to the ``CHUNKR_API_KEY``
            environment variable.

    Raises:
        ValueError: If no API key is found in either place.
    """
    load_dotenv()
    self.url = url or os.getenv('CHUNKR_URL') or 'https://api.chunkr.ai'
    self._api_key = api_key or os.getenv('CHUNKR_API_KEY')
    if not self._api_key:
        raise ValueError("API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai")

    # Endpoints are built as f"{self.url}/api/v1/...", so drop trailing slashes.
    self.url = self.url.rstrip("/")
|
32
|
+
|
33
|
+
def _prepare_file(
    self,
    file: Union[str, Path, BinaryIO, Image.Image]
) -> Tuple[str, BinaryIO]:
    """Convert various file types into a tuple of (filename, file-like object).

    Args:
        file: Input file, can be:
            - String or Path to a file
            - URL string starting with http:// or https://
            - Base64 data-URI string (must contain a ``;base64,`` marker)
            - Opened binary file (mode='rb')
            - PIL/Pillow Image object

    Returns:
        Tuple[str, BinaryIO]: (filename, file-like object) ready for upload

    Raises:
        FileNotFoundError: If the file path doesn't exist
        TypeError: If the file type is not supported
        ValueError: If the base64 payload cannot be decoded
        ValueError: If the MIME type is unsupported
        requests.exceptions.RequestException: Raised by ``requests`` when a
            URL cannot be fetched or returns an error status; it is not
            wrapped.
    """
    # Function-scope imports keep this block self-contained.
    import base64
    from urllib.parse import unquote, urlparse

    # Handle URLs
    if isinstance(file, str) and file.startswith(('http://', 'https://')):
        # NOTE(review): no timeout is passed, so a stalled server blocks
        # this call indefinitely - consider adding one.
        response = requests.get(file)
        response.raise_for_status()
        # Name the upload after the URL *path* only, so a query string or
        # fragment ("?token=...", "#page=2") never ends up in the filename.
        filename = Path(unquote(urlparse(file).path)).name or 'downloaded_file'
        return filename, io.BytesIO(response.content)

    # Handle base64 strings
    if isinstance(file, str) and ';base64,' in file:
        # Split header and data
        header, base64_data = file.split(',', 1)
        mime_type = header.split(':')[-1].split(';')[0].lower()

        # Map MIME types to file extensions
        mime_to_ext = {
            'application/pdf': 'pdf',
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
            'application/msword': 'doc',
            'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
            'application/vnd.ms-powerpoint': 'ppt',
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
            'application/vnd.ms-excel': 'xls',
            'image/jpeg': 'jpg',
            'image/png': 'png',
            'image/jpg': 'jpg',
        }

        # Checked outside the decode ``try`` so this error is not re-labelled
        # as an "Invalid base64 string".
        extension = mime_to_ext.get(mime_type)
        if extension is None:
            raise ValueError(f"Unsupported MIME type: {mime_type}")

        try:
            file_bytes = base64.b64decode(base64_data)
        except ValueError as e:  # binascii.Error is a ValueError subclass
            raise ValueError(f"Invalid base64 string: {str(e)}") from e

        return f"file.{extension}", io.BytesIO(file_bytes)

    # Handle file paths
    if isinstance(file, (str, Path)):
        path = Path(file).resolve()
        if not path.exists():
            raise FileNotFoundError(f"File not found: {file}")
        # The handle is returned open; this method never closes it.
        return path.name, open(path, 'rb')

    # Handle PIL Images
    if isinstance(file, Image.Image):
        buffer = io.BytesIO()
        image_format = file.format or 'PNG'
        file.save(buffer, format=image_format)
        buffer.seek(0)
        return f"image.{image_format.lower()}", buffer

    # Handle file-like objects
    if hasattr(file, 'read') and hasattr(file, 'seek'):
        # ``name`` may be missing, or not path-like (e.g. an int file
        # descriptor), in which case a generic name is used instead.
        name = getattr(file, 'name', None)
        if not isinstance(name, (str, os.PathLike)):
            name = 'document'
        return Path(name).name or 'document', file

    raise TypeError(f"Unsupported file type: {type(file)}")
|
122
|
+
|
123
|
+
def _prepare_upload_data(
    self,
    file: Union[str, Path, BinaryIO, Image.Image],
    config: Configuration = None
) -> Tuple[dict, dict]:
    """Prepare files and data dictionaries for upload.

    Args:
        file: The file to upload
        config: Optional configuration settings

    Returns:
        Tuple[dict, dict]: (files dict, data dict) ready for upload
    """
    filename, file_obj = self._prepare_file(file)
    files = {"file": (filename, file_obj)}
    data = {}

    if config:
        # exclude_none drops every option the caller left unset.
        config_dict = config.model_dump(mode="json", exclude_none=True)
        for key, value in config_dict.items():
            if isinstance(value, dict):
                # Nested settings go out as their own JSON-encoded part
                # (no filename, explicit content type).
                files[key] = (None, json.dumps(value), 'application/json')
            else:
                # Scalar settings go out as plain form fields.
                data[key] = value

    return files, data
|
150
|
+
|
151
|
+
@abstractmethod
def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
    """Upload a file and wait for processing to complete.

    Must be implemented by subclasses.

    Args:
        file: The file to upload.
        config: Optional processing configuration.

    Returns:
        TaskResponse: The task response produced by the subclass.
    """
    # NOTE(review): @abstractmethod is only enforced when the class uses
    # ABCMeta; whether HeadersMixin provides it is not visible here.
|
158
|
+
|
159
|
+
@abstractmethod
def start_upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
    """Upload a file for processing and immediately return the task response.

    Must be implemented by subclasses.

    Args:
        file: The file to upload.
        config: Optional processing configuration.

    Returns:
        TaskResponse: The task response produced by the subclass.
    """
    # NOTE(review): @abstractmethod is only enforced when the class uses
    # ABCMeta; whether HeadersMixin provides it is not visible here.
|
166
|
+
|
167
|
+
@abstractmethod
def get_task(self, task_id: str) -> TaskResponse:
    """Get a task response by its ID.

    Must be implemented by subclasses.

    Args:
        task_id: The ID of the task to get.

    Returns:
        TaskResponse: The task response produced by the subclass.
    """
    # NOTE(review): @abstractmethod is only enforced when the class uses
    # ABCMeta; whether HeadersMixin provides it is not visible here.
|
chunkr_ai/api/chunkr.py
CHANGED
@@ -1,125 +1,108 @@
|
|
1
|
-
from .
|
2
|
-
from .
|
3
|
-
from
|
4
|
-
import io
|
5
|
-
import os
|
1
|
+
from .base import ChunkrBase
|
2
|
+
from .config import Configuration
|
3
|
+
from .task import TaskResponse
|
6
4
|
from pathlib import Path
|
7
5
|
from PIL import Image
|
8
6
|
import requests
|
9
|
-
from typing import Union, BinaryIO
|
7
|
+
from typing import Union, BinaryIO
|
10
8
|
|
11
|
-
class Chunkr(
|
12
|
-
"""
|
9
|
+
class Chunkr(ChunkrBase):
|
10
|
+
"""Chunkr API client"""
|
13
11
|
|
14
12
|
def __init__(self, url: str = None, api_key: str = None):
|
15
|
-
|
16
|
-
self.
|
17
|
-
url or
|
18
|
-
os.getenv('CHUNKR_URL') or
|
19
|
-
'https://api.chunkr.ai'
|
20
|
-
)
|
21
|
-
self._api_key = (
|
22
|
-
api_key or
|
23
|
-
os.getenv('CHUNKR_API_KEY')
|
24
|
-
)
|
25
|
-
if not self._api_key:
|
26
|
-
raise ValueError("API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai")
|
27
|
-
|
28
|
-
self.url = self.url.rstrip("/")
|
13
|
+
super().__init__(url, api_key)
|
14
|
+
self._session = requests.Session()
|
29
15
|
|
30
|
-
def
|
31
|
-
|
32
|
-
file: Union[str, BinaryIO, Image.Image, bytes, io.BytesIO]
|
33
|
-
) -> Tuple[str, BinaryIO]:
|
34
|
-
"""Convert various file types into a tuple of (filename, file-like object).
|
16
|
+
def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
|
17
|
+
"""Upload a file and wait for processing to complete.
|
35
18
|
|
36
19
|
Args:
|
37
|
-
file:
|
20
|
+
file: The file to upload.
|
21
|
+
config: Configuration options for processing. Optional.
|
38
22
|
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
path = Path(file).resolve()
|
44
|
-
if not path.exists():
|
45
|
-
raise FileNotFoundError(f"File not found: {file}")
|
46
|
-
return path.name, path.open("rb")
|
47
|
-
elif isinstance(file, Image.Image):
|
48
|
-
img_byte_arr = io.BytesIO()
|
49
|
-
file.save(img_byte_arr, format=file.format or 'PNG')
|
50
|
-
img_byte_arr.seek(0)
|
51
|
-
return "image.png", img_byte_arr
|
52
|
-
elif isinstance(file, bytes):
|
53
|
-
return "document", io.BytesIO(file)
|
54
|
-
elif isinstance(file, io.BytesIO):
|
55
|
-
return "document", file
|
56
|
-
else:
|
57
|
-
return "document", file
|
58
|
-
|
59
|
-
def upload(self, file: Union[str, BinaryIO, Image.Image, bytes, io.BytesIO], config: Configuration = None) -> TaskResponse:
|
60
|
-
"""Upload a file and wait for processing to complete.
|
23
|
+
Examples:
|
24
|
+
```
|
25
|
+
# Upload from file path
|
26
|
+
chunkr.upload("document.pdf")
|
61
27
|
|
62
|
-
|
63
|
-
|
64
|
-
- BinaryIO: A file-like object (e.g., opened with 'rb' mode)
|
65
|
-
- Image.Image: A PIL/Pillow Image object
|
66
|
-
- bytes: Raw binary data
|
67
|
-
- io.BytesIO: A binary stream in memory
|
28
|
+
# Upload from URL
|
29
|
+
chunkr.upload("https://example.com/document.pdf")
|
68
30
|
|
69
|
-
|
70
|
-
|
71
|
-
config:
|
72
|
-
Configuration options for processing. Optional.
|
31
|
+
# Upload from base64 string (must include MIME type header)
|
32
|
+
chunkr.upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
|
73
33
|
|
34
|
+
# Upload from opened file
|
35
|
+
with open("document.pdf", "rb") as f:
|
36
|
+
chunkr.upload(f)
|
37
|
+
|
38
|
+
# Upload an image
|
39
|
+
from PIL import Image
|
40
|
+
img = Image.open("photo.jpg")
|
41
|
+
chunkr.upload(img)
|
42
|
+
```
|
74
43
|
Returns:
|
75
44
|
TaskResponse: The completed task response
|
76
45
|
"""
|
77
|
-
|
46
|
+
task = self.start_upload(file, config)
|
47
|
+
return task.poll()
|
78
48
|
|
79
|
-
def start_upload(self, file: Union[str, BinaryIO, Image.Image
|
49
|
+
def start_upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
|
80
50
|
"""Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`
|
81
51
|
|
82
|
-
The file can be one of:
|
83
|
-
- str: Path to a file on disk
|
84
|
-
- BinaryIO: A file-like object (e.g., opened with 'rb' mode)
|
85
|
-
- Image.Image: A PIL/Pillow Image object
|
86
|
-
- bytes: Raw binary data
|
87
|
-
- io.BytesIO: A binary stream in memory
|
88
|
-
|
89
52
|
Args:
|
90
53
|
file: The file to upload.
|
91
|
-
config
|
54
|
+
config: Configuration options for processing. Optional.
|
55
|
+
|
56
|
+
Examples:
|
57
|
+
```
|
58
|
+
# Upload from file path
|
59
|
+
task = chunkr.start_upload("document.pdf")
|
60
|
+
|
61
|
+
# Upload from opened file
|
62
|
+
with open("document.pdf", "rb") as f:
|
63
|
+
task = chunkr.start_upload(f)
|
64
|
+
|
65
|
+
# Upload from URL
|
66
|
+
task = chunkr.start_upload("https://example.com/document.pdf")
|
67
|
+
|
68
|
+
# Upload from base64 string (must include MIME type header)
|
69
|
+
task = chunkr.start_upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
|
70
|
+
|
71
|
+
# Upload an image
|
72
|
+
from PIL import Image
|
73
|
+
img = Image.open("photo.jpg")
|
74
|
+
task = chunkr.start_upload(img)
|
75
|
+
|
76
|
+
# Wait for the task to complete - this can be done when needed
|
77
|
+
task.poll()
|
78
|
+
```
|
92
79
|
|
93
80
|
Returns:
|
94
81
|
TaskResponse: The initial task response
|
95
|
-
|
96
|
-
Raises:
|
97
|
-
requests.exceptions.HTTPError: If the API request fails
|
98
82
|
"""
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
url,
|
105
|
-
files=files,
|
106
|
-
json=config.dict() if config else {},
|
83
|
+
files, data = self._prepare_upload_data(file, config)
|
84
|
+
r = self._session.post(
|
85
|
+
f"{self.url}/api/v1/task",
|
86
|
+
files=files,
|
87
|
+
data=data,
|
107
88
|
headers=self._headers()
|
108
89
|
)
|
109
90
|
r.raise_for_status()
|
110
|
-
return TaskResponse(**r.json()).
|
91
|
+
return TaskResponse(**r.json()).with_client(self)
|
111
92
|
|
112
93
|
def get_task(self, task_id: str) -> TaskResponse:
|
113
94
|
"""Get a task response by its ID.
|
114
95
|
|
115
96
|
Args:
|
116
|
-
task_id
|
97
|
+
task_id: The ID of the task to get
|
117
98
|
|
118
99
|
Returns:
|
119
100
|
TaskResponse: The task response
|
120
101
|
"""
|
121
|
-
|
122
|
-
|
102
|
+
r = self._session.get(
|
103
|
+
f"{self.url}/api/v1/task/{task_id}",
|
104
|
+
headers=self._headers()
|
105
|
+
)
|
123
106
|
r.raise_for_status()
|
124
|
-
return TaskResponse(**r.json()).
|
107
|
+
return TaskResponse(**r.json()).with_client(self)
|
125
108
|
|
chunkr_ai/api/chunkr_async.py
CHANGED
@@ -1,39 +1,105 @@
|
|
1
|
-
from .
|
2
|
-
from .
|
1
|
+
from .base import ChunkrBase
|
2
|
+
from .task import TaskResponse
|
3
|
+
from .config import Configuration
|
3
4
|
import httpx
|
4
|
-
import
|
5
|
+
from pathlib import Path
|
5
6
|
from PIL import Image
|
6
7
|
from typing import Union, BinaryIO
|
7
8
|
|
8
|
-
class ChunkrAsync(
|
9
|
-
"""
|
9
|
+
class ChunkrAsync(ChunkrBase):
|
10
|
+
"""Asynchronous Chunkr API client"""
|
10
11
|
|
11
|
-
|
12
|
-
|
12
|
+
def __init__(self, url: str = None, api_key: str = None):
    """Set up the shared base state, then create the async HTTP client.

    Args:
        url: Base URL of the API; forwarded to the base initialiser.
        api_key: API key; forwarded to the base initialiser.
    """
    super().__init__(url, api_key)
    # One AsyncClient per instance, reused by every request method.
    self._client = httpx.AsyncClient()
|
13
15
|
|
14
|
-
async def upload(self, file: Union[str, BinaryIO, Image.Image
|
16
|
+
async def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
    """Upload a file and wait for processing to complete.

    Args:
        file: The file to upload.
        config: Configuration options for processing. Optional.

    Examples:
    ```python
    # Upload from file path
    await chunkr.upload("document.pdf")

    # Upload from opened file
    with open("document.pdf", "rb") as f:
        await chunkr.upload(f)

    # Upload from URL
    await chunkr.upload("https://example.com/document.pdf")

    # Upload from base64 string (must include MIME type header)
    await chunkr.upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")

    # Upload an image
    from PIL import Image
    img = Image.open("photo.jpg")
    await chunkr.upload(img)
    ```
    Returns:
        TaskResponse: The completed task response
    """
    # Start the task, then poll it until it reaches a final status.
    task = await self.start_upload(file, config)
    return await task.poll_async()
|
17
48
|
|
18
|
-
async def start_upload(self, file: Union[str, BinaryIO, Image.Image
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
49
|
+
async def start_upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
    """Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll_async()`.

    Args:
        file: The file to upload.
        config: Configuration options for processing. Optional.

    Examples:
    ```
    # Upload from file path
    task = await chunkr.start_upload("document.pdf")

    # Upload from opened file
    with open("document.pdf", "rb") as f:
        task = await chunkr.start_upload(f)

    # Upload from URL
    task = await chunkr.start_upload("https://example.com/document.pdf")

    # Upload from base64 string (must include MIME type header)
    task = await chunkr.start_upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")

    # Upload an image
    from PIL import Image
    img = Image.open("photo.jpg")
    task = await chunkr.start_upload(img)

    # Wait for the task to complete - this can be done when needed
    await task.poll_async()
    ```

    Returns:
        TaskResponse: The initial task response
    """
    files, data = self._prepare_upload_data(file, config)
    r = await self._client.post(
        f"{self.url}/api/v1/task",
        files=files,
        # Scalar settings must travel as form fields: httpx ignores ``json=``
        # when ``files=`` is given, which silently dropped them before.
        data=data,
        headers=self._headers()
    )
    r.raise_for_status()
    return TaskResponse(**r.json()).with_client(self)
|
31
92
|
|
32
93
|
async def get_task(self, task_id: str) -> TaskResponse:
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
94
|
+
r = await self._client.get(
|
95
|
+
f"{self.url}/api/v1/task/{task_id}",
|
96
|
+
headers=self._headers()
|
97
|
+
)
|
98
|
+
r.raise_for_status()
|
99
|
+
return TaskResponse(**r.json()).with_client(self)
|
100
|
+
|
101
|
+
async def __aenter__(self):
    """Enter ``async with`` and return this client unchanged."""
    return self
|
38
103
|
|
39
|
-
|
104
|
+
async def __aexit__(self, exc_type, exc_val, exc_tb):
    """Close the underlying HTTP client when the ``async with`` block ends.

    Returns None, so an exception raised inside the block is not suppressed.
    """
    await self._client.aclose()
|
chunkr_ai/api/config.py
ADDED
@@ -0,0 +1,131 @@
|
|
1
|
+
from pydantic import BaseModel, Field, model_validator
|
2
|
+
from enum import Enum
|
3
|
+
from typing import Optional, List, Dict
|
4
|
+
|
5
|
+
class GenerationStrategy(str, Enum):
|
6
|
+
LLM = "LLM"
|
7
|
+
AUTO = "Auto"
|
8
|
+
|
9
|
+
class CroppingStrategy(str, Enum):
|
10
|
+
ALL = "All"
|
11
|
+
AUTO = "Auto"
|
12
|
+
|
13
|
+
class LlmConfig(BaseModel):
|
14
|
+
model: str
|
15
|
+
prompt: str
|
16
|
+
temperature: float = 0.0
|
17
|
+
|
18
|
+
class GenerationConfig(BaseModel):
|
19
|
+
html: Optional[GenerationStrategy] = None
|
20
|
+
llm: Optional[LlmConfig] = None
|
21
|
+
markdown: Optional[GenerationStrategy] = None
|
22
|
+
crop_image: Optional[CroppingStrategy] = None
|
23
|
+
|
24
|
+
class SegmentProcessing(BaseModel):
|
25
|
+
title: Optional[GenerationConfig] = None
|
26
|
+
section_header: Optional[GenerationConfig] = None
|
27
|
+
text: Optional[GenerationConfig] = None
|
28
|
+
list_item: Optional[GenerationConfig] = None
|
29
|
+
table: Optional[GenerationConfig] = None
|
30
|
+
picture: Optional[GenerationConfig] = None
|
31
|
+
caption: Optional[GenerationConfig] = None
|
32
|
+
formula: Optional[GenerationConfig] = None
|
33
|
+
footnote: Optional[GenerationConfig] = None
|
34
|
+
page_header: Optional[GenerationConfig] = None
|
35
|
+
page_footer: Optional[GenerationConfig] = None
|
36
|
+
page: Optional[GenerationConfig] = None
|
37
|
+
|
38
|
+
class ChunkProcessing(BaseModel):
|
39
|
+
target_length: Optional[int] = None
|
40
|
+
|
41
|
+
class Property(BaseModel):
|
42
|
+
name: str
|
43
|
+
title: Optional[str] = None
|
44
|
+
prop_type: str
|
45
|
+
description: Optional[str] = None
|
46
|
+
default: Optional[str] = None
|
47
|
+
|
48
|
+
class JsonSchema(BaseModel):
|
49
|
+
title: str
|
50
|
+
properties: List[Property]
|
51
|
+
|
52
|
+
class OcrStrategy(str, Enum):
|
53
|
+
ALL = "All"
|
54
|
+
AUTO = "Auto"
|
55
|
+
|
56
|
+
class SegmentationStrategy(str, Enum):
|
57
|
+
LAYOUT_ANALYSIS = "LayoutAnalysis"
|
58
|
+
PAGE = "Page"
|
59
|
+
|
60
|
+
class BoundingBox(BaseModel):
|
61
|
+
left: float
|
62
|
+
top: float
|
63
|
+
width: float
|
64
|
+
height: float
|
65
|
+
|
66
|
+
class OCRResult(BaseModel):
|
67
|
+
bbox: BoundingBox
|
68
|
+
text: str
|
69
|
+
confidence: Optional[float]
|
70
|
+
|
71
|
+
class SegmentType(str, Enum):
|
72
|
+
CAPTION = "Caption"
|
73
|
+
FOOTNOTE = "Footnote"
|
74
|
+
FORMULA = "Formula"
|
75
|
+
LIST_ITEM = "ListItem"
|
76
|
+
PAGE = "Page"
|
77
|
+
PAGE_FOOTER = "PageFooter"
|
78
|
+
PAGE_HEADER = "PageHeader"
|
79
|
+
PICTURE = "Picture"
|
80
|
+
SECTION_HEADER = "SectionHeader"
|
81
|
+
TABLE = "Table"
|
82
|
+
TEXT = "Text"
|
83
|
+
TITLE = "Title"
|
84
|
+
|
85
|
+
class Segment(BaseModel):
|
86
|
+
bbox: BoundingBox
|
87
|
+
content: str
|
88
|
+
page_height: float
|
89
|
+
html: Optional[str]
|
90
|
+
image: Optional[str]
|
91
|
+
markdown: Optional[str]
|
92
|
+
ocr: List[OCRResult]
|
93
|
+
page_number: int
|
94
|
+
page_width: float
|
95
|
+
segment_id: str
|
96
|
+
segment_type: SegmentType
|
97
|
+
|
98
|
+
class Chunk(BaseModel):
|
99
|
+
chunk_id: str
|
100
|
+
chunk_length: int
|
101
|
+
segments: List[Segment]
|
102
|
+
|
103
|
+
class ExtractedJson(BaseModel):
|
104
|
+
data: Dict
|
105
|
+
|
106
|
+
class OutputResponse(BaseModel):
|
107
|
+
chunks: List[Chunk] = []
|
108
|
+
extracted_json: Optional[ExtractedJson]
|
109
|
+
|
110
|
+
class Model(str, Enum):
|
111
|
+
FAST = "Fast"
|
112
|
+
HIGH_QUALITY = "HighQuality"
|
113
|
+
|
114
|
+
class Configuration(BaseModel):
    """Processing options for an upload; every field is optional.

    The deprecated ``target_chunk_length`` input key is still accepted and is
    mapped onto ``chunk_processing.target_length`` before validation.
    """

    chunk_processing: Optional[ChunkProcessing] = Field(default=None)
    expires_in: Optional[int] = Field(default=None)
    high_resolution: Optional[bool] = Field(default=None)
    json_schema: Optional[JsonSchema] = Field(default=None)
    model: Optional[Model] = Field(default=None)
    ocr_strategy: Optional[OcrStrategy] = Field(default=None)
    segment_processing: Optional[SegmentProcessing] = Field(default=None)
    segmentation_strategy: Optional[SegmentationStrategy] = Field(default=None)

    @model_validator(mode='before')
    @classmethod
    def map_deprecated_fields(cls, values: Dict) -> Dict:
        """Translate ``target_chunk_length`` into ``chunk_processing.target_length``.

        Input that is not a dict, or that lacks the deprecated key, is
        returned untouched. A non-None deprecated value overrides any
        ``target_length`` already present in ``chunk_processing``.
        """
        if not isinstance(values, dict) or "target_chunk_length" not in values:
            return values

        # Work on a copy so the caller's mapping is never mutated.
        values = dict(values)
        target_length = values.pop("target_chunk_length")
        if target_length is not None:
            chunk_processing = values.get("chunk_processing")
            if isinstance(chunk_processing, ChunkProcessing):
                # An already-built model has no item assignment; use its dict form.
                chunk_processing = chunk_processing.model_dump()
            else:
                chunk_processing = dict(chunk_processing or {})
            chunk_processing["target_length"] = target_length
            values["chunk_processing"] = chunk_processing
        return values
|
@@ -0,0 +1,19 @@
|
|
1
|
+
from typing import runtime_checkable, Protocol
|
2
|
+
from requests import Session
|
3
|
+
from httpx import AsyncClient
|
4
|
+
|
5
|
+
@runtime_checkable
|
6
|
+
class ChunkrClientProtocol(Protocol):
|
7
|
+
"""Protocol defining the interface for Chunkr clients"""
|
8
|
+
url: str
|
9
|
+
_api_key: str
|
10
|
+
_session: Session
|
11
|
+
_client: AsyncClient
|
12
|
+
|
13
|
+
def get_api_key(self) -> str:
|
14
|
+
"""Get the API key"""
|
15
|
+
...
|
16
|
+
|
17
|
+
def _headers(self) -> dict:
|
18
|
+
"""Return headers required for API requests"""
|
19
|
+
...
|
chunkr_ai/api/task.py
ADDED
@@ -0,0 +1,131 @@
|
|
1
|
+
from .protocol import ChunkrClientProtocol
|
2
|
+
from .config import Configuration, OutputResponse
|
3
|
+
import asyncio
|
4
|
+
from datetime import datetime
|
5
|
+
from enum import Enum
|
6
|
+
from pydantic import BaseModel, PrivateAttr
|
7
|
+
import time
|
8
|
+
from typing import Optional, Union
|
9
|
+
|
10
|
+
class Status(str, Enum):
|
11
|
+
STARTING = "Starting"
|
12
|
+
PROCESSING = "Processing"
|
13
|
+
SUCCEEDED = "Succeeded"
|
14
|
+
FAILED = "Failed"
|
15
|
+
|
16
|
+
class TaskResponse(BaseModel):
|
17
|
+
configuration: Configuration
|
18
|
+
created_at: datetime
|
19
|
+
expires_at: Optional[datetime]
|
20
|
+
file_name: Optional[str]
|
21
|
+
finished_at: Optional[datetime]
|
22
|
+
input_file_url: Optional[str]
|
23
|
+
message: str
|
24
|
+
output: Optional[OutputResponse]
|
25
|
+
page_count: Optional[int]
|
26
|
+
pdf_url: Optional[str]
|
27
|
+
status: Status
|
28
|
+
task_id: str
|
29
|
+
task_url: Optional[str]
|
30
|
+
_client: Optional[Union[ChunkrClientProtocol]] = PrivateAttr(default=None)
|
31
|
+
|
32
|
+
def with_client(self, client: Union[ChunkrClientProtocol]) -> 'TaskResponse':
    """Attach the client used for later polling and return this task.

    Args:
        client: The client whose HTTP session/headers the poll helpers use.

    Returns:
        TaskResponse: ``self``, so the call can be chained.
    """
    self._client = client
    return self
|
35
|
+
|
36
|
+
def _poll_request_sync(self) -> dict:
    """Helper method to make polling request with retry logic (synchronous)

    Returns:
        dict: The decoded JSON body of the task endpoint.

    Raises:
        ValueError: If this task has no ``task_url``.
        Exception: Any non-transient failure (including HTTP error statuses
            from ``raise_for_status``) is re-raised unchanged.
    """
    if not self.task_url:
        raise ValueError("Task URL not found in response")

    while True:
        try:
            r = self._client._session.get(self.task_url, headers=self._client._headers())
            r.raise_for_status()
            return r.json()
        except Exception as exc:
            # Imported here because it is only needed to classify a failure.
            import requests

            # requests' own ConnectionError/Timeout do not derive from the
            # builtin classes, so matching the builtins alone never retried.
            transient = (
                requests.exceptions.ConnectionError,
                requests.exceptions.Timeout,
                ConnectionError,
                TimeoutError,
            )
            if not isinstance(exc, transient):
                raise
            print("Connection error while polling the task, retrying...")
            time.sleep(0.5)
|
51
|
+
|
52
|
+
async def _poll_request_async(self) -> dict:
    """Helper method to make polling request with retry logic (asynchronous)

    Returns:
        dict: The decoded JSON body of the task endpoint.

    Raises:
        ValueError: If this task has no ``task_url``.
        Exception: Any non-transient failure (including HTTP error statuses
            from ``raise_for_status``) is re-raised unchanged.
    """
    if not self.task_url:
        raise ValueError("Task URL not found in response")

    while True:
        try:
            r = await self._client._client.get(self.task_url, headers=self._client._headers())
            # Only the request itself is awaitable; raise_for_status() and
            # json() are plain synchronous methods on the httpx response.
            r.raise_for_status()
            return r.json()
        except Exception as exc:
            # Imported here because it is only needed to classify a failure.
            import httpx

            # httpx network/timeout errors do not derive from the builtin
            # classes, so matching the builtins alone never retried.
            transient = (
                httpx.NetworkError,
                httpx.TimeoutException,
                ConnectionError,
                TimeoutError,
            )
            if not isinstance(exc, transient):
                raise
            print("Connection error while polling the task, retrying...")
            await asyncio.sleep(0.5)
|
67
|
+
|
68
|
+
def _check_status(self) -> Optional['TaskResponse']:
    """Helper method to check task status and handle completion/failure

    Returns:
        ``self`` once the status is neither "Starting" nor "Processing",
        otherwise ``None`` (the task is still running).

    Raises:
        ValueError: If the status is "Failed"; carries the task message.
    """
    if self.status == "Failed":
        raise ValueError(self.message)
    if self.status in ("Starting", "Processing"):
        return None
    return self
|
75
|
+
|
76
|
+
def poll(self) -> 'TaskResponse':
    """Poll the task for completion.

    Each response refreshes this object in place. Polling repeats every
    0.5 seconds until ``_check_status`` reports a final state.

    Returns:
        TaskResponse: This task, updated with the final response.
    """
    while True:
        response = self._poll_request_sync()
        # Build a fresh instance so the payload is validated, then copy its
        # fields over. Updating __dict__ with the raw JSON left nested values
        # such as ``output`` as plain dicts, which broke the content helpers.
        refreshed = type(self)(**response)
        self.__dict__.update(refreshed.__dict__)

        if result := self._check_status():
            return result

        time.sleep(0.5)
|
86
|
+
|
87
|
+
async def poll_async(self) -> 'TaskResponse':
    """Poll the task for completion asynchronously.

    Each response refreshes this object in place. Polling repeats every
    0.5 seconds until ``_check_status`` reports a final state.

    Returns:
        TaskResponse: This task, updated with the final response.
    """
    while True:
        response = await self._poll_request_async()
        # Build a fresh instance so the payload is validated, then copy its
        # fields over. Updating __dict__ with the raw JSON left nested values
        # such as ``output`` as plain dicts, which broke the content helpers.
        refreshed = type(self)(**response)
        self.__dict__.update(refreshed.__dict__)

        if result := self._check_status():
            return result

        await asyncio.sleep(0.5)
|
97
|
+
|
98
|
+
def _get_content(self, content_type: str) -> str:
    """Helper method to get either HTML, Markdown, or raw content.

    Args:
        content_type: Name of the segment attribute to read
            ("html", "markdown" or "content").

    Returns:
        str: The non-empty values of that attribute across all segments of
        all chunks, joined with newlines; "" when there is no output.
    """
    if not self.output:
        return ""
    parts = []
    for chunk in self.output.chunks:
        for segment in chunk.segments:
            text = getattr(segment, content_type)
            if text:  # skip None and empty strings
                parts.append(text)
    return "\n".join(parts)
|
109
|
+
|
110
|
+
def html(self) -> str:
    """Get full HTML for the task"""
    return self._get_content("html")
|
113
|
+
|
114
|
+
def markdown(self) -> str:
    """Get full markdown for the task"""
    return self._get_content("markdown")
|
117
|
+
|
118
|
+
def content(self) -> str:
    """Get full text for the task"""
    return self._get_content("content")
|
121
|
+
|
122
|
+
class TaskPayload(BaseModel):
|
123
|
+
current_configuration: Configuration
|
124
|
+
file_name: str
|
125
|
+
image_folder_location: str
|
126
|
+
input_location: str
|
127
|
+
output_location: str
|
128
|
+
pdf_location: str
|
129
|
+
previous_configuration: Optional[Configuration]
|
130
|
+
task_id: str
|
131
|
+
user_id: str
|
chunkr_ai/models.py
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
from .api.config import (
|
2
|
+
BoundingBox,
|
3
|
+
Chunk,
|
4
|
+
ChunkProcessing,
|
5
|
+
Configuration,
|
6
|
+
CroppingStrategy,
|
7
|
+
ExtractedJson,
|
8
|
+
GenerationStrategy,
|
9
|
+
GenerationConfig,
|
10
|
+
JsonSchema,
|
11
|
+
LlmConfig,
|
12
|
+
Model,
|
13
|
+
OCRResult,
|
14
|
+
OcrStrategy,
|
15
|
+
OutputResponse,
|
16
|
+
Property,
|
17
|
+
Segment,
|
18
|
+
SegmentProcessing,
|
19
|
+
SegmentType,
|
20
|
+
SegmentationStrategy,
|
21
|
+
)
|
22
|
+
|
23
|
+
from .api.task import TaskResponse, TaskPayload, Status
|
24
|
+
|
25
|
+
__all__ = [
|
26
|
+
'BoundingBox',
|
27
|
+
'Chunk',
|
28
|
+
'ChunkProcessing',
|
29
|
+
'Configuration',
|
30
|
+
'CroppingStrategy',
|
31
|
+
'ExtractedJson',
|
32
|
+
'GenerationConfig',
|
33
|
+
'GenerationStrategy',
|
34
|
+
'JsonSchema',
|
35
|
+
'LlmConfig',
|
36
|
+
'Model',
|
37
|
+
'OCRResult',
|
38
|
+
'OcrStrategy',
|
39
|
+
'OutputResponse',
|
40
|
+
'Property',
|
41
|
+
'Segment',
|
42
|
+
'SegmentProcessing',
|
43
|
+
'SegmentType',
|
44
|
+
'SegmentationStrategy',
|
45
|
+
'Status',
|
46
|
+
'TaskPayload',
|
47
|
+
'TaskResponse'
|
48
|
+
]
|
@@ -0,0 +1,204 @@
|
|
1
|
+
Metadata-Version: 2.2
|
2
|
+
Name: chunkr-ai
|
3
|
+
Version: 0.0.4
|
4
|
+
Summary: Python client for Chunkr: open source document intelligence
|
5
|
+
Author-email: Ishaan Kapoor <ishaan@lumina.sh>
|
6
|
+
Project-URL: Homepage, https://chunkr.ai
|
7
|
+
Description-Content-Type: text/markdown
|
8
|
+
License-File: LICENSE
|
9
|
+
Requires-Dist: httpx>=0.28.1
|
10
|
+
Requires-Dist: pillow>=11.1.0
|
11
|
+
Requires-Dist: pydantic>=2.10.4
|
12
|
+
Requires-Dist: python-dotenv>=1.0.1
|
13
|
+
Requires-Dist: requests>=2.32.3
|
14
|
+
Provides-Extra: test
|
15
|
+
Requires-Dist: pytest>=8.3.4; extra == "test"
|
16
|
+
Requires-Dist: pytest-xdist>=3.6.1; extra == "test"
|
17
|
+
|
18
|
+
# Chunkr Python Client
|
19
|
+
|
20
|
+
This provides a simple interface to interact with the Chunkr API.
|
21
|
+
|
22
|
+
## Getting Started
|
23
|
+
|
24
|
+
You can get an API key from [Chunkr](https://chunkr.ai) or deploy your own Chunkr instance. For self-hosted deployment options, check out our [deployment guide](https://github.com/lumina-ai-inc/chunkr/tree/main?tab=readme-ov-file#self-hosted-deployment-options).
|
25
|
+
|
26
|
+
For more information about the API and its capabilities, visit the [Chunkr API docs](https://docs.chunkr.ai).
|
27
|
+
|
28
|
+
## Installation
|
29
|
+
|
30
|
+
```bash
|
31
|
+
pip install chunkr-ai
|
32
|
+
```
|
33
|
+
|
34
|
+
## Usage
|
35
|
+
|
36
|
+
We provide two clients: `Chunkr` for synchronous operations and `ChunkrAsync` for asynchronous operations.
|
37
|
+
|
38
|
+
### Synchronous Usage
|
39
|
+
|
40
|
+
```python
|
41
|
+
from chunkr_ai import Chunkr
|
42
|
+
|
43
|
+
# Initialize client
|
44
|
+
chunkr = Chunkr()
|
45
|
+
|
46
|
+
# Upload a file and wait for processing
|
47
|
+
task = chunkr.upload("document.pdf")
|
48
|
+
|
49
|
+
# Print the response
|
50
|
+
print(task)
|
51
|
+
|
52
|
+
# Get output from task
|
53
|
+
output = task.output
|
54
|
+
|
55
|
+
# If you want to upload without waiting for processing
|
56
|
+
task = chunkr.start_upload("document.pdf")
|
57
|
+
# ... do other things ...
|
58
|
+
task.poll() # Check status when needed
|
59
|
+
```
|
60
|
+
|
61
|
+
### Asynchronous Usage
|
62
|
+
|
63
|
+
```python
|
64
|
+
from chunkr_ai import ChunkrAsync
|
65
|
+
|
66
|
+
async def process_document():
|
67
|
+
# Initialize client
|
68
|
+
chunkr = ChunkrAsync()
|
69
|
+
|
70
|
+
# Upload a file and wait for processing
|
71
|
+
task = await chunkr.upload("document.pdf")
|
72
|
+
|
73
|
+
# Print the response
|
74
|
+
print(task)
|
75
|
+
|
76
|
+
# Get output from task
|
77
|
+
output = task.output
|
78
|
+
|
79
|
+
# If you want to upload without waiting for processing
|
80
|
+
task = await chunkr.start_upload("document.pdf")
|
81
|
+
# ... do other things ...
|
82
|
+
await task.poll_async() # Check status when needed
|
83
|
+
```
|
84
|
+
|
85
|
+
### Additional Features
|
86
|
+
|
87
|
+
Both clients support various input types:
|
88
|
+
|
89
|
+
```python
|
90
|
+
# Upload from file path
|
91
|
+
chunkr.upload("document.pdf")
|
92
|
+
|
93
|
+
# Upload from opened file
|
94
|
+
with open("document.pdf", "rb") as f:
|
95
|
+
chunkr.upload(f)
|
96
|
+
|
97
|
+
# Upload from URL
|
98
|
+
chunkr.upload("https://example.com/document.pdf")
|
99
|
+
|
100
|
+
# Upload from base64 string
|
101
|
+
chunkr.upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
|
102
|
+
|
103
|
+
# Upload an image
|
104
|
+
from PIL import Image
|
105
|
+
img = Image.open("photo.jpg")
|
106
|
+
chunkr.upload(img)
|
107
|
+
```
|
108
|
+
|
109
|
+
### Configuration
|
110
|
+
|
111
|
+
You can customize the processing behavior by passing a `Configuration` object:
|
112
|
+
|
113
|
+
```python
|
114
|
+
from chunkr_ai.models import Configuration, OcrStrategy, SegmentationStrategy, GenerationStrategy
|
115
|
+
|
116
|
+
# Basic configuration
|
117
|
+
config = Configuration(
|
118
|
+
ocr_strategy=OcrStrategy.AUTO,
|
119
|
+
segmentation_strategy=SegmentationStrategy.LAYOUT_ANALYSIS,
|
120
|
+
high_resolution=True,
|
121
|
+
expires_in=3600, # seconds
|
122
|
+
)
|
123
|
+
|
124
|
+
# Upload with configuration
|
125
|
+
task = chunkr.upload("document.pdf", config)
|
126
|
+
```
|
127
|
+
|
128
|
+
#### Available Configuration Examples
|
129
|
+
|
130
|
+
- **Chunk Processing**
|
131
|
+
```python
|
132
|
+
from chunkr_ai.models import ChunkProcessing
|
133
|
+
config = Configuration(
|
134
|
+
chunk_processing=ChunkProcessing(target_length=1024)
|
135
|
+
)
|
136
|
+
```
|
137
|
+
- **Expires In**
|
138
|
+
```python
|
139
|
+
config = Configuration(expires_in=3600)
|
140
|
+
```
|
141
|
+
|
142
|
+
- **High Resolution**
|
143
|
+
```python
|
144
|
+
config = Configuration(high_resolution=True)
|
145
|
+
```
|
146
|
+
|
147
|
+
- **JSON Schema**
|
148
|
+
```python
|
149
|
+
config = Configuration(json_schema=JsonSchema(
|
150
|
+
title="Sales Data",
|
151
|
+
properties=[
|
152
|
+
Property(name="Person with highest sales", prop_type="string", description="The person with the highest sales"),
|
153
|
+
Property(name="Person with lowest sales", prop_type="string", description="The person with the lowest sales"),
|
154
|
+
]
|
155
|
+
))
|
156
|
+
```
|
157
|
+
|
158
|
+
- **OCR Strategy**
|
159
|
+
```python
|
160
|
+
config = Configuration(ocr_strategy=OcrStrategy.AUTO)
|
161
|
+
```
|
162
|
+
|
163
|
+
- **Segment Processing**
|
164
|
+
```python
|
165
|
+
from chunkr_ai.models import SegmentProcessing, GenerationConfig, GenerationStrategy
|
166
|
+
config = Configuration(
|
167
|
+
segment_processing=SegmentProcessing(
|
168
|
+
page=GenerationConfig(
|
169
|
+
html=GenerationStrategy.LLM,
|
170
|
+
markdown=GenerationStrategy.LLM
|
171
|
+
)
|
172
|
+
)
|
173
|
+
)
|
174
|
+
```
|
175
|
+
|
176
|
+
- **Segmentation Strategy**
|
177
|
+
```python
|
178
|
+
config = Configuration(
|
179
|
+
segmentation_strategy=SegmentationStrategy.LAYOUT_ANALYSIS # or SegmentationStrategy.PAGE
|
180
|
+
)
|
181
|
+
```
|
182
|
+
|
183
|
+
## Environment setup
|
184
|
+
|
185
|
+
You can provide your API key and URL in several ways:
|
186
|
+
1. Environment variables: `CHUNKR_API_KEY` and `CHUNKR_URL`
|
187
|
+
2. `.env` file
|
188
|
+
3. Direct initialization:
|
189
|
+
```python
|
190
|
+
chunkr = Chunkr(
|
191
|
+
api_key="your-api-key",
|
192
|
+
url="https://api.chunkr.ai"
|
193
|
+
)
|
194
|
+
```
|
195
|
+
|
196
|
+
## Run tests
|
197
|
+
|
198
|
+
```bash
|
199
|
+
# Install dependencies
|
200
|
+
uv pip install -e ".[test]"
|
201
|
+
|
202
|
+
# Run tests
|
203
|
+
uv run pytest
|
204
|
+
```
|
@@ -0,0 +1,17 @@
|
|
1
|
+
chunkr_ai/__init__.py,sha256=eXygrEhGxxIHXNYIlHF2eied8rGsx2RphgR8Wo4lRyo,110
|
2
|
+
chunkr_ai/main.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
+
chunkr_ai/models.py,sha256=d-B4vfgZClJOoHdPaH3vagwUc4qxeQSmUxab77DKYtQ,874
|
4
|
+
chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
+
chunkr_ai/api/api.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
+
chunkr_ai/api/auth.py,sha256=iSd5Jek2BFaHGw9HY-RrqgwP56BHFU0xbSuJS4fU6AA,425
|
7
|
+
chunkr_ai/api/base.py,sha256=WDHx8tU0fl9_-yvYTKL-U0uaxHv-8_bRfiw9Xkl-mWM,6499
|
8
|
+
chunkr_ai/api/chunkr.py,sha256=LkBFzGB_T0y3fnBeIn_nwQW6Mb7eZO-iTlzWrmWBoko,3450
|
9
|
+
chunkr_ai/api/chunkr_async.py,sha256=B9deRVoe4h3Csh_jEuQxuxQ-DKSuZPdwkanFTyfHmeM,3603
|
10
|
+
chunkr_ai/api/config.py,sha256=K0s1giImciPksu-bO9gzRwUaK2Vo1nxNKQkXlRQ2cb8,3785
|
11
|
+
chunkr_ai/api/protocol.py,sha256=XKS9RmtvBpJItYhPg18qlOCKpaSHdOuQTRSUxAdUz2g,479
|
12
|
+
chunkr_ai/api/task.py,sha256=ALU-rYlObbitlM1MKEFeSz_IBUpzb9736Iqu9huWg7c,4392
|
13
|
+
chunkr_ai-0.0.4.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
14
|
+
chunkr_ai-0.0.4.dist-info/METADATA,sha256=7k2zij-F7_Kcs6nFCJMKQW382gFpOOLAnZoOOXFrKFs,4913
|
15
|
+
chunkr_ai-0.0.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
16
|
+
chunkr_ai-0.0.4.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
|
17
|
+
chunkr_ai-0.0.4.dist-info/RECORD,,
|
chunkr_ai/api/models.py
DELETED
@@ -1,231 +0,0 @@
|
|
1
|
-
from .auth import HeadersMixin
|
2
|
-
import asyncio
|
3
|
-
from datetime import datetime
|
4
|
-
from enum import Enum
|
5
|
-
import httpx
|
6
|
-
from pydantic import BaseModel, Field, PrivateAttr
|
7
|
-
import requests
|
8
|
-
import time
|
9
|
-
from typing import Optional, List, Dict, Union
|
10
|
-
|
11
|
-
class GenerationStrategy(str, Enum):
|
12
|
-
LLM = "LLM"
|
13
|
-
AUTO = "Auto"
|
14
|
-
|
15
|
-
class CroppingStrategy(str, Enum):
|
16
|
-
ALL = "All"
|
17
|
-
AUTO = "Auto"
|
18
|
-
|
19
|
-
class LlmConfig(BaseModel):
|
20
|
-
model: str
|
21
|
-
prompt: str
|
22
|
-
temperature: float = 0.0
|
23
|
-
|
24
|
-
class AutoGenerationConfig(BaseModel):
|
25
|
-
html: GenerationStrategy = GenerationStrategy.AUTO
|
26
|
-
llm: Optional[LlmConfig] = None
|
27
|
-
markdown: GenerationStrategy = GenerationStrategy.AUTO
|
28
|
-
crop_image: CroppingStrategy = CroppingStrategy.ALL
|
29
|
-
|
30
|
-
class LlmGenerationConfig(BaseModel):
|
31
|
-
html: GenerationStrategy = GenerationStrategy.LLM
|
32
|
-
llm: Optional[LlmConfig] = None
|
33
|
-
markdown: GenerationStrategy = GenerationStrategy.LLM
|
34
|
-
crop_image: CroppingStrategy = CroppingStrategy.ALL
|
35
|
-
|
36
|
-
class SegmentProcessing(BaseModel):
|
37
|
-
title: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
|
38
|
-
section_header: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
|
39
|
-
text: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
|
40
|
-
list_item: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
|
41
|
-
table: LlmGenerationConfig = Field(default_factory=LlmGenerationConfig)
|
42
|
-
picture: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
|
43
|
-
caption: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
|
44
|
-
formula: LlmGenerationConfig = Field(default_factory=LlmGenerationConfig)
|
45
|
-
footnote: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
|
46
|
-
page_header: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
|
47
|
-
page_footer: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
|
48
|
-
page: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
|
49
|
-
|
50
|
-
class ChunkProcessing(BaseModel):
|
51
|
-
target_length: int = 512
|
52
|
-
|
53
|
-
class Property(BaseModel):
|
54
|
-
name: str
|
55
|
-
title: Optional[str]
|
56
|
-
prop_type: str
|
57
|
-
description: Optional[str]
|
58
|
-
default: Optional[str]
|
59
|
-
|
60
|
-
class JsonSchema(BaseModel):
|
61
|
-
title: str
|
62
|
-
properties: List[Property]
|
63
|
-
schema_type: Optional[str]
|
64
|
-
|
65
|
-
class OcrStrategy(str, Enum):
|
66
|
-
ALL = "All"
|
67
|
-
AUTO = "Auto"
|
68
|
-
|
69
|
-
class SegmentationStrategy(str, Enum):
|
70
|
-
LAYOUT_ANALYSIS = "LayoutAnalysis"
|
71
|
-
PAGE = "Page"
|
72
|
-
|
73
|
-
class BoundingBox(BaseModel):
|
74
|
-
left: float
|
75
|
-
top: float
|
76
|
-
width: float
|
77
|
-
height: float
|
78
|
-
|
79
|
-
class OCRResult(BaseModel):
|
80
|
-
bbox: BoundingBox
|
81
|
-
text: str
|
82
|
-
confidence: Optional[float]
|
83
|
-
|
84
|
-
class SegmentType(str, Enum):
|
85
|
-
CAPTION = "Caption"
|
86
|
-
FOOTNOTE = "Footnote"
|
87
|
-
FORMULA = "Formula"
|
88
|
-
LIST_ITEM = "ListItem"
|
89
|
-
PAGE = "Page"
|
90
|
-
PAGE_FOOTER = "PageFooter"
|
91
|
-
PAGE_HEADER = "PageHeader"
|
92
|
-
PICTURE = "Picture"
|
93
|
-
SECTION_HEADER = "SectionHeader"
|
94
|
-
TABLE = "Table"
|
95
|
-
TEXT = "Text"
|
96
|
-
TITLE = "Title"
|
97
|
-
|
98
|
-
class Segment(BaseModel):
|
99
|
-
bbox: BoundingBox
|
100
|
-
content: str
|
101
|
-
page_height: float
|
102
|
-
html: Optional[str]
|
103
|
-
image: Optional[str]
|
104
|
-
markdown: Optional[str]
|
105
|
-
ocr: List[OCRResult]
|
106
|
-
page_number: int
|
107
|
-
page_width: float
|
108
|
-
segment_id: str
|
109
|
-
segment_type: SegmentType
|
110
|
-
|
111
|
-
class Chunk(BaseModel):
|
112
|
-
chunk_id: str
|
113
|
-
chunk_length: int
|
114
|
-
segments: List[Segment]
|
115
|
-
|
116
|
-
class ExtractedJson(BaseModel):
|
117
|
-
data: Dict
|
118
|
-
|
119
|
-
class OutputResponse(BaseModel):
|
120
|
-
chunks: List[Chunk] = []
|
121
|
-
extracted_json: Optional[ExtractedJson]
|
122
|
-
|
123
|
-
class Model(str, Enum):
|
124
|
-
FAST = "Fast"
|
125
|
-
HIGH_QUALITY = "HighQuality"
|
126
|
-
|
127
|
-
class Configuration(BaseModel):
|
128
|
-
chunk_processing: ChunkProcessing = Field(default_factory=ChunkProcessing)
|
129
|
-
expires_in: Optional[int] = None
|
130
|
-
high_resolution: bool = False
|
131
|
-
json_schema: Optional[JsonSchema] = None
|
132
|
-
model: Optional[Model] = Field(None, deprecated=True)
|
133
|
-
ocr_strategy: OcrStrategy = OcrStrategy.AUTO
|
134
|
-
segment_processing: SegmentProcessing = Field(default_factory=SegmentProcessing)
|
135
|
-
segmentation_strategy: SegmentationStrategy = SegmentationStrategy.LAYOUT_ANALYSIS
|
136
|
-
target_chunk_length: Optional[int] = Field(None, deprecated=True)
|
137
|
-
|
138
|
-
|
139
|
-
class Status(str, Enum):
|
140
|
-
STARTING = "Starting"
|
141
|
-
PROCESSING = "Processing"
|
142
|
-
SUCCEEDED = "Succeeded"
|
143
|
-
FAILED = "Failed"
|
144
|
-
|
145
|
-
class TaskResponse(BaseModel, HeadersMixin):
|
146
|
-
configuration: Configuration
|
147
|
-
created_at: datetime
|
148
|
-
expires_at: Optional[datetime]
|
149
|
-
file_name: Optional[str]
|
150
|
-
finished_at: Optional[datetime]
|
151
|
-
input_file_url: Optional[str]
|
152
|
-
message: str
|
153
|
-
output: Optional[OutputResponse]
|
154
|
-
page_count: Optional[int]
|
155
|
-
pdf_url: Optional[str]
|
156
|
-
status: Status
|
157
|
-
task_id: str
|
158
|
-
task_url: Optional[str]
|
159
|
-
_api_key: Optional[str] = PrivateAttr(default=None)
|
160
|
-
|
161
|
-
def with_api_key(self, api_key: str) -> 'TaskResponse':
|
162
|
-
"""Helper function to set api key on a TaskResponse after creation"""
|
163
|
-
self._api_key = api_key
|
164
|
-
return self
|
165
|
-
|
166
|
-
def poll(self) -> 'TaskResponse':
|
167
|
-
"""Poll the task for completion"""
|
168
|
-
if not self.task_url:
|
169
|
-
raise ValueError("Task URL not found in response")
|
170
|
-
|
171
|
-
while True:
|
172
|
-
r = requests.get(self.task_url, headers=self._headers())
|
173
|
-
r.raise_for_status()
|
174
|
-
self.__dict__.update(r.json())
|
175
|
-
if self.status == "Failed":
|
176
|
-
raise ValueError(self.message)
|
177
|
-
if self.status not in ("Starting", "Processing"):
|
178
|
-
return self
|
179
|
-
time.sleep(0.5)
|
180
|
-
|
181
|
-
async def poll_async(self) -> 'TaskResponse':
|
182
|
-
"""Async poll the task for completion"""
|
183
|
-
if not self.task_url:
|
184
|
-
raise ValueError("Task URL not found in response")
|
185
|
-
|
186
|
-
async with httpx.AsyncClient() as client:
|
187
|
-
while True:
|
188
|
-
r = await client.get(self.task_url, headers=self._headers())
|
189
|
-
r.raise_for_status()
|
190
|
-
self.__dict__.update(r.json())
|
191
|
-
if self.status == "Failed":
|
192
|
-
raise ValueError(self.message)
|
193
|
-
if self.status not in ("Starting", "Processing"):
|
194
|
-
return self
|
195
|
-
await asyncio.sleep(0.5)
|
196
|
-
|
197
|
-
|
198
|
-
def _get_content(self, content_type: str) -> str:
|
199
|
-
"""Helper method to get either HTML, Markdown, or raw content."""
|
200
|
-
if not self.output:
|
201
|
-
return ""
|
202
|
-
parts = []
|
203
|
-
for c in self.output.chunks:
|
204
|
-
for s in c.segments:
|
205
|
-
content = getattr(s, content_type)
|
206
|
-
if content:
|
207
|
-
parts.append(content)
|
208
|
-
return "\n".join(parts)
|
209
|
-
|
210
|
-
def html(self) -> str:
|
211
|
-
"""Get full HTML for the task"""
|
212
|
-
return self._get_content("html")
|
213
|
-
|
214
|
-
def markdown(self) -> str:
|
215
|
-
"""Get full markdown for the task"""
|
216
|
-
return self._get_content("markdown")
|
217
|
-
|
218
|
-
def content(self) -> str:
|
219
|
-
"""Get full text for the task"""
|
220
|
-
return self._get_content("content")
|
221
|
-
|
222
|
-
class TaskPayload(BaseModel):
|
223
|
-
current_configuration: Configuration
|
224
|
-
file_name: str
|
225
|
-
image_folder_location: str
|
226
|
-
input_location: str
|
227
|
-
output_location: str
|
228
|
-
pdf_location: str
|
229
|
-
previous_configuration: Optional[Configuration]
|
230
|
-
task_id: str
|
231
|
-
user_id: str
|
@@ -1,16 +0,0 @@
|
|
1
|
-
Metadata-Version: 2.2
|
2
|
-
Name: chunkr-ai
|
3
|
-
Version: 0.0.2
|
4
|
-
Summary: Python client for chunkr: open source document intelligence
|
5
|
-
Author-email: Ishaan Kapoor <ishaan@lumina.sh>
|
6
|
-
Description-Content-Type: text/markdown
|
7
|
-
License-File: LICENSE
|
8
|
-
Requires-Dist: build>=1.2.2.post1
|
9
|
-
Requires-Dist: httpx>=0.28.1
|
10
|
-
Requires-Dist: pillow>=11.1.0
|
11
|
-
Requires-Dist: pydantic>=2.10.4
|
12
|
-
Requires-Dist: python-dotenv>=1.0.1
|
13
|
-
Requires-Dist: requests>=2.32.3
|
14
|
-
Requires-Dist: twine>=6.0.1
|
15
|
-
Provides-Extra: test
|
16
|
-
Requires-Dist: pytest>=8.3.4; extra == "test"
|
chunkr_ai-0.0.2.dist-info/RECORD
DELETED
@@ -1,12 +0,0 @@
|
|
1
|
-
chunkr_ai/__init__.py,sha256=eXygrEhGxxIHXNYIlHF2eied8rGsx2RphgR8Wo4lRyo,110
|
2
|
-
chunkr_ai/main.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
-
chunkr_ai/api/api.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
|
-
chunkr_ai/api/auth.py,sha256=U25WiNQBsrAWYAntuds0zSMvB4gUpAwGoSa5wnQ2LRQ,454
|
5
|
-
chunkr_ai/api/chunkr.py,sha256=UqFoK8ytCsW1I5F0nM4OD6I4zigy-UHzGuMDtpvMSmE,4454
|
6
|
-
chunkr_ai/api/chunkr_async.py,sha256=Kfh7_DEon6QTPe-XJops8l9R6rp0zIfJKeh9ZEGFQao,1529
|
7
|
-
chunkr_ai/api/models.py,sha256=vAVeRHgdSO4SDl009R2Vz75WtuXAwkUZW8ZsVXk9yBA,7221
|
8
|
-
chunkr_ai-0.0.2.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
9
|
-
chunkr_ai-0.0.2.dist-info/METADATA,sha256=ZK6gdzkukxMEVr1WxodLZ9dZNHar32C00ST1LG9mFl8,519
|
10
|
-
chunkr_ai-0.0.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
11
|
-
chunkr_ai-0.0.2.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
|
12
|
-
chunkr_ai-0.0.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|