chunkr-ai 0.0.2__py3-none-any.whl → 0.0.4__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- chunkr_ai/api/__init__.py +0 -0
- chunkr_ai/api/auth.py +0 -2
- chunkr_ai/api/base.py +173 -0
- chunkr_ai/api/chunkr.py +69 -86
- chunkr_ai/api/chunkr_async.py +93 -27
- chunkr_ai/api/config.py +131 -0
- chunkr_ai/api/protocol.py +19 -0
- chunkr_ai/api/task.py +131 -0
- chunkr_ai/models.py +48 -0
- chunkr_ai-0.0.4.dist-info/METADATA +204 -0
- chunkr_ai-0.0.4.dist-info/RECORD +17 -0
- chunkr_ai/api/models.py +0 -231
- chunkr_ai-0.0.2.dist-info/METADATA +0 -16
- chunkr_ai-0.0.2.dist-info/RECORD +0 -12
- {chunkr_ai-0.0.2.dist-info → chunkr_ai-0.0.4.dist-info}/LICENSE +0 -0
- {chunkr_ai-0.0.2.dist-info → chunkr_ai-0.0.4.dist-info}/WHEEL +0 -0
- {chunkr_ai-0.0.2.dist-info → chunkr_ai-0.0.4.dist-info}/top_level.txt +0 -0
File without changes
|
chunkr_ai/api/auth.py
CHANGED
chunkr_ai/api/base.py
ADDED
@@ -0,0 +1,173 @@
|
|
1
|
+
from .config import Configuration
|
2
|
+
from .task import TaskResponse
|
3
|
+
from .auth import HeadersMixin
|
4
|
+
from abc import abstractmethod
|
5
|
+
from dotenv import load_dotenv
|
6
|
+
import io
|
7
|
+
import json
|
8
|
+
import os
|
9
|
+
from pathlib import Path
|
10
|
+
from PIL import Image
|
11
|
+
import requests
|
12
|
+
from typing import BinaryIO, Tuple, Union
|
13
|
+
|
14
|
+
class ChunkrBase(HeadersMixin):
    """Base class with shared functionality for Chunkr API clients.

    Resolves the API URL and key, and converts the supported upload inputs
    into the ``files``/``data`` dictionaries used by the concrete clients.

    NOTE(review): ``@abstractmethod`` is only enforced when the class uses
    ``ABCMeta``; whether ``HeadersMixin`` provides that is not visible here.
    """

    # MIME types accepted in ``data:`` URIs, mapped to the extension used for
    # the generated upload filename.
    _MIME_TO_EXT = {
        'application/pdf': 'pdf',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
        'application/msword': 'doc',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
        'application/vnd.ms-powerpoint': 'ppt',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
        'application/vnd.ms-excel': 'xls',
        'image/jpeg': 'jpg',
        'image/png': 'png',
        'image/jpg': 'jpg',
    }

    def __init__(self, url: str = None, api_key: str = None):
        """Resolve the API URL and key.

        Each value is taken from the argument first, then from the
        environment (``CHUNKR_URL`` / ``CHUNKR_API_KEY``, after loading a
        ``.env`` file); the URL falls back to ``https://api.chunkr.ai``.

        Raises:
            ValueError: If no API key can be found.
        """
        load_dotenv()
        self.url = (
            url or
            os.getenv('CHUNKR_URL') or
            'https://api.chunkr.ai'
        )
        self._api_key = (
            api_key or
            os.getenv('CHUNKR_API_KEY')
        )
        if not self._api_key:
            raise ValueError("API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai")

        # Endpoint paths are appended with a leading slash.
        self.url = self.url.rstrip("/")

    def _prepare_file(
        self,
        file: Union[str, Path, BinaryIO, Image.Image]
    ) -> Tuple[str, BinaryIO]:
        """Convert various file types into a tuple of (filename, file-like object).

        Args:
            file: Input file, can be:
                - String or Path to a file
                - URL string starting with http:// or https://
                - Base64 string
                - Opened binary file (mode='rb')
                - PIL/Pillow Image object

        Returns:
            Tuple[str, BinaryIO]: (filename, file-like object) ready for upload

        Raises:
            FileNotFoundError: If the file path doesn't exist
            TypeError: If the file type is not supported
            ValueError: If the base64 payload cannot be decoded
            ValueError: If the MIME type is unsupported
            requests.exceptions.RequestException: If downloading a URL fails
        """
        # Handle URLs
        if isinstance(file, str) and file.startswith(('http://', 'https://')):
            from urllib.parse import urlsplit

            # NOTE(review): no timeout is passed, so a stalled server blocks
            # this call indefinitely.
            response = requests.get(file)
            response.raise_for_status()
            # Use only the path component so query strings and fragments do
            # not end up in the filename.
            filename = Path(urlsplit(file).path).name or 'downloaded_file'
            return filename, io.BytesIO(response.content)

        # Handle base64 strings ("data:<mime>;base64,<payload>")
        if isinstance(file, str) and ';base64,' in file:
            import base64

            header, base64_data = file.split(',', 1)
            mime_type = header.split(':')[-1].split(';')[0].lower()
            extension = self._MIME_TO_EXT.get(mime_type)
            if extension is None:
                raise ValueError(f"Unsupported MIME type: {mime_type}")
            try:
                file_bytes = base64.b64decode(base64_data)
            except ValueError as e:  # binascii.Error is a ValueError subclass
                raise ValueError(f"Invalid base64 string: {str(e)}") from e
            return f"file.{extension}", io.BytesIO(file_bytes)

        # Handle file paths
        if isinstance(file, (str, Path)):
            path = Path(file).resolve()
            if not path.exists():
                raise FileNotFoundError(f"File not found: {file}")
            # The caller of this helper owns the returned handle.
            return path.name, open(path, 'rb')

        # Handle PIL Images
        if isinstance(file, Image.Image):
            img_byte_arr = io.BytesIO()
            image_format = file.format or 'PNG'
            file.save(img_byte_arr, format=image_format)
            img_byte_arr.seek(0)
            return f"image.{image_format.lower()}", img_byte_arr

        # Handle file-like objects
        if hasattr(file, 'read') and hasattr(file, 'seek'):
            # ``name`` may be missing, or a non-path value such as an integer
            # file descriptor; fall back to a generic name in those cases.
            name = getattr(file, 'name', None)
            if not name or not isinstance(name, (str, os.PathLike)):
                name = 'document'
            return Path(name).name or 'document', file

        raise TypeError(f"Unsupported file type: {type(file)}")

    def _prepare_upload_data(
        self,
        file: Union[str, Path, BinaryIO, Image.Image],
        config: Configuration = None
    ) -> Tuple[dict, dict]:
        """Prepare files and data dictionaries for upload.

        Args:
            file: The file to upload
            config: Optional configuration settings

        Returns:
            Tuple[dict, dict]: (files dict, data dict) ready for upload
        """
        filename, file_obj = self._prepare_file(file)
        files = {"file": (filename, file_obj)}
        data = {}

        if config:
            config_dict = config.model_dump(mode="json", exclude_none=True)
            for key, value in config_dict.items():
                if isinstance(value, dict):
                    # Nested settings are sent as JSON-encoded multipart parts.
                    files[key] = (None, json.dumps(value), 'application/json')
                else:
                    data[key] = value

        return files, data

    @abstractmethod
    def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
        """Upload a file and wait for processing to complete.

        Must be implemented by subclasses.
        """
        pass

    @abstractmethod
    def start_upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
        """Upload a file for processing and immediately return the task response.

        Must be implemented by subclasses.
        """
        pass

    @abstractmethod
    def get_task(self, task_id: str) -> TaskResponse:
        """Get a task response by its ID.

        Must be implemented by subclasses.
        """
        pass
|
chunkr_ai/api/chunkr.py
CHANGED
@@ -1,125 +1,108 @@
|
|
1
|
-
from .
|
2
|
-
from .
|
3
|
-
from
|
4
|
-
import io
|
5
|
-
import os
|
1
|
+
from .base import ChunkrBase
|
2
|
+
from .config import Configuration
|
3
|
+
from .task import TaskResponse
|
6
4
|
from pathlib import Path
|
7
5
|
from PIL import Image
|
8
6
|
import requests
|
9
|
-
from typing import Union, BinaryIO
|
7
|
+
from typing import Union, BinaryIO
|
10
8
|
|
11
|
-
class Chunkr(
|
12
|
-
"""
|
9
|
+
class Chunkr(ChunkrBase):
    """Chunkr API client"""

    def __init__(self, url: str = None, api_key: str = None):
        super().__init__(url, api_key)
        self._session = requests.Session()

    def close(self) -> None:
        """Close the underlying HTTP session."""
        self._session.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
        """Upload a file and wait for processing to complete.

        Args:
            file: The file to upload.
            config: Configuration options for processing. Optional.

        Examples:
        ```
        # Upload from file path
        chunkr.upload("document.pdf")

        # Upload from URL
        chunkr.upload("https://example.com/document.pdf")

        # Upload from base64 string (must include MIME type header)
        chunkr.upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")

        # Upload from opened file
        with open("document.pdf", "rb") as f:
            chunkr.upload(f)

        # Upload an image
        from PIL import Image
        img = Image.open("photo.jpg")
        chunkr.upload(img)
        ```
        Returns:
            TaskResponse: The completed task response
        """
        task = self.start_upload(file, config)
        return task.poll()

    def start_upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
        """Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`

        Args:
            file: The file to upload.
            config: Configuration options for processing. Optional.

        Examples:
        ```
        # Upload from file path
        task = chunkr.start_upload("document.pdf")

        # Upload from opened file
        with open("document.pdf", "rb") as f:
            task = chunkr.start_upload(f)

        # Upload from URL
        task = chunkr.start_upload("https://example.com/document.pdf")

        # Upload from base64 string (must include MIME type header)
        task = chunkr.start_upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")

        # Upload an image
        from PIL import Image
        img = Image.open("photo.jpg")
        task = chunkr.start_upload(img)

        # Wait for the task to complete - this can be done when needed
        task.poll()
        ```

        Returns:
            TaskResponse: The initial task response

        Raises:
            requests.exceptions.HTTPError: If the API responds with an error status
        """
        files, data = self._prepare_upload_data(file, config)
        upload_obj = files["file"][1]
        try:
            r = self._session.post(
                f"{self.url}/api/v1/task",
                files=files,
                data=data,
                headers=self._headers()
            )
        finally:
            # Close handles/buffers created while preparing the upload; a
            # file object supplied by the caller stays open.
            if upload_obj is not file:
                upload_obj.close()
        r.raise_for_status()
        return TaskResponse(**r.json()).with_client(self)

    def get_task(self, task_id: str) -> TaskResponse:
        """Get a task response by its ID.

        Args:
            task_id: The ID of the task to get

        Returns:
            TaskResponse: The task response
        """
        r = self._session.get(
            f"{self.url}/api/v1/task/{task_id}",
            headers=self._headers()
        )
        r.raise_for_status()
        return TaskResponse(**r.json()).with_client(self)
|
125
108
|
|
chunkr_ai/api/chunkr_async.py
CHANGED
@@ -1,39 +1,105 @@
|
|
1
|
-
from .
|
2
|
-
from .
|
1
|
+
from .base import ChunkrBase
|
2
|
+
from .task import TaskResponse
|
3
|
+
from .config import Configuration
|
3
4
|
import httpx
|
4
|
-
import
|
5
|
+
from pathlib import Path
|
5
6
|
from PIL import Image
|
6
7
|
from typing import Union, BinaryIO
|
7
8
|
|
8
|
-
class ChunkrAsync(
|
9
|
-
"""
|
9
|
+
class ChunkrAsync(ChunkrBase):
    """Asynchronous Chunkr API client"""

    def __init__(self, url: str = None, api_key: str = None):
        super().__init__(url, api_key)
        self._client = httpx.AsyncClient()

    async def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
        """Upload a file and wait for processing to complete.

        Args:
            file: The file to upload.
            config: Configuration options for processing. Optional.

        Examples:
        ```python
        # Upload from file path
        await chunkr.upload("document.pdf")

        # Upload from opened file
        with open("document.pdf", "rb") as f:
            await chunkr.upload(f)

        # Upload from URL
        await chunkr.upload("https://example.com/document.pdf")

        # Upload from base64 string (must include MIME type header)
        await chunkr.upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")

        # Upload an image
        from PIL import Image
        img = Image.open("photo.jpg")
        await chunkr.upload(img)
        ```
        Returns:
            TaskResponse: The completed task response
        """
        task = await self.start_upload(file, config)
        return await task.poll_async()

    async def start_upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
        """Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll_async()`.

        Args:
            file: The file to upload.
            config: Configuration options for processing. Optional.

        Examples:
        ```
        # Upload from file path
        task = await chunkr.start_upload("document.pdf")

        # Upload from opened file
        with open("document.pdf", "rb") as f:
            task = await chunkr.start_upload(f)

        # Upload from URL
        task = await chunkr.start_upload("https://example.com/document.pdf")

        # Upload from base64 string (must include MIME type header)
        task = await chunkr.start_upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")

        # Upload an image
        from PIL import Image
        img = Image.open("photo.jpg")
        task = await chunkr.start_upload(img)

        # Wait for the task to complete - this can be done when needed
        await task.poll_async()
        ```

        Returns:
            TaskResponse: The initial task response
        """
        files, data = self._prepare_upload_data(file, config)
        upload_obj = files["file"][1]
        try:
            r = await self._client.post(
                f"{self.url}/api/v1/task",
                files=files,
                # Scalar options must travel as multipart form fields: httpx
                # ignores ``json=`` whenever ``files=`` is supplied.
                data=data,
                headers=self._headers()
            )
        finally:
            # Close handles/buffers created while preparing the upload; a
            # file object supplied by the caller stays open.
            if upload_obj is not file:
                upload_obj.close()
        r.raise_for_status()
        return TaskResponse(**r.json()).with_client(self)

    async def get_task(self, task_id: str) -> TaskResponse:
        """Get a task response by its ID.

        Args:
            task_id: The ID of the task to get

        Returns:
            TaskResponse: The task response
        """
        r = await self._client.get(
            f"{self.url}/api/v1/task/{task_id}",
            headers=self._headers()
        )
        r.raise_for_status()
        return TaskResponse(**r.json()).with_client(self)

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        # Release the HTTP client's connections when the context exits.
        await self._client.aclose()
|
chunkr_ai/api/config.py
ADDED
@@ -0,0 +1,131 @@
|
|
1
|
+
from pydantic import BaseModel, Field, model_validator
|
2
|
+
from enum import Enum
|
3
|
+
from typing import Optional, List, Dict
|
4
|
+
|
5
|
+
class GenerationStrategy(str, Enum):
    """String-valued choices used for the ``html`` / ``markdown`` generation fields."""

    LLM = "LLM"
    AUTO = "Auto"
|
8
|
+
|
9
|
+
class CroppingStrategy(str, Enum):
    """String-valued choices used for the ``crop_image`` field."""

    ALL = "All"
    AUTO = "Auto"
|
12
|
+
|
13
|
+
class LlmConfig(BaseModel):
    """LLM settings: model name, prompt and sampling temperature (default 0.0)."""

    model: str
    prompt: str
    temperature: float = 0.0
|
17
|
+
|
18
|
+
class GenerationConfig(BaseModel):
    """Optional generation settings; every field defaults to ``None``."""

    html: Optional[GenerationStrategy] = None
    llm: Optional[LlmConfig] = None
    markdown: Optional[GenerationStrategy] = None
    crop_image: Optional[CroppingStrategy] = None
|
23
|
+
|
24
|
+
class SegmentProcessing(BaseModel):
    """Optional ``GenerationConfig`` per named segment kind; all default to ``None``."""

    title: Optional[GenerationConfig] = None
    section_header: Optional[GenerationConfig] = None
    text: Optional[GenerationConfig] = None
    list_item: Optional[GenerationConfig] = None
    table: Optional[GenerationConfig] = None
    picture: Optional[GenerationConfig] = None
    caption: Optional[GenerationConfig] = None
    formula: Optional[GenerationConfig] = None
    footnote: Optional[GenerationConfig] = None
    page_header: Optional[GenerationConfig] = None
    page_footer: Optional[GenerationConfig] = None
    page: Optional[GenerationConfig] = None
|
37
|
+
|
38
|
+
class ChunkProcessing(BaseModel):
    """Chunking options; ``target_length`` is optional and defaults to ``None``."""

    target_length: Optional[int] = None
|
40
|
+
|
41
|
+
class Property(BaseModel):
    """One property entry of a ``JsonSchema``; ``name`` and ``prop_type`` are required."""

    name: str
    title: Optional[str] = None
    prop_type: str
    description: Optional[str] = None
    default: Optional[str] = None
|
47
|
+
|
48
|
+
class JsonSchema(BaseModel):
    """A titled list of ``Property`` entries."""

    title: str
    properties: List[Property]
|
51
|
+
|
52
|
+
class OcrStrategy(str, Enum):
    """String-valued choices used for ``Configuration.ocr_strategy``."""

    ALL = "All"
    AUTO = "Auto"
|
55
|
+
|
56
|
+
class SegmentationStrategy(str, Enum):
    """String-valued choices used for ``Configuration.segmentation_strategy``."""

    LAYOUT_ANALYSIS = "LayoutAnalysis"
    PAGE = "Page"
|
59
|
+
|
60
|
+
class BoundingBox(BaseModel):
    """Rectangle given by ``left``/``top`` origin and ``width``/``height``.

    NOTE(review): units and coordinate origin are not visible here — confirm
    against the API documentation.
    """

    left: float
    top: float
    width: float
    height: float
|
65
|
+
|
66
|
+
class OCRResult(BaseModel):
    """A piece of OCR text with its bounding box and an optional confidence."""

    bbox: BoundingBox
    text: str
    # No default: under pydantic v2 the key must be present, though it may be None.
    confidence: Optional[float]
|
70
|
+
|
71
|
+
class SegmentType(str, Enum):
    """String-valued segment kinds used for ``Segment.segment_type``."""

    CAPTION = "Caption"
    FOOTNOTE = "Footnote"
    FORMULA = "Formula"
    LIST_ITEM = "ListItem"
    PAGE = "Page"
    PAGE_FOOTER = "PageFooter"
    PAGE_HEADER = "PageHeader"
    PICTURE = "Picture"
    SECTION_HEADER = "SectionHeader"
    TABLE = "Table"
    TEXT = "Text"
    TITLE = "Title"
|
84
|
+
|
85
|
+
class Segment(BaseModel):
    """A single segment of a chunk: its location, type and rendered content."""

    bbox: BoundingBox
    content: str
    page_height: float
    # The Optional fields below have no default: under pydantic v2 each key
    # must be present in the payload, though its value may be None.
    html: Optional[str]
    image: Optional[str]
    markdown: Optional[str]
    ocr: List[OCRResult]
    page_number: int
    page_width: float
    segment_id: str
    segment_type: SegmentType
|
97
|
+
|
98
|
+
class Chunk(BaseModel):
    """A chunk: an id, a length and the ordered segments it contains."""

    chunk_id: str
    chunk_length: int
    segments: List[Segment]
|
102
|
+
|
103
|
+
class ExtractedJson(BaseModel):
    """Wrapper around an extracted ``data`` dictionary."""

    data: Dict
|
105
|
+
|
106
|
+
class OutputResponse(BaseModel):
    """Task output: the list of chunks plus optional extracted JSON."""

    chunks: List[Chunk] = []
    # No default: under pydantic v2 the key must be present, though it may be None.
    extracted_json: Optional[ExtractedJson]
|
109
|
+
|
110
|
+
class Model(str, Enum):
    """String-valued choices used for ``Configuration.model``."""

    FAST = "Fast"
    HIGH_QUALITY = "HighQuality"
|
113
|
+
|
114
|
+
class Configuration(BaseModel):
    """Processing options sent with an upload; every field is optional.

    The deprecated ``target_chunk_length`` input key is still accepted and is
    mapped onto ``chunk_processing.target_length``.
    """

    chunk_processing: Optional[ChunkProcessing] = Field(default=None)
    expires_in: Optional[int] = Field(default=None)
    high_resolution: Optional[bool] = Field(default=None)
    json_schema: Optional[JsonSchema] = Field(default=None)
    model: Optional[Model] = Field(default=None)
    ocr_strategy: Optional[OcrStrategy] = Field(default=None)
    segment_processing: Optional[SegmentProcessing] = Field(default=None)
    segmentation_strategy: Optional[SegmentationStrategy] = Field(default=None)

    @model_validator(mode='before')
    @classmethod
    def map_deprecated_fields(cls, values: Dict) -> Dict:
        """Translate ``target_chunk_length`` into ``chunk_processing.target_length``.

        Works on copies, so the caller's input mapping (and any nested
        ``chunk_processing`` dict) is left untouched. A ``None`` value for the
        deprecated key is dropped without touching ``chunk_processing``.
        """
        if not isinstance(values, dict) or "target_chunk_length" not in values:
            return values

        values = dict(values)
        target_length = values.pop("target_chunk_length")
        if target_length is not None:
            existing = values.get("chunk_processing")
            if isinstance(existing, BaseModel):
                # An already-built ChunkProcessing does not support item
                # assignment; convert it to a plain dict first.
                existing = existing.model_dump()
            chunk_processing = dict(existing or {})
            chunk_processing["target_length"] = target_length
            values["chunk_processing"] = chunk_processing
        return values
|
@@ -0,0 +1,19 @@
|
|
1
|
+
from typing import runtime_checkable, Protocol
|
2
|
+
from requests import Session
|
3
|
+
from httpx import AsyncClient
|
4
|
+
|
5
|
+
@runtime_checkable
class ChunkrClientProtocol(Protocol):
    """Protocol defining the interface for Chunkr clients

    NOTE(review): both ``_session`` and ``_client`` are declared, while the
    clients visible in this package each set only one of them (``Chunkr``
    sets ``_session``, ``ChunkrAsync`` sets ``_client``) — confirm whether a
    runtime ``isinstance`` check against this protocol is ever relied upon.
    """

    url: str
    _api_key: str
    _session: Session
    _client: AsyncClient

    def get_api_key(self) -> str:
        """Get the API key"""
        ...

    def _headers(self) -> dict:
        """Return headers required for API requests"""
        ...
|
chunkr_ai/api/task.py
ADDED
@@ -0,0 +1,131 @@
|
|
1
|
+
from .protocol import ChunkrClientProtocol
|
2
|
+
from .config import Configuration, OutputResponse
|
3
|
+
import asyncio
|
4
|
+
from datetime import datetime
|
5
|
+
from enum import Enum
|
6
|
+
from pydantic import BaseModel, PrivateAttr
|
7
|
+
import time
|
8
|
+
from typing import Optional, Union
|
9
|
+
|
10
|
+
class Status(str, Enum):
    """String-valued task states used for ``TaskResponse.status``."""

    STARTING = "Starting"
    PROCESSING = "Processing"
    SUCCEEDED = "Succeeded"
    FAILED = "Failed"
|
15
|
+
|
16
|
+
class TaskResponse(BaseModel):
    """Task record returned by the API, with polling and content helpers.

    A client must be attached with ``with_client()`` before ``poll()`` or
    ``poll_async()`` can be used.
    """

    configuration: Configuration
    created_at: datetime
    expires_at: Optional[datetime]
    file_name: Optional[str]
    finished_at: Optional[datetime]
    input_file_url: Optional[str]
    message: str
    output: Optional[OutputResponse]
    page_count: Optional[int]
    pdf_url: Optional[str]
    status: Status
    task_id: str
    task_url: Optional[str]
    # Client used for polling; a private attribute, not part of the payload.
    _client: Optional[ChunkrClientProtocol] = PrivateAttr(default=None)

    def with_client(self, client: Union[ChunkrClientProtocol]) -> 'TaskResponse':
        """Attach the client used for polling and return ``self``."""
        self._client = client
        return self

    def _ensure_pollable(self) -> None:
        """Raise ``ValueError`` unless both a task URL and a client are available."""
        if not self.task_url:
            raise ValueError("Task URL not found in response")
        if self._client is None:
            raise ValueError("No client attached to this task; call with_client() before polling")

    def _apply_update(self, response: dict) -> None:
        """Validate a polled payload and copy its fields onto this instance.

        Validating first keeps nested fields (``output``, ``configuration``,
        ``status``) as model/enum objects instead of raw JSON values.
        """
        refreshed = type(self)(**response)
        self.__dict__.update(refreshed.__dict__)

    def _poll_request_sync(self) -> dict:
        """Helper method to make polling request with retry logic (synchronous)"""
        self._ensure_pollable()

        while True:
            try:
                r = self._client._session.get(self.task_url, headers=self._client._headers())
                r.raise_for_status()
                return r.json()
            # NOTE(review): these are the builtin exception classes;
            # requests' ConnectionError/Timeout do not subclass them, so
            # transport errors raised by requests propagate without a retry.
            except (ConnectionError, TimeoutError):
                print("Connection error while polling the task, retrying...")
                time.sleep(0.5)

    async def _poll_request_async(self) -> dict:
        """Helper method to make polling request with retry logic (asynchronous)"""
        self._ensure_pollable()

        while True:
            try:
                r = await self._client._client.get(self.task_url, headers=self._client._headers())
                # httpx responses are already fully read here; these two
                # calls are synchronous and must not be awaited.
                r.raise_for_status()
                return r.json()
            # NOTE(review): these are the builtin exception classes; httpx
            # transport errors do not subclass them, so they propagate
            # without a retry.
            except (ConnectionError, TimeoutError):
                print("Connection error while polling the task, retrying...")
                await asyncio.sleep(0.5)

    def _check_status(self) -> Optional['TaskResponse']:
        """Helper method to check task status and handle completion/failure

        Returns ``self`` once the task is no longer starting/processing,
        ``None`` while it is still running.

        Raises:
            ValueError: If the task status is ``Failed`` (carries ``message``).
        """
        if self.status == Status.FAILED:
            raise ValueError(self.message)
        if self.status not in (Status.STARTING, Status.PROCESSING):
            return self
        return None

    def poll(self) -> 'TaskResponse':
        """Poll the task for completion."""
        while True:
            self._apply_update(self._poll_request_sync())

            if result := self._check_status():
                return result

            time.sleep(0.5)

    async def poll_async(self) -> 'TaskResponse':
        """Poll the task for completion asynchronously."""
        while True:
            self._apply_update(await self._poll_request_async())

            if result := self._check_status():
                return result

            await asyncio.sleep(0.5)

    def _get_content(self, content_type: str) -> str:
        """Helper method to get either HTML, Markdown, or raw content.

        Joins the non-empty ``content_type`` attribute of every segment with
        newlines; returns an empty string when there is no output.
        """
        if not self.output:
            return ""
        parts = [
            getattr(segment, content_type)
            for chunk in self.output.chunks
            for segment in chunk.segments
        ]
        return "\n".join(part for part in parts if part)

    def html(self) -> str:
        """Get full HTML for the task"""
        return self._get_content("html")

    def markdown(self) -> str:
        """Get full markdown for the task"""
        return self._get_content("markdown")

    def content(self) -> str:
        """Get full text for the task"""
        return self._get_content("content")
|
121
|
+
|
122
|
+
class TaskPayload(BaseModel):
    """Task payload record: configurations, storage locations and owner ids."""

    current_configuration: Configuration
    file_name: str
    image_folder_location: str
    input_location: str
    output_location: str
    pdf_location: str
    # No default: under pydantic v2 the key must be present, though it may be None.
    previous_configuration: Optional[Configuration]
    task_id: str
    user_id: str
|
chunkr_ai/models.py
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
from .api.config import (
|
2
|
+
BoundingBox,
|
3
|
+
Chunk,
|
4
|
+
ChunkProcessing,
|
5
|
+
Configuration,
|
6
|
+
CroppingStrategy,
|
7
|
+
ExtractedJson,
|
8
|
+
GenerationStrategy,
|
9
|
+
GenerationConfig,
|
10
|
+
JsonSchema,
|
11
|
+
LlmConfig,
|
12
|
+
Model,
|
13
|
+
OCRResult,
|
14
|
+
OcrStrategy,
|
15
|
+
OutputResponse,
|
16
|
+
Property,
|
17
|
+
Segment,
|
18
|
+
SegmentProcessing,
|
19
|
+
SegmentType,
|
20
|
+
SegmentationStrategy,
|
21
|
+
)
|
22
|
+
|
23
|
+
from .api.task import TaskResponse, TaskPayload, Status
|
24
|
+
|
25
|
+
__all__ = [
|
26
|
+
'BoundingBox',
|
27
|
+
'Chunk',
|
28
|
+
'ChunkProcessing',
|
29
|
+
'Configuration',
|
30
|
+
'CroppingStrategy',
|
31
|
+
'ExtractedJson',
|
32
|
+
'GenerationConfig',
|
33
|
+
'GenerationStrategy',
|
34
|
+
'JsonSchema',
|
35
|
+
'LlmConfig',
|
36
|
+
'Model',
|
37
|
+
'OCRResult',
|
38
|
+
'OcrStrategy',
|
39
|
+
'OutputResponse',
|
40
|
+
'Property',
|
41
|
+
'Segment',
|
42
|
+
'SegmentProcessing',
|
43
|
+
'SegmentType',
|
44
|
+
'SegmentationStrategy',
|
45
|
+
'Status',
|
46
|
+
'TaskPayload',
|
47
|
+
'TaskResponse'
|
48
|
+
]
|
@@ -0,0 +1,204 @@
|
|
1
|
+
Metadata-Version: 2.2
|
2
|
+
Name: chunkr-ai
|
3
|
+
Version: 0.0.4
|
4
|
+
Summary: Python client for Chunkr: open source document intelligence
|
5
|
+
Author-email: Ishaan Kapoor <ishaan@lumina.sh>
|
6
|
+
Project-URL: Homepage, https://chunkr.ai
|
7
|
+
Description-Content-Type: text/markdown
|
8
|
+
License-File: LICENSE
|
9
|
+
Requires-Dist: httpx>=0.28.1
|
10
|
+
Requires-Dist: pillow>=11.1.0
|
11
|
+
Requires-Dist: pydantic>=2.10.4
|
12
|
+
Requires-Dist: python-dotenv>=1.0.1
|
13
|
+
Requires-Dist: requests>=2.32.3
|
14
|
+
Provides-Extra: test
|
15
|
+
Requires-Dist: pytest>=8.3.4; extra == "test"
|
16
|
+
Requires-Dist: pytest-xdist>=3.6.1; extra == "test"
|
17
|
+
|
18
|
+
# Chunkr Python Client
|
19
|
+
|
20
|
+
This provides a simple interface to interact with the Chunkr API.
|
21
|
+
|
22
|
+
## Getting Started
|
23
|
+
|
24
|
+
You can get an API key from [Chunkr](https://chunkr.ai) or deploy your own Chunkr instance. For self-hosted deployment options, check out our [deployment guide](https://github.com/lumina-ai-inc/chunkr/tree/main?tab=readme-ov-file#self-hosted-deployment-options).
|
25
|
+
|
26
|
+
For more information about the API and its capabilities, visit the [Chunkr API docs](https://docs.chunkr.ai).
|
27
|
+
|
28
|
+
## Installation
|
29
|
+
|
30
|
+
```bash
|
31
|
+
pip install chunkr-ai
|
32
|
+
```
|
33
|
+
|
34
|
+
## Usage
|
35
|
+
|
36
|
+
We provide two clients: `Chunkr` for synchronous operations and `ChunkrAsync` for asynchronous operations.
|
37
|
+
|
38
|
+
### Synchronous Usage
|
39
|
+
|
40
|
+
```python
|
41
|
+
from chunkr_ai import Chunkr
|
42
|
+
|
43
|
+
# Initialize client
|
44
|
+
chunkr = Chunkr()
|
45
|
+
|
46
|
+
# Upload a file and wait for processing
|
47
|
+
task = chunkr.upload("document.pdf")
|
48
|
+
|
49
|
+
# Print the response
|
50
|
+
print(task)
|
51
|
+
|
52
|
+
# Get output from task
|
53
|
+
output = task.output
|
54
|
+
|
55
|
+
# If you want to upload without waiting for processing
|
56
|
+
task = chunkr.start_upload("document.pdf")
|
57
|
+
# ... do other things ...
|
58
|
+
task.poll() # Check status when needed
|
59
|
+
```
|
60
|
+
|
61
|
+
### Asynchronous Usage
|
62
|
+
|
63
|
+
```python
|
64
|
+
from chunkr_ai import ChunkrAsync
|
65
|
+
|
66
|
+
async def process_document():
|
67
|
+
# Initialize client
|
68
|
+
chunkr = ChunkrAsync()
|
69
|
+
|
70
|
+
# Upload a file and wait for processing
|
71
|
+
task = await chunkr.upload("document.pdf")
|
72
|
+
|
73
|
+
# Print the response
|
74
|
+
print(task)
|
75
|
+
|
76
|
+
# Get output from task
|
77
|
+
output = task.output
|
78
|
+
|
79
|
+
# If you want to upload without waiting for processing
|
80
|
+
task = await chunkr.start_upload("document.pdf")
|
81
|
+
# ... do other things ...
|
82
|
+
await task.poll_async() # Check status when needed
|
83
|
+
```
|
84
|
+
|
85
|
+
### Additional Features
|
86
|
+
|
87
|
+
Both clients support various input types:
|
88
|
+
|
89
|
+
```python
|
90
|
+
# Upload from file path
|
91
|
+
chunkr.upload("document.pdf")
|
92
|
+
|
93
|
+
# Upload from opened file
|
94
|
+
with open("document.pdf", "rb") as f:
|
95
|
+
chunkr.upload(f)
|
96
|
+
|
97
|
+
# Upload from URL
|
98
|
+
chunkr.upload("https://example.com/document.pdf")
|
99
|
+
|
100
|
+
# Upload from base64 string
|
101
|
+
chunkr.upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
|
102
|
+
|
103
|
+
# Upload an image
|
104
|
+
from PIL import Image
|
105
|
+
img = Image.open("photo.jpg")
|
106
|
+
chunkr.upload(img)
|
107
|
+
```
|
108
|
+
|
109
|
+
### Configuration
|
110
|
+
|
111
|
+
You can customize the processing behavior by passing a `Configuration` object:
|
112
|
+
|
113
|
+
```python
|
114
|
+
from chunkr_ai.models import Configuration, OcrStrategy, SegmentationStrategy, GenerationStrategy
|
115
|
+
|
116
|
+
# Basic configuration
|
117
|
+
config = Configuration(
|
118
|
+
ocr_strategy=OcrStrategy.AUTO,
|
119
|
+
segmentation_strategy=SegmentationStrategy.LAYOUT_ANALYSIS,
|
120
|
+
high_resolution=True,
|
121
|
+
expires_in=3600, # seconds
|
122
|
+
)
|
123
|
+
|
124
|
+
# Upload with configuration
|
125
|
+
task = chunkr.upload("document.pdf", config)
|
126
|
+
```
|
127
|
+
|
128
|
+
#### Available Configuration Examples
|
129
|
+
|
130
|
+
- **Chunk Processing**
|
131
|
+
```python
|
132
|
+
from chunkr_ai.models import ChunkProcessing
|
133
|
+
config = Configuration(
|
134
|
+
chunk_processing=ChunkProcessing(target_length=1024)
|
135
|
+
)
|
136
|
+
```
|
137
|
+
- **Expires In**
|
138
|
+
```python
|
139
|
+
config = Configuration(expires_in=3600)
|
140
|
+
```
|
141
|
+
|
142
|
+
- **High Resolution**
|
143
|
+
```python
|
144
|
+
config = Configuration(high_resolution=True)
|
145
|
+
```
|
146
|
+
|
147
|
+
- **JSON Schema**
|
148
|
+
```python
|
149
|
+
config = Configuration(json_schema=JsonSchema(
|
150
|
+
title="Sales Data",
|
151
|
+
properties=[
|
152
|
+
Property(name="Person with highest sales", prop_type="string", description="The person with the highest sales"),
|
153
|
+
Property(name="Person with lowest sales", prop_type="string", description="The person with the lowest sales"),
|
154
|
+
]
|
155
|
+
))
|
156
|
+
```
|
157
|
+
|
158
|
+
- **OCR Strategy**
|
159
|
+
```python
|
160
|
+
config = Configuration(ocr_strategy=OcrStrategy.AUTO)
|
161
|
+
```
|
162
|
+
|
163
|
+
- **Segment Processing**
|
164
|
+
```python
|
165
|
+
from chunkr_ai.models import SegmentProcessing, GenerationConfig, GenerationStrategy
|
166
|
+
config = Configuration(
|
167
|
+
segment_processing=SegmentProcessing(
|
168
|
+
page=GenerationConfig(
|
169
|
+
html=GenerationStrategy.LLM,
|
170
|
+
markdown=GenerationStrategy.LLM
|
171
|
+
)
|
172
|
+
)
|
173
|
+
)
|
174
|
+
```
|
175
|
+
|
176
|
+
- **Segmentation Strategy**
|
177
|
+
```python
|
178
|
+
config = Configuration(
|
179
|
+
segmentation_strategy=SegmentationStrategy.LAYOUT_ANALYSIS # or SegmentationStrategy.PAGE
|
180
|
+
)
|
181
|
+
```
|
182
|
+
|
183
|
+
## Environment setup
|
184
|
+
|
185
|
+
You can provide your API key and URL in several ways:
|
186
|
+
1. Environment variables: `CHUNKR_API_KEY` and `CHUNKR_URL`
|
187
|
+
2. `.env` file
|
188
|
+
3. Direct initialization:
|
189
|
+
```python
|
190
|
+
chunkr = Chunkr(
|
191
|
+
api_key="your-api-key",
|
192
|
+
url="https://api.chunkr.ai"
|
193
|
+
)
|
194
|
+
```
|
195
|
+
|
196
|
+
## Run tests
|
197
|
+
|
198
|
+
```bash
|
199
|
+
# Install dependencies
|
200
|
+
uv pip install -e ".[test]"
|
201
|
+
|
202
|
+
# Run tests
|
203
|
+
uv run pytest
|
204
|
+
```
|
@@ -0,0 +1,17 @@
|
|
1
|
+
chunkr_ai/__init__.py,sha256=eXygrEhGxxIHXNYIlHF2eied8rGsx2RphgR8Wo4lRyo,110
|
2
|
+
chunkr_ai/main.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
+
chunkr_ai/models.py,sha256=d-B4vfgZClJOoHdPaH3vagwUc4qxeQSmUxab77DKYtQ,874
|
4
|
+
chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
+
chunkr_ai/api/api.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
+
chunkr_ai/api/auth.py,sha256=iSd5Jek2BFaHGw9HY-RrqgwP56BHFU0xbSuJS4fU6AA,425
|
7
|
+
chunkr_ai/api/base.py,sha256=WDHx8tU0fl9_-yvYTKL-U0uaxHv-8_bRfiw9Xkl-mWM,6499
|
8
|
+
chunkr_ai/api/chunkr.py,sha256=LkBFzGB_T0y3fnBeIn_nwQW6Mb7eZO-iTlzWrmWBoko,3450
|
9
|
+
chunkr_ai/api/chunkr_async.py,sha256=B9deRVoe4h3Csh_jEuQxuxQ-DKSuZPdwkanFTyfHmeM,3603
|
10
|
+
chunkr_ai/api/config.py,sha256=K0s1giImciPksu-bO9gzRwUaK2Vo1nxNKQkXlRQ2cb8,3785
|
11
|
+
chunkr_ai/api/protocol.py,sha256=XKS9RmtvBpJItYhPg18qlOCKpaSHdOuQTRSUxAdUz2g,479
|
12
|
+
chunkr_ai/api/task.py,sha256=ALU-rYlObbitlM1MKEFeSz_IBUpzb9736Iqu9huWg7c,4392
|
13
|
+
chunkr_ai-0.0.4.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
14
|
+
chunkr_ai-0.0.4.dist-info/METADATA,sha256=7k2zij-F7_Kcs6nFCJMKQW382gFpOOLAnZoOOXFrKFs,4913
|
15
|
+
chunkr_ai-0.0.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
16
|
+
chunkr_ai-0.0.4.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
|
17
|
+
chunkr_ai-0.0.4.dist-info/RECORD,,
|
chunkr_ai/api/models.py
DELETED
@@ -1,231 +0,0 @@
|
|
1
|
-
from .auth import HeadersMixin
|
2
|
-
import asyncio
|
3
|
-
from datetime import datetime
|
4
|
-
from enum import Enum
|
5
|
-
import httpx
|
6
|
-
from pydantic import BaseModel, Field, PrivateAttr
|
7
|
-
import requests
|
8
|
-
import time
|
9
|
-
from typing import Optional, List, Dict, Union
|
10
|
-
|
11
|
-
class GenerationStrategy(str, Enum):
|
12
|
-
LLM = "LLM"
|
13
|
-
AUTO = "Auto"
|
14
|
-
|
15
|
-
class CroppingStrategy(str, Enum):
|
16
|
-
ALL = "All"
|
17
|
-
AUTO = "Auto"
|
18
|
-
|
19
|
-
class LlmConfig(BaseModel):
|
20
|
-
model: str
|
21
|
-
prompt: str
|
22
|
-
temperature: float = 0.0
|
23
|
-
|
24
|
-
class AutoGenerationConfig(BaseModel):
|
25
|
-
html: GenerationStrategy = GenerationStrategy.AUTO
|
26
|
-
llm: Optional[LlmConfig] = None
|
27
|
-
markdown: GenerationStrategy = GenerationStrategy.AUTO
|
28
|
-
crop_image: CroppingStrategy = CroppingStrategy.ALL
|
29
|
-
|
30
|
-
class LlmGenerationConfig(BaseModel):
|
31
|
-
html: GenerationStrategy = GenerationStrategy.LLM
|
32
|
-
llm: Optional[LlmConfig] = None
|
33
|
-
markdown: GenerationStrategy = GenerationStrategy.LLM
|
34
|
-
crop_image: CroppingStrategy = CroppingStrategy.ALL
|
35
|
-
|
36
|
-
class SegmentProcessing(BaseModel):
|
37
|
-
title: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
|
38
|
-
section_header: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
|
39
|
-
text: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
|
40
|
-
list_item: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
|
41
|
-
table: LlmGenerationConfig = Field(default_factory=LlmGenerationConfig)
|
42
|
-
picture: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
|
43
|
-
caption: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
|
44
|
-
formula: LlmGenerationConfig = Field(default_factory=LlmGenerationConfig)
|
45
|
-
footnote: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
|
46
|
-
page_header: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
|
47
|
-
page_footer: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
|
48
|
-
page: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
|
49
|
-
|
50
|
-
class ChunkProcessing(BaseModel):
|
51
|
-
target_length: int = 512
|
52
|
-
|
53
|
-
class Property(BaseModel):
|
54
|
-
name: str
|
55
|
-
title: Optional[str]
|
56
|
-
prop_type: str
|
57
|
-
description: Optional[str]
|
58
|
-
default: Optional[str]
|
59
|
-
|
60
|
-
class JsonSchema(BaseModel):
|
61
|
-
title: str
|
62
|
-
properties: List[Property]
|
63
|
-
schema_type: Optional[str]
|
64
|
-
|
65
|
-
class OcrStrategy(str, Enum):
|
66
|
-
ALL = "All"
|
67
|
-
AUTO = "Auto"
|
68
|
-
|
69
|
-
class SegmentationStrategy(str, Enum):
|
70
|
-
LAYOUT_ANALYSIS = "LayoutAnalysis"
|
71
|
-
PAGE = "Page"
|
72
|
-
|
73
|
-
class BoundingBox(BaseModel):
|
74
|
-
left: float
|
75
|
-
top: float
|
76
|
-
width: float
|
77
|
-
height: float
|
78
|
-
|
79
|
-
class OCRResult(BaseModel):
|
80
|
-
bbox: BoundingBox
|
81
|
-
text: str
|
82
|
-
confidence: Optional[float]
|
83
|
-
|
84
|
-
class SegmentType(str, Enum):
|
85
|
-
CAPTION = "Caption"
|
86
|
-
FOOTNOTE = "Footnote"
|
87
|
-
FORMULA = "Formula"
|
88
|
-
LIST_ITEM = "ListItem"
|
89
|
-
PAGE = "Page"
|
90
|
-
PAGE_FOOTER = "PageFooter"
|
91
|
-
PAGE_HEADER = "PageHeader"
|
92
|
-
PICTURE = "Picture"
|
93
|
-
SECTION_HEADER = "SectionHeader"
|
94
|
-
TABLE = "Table"
|
95
|
-
TEXT = "Text"
|
96
|
-
TITLE = "Title"
|
97
|
-
|
98
|
-
class Segment(BaseModel):
|
99
|
-
bbox: BoundingBox
|
100
|
-
content: str
|
101
|
-
page_height: float
|
102
|
-
html: Optional[str]
|
103
|
-
image: Optional[str]
|
104
|
-
markdown: Optional[str]
|
105
|
-
ocr: List[OCRResult]
|
106
|
-
page_number: int
|
107
|
-
page_width: float
|
108
|
-
segment_id: str
|
109
|
-
segment_type: SegmentType
|
110
|
-
|
111
|
-
class Chunk(BaseModel):
|
112
|
-
chunk_id: str
|
113
|
-
chunk_length: int
|
114
|
-
segments: List[Segment]
|
115
|
-
|
116
|
-
class ExtractedJson(BaseModel):
|
117
|
-
data: Dict
|
118
|
-
|
119
|
-
class OutputResponse(BaseModel):
|
120
|
-
chunks: List[Chunk] = []
|
121
|
-
extracted_json: Optional[ExtractedJson]
|
122
|
-
|
123
|
-
class Model(str, Enum):
|
124
|
-
FAST = "Fast"
|
125
|
-
HIGH_QUALITY = "HighQuality"
|
126
|
-
|
127
|
-
class Configuration(BaseModel):
|
128
|
-
chunk_processing: ChunkProcessing = Field(default_factory=ChunkProcessing)
|
129
|
-
expires_in: Optional[int] = None
|
130
|
-
high_resolution: bool = False
|
131
|
-
json_schema: Optional[JsonSchema] = None
|
132
|
-
model: Optional[Model] = Field(None, deprecated=True)
|
133
|
-
ocr_strategy: OcrStrategy = OcrStrategy.AUTO
|
134
|
-
segment_processing: SegmentProcessing = Field(default_factory=SegmentProcessing)
|
135
|
-
segmentation_strategy: SegmentationStrategy = SegmentationStrategy.LAYOUT_ANALYSIS
|
136
|
-
target_chunk_length: Optional[int] = Field(None, deprecated=True)
|
137
|
-
|
138
|
-
|
139
|
-
class Status(str, Enum):
|
140
|
-
STARTING = "Starting"
|
141
|
-
PROCESSING = "Processing"
|
142
|
-
SUCCEEDED = "Succeeded"
|
143
|
-
FAILED = "Failed"
|
144
|
-
|
145
|
-
class TaskResponse(BaseModel, HeadersMixin):
|
146
|
-
configuration: Configuration
|
147
|
-
created_at: datetime
|
148
|
-
expires_at: Optional[datetime]
|
149
|
-
file_name: Optional[str]
|
150
|
-
finished_at: Optional[datetime]
|
151
|
-
input_file_url: Optional[str]
|
152
|
-
message: str
|
153
|
-
output: Optional[OutputResponse]
|
154
|
-
page_count: Optional[int]
|
155
|
-
pdf_url: Optional[str]
|
156
|
-
status: Status
|
157
|
-
task_id: str
|
158
|
-
task_url: Optional[str]
|
159
|
-
_api_key: Optional[str] = PrivateAttr(default=None)
|
160
|
-
|
161
|
-
def with_api_key(self, api_key: str) -> 'TaskResponse':
|
162
|
-
"""Helper function to set api key on a TaskResponse after creation"""
|
163
|
-
self._api_key = api_key
|
164
|
-
return self
|
165
|
-
|
166
|
-
def poll(self) -> 'TaskResponse':
|
167
|
-
"""Poll the task for completion"""
|
168
|
-
if not self.task_url:
|
169
|
-
raise ValueError("Task URL not found in response")
|
170
|
-
|
171
|
-
while True:
|
172
|
-
r = requests.get(self.task_url, headers=self._headers())
|
173
|
-
r.raise_for_status()
|
174
|
-
self.__dict__.update(r.json())
|
175
|
-
if self.status == "Failed":
|
176
|
-
raise ValueError(self.message)
|
177
|
-
if self.status not in ("Starting", "Processing"):
|
178
|
-
return self
|
179
|
-
time.sleep(0.5)
|
180
|
-
|
181
|
-
async def poll_async(self) -> 'TaskResponse':
|
182
|
-
"""Async poll the task for completion"""
|
183
|
-
if not self.task_url:
|
184
|
-
raise ValueError("Task URL not found in response")
|
185
|
-
|
186
|
-
async with httpx.AsyncClient() as client:
|
187
|
-
while True:
|
188
|
-
r = await client.get(self.task_url, headers=self._headers())
|
189
|
-
r.raise_for_status()
|
190
|
-
self.__dict__.update(r.json())
|
191
|
-
if self.status == "Failed":
|
192
|
-
raise ValueError(self.message)
|
193
|
-
if self.status not in ("Starting", "Processing"):
|
194
|
-
return self
|
195
|
-
await asyncio.sleep(0.5)
|
196
|
-
|
197
|
-
|
198
|
-
def _get_content(self, content_type: str) -> str:
|
199
|
-
"""Helper method to get either HTML, Markdown, or raw content."""
|
200
|
-
if not self.output:
|
201
|
-
return ""
|
202
|
-
parts = []
|
203
|
-
for c in self.output.chunks:
|
204
|
-
for s in c.segments:
|
205
|
-
content = getattr(s, content_type)
|
206
|
-
if content:
|
207
|
-
parts.append(content)
|
208
|
-
return "\n".join(parts)
|
209
|
-
|
210
|
-
def html(self) -> str:
|
211
|
-
"""Get full HTML for the task"""
|
212
|
-
return self._get_content("html")
|
213
|
-
|
214
|
-
def markdown(self) -> str:
|
215
|
-
"""Get full markdown for the task"""
|
216
|
-
return self._get_content("markdown")
|
217
|
-
|
218
|
-
def content(self) -> str:
|
219
|
-
"""Get full text for the task"""
|
220
|
-
return self._get_content("content")
|
221
|
-
|
222
|
-
class TaskPayload(BaseModel):
|
223
|
-
current_configuration: Configuration
|
224
|
-
file_name: str
|
225
|
-
image_folder_location: str
|
226
|
-
input_location: str
|
227
|
-
output_location: str
|
228
|
-
pdf_location: str
|
229
|
-
previous_configuration: Optional[Configuration]
|
230
|
-
task_id: str
|
231
|
-
user_id: str
|
@@ -1,16 +0,0 @@
|
|
1
|
-
Metadata-Version: 2.2
|
2
|
-
Name: chunkr-ai
|
3
|
-
Version: 0.0.2
|
4
|
-
Summary: Python client for chunkr: open source document intelligence
|
5
|
-
Author-email: Ishaan Kapoor <ishaan@lumina.sh>
|
6
|
-
Description-Content-Type: text/markdown
|
7
|
-
License-File: LICENSE
|
8
|
-
Requires-Dist: build>=1.2.2.post1
|
9
|
-
Requires-Dist: httpx>=0.28.1
|
10
|
-
Requires-Dist: pillow>=11.1.0
|
11
|
-
Requires-Dist: pydantic>=2.10.4
|
12
|
-
Requires-Dist: python-dotenv>=1.0.1
|
13
|
-
Requires-Dist: requests>=2.32.3
|
14
|
-
Requires-Dist: twine>=6.0.1
|
15
|
-
Provides-Extra: test
|
16
|
-
Requires-Dist: pytest>=8.3.4; extra == "test"
|
chunkr_ai-0.0.2.dist-info/RECORD
DELETED
@@ -1,12 +0,0 @@
|
|
1
|
-
chunkr_ai/__init__.py,sha256=eXygrEhGxxIHXNYIlHF2eied8rGsx2RphgR8Wo4lRyo,110
|
2
|
-
chunkr_ai/main.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
-
chunkr_ai/api/api.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
|
-
chunkr_ai/api/auth.py,sha256=U25WiNQBsrAWYAntuds0zSMvB4gUpAwGoSa5wnQ2LRQ,454
|
5
|
-
chunkr_ai/api/chunkr.py,sha256=UqFoK8ytCsW1I5F0nM4OD6I4zigy-UHzGuMDtpvMSmE,4454
|
6
|
-
chunkr_ai/api/chunkr_async.py,sha256=Kfh7_DEon6QTPe-XJops8l9R6rp0zIfJKeh9ZEGFQao,1529
|
7
|
-
chunkr_ai/api/models.py,sha256=vAVeRHgdSO4SDl009R2Vz75WtuXAwkUZW8ZsVXk9yBA,7221
|
8
|
-
chunkr_ai-0.0.2.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
9
|
-
chunkr_ai-0.0.2.dist-info/METADATA,sha256=ZK6gdzkukxMEVr1WxodLZ9dZNHar32C00ST1LG9mFl8,519
|
10
|
-
chunkr_ai-0.0.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
11
|
-
chunkr_ai-0.0.2.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
|
12
|
-
chunkr_ai-0.0.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|