chunkr-ai 0.0.1__tar.gz → 0.0.3__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- chunkr_ai-0.0.3/PKG-INFO +124 -0
- chunkr_ai-0.0.3/README.md +107 -0
- chunkr_ai-0.0.3/pyproject.toml +25 -0
- chunkr_ai-0.0.3/src/chunkr_ai/__init__.py +4 -0
- chunkr_ai-0.0.3/src/chunkr_ai/api/auth.py +12 -0
- chunkr_ai-0.0.3/src/chunkr_ai/api/base.py +173 -0
- chunkr_ai-0.0.3/src/chunkr_ai/api/chunkr.py +108 -0
- chunkr_ai-0.0.3/src/chunkr_ai/api/chunkr_async.py +105 -0
- chunkr_ai-0.0.3/src/chunkr_ai/api/config.py +130 -0
- chunkr_ai-0.0.3/src/chunkr_ai/api/protocol.py +19 -0
- chunkr_ai-0.0.3/src/chunkr_ai/api/task.py +124 -0
- chunkr_ai-0.0.3/src/chunkr_ai/models.py +49 -0
- chunkr_ai-0.0.3/src/chunkr_ai.egg-info/PKG-INFO +124 -0
- chunkr_ai-0.0.3/src/chunkr_ai.egg-info/SOURCES.txt +21 -0
- chunkr_ai-0.0.3/src/chunkr_ai.egg-info/requires.txt +9 -0
- chunkr_ai-0.0.3/tests/test_chunkr.py +141 -0
- chunkr_ai-0.0.1/PKG-INFO +0 -7
- chunkr_ai-0.0.1/pyproject.toml +0 -11
- chunkr_ai-0.0.1/src/chunkr_ai.egg-info/PKG-INFO +0 -7
- chunkr_ai-0.0.1/src/chunkr_ai.egg-info/SOURCES.txt +0 -9
- {chunkr_ai-0.0.1 → chunkr_ai-0.0.3}/LICENSE +0 -0
- {chunkr_ai-0.0.1 → chunkr_ai-0.0.3}/setup.cfg +0 -0
- {chunkr_ai-0.0.1/src/chunkr_ai → chunkr_ai-0.0.3/src/chunkr_ai/api}/__init__.py +0 -0
- /chunkr_ai-0.0.1/README.md → /chunkr_ai-0.0.3/src/chunkr_ai/api/api.py +0 -0
- {chunkr_ai-0.0.1 → chunkr_ai-0.0.3}/src/chunkr_ai/main.py +0 -0
- {chunkr_ai-0.0.1 → chunkr_ai-0.0.3}/src/chunkr_ai.egg-info/dependency_links.txt +0 -0
- {chunkr_ai-0.0.1 → chunkr_ai-0.0.3}/src/chunkr_ai.egg-info/top_level.txt +0 -0
chunkr_ai-0.0.3/PKG-INFO
ADDED
@@ -0,0 +1,124 @@
|
|
1
|
+
Metadata-Version: 2.2
|
2
|
+
Name: chunkr-ai
|
3
|
+
Version: 0.0.3
|
4
|
+
Summary: Python client for Chunkr: open source document intelligence
|
5
|
+
Author-email: Ishaan Kapoor <ishaan@lumina.sh>
|
6
|
+
Project-URL: Homepage, https://chunkr.ai
|
7
|
+
Description-Content-Type: text/markdown
|
8
|
+
License-File: LICENSE
|
9
|
+
Requires-Dist: httpx>=0.28.1
|
10
|
+
Requires-Dist: pillow>=11.1.0
|
11
|
+
Requires-Dist: pydantic>=2.10.4
|
12
|
+
Requires-Dist: python-dotenv>=1.0.1
|
13
|
+
Requires-Dist: requests>=2.32.3
|
14
|
+
Provides-Extra: test
|
15
|
+
Requires-Dist: pytest>=8.3.4; extra == "test"
|
16
|
+
Requires-Dist: pytest-xdist>=3.6.1; extra == "test"
|
17
|
+
|
18
|
+
# Chunkr Python Client
|
19
|
+
|
20
|
+
This is the Python client for the Chunkr API. It provides a simple interface to interact with Chunkr's services.
|
21
|
+
|
22
|
+
## Installation
|
23
|
+
|
24
|
+
```bash
|
25
|
+
pip install chunkr-ai
|
26
|
+
```
|
27
|
+
|
28
|
+
## Usage
|
29
|
+
|
30
|
+
We provide two clients: `Chunkr` for synchronous operations and `ChunkrAsync` for asynchronous operations.
|
31
|
+
|
32
|
+
### Synchronous Usage
|
33
|
+
|
34
|
+
```python
|
35
|
+
from chunkr_ai import Chunkr
|
36
|
+
|
37
|
+
# Initialize client
|
38
|
+
chunkr = Chunkr()
|
39
|
+
|
40
|
+
# Upload a file and wait for processing
|
41
|
+
task = chunkr.upload("document.pdf")
|
42
|
+
|
43
|
+
# Print the response
|
44
|
+
print(task)
|
45
|
+
|
46
|
+
# Get output from task
|
47
|
+
output = task.output
|
48
|
+
|
49
|
+
# If you want to upload without waiting for processing
|
50
|
+
task = chunkr.start_upload("document.pdf")
|
51
|
+
# ... do other things ...
|
52
|
+
task.poll() # Check status when needed
|
53
|
+
```
|
54
|
+
|
55
|
+
### Asynchronous Usage
|
56
|
+
|
57
|
+
```python
|
58
|
+
from chunkr_ai import ChunkrAsync
|
59
|
+
|
60
|
+
async def process_document():
|
61
|
+
# Initialize client
|
62
|
+
chunkr = ChunkrAsync()
|
63
|
+
|
64
|
+
# Upload a file and wait for processing
|
65
|
+
task = await chunkr.upload("document.pdf")
|
66
|
+
|
67
|
+
# Print the response
|
68
|
+
print(task)
|
69
|
+
|
70
|
+
# Get output from task
|
71
|
+
output = task.output
|
72
|
+
|
73
|
+
# If you want to upload without waiting for processing
|
74
|
+
task = await chunkr.start_upload("document.pdf")
|
75
|
+
# ... do other things ...
|
76
|
+
await task.poll_async() # Check status when needed
|
77
|
+
```
|
78
|
+
|
79
|
+
### Additional Features
|
80
|
+
|
81
|
+
Both clients support various input types:
|
82
|
+
|
83
|
+
```python
|
84
|
+
# Upload from file path
|
85
|
+
chunkr.upload("document.pdf")
|
86
|
+
|
87
|
+
# Upload from opened file
|
88
|
+
with open("document.pdf", "rb") as f:
|
89
|
+
chunkr.upload(f)
|
90
|
+
|
91
|
+
# Upload from URL
|
92
|
+
chunkr.upload("https://example.com/document.pdf")
|
93
|
+
|
94
|
+
# Upload from base64 string
|
95
|
+
chunkr.upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
|
96
|
+
|
97
|
+
# Upload an image
|
98
|
+
from PIL import Image
|
99
|
+
img = Image.open("photo.jpg")
|
100
|
+
chunkr.upload(img)
|
101
|
+
```
|
102
|
+
|
103
|
+
### Configuration
|
104
|
+
|
105
|
+
You can provide your API key and URL in several ways:
|
106
|
+
1. Environment variables: `CHUNKR_API_KEY` and `CHUNKR_URL`
|
107
|
+
2. `.env` file
|
108
|
+
3. Direct initialization:
|
109
|
+
```python
|
110
|
+
chunkr = Chunkr(
|
111
|
+
api_key="your-api-key",
|
112
|
+
url="https://api.chunkr.ai"
|
113
|
+
)
|
114
|
+
```
|
115
|
+
|
116
|
+
## Run tests
|
117
|
+
|
118
|
+
```bash
|
119
|
+
# Install dependencies
|
120
|
+
uv pip install -e ".[test]"
|
121
|
+
|
122
|
+
# Run tests
|
123
|
+
uv run pytest
|
124
|
+
```
|
@@ -0,0 +1,107 @@
|
|
1
|
+
# Chunkr Python Client
|
2
|
+
|
3
|
+
This is the Python client for the Chunkr API. It provides a simple interface to interact with Chunkr's services.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
```bash
|
8
|
+
pip install chunkr-ai
|
9
|
+
```
|
10
|
+
|
11
|
+
## Usage
|
12
|
+
|
13
|
+
We provide two clients: `Chunkr` for synchronous operations and `ChunkrAsync` for asynchronous operations.
|
14
|
+
|
15
|
+
### Synchronous Usage
|
16
|
+
|
17
|
+
```python
|
18
|
+
from chunkr_ai import Chunkr
|
19
|
+
|
20
|
+
# Initialize client
|
21
|
+
chunkr = Chunkr()
|
22
|
+
|
23
|
+
# Upload a file and wait for processing
|
24
|
+
task = chunkr.upload("document.pdf")
|
25
|
+
|
26
|
+
# Print the response
|
27
|
+
print(task)
|
28
|
+
|
29
|
+
# Get output from task
|
30
|
+
output = task.output
|
31
|
+
|
32
|
+
# If you want to upload without waiting for processing
|
33
|
+
task = chunkr.start_upload("document.pdf")
|
34
|
+
# ... do other things ...
|
35
|
+
task.poll() # Check status when needed
|
36
|
+
```
|
37
|
+
|
38
|
+
### Asynchronous Usage
|
39
|
+
|
40
|
+
```python
|
41
|
+
from chunkr_ai import ChunkrAsync
|
42
|
+
|
43
|
+
async def process_document():
|
44
|
+
# Initialize client
|
45
|
+
chunkr = ChunkrAsync()
|
46
|
+
|
47
|
+
# Upload a file and wait for processing
|
48
|
+
task = await chunkr.upload("document.pdf")
|
49
|
+
|
50
|
+
# Print the response
|
51
|
+
print(task)
|
52
|
+
|
53
|
+
# Get output from task
|
54
|
+
output = task.output
|
55
|
+
|
56
|
+
# If you want to upload without waiting for processing
|
57
|
+
task = await chunkr.start_upload("document.pdf")
|
58
|
+
# ... do other things ...
|
59
|
+
await task.poll_async() # Check status when needed
|
60
|
+
```
|
61
|
+
|
62
|
+
### Additional Features
|
63
|
+
|
64
|
+
Both clients support various input types:
|
65
|
+
|
66
|
+
```python
|
67
|
+
# Upload from file path
|
68
|
+
chunkr.upload("document.pdf")
|
69
|
+
|
70
|
+
# Upload from opened file
|
71
|
+
with open("document.pdf", "rb") as f:
|
72
|
+
chunkr.upload(f)
|
73
|
+
|
74
|
+
# Upload from URL
|
75
|
+
chunkr.upload("https://example.com/document.pdf")
|
76
|
+
|
77
|
+
# Upload from base64 string
|
78
|
+
chunkr.upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
|
79
|
+
|
80
|
+
# Upload an image
|
81
|
+
from PIL import Image
|
82
|
+
img = Image.open("photo.jpg")
|
83
|
+
chunkr.upload(img)
|
84
|
+
```
|
85
|
+
|
86
|
+
### Configuration
|
87
|
+
|
88
|
+
You can provide your API key and URL in several ways:
|
89
|
+
1. Environment variables: `CHUNKR_API_KEY` and `CHUNKR_URL`
|
90
|
+
2. `.env` file
|
91
|
+
3. Direct initialization:
|
92
|
+
```python
|
93
|
+
chunkr = Chunkr(
|
94
|
+
api_key="your-api-key",
|
95
|
+
url="https://api.chunkr.ai"
|
96
|
+
)
|
97
|
+
```
|
98
|
+
|
99
|
+
## Run tests
|
100
|
+
|
101
|
+
```bash
|
102
|
+
# Install dependencies
|
103
|
+
uv pip install -e ".[test]"
|
104
|
+
|
105
|
+
# Run tests
|
106
|
+
uv run pytest
|
107
|
+
```
|
@@ -0,0 +1,25 @@
|
|
1
|
+
[build-system]
|
2
|
+
requires = ["setuptools>=42", "wheel"]
|
3
|
+
build-backend = "setuptools.build_meta"
|
4
|
+
|
5
|
+
[project]
|
6
|
+
name = "chunkr-ai"
|
7
|
+
version = "0.0.3"
|
8
|
+
authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
|
9
|
+
description = "Python client for Chunkr: open source document intelligence"
|
10
|
+
readme = "README.md"
|
11
|
+
license = {"file" = "LICENSE"}
|
12
|
+
urls = {Homepage = "https://chunkr.ai"}
|
13
|
+
dependencies = [
|
14
|
+
"httpx>=0.28.1",
|
15
|
+
"pillow>=11.1.0",
|
16
|
+
"pydantic>=2.10.4",
|
17
|
+
"python-dotenv>=1.0.1",
|
18
|
+
"requests>=2.32.3",
|
19
|
+
]
|
20
|
+
|
21
|
+
[project.optional-dependencies]
|
22
|
+
test = [
|
23
|
+
"pytest>=8.3.4",
|
24
|
+
"pytest-xdist>=3.6.1",
|
25
|
+
]
|
@@ -0,0 +1,12 @@
|
|
1
|
+
class HeadersMixin:
    """Mixin that supplies the authorization header for API requests."""

    def get_api_key(self) -> str:
        """Return the API key stored on the instance.

        Raises:
            ValueError: If `_api_key` is missing or empty.
        """
        api_key = getattr(self, '_api_key', None)
        if api_key:
            return api_key
        raise ValueError("API key not set")

    def _headers(self) -> dict:
        """Build the headers dict carrying the API key as `Authorization`."""
        headers = {"Authorization": self.get_api_key()}
        return headers
|
@@ -0,0 +1,173 @@
|
|
1
|
+
from .config import Configuration
|
2
|
+
from .task import TaskResponse
|
3
|
+
from .auth import HeadersMixin
|
4
|
+
from abc import abstractmethod
|
5
|
+
from dotenv import load_dotenv
|
6
|
+
import io
|
7
|
+
import json
|
8
|
+
import os
|
9
|
+
from pathlib import Path
|
10
|
+
from PIL import Image
|
11
|
+
import requests
|
12
|
+
from typing import BinaryIO, Tuple, Union
|
13
|
+
|
14
|
+
class ChunkrBase(HeadersMixin):
    """Base class with shared functionality for Chunkr API clients.

    Resolves the API URL and key, and converts the supported input types
    into the `files` / `data` dictionaries the concrete clients post.
    """

    # MIME types accepted in base64 data URIs, mapped to the file extension
    # used for the uploaded filename.
    _MIME_TO_EXT = {
        'application/pdf': 'pdf',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
        'application/msword': 'doc',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
        'application/vnd.ms-powerpoint': 'ppt',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
        'application/vnd.ms-excel': 'xls',
        'image/jpeg': 'jpg',
        'image/png': 'png',
        'image/jpg': 'jpg',
    }

    def __init__(self, url: str = None, api_key: str = None):
        """Resolve the API URL and key.

        Explicit arguments take precedence over the `CHUNKR_URL` and
        `CHUNKR_API_KEY` environment variables (a `.env` file is loaded
        first). The URL falls back to `https://api.chunkr.ai`.

        Args:
            url: Base URL of the Chunkr API. Optional.
            api_key: API key sent in the `Authorization` header. Optional.

        Raises:
            ValueError: If no API key could be resolved.
        """
        load_dotenv()
        self.url = (
            url or
            os.getenv('CHUNKR_URL') or
            'https://api.chunkr.ai'
        )
        self._api_key = (
            api_key or
            os.getenv('CHUNKR_API_KEY')
        )
        if not self._api_key:
            raise ValueError("API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai")

        # Endpoint paths are appended with a leading slash.
        self.url = self.url.rstrip("/")

    def _prepare_file(
        self,
        file: Union[str, Path, BinaryIO, Image.Image]
    ) -> Tuple[str, BinaryIO]:
        """Convert various file types into a tuple of (filename, file-like object).

        Args:
            file: Input file, can be:
                - String or Path to a file
                - URL string starting with http:// or https://
                - Base64 data URI (`data:<mime>;base64,<payload>`)
                - Opened binary file (mode='rb')
                - PIL/Pillow Image object

        Returns:
            Tuple[str, BinaryIO]: (filename, file-like object) ready for upload.
            For a file path the returned object is a newly opened handle that
            the caller is responsible for closing.

        Raises:
            FileNotFoundError: If the file path doesn't exist
            TypeError: If the file type is not supported
            ValueError: If the base64 payload cannot be decoded
            ValueError: If the MIME type is unsupported
            requests.RequestException: If the URL cannot be downloaded
        """
        # Handle URLs
        if isinstance(file, str) and file.startswith(('http://', 'https://')):
            from urllib.parse import urlsplit

            response = requests.get(file)
            response.raise_for_status()
            # Use only the URL path for the name so a query string or
            # fragment never leaks into the uploaded filename.
            filename = Path(urlsplit(file).path).name or 'downloaded_file'
            return filename, io.BytesIO(response.content)

        # Handle base64 data URIs
        if isinstance(file, str) and ';base64,' in file:
            import base64

            # Split header and data
            header, base64_data = file.split(',', 1)
            mime_type = header.split(':')[-1].split(';')[0].lower()

            # Checked outside the decode try-block so this error is not
            # re-labelled as an invalid base64 string.
            if mime_type not in self._MIME_TO_EXT:
                raise ValueError(f"Unsupported MIME type: {mime_type}")

            try:
                file_bytes = base64.b64decode(base64_data)
            except ValueError as e:  # binascii.Error is a ValueError subclass
                raise ValueError(f"Invalid base64 string: {str(e)}") from e

            return f"file.{self._MIME_TO_EXT[mime_type]}", io.BytesIO(file_bytes)

        # Handle file paths
        if isinstance(file, (str, Path)):
            path = Path(file).resolve()
            if not path.exists():
                raise FileNotFoundError(f"File not found: {file}")
            return path.name, open(path, 'rb')

        # Handle PIL Images
        if isinstance(file, Image.Image):
            img_byte_arr = io.BytesIO()
            # Images created in memory have no format; default to PNG.
            image_format = file.format or 'PNG'
            file.save(img_byte_arr, format=image_format)
            img_byte_arr.seek(0)
            return f"image.{image_format.lower()}", img_byte_arr

        # Handle file-like objects
        if hasattr(file, 'read') and hasattr(file, 'seek'):
            # Try to get the filename from the file object if possible; some
            # file objects expose a non-path name (e.g. a file descriptor).
            name = getattr(file, 'name', None)
            if not isinstance(name, (str, os.PathLike)):
                name = 'document'
            return Path(name).name or 'document', file

        raise TypeError(f"Unsupported file type: {type(file)}")

    def _prepare_upload_data(
        self,
        file: Union[str, Path, BinaryIO, Image.Image],
        config: Configuration = None
    ) -> Tuple[dict, dict]:
        """Prepare files and data dictionaries for upload.

        Nested configuration values are sent as JSON-encoded multipart parts
        next to the file; scalar values are sent as plain form fields.

        Args:
            file: The file to upload
            config: Optional configuration settings

        Returns:
            Tuple[dict, dict]: (files dict, data dict) ready for upload
        """
        filename, file_obj = self._prepare_file(file)
        files = {"file": (filename, file_obj)}
        data = {}

        if config:
            config_dict = config.model_dump(mode="json", exclude_none=True)
            for key, value in config_dict.items():
                if isinstance(value, dict):
                    files[key] = (None, json.dumps(value), 'application/json')
                else:
                    data[key] = value

        return files, data

    @abstractmethod
    def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
        """Upload a file and wait for processing to complete.

        Must be implemented by subclasses.
        """
        pass

    @abstractmethod
    def start_upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
        """Upload a file for processing and immediately return the task response.

        Must be implemented by subclasses.
        """
        pass

    @abstractmethod
    def get_task(self, task_id: str) -> TaskResponse:
        """Get a task response by its ID.

        Must be implemented by subclasses.
        """
        pass
|
@@ -0,0 +1,108 @@
|
|
1
|
+
from .base import ChunkrBase
|
2
|
+
from .config import Configuration
|
3
|
+
from .task import TaskResponse
|
4
|
+
from pathlib import Path
|
5
|
+
from PIL import Image
|
6
|
+
import requests
|
7
|
+
from typing import Union, BinaryIO
|
8
|
+
|
9
|
+
class Chunkr(ChunkrBase):
    """Chunkr API client (synchronous, backed by a `requests.Session`).

    Can be used as a context manager to close the session on exit.
    """

    def __init__(self, url: str = None, api_key: str = None):
        """Resolve URL / API key via the base class and open an HTTP session."""
        super().__init__(url, api_key)
        self._session = requests.Session()

    def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
        """Upload a file and wait for processing to complete.

        Args:
            file: The file to upload.
            config: Configuration options for processing. Optional.

        Examples:
        ```
        # Upload from file path
        chunkr.upload("document.pdf")

        # Upload from URL
        chunkr.upload("https://example.com/document.pdf")

        # Upload from base64 string (must include MIME type header)
        chunkr.upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")

        # Upload from opened file
        with open("document.pdf", "rb") as f:
            chunkr.upload(f)

        # Upload an image
        from PIL import Image
        img = Image.open("photo.jpg")
        chunkr.upload(img)
        ```
        Returns:
            TaskResponse: The completed task response
        """
        task = self.start_upload(file, config)
        return task.poll()

    def start_upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
        """Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`

        Args:
            file: The file to upload.
            config: Configuration options for processing. Optional.

        Examples:
        ```
        # Upload from file path
        task = chunkr.start_upload("document.pdf")

        # Upload from opened file
        with open("document.pdf", "rb") as f:
            task = chunkr.start_upload(f)

        # Upload from URL
        task = chunkr.start_upload("https://example.com/document.pdf")

        # Upload from base64 string (must include MIME type header)
        task = chunkr.start_upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")

        # Upload an image
        from PIL import Image
        img = Image.open("photo.jpg")
        task = chunkr.start_upload(img)

        # Wait for the task to complete - this can be done when needed
        task.poll()
        ```

        Returns:
            TaskResponse: The initial task response

        Raises:
            requests.HTTPError: If the API answers with an error status.
        """
        files, data = self._prepare_upload_data(file, config)
        upload_obj = files["file"][1]
        try:
            r = self._session.post(
                f"{self.url}/api/v1/task",
                files=files,
                data=data,
                headers=self._headers()
            )
        finally:
            # Close handles created for this upload (opened path, downloaded
            # or in-memory buffer). A file object passed in by the caller is
            # left open because the caller owns it.
            if upload_obj is not file:
                upload_obj.close()
        r.raise_for_status()
        return TaskResponse(**r.json()).with_client(self)

    def get_task(self, task_id: str) -> TaskResponse:
        """Get a task response by its ID.

        Args:
            task_id: The ID of the task to get

        Returns:
            TaskResponse: The task response

        Raises:
            requests.HTTPError: If the API answers with an error status.
        """
        r = self._session.get(
            f"{self.url}/api/v1/task/{task_id}",
            headers=self._headers()
        )
        r.raise_for_status()
        return TaskResponse(**r.json()).with_client(self)

    def close(self) -> None:
        """Close the underlying HTTP session."""
        self._session.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
|
108
|
+
|
@@ -0,0 +1,105 @@
|
|
1
|
+
from .base import ChunkrBase
|
2
|
+
from .task import TaskResponse
|
3
|
+
from .config import Configuration
|
4
|
+
import httpx
|
5
|
+
from pathlib import Path
|
6
|
+
from PIL import Image
|
7
|
+
from typing import Union, BinaryIO
|
8
|
+
|
9
|
+
class ChunkrAsync(ChunkrBase):
    """Asynchronous Chunkr API client (backed by an `httpx.AsyncClient`).

    Can be used as an async context manager to close the HTTP client on exit.
    """

    def __init__(self, url: str = None, api_key: str = None):
        """Resolve URL / API key via the base class and create the HTTP client."""
        super().__init__(url, api_key)
        self._client = httpx.AsyncClient()

    async def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
        """Upload a file and wait for processing to complete.

        Args:
            file: The file to upload.
            config: Configuration options for processing. Optional.

        Examples:
        ```python
        # Upload from file path
        await chunkr.upload("document.pdf")

        # Upload from opened file
        with open("document.pdf", "rb") as f:
            await chunkr.upload(f)

        # Upload from URL
        await chunkr.upload("https://example.com/document.pdf")

        # Upload from base64 string (must include MIME type header)
        await chunkr.upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")

        # Upload an image
        from PIL import Image
        img = Image.open("photo.jpg")
        await chunkr.upload(img)
        ```
        Returns:
            TaskResponse: The completed task response
        """
        task = await self.start_upload(file, config)
        return await task.poll_async()

    async def start_upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
        """Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll_async()`.

        Args:
            file: The file to upload.
            config: Configuration options for processing. Optional.

        Examples:
        ```
        # Upload from file path
        task = await chunkr.start_upload("document.pdf")

        # Upload from opened file
        with open("document.pdf", "rb") as f:
            task = await chunkr.start_upload(f)

        # Upload from URL
        task = await chunkr.start_upload("https://example.com/document.pdf")

        # Upload from base64 string (must include MIME type header)
        task = await chunkr.start_upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")

        # Upload an image
        from PIL import Image
        img = Image.open("photo.jpg")
        task = await chunkr.start_upload(img)

        # Wait for the task to complete - this can be done when needed
        await task.poll_async()
        ```

        Returns:
            TaskResponse: The initial task response

        Raises:
            httpx.HTTPStatusError: If the API answers with an error status.
        """
        files, data = self._prepare_upload_data(file, config)
        upload_obj = files["file"][1]
        try:
            # Scalar configuration values go in `data`, matching the
            # synchronous client. httpx ignores `json=` when `files=` is
            # given, so the configuration must not be sent that way.
            r = await self._client.post(
                f"{self.url}/api/v1/task",
                files=files,
                data=data,
                headers=self._headers()
            )
        finally:
            # Close handles created for this upload (opened path, downloaded
            # or in-memory buffer). A file object passed in by the caller is
            # left open because the caller owns it.
            if upload_obj is not file:
                upload_obj.close()
        r.raise_for_status()
        return TaskResponse(**r.json()).with_client(self)

    async def get_task(self, task_id: str) -> TaskResponse:
        """Get a task response by its ID.

        Args:
            task_id: The ID of the task to get

        Returns:
            TaskResponse: The task response

        Raises:
            httpx.HTTPStatusError: If the API answers with an error status.
        """
        r = await self._client.get(
            f"{self.url}/api/v1/task/{task_id}",
            headers=self._headers()
        )
        r.raise_for_status()
        return TaskResponse(**r.json()).with_client(self)

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        # Release the connection pool held by the HTTP client.
        await self._client.aclose()
|
@@ -0,0 +1,130 @@
|
|
1
|
+
from pydantic import BaseModel, Field
|
2
|
+
from enum import Enum
|
3
|
+
from typing import Optional, List, Dict
|
4
|
+
|
5
|
+
class GenerationStrategy(str, Enum):
    """Strategy choices for generating HTML / markdown output."""

    LLM = "LLM"
    AUTO = "Auto"
|
8
|
+
|
9
|
+
class CroppingStrategy(str, Enum):
    """Strategy choices for the `crop_image` generation option."""

    ALL = "All"
    AUTO = "Auto"
|
12
|
+
|
13
|
+
class LlmConfig(BaseModel):
    """LLM settings attached to a generation config."""

    # `model` and `prompt` are required; temperature defaults to 0.0.
    model: str
    prompt: str
    temperature: float = 0.0
|
17
|
+
|
18
|
+
class GenerationConfig(BaseModel):
    """Per-segment-type generation options; every field is optional."""

    html: Optional[GenerationStrategy] = None
    llm: Optional[LlmConfig] = None
    markdown: Optional[GenerationStrategy] = None
    crop_image: Optional[CroppingStrategy] = None
|
23
|
+
|
24
|
+
class SegmentProcessing(BaseModel):
    """Optional generation config for each segment type."""

    # One optional GenerationConfig per segment type; unset entries stay None.
    title: Optional[GenerationConfig] = None
    section_header: Optional[GenerationConfig] = None
    text: Optional[GenerationConfig] = None
    list_item: Optional[GenerationConfig] = None
    table: Optional[GenerationConfig] = None
    picture: Optional[GenerationConfig] = None
    caption: Optional[GenerationConfig] = None
    formula: Optional[GenerationConfig] = None
    footnote: Optional[GenerationConfig] = None
    page_header: Optional[GenerationConfig] = None
    page_footer: Optional[GenerationConfig] = None
    page: Optional[GenerationConfig] = None
|
37
|
+
|
38
|
+
class ChunkProcessing(BaseModel):
    """Chunking options."""

    # Optional target length for chunks; None when not specified.
    target_length: Optional[int] = None
|
40
|
+
|
41
|
+
class Property(BaseModel):
    """A single property of a `JsonSchema`."""

    name: str
    # Under pydantic v2 an `Optional[...]` annotation without a default is
    # still a required field; the explicit `None` defaults make the optional
    # fields genuinely omittable.
    title: Optional[str] = None
    prop_type: str
    description: Optional[str] = None
    default: Optional[str] = None
|
47
|
+
|
48
|
+
class JsonSchema(BaseModel):
    """Schema with a title and a list of `Property` entries."""

    title: str
    properties: List[Property]
    # Explicit default: under pydantic v2 an `Optional[...]` annotation
    # without a default is still a required field.
    schema_type: Optional[str] = None
|
52
|
+
|
53
|
+
class OcrStrategy(str, Enum):
    """Choices for the `ocr_strategy` configuration option."""

    ALL = "All"
    AUTO = "Auto"
|
56
|
+
|
57
|
+
class SegmentationStrategy(str, Enum):
    """Choices for the `segmentation_strategy` configuration option."""

    LAYOUT_ANALYSIS = "LayoutAnalysis"
    PAGE = "Page"
|
60
|
+
|
61
|
+
class BoundingBox(BaseModel):
    """Rectangle given by its left/top position and its width/height."""

    # All four values are required floats.
    left: float
    top: float
    width: float
    height: float
|
66
|
+
|
67
|
+
class OCRResult(BaseModel):
    """One OCR result: a bounding box, its text and an optional confidence."""

    bbox: BoundingBox
    text: str
    # Explicit default: under pydantic v2 an `Optional[...]` annotation
    # without a default is still a required field.
    confidence: Optional[float] = None
|
71
|
+
|
72
|
+
class SegmentType(str, Enum):
    """Segment type names used in `Segment.segment_type`."""

    CAPTION = "Caption"
    FOOTNOTE = "Footnote"
    FORMULA = "Formula"
    LIST_ITEM = "ListItem"
    PAGE = "Page"
    PAGE_FOOTER = "PageFooter"
    PAGE_HEADER = "PageHeader"
    PICTURE = "Picture"
    SECTION_HEADER = "SectionHeader"
    TABLE = "Table"
    TEXT = "Text"
    TITLE = "Title"
|
85
|
+
|
86
|
+
class Segment(BaseModel):
    """A segment of a page with its content and optional renderings."""

    bbox: BoundingBox
    content: str
    page_height: float
    # Explicit defaults: under pydantic v2 an `Optional[...]` annotation
    # without a default is still a required field, so a payload omitting any
    # of these keys would otherwise fail validation.
    html: Optional[str] = None
    image: Optional[str] = None
    markdown: Optional[str] = None
    ocr: List[OCRResult]
    page_number: int
    page_width: float
    segment_id: str
    segment_type: SegmentType
|
98
|
+
|
99
|
+
class Chunk(BaseModel):
    """A chunk: an id, a length and the list of segments it contains."""

    # All three fields are required.
    chunk_id: str
    chunk_length: int
    segments: List[Segment]
|
103
|
+
|
104
|
+
class ExtractedJson(BaseModel):
    """Wrapper around the extracted JSON payload."""

    # Required dictionary; key and value types are not constrained.
    data: Dict
|
106
|
+
|
107
|
+
class OutputResponse(BaseModel):
    """Task output: the chunks plus optional extracted JSON."""

    chunks: List[Chunk] = []
    # Explicit default: under pydantic v2 an `Optional[...]` annotation
    # without a default is still a required field.
    extracted_json: Optional[ExtractedJson] = None
|
110
|
+
|
111
|
+
class Model(str, Enum):
    """Choices for the `model` configuration option."""

    FAST = "Fast"
    HIGH_QUALITY = "HighQuality"
|
114
|
+
|
115
|
+
class Configuration(BaseModel):
    """Processing configuration; every option is optional and defaults to None."""

    chunk_processing: Optional[ChunkProcessing] = None
    expires_in: Optional[int] = None
    high_resolution: Optional[bool] = None
    json_schema: Optional[JsonSchema] = None
    model: Optional[Model] = None
    ocr_strategy: Optional[OcrStrategy] = None
    segment_processing: Optional[SegmentProcessing] = None
    segmentation_strategy: Optional[SegmentationStrategy] = None
    target_chunk_length: Optional[int] = None
|
125
|
+
|
126
|
+
class Status(str, Enum):
    """Task status values."""

    STARTING = "Starting"
    PROCESSING = "Processing"
    SUCCEEDED = "Succeeded"
    FAILED = "Failed"
|
@@ -0,0 +1,19 @@
|
|
1
|
+
from typing import runtime_checkable, Protocol
|
2
|
+
from requests import Session
|
3
|
+
from httpx import AsyncClient
|
4
|
+
|
5
|
+
@runtime_checkable
class ChunkrClientProtocol(Protocol):
    """Protocol defining the interface for Chunkr clients"""

    # Base URL and API key of the client.
    url: str
    _api_key: str
    # HTTP backends: a requests session and an httpx async client.
    _session: Session
    _client: AsyncClient

    def get_api_key(self) -> str:
        """Get the API key"""
        ...

    def _headers(self) -> dict:
        """Return headers required for API requests"""
        ...
|
@@ -0,0 +1,124 @@
|
|
1
|
+
from .protocol import ChunkrClientProtocol
|
2
|
+
from .config import Configuration, Status, OutputResponse
|
3
|
+
import asyncio
|
4
|
+
from datetime import datetime
|
5
|
+
from pydantic import BaseModel, PrivateAttr
|
6
|
+
import time
|
7
|
+
from typing import Optional, Union
|
8
|
+
|
9
|
+
class TaskResponse(BaseModel):
|
10
|
+
configuration: Configuration
|
11
|
+
created_at: datetime
|
12
|
+
expires_at: Optional[datetime]
|
13
|
+
file_name: Optional[str]
|
14
|
+
finished_at: Optional[datetime]
|
15
|
+
input_file_url: Optional[str]
|
16
|
+
message: str
|
17
|
+
output: Optional[OutputResponse]
|
18
|
+
page_count: Optional[int]
|
19
|
+
pdf_url: Optional[str]
|
20
|
+
status: Status
|
21
|
+
task_id: str
|
22
|
+
task_url: Optional[str]
|
23
|
+
_client: Optional[Union[ChunkrClientProtocol]] = PrivateAttr(default=None)
|
24
|
+
|
25
|
+
def with_client(self, client: Union[ChunkrClientProtocol]) -> 'TaskResponse':
|
26
|
+
self._client = client
|
27
|
+
return self
|
28
|
+
|
29
|
+
def _poll_request_sync(self) -> dict:
|
30
|
+
"""Helper method to make polling request with retry logic (synchronous)"""
|
31
|
+
if not self.task_url:
|
32
|
+
raise ValueError("Task URL not found in response")
|
33
|
+
|
34
|
+
while True:
|
35
|
+
try:
|
36
|
+
r = self._client._session.get(self.task_url, headers=self._client._headers())
|
37
|
+
r.raise_for_status()
|
38
|
+
return r.json()
|
39
|
+
except (ConnectionError, TimeoutError) as _:
|
40
|
+
print("Connection error while polling the task, retrying...")
|
41
|
+
time.sleep(0.5)
|
42
|
+
except Exception as e:
|
43
|
+
raise
|
44
|
+
|
45
|
+
async def _poll_request_async(self) -> dict:
|
46
|
+
"""Helper method to make polling request with retry logic (asynchronous)"""
|
47
|
+
if not self.task_url:
|
48
|
+
raise ValueError("Task URL not found in response")
|
49
|
+
|
50
|
+
while True:
|
51
|
+
try:
|
52
|
+
r = await self._client._client.get(self.task_url, headers=self._client._headers())
|
53
|
+
await r.raise_for_status()
|
54
|
+
return await r.json()
|
55
|
+
except (ConnectionError, TimeoutError) as _:
|
56
|
+
print("Connection error while polling the task, retrying...")
|
57
|
+
await asyncio.sleep(0.5)
|
58
|
+
except Exception as e:
|
59
|
+
raise
|
60
|
+
|
61
|
+
def _check_status(self) -> Optional['TaskResponse']:
|
62
|
+
"""Helper method to check task status and handle completion/failure"""
|
63
|
+
if self.status == "Failed":
|
64
|
+
raise ValueError(self.message)
|
65
|
+
if self.status not in ("Starting", "Processing"):
|
66
|
+
return self
|
67
|
+
return None
|
68
|
+
|
69
|
+
def poll(self) -> 'TaskResponse':
|
70
|
+
"""Poll the task for completion."""
|
71
|
+
while True:
|
72
|
+
response = self._poll_request_sync()
|
73
|
+
self.__dict__.update(response)
|
74
|
+
|
75
|
+
if result := self._check_status():
|
76
|
+
return result
|
77
|
+
|
78
|
+
time.sleep(0.5)
|
79
|
+
|
80
|
+
async def poll_async(self) -> 'TaskResponse':
|
81
|
+
"""Poll the task for completion asynchronously."""
|
82
|
+
while True:
|
83
|
+
response = await self._poll_request_async()
|
84
|
+
self.__dict__.update(response)
|
85
|
+
|
86
|
+
if result := self._check_status():
|
87
|
+
return result
|
88
|
+
|
89
|
+
await asyncio.sleep(0.5)
|
90
|
+
|
91
|
+
def _get_content(self, content_type: str) -> str:
    """Collect one attribute from every segment of the output and join the values.

    Args:
        content_type: name of the segment attribute to read (the public
            wrappers pass "html", "markdown" or "content").

    Returns:
        The truthy attribute values in chunk/segment order, separated by
        newlines; an empty string when ``self.output`` is falsy.
    """
    if not self.output:
        return ""
    values = (
        getattr(segment, content_type)
        for chunk in self.output.chunks
        for segment in chunk.segments
    )
    # Empty strings and None are dropped rather than emitted as blank lines.
    return "\n".join(value for value in values if value)
|
102
|
+
|
103
|
+
def html(self) -> str:
    """Get full HTML for the task.

    Delegates to ``_get_content("html")``, which returns an empty string
    when the task has no output.
    """
    return self._get_content("html")
|
106
|
+
|
107
|
+
def markdown(self) -> str:
    """Get full markdown for the task.

    Delegates to ``_get_content("markdown")``, which returns an empty string
    when the task has no output.
    """
    return self._get_content("markdown")
|
110
|
+
|
111
|
+
def content(self) -> str:
    """Get full text for the task.

    Delegates to ``_get_content("content")``, which returns an empty string
    when the task has no output.
    """
    return self._get_content("content")
|
114
|
+
|
115
|
+
class TaskPayload(BaseModel):
    # BaseModel subclass describing a task payload. No field below declares a
    # default value.
    # NOTE(review): field meanings are inferred from their names only —
    # confirm against the API schema before relying on them.
    current_configuration: Configuration
    file_name: str
    image_folder_location: str
    input_location: str
    output_location: str
    pdf_location: str
    # Typed Optional, so None is an accepted value; no default is declared here.
    previous_configuration: Optional[Configuration]
    task_id: str
    user_id: str
|
@@ -0,0 +1,49 @@
|
|
1
|
+
from .api.config import (
|
2
|
+
BoundingBox,
|
3
|
+
Chunk,
|
4
|
+
ChunkProcessing,
|
5
|
+
Configuration,
|
6
|
+
CroppingStrategy,
|
7
|
+
ExtractedJson,
|
8
|
+
GenerationStrategy,
|
9
|
+
GenerationConfig,
|
10
|
+
JsonSchema,
|
11
|
+
LlmConfig,
|
12
|
+
Model,
|
13
|
+
OCRResult,
|
14
|
+
OcrStrategy,
|
15
|
+
OutputResponse,
|
16
|
+
Property,
|
17
|
+
Segment,
|
18
|
+
SegmentProcessing,
|
19
|
+
SegmentType,
|
20
|
+
SegmentationStrategy,
|
21
|
+
Status
|
22
|
+
)
|
23
|
+
|
24
|
+
from .api.task import TaskResponse, TaskPayload
|
25
|
+
|
26
|
+
__all__ = [
|
27
|
+
'BoundingBox',
|
28
|
+
'Chunk',
|
29
|
+
'ChunkProcessing',
|
30
|
+
'Configuration',
|
31
|
+
'CroppingStrategy',
|
32
|
+
'ExtractedJson',
|
33
|
+
'GenerationConfig',
|
34
|
+
'GenerationStrategy',
|
35
|
+
'JsonSchema',
|
36
|
+
'LlmConfig',
|
37
|
+
'Model',
|
38
|
+
'OCRResult',
|
39
|
+
'OcrStrategy',
|
40
|
+
'OutputResponse',
|
41
|
+
'Property',
|
42
|
+
'Segment',
|
43
|
+
'SegmentProcessing',
|
44
|
+
'SegmentType',
|
45
|
+
'SegmentationStrategy',
|
46
|
+
'Status',
|
47
|
+
'TaskPayload',
|
48
|
+
'TaskResponse'
|
49
|
+
]
|
@@ -0,0 +1,124 @@
|
|
1
|
+
Metadata-Version: 2.2
|
2
|
+
Name: chunkr-ai
|
3
|
+
Version: 0.0.3
|
4
|
+
Summary: Python client for Chunkr: open source document intelligence
|
5
|
+
Author-email: Ishaan Kapoor <ishaan@lumina.sh>
|
6
|
+
Project-URL: Homepage, https://chunkr.ai
|
7
|
+
Description-Content-Type: text/markdown
|
8
|
+
License-File: LICENSE
|
9
|
+
Requires-Dist: httpx>=0.28.1
|
10
|
+
Requires-Dist: pillow>=11.1.0
|
11
|
+
Requires-Dist: pydantic>=2.10.4
|
12
|
+
Requires-Dist: python-dotenv>=1.0.1
|
13
|
+
Requires-Dist: requests>=2.32.3
|
14
|
+
Provides-Extra: test
|
15
|
+
Requires-Dist: pytest>=8.3.4; extra == "test"
|
16
|
+
Requires-Dist: pytest-xdist>=3.6.1; extra == "test"
|
17
|
+
|
18
|
+
# Chunkr Python Client
|
19
|
+
|
20
|
+
This is the Python client for the Chunkr API. It provides a simple interface to interact with Chunkr's services.
|
21
|
+
|
22
|
+
## Installation
|
23
|
+
|
24
|
+
```bash
|
25
|
+
pip install chunkr-ai
|
26
|
+
```
|
27
|
+
|
28
|
+
## Usage
|
29
|
+
|
30
|
+
We provide two clients: `Chunkr` for synchronous operations and `ChunkrAsync` for asynchronous operations.
|
31
|
+
|
32
|
+
### Synchronous Usage
|
33
|
+
|
34
|
+
```python
|
35
|
+
from chunkr_ai import Chunkr
|
36
|
+
|
37
|
+
# Initialize client
|
38
|
+
chunkr = Chunkr()
|
39
|
+
|
40
|
+
# Upload a file and wait for processing
|
41
|
+
task = chunkr.upload("document.pdf")
|
42
|
+
|
43
|
+
# Print the response
|
44
|
+
print(task)
|
45
|
+
|
46
|
+
# Get output from task
|
47
|
+
output = task.output
|
48
|
+
|
49
|
+
# If you want to upload without waiting for processing
|
50
|
+
task = chunkr.start_upload("document.pdf")
|
51
|
+
# ... do other things ...
|
52
|
+
task.poll()  # Blocks until the task is no longer starting/processing
|
53
|
+
```
|
54
|
+
|
55
|
+
### Asynchronous Usage
|
56
|
+
|
57
|
+
```python
|
58
|
+
from chunkr_ai import ChunkrAsync
|
59
|
+
|
60
|
+
async def process_document():
|
61
|
+
# Initialize client
|
62
|
+
chunkr = ChunkrAsync()
|
63
|
+
|
64
|
+
# Upload a file and wait for processing
|
65
|
+
task = await chunkr.upload("document.pdf")
|
66
|
+
|
67
|
+
# Print the response
|
68
|
+
print(task)
|
69
|
+
|
70
|
+
# Get output from task
|
71
|
+
output = task.output
|
72
|
+
|
73
|
+
# If you want to upload without waiting for processing
|
74
|
+
task = await chunkr.start_upload("document.pdf")
|
75
|
+
# ... do other things ...
|
76
|
+
await task.poll_async()  # Waits until the task is no longer starting/processing
|
77
|
+
```
|
78
|
+
|
79
|
+
### Additional Features
|
80
|
+
|
81
|
+
Both clients support various input types:
|
82
|
+
|
83
|
+
```python
|
84
|
+
# Upload from file path
|
85
|
+
chunkr.upload("document.pdf")
|
86
|
+
|
87
|
+
# Upload from opened file
|
88
|
+
with open("document.pdf", "rb") as f:
|
89
|
+
chunkr.upload(f)
|
90
|
+
|
91
|
+
# Upload from URL
|
92
|
+
chunkr.upload("https://example.com/document.pdf")
|
93
|
+
|
94
|
+
# Upload from base64 string
|
95
|
+
chunkr.upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
|
96
|
+
|
97
|
+
# Upload an image
|
98
|
+
from PIL import Image
|
99
|
+
img = Image.open("photo.jpg")
|
100
|
+
chunkr.upload(img)
|
101
|
+
```
|
102
|
+
|
103
|
+
### Configuration
|
104
|
+
|
105
|
+
You can provide your API key and URL in several ways:
|
106
|
+
1. Environment variables: `CHUNKR_API_KEY` and `CHUNKR_URL`
|
107
|
+
2. `.env` file
|
108
|
+
3. Direct initialization:
|
109
|
+
```python
|
110
|
+
chunkr = Chunkr(
|
111
|
+
api_key="your-api-key",
|
112
|
+
url="https://api.chunkr.ai"
|
113
|
+
)
|
114
|
+
```
|
115
|
+
|
116
|
+
## Run tests
|
117
|
+
|
118
|
+
```bash
|
119
|
+
# Install dependencies
|
120
|
+
uv pip install -e ".[test]"
|
121
|
+
|
122
|
+
# Run tests
|
123
|
+
uv run pytest
|
124
|
+
```
|
@@ -0,0 +1,21 @@
|
|
1
|
+
LICENSE
|
2
|
+
README.md
|
3
|
+
pyproject.toml
|
4
|
+
src/chunkr_ai/__init__.py
|
5
|
+
src/chunkr_ai/main.py
|
6
|
+
src/chunkr_ai/models.py
|
7
|
+
src/chunkr_ai.egg-info/PKG-INFO
|
8
|
+
src/chunkr_ai.egg-info/SOURCES.txt
|
9
|
+
src/chunkr_ai.egg-info/dependency_links.txt
|
10
|
+
src/chunkr_ai.egg-info/requires.txt
|
11
|
+
src/chunkr_ai.egg-info/top_level.txt
|
12
|
+
src/chunkr_ai/api/__init__.py
|
13
|
+
src/chunkr_ai/api/api.py
|
14
|
+
src/chunkr_ai/api/auth.py
|
15
|
+
src/chunkr_ai/api/base.py
|
16
|
+
src/chunkr_ai/api/chunkr.py
|
17
|
+
src/chunkr_ai/api/chunkr_async.py
|
18
|
+
src/chunkr_ai/api/config.py
|
19
|
+
src/chunkr_ai/api/protocol.py
|
20
|
+
src/chunkr_ai/api/task.py
|
21
|
+
tests/test_chunkr.py
|
@@ -0,0 +1,141 @@
|
|
1
|
+
import pytest
|
2
|
+
from pathlib import Path
|
3
|
+
from PIL import Image
|
4
|
+
|
5
|
+
from chunkr_ai import Chunkr, ChunkrAsync
|
6
|
+
from chunkr_ai.models import (
|
7
|
+
ChunkProcessing,
|
8
|
+
Configuration,
|
9
|
+
GenerationStrategy,
|
10
|
+
GenerationConfig,
|
11
|
+
OcrStrategy,
|
12
|
+
SegmentationStrategy,
|
13
|
+
SegmentProcessing,
|
14
|
+
TaskResponse,
|
15
|
+
)
|
16
|
+
|
17
|
+
@pytest.fixture
|
18
|
+
def chunkr():
|
19
|
+
return Chunkr()
|
20
|
+
|
21
|
+
@pytest.fixture
|
22
|
+
def async_chunkr():
|
23
|
+
return ChunkrAsync()
|
24
|
+
|
25
|
+
@pytest.fixture
|
26
|
+
def sample_path():
|
27
|
+
return Path("tests/files/test.pdf")
|
28
|
+
|
29
|
+
@pytest.fixture
|
30
|
+
def sample_image():
|
31
|
+
img = Image.open("tests/files/test.jpg")
|
32
|
+
return img
|
33
|
+
|
34
|
+
def test_send_file_path(chunkr, sample_path):
|
35
|
+
response = chunkr.upload(sample_path)
|
36
|
+
|
37
|
+
assert isinstance(response, TaskResponse)
|
38
|
+
assert response.task_id is not None
|
39
|
+
assert response.status == "Succeeded"
|
40
|
+
assert response.output is not None
|
41
|
+
|
42
|
+
def test_send_file_path_str(chunkr, sample_path):
|
43
|
+
response = chunkr.upload(str(sample_path))
|
44
|
+
|
45
|
+
assert isinstance(response, TaskResponse)
|
46
|
+
assert response.task_id is not None
|
47
|
+
assert response.status == "Succeeded"
|
48
|
+
assert response.output is not None
|
49
|
+
|
50
|
+
def test_send_opened_file(chunkr, sample_path):
|
51
|
+
with open(sample_path, 'rb') as f:
|
52
|
+
response = chunkr.upload(f)
|
53
|
+
|
54
|
+
assert isinstance(response, TaskResponse)
|
55
|
+
assert response.task_id is not None
|
56
|
+
assert response.status == "Succeeded"
|
57
|
+
assert response.output is not None
|
58
|
+
|
59
|
+
def test_send_pil_image(chunkr, sample_image):
|
60
|
+
response = chunkr.upload(sample_image)
|
61
|
+
|
62
|
+
assert isinstance(response, TaskResponse)
|
63
|
+
assert response.task_id is not None
|
64
|
+
assert response.status == "Succeeded"
|
65
|
+
|
66
|
+
def test_ocr_auto(chunkr, sample_path):
|
67
|
+
response = chunkr.upload(sample_path, Configuration(
|
68
|
+
ocr_strategy=OcrStrategy.AUTO
|
69
|
+
))
|
70
|
+
assert isinstance(response, TaskResponse)
|
71
|
+
assert response.task_id is not None
|
72
|
+
assert response.status == "Succeeded"
|
73
|
+
assert response.output is not None
|
74
|
+
|
75
|
+
def test_expires_in(chunkr, sample_path):
|
76
|
+
response = chunkr.upload(sample_path, Configuration(
|
77
|
+
expires_in=10
|
78
|
+
))
|
79
|
+
assert isinstance(response, TaskResponse)
|
80
|
+
assert response.task_id is not None
|
81
|
+
assert response.status == "Succeeded"
|
82
|
+
assert response.output is not None
|
83
|
+
|
84
|
+
def test_chunk_processing(chunkr, sample_path):
|
85
|
+
response = chunkr.upload(sample_path, Configuration(
|
86
|
+
chunk_processing=ChunkProcessing(
|
87
|
+
target_length=1024
|
88
|
+
)
|
89
|
+
))
|
90
|
+
assert isinstance(response, TaskResponse)
|
91
|
+
assert response.task_id is not None
|
92
|
+
assert response.status == "Succeeded"
|
93
|
+
assert response.output is not None
|
94
|
+
|
95
|
+
def test_segmentation_strategy_page(chunkr, sample_path):
|
96
|
+
response = chunkr.upload(sample_path, Configuration(
|
97
|
+
segmentation_strategy=SegmentationStrategy.PAGE
|
98
|
+
))
|
99
|
+
assert isinstance(response, TaskResponse)
|
100
|
+
assert response.task_id is not None
|
101
|
+
assert response.status == "Succeeded"
|
102
|
+
assert response.output is not None
|
103
|
+
|
104
|
+
def test_page_llm_html(chunkr, sample_path):
|
105
|
+
response = chunkr.upload(sample_path, Configuration(
|
106
|
+
segmentation_strategy=SegmentationStrategy.PAGE,
|
107
|
+
segment_processing=SegmentProcessing(
|
108
|
+
page=GenerationConfig(
|
109
|
+
html=GenerationStrategy.LLM
|
110
|
+
)
|
111
|
+
)
|
112
|
+
))
|
113
|
+
assert isinstance(response, TaskResponse)
|
114
|
+
assert response.task_id is not None
|
115
|
+
assert response.status == "Succeeded"
|
116
|
+
assert response.output is not None
|
117
|
+
|
118
|
+
def test_page_llm(chunkr, sample_path):
|
119
|
+
response = chunkr.upload(sample_path, Configuration(
|
120
|
+
segmentation_strategy=SegmentationStrategy.PAGE,
|
121
|
+
segment_processing=SegmentProcessing(
|
122
|
+
page=GenerationConfig(
|
123
|
+
html=GenerationStrategy.LLM,
|
124
|
+
markdown=GenerationStrategy.LLM
|
125
|
+
)
|
126
|
+
)
|
127
|
+
))
|
128
|
+
assert isinstance(response, TaskResponse)
|
129
|
+
assert response.task_id is not None
|
130
|
+
assert response.status == "Succeeded"
|
131
|
+
assert response.output is not None
|
132
|
+
|
133
|
+
|
134
|
+
def test_async_send_file_path(async_chunkr, sample_path):
    """Upload a file through the async client and check the finished task.

    Written as a plain test that drives the coroutine with ``asyncio.run``:
    pytest does not await ``async def`` tests by itself, and the declared test
    extras (pytest, pytest-xdist) include no asyncio plugin, so the coroutine
    version of this test never executed its body.
    """
    # Local import: this test file's import block does not bring in asyncio.
    import asyncio

    response = asyncio.run(async_chunkr.upload(sample_path))

    assert isinstance(response, TaskResponse)
    assert response.task_id is not None
    assert response.status == "Succeeded"
    assert response.output is not None
|
141
|
+
|
chunkr_ai-0.0.1/PKG-INFO
DELETED
chunkr_ai-0.0.1/pyproject.toml
DELETED
@@ -1,11 +0,0 @@
|
|
1
|
-
[build-system]
|
2
|
-
requires = ["setuptools>=42", "wheel"]
|
3
|
-
build-backend = "setuptools.build_meta"
|
4
|
-
|
5
|
-
[project]
|
6
|
-
name = "chunkr-ai"
|
7
|
-
version = "0.0.1"
|
8
|
-
authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
|
9
|
-
description = "PDF chunking"
|
10
|
-
readme = "README.md"
|
11
|
-
license = {"file" = "LICENSE"}
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|