chunkr-ai 0.0.2__tar.gz → 0.0.4__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- chunkr_ai-0.0.4/PKG-INFO +204 -0
- chunkr_ai-0.0.4/README.md +187 -0
- {chunkr_ai-0.0.2 → chunkr_ai-0.0.4}/pyproject.toml +4 -4
- {chunkr_ai-0.0.2 → chunkr_ai-0.0.4}/src/chunkr_ai/api/auth.py +0 -2
- chunkr_ai-0.0.4/src/chunkr_ai/api/base.py +173 -0
- chunkr_ai-0.0.4/src/chunkr_ai/api/chunkr.py +108 -0
- chunkr_ai-0.0.4/src/chunkr_ai/api/chunkr_async.py +105 -0
- chunkr_ai-0.0.4/src/chunkr_ai/api/config.py +131 -0
- chunkr_ai-0.0.4/src/chunkr_ai/api/protocol.py +19 -0
- chunkr_ai-0.0.4/src/chunkr_ai/api/task.py +131 -0
- chunkr_ai-0.0.4/src/chunkr_ai/models.py +48 -0
- chunkr_ai-0.0.4/src/chunkr_ai.egg-info/PKG-INFO +204 -0
- {chunkr_ai-0.0.2 → chunkr_ai-0.0.4}/src/chunkr_ai.egg-info/SOURCES.txt +6 -1
- {chunkr_ai-0.0.2 → chunkr_ai-0.0.4}/src/chunkr_ai.egg-info/requires.txt +1 -2
- chunkr_ai-0.0.4/tests/test_chunkr.py +158 -0
- chunkr_ai-0.0.2/PKG-INFO +0 -16
- chunkr_ai-0.0.2/src/chunkr_ai/api/chunkr.py +0 -125
- chunkr_ai-0.0.2/src/chunkr_ai/api/chunkr_async.py +0 -39
- chunkr_ai-0.0.2/src/chunkr_ai/api/models.py +0 -231
- chunkr_ai-0.0.2/src/chunkr_ai.egg-info/PKG-INFO +0 -16
- chunkr_ai-0.0.2/tests/test_chunkr.py +0 -69
- {chunkr_ai-0.0.2 → chunkr_ai-0.0.4}/LICENSE +0 -0
- {chunkr_ai-0.0.2 → chunkr_ai-0.0.4}/setup.cfg +0 -0
- {chunkr_ai-0.0.2 → chunkr_ai-0.0.4}/src/chunkr_ai/__init__.py +0 -0
- /chunkr_ai-0.0.2/README.md → /chunkr_ai-0.0.4/src/chunkr_ai/api/__init__.py +0 -0
- {chunkr_ai-0.0.2 → chunkr_ai-0.0.4}/src/chunkr_ai/api/api.py +0 -0
- {chunkr_ai-0.0.2 → chunkr_ai-0.0.4}/src/chunkr_ai/main.py +0 -0
- {chunkr_ai-0.0.2 → chunkr_ai-0.0.4}/src/chunkr_ai.egg-info/dependency_links.txt +0 -0
- {chunkr_ai-0.0.2 → chunkr_ai-0.0.4}/src/chunkr_ai.egg-info/top_level.txt +0 -0
chunkr_ai-0.0.4/PKG-INFO
ADDED
@@ -0,0 +1,204 @@
|
|
1
|
+
Metadata-Version: 2.2
|
2
|
+
Name: chunkr-ai
|
3
|
+
Version: 0.0.4
|
4
|
+
Summary: Python client for Chunkr: open source document intelligence
|
5
|
+
Author-email: Ishaan Kapoor <ishaan@lumina.sh>
|
6
|
+
Project-URL: Homepage, https://chunkr.ai
|
7
|
+
Description-Content-Type: text/markdown
|
8
|
+
License-File: LICENSE
|
9
|
+
Requires-Dist: httpx>=0.28.1
|
10
|
+
Requires-Dist: pillow>=11.1.0
|
11
|
+
Requires-Dist: pydantic>=2.10.4
|
12
|
+
Requires-Dist: python-dotenv>=1.0.1
|
13
|
+
Requires-Dist: requests>=2.32.3
|
14
|
+
Provides-Extra: test
|
15
|
+
Requires-Dist: pytest>=8.3.4; extra == "test"
|
16
|
+
Requires-Dist: pytest-xdist>=3.6.1; extra == "test"
|
17
|
+
|
18
|
+
# Chunkr Python Client
|
19
|
+
|
20
|
+
This provides a simple interface to interact with the Chunkr API.
|
21
|
+
|
22
|
+
## Getting Started
|
23
|
+
|
24
|
+
You can get an API key from [Chunkr](https://chunkr.ai) or deploy your own Chunkr instance. For self-hosted deployment options, check out our [deployment guide](https://github.com/lumina-ai-inc/chunkr/tree/main?tab=readme-ov-file#self-hosted-deployment-options).
|
25
|
+
|
26
|
+
For more information about the API and its capabilities, visit the [Chunkr API docs](https://docs.chunkr.ai).
|
27
|
+
|
28
|
+
## Installation
|
29
|
+
|
30
|
+
```bash
|
31
|
+
pip install chunkr-ai
|
32
|
+
```
|
33
|
+
|
34
|
+
## Usage
|
35
|
+
|
36
|
+
We provide two clients: `Chunkr` for synchronous operations and `ChunkrAsync` for asynchronous operations.
|
37
|
+
|
38
|
+
### Synchronous Usage
|
39
|
+
|
40
|
+
```python
|
41
|
+
from chunkr_ai import Chunkr
|
42
|
+
|
43
|
+
# Initialize client
|
44
|
+
chunkr = Chunkr()
|
45
|
+
|
46
|
+
# Upload a file and wait for processing
|
47
|
+
task = chunkr.upload("document.pdf")
|
48
|
+
|
49
|
+
# Print the response
|
50
|
+
print(task)
|
51
|
+
|
52
|
+
# Get output from task
|
53
|
+
output = task.output
|
54
|
+
|
55
|
+
# If you want to upload without waiting for processing
|
56
|
+
task = chunkr.start_upload("document.pdf")
|
57
|
+
# ... do other things ...
|
58
|
+
task.poll() # Check status when needed
|
59
|
+
```
|
60
|
+
|
61
|
+
### Asynchronous Usage
|
62
|
+
|
63
|
+
```python
|
64
|
+
from chunkr_ai import ChunkrAsync
|
65
|
+
|
66
|
+
async def process_document():
|
67
|
+
# Initialize client
|
68
|
+
chunkr = ChunkrAsync()
|
69
|
+
|
70
|
+
# Upload a file and wait for processing
|
71
|
+
task = await chunkr.upload("document.pdf")
|
72
|
+
|
73
|
+
# Print the response
|
74
|
+
print(task)
|
75
|
+
|
76
|
+
# Get output from task
|
77
|
+
output = task.output
|
78
|
+
|
79
|
+
# If you want to upload without waiting for processing
|
80
|
+
task = await chunkr.start_upload("document.pdf")
|
81
|
+
# ... do other things ...
|
82
|
+
await task.poll_async() # Check status when needed
|
83
|
+
```
|
84
|
+
|
85
|
+
### Additional Features
|
86
|
+
|
87
|
+
Both clients support various input types:
|
88
|
+
|
89
|
+
```python
|
90
|
+
# Upload from file path
|
91
|
+
chunkr.upload("document.pdf")
|
92
|
+
|
93
|
+
# Upload from opened file
|
94
|
+
with open("document.pdf", "rb") as f:
|
95
|
+
chunkr.upload(f)
|
96
|
+
|
97
|
+
# Upload from URL
|
98
|
+
chunkr.upload("https://example.com/document.pdf")
|
99
|
+
|
100
|
+
# Upload from base64 string
|
101
|
+
chunkr.upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
|
102
|
+
|
103
|
+
# Upload an image
|
104
|
+
from PIL import Image
|
105
|
+
img = Image.open("photo.jpg")
|
106
|
+
chunkr.upload(img)
|
107
|
+
```
|
108
|
+
|
109
|
+
### Configuration
|
110
|
+
|
111
|
+
You can customize the processing behavior by passing a `Configuration` object:
|
112
|
+
|
113
|
+
```python
|
114
|
+
from chunkr_ai.models import Configuration, OcrStrategy, SegmentationStrategy, GenerationStrategy
|
115
|
+
|
116
|
+
# Basic configuration
|
117
|
+
config = Configuration(
|
118
|
+
ocr_strategy=OcrStrategy.AUTO,
|
119
|
+
segmentation_strategy=SegmentationStrategy.LAYOUT_ANALYSIS,
|
120
|
+
high_resolution=True,
|
121
|
+
expires_in=3600, # seconds
|
122
|
+
)
|
123
|
+
|
124
|
+
# Upload with configuration
|
125
|
+
task = chunkr.upload("document.pdf", config)
|
126
|
+
```
|
127
|
+
|
128
|
+
#### Available Configuration Examples
|
129
|
+
|
130
|
+
- **Chunk Processing**
|
131
|
+
```python
|
132
|
+
from chunkr_ai.models import ChunkProcessing
|
133
|
+
config = Configuration(
|
134
|
+
chunk_processing=ChunkProcessing(target_length=1024)
|
135
|
+
)
|
136
|
+
```
|
137
|
+
- **Expires In**
|
138
|
+
```python
|
139
|
+
config = Configuration(expires_in=3600)
|
140
|
+
```
|
141
|
+
|
142
|
+
- **High Resolution**
|
143
|
+
```python
|
144
|
+
config = Configuration(high_resolution=True)
|
145
|
+
```
|
146
|
+
|
147
|
+
- **JSON Schema**
|
148
|
+
```python
|
149
|
+
config = Configuration(json_schema=JsonSchema(
|
150
|
+
title="Sales Data",
|
151
|
+
properties=[
|
152
|
+
Property(name="Person with highest sales", prop_type="string", description="The person with the highest sales"),
|
153
|
+
Property(name="Person with lowest sales", prop_type="string", description="The person with the lowest sales"),
|
154
|
+
]
|
155
|
+
))
|
156
|
+
```
|
157
|
+
|
158
|
+
- **OCR Strategy**
|
159
|
+
```python
|
160
|
+
config = Configuration(ocr_strategy=OcrStrategy.AUTO)
|
161
|
+
```
|
162
|
+
|
163
|
+
- **Segment Processing**
|
164
|
+
```python
|
165
|
+
from chunkr_ai.models import SegmentProcessing, GenerationConfig, GenerationStrategy
|
166
|
+
config = Configuration(
|
167
|
+
segment_processing=SegmentProcessing(
|
168
|
+
page=GenerationConfig(
|
169
|
+
html=GenerationStrategy.LLM,
|
170
|
+
markdown=GenerationStrategy.LLM
|
171
|
+
)
|
172
|
+
)
|
173
|
+
)
|
174
|
+
```
|
175
|
+
|
176
|
+
- **Segmentation Strategy**
|
177
|
+
```python
|
178
|
+
config = Configuration(
|
179
|
+
segmentation_strategy=SegmentationStrategy.LAYOUT_ANALYSIS # or SegmentationStrategy.PAGE
|
180
|
+
)
|
181
|
+
```
|
182
|
+
|
183
|
+
## Environment setup
|
184
|
+
|
185
|
+
You can provide your API key and URL in several ways:
|
186
|
+
1. Environment variables: `CHUNKR_API_KEY` and `CHUNKR_URL`
|
187
|
+
2. `.env` file
|
188
|
+
3. Direct initialization:
|
189
|
+
```python
|
190
|
+
chunkr = Chunkr(
|
191
|
+
api_key="your-api-key",
|
192
|
+
url="https://api.chunkr.ai"
|
193
|
+
)
|
194
|
+
```
|
195
|
+
|
196
|
+
## Run tests
|
197
|
+
|
198
|
+
```bash
|
199
|
+
# Install dependencies
|
200
|
+
uv pip install -e ".[test]"
|
201
|
+
|
202
|
+
# Run tests
|
203
|
+
uv run pytest
|
204
|
+
```
|
@@ -0,0 +1,187 @@
|
|
1
|
+
# Chunkr Python Client
|
2
|
+
|
3
|
+
This provides a simple interface to interact with the Chunkr API.
|
4
|
+
|
5
|
+
## Getting Started
|
6
|
+
|
7
|
+
You can get an API key from [Chunkr](https://chunkr.ai) or deploy your own Chunkr instance. For self-hosted deployment options, check out our [deployment guide](https://github.com/lumina-ai-inc/chunkr/tree/main?tab=readme-ov-file#self-hosted-deployment-options).
|
8
|
+
|
9
|
+
For more information about the API and its capabilities, visit the [Chunkr API docs](https://docs.chunkr.ai).
|
10
|
+
|
11
|
+
## Installation
|
12
|
+
|
13
|
+
```bash
|
14
|
+
pip install chunkr-ai
|
15
|
+
```
|
16
|
+
|
17
|
+
## Usage
|
18
|
+
|
19
|
+
We provide two clients: `Chunkr` for synchronous operations and `ChunkrAsync` for asynchronous operations.
|
20
|
+
|
21
|
+
### Synchronous Usage
|
22
|
+
|
23
|
+
```python
|
24
|
+
from chunkr_ai import Chunkr
|
25
|
+
|
26
|
+
# Initialize client
|
27
|
+
chunkr = Chunkr()
|
28
|
+
|
29
|
+
# Upload a file and wait for processing
|
30
|
+
task = chunkr.upload("document.pdf")
|
31
|
+
|
32
|
+
# Print the response
|
33
|
+
print(task)
|
34
|
+
|
35
|
+
# Get output from task
|
36
|
+
output = task.output
|
37
|
+
|
38
|
+
# If you want to upload without waiting for processing
|
39
|
+
task = chunkr.start_upload("document.pdf")
|
40
|
+
# ... do other things ...
|
41
|
+
task.poll() # Check status when needed
|
42
|
+
```
|
43
|
+
|
44
|
+
### Asynchronous Usage
|
45
|
+
|
46
|
+
```python
|
47
|
+
from chunkr_ai import ChunkrAsync
|
48
|
+
|
49
|
+
async def process_document():
|
50
|
+
# Initialize client
|
51
|
+
chunkr = ChunkrAsync()
|
52
|
+
|
53
|
+
# Upload a file and wait for processing
|
54
|
+
task = await chunkr.upload("document.pdf")
|
55
|
+
|
56
|
+
# Print the response
|
57
|
+
print(task)
|
58
|
+
|
59
|
+
# Get output from task
|
60
|
+
output = task.output
|
61
|
+
|
62
|
+
# If you want to upload without waiting for processing
|
63
|
+
task = await chunkr.start_upload("document.pdf")
|
64
|
+
# ... do other things ...
|
65
|
+
await task.poll_async() # Check status when needed
|
66
|
+
```
|
67
|
+
|
68
|
+
### Additional Features
|
69
|
+
|
70
|
+
Both clients support various input types:
|
71
|
+
|
72
|
+
```python
|
73
|
+
# Upload from file path
|
74
|
+
chunkr.upload("document.pdf")
|
75
|
+
|
76
|
+
# Upload from opened file
|
77
|
+
with open("document.pdf", "rb") as f:
|
78
|
+
chunkr.upload(f)
|
79
|
+
|
80
|
+
# Upload from URL
|
81
|
+
chunkr.upload("https://example.com/document.pdf")
|
82
|
+
|
83
|
+
# Upload from base64 string
|
84
|
+
chunkr.upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
|
85
|
+
|
86
|
+
# Upload an image
|
87
|
+
from PIL import Image
|
88
|
+
img = Image.open("photo.jpg")
|
89
|
+
chunkr.upload(img)
|
90
|
+
```
|
91
|
+
|
92
|
+
### Configuration
|
93
|
+
|
94
|
+
You can customize the processing behavior by passing a `Configuration` object:
|
95
|
+
|
96
|
+
```python
|
97
|
+
from chunkr_ai.models import Configuration, OcrStrategy, SegmentationStrategy, GenerationStrategy
|
98
|
+
|
99
|
+
# Basic configuration
|
100
|
+
config = Configuration(
|
101
|
+
ocr_strategy=OcrStrategy.AUTO,
|
102
|
+
segmentation_strategy=SegmentationStrategy.LAYOUT_ANALYSIS,
|
103
|
+
high_resolution=True,
|
104
|
+
expires_in=3600, # seconds
|
105
|
+
)
|
106
|
+
|
107
|
+
# Upload with configuration
|
108
|
+
task = chunkr.upload("document.pdf", config)
|
109
|
+
```
|
110
|
+
|
111
|
+
#### Available Configuration Examples
|
112
|
+
|
113
|
+
- **Chunk Processing**
|
114
|
+
```python
|
115
|
+
from chunkr_ai.models import ChunkProcessing
|
116
|
+
config = Configuration(
|
117
|
+
chunk_processing=ChunkProcessing(target_length=1024)
|
118
|
+
)
|
119
|
+
```
|
120
|
+
- **Expires In**
|
121
|
+
```python
|
122
|
+
config = Configuration(expires_in=3600)
|
123
|
+
```
|
124
|
+
|
125
|
+
- **High Resolution**
|
126
|
+
```python
|
127
|
+
config = Configuration(high_resolution=True)
|
128
|
+
```
|
129
|
+
|
130
|
+
- **JSON Schema**
|
131
|
+
```python
|
132
|
+
config = Configuration(json_schema=JsonSchema(
|
133
|
+
title="Sales Data",
|
134
|
+
properties=[
|
135
|
+
Property(name="Person with highest sales", prop_type="string", description="The person with the highest sales"),
|
136
|
+
Property(name="Person with lowest sales", prop_type="string", description="The person with the lowest sales"),
|
137
|
+
]
|
138
|
+
))
|
139
|
+
```
|
140
|
+
|
141
|
+
- **OCR Strategy**
|
142
|
+
```python
|
143
|
+
config = Configuration(ocr_strategy=OcrStrategy.AUTO)
|
144
|
+
```
|
145
|
+
|
146
|
+
- **Segment Processing**
|
147
|
+
```python
|
148
|
+
from chunkr_ai.models import SegmentProcessing, GenerationConfig, GenerationStrategy
|
149
|
+
config = Configuration(
|
150
|
+
segment_processing=SegmentProcessing(
|
151
|
+
page=GenerationConfig(
|
152
|
+
html=GenerationStrategy.LLM,
|
153
|
+
markdown=GenerationStrategy.LLM
|
154
|
+
)
|
155
|
+
)
|
156
|
+
)
|
157
|
+
```
|
158
|
+
|
159
|
+
- **Segmentation Strategy**
|
160
|
+
```python
|
161
|
+
config = Configuration(
|
162
|
+
segmentation_strategy=SegmentationStrategy.LAYOUT_ANALYSIS # or SegmentationStrategy.PAGE
|
163
|
+
)
|
164
|
+
```
|
165
|
+
|
166
|
+
## Environment setup
|
167
|
+
|
168
|
+
You can provide your API key and URL in several ways:
|
169
|
+
1. Environment variables: `CHUNKR_API_KEY` and `CHUNKR_URL`
|
170
|
+
2. `.env` file
|
171
|
+
3. Direct initialization:
|
172
|
+
```python
|
173
|
+
chunkr = Chunkr(
|
174
|
+
api_key="your-api-key",
|
175
|
+
url="https://api.chunkr.ai"
|
176
|
+
)
|
177
|
+
```
|
178
|
+
|
179
|
+
## Run tests
|
180
|
+
|
181
|
+
```bash
|
182
|
+
# Install dependencies
|
183
|
+
uv pip install -e ".[test]"
|
184
|
+
|
185
|
+
# Run tests
|
186
|
+
uv run pytest
|
187
|
+
```
|
@@ -4,22 +4,22 @@ build-backend = "setuptools.build_meta"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "chunkr-ai"
|
7
|
-
version = "0.0.2"
|
7
|
+
version = "0.0.4"
|
8
8
|
authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
|
9
|
-
description = "Python client for
|
9
|
+
description = "Python client for Chunkr: open source document intelligence"
|
10
10
|
readme = "README.md"
|
11
11
|
license = {"file" = "LICENSE"}
|
12
|
+
urls = {Homepage = "https://chunkr.ai"}
|
12
13
|
dependencies = [
|
13
|
-
"build>=1.2.2.post1",
|
14
14
|
"httpx>=0.28.1",
|
15
15
|
"pillow>=11.1.0",
|
16
16
|
"pydantic>=2.10.4",
|
17
17
|
"python-dotenv>=1.0.1",
|
18
18
|
"requests>=2.32.3",
|
19
|
-
"twine>=6.0.1",
|
20
19
|
]
|
21
20
|
|
22
21
|
[project.optional-dependencies]
|
23
22
|
test = [
|
24
23
|
"pytest>=8.3.4",
|
24
|
+
"pytest-xdist>=3.6.1",
|
25
25
|
]
|
@@ -0,0 +1,173 @@
|
|
1
|
+
from .config import Configuration
|
2
|
+
from .task import TaskResponse
|
3
|
+
from .auth import HeadersMixin
|
4
|
+
from abc import abstractmethod
|
5
|
+
from dotenv import load_dotenv
|
6
|
+
import io
|
7
|
+
import json
|
8
|
+
import os
|
9
|
+
from pathlib import Path
|
10
|
+
from PIL import Image
|
11
|
+
import requests
|
12
|
+
from typing import BinaryIO, Tuple, Union
|
13
|
+
|
14
|
+
class ChunkrBase(HeadersMixin):
    """Base class with shared functionality for Chunkr API clients.

    It resolves the API URL and key (explicit argument, then environment /
    ``.env`` file, then the hosted default for the URL) and converts the
    supported input kinds into the ``files``/``data`` dictionaries used for a
    multipart upload. Concrete clients provide ``upload``, ``start_upload``
    and ``get_task``.

    NOTE(review): the three methods at the bottom are decorated with
    ``@abstractmethod``; that is only enforced at instantiation time if a
    base class uses ``ABCMeta`` -- confirm against ``HeadersMixin``.
    """

    # MIME type found in a ``data:<mime>;base64,`` header -> extension used
    # for the synthetic upload filename.
    _MIME_TO_EXT = {
        'application/pdf': 'pdf',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
        'application/msword': 'doc',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
        'application/vnd.ms-powerpoint': 'ppt',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
        'application/vnd.ms-excel': 'xls',
        'image/jpeg': 'jpg',
        'image/png': 'png',
        'image/jpg': 'jpg',
    }

    def __init__(self, url: Union[str, None] = None, api_key: Union[str, None] = None):
        """Resolve the API endpoint and credentials.

        Args:
            url: Base URL of the Chunkr API. Falls back to the ``CHUNKR_URL``
                environment variable, then to ``https://api.chunkr.ai``.
            api_key: API key. Falls back to the ``CHUNKR_API_KEY``
                environment variable.

        Raises:
            ValueError: If no API key could be resolved.
        """
        # Make values from a local .env file visible through os.getenv.
        load_dotenv()
        self.url = (
            url or
            os.getenv('CHUNKR_URL') or
            'https://api.chunkr.ai'
        )
        self._api_key = (
            api_key or
            os.getenv('CHUNKR_API_KEY')
        )
        if not self._api_key:
            raise ValueError("API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai")

        # Endpoints are built as f"{self.url}/api/...", so drop trailing slashes.
        self.url = self.url.rstrip("/")

    def _file_from_url(self, url: str) -> Tuple[str, BinaryIO]:
        """Download ``url`` into memory and derive a filename from its path."""
        from urllib.parse import urlparse

        response = requests.get(url)
        response.raise_for_status()
        # Use only the path component so a query string or fragment
        # ("doc.pdf?token=...") does not end up in the filename.
        last_segment = urlparse(url).path.rsplit('/', 1)[-1]
        filename = Path(last_segment).name or 'downloaded_file'
        return filename, io.BytesIO(response.content)

    def _file_from_base64(self, data_uri: str) -> Tuple[str, BinaryIO]:
        """Decode a ``data:<mime>;base64,<payload>`` string into a stream."""
        import base64

        header, payload = data_uri.split(',', 1)
        try:
            file_bytes = base64.b64decode(payload)
        except ValueError as exc:  # binascii.Error is a ValueError subclass
            raise ValueError(f"Invalid base64 string: {str(exc)}") from exc

        # The MIME check lives outside the try block so that an unsupported
        # type is reported as such instead of as an "invalid base64" error.
        mime_type = header.split(':')[-1].split(';')[0].lower()
        extension = self._MIME_TO_EXT.get(mime_type)
        if extension is None:
            raise ValueError(f"Unsupported MIME type: {mime_type}")
        return f"file.{extension}", io.BytesIO(file_bytes)

    def _prepare_file(
        self,
        file: Union[str, Path, BinaryIO, Image.Image]
    ) -> Tuple[str, BinaryIO]:
        """Convert various file types into a tuple of (filename, file-like object).

        Args:
            file: Input file, can be:
                - String or Path to a file
                - URL string starting with http:// or https://
                - Base64 string (with a ``data:<mime>;base64,`` header)
                - Opened binary file (mode='rb')
                - PIL/Pillow Image object

        Returns:
            Tuple[str, BinaryIO]: (filename, file-like object) ready for upload.
            For a file path the stream is a newly opened file handle; the
            caller is responsible for closing it.

        Raises:
            FileNotFoundError: If the file path doesn't exist
            TypeError: If the file type is not supported
            ValueError: If the base64 payload cannot be decoded
            ValueError: If the MIME type is unsupported
            requests.exceptions.RequestException: If the URL cannot be
                fetched or answers with an HTTP error status
        """
        if isinstance(file, str):
            # Handle URLs
            if file.startswith(('http://', 'https://')):
                return self._file_from_url(file)
            # Handle base64 strings
            if ';base64,' in file:
                return self._file_from_base64(file)

        # Handle file paths
        if isinstance(file, (str, Path)):
            path = Path(file).resolve()
            if not path.exists():
                raise FileNotFoundError(f"File not found: {file}")
            return path.name, open(path, 'rb')

        # Handle PIL Images
        if isinstance(file, Image.Image):
            image_format = file.format or 'PNG'
            buffer = io.BytesIO()
            file.save(buffer, format=image_format)
            buffer.seek(0)
            return f"image.{image_format.lower()}", buffer

        # Handle file-like objects
        if hasattr(file, 'read') and hasattr(file, 'seek'):
            raw_name = getattr(file, 'name', None)
            # ``name`` can be missing (in-memory streams) or not path-like
            # (e.g. an integer file descriptor); use a generic name then.
            if not isinstance(raw_name, (str, os.PathLike)):
                raw_name = 'document'
            return Path(raw_name).name or 'document', file

        raise TypeError(f"Unsupported file type: {type(file)}")

    def _prepare_upload_data(
        self,
        file: Union[str, Path, BinaryIO, Image.Image],
        config: Union[Configuration, None] = None
    ) -> Tuple[dict, dict]:
        """Prepare files and data dictionaries for upload.

        Args:
            file: The file to upload
            config: Optional configuration settings

        Returns:
            Tuple[dict, dict]: (files dict, data dict) ready for upload.
            ``files["file"]`` is the ``(filename, stream)`` pair produced by
            ``_prepare_file``.
        """
        filename, file_obj = self._prepare_file(file)
        files = {"file": (filename, file_obj)}
        data = {}

        if config:
            config_dict = config.model_dump(mode="json", exclude_none=True)
            for key, value in config_dict.items():
                if isinstance(value, dict):
                    # Nested settings are sent as their own JSON-encoded part.
                    files[key] = (None, json.dumps(value), 'application/json')
                else:
                    data[key] = value

        return files, data

    @abstractmethod
    def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Union[Configuration, None] = None) -> TaskResponse:
        """Upload a file and wait for processing to complete.

        Must be implemented by subclasses.
        """
        pass

    @abstractmethod
    def start_upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Union[Configuration, None] = None) -> TaskResponse:
        """Upload a file for processing and immediately return the task response.

        Must be implemented by subclasses.
        """
        pass

    @abstractmethod
    def get_task(self, task_id: str) -> TaskResponse:
        """Get a task response by its ID.

        Must be implemented by subclasses.
        """
        pass
|
@@ -0,0 +1,108 @@
|
|
1
|
+
from .base import ChunkrBase
|
2
|
+
from .config import Configuration
|
3
|
+
from .task import TaskResponse
|
4
|
+
from pathlib import Path
|
5
|
+
from PIL import Image
|
6
|
+
import requests
|
7
|
+
from typing import Union, BinaryIO
|
8
|
+
|
9
|
+
class Chunkr(ChunkrBase):
    """Synchronous Chunkr API client built on a ``requests.Session``."""

    def __init__(self, url: Union[str, None] = None, api_key: Union[str, None] = None):
        """Create the client.

        Args:
            url: Base URL of the Chunkr API; resolved by ``ChunkrBase``.
            api_key: API key; resolved by ``ChunkrBase``.
        """
        super().__init__(url, api_key)
        self._session = requests.Session()

    def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Union[Configuration, None] = None) -> TaskResponse:
        """Upload a file and wait for processing to complete.

        Args:
            file: The file to upload.
            config: Configuration options for processing. Optional.

        Examples:
        ```
        # Upload from file path
        chunkr.upload("document.pdf")

        # Upload from URL
        chunkr.upload("https://example.com/document.pdf")

        # Upload from base64 string (must include MIME type header)
        chunkr.upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")

        # Upload from opened file
        with open("document.pdf", "rb") as f:
            chunkr.upload(f)

        # Upload an image
        from PIL import Image
        img = Image.open("photo.jpg")
        chunkr.upload(img)
        ```

        Returns:
            TaskResponse: The completed task response
        """
        task = self.start_upload(file, config)
        return task.poll()

    def start_upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Union[Configuration, None] = None) -> TaskResponse:
        """Upload a file for processing and immediately return the task response.

        It will not wait for processing to complete. To wait for the full
        processing to complete, use `task.poll()`.

        Args:
            file: The file to upload.
            config: Configuration options for processing. Optional.

        Examples:
        ```
        # Upload from file path
        task = chunkr.start_upload("document.pdf")

        # Upload from opened file
        with open("document.pdf", "rb") as f:
            task = chunkr.start_upload(f)

        # Upload from URL
        task = chunkr.start_upload("https://example.com/document.pdf")

        # Upload from base64 string (must include MIME type header)
        task = chunkr.start_upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")

        # Upload an image
        from PIL import Image
        img = Image.open("photo.jpg")
        task = chunkr.start_upload(img)

        # Wait for the task to complete - this can be done when needed
        task.poll()
        ```

        Returns:
            TaskResponse: The initial task response

        Raises:
            requests.exceptions.HTTPError: If the API answers with an error
                status.
        """
        files, data = self._prepare_upload_data(file, config)
        # For a path, URL, base64 string or PIL image the upload stream was
        # created on the caller's behalf, so it must be closed here. A file
        # object handed in by the caller stays open; its lifetime is theirs.
        owns_stream = isinstance(file, (str, Path, Image.Image))
        try:
            r = self._session.post(
                f"{self.url}/api/v1/task",
                files=files,
                data=data,
                headers=self._headers()
            )
        finally:
            if owns_stream:
                files["file"][1].close()
        r.raise_for_status()
        return TaskResponse(**r.json()).with_client(self)

    def get_task(self, task_id: str) -> TaskResponse:
        """Get a task response by its ID.

        Args:
            task_id: The ID of the task to get

        Returns:
            TaskResponse: The task response

        Raises:
            requests.exceptions.HTTPError: If the API answers with an error
                status.
        """
        r = self._session.get(
            f"{self.url}/api/v1/task/{task_id}",
            headers=self._headers()
        )
        r.raise_for_status()
        return TaskResponse(**r.json()).with_client(self)
|
108
|
+
|