chunkr-ai 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunkr_ai/api/config.py +12 -11
- chunkr_ai/api/task.py +8 -1
- chunkr_ai/models.py +1 -2
- {chunkr_ai-0.0.3.dist-info → chunkr_ai-0.0.5.dist-info}/METADATA +82 -12
- {chunkr_ai-0.0.3.dist-info → chunkr_ai-0.0.5.dist-info}/RECORD +8 -8
- {chunkr_ai-0.0.3.dist-info → chunkr_ai-0.0.5.dist-info}/LICENSE +0 -0
- {chunkr_ai-0.0.3.dist-info → chunkr_ai-0.0.5.dist-info}/WHEEL +0 -0
- {chunkr_ai-0.0.3.dist-info → chunkr_ai-0.0.5.dist-info}/top_level.txt +0 -0
chunkr_ai/api/config.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
from pydantic import BaseModel, Field
|
1
|
+
from pydantic import BaseModel, Field, model_validator
|
2
2
|
from enum import Enum
|
3
3
|
from typing import Optional, List, Dict
|
4
4
|
|
@@ -40,15 +40,14 @@ class ChunkProcessing(BaseModel):
|
|
40
40
|
|
41
41
|
class Property(BaseModel):
|
42
42
|
name: str
|
43
|
-
title: Optional[str]
|
43
|
+
title: Optional[str] = None
|
44
44
|
prop_type: str
|
45
|
-
description: Optional[str]
|
46
|
-
default: Optional[str]
|
45
|
+
description: Optional[str] = None
|
46
|
+
default: Optional[str] = None
|
47
47
|
|
48
48
|
class JsonSchema(BaseModel):
|
49
49
|
title: str
|
50
50
|
properties: List[Property]
|
51
|
-
schema_type: Optional[str]
|
52
51
|
|
53
52
|
class OcrStrategy(str, Enum):
|
54
53
|
ALL = "All"
|
@@ -121,10 +120,12 @@ class Configuration(BaseModel):
|
|
121
120
|
ocr_strategy: Optional[OcrStrategy] = Field(default=None)
|
122
121
|
segment_processing: Optional[SegmentProcessing] = Field(default=None)
|
123
122
|
segmentation_strategy: Optional[SegmentationStrategy] = Field(default=None)
|
124
|
-
target_chunk_length: Optional[int] = Field(default=None)
|
125
123
|
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
124
|
+
@model_validator(mode='before')
|
125
|
+
def map_deprecated_fields(cls, values: Dict) -> Dict:
|
126
|
+
if isinstance(values, dict) and "target_chunk_length" in values:
|
127
|
+
target_length = values.pop("target_chunk_length")
|
128
|
+
if target_length is not None:
|
129
|
+
values["chunk_processing"] = values.get("chunk_processing", {}) or {}
|
130
|
+
values["chunk_processing"]["target_length"] = target_length
|
131
|
+
return values
|
chunkr_ai/api/task.py
CHANGED
@@ -1,11 +1,18 @@
|
|
1
1
|
from .protocol import ChunkrClientProtocol
|
2
|
-
from .config import Configuration,
|
2
|
+
from .config import Configuration, OutputResponse
|
3
3
|
import asyncio
|
4
4
|
from datetime import datetime
|
5
|
+
from enum import Enum
|
5
6
|
from pydantic import BaseModel, PrivateAttr
|
6
7
|
import time
|
7
8
|
from typing import Optional, Union
|
8
9
|
|
10
|
+
class Status(str, Enum):
|
11
|
+
STARTING = "Starting"
|
12
|
+
PROCESSING = "Processing"
|
13
|
+
SUCCEEDED = "Succeeded"
|
14
|
+
FAILED = "Failed"
|
15
|
+
|
9
16
|
class TaskResponse(BaseModel):
|
10
17
|
configuration: Configuration
|
11
18
|
created_at: datetime
|
chunkr_ai/models.py
CHANGED
@@ -18,10 +18,9 @@ from .api.config import (
|
|
18
18
|
SegmentProcessing,
|
19
19
|
SegmentType,
|
20
20
|
SegmentationStrategy,
|
21
|
-
Status
|
22
21
|
)
|
23
22
|
|
24
|
-
from .api.task import TaskResponse, TaskPayload
|
23
|
+
from .api.task import TaskResponse, TaskPayload, Status
|
25
24
|
|
26
25
|
__all__ = [
|
27
26
|
'BoundingBox',
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: chunkr-ai
|
3
|
-
Version: 0.0.3
|
3
|
+
Version: 0.0.5
|
4
4
|
Summary: Python client for Chunkr: open source document intelligence
|
5
5
|
Author-email: Ishaan Kapoor <ishaan@lumina.sh>
|
6
6
|
Project-URL: Homepage, https://chunkr.ai
|
@@ -17,7 +17,13 @@ Requires-Dist: pytest-xdist>=3.6.1; extra == "test"
|
|
17
17
|
|
18
18
|
# Chunkr Python Client
|
19
19
|
|
20
|
-
This
|
20
|
+
This package provides a simple interface to interact with the Chunkr API.
|
21
|
+
|
22
|
+
## Getting Started
|
23
|
+
|
24
|
+
You can get an API key from [Chunkr](https://chunkr.ai) or deploy your own Chunkr instance. For self-hosted deployment options, check out our [deployment guide](https://github.com/lumina-ai-inc/chunkr/tree/main?tab=readme-ov-file#self-hosted-deployment-options).
|
25
|
+
|
26
|
+
For more information about the API and its capabilities, visit the [Chunkr API docs](https://docs.chunkr.ai).
|
21
27
|
|
22
28
|
## Installation
|
23
29
|
|
@@ -102,6 +108,80 @@ chunkr.upload(img)
|
|
102
108
|
|
103
109
|
### Configuration
|
104
110
|
|
111
|
+
You can customize the processing behavior by passing a `Configuration` object:
|
112
|
+
|
113
|
+
```python
|
114
|
+
from chunkr_ai.models import Configuration, OcrStrategy, SegmentationStrategy, GenerationStrategy
|
115
|
+
|
116
|
+
# Basic configuration
|
117
|
+
config = Configuration(
|
118
|
+
ocr_strategy=OcrStrategy.AUTO,
|
119
|
+
segmentation_strategy=SegmentationStrategy.LAYOUT_ANALYSIS,
|
120
|
+
high_resolution=True,
|
121
|
+
expires_in=3600, # seconds
|
122
|
+
)
|
123
|
+
|
124
|
+
# Upload with configuration
|
125
|
+
task = chunkr.upload("document.pdf", config)
|
126
|
+
```
|
127
|
+
|
128
|
+
#### Available Configuration Examples
|
129
|
+
|
130
|
+
- **Chunk Processing**
|
131
|
+
```python
|
132
|
+
from chunkr_ai.models import ChunkProcessing
|
133
|
+
config = Configuration(
|
134
|
+
chunk_processing=ChunkProcessing(target_length=1024)
|
135
|
+
)
|
136
|
+
```
|
137
|
+
- **Expires In**
|
138
|
+
```python
|
139
|
+
config = Configuration(expires_in=3600)
|
140
|
+
```
|
141
|
+
|
142
|
+
- **High Resolution**
|
143
|
+
```python
|
144
|
+
config = Configuration(high_resolution=True)
|
145
|
+
```
|
146
|
+
|
147
|
+
- **JSON Schema**
|
148
|
+
```python
|
149
|
+
config = Configuration(json_schema=JsonSchema(
|
150
|
+
title="Sales Data",
|
151
|
+
properties=[
|
152
|
+
Property(name="Person with highest sales", prop_type="string", description="The person with the highest sales"),
|
153
|
+
Property(name="Person with lowest sales", prop_type="string", description="The person with the lowest sales"),
|
154
|
+
]
|
155
|
+
))
|
156
|
+
```
|
157
|
+
|
158
|
+
- **OCR Strategy**
|
159
|
+
```python
|
160
|
+
config = Configuration(ocr_strategy=OcrStrategy.AUTO)
|
161
|
+
```
|
162
|
+
|
163
|
+
- **Segment Processing**
|
164
|
+
```python
|
165
|
+
from chunkr_ai.models import SegmentProcessing, GenerationConfig, GenerationStrategy
|
166
|
+
config = Configuration(
|
167
|
+
segment_processing=SegmentProcessing(
|
168
|
+
page=GenerationConfig(
|
169
|
+
html=GenerationStrategy.LLM,
|
170
|
+
markdown=GenerationStrategy.LLM
|
171
|
+
)
|
172
|
+
)
|
173
|
+
)
|
174
|
+
```
|
175
|
+
|
176
|
+
- **Segmentation Strategy**
|
177
|
+
```python
|
178
|
+
config = Configuration(
|
179
|
+
segmentation_strategy=SegmentationStrategy.LAYOUT_ANALYSIS # or SegmentationStrategy.PAGE
|
180
|
+
)
|
181
|
+
```
|
182
|
+
|
183
|
+
## Environment setup
|
184
|
+
|
105
185
|
You can provide your API key and URL in several ways:
|
106
186
|
1. Environment variables: `CHUNKR_API_KEY` and `CHUNKR_URL`
|
107
187
|
2. `.env` file
|
@@ -112,13 +192,3 @@ chunkr = Chunkr(
|
|
112
192
|
url="https://api.chunkr.ai"
|
113
193
|
)
|
114
194
|
```
|
115
|
-
|
116
|
-
## Run tests
|
117
|
-
|
118
|
-
```python
|
119
|
-
# Install dependencies
|
120
|
-
uv pip install -e ".[test]"
|
121
|
-
|
122
|
-
# Run tests
|
123
|
-
uv run pytest
|
124
|
-
```
|
@@ -1,17 +1,17 @@
|
|
1
1
|
chunkr_ai/__init__.py,sha256=eXygrEhGxxIHXNYIlHF2eied8rGsx2RphgR8Wo4lRyo,110
|
2
2
|
chunkr_ai/main.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
-
chunkr_ai/models.py,sha256=
|
3
|
+
chunkr_ai/models.py,sha256=d-B4vfgZClJOoHdPaH3vagwUc4qxeQSmUxab77DKYtQ,874
|
4
4
|
chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
5
|
chunkr_ai/api/api.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
6
|
chunkr_ai/api/auth.py,sha256=iSd5Jek2BFaHGw9HY-RrqgwP56BHFU0xbSuJS4fU6AA,425
|
7
7
|
chunkr_ai/api/base.py,sha256=WDHx8tU0fl9_-yvYTKL-U0uaxHv-8_bRfiw9Xkl-mWM,6499
|
8
8
|
chunkr_ai/api/chunkr.py,sha256=LkBFzGB_T0y3fnBeIn_nwQW6Mb7eZO-iTlzWrmWBoko,3450
|
9
9
|
chunkr_ai/api/chunkr_async.py,sha256=B9deRVoe4h3Csh_jEuQxuxQ-DKSuZPdwkanFTyfHmeM,3603
|
10
|
-
chunkr_ai/api/config.py,sha256=
|
10
|
+
chunkr_ai/api/config.py,sha256=K0s1giImciPksu-bO9gzRwUaK2Vo1nxNKQkXlRQ2cb8,3785
|
11
11
|
chunkr_ai/api/protocol.py,sha256=XKS9RmtvBpJItYhPg18qlOCKpaSHdOuQTRSUxAdUz2g,479
|
12
|
-
chunkr_ai/api/task.py,sha256=
|
13
|
-
chunkr_ai-0.0.
|
14
|
-
chunkr_ai-0.0.
|
15
|
-
chunkr_ai-0.0.
|
16
|
-
chunkr_ai-0.0.
|
17
|
-
chunkr_ai-0.0.
|
12
|
+
chunkr_ai/api/task.py,sha256=ALU-rYlObbitlM1MKEFeSz_IBUpzb9736Iqu9huWg7c,4392
|
13
|
+
chunkr_ai-0.0.5.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
14
|
+
chunkr_ai-0.0.5.dist-info/METADATA,sha256=Roj63O2Ms3D1vNfgEmnCYAJESFrOQ9nnsSlyXkvORU4,4806
|
15
|
+
chunkr_ai-0.0.5.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
16
|
+
chunkr_ai-0.0.5.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
|
17
|
+
chunkr_ai-0.0.5.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|