chunkr-ai 0.0.1__tar.gz → 0.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunkr_ai-0.0.2/PKG-INFO +16 -0
- chunkr_ai-0.0.2/pyproject.toml +25 -0
- chunkr_ai-0.0.2/src/chunkr_ai/__init__.py +4 -0
- chunkr_ai-0.0.2/src/chunkr_ai/api/auth.py +14 -0
- chunkr_ai-0.0.2/src/chunkr_ai/api/chunkr.py +125 -0
- chunkr_ai-0.0.2/src/chunkr_ai/api/chunkr_async.py +39 -0
- chunkr_ai-0.0.2/src/chunkr_ai/api/models.py +231 -0
- chunkr_ai-0.0.2/src/chunkr_ai.egg-info/PKG-INFO +16 -0
- chunkr_ai-0.0.2/src/chunkr_ai.egg-info/SOURCES.txt +16 -0
- chunkr_ai-0.0.2/src/chunkr_ai.egg-info/requires.txt +10 -0
- chunkr_ai-0.0.2/tests/test_chunkr.py +69 -0
- chunkr_ai-0.0.1/PKG-INFO +0 -7
- chunkr_ai-0.0.1/pyproject.toml +0 -11
- chunkr_ai-0.0.1/src/chunkr_ai.egg-info/PKG-INFO +0 -7
- chunkr_ai-0.0.1/src/chunkr_ai.egg-info/SOURCES.txt +0 -9
- {chunkr_ai-0.0.1 → chunkr_ai-0.0.2}/LICENSE +0 -0
- {chunkr_ai-0.0.1 → chunkr_ai-0.0.2}/README.md +0 -0
- {chunkr_ai-0.0.1 → chunkr_ai-0.0.2}/setup.cfg +0 -0
- /chunkr_ai-0.0.1/src/chunkr_ai/__init__.py → /chunkr_ai-0.0.2/src/chunkr_ai/api/api.py +0 -0
- {chunkr_ai-0.0.1 → chunkr_ai-0.0.2}/src/chunkr_ai/main.py +0 -0
- {chunkr_ai-0.0.1 → chunkr_ai-0.0.2}/src/chunkr_ai.egg-info/dependency_links.txt +0 -0
- {chunkr_ai-0.0.1 → chunkr_ai-0.0.2}/src/chunkr_ai.egg-info/top_level.txt +0 -0
chunkr_ai-0.0.2/PKG-INFO
ADDED
@@ -0,0 +1,16 @@
+Metadata-Version: 2.2
+Name: chunkr-ai
+Version: 0.0.2
+Summary: Python client for chunkr: open source document intelligence
+Author-email: Ishaan Kapoor <ishaan@lumina.sh>
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: build>=1.2.2.post1
+Requires-Dist: httpx>=0.28.1
+Requires-Dist: pillow>=11.1.0
+Requires-Dist: pydantic>=2.10.4
+Requires-Dist: python-dotenv>=1.0.1
+Requires-Dist: requests>=2.32.3
+Requires-Dist: twine>=6.0.1
+Provides-Extra: test
+Requires-Dist: pytest>=8.3.4; extra == "test"
chunkr_ai-0.0.2/pyproject.toml
ADDED
@@ -0,0 +1,25 @@
+[build-system]
+requires = ["setuptools>=42", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "chunkr-ai"
+version = "0.0.2"
+authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
+description = "Python client for chunkr: open source document intelligence"
+readme = "README.md"
+license = {"file" = "LICENSE"}
+dependencies = [
+    "build>=1.2.2.post1",
+    "httpx>=0.28.1",
+    "pillow>=11.1.0",
+    "pydantic>=2.10.4",
+    "python-dotenv>=1.0.1",
+    "requests>=2.32.3",
+    "twine>=6.0.1",
+]
+
+[project.optional-dependencies]
+test = [
+    "pytest>=8.3.4",
+]
chunkr_ai-0.0.2/src/chunkr_ai/api/auth.py
ADDED
@@ -0,0 +1,14 @@
+from typing import Optional
+
+class HeadersMixin:
+    """Mixin class for handling authorization headers"""
+
+    def get_api_key(self) -> str:
+        """Get the API key"""
+        if not hasattr(self, '_api_key') or not self._api_key:
+            raise ValueError("API key not set")
+        return self._api_key
+
+    def _headers(self) -> dict:
+        """Generate authorization headers"""
+        return {"Authorization": self.get_api_key()}
chunkr_ai-0.0.2/src/chunkr_ai/api/chunkr.py
ADDED
@@ -0,0 +1,125 @@
+from .models import TaskResponse, Configuration
+from .auth import HeadersMixin
+from dotenv import load_dotenv
+import io
+import os
+from pathlib import Path
+from PIL import Image
+import requests
+from typing import Union, BinaryIO, Tuple
+
+class Chunkr(HeadersMixin):
+    """Client for interacting with the Chunkr API."""
+
+    def __init__(self, url: str = None, api_key: str = None):
+        load_dotenv()
+        self.url = (
+            url or
+            os.getenv('CHUNKR_URL') or
+            'https://api.chunkr.ai'
+        )
+        self._api_key = (
+            api_key or
+            os.getenv('CHUNKR_API_KEY')
+        )
+        if not self._api_key:
+            raise ValueError("API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai")
+
+        self.url = self.url.rstrip("/")
+
+    def _prepare_file(
+        self,
+        file: Union[str, BinaryIO, Image.Image, bytes, io.BytesIO]
+    ) -> Tuple[str, BinaryIO]:
+        """Convert various file types into a tuple of (filename, file-like object).
+
+        Args:
+            file: Input file in various formats
+
+        Returns:
+            Tuple[str, BinaryIO]: Filename and file-like object ready for upload
+        """
+        if isinstance(file, str):
+            path = Path(file).resolve()
+            if not path.exists():
+                raise FileNotFoundError(f"File not found: {file}")
+            return path.name, path.open("rb")
+        elif isinstance(file, Image.Image):
+            img_byte_arr = io.BytesIO()
+            file.save(img_byte_arr, format=file.format or 'PNG')
+            img_byte_arr.seek(0)
+            return "image.png", img_byte_arr
+        elif isinstance(file, bytes):
+            return "document", io.BytesIO(file)
+        elif isinstance(file, io.BytesIO):
+            return "document", file
+        else:
+            return "document", file
+
+    def upload(self, file: Union[str, BinaryIO, Image.Image, bytes, io.BytesIO], config: Configuration = None) -> TaskResponse:
+        """Upload a file and wait for processing to complete.
+
+        The file can be one of:
+        - str: Path to a file on disk
+        - BinaryIO: A file-like object (e.g., opened with 'rb' mode)
+        - Image.Image: A PIL/Pillow Image object
+        - bytes: Raw binary data
+        - io.BytesIO: A binary stream in memory
+
+        Args:
+            file: The file to upload.
+            config:
+                Configuration options for processing. Optional.
+
+        Returns:
+            TaskResponse: The completed task response
+        """
+        return self.start_upload(file, config).poll()
+
+    def start_upload(self, file: Union[str, BinaryIO, Image.Image, bytes, io.BytesIO], config: Configuration = None) -> TaskResponse:
+        """Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`
+
+        The file can be one of:
+        - str: Path to a file on disk
+        - BinaryIO: A file-like object (e.g., opened with 'rb' mode)
+        - Image.Image: A PIL/Pillow Image object
+        - bytes: Raw binary data
+        - io.BytesIO: A binary stream in memory
+
+        Args:
+            file: The file to upload.
+            config (Configuration, optional): Configuration options for processing
+
+        Returns:
+            TaskResponse: The initial task response
+
+        Raises:
+            requests.exceptions.HTTPError: If the API request fails
+        """
+        url = f"{self.url}/api/v1/task"
+        filename, file_obj = self._prepare_file(file)
+
+        files = {"file": (filename, file_obj)}
+        r = requests.post(
+            url,
+            files=files,
+            json=config.dict() if config else {},
+            headers=self._headers()
+        )
+        r.raise_for_status()
+        return TaskResponse(**r.json()).with_api_key(self._api_key)
+
+    def get_task(self, task_id: str) -> TaskResponse:
+        """Get a task response by its ID.
+
+        Args:
+            task_id (str): The ID of the task to get
+
+        Returns:
+            TaskResponse: The task response
+        """
+        url = f"{self.url}/api/v1/task/{task_id}"
+        r = requests.get(url, headers=self._headers())
+        r.raise_for_status()
+        return TaskResponse(**r.json()).with_api_key(self._api_key)
+
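As a usage sketch (not part of the released files): the synchronous client added above can be exercised roughly like this, assuming the package is installed; the API key and file path are placeholders.

    from chunkr_ai import Chunkr

    # Placeholder key; Chunkr() also falls back to CHUNKR_API_KEY from the environment or a .env file.
    chunkr = Chunkr(api_key="YOUR_API_KEY")

    # upload() blocks until processing finishes; start_upload() returns a task you can poll() yourself.
    task = chunkr.upload("example.pdf")
    print(task.markdown())  # concatenated markdown across all segments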
chunkr_ai-0.0.2/src/chunkr_ai/api/chunkr_async.py
ADDED
@@ -0,0 +1,39 @@
+from .chunkr import Chunkr
+from .models import TaskResponse, Configuration
+import httpx
+import io
+from PIL import Image
+from typing import Union, BinaryIO
+
+class ChunkrAsync(Chunkr):
+    """Async client for interacting with the Chunkr API.
+
+    This class inherits from the Chunkr class but works with async HTTP requests.
+    """
+
+    async def upload(self, file: Union[str, BinaryIO, Image.Image, bytes, io.BytesIO], config: Configuration = None) -> TaskResponse:
+        task = await self.start_upload(file, config)
+        return await task.poll_async()
+
+    async def start_upload(self, file: Union[str, BinaryIO, Image.Image, bytes, io.BytesIO], config: Configuration = None) -> TaskResponse:
+        url = f"{self.url}/api/v1/task"
+        filename, file_obj = self._prepare_file(file)
+        async with httpx.AsyncClient() as client:
+            files = {"file": (filename, file_obj)}
+            r = await client.post(
+                url,
+                files=files,
+                json=config.dict() if config else {},
+                headers=self._headers()
+            )
+            r.raise_for_status()
+            return TaskResponse(**r.json()).with_api_key(self._api_key)
+
+    async def get_task(self, task_id: str) -> TaskResponse:
+        url = f"{self.url}/api/v1/task/{task_id}"
+        async with httpx.AsyncClient() as client:
+            r = await client.get(url, headers=self._headers())
+            r.raise_for_status()
+            return TaskResponse(**r.json()).with_api_key(self._api_key)
+
+
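A corresponding sketch for the async client, again with a placeholder key and file path; here `upload` awaits `start_upload` and then `poll_async`.

    import asyncio
    from chunkr_ai import ChunkrAsync

    async def main():
        chunkr = ChunkrAsync(api_key="YOUR_API_KEY")  # placeholder key
        task = await chunkr.upload("example.pdf")     # start_upload + poll_async
        print(task.content())

    asyncio.run(main())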
chunkr_ai-0.0.2/src/chunkr_ai/api/models.py
ADDED
@@ -0,0 +1,231 @@
+from .auth import HeadersMixin
+import asyncio
+from datetime import datetime
+from enum import Enum
+import httpx
+from pydantic import BaseModel, Field, PrivateAttr
+import requests
+import time
+from typing import Optional, List, Dict, Union
+
+class GenerationStrategy(str, Enum):
+    LLM = "LLM"
+    AUTO = "Auto"
+
+class CroppingStrategy(str, Enum):
+    ALL = "All"
+    AUTO = "Auto"
+
+class LlmConfig(BaseModel):
+    model: str
+    prompt: str
+    temperature: float = 0.0
+
+class AutoGenerationConfig(BaseModel):
+    html: GenerationStrategy = GenerationStrategy.AUTO
+    llm: Optional[LlmConfig] = None
+    markdown: GenerationStrategy = GenerationStrategy.AUTO
+    crop_image: CroppingStrategy = CroppingStrategy.ALL
+
+class LlmGenerationConfig(BaseModel):
+    html: GenerationStrategy = GenerationStrategy.LLM
+    llm: Optional[LlmConfig] = None
+    markdown: GenerationStrategy = GenerationStrategy.LLM
+    crop_image: CroppingStrategy = CroppingStrategy.ALL
+
+class SegmentProcessing(BaseModel):
+    title: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
+    section_header: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
+    text: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
+    list_item: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
+    table: LlmGenerationConfig = Field(default_factory=LlmGenerationConfig)
+    picture: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
+    caption: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
+    formula: LlmGenerationConfig = Field(default_factory=LlmGenerationConfig)
+    footnote: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
+    page_header: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
+    page_footer: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
+    page: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
+
+class ChunkProcessing(BaseModel):
+    target_length: int = 512
+
+class Property(BaseModel):
+    name: str
+    title: Optional[str]
+    prop_type: str
+    description: Optional[str]
+    default: Optional[str]
+
+class JsonSchema(BaseModel):
+    title: str
+    properties: List[Property]
+    schema_type: Optional[str]
+
+class OcrStrategy(str, Enum):
+    ALL = "All"
+    AUTO = "Auto"
+
+class SegmentationStrategy(str, Enum):
+    LAYOUT_ANALYSIS = "LayoutAnalysis"
+    PAGE = "Page"
+
+class BoundingBox(BaseModel):
+    left: float
+    top: float
+    width: float
+    height: float
+
+class OCRResult(BaseModel):
+    bbox: BoundingBox
+    text: str
+    confidence: Optional[float]
+
+class SegmentType(str, Enum):
+    CAPTION = "Caption"
+    FOOTNOTE = "Footnote"
+    FORMULA = "Formula"
+    LIST_ITEM = "ListItem"
+    PAGE = "Page"
+    PAGE_FOOTER = "PageFooter"
+    PAGE_HEADER = "PageHeader"
+    PICTURE = "Picture"
+    SECTION_HEADER = "SectionHeader"
+    TABLE = "Table"
+    TEXT = "Text"
+    TITLE = "Title"
+
+class Segment(BaseModel):
+    bbox: BoundingBox
+    content: str
+    page_height: float
+    html: Optional[str]
+    image: Optional[str]
+    markdown: Optional[str]
+    ocr: List[OCRResult]
+    page_number: int
+    page_width: float
+    segment_id: str
+    segment_type: SegmentType
+
+class Chunk(BaseModel):
+    chunk_id: str
+    chunk_length: int
+    segments: List[Segment]
+
+class ExtractedJson(BaseModel):
+    data: Dict
+
+class OutputResponse(BaseModel):
+    chunks: List[Chunk] = []
+    extracted_json: Optional[ExtractedJson]
+
+class Model(str, Enum):
+    FAST = "Fast"
+    HIGH_QUALITY = "HighQuality"
+
+class Configuration(BaseModel):
+    chunk_processing: ChunkProcessing = Field(default_factory=ChunkProcessing)
+    expires_in: Optional[int] = None
+    high_resolution: bool = False
+    json_schema: Optional[JsonSchema] = None
+    model: Optional[Model] = Field(None, deprecated=True)
+    ocr_strategy: OcrStrategy = OcrStrategy.AUTO
+    segment_processing: SegmentProcessing = Field(default_factory=SegmentProcessing)
+    segmentation_strategy: SegmentationStrategy = SegmentationStrategy.LAYOUT_ANALYSIS
+    target_chunk_length: Optional[int] = Field(None, deprecated=True)
+
+
+class Status(str, Enum):
+    STARTING = "Starting"
+    PROCESSING = "Processing"
+    SUCCEEDED = "Succeeded"
+    FAILED = "Failed"
+
+class TaskResponse(BaseModel, HeadersMixin):
+    configuration: Configuration
+    created_at: datetime
+    expires_at: Optional[datetime]
+    file_name: Optional[str]
+    finished_at: Optional[datetime]
+    input_file_url: Optional[str]
+    message: str
+    output: Optional[OutputResponse]
+    page_count: Optional[int]
+    pdf_url: Optional[str]
+    status: Status
+    task_id: str
+    task_url: Optional[str]
+    _api_key: Optional[str] = PrivateAttr(default=None)
+
+    def with_api_key(self, api_key: str) -> 'TaskResponse':
+        """Helper function to set api key on a TaskResponse after creation"""
+        self._api_key = api_key
+        return self
+
+    def poll(self) -> 'TaskResponse':
+        """Poll the task for completion"""
+        if not self.task_url:
+            raise ValueError("Task URL not found in response")
+
+        while True:
+            r = requests.get(self.task_url, headers=self._headers())
+            r.raise_for_status()
+            self.__dict__.update(r.json())
+            if self.status == "Failed":
+                raise ValueError(self.message)
+            if self.status not in ("Starting", "Processing"):
+                return self
+            time.sleep(0.5)
+
+    async def poll_async(self) -> 'TaskResponse':
+        """Async poll the task for completion"""
+        if not self.task_url:
+            raise ValueError("Task URL not found in response")
+
+        async with httpx.AsyncClient() as client:
+            while True:
+                r = await client.get(self.task_url, headers=self._headers())
+                r.raise_for_status()
+                self.__dict__.update(r.json())
+                if self.status == "Failed":
+                    raise ValueError(self.message)
+                if self.status not in ("Starting", "Processing"):
+                    return self
+                await asyncio.sleep(0.5)
+
+
+    def _get_content(self, content_type: str) -> str:
+        """Helper method to get either HTML, Markdown, or raw content."""
+        if not self.output:
+            return ""
+        parts = []
+        for c in self.output.chunks:
+            for s in c.segments:
+                content = getattr(s, content_type)
+                if content:
+                    parts.append(content)
+        return "\n".join(parts)
+
+    def html(self) -> str:
+        """Get full HTML for the task"""
+        return self._get_content("html")
+
+    def markdown(self) -> str:
+        """Get full markdown for the task"""
+        return self._get_content("markdown")
+
+    def content(self) -> str:
+        """Get full text for the task"""
+        return self._get_content("content")
+
+class TaskPayload(BaseModel):
+    current_configuration: Configuration
+    file_name: str
+    image_folder_location: str
+    input_location: str
+    output_location: str
+    pdf_location: str
+    previous_configuration: Optional[Configuration]
+    task_id: str
+    user_id: str
chunkr_ai-0.0.2/src/chunkr_ai.egg-info/PKG-INFO
ADDED
@@ -0,0 +1,16 @@
+Metadata-Version: 2.2
+Name: chunkr-ai
+Version: 0.0.2
+Summary: Python client for chunkr: open source document intelligence
+Author-email: Ishaan Kapoor <ishaan@lumina.sh>
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: build>=1.2.2.post1
+Requires-Dist: httpx>=0.28.1
+Requires-Dist: pillow>=11.1.0
+Requires-Dist: pydantic>=2.10.4
+Requires-Dist: python-dotenv>=1.0.1
+Requires-Dist: requests>=2.32.3
+Requires-Dist: twine>=6.0.1
+Provides-Extra: test
+Requires-Dist: pytest>=8.3.4; extra == "test"
chunkr_ai-0.0.2/src/chunkr_ai.egg-info/SOURCES.txt
ADDED
@@ -0,0 +1,16 @@
+LICENSE
+README.md
+pyproject.toml
+src/chunkr_ai/__init__.py
+src/chunkr_ai/main.py
+src/chunkr_ai.egg-info/PKG-INFO
+src/chunkr_ai.egg-info/SOURCES.txt
+src/chunkr_ai.egg-info/dependency_links.txt
+src/chunkr_ai.egg-info/requires.txt
+src/chunkr_ai.egg-info/top_level.txt
+src/chunkr_ai/api/api.py
+src/chunkr_ai/api/auth.py
+src/chunkr_ai/api/chunkr.py
+src/chunkr_ai/api/chunkr_async.py
+src/chunkr_ai/api/models.py
+tests/test_chunkr.py
chunkr_ai-0.0.2/tests/test_chunkr.py
ADDED
@@ -0,0 +1,69 @@
+import pytest
+import os
+from pathlib import Path
+from PIL import Image
+import io
+from chunkr_ai import Chunkr, ChunkrAsync
+from chunkr_ai.api.models import TaskResponse
+
+# Test fixtures
+@pytest.fixture
+def chunkr():
+    return Chunkr()
+
+@pytest.fixture
+def async_chunkr():
+    return ChunkrAsync()
+
+@pytest.fixture
+def sample_pdf():
+    # Create a temporary PDF file for testing
+    content = b"%PDF-1.4 test content"
+    pdf_path = Path("test_document.pdf")
+    pdf_path.write_bytes(content)
+    yield str(pdf_path)
+    pdf_path.unlink() # Cleanup after tests
+
+@pytest.fixture
+def sample_image():
+    # Create a test image
+    img = Image.new('RGB', (100, 100), color='red')
+    return img
+
+def test_prepare_file_string(chunkr, sample_pdf):
+    filename, file_obj = chunkr._prepare_file(sample_pdf)
+    assert filename == "test_document.pdf"
+    assert hasattr(file_obj, 'read')
+
+def test_prepare_file_image(chunkr, sample_image):
+    filename, file_obj = chunkr._prepare_file(sample_image)
+    assert filename == "image.png"
+    assert isinstance(file_obj, io.BytesIO)
+
+def test_prepare_file_bytes(chunkr):
+    test_bytes = b"test content"
+    filename, file_obj = chunkr._prepare_file(test_bytes)
+    assert filename == "document"
+    assert isinstance(file_obj, io.BytesIO)
+
+def test_send_file_string(chunkr, sample_pdf):
+    response = chunkr.upload(sample_pdf)
+
+    assert isinstance(response, TaskResponse)
+    assert response.task_id is not None
+    assert response.status in ["pending", "processing", "completed"]
+
+def test_send_file_image(chunkr, sample_image):
+    response = chunkr.upload(sample_image)
+
+    assert isinstance(response, TaskResponse)
+    assert response.task_id is not None
+    assert response.status in ["pending", "processing", "completed"]
+
+def test_send_file_bytes(chunkr):
+    test_bytes = b"This is a test document content"
+    response = chunkr.upload(test_bytes)
+
+    assert isinstance(response, TaskResponse)
+    assert response.task_id is not None
+    assert response.status in ["pending", "processing", "completed"]
chunkr_ai-0.0.1/PKG-INFO
DELETED
chunkr_ai-0.0.1/pyproject.toml
DELETED
@@ -1,11 +0,0 @@
-[build-system]
-requires = ["setuptools>=42", "wheel"]
-build-backend = "setuptools.build_meta"
-
-[project]
-name = "chunkr-ai"
-version = "0.0.1"
-authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
-description = "PDF chunking"
-readme = "README.md"
-license = {"file" = "LICENSE"}