chunkr-ai 0.0.1__tar.gz → 0.0.2__tar.gz

chunkr_ai-0.0.2/PKG-INFO ADDED
@@ -0,0 +1,16 @@
+ Metadata-Version: 2.2
+ Name: chunkr-ai
+ Version: 0.0.2
+ Summary: Python client for chunkr: open source document intelligence
+ Author-email: Ishaan Kapoor <ishaan@lumina.sh>
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: build>=1.2.2.post1
+ Requires-Dist: httpx>=0.28.1
+ Requires-Dist: pillow>=11.1.0
+ Requires-Dist: pydantic>=2.10.4
+ Requires-Dist: python-dotenv>=1.0.1
+ Requires-Dist: requests>=2.32.3
+ Requires-Dist: twine>=6.0.1
+ Provides-Extra: test
+ Requires-Dist: pytest>=8.3.4; extra == "test"
chunkr_ai-0.0.2/pyproject.toml ADDED
@@ -0,0 +1,25 @@
+ [build-system]
+ requires = ["setuptools>=42", "wheel"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "chunkr-ai"
+ version = "0.0.2"
+ authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
+ description = "Python client for chunkr: open source document intelligence"
+ readme = "README.md"
+ license = {"file" = "LICENSE"}
+ dependencies = [
+     "build>=1.2.2.post1",
+     "httpx>=0.28.1",
+     "pillow>=11.1.0",
+     "pydantic>=2.10.4",
+     "python-dotenv>=1.0.1",
+     "requests>=2.32.3",
+     "twine>=6.0.1",
+ ]
+
+ [project.optional-dependencies]
+ test = [
+     "pytest>=8.3.4",
+ ]
chunkr_ai-0.0.2/src/chunkr_ai/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from .api.chunkr import Chunkr
+ from .api.chunkr_async import ChunkrAsync
+
+ __all__ = ['Chunkr', 'ChunkrAsync']
chunkr_ai-0.0.2/src/chunkr_ai/api/auth.py ADDED
@@ -0,0 +1,14 @@
+ from typing import Optional
+
+ class HeadersMixin:
+     """Mixin class for handling authorization headers"""
+
+     def get_api_key(self) -> str:
+         """Get the API key"""
+         if not hasattr(self, '_api_key') or not self._api_key:
+             raise ValueError("API key not set")
+         return self._api_key
+
+     def _headers(self) -> dict:
+         """Generate authorization headers"""
+         return {"Authorization": self.get_api_key()}
chunkr_ai-0.0.2/src/chunkr_ai/api/chunkr.py ADDED
@@ -0,0 +1,125 @@
+ from .models import TaskResponse, Configuration
+ from .auth import HeadersMixin
+ from dotenv import load_dotenv
+ import io
+ import os
+ from pathlib import Path
+ from PIL import Image
+ import requests
+ from typing import Union, BinaryIO, Tuple
+
+ class Chunkr(HeadersMixin):
+     """Client for interacting with the Chunkr API."""
+
+     def __init__(self, url: str = None, api_key: str = None):
+         load_dotenv()
+         self.url = (
+             url or
+             os.getenv('CHUNKR_URL') or
+             'https://api.chunkr.ai'
+         )
+         self._api_key = (
+             api_key or
+             os.getenv('CHUNKR_API_KEY')
+         )
+         if not self._api_key:
+             raise ValueError("API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai")
+
+         self.url = self.url.rstrip("/")
+
+     def _prepare_file(
+         self,
+         file: Union[str, BinaryIO, Image.Image, bytes, io.BytesIO]
+     ) -> Tuple[str, BinaryIO]:
+         """Convert various file types into a tuple of (filename, file-like object).
+
+         Args:
+             file: Input file in various formats
+
+         Returns:
+             Tuple[str, BinaryIO]: Filename and file-like object ready for upload
+         """
+         if isinstance(file, str):
+             path = Path(file).resolve()
+             if not path.exists():
+                 raise FileNotFoundError(f"File not found: {file}")
+             return path.name, path.open("rb")
+         elif isinstance(file, Image.Image):
+             img_byte_arr = io.BytesIO()
+             file.save(img_byte_arr, format=file.format or 'PNG')
+             img_byte_arr.seek(0)
+             return "image.png", img_byte_arr
+         elif isinstance(file, bytes):
+             return "document", io.BytesIO(file)
+         elif isinstance(file, io.BytesIO):
+             return "document", file
+         else:
+             return "document", file
+
+     def upload(self, file: Union[str, BinaryIO, Image.Image, bytes, io.BytesIO], config: Configuration = None) -> TaskResponse:
+         """Upload a file and wait for processing to complete.
+
+         The file can be one of:
+         - str: Path to a file on disk
+         - BinaryIO: A file-like object (e.g., opened with 'rb' mode)
+         - Image.Image: A PIL/Pillow Image object
+         - bytes: Raw binary data
+         - io.BytesIO: A binary stream in memory
+
+         Args:
+             file: The file to upload.
+             config:
+                 Configuration options for processing. Optional.
+
+         Returns:
+             TaskResponse: The completed task response
+         """
+         return self.start_upload(file, config).poll()
+
+     def start_upload(self, file: Union[str, BinaryIO, Image.Image, bytes, io.BytesIO], config: Configuration = None) -> TaskResponse:
+         """Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`
+
+         The file can be one of:
+         - str: Path to a file on disk
+         - BinaryIO: A file-like object (e.g., opened with 'rb' mode)
+         - Image.Image: A PIL/Pillow Image object
+         - bytes: Raw binary data
+         - io.BytesIO: A binary stream in memory
+
+         Args:
+             file: The file to upload.
+             config (Configuration, optional): Configuration options for processing
+
+         Returns:
+             TaskResponse: The initial task response
+
+         Raises:
+             requests.exceptions.HTTPError: If the API request fails
+         """
+         url = f"{self.url}/api/v1/task"
+         filename, file_obj = self._prepare_file(file)
+
+         files = {"file": (filename, file_obj)}
+         r = requests.post(
+             url,
+             files=files,
+             json=config.dict() if config else {},
+             headers=self._headers()
+         )
+         r.raise_for_status()
+         return TaskResponse(**r.json()).with_api_key(self._api_key)
+
+     def get_task(self, task_id: str) -> TaskResponse:
+         """Get a task response by its ID.
+
+         Args:
+             task_id (str): The ID of the task to get
+
+         Returns:
+             TaskResponse: The task response
+         """
+         url = f"{self.url}/api/v1/task/{task_id}"
+         r = requests.get(url, headers=self._headers())
+         r.raise_for_status()
+         return TaskResponse(**r.json()).with_api_key(self._api_key)
+
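
A minimal usage sketch for the synchronous client added above (not part of the package diff). It assumes CHUNKR_API_KEY is set in the environment or a .env file, and that "example.pdf" is a placeholder path:

from chunkr_ai import Chunkr

# Reads CHUNKR_API_KEY (and optionally CHUNKR_URL) from the environment or a .env file.
chunkr = Chunkr()

# upload() blocks until processing finishes by calling poll() on the returned task.
task = chunkr.upload("example.pdf")  # hypothetical local file path
print(task.task_id, task.status)
print(task.markdown())  # concatenated markdown from all returned segments
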
chunkr_ai-0.0.2/src/chunkr_ai/api/chunkr_async.py ADDED
@@ -0,0 +1,39 @@
+ from .chunkr import Chunkr
+ from .models import TaskResponse, Configuration
+ import httpx
+ import io
+ from PIL import Image
+ from typing import Union, BinaryIO
+
+ class ChunkrAsync(Chunkr):
+     """Async client for interacting with the Chunkr API.
+
+     This class inherits from the Chunkr class but works with async HTTP requests.
+     """
+
+     async def upload(self, file: Union[str, BinaryIO, Image.Image, bytes, io.BytesIO], config: Configuration = None) -> TaskResponse:
+         task = await self.start_upload(file, config)
+         return await task.poll_async()
+
+     async def start_upload(self, file: Union[str, BinaryIO, Image.Image, bytes, io.BytesIO], config: Configuration = None) -> TaskResponse:
+         url = f"{self.url}/api/v1/task"
+         filename, file_obj = self._prepare_file(file)
+         async with httpx.AsyncClient() as client:
+             files = {"file": (filename, file_obj)}
+             r = await client.post(
+                 url,
+                 files=files,
+                 json=config.dict() if config else {},
+                 headers=self._headers()
+             )
+             r.raise_for_status()
+             return TaskResponse(**r.json()).with_api_key(self._api_key)
+
+     async def get_task(self, task_id: str) -> TaskResponse:
+         url = f"{self.url}/api/v1/task/{task_id}"
+         async with httpx.AsyncClient() as client:
+             r = await client.get(url, headers=self._headers())
+             r.raise_for_status()
+             return TaskResponse(**r.json()).with_api_key(self._api_key)
+
+
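
A sketch of how the async client above might be driven (again not part of the diff), under the same assumptions: API key in the environment, placeholder file path.

import asyncio
from chunkr_ai import ChunkrAsync

async def main():
    chunkr = ChunkrAsync()
    # upload() awaits start_upload() and then poll_async() until the task
    # leaves the Starting/Processing states.
    task = await chunkr.upload("example.pdf")  # hypothetical path
    print(task.html())

asyncio.run(main())
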
chunkr_ai-0.0.2/src/chunkr_ai/api/models.py ADDED
@@ -0,0 +1,231 @@
+ from .auth import HeadersMixin
+ import asyncio
+ from datetime import datetime
+ from enum import Enum
+ import httpx
+ from pydantic import BaseModel, Field, PrivateAttr
+ import requests
+ import time
+ from typing import Optional, List, Dict, Union
+
+ class GenerationStrategy(str, Enum):
+     LLM = "LLM"
+     AUTO = "Auto"
+
+ class CroppingStrategy(str, Enum):
+     ALL = "All"
+     AUTO = "Auto"
+
+ class LlmConfig(BaseModel):
+     model: str
+     prompt: str
+     temperature: float = 0.0
+
+ class AutoGenerationConfig(BaseModel):
+     html: GenerationStrategy = GenerationStrategy.AUTO
+     llm: Optional[LlmConfig] = None
+     markdown: GenerationStrategy = GenerationStrategy.AUTO
+     crop_image: CroppingStrategy = CroppingStrategy.ALL
+
+ class LlmGenerationConfig(BaseModel):
+     html: GenerationStrategy = GenerationStrategy.LLM
+     llm: Optional[LlmConfig] = None
+     markdown: GenerationStrategy = GenerationStrategy.LLM
+     crop_image: CroppingStrategy = CroppingStrategy.ALL
+
+ class SegmentProcessing(BaseModel):
+     title: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
+     section_header: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
+     text: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
+     list_item: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
+     table: LlmGenerationConfig = Field(default_factory=LlmGenerationConfig)
+     picture: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
+     caption: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
+     formula: LlmGenerationConfig = Field(default_factory=LlmGenerationConfig)
+     footnote: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
+     page_header: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
+     page_footer: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
+     page: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
+
+ class ChunkProcessing(BaseModel):
+     target_length: int = 512
+
+ class Property(BaseModel):
+     name: str
+     title: Optional[str]
+     prop_type: str
+     description: Optional[str]
+     default: Optional[str]
+
+ class JsonSchema(BaseModel):
+     title: str
+     properties: List[Property]
+     schema_type: Optional[str]
+
+ class OcrStrategy(str, Enum):
+     ALL = "All"
+     AUTO = "Auto"
+
+ class SegmentationStrategy(str, Enum):
+     LAYOUT_ANALYSIS = "LayoutAnalysis"
+     PAGE = "Page"
+
+ class BoundingBox(BaseModel):
+     left: float
+     top: float
+     width: float
+     height: float
+
+ class OCRResult(BaseModel):
+     bbox: BoundingBox
+     text: str
+     confidence: Optional[float]
+
+ class SegmentType(str, Enum):
+     CAPTION = "Caption"
+     FOOTNOTE = "Footnote"
+     FORMULA = "Formula"
+     LIST_ITEM = "ListItem"
+     PAGE = "Page"
+     PAGE_FOOTER = "PageFooter"
+     PAGE_HEADER = "PageHeader"
+     PICTURE = "Picture"
+     SECTION_HEADER = "SectionHeader"
+     TABLE = "Table"
+     TEXT = "Text"
+     TITLE = "Title"
+
+ class Segment(BaseModel):
+     bbox: BoundingBox
+     content: str
+     page_height: float
+     html: Optional[str]
+     image: Optional[str]
+     markdown: Optional[str]
+     ocr: List[OCRResult]
+     page_number: int
+     page_width: float
+     segment_id: str
+     segment_type: SegmentType
+
+ class Chunk(BaseModel):
+     chunk_id: str
+     chunk_length: int
+     segments: List[Segment]
+
+ class ExtractedJson(BaseModel):
+     data: Dict
+
+ class OutputResponse(BaseModel):
+     chunks: List[Chunk] = []
+     extracted_json: Optional[ExtractedJson]
+
+ class Model(str, Enum):
+     FAST = "Fast"
+     HIGH_QUALITY = "HighQuality"
+
+ class Configuration(BaseModel):
+     chunk_processing: ChunkProcessing = Field(default_factory=ChunkProcessing)
+     expires_in: Optional[int] = None
+     high_resolution: bool = False
+     json_schema: Optional[JsonSchema] = None
+     model: Optional[Model] = Field(None, deprecated=True)
+     ocr_strategy: OcrStrategy = OcrStrategy.AUTO
+     segment_processing: SegmentProcessing = Field(default_factory=SegmentProcessing)
+     segmentation_strategy: SegmentationStrategy = SegmentationStrategy.LAYOUT_ANALYSIS
+     target_chunk_length: Optional[int] = Field(None, deprecated=True)
+
+
+ class Status(str, Enum):
+     STARTING = "Starting"
+     PROCESSING = "Processing"
+     SUCCEEDED = "Succeeded"
+     FAILED = "Failed"
+
+ class TaskResponse(BaseModel, HeadersMixin):
+     configuration: Configuration
+     created_at: datetime
+     expires_at: Optional[datetime]
+     file_name: Optional[str]
+     finished_at: Optional[datetime]
+     input_file_url: Optional[str]
+     message: str
+     output: Optional[OutputResponse]
+     page_count: Optional[int]
+     pdf_url: Optional[str]
+     status: Status
+     task_id: str
+     task_url: Optional[str]
+     _api_key: Optional[str] = PrivateAttr(default=None)
+
+     def with_api_key(self, api_key: str) -> 'TaskResponse':
+         """Helper function to set api key on a TaskResponse after creation"""
+         self._api_key = api_key
+         return self
+
+     def poll(self) -> 'TaskResponse':
+         """Poll the task for completion"""
+         if not self.task_url:
+             raise ValueError("Task URL not found in response")
+
+         while True:
+             r = requests.get(self.task_url, headers=self._headers())
+             r.raise_for_status()
+             self.__dict__.update(r.json())
+             if self.status == "Failed":
+                 raise ValueError(self.message)
+             if self.status not in ("Starting", "Processing"):
+                 return self
+             time.sleep(0.5)
+
+     async def poll_async(self) -> 'TaskResponse':
+         """Async poll the task for completion"""
+         if not self.task_url:
+             raise ValueError("Task URL not found in response")
+
+         async with httpx.AsyncClient() as client:
+             while True:
+                 r = await client.get(self.task_url, headers=self._headers())
+                 r.raise_for_status()
+                 self.__dict__.update(r.json())
+                 if self.status == "Failed":
+                     raise ValueError(self.message)
+                 if self.status not in ("Starting", "Processing"):
+                     return self
+                 await asyncio.sleep(0.5)
+
+
+     def _get_content(self, content_type: str) -> str:
+         """Helper method to get either HTML, Markdown, or raw content."""
+         if not self.output:
+             return ""
+         parts = []
+         for c in self.output.chunks:
+             for s in c.segments:
+                 content = getattr(s, content_type)
+                 if content:
+                     parts.append(content)
+         return "\n".join(parts)
+
+     def html(self) -> str:
+         """Get full HTML for the task"""
+         return self._get_content("html")
+
+     def markdown(self) -> str:
+         """Get full markdown for the task"""
+         return self._get_content("markdown")
+
+     def content(self) -> str:
+         """Get full text for the task"""
+         return self._get_content("content")
+
+ class TaskPayload(BaseModel):
+     current_configuration: Configuration
+     file_name: str
+     image_folder_location: str
+     input_location: str
+     output_location: str
+     pdf_location: str
+     previous_configuration: Optional[Configuration]
+     task_id: str
+     user_id: str
chunkr_ai-0.0.2/src/chunkr_ai.egg-info/PKG-INFO ADDED
@@ -0,0 +1,16 @@
+ Metadata-Version: 2.2
+ Name: chunkr-ai
+ Version: 0.0.2
+ Summary: Python client for chunkr: open source document intelligence
+ Author-email: Ishaan Kapoor <ishaan@lumina.sh>
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: build>=1.2.2.post1
+ Requires-Dist: httpx>=0.28.1
+ Requires-Dist: pillow>=11.1.0
+ Requires-Dist: pydantic>=2.10.4
+ Requires-Dist: python-dotenv>=1.0.1
+ Requires-Dist: requests>=2.32.3
+ Requires-Dist: twine>=6.0.1
+ Provides-Extra: test
+ Requires-Dist: pytest>=8.3.4; extra == "test"
chunkr_ai-0.0.2/src/chunkr_ai.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,16 @@
+ LICENSE
+ README.md
+ pyproject.toml
+ src/chunkr_ai/__init__.py
+ src/chunkr_ai/main.py
+ src/chunkr_ai.egg-info/PKG-INFO
+ src/chunkr_ai.egg-info/SOURCES.txt
+ src/chunkr_ai.egg-info/dependency_links.txt
+ src/chunkr_ai.egg-info/requires.txt
+ src/chunkr_ai.egg-info/top_level.txt
+ src/chunkr_ai/api/api.py
+ src/chunkr_ai/api/auth.py
+ src/chunkr_ai/api/chunkr.py
+ src/chunkr_ai/api/chunkr_async.py
+ src/chunkr_ai/api/models.py
+ tests/test_chunkr.py
chunkr_ai-0.0.2/src/chunkr_ai.egg-info/requires.txt ADDED
@@ -0,0 +1,10 @@
+ build>=1.2.2.post1
+ httpx>=0.28.1
+ pillow>=11.1.0
+ pydantic>=2.10.4
+ python-dotenv>=1.0.1
+ requests>=2.32.3
+ twine>=6.0.1
+
+ [test]
+ pytest>=8.3.4
chunkr_ai-0.0.2/tests/test_chunkr.py ADDED
@@ -0,0 +1,69 @@
+ import pytest
+ import os
+ from pathlib import Path
+ from PIL import Image
+ import io
+ from chunkr_ai import Chunkr, ChunkrAsync
+ from chunkr_ai.api.models import TaskResponse
+
+ # Test fixtures
+ @pytest.fixture
+ def chunkr():
+     return Chunkr()
+
+ @pytest.fixture
+ def async_chunkr():
+     return ChunkrAsync()
+
+ @pytest.fixture
+ def sample_pdf():
+     # Create a temporary PDF file for testing
+     content = b"%PDF-1.4 test content"
+     pdf_path = Path("test_document.pdf")
+     pdf_path.write_bytes(content)
+     yield str(pdf_path)
+     pdf_path.unlink()  # Cleanup after tests
+
+ @pytest.fixture
+ def sample_image():
+     # Create a test image
+     img = Image.new('RGB', (100, 100), color='red')
+     return img
+
+ def test_prepare_file_string(chunkr, sample_pdf):
+     filename, file_obj = chunkr._prepare_file(sample_pdf)
+     assert filename == "test_document.pdf"
+     assert hasattr(file_obj, 'read')
+
+ def test_prepare_file_image(chunkr, sample_image):
+     filename, file_obj = chunkr._prepare_file(sample_image)
+     assert filename == "image.png"
+     assert isinstance(file_obj, io.BytesIO)
+
+ def test_prepare_file_bytes(chunkr):
+     test_bytes = b"test content"
+     filename, file_obj = chunkr._prepare_file(test_bytes)
+     assert filename == "document"
+     assert isinstance(file_obj, io.BytesIO)
+
+ def test_send_file_string(chunkr, sample_pdf):
+     response = chunkr.upload(sample_pdf)
+
+     assert isinstance(response, TaskResponse)
+     assert response.task_id is not None
+     assert response.status in ["pending", "processing", "completed"]
+
+ def test_send_file_image(chunkr, sample_image):
+     response = chunkr.upload(sample_image)
+
+     assert isinstance(response, TaskResponse)
+     assert response.task_id is not None
+     assert response.status in ["pending", "processing", "completed"]
+
+ def test_send_file_bytes(chunkr):
+     test_bytes = b"This is a test document content"
+     response = chunkr.upload(test_bytes)
+
+     assert isinstance(response, TaskResponse)
+     assert response.task_id is not None
+     assert response.status in ["pending", "processing", "completed"]
chunkr_ai-0.0.1/PKG-INFO DELETED
@@ -1,7 +0,0 @@
- Metadata-Version: 2.1
- Name: chunkr-ai
- Version: 0.0.1
- Summary: PDF chunking
- Author-email: Ishaan Kapoor <ishaan@lumina.sh>
- Description-Content-Type: text/markdown
- License-File: LICENSE
chunkr_ai-0.0.1/pyproject.toml DELETED
@@ -1,11 +0,0 @@
- [build-system]
- requires = ["setuptools>=42", "wheel"]
- build-backend = "setuptools.build_meta"
-
- [project]
- name = "chunkr-ai"
- version = "0.0.1"
- authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
- description = "PDF chunking"
- readme = "README.md"
- license = {"file" = "LICENSE"}
chunkr_ai-0.0.1/src/chunkr_ai.egg-info/PKG-INFO DELETED
@@ -1,7 +0,0 @@
- Metadata-Version: 2.1
- Name: chunkr-ai
- Version: 0.0.1
- Summary: PDF chunking
- Author-email: Ishaan Kapoor <ishaan@lumina.sh>
- Description-Content-Type: text/markdown
- License-File: LICENSE
chunkr_ai-0.0.1/src/chunkr_ai.egg-info/SOURCES.txt DELETED
@@ -1,9 +0,0 @@
- LICENSE
- README.md
- pyproject.toml
- src/chunkr_ai/__init__.py
- src/chunkr_ai/main.py
- src/chunkr_ai.egg-info/PKG-INFO
- src/chunkr_ai.egg-info/SOURCES.txt
- src/chunkr_ai.egg-info/dependency_links.txt
- src/chunkr_ai.egg-info/top_level.txt