chunkr-ai 0.0.1__tar.gz → 0.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,124 @@
1
+ Metadata-Version: 2.2
2
+ Name: chunkr-ai
3
+ Version: 0.0.3
4
+ Summary: Python client for Chunkr: open source document intelligence
5
+ Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
+ Project-URL: Homepage, https://chunkr.ai
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE
9
+ Requires-Dist: httpx>=0.28.1
10
+ Requires-Dist: pillow>=11.1.0
11
+ Requires-Dist: pydantic>=2.10.4
12
+ Requires-Dist: python-dotenv>=1.0.1
13
+ Requires-Dist: requests>=2.32.3
14
+ Provides-Extra: test
15
+ Requires-Dist: pytest>=8.3.4; extra == "test"
16
+ Requires-Dist: pytest-xdist>=3.6.1; extra == "test"
17
+
18
+ # Chunkr Python Client
19
+
20
+ This is the Python client for the Chunkr API. It provides a simple interface to interact with Chunkr's services.
21
+
22
+ ## Installation
23
+
24
+ ```bash
25
+ pip install chunkr-ai
26
+ ```
27
+
28
+ ## Usage
29
+
30
+ We provide two clients: `Chunkr` for synchronous operations and `ChunkrAsync` for asynchronous operations.
31
+
32
+ ### Synchronous Usage
33
+
34
+ ```python
35
+ from chunkr_ai import Chunkr
36
+
37
+ # Initialize client
38
+ chunkr = Chunkr()
39
+
40
+ # Upload a file and wait for processing
41
+ task = chunkr.upload("document.pdf")
42
+
43
+ # Print the response
44
+ print(task)
45
+
46
+ # Get output from task
47
+ output = task.output
48
+
49
+ # If you want to upload without waiting for processing
50
+ task = chunkr.start_upload("document.pdf")
51
+ # ... do other things ...
52
+ task.poll() # Block until processing finishes
53
+ ```
54
+
55
+ ### Asynchronous Usage
56
+
57
+ ```python
58
+ from chunkr_ai import ChunkrAsync
59
+
60
+ async def process_document():
61
+ # Initialize client
62
+ chunkr = ChunkrAsync()
63
+
64
+ # Upload a file and wait for processing
65
+ task = await chunkr.upload("document.pdf")
66
+
67
+ # Print the response
68
+ print(task)
69
+
70
+ # Get output from task
71
+ output = task.output
72
+
73
+ # If you want to upload without waiting for processing
74
+ task = await chunkr.start_upload("document.pdf")
75
+ # ... do other things ...
76
+ await task.poll_async() # Wait until processing finishes
77
+ ```
78
+
79
+ ### Additional Features
80
+
81
+ Both clients support various input types:
82
+
83
+ ```python
84
+ # Upload from file path
85
+ chunkr.upload("document.pdf")
86
+
87
+ # Upload from opened file
88
+ with open("document.pdf", "rb") as f:
89
+ chunkr.upload(f)
90
+
91
+ # Upload from URL
92
+ chunkr.upload("https://example.com/document.pdf")
93
+
94
+ # Upload from base64 string
95
+ chunkr.upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
96
+
97
+ # Upload an image
98
+ from PIL import Image
99
+ img = Image.open("photo.jpg")
100
+ chunkr.upload(img)
101
+ ```
102
+
103
+ ### Configuration
104
+
105
+ You can provide your API key and URL in several ways:
106
+ 1. Environment variables: `CHUNKR_API_KEY` and `CHUNKR_URL`
107
+ 2. `.env` file
108
+ 3. Direct initialization:
109
+ ```python
110
+ chunkr = Chunkr(
111
+ api_key="your-api-key",
112
+ url="https://api.chunkr.ai"
113
+ )
114
+ ```
115
+
116
+ ## Run tests
117
+
118
+ ```bash
119
+ # Install dependencies
120
+ uv pip install -e ".[test]"
121
+
122
+ # Run tests
123
+ uv run pytest
124
+ ```
@@ -0,0 +1,107 @@
1
+ # Chunkr Python Client
2
+
3
+ This is the Python client for the Chunkr API. It provides a simple interface to interact with Chunkr's services.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install chunkr-ai
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ We provide two clients: `Chunkr` for synchronous operations and `ChunkrAsync` for asynchronous operations.
14
+
15
+ ### Synchronous Usage
16
+
17
+ ```python
18
+ from chunkr_ai import Chunkr
19
+
20
+ # Initialize client
21
+ chunkr = Chunkr()
22
+
23
+ # Upload a file and wait for processing
24
+ task = chunkr.upload("document.pdf")
25
+
26
+ # Print the response
27
+ print(task)
28
+
29
+ # Get output from task
30
+ output = task.output
31
+
32
+ # If you want to upload without waiting for processing
33
+ task = chunkr.start_upload("document.pdf")
34
+ # ... do other things ...
35
+ task.poll() # Block until processing finishes
36
+ ```
37
+
38
+ ### Asynchronous Usage
39
+
40
+ ```python
41
+ from chunkr_ai import ChunkrAsync
42
+
43
+ async def process_document():
44
+ # Initialize client
45
+ chunkr = ChunkrAsync()
46
+
47
+ # Upload a file and wait for processing
48
+ task = await chunkr.upload("document.pdf")
49
+
50
+ # Print the response
51
+ print(task)
52
+
53
+ # Get output from task
54
+ output = task.output
55
+
56
+ # If you want to upload without waiting for processing
57
+ task = await chunkr.start_upload("document.pdf")
58
+ # ... do other things ...
59
+ await task.poll_async() # Wait until processing finishes
60
+ ```
61
+
62
+ ### Additional Features
63
+
64
+ Both clients support various input types:
65
+
66
+ ```python
67
+ # Upload from file path
68
+ chunkr.upload("document.pdf")
69
+
70
+ # Upload from opened file
71
+ with open("document.pdf", "rb") as f:
72
+ chunkr.upload(f)
73
+
74
+ # Upload from URL
75
+ chunkr.upload("https://example.com/document.pdf")
76
+
77
+ # Upload from base64 string
78
+ chunkr.upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
79
+
80
+ # Upload an image
81
+ from PIL import Image
82
+ img = Image.open("photo.jpg")
83
+ chunkr.upload(img)
84
+ ```
85
+
86
+ ### Configuration
87
+
88
+ You can provide your API key and URL in several ways:
89
+ 1. Environment variables: `CHUNKR_API_KEY` and `CHUNKR_URL`
90
+ 2. `.env` file
91
+ 3. Direct initialization:
92
+ ```python
93
+ chunkr = Chunkr(
94
+ api_key="your-api-key",
95
+ url="https://api.chunkr.ai"
96
+ )
97
+ ```
98
+
99
+ ## Run tests
100
+
101
+ ```bash
102
+ # Install dependencies
103
+ uv pip install -e ".[test]"
104
+
105
+ # Run tests
106
+ uv run pytest
107
+ ```
@@ -0,0 +1,25 @@
1
+ [build-system]
2
+ requires = ["setuptools>=42", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "chunkr-ai"
7
+ version = "0.0.3"
8
+ authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
9
+ description = "Python client for Chunkr: open source document intelligence"
10
+ readme = "README.md"
11
+ license = {"file" = "LICENSE"}
12
+ urls = {Homepage = "https://chunkr.ai"}
13
+ dependencies = [
14
+ "httpx>=0.28.1",
15
+ "pillow>=11.1.0",
16
+ "pydantic>=2.10.4",
17
+ "python-dotenv>=1.0.1",
18
+ "requests>=2.32.3",
19
+ ]
20
+
21
+ [project.optional-dependencies]
22
+ test = [
23
+ "pytest>=8.3.4",
24
+ "pytest-xdist>=3.6.1",
25
+ ]
@@ -0,0 +1,4 @@
1
+ from .api.chunkr import Chunkr
2
+ from .api.chunkr_async import ChunkrAsync
3
+
4
+ __all__ = ['Chunkr', 'ChunkrAsync']
@@ -0,0 +1,12 @@
1
class HeadersMixin:
    """Mixin that turns an instance's ``_api_key`` into request headers."""

    def get_api_key(self) -> str:
        """Return the stored API key.

        Raises:
            ValueError: If ``_api_key`` is missing or empty.
        """
        key = getattr(self, '_api_key', None)
        if key:
            return key
        raise ValueError("API key not set")

    def _headers(self) -> dict:
        """Build the headers dict carrying the API key under ``Authorization``."""
        api_key = self.get_api_key()
        return {"Authorization": api_key}
@@ -0,0 +1,173 @@
1
+ from .config import Configuration
2
+ from .task import TaskResponse
3
+ from .auth import HeadersMixin
4
+ from abc import abstractmethod
5
+ from dotenv import load_dotenv
6
+ import io
7
+ import json
8
+ import os
9
+ from pathlib import Path
10
+ from PIL import Image
11
+ import requests
12
+ from typing import BinaryIO, Tuple, Union
13
+
14
class ChunkrBase(HeadersMixin):
    """Base class with shared functionality for Chunkr API clients.

    Resolves the service URL and API key, and converts the supported input
    kinds into the ``files``/``data`` dictionaries used for the upload
    request. ``upload``, ``start_upload`` and ``get_task`` are left to the
    concrete clients.
    """

    # MIME types accepted in ``data:<mime>;base64,<payload>`` inputs, mapped
    # to the extension used for the synthetic upload filename.
    _MIME_TO_EXT = {
        'application/pdf': 'pdf',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
        'application/msword': 'doc',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
        'application/vnd.ms-powerpoint': 'ppt',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
        'application/vnd.ms-excel': 'xls',
        'image/jpeg': 'jpg',
        'image/png': 'png',
        'image/jpg': 'jpg',
    }

    def __init__(self, url: str = None, api_key: str = None):
        """Resolve the API URL and key.

        Explicit arguments win over the ``CHUNKR_URL`` / ``CHUNKR_API_KEY``
        environment variables (``load_dotenv()`` is called first so a
        ``.env`` file can supply them). The URL falls back to
        ``https://api.chunkr.ai``.

        Raises:
            ValueError: If no API key can be found.
        """
        load_dotenv()
        self.url = (
            url or
            os.getenv('CHUNKR_URL') or
            'https://api.chunkr.ai'
        )
        self._api_key = (
            api_key or
            os.getenv('CHUNKR_API_KEY')
        )
        if not self._api_key:
            raise ValueError("API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai")

        # Endpoint paths are appended with a leading slash.
        self.url = self.url.rstrip("/")

    def _file_from_url(self, url: str) -> Tuple[str, BinaryIO]:
        """Download ``url`` and return (filename, in-memory file)."""
        from urllib.parse import urlparse

        response = requests.get(url)
        response.raise_for_status()
        # Take the name from the URL *path* only, so query strings and
        # fragments do not leak into the filename and a bare host does not
        # become the filename.
        filename = Path(urlparse(url).path).name or 'downloaded_file'
        return filename, io.BytesIO(response.content)

    def _file_from_data_uri(self, data_uri: str) -> Tuple[str, BinaryIO]:
        """Decode a ``data:<mime>;base64,<payload>`` string into (filename, file)."""
        import base64

        header, payload = data_uri.split(',', 1)
        mime_type = header.split(':')[-1].split(';')[0].lower()
        extension = self._MIME_TO_EXT.get(mime_type)
        if extension is None:
            # Raised outside the decode ``try`` so it is not re-labelled as a
            # base64 problem.
            raise ValueError(f"Unsupported MIME type: {mime_type}")
        try:
            raw = base64.b64decode(payload)
        except ValueError as exc:  # binascii.Error is a ValueError subclass
            raise ValueError(f"Invalid base64 string: {str(exc)}") from exc
        return f"file.{extension}", io.BytesIO(raw)

    def _prepare_file(
        self,
        file: Union[str, Path, BinaryIO, Image.Image]
    ) -> Tuple[str, BinaryIO]:
        """Convert various file types into a tuple of (filename, file-like object).

        Args:
            file: Input file, can be:
                - String or Path to a file
                - URL string starting with http:// or https://
                - Base64 string containing a ``;base64,`` MIME header
                - Opened binary file (mode='rb')
                - PIL/Pillow Image object

        Returns:
            Tuple[str, BinaryIO]: (filename, file-like object) ready for upload

        Raises:
            FileNotFoundError: If the file path doesn't exist
            TypeError: If the file type is not supported
            ValueError: If the base64 payload is invalid or its MIME type is
                unsupported
            requests.RequestException: If a URL cannot be downloaded (errors
                from ``requests.get`` / ``raise_for_status`` propagate as-is)
        """
        if isinstance(file, str):
            if file.startswith(('http://', 'https://')):
                return self._file_from_url(file)
            if ';base64,' in file:
                return self._file_from_data_uri(file)

        # Anything else that is a str/Path is treated as a filesystem path.
        if isinstance(file, (str, Path)):
            path = Path(file).resolve()
            if not path.exists():
                raise FileNotFoundError(f"File not found: {file}")
            # Read into memory and close the handle right away; returning an
            # open file here would leak it, since no caller closes it.
            with open(path, 'rb') as handle:
                return path.name, io.BytesIO(handle.read())

        if isinstance(file, Image.Image):
            buffer = io.BytesIO()
            image_format = file.format or 'PNG'
            file.save(buffer, format=image_format)
            buffer.seek(0)
            return f"image.{image_format.lower()}", buffer

        # Generic file-like objects are passed through untouched; the caller
        # keeps ownership of them.
        if hasattr(file, 'read') and hasattr(file, 'seek'):
            name = getattr(file, 'name', None)
            # ``name`` can be missing, empty, or an integer file descriptor.
            if not isinstance(name, (str, Path)):
                name = 'document'
            return Path(name).name or 'document', file

        raise TypeError(f"Unsupported file type: {type(file)}")

    def _prepare_upload_data(
        self,
        file: Union[str, Path, BinaryIO, Image.Image],
        config: Configuration = None
    ) -> Tuple[dict, dict]:
        """Prepare files and data dictionaries for upload.

        Args:
            file: The file to upload
            config: Optional configuration settings

        Returns:
            Tuple[dict, dict]: (files dict, data dict) ready for upload.
            Dict-valued configuration entries are JSON-encoded into ``files``
            as ``application/json`` parts; every other entry goes into
            ``data``.
        """
        filename, file_obj = self._prepare_file(file)
        files = {"file": (filename, file_obj)}
        data = {}

        if config:
            config_dict = config.model_dump(mode="json", exclude_none=True)
            for key, value in config_dict.items():
                if isinstance(value, dict):
                    files[key] = (None, json.dumps(value), 'application/json')
                else:
                    data[key] = value

        return files, data

    # NOTE(review): this class does not derive from ``abc.ABC``, so the
    # ``@abstractmethod`` markers below document intent but are not enforced
    # at instantiation time.

    @abstractmethod
    def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
        """Upload a file and wait for processing to complete.

        Must be implemented by subclasses.
        """
        pass

    @abstractmethod
    def start_upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
        """Upload a file for processing and immediately return the task response.

        Must be implemented by subclasses.
        """
        pass

    @abstractmethod
    def get_task(self, task_id: str) -> TaskResponse:
        """Get a task response by its ID.

        Must be implemented by subclasses.
        """
        pass
@@ -0,0 +1,108 @@
1
+ from .base import ChunkrBase
2
+ from .config import Configuration
3
+ from .task import TaskResponse
4
+ from pathlib import Path
5
+ from PIL import Image
6
+ import requests
7
+ from typing import Union, BinaryIO
8
+
9
class Chunkr(ChunkrBase):
    """Synchronous Chunkr API client built on a ``requests.Session``.

    Can be used as a context manager so the session is closed on exit::

        with Chunkr() as chunkr:
            task = chunkr.upload("document.pdf")
    """

    def __init__(self, url: str = None, api_key: str = None):
        """Resolve URL/API key via ``ChunkrBase`` and open an HTTP session."""
        super().__init__(url, api_key)
        self._session = requests.Session()

    def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
        """Upload a file and wait for processing to complete.

        Args:
            file: The file to upload.
            config: Configuration options for processing. Optional.

        Examples:
        ```
        # Upload from file path
        chunkr.upload("document.pdf")

        # Upload from URL
        chunkr.upload("https://example.com/document.pdf")

        # Upload from base64 string (must include MIME type header)
        chunkr.upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")

        # Upload from opened file
        with open("document.pdf", "rb") as f:
            chunkr.upload(f)

        # Upload an image
        from PIL import Image
        img = Image.open("photo.jpg")
        chunkr.upload(img)
        ```

        Returns:
            TaskResponse: The completed task response
        """
        task = self.start_upload(file, config)
        return task.poll()

    def start_upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
        """Upload a file for processing and immediately return the task response.

        It will not wait for processing to complete. To wait for the full
        processing to complete, use `task.poll()`.

        Args:
            file: The file to upload.
            config: Configuration options for processing. Optional.

        Examples:
        ```
        # Upload from file path
        task = chunkr.start_upload("document.pdf")

        # Upload from opened file
        with open("document.pdf", "rb") as f:
            task = chunkr.start_upload(f)

        # Upload from URL
        task = chunkr.start_upload("https://example.com/document.pdf")

        # Upload from base64 string (must include MIME type header)
        task = chunkr.start_upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")

        # Upload an image
        from PIL import Image
        img = Image.open("photo.jpg")
        task = chunkr.start_upload(img)

        # Wait for the task to complete - this can be done when needed
        task.poll()
        ```

        Returns:
            TaskResponse: The initial task response

        Raises:
            requests.HTTPError: If the API answers with an error status.
        """
        files, data = self._prepare_upload_data(file, config)
        r = self._session.post(
            f"{self.url}/api/v1/task",
            files=files,
            data=data,
            headers=self._headers()
        )
        r.raise_for_status()
        return TaskResponse(**r.json()).with_client(self)

    def get_task(self, task_id: str) -> TaskResponse:
        """Get a task response by its ID.

        Args:
            task_id: The ID of the task to get

        Returns:
            TaskResponse: The task response

        Raises:
            requests.HTTPError: If the API answers with an error status.
        """
        r = self._session.get(
            f"{self.url}/api/v1/task/{task_id}",
            headers=self._headers()
        )
        r.raise_for_status()
        return TaskResponse(**r.json()).with_client(self)

    def close(self) -> None:
        """Close the underlying ``requests.Session``."""
        self._session.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Mirrors ChunkrAsync.__aexit__: release HTTP resources on exit.
        self.close()
108
+
@@ -0,0 +1,105 @@
1
+ from .base import ChunkrBase
2
+ from .task import TaskResponse
3
+ from .config import Configuration
4
+ import httpx
5
+ from pathlib import Path
6
+ from PIL import Image
7
+ from typing import Union, BinaryIO
8
+
9
class ChunkrAsync(ChunkrBase):
    """Asynchronous Chunkr API client built on an ``httpx.AsyncClient``.

    Can be used as an async context manager so the HTTP client is closed on
    exit::

        async with ChunkrAsync() as chunkr:
            task = await chunkr.upload("document.pdf")
    """

    def __init__(self, url: str = None, api_key: str = None):
        """Resolve URL/API key via ``ChunkrBase`` and create the HTTP client."""
        super().__init__(url, api_key)
        self._client = httpx.AsyncClient()

    async def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
        """Upload a file and wait for processing to complete.

        Args:
            file: The file to upload.
            config: Configuration options for processing. Optional.

        Examples:
        ```python
        # Upload from file path
        await chunkr.upload("document.pdf")

        # Upload from opened file
        with open("document.pdf", "rb") as f:
            await chunkr.upload(f)

        # Upload from URL
        await chunkr.upload("https://example.com/document.pdf")

        # Upload from base64 string (must include MIME type header)
        await chunkr.upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")

        # Upload an image
        from PIL import Image
        img = Image.open("photo.jpg")
        await chunkr.upload(img)
        ```

        Returns:
            TaskResponse: The completed task response
        """
        task = await self.start_upload(file, config)
        return await task.poll_async()

    async def start_upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
        """Upload a file for processing and immediately return the task response.

        It will not wait for processing to complete. To wait for the full
        processing to complete, use `task.poll_async()`.

        Args:
            file: The file to upload.
            config: Configuration options for processing. Optional.

        Examples:
        ```
        # Upload from file path
        task = await chunkr.start_upload("document.pdf")

        # Upload from opened file
        with open("document.pdf", "rb") as f:
            task = await chunkr.start_upload(f)

        # Upload from URL
        task = await chunkr.start_upload("https://example.com/document.pdf")

        # Upload from base64 string (must include MIME type header)
        task = await chunkr.start_upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")

        # Upload an image
        from PIL import Image
        img = Image.open("photo.jpg")
        task = await chunkr.start_upload(img)

        # Wait for the task to complete - this can be done when needed
        await task.poll_async()
        ```

        Returns:
            TaskResponse: The initial task response

        Raises:
            httpx.HTTPStatusError: If the API answers with an error status.
        """
        files, data = self._prepare_upload_data(file, config)
        # Send the prepared form fields with the multipart body, exactly like
        # the synchronous client. A ``json=`` payload is not used alongside
        # ``files=``, so the scalar configuration values must go in ``data``.
        r = await self._client.post(
            f"{self.url}/api/v1/task",
            files=files,
            data=data,
            headers=self._headers()
        )
        r.raise_for_status()
        return TaskResponse(**r.json()).with_client(self)

    async def get_task(self, task_id: str) -> TaskResponse:
        """Get a task response by its ID.

        Args:
            task_id: The ID of the task to get

        Returns:
            TaskResponse: The task response

        Raises:
            httpx.HTTPStatusError: If the API answers with an error status.
        """
        r = await self._client.get(
            f"{self.url}/api/v1/task/{task_id}",
            headers=self._headers()
        )
        r.raise_for_status()
        return TaskResponse(**r.json()).with_client(self)

    async def close(self) -> None:
        """Close the underlying ``httpx.AsyncClient``."""
        await self._client.aclose()

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self._client.aclose()
@@ -0,0 +1,130 @@
1
+ from pydantic import BaseModel, Field
2
+ from enum import Enum
3
+ from typing import Optional, List, Dict
4
+
5
class GenerationStrategy(str, Enum):
    """String-valued choices used by ``GenerationConfig.html`` and ``.markdown``."""

    # Members are also ``str`` instances, so they compare equal to the raw
    # values shown here.
    LLM = "LLM"
    AUTO = "Auto"
8
+
9
class CroppingStrategy(str, Enum):
    """String-valued choices used by ``GenerationConfig.crop_image``."""

    # Members are also ``str`` instances.
    ALL = "All"
    AUTO = "Auto"
12
+
13
class LlmConfig(BaseModel):
    """LLM settings referenced by ``GenerationConfig.llm``."""

    # ``model`` and ``prompt`` are required.
    model: str
    prompt: str
    # Optional; falls back to 0.0 when omitted.
    temperature: float = 0.0
17
+
18
class GenerationConfig(BaseModel):
    """Generation settings for one segment kind (see ``SegmentProcessing``).

    Every field is optional and defaults to ``None``.
    """

    html: Optional[GenerationStrategy] = None
    llm: Optional[LlmConfig] = None
    markdown: Optional[GenerationStrategy] = None
    crop_image: Optional[CroppingStrategy] = None
23
+
24
class SegmentProcessing(BaseModel):
    """Optional ``GenerationConfig`` for each kind of segment.

    Each attribute defaults to ``None``. The field names are the snake_case
    counterparts of the ``SegmentType`` members.
    """

    title: Optional[GenerationConfig] = None
    section_header: Optional[GenerationConfig] = None
    text: Optional[GenerationConfig] = None
    list_item: Optional[GenerationConfig] = None
    table: Optional[GenerationConfig] = None
    picture: Optional[GenerationConfig] = None
    caption: Optional[GenerationConfig] = None
    formula: Optional[GenerationConfig] = None
    footnote: Optional[GenerationConfig] = None
    page_header: Optional[GenerationConfig] = None
    page_footer: Optional[GenerationConfig] = None
    page: Optional[GenerationConfig] = None
37
+
38
class ChunkProcessing(BaseModel):
    """Chunking settings used by ``Configuration.chunk_processing``."""

    # Optional; ``None`` when not specified.
    target_length: Optional[int] = None
40
+
41
class Property(BaseModel):
    """One property entry of a ``JsonSchema``.

    ``name`` and ``prop_type`` are required. The remaining fields are
    optional and default to ``None``; without an explicit default, pydantic
    v2 treats an ``Optional[...]`` field as required, forcing callers to pass
    ``None`` by hand.
    """

    name: str
    title: Optional[str] = None
    prop_type: str
    description: Optional[str] = None
    default: Optional[str] = None
47
+
48
class JsonSchema(BaseModel):
    """Schema description used by ``Configuration.json_schema``.

    ``title`` and ``properties`` are required. ``schema_type`` is optional
    and defaults to ``None`` (an ``Optional[...]`` annotation alone would
    leave it required under pydantic v2).
    """

    title: str
    properties: List[Property]
    schema_type: Optional[str] = None
52
+
53
class OcrStrategy(str, Enum):
    """String-valued choices used by ``Configuration.ocr_strategy``."""

    # Members are also ``str`` instances.
    ALL = "All"
    AUTO = "Auto"
56
+
57
class SegmentationStrategy(str, Enum):
    """String-valued choices used by ``Configuration.segmentation_strategy``."""

    # Members are also ``str`` instances.
    LAYOUT_ANALYSIS = "LayoutAnalysis"
    PAGE = "Page"
60
+
61
class BoundingBox(BaseModel):
    """Rectangle given by its ``left``/``top`` position and ``width``/``height``.

    All four values are required floats.
    """

    left: float
    top: float
    width: float
    height: float
66
+
67
class OCRResult(BaseModel):
    """A piece of OCR text together with its bounding box.

    ``confidence`` is optional and defaults to ``None`` so that a payload
    omitting it still validates (pydantic v2 treats ``Optional[...]``
    without a default as required).
    """

    bbox: BoundingBox
    text: str
    confidence: Optional[float] = None
71
+
72
class SegmentType(str, Enum):
    """String-valued kinds of segment, used by ``Segment.segment_type``."""

    # Members are also ``str`` instances; values are PascalCase strings.
    CAPTION = "Caption"
    FOOTNOTE = "Footnote"
    FORMULA = "Formula"
    LIST_ITEM = "ListItem"
    PAGE = "Page"
    PAGE_FOOTER = "PageFooter"
    PAGE_HEADER = "PageHeader"
    PICTURE = "Picture"
    SECTION_HEADER = "SectionHeader"
    TABLE = "Table"
    TEXT = "Text"
    TITLE = "Title"
85
+
86
class Segment(BaseModel):
    """One segment of a processed document.

    ``html``, ``image`` and ``markdown`` are optional and default to
    ``None`` so that a payload omitting them still validates (pydantic v2
    treats ``Optional[...]`` without a default as required). All other
    fields are required.
    """

    bbox: BoundingBox
    content: str
    page_height: float
    html: Optional[str] = None
    image: Optional[str] = None
    markdown: Optional[str] = None
    ocr: List[OCRResult]
    page_number: int
    page_width: float
    segment_id: str
    segment_type: SegmentType
98
+
99
class Chunk(BaseModel):
    """A chunk: an id, a length and the list of ``Segment`` objects it holds.

    All three fields are required.
    """

    chunk_id: str
    chunk_length: int
    segments: List[Segment]
103
+
104
class ExtractedJson(BaseModel):
    """Wrapper around a free-form ``data`` dictionary (see ``OutputResponse``)."""

    # Required; key and value types are not constrained.
    data: Dict
106
+
107
class OutputResponse(BaseModel):
    """Processing output: the list of chunks plus optional extracted JSON.

    ``chunks`` defaults to an empty list. ``extracted_json`` defaults to
    ``None`` so that a payload omitting it still validates (pydantic v2
    treats ``Optional[...]`` without a default as required).
    """

    chunks: List[Chunk] = []
    extracted_json: Optional[ExtractedJson] = None
110
+
111
class Model(str, Enum):
    """String-valued choices used by ``Configuration.model``."""

    # Members are also ``str`` instances.
    FAST = "Fast"
    HIGH_QUALITY = "HighQuality"
114
+
115
class Configuration(BaseModel):
    """Processing options that can accompany an upload.

    Every option is optional and defaults to ``None``.
    """

    chunk_processing: Optional[ChunkProcessing] = None
    expires_in: Optional[int] = None
    high_resolution: Optional[bool] = None
    json_schema: Optional[JsonSchema] = None
    model: Optional[Model] = None
    ocr_strategy: Optional[OcrStrategy] = None
    segment_processing: Optional[SegmentProcessing] = None
    segmentation_strategy: Optional[SegmentationStrategy] = None
    target_chunk_length: Optional[int] = None
125
+
126
class Status(str, Enum):
    """String-valued task states, used by ``TaskResponse.status``."""

    # Members are also ``str`` instances, so ``status == "Succeeded"`` style
    # comparisons against the raw values hold.
    STARTING = "Starting"
    PROCESSING = "Processing"
    SUCCEEDED = "Succeeded"
    FAILED = "Failed"
@@ -0,0 +1,19 @@
1
+ from typing import runtime_checkable, Protocol
2
+ from requests import Session
3
+ from httpx import AsyncClient
4
+
5
@runtime_checkable
class ChunkrClientProtocol(Protocol):
    """Structural interface that ``TaskResponse`` expects from a client.

    Declares the attributes and helper methods read while polling a task.
    """

    # Base URL of the API.
    url: str
    # API key that ``get_api_key`` returns.
    _api_key: str
    # HTTP session of type ``requests.Session``.
    _session: Session
    # HTTP client of type ``httpx.AsyncClient``.
    _client: AsyncClient

    def get_api_key(self) -> str:
        """Return the API key."""
        ...

    def _headers(self) -> dict:
        """Return the headers to send with API requests."""
        ...
@@ -0,0 +1,124 @@
1
+ from .protocol import ChunkrClientProtocol
2
+ from .config import Configuration, Status, OutputResponse
3
+ import asyncio
4
+ from datetime import datetime
5
+ from pydantic import BaseModel, PrivateAttr
6
+ import time
7
+ from typing import Optional, Union
8
+
9
class TaskResponse(BaseModel):
    """A task as returned by the API, plus helpers to poll it.

    A client is attached with ``with_client`` and used by ``poll`` /
    ``poll_async`` to re-fetch ``task_url`` until the status is no longer
    ``Starting`` or ``Processing``.

    Optional fields default to ``None`` so that a payload omitting them still
    validates (pydantic v2 treats ``Optional[...]`` without a default as
    required).
    """

    configuration: Configuration
    created_at: datetime
    expires_at: Optional[datetime] = None
    file_name: Optional[str] = None
    finished_at: Optional[datetime] = None
    input_file_url: Optional[str] = None
    message: str
    output: Optional[OutputResponse] = None
    page_count: Optional[int] = None
    pdf_url: Optional[str] = None
    status: Status
    task_id: str
    task_url: Optional[str] = None
    _client: Optional[ChunkrClientProtocol] = PrivateAttr(default=None)

    def with_client(self, client: Union[ChunkrClientProtocol]) -> 'TaskResponse':
        """Attach the client used for polling and return ``self``."""
        self._client = client
        return self

    def _require_client(self) -> ChunkrClientProtocol:
        """Return the attached client or fail with a clear error."""
        if self._client is None:
            raise ValueError("No client attached to this task; call with_client() first")
        return self._client

    def _refresh(self, payload: dict) -> None:
        """Replace this task's fields with the validated contents of ``payload``.

        The payload is parsed through the model first so nested values
        (``output``, ``status``, datetimes, ...) become typed objects rather
        than raw JSON; writing the raw dict into ``__dict__`` would leave
        ``output`` as a plain dict and break ``html()`` / ``markdown()`` /
        ``content()``.
        """
        refreshed = type(self).model_validate(payload)
        for field_name in type(self).model_fields:
            setattr(self, field_name, getattr(refreshed, field_name))

    def _poll_request_sync(self) -> dict:
        """Fetch the task JSON once, retrying on connection errors (synchronous).

        Raises:
            ValueError: If ``task_url`` is empty or no client is attached.
            requests.HTTPError: If the API answers with an error status.
        """
        # Local import: this module does not import requests at top level.
        import requests

        if not self.task_url:
            raise ValueError("Task URL not found in response")
        client = self._require_client()

        # requests' connection/timeout errors do not derive from the builtin
        # ConnectionError/TimeoutError, so they are listed explicitly.
        retryable = (
            ConnectionError,
            TimeoutError,
            requests.exceptions.ConnectionError,
            requests.exceptions.Timeout,
        )
        while True:
            try:
                r = client._session.get(self.task_url, headers=client._headers())
            except retryable:
                print("Connection error while polling the task, retrying...")
                time.sleep(0.5)
                continue
            r.raise_for_status()
            return r.json()

    async def _poll_request_async(self) -> dict:
        """Fetch the task JSON once, retrying on connection errors (asynchronous).

        Raises:
            ValueError: If ``task_url`` is empty or no client is attached.
            httpx.HTTPStatusError: If the API answers with an error status.
        """
        # Local import: this module does not import httpx at top level.
        import httpx

        if not self.task_url:
            raise ValueError("Task URL not found in response")
        client = self._require_client()

        retryable = (
            ConnectionError,
            TimeoutError,
            httpx.NetworkError,
            httpx.TimeoutException,
        )
        while True:
            try:
                r = await client._client.get(self.task_url, headers=client._headers())
            except retryable:
                print("Connection error while polling the task, retrying...")
                await asyncio.sleep(0.5)
                continue
            # Only the request itself is awaitable; raise_for_status() and
            # json() are plain synchronous methods on httpx.Response.
            r.raise_for_status()
            return r.json()

    def _check_status(self) -> Optional['TaskResponse']:
        """Return ``self`` once the task is finished, ``None`` while it is running.

        Raises:
            ValueError: With the task's ``message`` if the status is ``Failed``.
        """
        if self.status == Status.FAILED:
            raise ValueError(self.message)
        if self.status not in (Status.STARTING, Status.PROCESSING):
            return self
        return None

    def poll(self) -> 'TaskResponse':
        """Poll the task until it completes, sleeping 0.5s between requests."""
        while True:
            self._refresh(self._poll_request_sync())

            if (result := self._check_status()) is not None:
                return result

            time.sleep(0.5)

    async def poll_async(self) -> 'TaskResponse':
        """Poll the task until it completes, asynchronously (0.5s between requests)."""
        while True:
            self._refresh(await self._poll_request_async())

            if (result := self._check_status()) is not None:
                return result

            await asyncio.sleep(0.5)

    def _get_content(self, content_type: str) -> str:
        """Join the non-empty ``content_type`` attribute of every segment with newlines.

        Returns an empty string when the task has no output.
        """
        if not self.output:
            return ""
        parts = [
            value
            for chunk in self.output.chunks
            for segment in chunk.segments
            if (value := getattr(segment, content_type))
        ]
        return "\n".join(parts)

    def html(self) -> str:
        """Get full HTML for the task"""
        return self._get_content("html")

    def markdown(self) -> str:
        """Get full markdown for the task"""
        return self._get_content("markdown")

    def content(self) -> str:
        """Get full text for the task"""
        return self._get_content("content")
+
115
class TaskPayload(BaseModel):
    """Payload describing a task: its configuration, locations and ids.

    ``previous_configuration`` is optional and defaults to ``None`` so that a
    payload omitting it still validates (pydantic v2 treats
    ``Optional[...]`` without a default as required). All other fields are
    required.
    """

    current_configuration: Configuration
    file_name: str
    image_folder_location: str
    input_location: str
    output_location: str
    pdf_location: str
    previous_configuration: Optional[Configuration] = None
    task_id: str
    user_id: str
@@ -0,0 +1,49 @@
1
+ from .api.config import (
2
+ BoundingBox,
3
+ Chunk,
4
+ ChunkProcessing,
5
+ Configuration,
6
+ CroppingStrategy,
7
+ ExtractedJson,
8
+ GenerationStrategy,
9
+ GenerationConfig,
10
+ JsonSchema,
11
+ LlmConfig,
12
+ Model,
13
+ OCRResult,
14
+ OcrStrategy,
15
+ OutputResponse,
16
+ Property,
17
+ Segment,
18
+ SegmentProcessing,
19
+ SegmentType,
20
+ SegmentationStrategy,
21
+ Status
22
+ )
23
+
24
+ from .api.task import TaskResponse, TaskPayload
25
+
26
+ __all__ = [
27
+ 'BoundingBox',
28
+ 'Chunk',
29
+ 'ChunkProcessing',
30
+ 'Configuration',
31
+ 'CroppingStrategy',
32
+ 'ExtractedJson',
33
+ 'GenerationConfig',
34
+ 'GenerationStrategy',
35
+ 'JsonSchema',
36
+ 'LlmConfig',
37
+ 'Model',
38
+ 'OCRResult',
39
+ 'OcrStrategy',
40
+ 'OutputResponse',
41
+ 'Property',
42
+ 'Segment',
43
+ 'SegmentProcessing',
44
+ 'SegmentType',
45
+ 'SegmentationStrategy',
46
+ 'Status',
47
+ 'TaskPayload',
48
+ 'TaskResponse'
49
+ ]
@@ -0,0 +1,124 @@
1
+ Metadata-Version: 2.2
2
+ Name: chunkr-ai
3
+ Version: 0.0.3
4
+ Summary: Python client for Chunkr: open source document intelligence
5
+ Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
+ Project-URL: Homepage, https://chunkr.ai
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE
9
+ Requires-Dist: httpx>=0.28.1
10
+ Requires-Dist: pillow>=11.1.0
11
+ Requires-Dist: pydantic>=2.10.4
12
+ Requires-Dist: python-dotenv>=1.0.1
13
+ Requires-Dist: requests>=2.32.3
14
+ Provides-Extra: test
15
+ Requires-Dist: pytest>=8.3.4; extra == "test"
16
+ Requires-Dist: pytest-xdist>=3.6.1; extra == "test"
17
+
18
+ # Chunkr Python Client
19
+
20
+ This is the Python client for the Chunkr API. It provides a simple interface to interact with Chunkr's services.
21
+
22
+ ## Installation
23
+
24
+ ```bash
25
+ pip install chunkr-ai
26
+ ```
27
+
28
+ ## Usage
29
+
30
+ We provide two clients: `Chunkr` for synchronous operations and `ChunkrAsync` for asynchronous operations.
31
+
32
+ ### Synchronous Usage
33
+
34
+ ```python
35
+ from chunkr_ai import Chunkr
36
+
37
+ # Initialize client
38
+ chunkr = Chunkr()
39
+
40
+ # Upload a file and wait for processing
41
+ task = chunkr.upload("document.pdf")
42
+
43
+ # Print the response
44
+ print(task)
45
+
46
+ # Get output from task
47
+ output = task.output
48
+
49
+ # If you want to upload without waiting for processing
50
+ task = chunkr.start_upload("document.pdf")
51
+ # ... do other things ...
52
+ task.poll() # Block until processing finishes
53
+ ```
54
+
55
+ ### Asynchronous Usage
56
+
57
+ ```python
58
+ from chunkr_ai import ChunkrAsync
59
+
60
+ async def process_document():
61
+ # Initialize client
62
+ chunkr = ChunkrAsync()
63
+
64
+ # Upload a file and wait for processing
65
+ task = await chunkr.upload("document.pdf")
66
+
67
+ # Print the response
68
+ print(task)
69
+
70
+ # Get output from task
71
+ output = task.output
72
+
73
+ # If you want to upload without waiting for processing
74
+ task = await chunkr.start_upload("document.pdf")
75
+ # ... do other things ...
76
+ await task.poll_async() # Wait until processing finishes
77
+ ```
78
+
79
+ ### Additional Features
80
+
81
+ Both clients support various input types:
82
+
83
+ ```python
84
+ # Upload from file path
85
+ chunkr.upload("document.pdf")
86
+
87
+ # Upload from opened file
88
+ with open("document.pdf", "rb") as f:
89
+ chunkr.upload(f)
90
+
91
+ # Upload from URL
92
+ chunkr.upload("https://example.com/document.pdf")
93
+
94
+ # Upload from base64 string
95
+ chunkr.upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
96
+
97
+ # Upload an image
98
+ from PIL import Image
99
+ img = Image.open("photo.jpg")
100
+ chunkr.upload(img)
101
+ ```
102
+
103
+ ### Configuration
104
+
105
+ You can provide your API key and URL in several ways:
106
+ 1. Environment variables: `CHUNKR_API_KEY` and `CHUNKR_URL`
107
+ 2. `.env` file
108
+ 3. Direct initialization:
109
+ ```python
110
+ chunkr = Chunkr(
111
+ api_key="your-api-key",
112
+ url="https://api.chunkr.ai"
113
+ )
114
+ ```
115
+
116
+ ## Run tests
117
+
118
+ ```bash
119
+ # Install dependencies
120
+ uv pip install -e ".[test]"
121
+
122
+ # Run tests
123
+ uv run pytest
124
+ ```
@@ -0,0 +1,21 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ src/chunkr_ai/__init__.py
5
+ src/chunkr_ai/main.py
6
+ src/chunkr_ai/models.py
7
+ src/chunkr_ai.egg-info/PKG-INFO
8
+ src/chunkr_ai.egg-info/SOURCES.txt
9
+ src/chunkr_ai.egg-info/dependency_links.txt
10
+ src/chunkr_ai.egg-info/requires.txt
11
+ src/chunkr_ai.egg-info/top_level.txt
12
+ src/chunkr_ai/api/__init__.py
13
+ src/chunkr_ai/api/api.py
14
+ src/chunkr_ai/api/auth.py
15
+ src/chunkr_ai/api/base.py
16
+ src/chunkr_ai/api/chunkr.py
17
+ src/chunkr_ai/api/chunkr_async.py
18
+ src/chunkr_ai/api/config.py
19
+ src/chunkr_ai/api/protocol.py
20
+ src/chunkr_ai/api/task.py
21
+ tests/test_chunkr.py
@@ -0,0 +1,9 @@
1
+ httpx>=0.28.1
2
+ pillow>=11.1.0
3
+ pydantic>=2.10.4
4
+ python-dotenv>=1.0.1
5
+ requests>=2.32.3
6
+
7
+ [test]
8
+ pytest>=8.3.4
9
+ pytest-xdist>=3.6.1
@@ -0,0 +1,141 @@
1
+ import pytest
2
+ from pathlib import Path
3
+ from PIL import Image
4
+
5
+ from chunkr_ai import Chunkr, ChunkrAsync
6
+ from chunkr_ai.models import (
7
+ ChunkProcessing,
8
+ Configuration,
9
+ GenerationStrategy,
10
+ GenerationConfig,
11
+ OcrStrategy,
12
+ SegmentationStrategy,
13
+ SegmentProcessing,
14
+ TaskResponse,
15
+ )
16
+
17
+ @pytest.fixture
18
+ def chunkr():
19
+ return Chunkr()
20
+
21
+ @pytest.fixture
22
+ def async_chunkr():
23
+ return ChunkrAsync()
24
+
25
+ @pytest.fixture
26
+ def sample_path():
27
+ return Path("tests/files/test.pdf")
28
+
29
+ @pytest.fixture
30
+ def sample_image():
31
+ img = Image.open("tests/files/test.jpg")
32
+ return img
33
+
34
+ def test_send_file_path(chunkr, sample_path):
35
+ response = chunkr.upload(sample_path)
36
+
37
+ assert isinstance(response, TaskResponse)
38
+ assert response.task_id is not None
39
+ assert response.status == "Succeeded"
40
+ assert response.output is not None
41
+
42
+ def test_send_file_path_str(chunkr, sample_path):
43
+ response = chunkr.upload(str(sample_path))
44
+
45
+ assert isinstance(response, TaskResponse)
46
+ assert response.task_id is not None
47
+ assert response.status == "Succeeded"
48
+ assert response.output is not None
49
+
50
+ def test_send_opened_file(chunkr, sample_path):
51
+ with open(sample_path, 'rb') as f:
52
+ response = chunkr.upload(f)
53
+
54
+ assert isinstance(response, TaskResponse)
55
+ assert response.task_id is not None
56
+ assert response.status == "Succeeded"
57
+ assert response.output is not None
58
+
59
+ def test_send_pil_image(chunkr, sample_image):
60
+ response = chunkr.upload(sample_image)
61
+
62
+ assert isinstance(response, TaskResponse)
63
+ assert response.task_id is not None
64
+ assert response.status == "Succeeded"
65
+
66
+ def test_ocr_auto(chunkr, sample_path):
67
+ response = chunkr.upload(sample_path, Configuration(
68
+ ocr_strategy=OcrStrategy.AUTO
69
+ ))
70
+ assert isinstance(response, TaskResponse)
71
+ assert response.task_id is not None
72
+ assert response.status == "Succeeded"
73
+ assert response.output is not None
74
+
75
+ def test_expires_in(chunkr, sample_path):
76
+ response = chunkr.upload(sample_path, Configuration(
77
+ expires_in=10
78
+ ))
79
+ assert isinstance(response, TaskResponse)
80
+ assert response.task_id is not None
81
+ assert response.status == "Succeeded"
82
+ assert response.output is not None
83
+
84
+ def test_chunk_processing(chunkr, sample_path):
85
+ response = chunkr.upload(sample_path, Configuration(
86
+ chunk_processing=ChunkProcessing(
87
+ target_length=1024
88
+ )
89
+ ))
90
+ assert isinstance(response, TaskResponse)
91
+ assert response.task_id is not None
92
+ assert response.status == "Succeeded"
93
+ assert response.output is not None
94
+
95
+ def test_segmentation_strategy_page(chunkr, sample_path):
96
+ response = chunkr.upload(sample_path, Configuration(
97
+ segmentation_strategy=SegmentationStrategy.PAGE
98
+ ))
99
+ assert isinstance(response, TaskResponse)
100
+ assert response.task_id is not None
101
+ assert response.status == "Succeeded"
102
+ assert response.output is not None
103
+
104
+ def test_page_llm_html(chunkr, sample_path):
105
+ response = chunkr.upload(sample_path, Configuration(
106
+ segmentation_strategy=SegmentationStrategy.PAGE,
107
+ segment_processing=SegmentProcessing(
108
+ page=GenerationConfig(
109
+ html=GenerationStrategy.LLM
110
+ )
111
+ )
112
+ ))
113
+ assert isinstance(response, TaskResponse)
114
+ assert response.task_id is not None
115
+ assert response.status == "Succeeded"
116
+ assert response.output is not None
117
+
118
+ def test_page_llm(chunkr, sample_path):
119
+ response = chunkr.upload(sample_path, Configuration(
120
+ segmentation_strategy=SegmentationStrategy.PAGE,
121
+ segment_processing=SegmentProcessing(
122
+ page=GenerationConfig(
123
+ html=GenerationStrategy.LLM,
124
+ markdown=GenerationStrategy.LLM
125
+ )
126
+ )
127
+ ))
128
+ assert isinstance(response, TaskResponse)
129
+ assert response.task_id is not None
130
+ assert response.status == "Succeeded"
131
+ assert response.output is not None
132
+
133
+
134
+ async def test_async_send_file_path(async_chunkr, sample_path):
135
+ response = await async_chunkr.upload(sample_path)
136
+
137
+ assert isinstance(response, TaskResponse)
138
+ assert response.task_id is not None
139
+ assert response.status == "Succeeded"
140
+ assert response.output is not None
141
+
chunkr_ai-0.0.1/PKG-INFO DELETED
@@ -1,7 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: chunkr-ai
3
- Version: 0.0.1
4
- Summary: PDF chunking
5
- Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
- Description-Content-Type: text/markdown
7
- License-File: LICENSE
@@ -1,11 +0,0 @@
1
- [build-system]
2
- requires = ["setuptools>=42", "wheel"]
3
- build-backend = "setuptools.build_meta"
4
-
5
- [project]
6
- name = "chunkr-ai"
7
- version = "0.0.1"
8
- authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
9
- description = "PDF chunking"
10
- readme = "README.md"
11
- license = {"file" = "LICENSE"}
@@ -1,7 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: chunkr-ai
3
- Version: 0.0.1
4
- Summary: PDF chunking
5
- Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
- Description-Content-Type: text/markdown
7
- License-File: LICENSE
@@ -1,9 +0,0 @@
1
- LICENSE
2
- README.md
3
- pyproject.toml
4
- src/chunkr_ai/__init__.py
5
- src/chunkr_ai/main.py
6
- src/chunkr_ai.egg-info/PKG-INFO
7
- src/chunkr_ai.egg-info/SOURCES.txt
8
- src/chunkr_ai.egg-info/dependency_links.txt
9
- src/chunkr_ai.egg-info/top_level.txt
File without changes
File without changes