chunkr-ai 0.0.2__tar.gz → 0.0.4__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
Files changed (29) hide show
  1. chunkr_ai-0.0.4/PKG-INFO +204 -0
  2. chunkr_ai-0.0.4/README.md +187 -0
  3. {chunkr_ai-0.0.2 → chunkr_ai-0.0.4}/pyproject.toml +4 -4
  4. {chunkr_ai-0.0.2 → chunkr_ai-0.0.4}/src/chunkr_ai/api/auth.py +0 -2
  5. chunkr_ai-0.0.4/src/chunkr_ai/api/base.py +173 -0
  6. chunkr_ai-0.0.4/src/chunkr_ai/api/chunkr.py +108 -0
  7. chunkr_ai-0.0.4/src/chunkr_ai/api/chunkr_async.py +105 -0
  8. chunkr_ai-0.0.4/src/chunkr_ai/api/config.py +131 -0
  9. chunkr_ai-0.0.4/src/chunkr_ai/api/protocol.py +19 -0
  10. chunkr_ai-0.0.4/src/chunkr_ai/api/task.py +131 -0
  11. chunkr_ai-0.0.4/src/chunkr_ai/models.py +48 -0
  12. chunkr_ai-0.0.4/src/chunkr_ai.egg-info/PKG-INFO +204 -0
  13. {chunkr_ai-0.0.2 → chunkr_ai-0.0.4}/src/chunkr_ai.egg-info/SOURCES.txt +6 -1
  14. {chunkr_ai-0.0.2 → chunkr_ai-0.0.4}/src/chunkr_ai.egg-info/requires.txt +1 -2
  15. chunkr_ai-0.0.4/tests/test_chunkr.py +158 -0
  16. chunkr_ai-0.0.2/PKG-INFO +0 -16
  17. chunkr_ai-0.0.2/src/chunkr_ai/api/chunkr.py +0 -125
  18. chunkr_ai-0.0.2/src/chunkr_ai/api/chunkr_async.py +0 -39
  19. chunkr_ai-0.0.2/src/chunkr_ai/api/models.py +0 -231
  20. chunkr_ai-0.0.2/src/chunkr_ai.egg-info/PKG-INFO +0 -16
  21. chunkr_ai-0.0.2/tests/test_chunkr.py +0 -69
  22. {chunkr_ai-0.0.2 → chunkr_ai-0.0.4}/LICENSE +0 -0
  23. {chunkr_ai-0.0.2 → chunkr_ai-0.0.4}/setup.cfg +0 -0
  24. {chunkr_ai-0.0.2 → chunkr_ai-0.0.4}/src/chunkr_ai/__init__.py +0 -0
  25. /chunkr_ai-0.0.2/README.md → /chunkr_ai-0.0.4/src/chunkr_ai/api/__init__.py +0 -0
  26. {chunkr_ai-0.0.2 → chunkr_ai-0.0.4}/src/chunkr_ai/api/api.py +0 -0
  27. {chunkr_ai-0.0.2 → chunkr_ai-0.0.4}/src/chunkr_ai/main.py +0 -0
  28. {chunkr_ai-0.0.2 → chunkr_ai-0.0.4}/src/chunkr_ai.egg-info/dependency_links.txt +0 -0
  29. {chunkr_ai-0.0.2 → chunkr_ai-0.0.4}/src/chunkr_ai.egg-info/top_level.txt +0 -0
@@ -0,0 +1,204 @@
1
+ Metadata-Version: 2.2
2
+ Name: chunkr-ai
3
+ Version: 0.0.4
4
+ Summary: Python client for Chunkr: open source document intelligence
5
+ Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
+ Project-URL: Homepage, https://chunkr.ai
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE
9
+ Requires-Dist: httpx>=0.28.1
10
+ Requires-Dist: pillow>=11.1.0
11
+ Requires-Dist: pydantic>=2.10.4
12
+ Requires-Dist: python-dotenv>=1.0.1
13
+ Requires-Dist: requests>=2.32.3
14
+ Provides-Extra: test
15
+ Requires-Dist: pytest>=8.3.4; extra == "test"
16
+ Requires-Dist: pytest-xdist>=3.6.1; extra == "test"
17
+
18
+ # Chunkr Python Client
19
+
20
+ This provides a simple interface to interact with the Chunkr API.
21
+
22
+ ## Getting Started
23
+
24
+ You can get an API key from [Chunkr](https://chunkr.ai) or deploy your own Chunkr instance. For self-hosted deployment options, check out our [deployment guide](https://github.com/lumina-ai-inc/chunkr/tree/main?tab=readme-ov-file#self-hosted-deployment-options).
25
+
26
+ For more information about the API and its capabilities, visit the [Chunkr API docs](https://docs.chunkr.ai).
27
+
28
+ ## Installation
29
+
30
+ ```bash
31
+ pip install chunkr-ai
32
+ ```
33
+
34
+ ## Usage
35
+
36
+ We provide two clients: `Chunkr` for synchronous operations and `ChunkrAsync` for asynchronous operations.
37
+
38
+ ### Synchronous Usage
39
+
40
+ ```python
41
+ from chunkr_ai import Chunkr
42
+
43
+ # Initialize client
44
+ chunkr = Chunkr()
45
+
46
+ # Upload a file and wait for processing
47
+ task = chunkr.upload("document.pdf")
48
+
49
+ # Print the response
50
+ print(task)
51
+
52
+ # Get output from task
53
+ output = task.output
54
+
55
+ # If you want to upload without waiting for processing
56
+ task = chunkr.start_upload("document.pdf")
57
+ # ... do other things ...
58
+ task.poll() # Check status when needed
59
+ ```
60
+
61
+ ### Asynchronous Usage
62
+
63
+ ```python
64
+ from chunkr_ai import ChunkrAsync
65
+
66
+ async def process_document():
67
+ # Initialize client
68
+ chunkr = ChunkrAsync()
69
+
70
+ # Upload a file and wait for processing
71
+ task = await chunkr.upload("document.pdf")
72
+
73
+ # Print the response
74
+ print(task)
75
+
76
+ # Get output from task
77
+ output = task.output
78
+
79
+ # If you want to upload without waiting for processing
80
+ task = await chunkr.start_upload("document.pdf")
81
+ # ... do other things ...
82
+ await task.poll_async() # Check status when needed
83
+ ```
84
+
85
+ ### Additional Features
86
+
87
+ Both clients support various input types:
88
+
89
+ ```python
90
+ # Upload from file path
91
+ chunkr.upload("document.pdf")
92
+
93
+ # Upload from opened file
94
+ with open("document.pdf", "rb") as f:
95
+ chunkr.upload(f)
96
+
97
+ # Upload from URL
98
+ chunkr.upload("https://example.com/document.pdf")
99
+
100
+ # Upload from base64 string
101
+ chunkr.upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
102
+
103
+ # Upload an image
104
+ from PIL import Image
105
+ img = Image.open("photo.jpg")
106
+ chunkr.upload(img)
107
+ ```
108
+
109
+ ### Configuration
110
+
111
+ You can customize the processing behavior by passing a `Configuration` object:
112
+
113
+ ```python
114
+ from chunkr_ai.models import Configuration, OcrStrategy, SegmentationStrategy, GenerationStrategy
115
+
116
+ # Basic configuration
117
+ config = Configuration(
118
+ ocr_strategy=OcrStrategy.AUTO,
119
+ segmentation_strategy=SegmentationStrategy.LAYOUT_ANALYSIS,
120
+ high_resolution=True,
121
+ expires_in=3600, # seconds
122
+ )
123
+
124
+ # Upload with configuration
125
+ task = chunkr.upload("document.pdf", config)
126
+ ```
127
+
128
+ #### Available Configuration Examples
129
+
130
+ - **Chunk Processing**
131
+ ```python
132
+ from chunkr_ai.models import ChunkProcessing
133
+ config = Configuration(
134
+ chunk_processing=ChunkProcessing(target_length=1024)
135
+ )
136
+ ```
137
+ - **Expires In**
138
+ ```python
139
+ config = Configuration(expires_in=3600)
140
+ ```
141
+
142
+ - **High Resolution**
143
+ ```python
144
+ config = Configuration(high_resolution=True)
145
+ ```
146
+
147
+ - **JSON Schema**
148
+ ```python
149
+ config = Configuration(json_schema=JsonSchema(
150
+ title="Sales Data",
151
+ properties=[
152
+ Property(name="Person with highest sales", prop_type="string", description="The person with the highest sales"),
153
+ Property(name="Person with lowest sales", prop_type="string", description="The person with the lowest sales"),
154
+ ]
155
+ ))
156
+ ```
157
+
158
+ - **OCR Strategy**
159
+ ```python
160
+ config = Configuration(ocr_strategy=OcrStrategy.AUTO)
161
+ ```
162
+
163
+ - **Segment Processing**
164
+ ```python
165
+ from chunkr_ai.models import SegmentProcessing, GenerationConfig, GenerationStrategy
166
+ config = Configuration(
167
+ segment_processing=SegmentProcessing(
168
+ page=GenerationConfig(
169
+ html=GenerationStrategy.LLM,
170
+ markdown=GenerationStrategy.LLM
171
+ )
172
+ )
173
+ )
174
+ ```
175
+
176
+ - **Segmentation Strategy**
177
+ ```python
178
+ config = Configuration(
179
+ segmentation_strategy=SegmentationStrategy.LAYOUT_ANALYSIS # or SegmentationStrategy.PAGE
180
+ )
181
+ ```
182
+
183
+ ## Environment setup
184
+
185
+ You can provide your API key and URL in several ways:
186
+ 1. Environment variables: `CHUNKR_API_KEY` and `CHUNKR_URL`
187
+ 2. `.env` file
188
+ 3. Direct initialization:
189
+ ```python
190
+ chunkr = Chunkr(
191
+ api_key="your-api-key",
192
+ url="https://api.chunkr.ai"
193
+ )
194
+ ```
195
+
196
+ ## Run tests
197
+
198
+ ```bash
199
+ # Install dependencies
200
+ uv pip install -e ".[test]"
201
+
202
+ # Run tests
203
+ uv run pytest
204
+ ```
@@ -0,0 +1,187 @@
1
+ # Chunkr Python Client
2
+
3
+ This provides a simple interface to interact with the Chunkr API.
4
+
5
+ ## Getting Started
6
+
7
+ You can get an API key from [Chunkr](https://chunkr.ai) or deploy your own Chunkr instance. For self-hosted deployment options, check out our [deployment guide](https://github.com/lumina-ai-inc/chunkr/tree/main?tab=readme-ov-file#self-hosted-deployment-options).
8
+
9
+ For more information about the API and its capabilities, visit the [Chunkr API docs](https://docs.chunkr.ai).
10
+
11
+ ## Installation
12
+
13
+ ```bash
14
+ pip install chunkr-ai
15
+ ```
16
+
17
+ ## Usage
18
+
19
+ We provide two clients: `Chunkr` for synchronous operations and `ChunkrAsync` for asynchronous operations.
20
+
21
+ ### Synchronous Usage
22
+
23
+ ```python
24
+ from chunkr_ai import Chunkr
25
+
26
+ # Initialize client
27
+ chunkr = Chunkr()
28
+
29
+ # Upload a file and wait for processing
30
+ task = chunkr.upload("document.pdf")
31
+
32
+ # Print the response
33
+ print(task)
34
+
35
+ # Get output from task
36
+ output = task.output
37
+
38
+ # If you want to upload without waiting for processing
39
+ task = chunkr.start_upload("document.pdf")
40
+ # ... do other things ...
41
+ task.poll() # Check status when needed
42
+ ```
43
+
44
+ ### Asynchronous Usage
45
+
46
+ ```python
47
+ from chunkr_ai import ChunkrAsync
48
+
49
+ async def process_document():
50
+ # Initialize client
51
+ chunkr = ChunkrAsync()
52
+
53
+ # Upload a file and wait for processing
54
+ task = await chunkr.upload("document.pdf")
55
+
56
+ # Print the response
57
+ print(task)
58
+
59
+ # Get output from task
60
+ output = task.output
61
+
62
+ # If you want to upload without waiting for processing
63
+ task = await chunkr.start_upload("document.pdf")
64
+ # ... do other things ...
65
+ await task.poll_async() # Check status when needed
66
+ ```
67
+
68
+ ### Additional Features
69
+
70
+ Both clients support various input types:
71
+
72
+ ```python
73
+ # Upload from file path
74
+ chunkr.upload("document.pdf")
75
+
76
+ # Upload from opened file
77
+ with open("document.pdf", "rb") as f:
78
+ chunkr.upload(f)
79
+
80
+ # Upload from URL
81
+ chunkr.upload("https://example.com/document.pdf")
82
+
83
+ # Upload from base64 string
84
+ chunkr.upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
85
+
86
+ # Upload an image
87
+ from PIL import Image
88
+ img = Image.open("photo.jpg")
89
+ chunkr.upload(img)
90
+ ```
91
+
92
+ ### Configuration
93
+
94
+ You can customize the processing behavior by passing a `Configuration` object:
95
+
96
+ ```python
97
+ from chunkr_ai.models import Configuration, OcrStrategy, SegmentationStrategy, GenerationStrategy
98
+
99
+ # Basic configuration
100
+ config = Configuration(
101
+ ocr_strategy=OcrStrategy.AUTO,
102
+ segmentation_strategy=SegmentationStrategy.LAYOUT_ANALYSIS,
103
+ high_resolution=True,
104
+ expires_in=3600, # seconds
105
+ )
106
+
107
+ # Upload with configuration
108
+ task = chunkr.upload("document.pdf", config)
109
+ ```
110
+
111
+ #### Available Configuration Examples
112
+
113
+ - **Chunk Processing**
114
+ ```python
115
+ from chunkr_ai.models import ChunkProcessing
116
+ config = Configuration(
117
+ chunk_processing=ChunkProcessing(target_length=1024)
118
+ )
119
+ ```
120
+ - **Expires In**
121
+ ```python
122
+ config = Configuration(expires_in=3600)
123
+ ```
124
+
125
+ - **High Resolution**
126
+ ```python
127
+ config = Configuration(high_resolution=True)
128
+ ```
129
+
130
+ - **JSON Schema**
131
+ ```python
132
+ config = Configuration(json_schema=JsonSchema(
133
+ title="Sales Data",
134
+ properties=[
135
+ Property(name="Person with highest sales", prop_type="string", description="The person with the highest sales"),
136
+ Property(name="Person with lowest sales", prop_type="string", description="The person with the lowest sales"),
137
+ ]
138
+ ))
139
+ ```
140
+
141
+ - **OCR Strategy**
142
+ ```python
143
+ config = Configuration(ocr_strategy=OcrStrategy.AUTO)
144
+ ```
145
+
146
+ - **Segment Processing**
147
+ ```python
148
+ from chunkr_ai.models import SegmentProcessing, GenerationConfig, GenerationStrategy
149
+ config = Configuration(
150
+ segment_processing=SegmentProcessing(
151
+ page=GenerationConfig(
152
+ html=GenerationStrategy.LLM,
153
+ markdown=GenerationStrategy.LLM
154
+ )
155
+ )
156
+ )
157
+ ```
158
+
159
+ - **Segmentation Strategy**
160
+ ```python
161
+ config = Configuration(
162
+ segmentation_strategy=SegmentationStrategy.LAYOUT_ANALYSIS # or SegmentationStrategy.PAGE
163
+ )
164
+ ```
165
+
166
+ ## Environment setup
167
+
168
+ You can provide your API key and URL in several ways:
169
+ 1. Environment variables: `CHUNKR_API_KEY` and `CHUNKR_URL`
170
+ 2. `.env` file
171
+ 3. Direct initialization:
172
+ ```python
173
+ chunkr = Chunkr(
174
+ api_key="your-api-key",
175
+ url="https://api.chunkr.ai"
176
+ )
177
+ ```
178
+
179
+ ## Run tests
180
+
181
+ ```bash
182
+ # Install dependencies
183
+ uv pip install -e ".[test]"
184
+
185
+ # Run tests
186
+ uv run pytest
187
+ ```
@@ -4,22 +4,22 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "chunkr-ai"
7
- version = "0.0.2"
7
+ version = "0.0.4"
8
8
  authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
9
- description = "Python client for chunkr: open source document intelligence"
9
+ description = "Python client for Chunkr: open source document intelligence"
10
10
  readme = "README.md"
11
11
  license = {"file" = "LICENSE"}
12
+ urls = {Homepage = "https://chunkr.ai"}
12
13
  dependencies = [
13
- "build>=1.2.2.post1",
14
14
  "httpx>=0.28.1",
15
15
  "pillow>=11.1.0",
16
16
  "pydantic>=2.10.4",
17
17
  "python-dotenv>=1.0.1",
18
18
  "requests>=2.32.3",
19
- "twine>=6.0.1",
20
19
  ]
21
20
 
22
21
  [project.optional-dependencies]
23
22
  test = [
24
23
  "pytest>=8.3.4",
24
+ "pytest-xdist>=3.6.1",
25
25
  ]
@@ -1,5 +1,3 @@
1
- from typing import Optional
2
-
3
1
  class HeadersMixin:
4
2
  """Mixin class for handling authorization headers"""
5
3
 
@@ -0,0 +1,173 @@
1
+ from .config import Configuration
2
+ from .task import TaskResponse
3
+ from .auth import HeadersMixin
4
+ from abc import abstractmethod
5
+ from dotenv import load_dotenv
6
+ import io
7
+ import json
8
+ import os
9
+ from pathlib import Path
10
+ from PIL import Image
11
+ import requests
12
+ from typing import BinaryIO, Tuple, Union
13
+
14
class ChunkrBase(HeadersMixin):
    """Base class with shared functionality for Chunkr API clients.

    Resolves the API URL and key at construction time and converts every
    supported upload input (file path, URL, base64 data URI, PIL image,
    file-like object) into the multipart pieces the clients send.

    ``upload``, ``start_upload`` and ``get_task`` are decorated with
    ``@abstractmethod`` as a contract for subclasses.
    NOTE(review): the decorator only blocks instantiation under an
    ``ABCMeta`` metaclass, which this class does not declare itself.
    """

    # Seconds passed to ``requests.get`` as ``timeout`` when fetching a
    # document from a URL, so a stalled server cannot hang the client forever.
    _DOWNLOAD_TIMEOUT = 60

    # File extension used for each MIME type accepted in a base64 data URI.
    _MIME_TO_EXT = {
        'application/pdf': 'pdf',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
        'application/msword': 'doc',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
        'application/vnd.ms-powerpoint': 'ppt',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
        'application/vnd.ms-excel': 'xls',
        'image/jpeg': 'jpg',
        'image/png': 'png',
        'image/jpg': 'jpg',
    }

    def __init__(self, url: Union[str, None] = None, api_key: Union[str, None] = None):
        """Resolve the API URL and key.

        Each value is taken from the explicit argument first, then from the
        environment (after ``load_dotenv()`` has been called). The URL falls
        back to ``https://api.chunkr.ai``.

        Args:
            url: Base URL of the Chunkr API. Trailing slashes are stripped.
            api_key: API key used for authorization.

        Raises:
            ValueError: If no API key is given and ``CHUNKR_API_KEY`` is unset.
        """
        load_dotenv()
        resolved_url = url or os.getenv('CHUNKR_URL') or 'https://api.chunkr.ai'
        self._api_key = api_key or os.getenv('CHUNKR_API_KEY')
        if not self._api_key:
            raise ValueError("API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai")

        # Normalise so endpoint paths can be appended with a single "/".
        self.url = resolved_url.rstrip("/")

    def _download_file(self, url: str) -> Tuple[str, BinaryIO]:
        """Fetch ``url`` and return (filename, in-memory file object)."""
        from urllib.parse import urlparse

        response = requests.get(url, timeout=self._DOWNLOAD_TIMEOUT)
        response.raise_for_status()
        # Derive the name from the URL *path* only, so a query string or
        # fragment ("doc.pdf?token=...") does not end up in the filename.
        filename = Path(urlparse(url).path).name or 'downloaded_file'
        return filename, io.BytesIO(response.content)

    def _decode_data_uri(self, data_uri: str) -> Tuple[str, BinaryIO]:
        """Decode a ``data:<mime>;base64,<payload>`` string into (filename, file object)."""
        import base64

        header, _, payload = data_uri.partition(',')
        mime_type = header.split(':')[-1].split(';')[0].lower()

        # Validate the MIME type outside the decode ``try`` so this error is
        # not re-labelled as an invalid base64 payload.
        extension = self._MIME_TO_EXT.get(mime_type)
        if extension is None:
            raise ValueError(f"Unsupported MIME type: {mime_type}")

        try:
            file_bytes = base64.b64decode(payload)
        except ValueError as exc:  # binascii.Error is a ValueError subclass
            raise ValueError(f"Invalid base64 string: {exc}") from exc

        return f"file.{extension}", io.BytesIO(file_bytes)

    def _prepare_file(
        self,
        file: Union[str, Path, BinaryIO, Image.Image]
    ) -> Tuple[str, BinaryIO]:
        """Convert various file types into a tuple of (filename, file-like object).

        Args:
            file: Input file, can be:
                - String or Path to a file
                - URL string starting with http:// or https://
                - Base64 data URI string (``data:<mime>;base64,<payload>``)
                - Opened binary file (mode='rb')
                - PIL/Pillow Image object

        Returns:
            Tuple[str, BinaryIO]: (filename, file-like object) ready for upload.
            Except for a caller-supplied file-like object, which is returned
            as-is, the object is an in-memory ``io.BytesIO``.

        Raises:
            FileNotFoundError: If the file path doesn't exist
            TypeError: If the file type is not supported
            ValueError: If the base64 payload is invalid or its MIME type is unsupported
            requests.RequestException: If the URL cannot be fetched or returns an error status
        """
        if isinstance(file, str):
            # Handle URLs
            if file.startswith(('http://', 'https://')):
                return self._download_file(file)

            # Handle base64 data URIs
            if ';base64,' in file:
                return self._decode_data_uri(file)

        # Handle file paths
        if isinstance(file, (str, Path)):
            path = Path(file).resolve()
            if not path.exists():
                raise FileNotFoundError(f"File not found: {file}")
            # Read into memory instead of returning an open handle that no
            # caller closes; this matches the URL/base64/image branches.
            return path.name, io.BytesIO(path.read_bytes())

        # Handle PIL Images
        if isinstance(file, Image.Image):
            image_format = file.format or 'PNG'
            buffer = io.BytesIO()
            file.save(buffer, format=image_format)
            buffer.seek(0)
            return f"image.{image_format.lower()}", buffer

        # Handle file-like objects
        if hasattr(file, 'read') and hasattr(file, 'seek'):
            # ``name`` may be missing, empty, or not path-like (e.g. an int
            # file descriptor); fall back to a generic filename in those cases.
            name = getattr(file, 'name', None)
            if not name or not isinstance(name, (str, os.PathLike)):
                name = 'document'
            return Path(name).name or 'document', file

        raise TypeError(f"Unsupported file type: {type(file)}")

    def _prepare_upload_data(
        self,
        file: Union[str, Path, BinaryIO, Image.Image],
        config: Union[Configuration, None] = None
    ) -> Tuple[dict, dict]:
        """Prepare files and data dictionaries for upload.

        Dict-valued configuration fields are serialised to JSON and added to
        the files dict as ``application/json`` parts; all other fields go
        into the data dict unchanged.

        Args:
            file: The file to upload
            config: Optional configuration settings

        Returns:
            Tuple[dict, dict]: (files dict, data dict) ready for upload
        """
        filename, file_obj = self._prepare_file(file)
        files = {"file": (filename, file_obj)}
        data = {}

        if config is not None:
            config_dict = config.model_dump(mode="json", exclude_none=True)
            for key, value in config_dict.items():
                if isinstance(value, dict):
                    files[key] = (None, json.dumps(value), 'application/json')
                else:
                    data[key] = value

        return files, data

    @abstractmethod
    def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
        """Upload a file and wait for processing to complete.

        Must be implemented by subclasses.
        """
        pass

    @abstractmethod
    def start_upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
        """Upload a file for processing and immediately return the task response.

        Must be implemented by subclasses.
        """
        pass

    @abstractmethod
    def get_task(self, task_id: str) -> TaskResponse:
        """Get a task response by its ID.

        Must be implemented by subclasses.
        """
        pass
@@ -0,0 +1,108 @@
1
+ from .base import ChunkrBase
2
+ from .config import Configuration
3
+ from .task import TaskResponse
4
+ from pathlib import Path
5
+ from PIL import Image
6
+ import requests
7
+ from typing import Union, BinaryIO
8
+
9
class Chunkr(ChunkrBase):
    """Synchronous Chunkr API client.

    All HTTP calls go through a single ``requests.Session`` created in
    ``__init__``.
    """

    def __init__(self, url: str = None, api_key: str = None):
        super().__init__(url, api_key)
        self._session = requests.Session()

    def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
        """Upload a file, then poll the resulting task.

        Equivalent to ``start_upload(file, config)`` followed by ``poll()``
        on the returned task.

        Args:
            file: The file to upload.
            config: Configuration options for processing. Optional.

        Examples:
        ```
        # Upload from file path
        chunkr.upload("document.pdf")

        # Upload from URL
        chunkr.upload("https://example.com/document.pdf")

        # Upload from base64 string (must include MIME type header)
        chunkr.upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")

        # Upload from opened file
        with open("document.pdf", "rb") as f:
            chunkr.upload(f)

        # Upload an image
        from PIL import Image
        img = Image.open("photo.jpg")
        chunkr.upload(img)
        ```

        Returns:
            TaskResponse: The completed task response
        """
        return self.start_upload(file, config).poll()

    def start_upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
        """Submit a file for processing without waiting for the result.

        POSTs the prepared multipart payload to ``/api/v1/task`` and returns
        the task built from the JSON response. Call ``task.poll()`` later to
        wait for processing to finish.

        Args:
            file: The file to upload.
            config: Configuration options for processing. Optional.

        Examples:
        ```
        # Upload from file path
        task = chunkr.start_upload("document.pdf")

        # Upload from opened file
        with open("document.pdf", "rb") as f:
            task = chunkr.start_upload(f)

        # Upload from URL
        task = chunkr.start_upload("https://example.com/document.pdf")

        # Upload from base64 string (must include MIME type header)
        task = chunkr.start_upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")

        # Upload an image
        from PIL import Image
        img = Image.open("photo.jpg")
        task = chunkr.start_upload(img)

        # Wait for the task to complete - this can be done when needed
        task.poll()
        ```

        Returns:
            TaskResponse: The initial task response
        """
        endpoint = f"{self.url}/api/v1/task"
        multipart_files, form_fields = self._prepare_upload_data(file, config)
        response = self._session.post(
            endpoint,
            files=multipart_files,
            data=form_fields,
            headers=self._headers(),
        )
        # Surface HTTP error statuses as exceptions before parsing the body.
        response.raise_for_status()
        payload = response.json()
        return TaskResponse(**payload).with_client(self)

    def get_task(self, task_id: str) -> TaskResponse:
        """Fetch a task by its ID.

        Args:
            task_id: The ID of the task to get

        Returns:
            TaskResponse: The task response
        """
        endpoint = f"{self.url}/api/v1/task/{task_id}"
        response = self._session.get(endpoint, headers=self._headers())
        response.raise_for_status()
        payload = response.json()
        return TaskResponse(**payload).with_client(self)
108
+