byteit 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- byteit/ByteITClient.py +382 -0
- byteit/__init__.py +51 -0
- byteit/connectors/LocalFileInputConnector.py +63 -0
- byteit/connectors/LocalFileOutputConnector.py +25 -0
- byteit/connectors/S3InputConnector.py +82 -0
- byteit/connectors/S3OutputConnector.py +53 -0
- byteit/connectors/__init__.py +26 -0
- byteit/connectors/base.py +57 -0
- byteit/exceptions.py +107 -0
- byteit/models/DocumentMetadata.py +25 -0
- byteit/models/Job.py +153 -0
- byteit/models/JobList.py +21 -0
- byteit/models/OutputFormat.py +16 -0
- byteit/models/ProcessingOptions.py +96 -0
- byteit/validations.py +42 -0
- byteit-0.1.0.dist-info/LICENSE +201 -0
- byteit-0.1.0.dist-info/METADATA +424 -0
- byteit-0.1.0.dist-info/RECORD +20 -0
- byteit-0.1.0.dist-info/WHEEL +5 -0
- byteit-0.1.0.dist-info/top_level.txt +1 -0
byteit/ByteITClient.py
ADDED
|
@@ -0,0 +1,382 @@
|
|
|
1
|
+
"""Simplified ByteIT API client - clean and minimal."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import time
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from types import TracebackType
|
|
7
|
+
from typing import Any, Dict, List, Optional, Type, Union
|
|
8
|
+
|
|
9
|
+
import requests
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
from .connectors import (
|
|
13
|
+
LocalFileOutputConnector,
|
|
14
|
+
InputConnector,
|
|
15
|
+
OutputConnector,
|
|
16
|
+
LocalFileInputConnector,
|
|
17
|
+
)
|
|
18
|
+
from .exceptions import (
|
|
19
|
+
APIKeyError,
|
|
20
|
+
AuthenticationError,
|
|
21
|
+
ByteITError,
|
|
22
|
+
JobProcessingError,
|
|
23
|
+
RateLimitError,
|
|
24
|
+
ResourceNotFoundError,
|
|
25
|
+
ServerError,
|
|
26
|
+
ValidationError,
|
|
27
|
+
)
|
|
28
|
+
from .models.Job import Job
|
|
29
|
+
from .models.JobList import JobList
|
|
30
|
+
|
|
31
|
+
# API configuration: every endpoint path is built from these pieces.
API_VERSION = "v1"
API_BASE = "/" + API_VERSION  # version prefix shared by all endpoints
JOBS_PATH = "jobs"  # collection segment for job endpoints
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class ByteITClient:
    """
    Simple client for ByteIT document parsing.

    Methods:
        - parse(): Parse a document and get the result
        - get_all_jobs(): Get all your jobs
        - get_job_by_id(): Get a specific job
        - get_result(): Download a job result

    The client owns a ``requests.Session`` carrying the API key header; use
    it as a context manager (or call ``close()``) to release the session.

    Example:
        client = ByteITClient(api_key="your_key")
        result = client.parse("document.pdf")  # Returns bytes
        client.parse("doc.pdf", output="result.txt")  # Saves to file
    """

    # BASE_URL = "https://api.byteit.ai"
    # BASE_URL = "http://127.0.0.1:8000"
    BASE_URL = "https://heinzelai.com"
    DEFAULT_TIMEOUT = 30  # passed to requests as the per-request timeout

    # HTTP status -> exception raised by _handle_response. Statuses >= 500
    # and any other unmapped status are handled separately there.
    _ERROR_MAP: Dict[int, Type[Exception]] = {
        400: ValidationError,
        401: AuthenticationError,
        403: APIKeyError,
        404: ResourceNotFoundError,
        429: RateLimitError,
    }

    def __init__(self, api_key: str):
        """
        Initialize the ByteIT client.

        Args:
            api_key: Your ByteIT API key

        Raises:
            APIKeyError: If the API key is not a non-empty string
        """
        # Reject non-strings and whitespace-only keys as well as empty ones,
        # matching what the error message promises.
        if not isinstance(api_key, str) or not api_key.strip():
            raise APIKeyError("API key must be a non-empty string")

        self.api_key = api_key
        self._session = requests.Session()
        self._session.headers.update({"X-API-Key": self.api_key})

    # ==================== PUBLIC API ====================

    def parse(
        self,
        input: Union[str, Path, InputConnector],
        output: Union[None, str, Path] = None,
        result_format: str = "md",
    ) -> bytes:
        """
        Parse a document and wait for the result.

        Creates a job, polls until it completes, downloads the result and,
        when ``output`` is a path, also writes the bytes to that file.

        Args:
            input: File to parse. Can be:
                - str or Path: Local file path
                - InputConnector: For S3 or custom sources
            output: Where to save result (optional). Can be:
                - None: Return result as bytes (default)
                - str or Path: Save to local file
            result_format: "txt", "json", "md", or "html" (default: "md")

        Returns:
            Parsed content as bytes (returned even when ``output`` is given)

        Raises:
            ValidationError: If ``input`` is of an unsupported type
            JobProcessingError: If the job fails or no result is available

        Example:
            # Simple - returns bytes
            result = client.parse("document.pdf")

            # Save to file
            client.parse("doc.pdf", output="result.txt")

            # S3 input (use connector)
            from byteit.connectors import S3InputConnector
            result = client.parse(S3InputConnector("my-bucket", "file.pdf"))

            # Different format
            json_result = client.parse("doc.pdf", result_format="json")
        """
        # Convert input to connector as early as possible
        input_connector = self._to_input_connector(input)

        # Convert output to connector if provided
        output_connector = self._to_output_connector(output)

        # Create job and wait
        job = self._create_job(
            input_connector=input_connector,
            output_connector=output_connector,
            result_format=result_format,
        )
        self._wait_for_completion(job.id)

        # Download result
        result_bytes = self._download_result(job.id)

        # If output is a file path, save it
        if isinstance(output, (str, Path)):
            Path(output).write_bytes(result_bytes)

        return result_bytes

    def get_all_jobs(self) -> List[Job]:
        """
        Get all jobs for your account.

        Returns:
            List of Job objects

        Example:
            jobs = client.get_all_jobs()
            for job in jobs:
                print(f"{job.id}: {job.processing_status}")
        """
        return self._list_jobs().jobs

    def get_job_by_id(self, job_id: str) -> Job:
        """
        Get a specific job by ID.

        Args:
            job_id: The job ID

        Returns:
            Job object

        Example:
            job = client.get_job_by_id("job_123")
        """
        return self._get_job_status(job_id)

    def get_result(self, job_id: str) -> bytes:
        """
        Download result for a completed job.

        Args:
            job_id: The job ID

        Returns:
            Result as bytes

        Raises:
            JobProcessingError: If the server answers with a JSON status
                payload instead of a result file
        """
        return self._download_result(job_id)

    # ==================== CONNECTOR CONVERTERS ====================

    def _to_input_connector(
        self, input: Union[str, Path, InputConnector]
    ) -> InputConnector:
        """Convert various input types to InputConnector.

        Raises:
            ValidationError: If ``input`` is neither a path nor a connector
        """
        # Already a connector (checks for InputConnector or its subclasses)
        if isinstance(input, InputConnector):
            return input

        # String or Path - local file
        if isinstance(input, (str, Path)):
            return LocalFileInputConnector(file_path=str(input))

        raise ValidationError(
            f"Unsupported input type: {type(input)}. "
            "Use str, Path, or InputConnector (e.g., S3InputConnector)"
        )

    def _to_output_connector(
        self, output: Union[None, str, Path]
    ) -> OutputConnector:
        """Convert output specification to OutputConnector.

        ``output`` is intentionally not inspected: the same connector is
        always returned, and parse() writes the downloaded bytes to the
        requested path itself.
        """
        # Always use ByteIT storage (simplest approach)
        # If output is a file path, we download and save after completion
        return LocalFileOutputConnector()

    # ==================== INTERNAL METHODS ====================

    def _create_job(
        self,
        input_connector: InputConnector,
        output_connector: OutputConnector,
        result_format: str,
    ) -> Job:
        """Create a processing job.

        Args:
            input_connector: Source of the document ("localfile" or "s3")
            output_connector: Destination configuration sent to the API
            result_format: Value sent as the ``output_format`` form field

        Returns:
            The created Job

        Raises:
            ValidationError: If the input connector type is not supported
        """
        connector_type = (
            input_connector.to_dict().get("type", "localfile").strip().lower()
        )

        # Build base request data
        data: Dict[str, Any] = {
            "output_format": result_format,
            "processing_options": json.dumps({}),
            "input_connector": connector_type,
        }

        # Add output connector config
        output_config = output_connector.to_dict()
        data["output_connector"] = output_config.get("type", "")
        data["output_connection_data"] = (
            json.dumps(output_config) if output_config.get("type") else "{}"
        )

        # Prepare input based on type
        files: Optional[Dict[str, Any]] = None
        file_obj = None

        if connector_type == "localfile":
            # The file is uploaded as a multipart "file" field
            filename, file_obj = input_connector.get_file_data()
            files = {"file": (filename, file_obj)}
        elif connector_type == "s3":
            # Only the connection metadata is sent, not the file itself
            _, connection_data = input_connector.get_file_data()
            data["input_connection_data"] = json.dumps(connection_data)
        else:
            raise ValidationError(f"Unsupported connector type: {connector_type}")

        # Make request with cleanup
        try:
            response = self._request(
                "POST", f"{API_BASE}/{JOBS_PATH}/", files=files, data=data
            )
        finally:
            # Close the upload handle even if the request failed. getattr
            # keeps this safe for file-like objects without a `closed` flag.
            if (
                file_obj is not None
                and hasattr(file_obj, "close")
                and not getattr(file_obj, "closed", False)
            ):
                file_obj.close()

        # Return job from response: either re-fetch by id or use the
        # embedded job payload.
        if "job_id" in response:
            return self._get_job_status(response["job_id"])

        return Job.from_dict(response["job"])

    def _get_job_status(self, job_id: str) -> Job:
        """Get current job status."""
        response = self._request("GET", f"{API_BASE}/{JOBS_PATH}/{job_id}/")
        # The payload may be wrapped under "job" or "document", or be the
        # job object itself.
        job_data = response.get("job", response.get("document", response))
        return Job.from_dict(job_data)

    def _list_jobs(self) -> JobList:
        """List all jobs."""
        response = self._request("GET", f"{API_BASE}/{JOBS_PATH}/")
        jobs_data = response.get("jobs", response.get("documents", []))
        jobs = [Job.from_dict(doc) for doc in jobs_data]
        return JobList(
            jobs=jobs,
            count=response.get("count", len(jobs)),
            detail=response.get("detail", ""),
        )

    def _wait_for_completion(
        self,
        job_id: str,
        poll_interval: float = 2.0,
        timeout: Optional[float] = None,
    ) -> Job:
        """Wait for job to complete by polling its status.

        Args:
            job_id: The job ID to poll
            poll_interval: Seconds to sleep between polls (default: 2)
            timeout: Maximum seconds to wait; None (default) waits
                indefinitely

        Returns:
            The completed Job

        Raises:
            JobProcessingError: If the job fails, or if ``timeout`` elapses
                before the job completes
        """
        deadline = None if timeout is None else time.monotonic() + timeout

        while True:
            job = self._get_job_status(job_id)

            if job.is_completed:
                return job

            if job.is_failed:
                raise JobProcessingError(
                    f"Job failed: {job.processing_error or 'Unknown error'}"
                )

            if deadline is not None and time.monotonic() >= deadline:
                raise JobProcessingError(
                    f"Timed out after {timeout} seconds waiting for job {job_id}"
                )

            time.sleep(poll_interval)

    def _download_result(self, job_id: str) -> bytes:
        """Download job result.

        Raises:
            JobProcessingError: If the server returns a JSON status payload
                rather than a result file
        """
        url = self._build_url(f"{API_BASE}/{JOBS_PATH}/{job_id}/result/")
        response = self._session.get(url, timeout=self.DEFAULT_TIMEOUT)
        # NOTE(review): HTTP errors surface here as requests.HTTPError,
        # unlike _request(), which maps them to ByteIT exceptions.
        response.raise_for_status()

        content_disposition = response.headers.get("Content-Disposition", "")
        content_type = response.headers.get("Content-Type", "")

        # Check if file download
        if "attachment" in content_disposition:
            return response.content

        # Handle JSON response (not ready or error)
        # NOTE(review): a JSON-formatted result served without an attachment
        # disposition would be treated as a status payload here - confirm
        # the server always sets Content-Disposition on results.
        if "application/json" in content_type:
            data = self._handle_response(response)
            if not data.get("ready", False):
                status = data.get("processing_status", "unknown")
                raise JobProcessingError(f"Result not available. Job status: {status}")
            raise JobProcessingError("Job ready but no result file returned")

        # File response
        return response.content

    # ==================== HTTP HELPERS ====================

    def _build_url(self, path: str) -> str:
        """Build full URL from BASE_URL and an API path.

        Slashes at the join are normalized so exactly one separates the two
        parts, even if BASE_URL is overridden with a trailing slash.
        """
        return f"{self.BASE_URL.rstrip('/')}/{path.lstrip('/')}"

    def _request(self, method: str, path: str, **kwargs: Any) -> Dict[str, Any]:
        """Make HTTP request and return the decoded JSON body.

        Extra keyword arguments are forwarded to ``Session.request``;
        ``timeout`` defaults to DEFAULT_TIMEOUT when not supplied.
        """
        url = self._build_url(path)
        kwargs.setdefault("timeout", self.DEFAULT_TIMEOUT)
        response = self._session.request(method, url, **kwargs)
        return self._handle_response(response)

    def _handle_response(self, response: requests.Response) -> Dict[str, Any]:
        """Handle API response and raise appropriate exceptions.

        Returns:
            The decoded JSON body for 200/201 responses ({} when empty)

        Raises:
            ValidationError, AuthenticationError, APIKeyError,
            ResourceNotFoundError, RateLimitError: For statuses 400, 401,
                403, 404 and 429 respectively
            ServerError: For any other status >= 500
            ByteITError: For any remaining non-success status
        """
        # Success path
        if response.status_code in (200, 201):
            return response.json() if response.content else {}

        # Error path - extract details
        try:
            parsed = response.json() if response.content else {}
        except ValueError:
            # Response is not JSON (e.g., HTML error page). ValueError also
            # covers the JSON decode errors raised by requests.
            data: Dict[str, Any] = {}
            message = (
                response.text or f"Request failed with status {response.status_code}"
            )
        else:
            # A JSON body that is not an object (list, string, ...) carries
            # no "detail" key; fall back to the raw text instead of failing.
            data = parsed if isinstance(parsed, dict) else {}
            message = data.get("detail", "") or response.text or "Request failed"

        # Map status to exception
        exception_class = self._ERROR_MAP.get(response.status_code)
        if exception_class:
            raise exception_class(message, response.status_code, data)

        if response.status_code >= 500:
            raise ServerError(message, response.status_code, data)

        raise ByteITError(message, response.status_code, data)

    # ==================== CONTEXT MANAGER ====================

    def close(self) -> None:
        """Close HTTP session."""
        self._session.close()

    def __enter__(self) -> "ByteITClient":
        """Context manager entry."""
        return self

    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc_val: Optional[BaseException],
        exc_tb: Optional[TracebackType],
    ) -> None:
        """Context manager exit: closes the HTTP session."""
        self.close()
|
byteit/__init__.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""ByteIT Python Client Library for text extraction."""
|
|
2
|
+
|
|
3
|
+
from .ByteITClient import ByteITClient
|
|
4
|
+
from .exceptions import (
|
|
5
|
+
APIKeyError,
|
|
6
|
+
AuthenticationError,
|
|
7
|
+
ByteITError,
|
|
8
|
+
JobProcessingError,
|
|
9
|
+
NetworkError,
|
|
10
|
+
RateLimitError,
|
|
11
|
+
ResourceNotFoundError,
|
|
12
|
+
ServerError,
|
|
13
|
+
ValidationError,
|
|
14
|
+
)
|
|
15
|
+
from .models.Job import Job
|
|
16
|
+
from .models.JobList import JobList
|
|
17
|
+
from .models.DocumentMetadata import DocumentMetadata
|
|
18
|
+
from .models.ProcessingOptions import ProcessingOptions
|
|
19
|
+
from .models.OutputFormat import OutputFormat
|
|
20
|
+
from .connectors import (
|
|
21
|
+
InputConnector,
|
|
22
|
+
OutputConnector,
|
|
23
|
+
LocalFileInputConnector,
|
|
24
|
+
LocalFileOutputConnector,
|
|
25
|
+
)
|
|
26
|
+
from .validations import validate_processing_options
|
|
27
|
+
|
|
28
|
+
__version__ = "0.1.0"

# Public API of the package, in the order the names are re-exported.
__all__ = [
    # Client
    "ByteITClient",
    # Models
    "Job",
    "JobList",
    "DocumentMetadata",
    "ProcessingOptions",
    "OutputFormat",
    # Connectors
    "InputConnector",
    "OutputConnector",
    "LocalFileInputConnector",
    "LocalFileOutputConnector",
    # Validation helpers
    "validate_processing_options",
    # Exceptions
    "ByteITError",
    "AuthenticationError",
    "APIKeyError",
    "ValidationError",
    "ResourceNotFoundError",
    "RateLimitError",
    "ServerError",
    "NetworkError",
    "JobProcessingError",
]
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Local file input connector for ByteIT."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any, Dict, Tuple
|
|
5
|
+
|
|
6
|
+
from .base import InputConnector
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class LocalFileInputConnector(InputConnector):
    """Input connector backed by a file on the local filesystem.

    The file is opened on this machine and handed to the client for upload
    to ByteIT.

    Args:
        file_path: Path to the local file

    Raises:
        FileNotFoundError: Nothing exists at the given path
        ValueError: The path exists but is not a regular file

    Example:
        connector = LocalFileInputConnector("/path/to/document.pdf")
        result = client.parse(connector)
    """

    def __init__(self, file_path: str):
        """
        Validate and remember the path of the file to upload.

        Args:
            file_path: Path to the local file to upload

        Raises:
            FileNotFoundError: If the file does not exist
            ValueError: If the path is not a file
        """
        self.file_path = Path(file_path)
        target = self.file_path

        # Existence is checked first so a missing path reports "not found"
        # rather than "not a file".
        if not target.exists():
            raise FileNotFoundError(f"File not found: {self.file_path}")
        if not target.is_file():
            raise ValueError(f"Path is not a file: {self.file_path}")

    def get_file_data(self) -> Tuple[str, Any]:
        """
        Open the file for upload.

        The returned handle is opened in binary mode and is NOT closed
        here; the caller is responsible for closing it.

        Returns:
            Tuple of (filename, file_object)
        """
        name = self.file_path.name
        handle = open(self.file_path, "rb")
        return (name, handle)

    def to_dict(self) -> Dict[str, Any]:
        """
        Describe this connector as a plain dictionary.

        Returns:
            Dictionary with connector type and configuration
        """
        description: Dict[str, Any] = {"type": "localfile"}
        description["path"] = str(self.file_path)
        return description
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""ByteIT cloud storage output connector."""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Dict
|
|
4
|
+
|
|
5
|
+
from .base import OutputConnector
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class LocalFileOutputConnector(OutputConnector):
    """
    Output connector that stores results in ByteIT cloud storage.

    Results are stored on ByteIT servers and can be retrieved later
    using the job ID and the client's get_result() method.

    This is the default output connector if none is specified.
    """

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert to dictionary representation.

        Returns:
            Dictionary with connector type
        """
        return {"type": "localfile"}
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"""AWS S3 input connector for ByteIT."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any, Dict, Tuple
|
|
5
|
+
|
|
6
|
+
from .base import InputConnector
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class S3InputConnector(InputConnector):
    """AWS S3 input connector with IAM role authentication.

    Instructs ByteIT servers to fetch files directly from your S3 bucket
    using IAM role assumption. Files never pass through your local machine,
    providing faster processing and reduced bandwidth usage.

    Prerequisites:
        - Create an AWS connection in ByteIT dashboard
        - Provide an IAM role ARN that ByteIT can assume
        - Grant the role read access to your S3 bucket

    Args:
        source_bucket: S3 bucket name
        source_path_inside_bucket: Object key/path within bucket

    Note:
        No AWS credentials needed in client code - ByteIT uses the
        IAM role configured in your account settings.

    Example:
        connector = S3InputConnector(
            source_bucket="my-documents",
            source_path_inside_bucket="invoices/2024/jan.pdf"
        )
        result = client.parse(connector)
    """

    def __init__(
        self,
        source_bucket: str,
        source_path_inside_bucket: str,
    ):
        """
        Initialize S3 input connector.

        Args:
            source_bucket: S3 bucket name where the file is located
            source_path_inside_bucket: Path to the file within the bucket (e.g., "folder/file.pdf")
        """
        # Imported locally so this class does not depend on the module's
        # top-level import list.
        from pathlib import PurePosixPath

        self.source_bucket = source_bucket
        self.source_path_inside_bucket = source_path_inside_bucket

        # Extract filename for display. S3 keys always use "/" as the
        # separator, so split with POSIX rules on every platform; the
        # platform-dependent Path would also split on "\" under Windows.
        self.filename = PurePosixPath(source_path_inside_bucket).name

    def get_file_data(self) -> Tuple[str, Dict[str, Any]]:
        """
        Return connection configuration for the ByteIT server.

        This method does NOT download the file. Instead, it returns metadata
        that tells the ByteIT server how to fetch the file from S3.

        Returns:
            Tuple of (filename, connection_data_dict)
        """
        connection_data = {
            "source_bucket": self.source_bucket,
            "source_path_inside_bucket": self.source_path_inside_bucket,
        }
        return (self.filename, connection_data)

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize connector configuration.

        Returns:
            Dictionary with connector type and configuration
        """
        return {
            "type": "s3",
            "source_bucket": self.source_bucket,
            "source_path_inside_bucket": self.source_path_inside_bucket,
        }
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""AWS S3 output connector for ByteIT."""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Dict
|
|
4
|
+
|
|
5
|
+
from .base import OutputConnector
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class S3OutputConnector(OutputConnector):
    """
    Output connector that has ByteIT write results to Amazon S3.

    The ByteIT server saves processed results directly into your S3 bucket
    using IAM role authentication; the result does NOT pass through your
    local machine.

    Prerequisites:
        You must first create an AWS connection in ByteIT by providing an IAM
        role ARN that ByteIT can assume to access your S3 bucket.

    Note:
        The ByteIT server will use the IAM role configured in your AWS connection
        to write to the S3 bucket. No AWS credentials are needed in your client code.
    """

    def __init__(
        self,
        bucket: str,
        path: str = "",
    ):
        """
        Store the destination bucket and normalized path prefix.

        Args:
            bucket: S3 bucket name where results will be saved
            path: path prefix within the bucket (e.g., "results/" or "processed/2024/").
        """
        self.bucket = bucket

        # A non-empty prefix always ends with "/"; an empty prefix and a
        # prefix that already ends with "/" are stored unchanged.
        needs_slash = bool(path) and not path.endswith("/")
        if needs_slash:
            self.path = path + "/"
        else:
            self.path = path

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize connector configuration for the API.

        Returns:
            Dictionary with connector type and configuration matching API format:
            {"type": "s3", "bucket": "bucket-name", "path": "output/path/"}
        """
        config: Dict[str, Any] = {"type": "s3"}
        config["bucket"] = self.bucket
        config["path"] = self.path
        return config
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""Connector classes for ByteIT file input and output operations."""
|
|
2
|
+
|
|
3
|
+
from .base import InputConnector, OutputConnector
|
|
4
|
+
from .LocalFileInputConnector import LocalFileInputConnector
|
|
5
|
+
from .LocalFileOutputConnector import LocalFileOutputConnector
|
|
6
|
+
|
|
7
|
+
# S3 connectors are treated as optional: if importing them fails, the names
# are still defined (as None) and left out of __all__.
# NOTE(review): the S3 connector modules in this package appear to import
# only the standard library and .base, so this guard does not seem to depend
# on boto3 - confirm before relying on or removing it.
try:
    from .S3InputConnector import S3InputConnector
    from .S3OutputConnector import S3OutputConnector
except ImportError:
    _s3_available = False
    S3InputConnector = None  # type: ignore
    S3OutputConnector = None  # type: ignore
else:
    _s3_available = True

__all__ = [
    "InputConnector",
    "OutputConnector",
    "LocalFileInputConnector",
    "LocalFileOutputConnector",
]

# Advertise the S3 connectors only when they were actually imported.
if _s3_available:
    __all__.extend(["S3InputConnector", "S3OutputConnector"])
|