byteit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
byteit/ByteITClient.py ADDED
@@ -0,0 +1,382 @@
1
+ """Simplified ByteIT API client - clean and minimal."""
2
+
3
+ import json
4
+ import time
5
+ from pathlib import Path
6
+ from types import TracebackType
7
+ from typing import Any, Dict, List, Optional, Type, Union
8
+
9
+ import requests
10
+
11
+
12
+ from .connectors import (
13
+ LocalFileOutputConnector,
14
+ InputConnector,
15
+ OutputConnector,
16
+ LocalFileInputConnector,
17
+ )
18
+ from .exceptions import (
19
+ APIKeyError,
20
+ AuthenticationError,
21
+ ByteITError,
22
+ JobProcessingError,
23
+ RateLimitError,
24
+ ResourceNotFoundError,
25
+ ServerError,
26
+ ValidationError,
27
+ )
28
+ from .models.Job import Job
29
+ from .models.JobList import JobList
30
+
31
# Versioned REST path pieces shared by every endpoint the client calls.
API_VERSION = "v1"
API_BASE = "/" + API_VERSION  # prefix placed in front of every resource path
JOBS_PATH = "jobs"  # collection segment for job resources
35
+
36
+
37
class ByteITClient:
    """
    Simple client for ByteIT document parsing.

    Methods:
        - parse(): Parse a document and get the result
        - get_all_jobs(): Get all your jobs
        - get_job_by_id(): Get a specific job
        - get_result(): Download a job result

    The client owns a ``requests.Session``; call ``close()`` or use the
    client as a context manager to release it.

    Example:
        client = ByteITClient(api_key="your_key")
        result = client.parse("document.pdf")  # Returns bytes
        client.parse("doc.pdf", output="result.txt")  # Saves to file
    """

    # NOTE(review): earlier revisions pointed at https://api.byteit.ai and at
    # a local development server (http://127.0.0.1:8000); confirm this host
    # is the intended production endpoint.
    BASE_URL = "https://heinzelai.com"
    # Passed as the ``timeout`` argument of every HTTP call.
    DEFAULT_TIMEOUT = 30

    def __init__(self, api_key: str):
        """
        Initialize the ByteIT client.

        Args:
            api_key: Your ByteIT API key

        Raises:
            APIKeyError: If the API key is not a non-empty string
        """
        # The key is sent verbatim as a header value, so reject anything
        # that is not a usable string up front.
        if not isinstance(api_key, str) or not api_key.strip():
            raise APIKeyError("API key must be a non-empty string")

        self.api_key = api_key
        self._session = requests.Session()
        self._session.headers.update({"X-API-Key": self.api_key})

    # ==================== PUBLIC API ====================

    def parse(
        self,
        input: Union[str, Path, InputConnector],
        output: Union[None, str, Path] = None,
        result_format: str = "md",
    ) -> bytes:
        """
        Parse a document and wait for the result.

        Args:
            input: File to parse. Can be:
                - str or Path: Local file path
                - InputConnector: For S3 or custom sources
            output: Where to save result (optional). Can be:
                - None: Return result as bytes (default)
                - str or Path: Save to local file
            result_format: "txt", "json", "md", or "html" (default: "md")

        Returns:
            Parsed content as bytes (also returned when ``output`` is given)

        Raises:
            ValidationError: If ``input`` is of an unsupported type
            JobProcessingError: If the job fails or no result is available

        Example:
            # Simple - returns bytes
            result = client.parse("document.pdf")

            # Save to file
            client.parse("doc.pdf", output="result.txt")

            # S3 input (use connector)
            from byteit.connectors import S3InputConnector
            result = client.parse(S3InputConnector("my-bucket", "file.pdf"))

            # Different format
            json_result = client.parse("doc.pdf", result_format="json")
        """
        # Convert input to a connector as early as possible so invalid
        # inputs fail before any request is made.
        input_connector = self._to_input_connector(input)
        output_connector = self._to_output_connector(output)

        # Create the job and block until it reaches a terminal state.
        job = self._create_job(
            input_connector=input_connector,
            output_connector=output_connector,
            result_format=result_format,
        )
        self._wait_for_completion(job.id)

        result_bytes = self._download_result(job.id)

        # A path-like ``output`` means "also persist the result locally".
        if isinstance(output, (str, Path)):
            Path(output).write_bytes(result_bytes)

        return result_bytes

    def get_all_jobs(self) -> List[Job]:
        """
        Get all jobs for your account.

        Returns:
            List of Job objects

        Example:
            jobs = client.get_all_jobs()
            for job in jobs:
                print(f"{job.id}: {job.processing_status}")
        """
        return self._list_jobs().jobs

    def get_job_by_id(self, job_id: str) -> Job:
        """
        Get a specific job by ID.

        Args:
            job_id: The job ID

        Returns:
            Job object

        Example:
            job = client.get_job_by_id("job_123")
        """
        return self._get_job_status(job_id)

    def get_result(self, job_id: str) -> bytes:
        """
        Download result for a completed job.

        Args:
            job_id: The job ID

        Returns:
            Result as bytes

        Raises:
            JobProcessingError: If the server answers with a JSON status
                payload instead of a result file
            requests.HTTPError: If the server answers with a 4xx/5xx status
        """
        return self._download_result(job_id)

    # ==================== CONNECTOR CONVERTERS ====================

    def _to_input_connector(
        self, input: Union[str, Path, InputConnector]
    ) -> InputConnector:
        """
        Convert the supported input types to an InputConnector.

        Raises:
            ValidationError: If ``input`` is neither a connector nor a path
        """
        # Already a connector (InputConnector or any subclass).
        if isinstance(input, InputConnector):
            return input

        # String or Path - treat as a local file.
        if isinstance(input, (str, Path)):
            return LocalFileInputConnector(file_path=str(input))

        raise ValidationError(
            f"Unsupported input type: {type(input)}. "
            "Use str, Path, or InputConnector (e.g., S3InputConnector)"
        )

    def _to_output_connector(
        self, output: Union[None, str, Path]
    ) -> OutputConnector:
        """
        Return the output connector used for the job.

        ``output`` is intentionally ignored: a LocalFileOutputConnector is
        always used, and when a file path was requested ``parse()`` downloads
        the result and writes it after the job completes.
        """
        return LocalFileOutputConnector()

    # ==================== INTERNAL METHODS ====================

    def _create_job(
        self,
        input_connector: InputConnector,
        output_connector: OutputConnector,
        result_format: str,
    ) -> Job:
        """
        Create a processing job.

        Args:
            input_connector: Source of the document
            output_connector: Destination configuration sent with the job
            result_format: Value sent as ``output_format``

        Returns:
            The created Job

        Raises:
            ValidationError: If the input connector type is not
                "localfile" or "s3"
        """
        connector_type = (
            input_connector.to_dict().get("type", "localfile").strip().lower()
        )

        # Build base request data
        data: Dict[str, Any] = {
            "output_format": result_format,
            "processing_options": json.dumps({}),
            "input_connector": connector_type,
        }

        # Add output connector config
        output_config = output_connector.to_dict()
        data["output_connector"] = output_config.get("type", "")
        data["output_connection_data"] = (
            json.dumps(output_config) if output_config.get("type") else "{}"
        )

        # Prepare input based on type
        files: Optional[Dict[str, Any]] = None
        file_obj = None

        if connector_type == "localfile":
            # The connector hands back a file object to upload; it is
            # closed in the ``finally`` below.
            filename, file_obj = input_connector.get_file_data()
            files = {"file": (filename, file_obj)}
        elif connector_type == "s3":
            # Nothing is uploaded; only connection metadata is sent.
            _, connection_data = input_connector.get_file_data()
            data["input_connection_data"] = json.dumps(connection_data)
        else:
            raise ValidationError(f"Unsupported connector type: {connector_type}")

        # Make request with cleanup
        try:
            response = self._request(
                "POST", f"{API_BASE}/{JOBS_PATH}/", files=files, data=data
            )
        finally:
            # Tolerate file-like objects that lack a ``closed`` attribute.
            if (
                file_obj
                and hasattr(file_obj, "close")
                and not getattr(file_obj, "closed", False)
            ):
                file_obj.close()

        # The response carries either a job id (fetch the full job) or the
        # job payload itself.
        if "job_id" in response:
            return self._get_job_status(response["job_id"])

        return Job.from_dict(response["job"])

    def _get_job_status(self, job_id: str) -> Job:
        """Fetch the current state of a job."""
        response = self._request("GET", f"{API_BASE}/{JOBS_PATH}/{job_id}/")
        # The job may be nested under "job" or "document", or be the
        # response body itself.
        job_data = response.get("job", response.get("document", response))
        return Job.from_dict(job_data)

    def _list_jobs(self) -> JobList:
        """List all jobs."""
        response = self._request("GET", f"{API_BASE}/{JOBS_PATH}/")
        # The collection may be keyed as "jobs" or "documents".
        jobs_data = response.get("jobs", response.get("documents", []))
        jobs = [Job.from_dict(doc) for doc in jobs_data]
        return JobList(
            jobs=jobs,
            count=response.get("count", len(jobs)),
            detail=response.get("detail", ""),
        )

    def _wait_for_completion(
        self,
        job_id: str,
        timeout: Optional[float] = None,
        poll_interval: float = 2.0,
    ) -> Job:
        """
        Poll a job until it completes or fails.

        Args:
            job_id: The job ID
            timeout: Maximum number of seconds to wait; ``None`` (default)
                waits indefinitely
            poll_interval: Seconds to sleep between polls (default: 2)

        Returns:
            The completed Job

        Raises:
            JobProcessingError: If the job fails, or if ``timeout`` elapses
                before the job reaches a terminal state
        """
        deadline = None if timeout is None else time.monotonic() + timeout

        while True:
            job = self._get_job_status(job_id)

            if job.is_completed:
                return job

            if job.is_failed:
                raise JobProcessingError(
                    f"Job failed: {job.processing_error or 'Unknown error'}"
                )

            if deadline is not None and time.monotonic() >= deadline:
                raise JobProcessingError(
                    f"Timed out after {timeout} seconds waiting for job {job_id}"
                )

            time.sleep(poll_interval)

    def _download_result(self, job_id: str) -> bytes:
        """
        Download a job result.

        Returns:
            The raw response body

        Raises:
            JobProcessingError: If the server returns a JSON payload instead
                of a file (result not ready, or ready without a file)
            requests.HTTPError: If the server answers with a 4xx/5xx status
        """
        url = self._build_url(f"{API_BASE}/{JOBS_PATH}/{job_id}/result/")
        response = self._session.get(url, timeout=self.DEFAULT_TIMEOUT)
        # NOTE(review): this surfaces HTTP errors as requests.HTTPError,
        # unlike _request(), which maps them to ByteIT exceptions. Kept
        # as-is so the exception types callers may already catch are stable.
        response.raise_for_status()

        content_disposition = response.headers.get("Content-Disposition", "")
        content_type = response.headers.get("Content-Type", "")

        # An attachment is always the result file.
        if "attachment" in content_disposition:
            return response.content

        # A JSON body without an attachment header is treated as a status
        # payload (not ready or error), never as the result itself.
        # NOTE(review): presumably results in "json" format are always sent
        # as attachments - confirm against the server.
        if "application/json" in content_type:
            data = self._handle_response(response)
            if not data.get("ready", False):
                status = data.get("processing_status", "unknown")
                raise JobProcessingError(f"Result not available. Job status: {status}")
            raise JobProcessingError("Job ready but no result file returned")

        # Any other content type is treated as the file body.
        return response.content

    # ==================== HTTP HELPERS ====================

    def _build_url(self, path: str) -> str:
        """Join ``path`` onto BASE_URL with exactly one separating slash."""
        return f"{self.BASE_URL}/{path.lstrip('/')}"

    def _request(self, method: str, path: str, **kwargs: Any) -> Dict[str, Any]:
        """
        Make an HTTP request and return the decoded JSON body.

        ``kwargs`` are forwarded to ``Session.request``; ``timeout`` defaults
        to DEFAULT_TIMEOUT when not supplied.
        """
        url = self._build_url(path)
        kwargs.setdefault("timeout", self.DEFAULT_TIMEOUT)
        response = self._session.request(method, url, **kwargs)
        return self._handle_response(response)

    def _handle_response(self, response: requests.Response) -> Dict[str, Any]:
        """
        Return the JSON body of a successful response, or raise.

        Raises:
            ValidationError: HTTP 400
            AuthenticationError: HTTP 401
            APIKeyError: HTTP 403
            ResourceNotFoundError: HTTP 404
            RateLimitError: HTTP 429
            ServerError: HTTP 5xx
            ByteITError: Any other non-success status
        """
        status = response.status_code

        # Every 2xx status is a success (e.g. 202 Accepted, 204 No Content),
        # not only 200/201. An empty body decodes to an empty dict.
        if 200 <= status < 300:
            return response.json() if response.content else {}

        # Error path - extract details. ValueError covers every JSON decode
        # error raised by Response.json().
        try:
            payload = response.json() if response.content else {}
        except ValueError:
            payload = None

        if isinstance(payload, dict):
            data: Dict[str, Any] = payload
            message = data.get("detail", "") or response.text or "Request failed"
        else:
            # Body is not a JSON object (e.g. an HTML error page or a list).
            data = {}
            message = response.text or f"Request failed with status {status}"

        # Map status to exception
        error_map: Dict[int, Type[Exception]] = {
            400: ValidationError,
            401: AuthenticationError,
            403: APIKeyError,
            404: ResourceNotFoundError,
            429: RateLimitError,
        }

        exception_class = error_map.get(status)
        if exception_class:
            raise exception_class(message, status, data)

        if status >= 500:
            raise ServerError(message, status, data)

        raise ByteITError(message, status, data)

    # ==================== CONTEXT MANAGER ====================

    def close(self) -> None:
        """Close the underlying HTTP session."""
        self._session.close()

    def __enter__(self) -> "ByteITClient":
        """Context manager entry; returns the client itself."""
        return self

    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc_val: Optional[BaseException],
        exc_tb: Optional[TracebackType],
    ) -> None:
        """Context manager exit; closes the session, never suppresses errors."""
        self.close()
byteit/__init__.py ADDED
@@ -0,0 +1,51 @@
1
+ """ByteIT Python Client Library for text extraction."""
2
+
3
+ from .ByteITClient import ByteITClient
4
+ from .exceptions import (
5
+ APIKeyError,
6
+ AuthenticationError,
7
+ ByteITError,
8
+ JobProcessingError,
9
+ NetworkError,
10
+ RateLimitError,
11
+ ResourceNotFoundError,
12
+ ServerError,
13
+ ValidationError,
14
+ )
15
+ from .models.Job import Job
16
+ from .models.JobList import JobList
17
+ from .models.DocumentMetadata import DocumentMetadata
18
+ from .models.ProcessingOptions import ProcessingOptions
19
+ from .models.OutputFormat import OutputFormat
20
+ from .connectors import (
21
+ InputConnector,
22
+ OutputConnector,
23
+ LocalFileInputConnector,
24
+ LocalFileOutputConnector,
25
+ )
26
+ from .validations import validate_processing_options
27
+
28
__version__ = "0.1.0"

# Public API of the package, grouped by kind.
__all__ = [
    # Client
    "ByteITClient",
    # Models
    "Job",
    "JobList",
    "DocumentMetadata",
    "ProcessingOptions",
    "OutputFormat",
    # Connectors
    "InputConnector",
    "OutputConnector",
    "LocalFileInputConnector",
    "LocalFileOutputConnector",
    # Validation helpers
    "validate_processing_options",
    # Exceptions
    "ByteITError",
    "AuthenticationError",
    "APIKeyError",
    "ValidationError",
    "ResourceNotFoundError",
    "RateLimitError",
    "ServerError",
    "NetworkError",
    "JobProcessingError",
]
@@ -0,0 +1,63 @@
1
+ """Local file input connector for ByteIT."""
2
+
3
+ from pathlib import Path
4
+ from typing import Any, Dict, Tuple
5
+
6
+ from .base import InputConnector
7
+
8
+
9
class LocalFileInputConnector(InputConnector):
    """Input connector backed by a file on the local filesystem.

    The path is validated when the connector is constructed; the file is
    only opened when ``get_file_data()`` is called.

    Args:
        file_path: Path to the local file

    Raises:
        FileNotFoundError: Nothing exists at the given path
        ValueError: The path exists but is not a regular file

    Example:
        connector = LocalFileInputConnector("/path/to/document.pdf")
        result = client.parse(connector)
    """

    def __init__(self, file_path: str):
        """
        Validate and remember the file location.

        Args:
            file_path: Path to the local file to upload

        Raises:
            FileNotFoundError: If the file does not exist
            ValueError: If the path is not a file
        """
        candidate = Path(file_path)
        self.file_path = candidate

        if not candidate.is_file():
            # Distinguish "missing" from "present but not a regular file".
            if not candidate.exists():
                raise FileNotFoundError(f"File not found: {candidate}")
            raise ValueError(f"Path is not a file: {candidate}")

    def get_file_data(self) -> Tuple[str, Any]:
        """
        Open the file for upload.

        The returned file object is opened in binary mode and is NOT closed
        here; the caller is responsible for closing it.

        Returns:
            Tuple of (filename, file_object)
        """
        handle = self.file_path.open("rb")
        return self.file_path.name, handle

    def to_dict(self) -> Dict[str, Any]:
        """
        Describe this connector as a plain dictionary.

        Returns:
            Dictionary with the connector type and the file path as a string
        """
        return dict(type="localfile", path=str(self.file_path))
@@ -0,0 +1,25 @@
1
+ """ByteIT cloud storage output connector."""
2
+
3
+ from typing import Any, Dict
4
+
5
+ from .base import OutputConnector
6
+
7
+
8
class LocalFileOutputConnector(OutputConnector):
    """
    Output connector that stores results in ByteIT cloud storage.

    Results are stored on ByteIT servers and can be retrieved later
    using the job ID and ``ByteITClient.get_result()``.

    This is the default output connector if none is specified.
    """

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert to dictionary representation.

        Returns:
            Dictionary holding only the connector type, ``"localfile"``
        """
        return {"type": "localfile"}
@@ -0,0 +1,82 @@
1
+ """AWS S3 input connector for ByteIT."""
2
+
3
+ from pathlib import Path
4
+ from typing import Any, Dict, Tuple
5
+
6
+ from .base import InputConnector
7
+
8
+
9
class S3InputConnector(InputConnector):
    """AWS S3 input connector with IAM role authentication.

    Instructs ByteIT servers to fetch files directly from your S3 bucket
    using IAM role assumption. Files never pass through your local machine,
    providing faster processing and reduced bandwidth usage.

    Prerequisites:
        - Create an AWS connection in ByteIT dashboard
        - Provide an IAM role ARN that ByteIT can assume
        - Grant the role read access to your S3 bucket

    Args:
        source_bucket: S3 bucket name
        source_path_inside_bucket: Object key/path within bucket

    Note:
        No AWS credentials needed in client code - ByteIT uses the
        IAM role configured in your account settings.

    Example:
        connector = S3InputConnector(
            source_bucket="my-documents",
            source_path_inside_bucket="invoices/2024/jan.pdf"
        )
        result = client.parse(connector)
    """

    def __init__(
        self,
        source_bucket: str,
        source_path_inside_bucket: str,
    ):
        """
        Initialize S3 input connector.

        Args:
            source_bucket: S3 bucket name where the file is located
            source_path_inside_bucket: Path to the file within the bucket
                (e.g., "folder/file.pdf")
        """
        # Imported locally so this class does not depend on a change to the
        # module-level imports.
        from pathlib import PurePosixPath

        self.source_bucket = source_bucket
        self.source_path_inside_bucket = source_path_inside_bucket

        # Extract filename for display. S3 keys always use "/" as the
        # separator, so parse with POSIX rules on every platform; the
        # platform-specific Path would split on backslashes on Windows.
        self.filename = PurePosixPath(source_path_inside_bucket).name

    def get_file_data(self) -> Tuple[str, Dict[str, Any]]:
        """
        Return connection configuration for the ByteIT server.

        This method does NOT download the file. Instead, it returns metadata
        that tells the ByteIT server how to fetch the file from S3.

        Returns:
            Tuple of (filename, connection_data_dict)
        """
        connection_data = {
            "source_bucket": self.source_bucket,
            "source_path_inside_bucket": self.source_path_inside_bucket,
        }
        return (self.filename, connection_data)

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize connector configuration.

        Returns:
            Dictionary with the connector type ("s3") followed by the same
            connection fields that ``get_file_data()`` returns
        """
        _, connection_data = self.get_file_data()
        return {"type": "s3", **connection_data}
@@ -0,0 +1,53 @@
1
+ """AWS S3 output connector for ByteIT."""
2
+
3
+ from typing import Any, Dict
4
+
5
+ from .base import OutputConnector
6
+
7
+
8
class S3OutputConnector(OutputConnector):
    """
    Output connector that has ByteIT write results to Amazon S3.

    The ByteIT server saves processed results directly to your S3 bucket;
    the result does NOT pass through your local machine.

    Prerequisites:
        You must first create an AWS connection in ByteIT by providing an IAM
        role ARN that ByteIT can assume to access your S3 bucket.

    Note:
        The ByteIT server uses the IAM role configured in your AWS connection
        to write to the bucket. No AWS credentials are needed in client code.
    """

    def __init__(
        self,
        bucket: str,
        path: str = "",
    ):
        """
        Store the destination bucket and normalized key prefix.

        Args:
            bucket: S3 bucket name where results will be saved
            path: path prefix within the bucket (e.g., "results/" or
                "processed/2024/"). A non-empty prefix without a trailing
                slash gets one appended; an empty prefix is kept empty.
        """
        self.bucket = bucket

        prefix = path
        if prefix and not prefix.endswith("/"):
            prefix = prefix + "/"
        self.path = prefix

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize connector configuration for the API.

        Returns:
            Dictionary with connector type and configuration matching API format:
            {"type": "s3", "bucket": "bucket-name", "path": "output/path/"}
        """
        return dict(type="s3", bucket=self.bucket, path=self.path)
@@ -0,0 +1,26 @@
1
+ """Connector classes for ByteIT file input and output operations."""
2
+
3
+ from .base import InputConnector, OutputConnector
4
+ from .LocalFileInputConnector import LocalFileInputConnector
5
+ from .LocalFileOutputConnector import LocalFileOutputConnector
6
+
7
# S3 connectors are imported defensively and only exported when the import
# succeeds; on ImportError the names are bound to None.
# NOTE(review): the S3 connector modules shipped in this package appear to
# import only pathlib/typing and .base - not boto3 - so this guard looks
# like a leftover; confirm before relying on _s3_available.
try:
    from .S3InputConnector import S3InputConnector
    from .S3OutputConnector import S3OutputConnector

    _s3_available = True
except ImportError:
    _s3_available = False
    S3InputConnector = None  # type: ignore
    S3OutputConnector = None  # type: ignore

__all__ = [
    "InputConnector",
    "OutputConnector",
    "LocalFileInputConnector",
    "LocalFileOutputConnector",
]

if _s3_available:
    __all__.extend(["S3InputConnector", "S3OutputConnector"])