byteit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,57 @@
1
+ """Base classes for ByteIT connectors."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import Any, Dict, Tuple
5
+
6
+
7
class InputConnector(ABC):
    """Interface every ByteIT input source has to satisfy.

    An input connector describes how ByteIT gets hold of the documents it
    should process. Concrete connectors cover local files, S3 buckets and
    other data sources.

    Required overrides:
        get_file_data: supply the file data for upload (or connection info).
        to_dict: serialize the connector configuration for the API.
    """

    @abstractmethod
    def get_file_data(self) -> Tuple[str, Any]:
        """Provide the file data that will be uploaded.

        Returns:
            A ``(filename, file_object)`` pair in the shape expected by
            ``requests.files``.
        """
        ...

    @abstractmethod
    def to_dict(self) -> Dict[str, Any]:
        """Serialize this connector's configuration for API submission.

        Returns:
            The connector configuration as a dictionary.
        """
        ...
37
+
38
+
39
class OutputConnector(ABC):
    """Interface every ByteIT output destination has to satisfy.

    An output connector describes where ByteIT stores processed results.
    Concrete connectors cover local storage, S3 buckets and other
    destinations.

    Required overrides:
        to_dict: serialize the connector configuration for the API.
    """

    @abstractmethod
    def to_dict(self) -> Dict[str, Any]:
        """Serialize this connector's configuration for API submission.

        Returns:
            The connector configuration as a dictionary.
        """
        ...
byteit/exceptions.py ADDED
@@ -0,0 +1,107 @@
1
+ """Custom exceptions for the ByteIT client library."""
2
+
3
+ from typing import Any
4
+
5
+
6
+ class ByteITError(Exception):
7
+ """Base exception for all ByteIT API errors.
8
+
9
+ All ByteIT exceptions inherit from this class, making it easy to catch
10
+ any ByteIT-related error with a single except clause.
11
+
12
+ Attributes:
13
+ message: Human-readable error description
14
+ status_code: HTTP status code if available
15
+ response: Full API response data if available
16
+ """
17
+
18
+ def __init__(
19
+ self,
20
+ message: str,
21
+ status_code: int | None = None,
22
+ response: dict[str, Any] | None = None,
23
+ ):
24
+ super().__init__(message)
25
+ self.message = message
26
+ self.status_code = status_code
27
+ self.response = response
28
+
29
+
30
class AuthenticationError(ByteITError):
    """Credentials were rejected or not supplied.

    Signals that an API request failed because of invalid or missing
    credentials. Verify the API key and make sure it is properly configured.
    """
38
+
39
+
40
class APIKeyError(AuthenticationError):
    """The API key did not pass validation.

    Signals that the supplied API key is invalid, expired, or missing.
    Keys can be verified at https://byteit.ai/dashboard.
    """
48
+
49
+
50
class ValidationError(ByteITError):
    """The request did not pass validation.

    Signals that request parameters are invalid or that required fields are
    missing. The error message names the parameters that need correction.
    """
58
+
59
+
60
class ResourceNotFoundError(ByteITError):
    """The requested resource could not be found.

    Signals an attempt to access a job or resource that either does not
    exist or that the caller has no permission to access.
    """
68
+
69
+
70
class RateLimitError(ByteITError):
    """The API rate limit was exceeded.

    Signals that the caller went over its API rate limits. Wait before
    retrying, or contact support to have the limits increased.
    """
78
+
79
+
80
class ServerError(ByteITError):
    """The ByteIT servers reported an internal error.

    Signals a server-side failure (5xx status codes). Such errors are
    usually temporary, so retrying after a brief delay is appropriate.
    """
88
+
89
+
90
class NetworkError(ByteITError):
    """The ByteIT servers could not be reached.

    Signals a network communication problem. Check the internet connection
    and firewall settings.
    """
98
+
99
+
100
class JobProcessingError(ByteITError):
    """A document processing job did not succeed.

    Signals that a job failed or could not be completed. The error message
    carries the specific details of the failure.
    """
@@ -0,0 +1,25 @@
1
+ """Data model for ByteIT Document Metadata."""
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Optional
5
+
6
+
7
@dataclass
class DocumentMetadata:
    """Descriptive information about the original document being processed.

    Attributes:
        original_filename: Name the file had when it was uploaded.
        document_type: Type/format of the document (pdf, docx, etc.).
        page_count: Number of pages, if applicable; ``None`` otherwise.
        language: Document language code; defaults to ``'en'``.
        encoding: Character encoding; defaults to ``'utf-8'``.
    """

    # Required fields.
    original_filename: str
    document_type: str

    # Optional fields with defaults.
    page_count: Optional[int] = None
    language: str = "en"
    encoding: str = "utf-8"
byteit/models/Job.py ADDED
@@ -0,0 +1,153 @@
1
+ """Data model for ByteIT Job."""
2
+
3
+ from dataclasses import dataclass
4
+ from datetime import datetime
5
+ from typing import Any, Dict, Optional, cast
6
+ from byteit.models.DocumentMetadata import DocumentMetadata
7
+ from byteit.models.ProcessingOptions import ProcessingOptions
8
+
9
+
10
@dataclass
class Job:
    """Document processing job.

    Represents a document parsing job in the ByteIT system, tracking its
    status, configuration, and results.

    Attributes:
        id: Unique job identifier
        created_at: Job creation timestamp
        updated_at: Last update timestamp
        processing_status: Current status (pending, processing, completed, failed)
        result_format: Output format (txt, json, md, html)
        owner_user_id: ID of the user who created the job
        file_data: Original file information
        file_hash: Hash of the input file
        nickname: Optional user-defined job name
        metadata: Document metadata (filename, type, pages, etc.)
        processing_options: Job configuration options
        processing_error: Error message if job failed
        storage_path: Internal storage location
        result_path: Path to processed result
        input_connector: Type of input connector used
        input_connection_data: Input connector configuration
        output_connector: Type of output connector used
        output_connection_data: Output connector configuration
        started_processing_at: Processing start time
        finished_processing_at: Processing completion time

    Properties:
        is_completed: True if job finished successfully
        is_failed: True if job failed
        is_processing: True if job is pending or currently being processed
    """

    id: str
    created_at: datetime
    updated_at: datetime
    processing_status: str
    result_format: str
    owner_user_id: Optional[str] = None
    file_data: Optional[str] = None
    file_hash: Optional[str] = None
    nickname: Optional[str] = None
    metadata: Optional[DocumentMetadata] = None
    processing_options: Optional[ProcessingOptions] = None
    processing_error: Optional[str] = None
    storage_path: Optional[str] = None
    result_path: Optional[str] = None
    input_connector: Optional[str] = None
    input_connection_data: Optional[Dict[str, Any]] = None
    output_connector: Optional[str] = None
    output_connection_data: Optional[Dict[str, Any]] = None
    started_processing_at: Optional[datetime] = None
    finished_processing_at: Optional[datetime] = None

    @property
    def is_completed(self) -> bool:
        """Return True when ``processing_status`` is ``"completed"``."""
        return self.processing_status == "completed"

    @property
    def is_failed(self) -> bool:
        """Return True when ``processing_status`` is ``"failed"``."""
        return self.processing_status == "failed"

    @property
    def is_processing(self) -> bool:
        """Return True while the job is not finished yet.

        Both ``"pending"`` and ``"processing"`` count as in progress.
        """
        return self.processing_status in ("pending", "processing")

    @staticmethod
    def _parse_timestamp(value: Any) -> Optional[datetime]:
        """Turn an API timestamp into a ``datetime``; return None if absent.

        Strings are parsed as ISO-8601, ``datetime`` instances are passed
        through unchanged, and anything else is treated as "no timestamp".

        Raises:
            ValueError: If ``value`` is a string that is not valid ISO-8601.
        """
        if isinstance(value, datetime):
            return value
        if not isinstance(value, str):
            return None
        # ``fromisoformat`` only understands a "Z" suffix from Python 3.11
        # on, so spell the UTC designator as an explicit offset.
        return datetime.fromisoformat(value.replace("Z", "+00:00"))

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "Job":
        """Create a Job instance from API response data.

        Args:
            data: Job payload. ``id``, ``processing_status`` and
                ``result_format`` are required; every other key is optional.

        Returns:
            The populated ``Job``.

        Raises:
            KeyError: If a required key is missing from ``data``.
            ValueError: If a timestamp string is not valid ISO-8601.
        """
        # created_at/updated_at are mandatory fields, so a payload without
        # them falls back to the current time. The fallback is timezone-aware
        # so it stays comparable with the offset-aware values parsed from
        # "...Z" timestamps (naive and aware datetimes cannot be compared).
        now = datetime.now().astimezone()

        created_at = cls._parse_timestamp(data.get("created_at"))
        if created_at is None:
            created_at = now

        updated_at = cls._parse_timestamp(data.get("updated_at"))
        if updated_at is None:
            updated_at = now

        started_processing_at = cls._parse_timestamp(
            data.get("started_processing_at")
        )
        finished_processing_at = cls._parse_timestamp(
            data.get("finished_processing_at")
        )

        # Metadata is best-effort: a malformed block must not prevent the
        # job itself from being returned.
        metadata = None
        raw_metadata = data.get("metadata")
        if raw_metadata and isinstance(raw_metadata, dict):
            metadata_dict = cast(Dict[str, Any], raw_metadata)
            try:
                metadata = DocumentMetadata(
                    original_filename=metadata_dict.get("original_filename", ""),
                    document_type=metadata_dict.get("document_type", ""),
                    page_count=metadata_dict.get("page_count"),
                    language=metadata_dict.get("language", "en"),
                    encoding=metadata_dict.get("encoding", "utf-8"),
                )
            except Exception as exc:  # deliberately broad, see comment above
                # Report through logging rather than print() so applications
                # embedding the library decide where the warning goes.
                import logging

                logging.getLogger(__name__).warning(
                    "Failed to parse metadata: %s", exc
                )
                metadata = None

        # Errors raised by ProcessingOptions.from_dict propagate to the caller.
        processing_options = None
        processing_options_data = data.get("processing_options")
        if processing_options_data and isinstance(processing_options_data, dict):
            processing_options = ProcessingOptions.from_dict(processing_options_data)

        return cls(
            id=data["id"],
            created_at=created_at,
            updated_at=updated_at,
            processing_status=data["processing_status"],
            result_format=data["result_format"],
            owner_user_id=data.get("owner_user_id"),
            file_data=data.get("file_data"),
            file_hash=data.get("file_hash"),
            nickname=data.get("nickname"),
            metadata=metadata,
            processing_options=processing_options,
            processing_error=data.get("processing_error"),
            storage_path=data.get("storage_path"),
            result_path=data.get("result_path"),
            input_connector=data.get("input_connector"),
            input_connection_data=data.get("input_connection_data"),
            output_connector=data.get("output_connector"),
            output_connection_data=data.get("output_connection_data"),
            started_processing_at=started_processing_at,
            finished_processing_at=finished_processing_at,
        )
@@ -0,0 +1,21 @@
1
+ """Data models for ByteIT API responses."""
2
+
3
+ from dataclasses import dataclass
4
+ from byteit.models.Job import Job
5
+
6
+
7
@dataclass
class JobList:
    """A batch of jobs together with accompanying metadata.

    This is the value handed back by list operations that contain
    multiple jobs.

    Attributes:
        jobs: The Job objects in this collection.
        count: Total number of jobs.
        detail: Additional information or messages.
    """

    jobs: list[Job]
    count: int
    detail: str
@@ -0,0 +1,16 @@
1
+ """Output format enumeration for document processing."""
2
+
3
+ from enum import Enum
4
+
5
+
6
class OutputFormat(str, Enum):
    """Output formats that document processing can produce.

    Members are also ``str`` instances, so they compare equal to their raw
    values (``OutputFormat.TXT == "txt"``).
    """

    TXT = "txt"
    JSON = "json"
    HTML = "html"
    MD = "md"

    def __str__(self) -> str:
        """Render the member as its raw value, e.g. ``"txt"``."""
        raw_value: str = self.value
        return raw_value
@@ -0,0 +1,96 @@
1
+ """Processing options model for document processing."""
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import Any, Dict, List, Union
5
+
6
+ from byteit.models.OutputFormat import OutputFormat
7
+
8
+
9
def _default_list() -> List[str]:
    """Build the default language list, a fresh ``["en"]`` on every call."""
    default_languages: List[str] = ["en"]
    return default_languages
12
+
13
+
14
@dataclass
class ProcessingOptions:
    """Document processing configuration.

    Specifies how documents should be processed by ByteIT.

    Attributes:
        languages: List of language codes for OCR/parsing (default: ['en'])
        page_range: Specific pages to process (e.g., '1-5' or '1,3,5')
        output_format: Desired output format (txt, json, html, md)

    Note:
        The output_format is extracted and sent separately in API requests,
        while languages and page_range are sent as processing_options.
    """

    languages: List[str] = field(default_factory=_default_list)
    page_range: str = field(default="")
    output_format: Union[OutputFormat, str] = OutputFormat.TXT

    def __post_init__(self) -> None:
        """Normalize ``output_format`` to an ``OutputFormat`` member.

        Raises:
            ValueError: If ``output_format`` is a string that does not name
                a supported format.
        """
        if isinstance(self.output_format, OutputFormat):
            return
        if isinstance(self.output_format, str):
            try:
                # The dataclass is not frozen, so plain assignment suffices.
                self.output_format = OutputFormat(self.output_format)
            except ValueError as exc:
                # Derive the list from the enum so the message cannot drift
                # out of sync when formats are added.
                valid_formats = ", ".join(fmt.value for fmt in OutputFormat)
                raise ValueError(
                    f"Invalid output format: {self.output_format}. "
                    f"Valid formats are: {valid_formats}"
                ) from exc

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert ProcessingOptions to dictionary for API communication.

        Empty ``languages`` and ``page_range`` values are omitted.

        Note: output_format is included here but will be extracted by the
        API client and sent as a top-level parameter.

        Returns:
            Dictionary representation suitable for API requests
        """
        result: Dict[str, Any] = {}

        if self.languages:
            # Copy so that callers editing the payload cannot mutate the
            # options object behind its back.
            result["languages"] = list(self.languages)

        if self.page_range:
            result["page_range"] = self.page_range

        # Include output_format for extraction by API client
        if isinstance(self.output_format, OutputFormat):
            result["output_format"] = self.output_format.value
        else:
            result["output_format"] = str(self.output_format)

        return result

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ProcessingOptions":
        """
        Create ProcessingOptions from dictionary.

        Keys that are missing or explicitly ``None`` fall back to the field
        defaults (``['en']``, ``''`` and ``txt`` respectively).

        Args:
            data: Dictionary containing processing options

        Returns:
            ProcessingOptions instance

        Raises:
            ValueError: If ``output_format`` names an unsupported format.
        """
        languages = data.get("languages")
        page_range = data.get("page_range")
        output_format = data.get("output_format")

        # Conversion and validation of output_format happen in
        # __post_init__, so both construction paths report errors the same way.
        return cls(
            languages=["en"] if languages is None else languages,
            page_range="" if page_range is None else page_range,
            output_format=OutputFormat.TXT if output_format is None else output_format,
        )
byteit/validations.py ADDED
@@ -0,0 +1,42 @@
1
+ """Validation utilities for ByteIT API requests."""
2
+
3
+ from typing import Any, Dict, List, Set
4
+
5
+ from .exceptions import ValidationError
6
+
7
+
8
# The only keys permitted inside ``processing_options``.
VALID_PROCESSING_OPTIONS: Set[str] = {"languages", "page_range"}


def validate_processing_options(options: Dict[str, Any]) -> None:
    """Reject processing options that contain unknown keys.

    ``processing_options`` may only carry 'languages' and 'page_range'.
    'output_format' belongs at the top level of the request, not inside
    this dictionary, and is therefore rejected here as well.

    Args:
        options: Processing options dictionary to check.

    Raises:
        ValidationError: If ``options`` contains any key outside
            ``VALID_PROCESSING_OPTIONS``.
    """
    rejected: List[str] = sorted(
        name for name in options if name not in VALID_PROCESSING_OPTIONS
    )
    if not rejected:
        return

    valid_fields = ", ".join(sorted(VALID_PROCESSING_OPTIONS))
    unexpected = ", ".join(rejected)
    raise ValidationError(
        f"Unexpected processing option fields: {unexpected}. "
        f"Valid fields are: {valid_fields}"
    )
+ )