PyPI - byteit - Versions diffs - 0.1.0__py3-none-any.whl - Mend

byteit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

byteit/ByteITClient.py +382 -0
byteit/__init__.py +51 -0
byteit/connectors/LocalFileInputConnector.py +63 -0
byteit/connectors/LocalFileOutputConnector.py +25 -0
byteit/connectors/S3InputConnector.py +82 -0
byteit/connectors/S3OutputConnector.py +53 -0
byteit/connectors/__init__.py +26 -0
byteit/connectors/base.py +57 -0
byteit/exceptions.py +107 -0
byteit/models/DocumentMetadata.py +25 -0
byteit/models/Job.py +153 -0
byteit/models/JobList.py +21 -0
byteit/models/OutputFormat.py +16 -0
byteit/models/ProcessingOptions.py +96 -0
byteit/validations.py +42 -0
byteit-0.1.0.dist-info/LICENSE +201 -0
byteit-0.1.0.dist-info/METADATA +424 -0
byteit-0.1.0.dist-info/RECORD +20 -0
byteit-0.1.0.dist-info/WHEEL +5 -0
byteit-0.1.0.dist-info/top_level.txt +1 -0

byteit/connectors/base.py ADDED Viewed

@@ -0,0 +1,57 @@
+"""Base classes for ByteIT connectors."""
+from abc import ABC, abstractmethod
+from typing import Any, Dict, Tuple
+class InputConnector(ABC):
+    """Contract that every ByteIT input source must fulfil.
+    An input connector tells ByteIT how to reach the document that should be
+    processed, for example a local file or an object in an S3 bucket.
+    Concrete subclasses have to override:
+        - get_file_data(): supply the file data for upload or connection info
+        - to_dict(): serialize the connector configuration for the API
+    """
+    @abstractmethod
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serialize this connector's configuration for API submission.
+        Returns:
+            The connector configuration as a dictionary
+        """
+    @abstractmethod
+    def get_file_data(self) -> Tuple[str, Any]:
+        """
+        Provide the file data to upload.
+        Returns:
+            A (filename, file_object) tuple suitable for requests.files
+        """
+class OutputConnector(ABC):
+    """Contract that every ByteIT output destination must fulfil.
+    An output connector tells ByteIT where processed results are stored,
+    for example local storage or an S3 bucket.
+    Concrete subclasses have to override:
+        - to_dict(): serialize the connector configuration for the API
+    """
+    @abstractmethod
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serialize this connector's configuration for API submission.
+        Returns:
+            The connector configuration as a dictionary
+        """

byteit/exceptions.py ADDED Viewed

@@ -0,0 +1,107 @@
+"""Custom exceptions for the ByteIT client library."""
+from typing import Any
+class ByteITError(Exception):
+    """Root of the ByteIT exception hierarchy.
+    Every other ByteIT exception derives from this class, so a single
+    ``except ByteITError`` clause catches any ByteIT-related failure.
+    Attributes:
+        message: Human-readable description of the error
+        status_code: HTTP status code, or None when not available
+        response: Full API response data, or None when not available
+    """
+    def __init__(
+        self,
+        message: str,
+        status_code: int | None = None,
+        response: dict[str, Any] | None = None,
+    ):
+        # Keep the optional request context on the instance ...
+        self.response = response
+        self.status_code = status_code
+        self.message = message
+        # ... and let Exception carry the message in args / str().
+        super().__init__(message)
+class AuthenticationError(ByteITError):
+    """Credentials were rejected.
+    Signals that an API request failed because the credentials are invalid
+    or missing. Check that your API key is correct and properly configured.
+    """
+class APIKeyError(AuthenticationError):
+    """The API key did not pass validation.
+    Signals that the supplied API key is invalid, expired, or missing.
+    You can verify your API key at https://byteit.ai/dashboard.
+    """
+class ValidationError(ByteITError):
+    """The request did not pass validation.
+    Signals that request parameters are invalid or that required fields are
+    missing. The error message names the parameters that need correction.
+    """
+class ResourceNotFoundError(ByteITError):
+    """The requested resource could not be found.
+    Signals an attempt to access a job or resource that either does not
+    exist or that you are not permitted to access.
+    """
+class RateLimitError(ByteITError):
+    """The API rate limit was exceeded.
+    Signals that you have gone over your API rate limits. Wait before
+    retrying, or contact support to have your limits increased.
+    """
+class ServerError(ByteITError):
+    """The server failed to handle the request.
+    Signals an internal error on the ByteIT servers (5xx status codes).
+    Such errors are usually temporary, so retry after a brief delay.
+    """
+class NetworkError(ByteITError):
+    """Communication with the server failed.
+    Signals that the ByteIT servers could not be reached because of network
+    issues. Check your internet connection and firewall settings.
+    """
+class JobProcessingError(ByteITError):
+    """A processing job did not succeed.
+    Signals that a document processing job failed or could not be completed.
+    The error message carries the specific details of the failure.
+    """

byteit/models/DocumentMetadata.py ADDED Viewed

@@ -0,0 +1,25 @@
+"""Data model for ByteIT Document Metadata."""
+from dataclasses import dataclass
+from typing import Optional
+@dataclass
+class DocumentMetadata:
+    """Descriptive facts about the original document behind a job.
+    Attributes:
+        original_filename: Name the file had when it was uploaded
+        document_type: Type/format of the document (pdf, docx, etc.)
+        page_count: Number of pages, or None when not applicable (default)
+        language: Language code of the document (default: 'en')
+        encoding: Character encoding of the document (default: 'utf-8')
+    """
+    # Required: always supplied by the caller.
+    original_filename: str
+    document_type: str
+    # Optional: fall back to the defaults below when omitted.
+    page_count: Optional[int] = None
+    language: str = "en"
+    encoding: str = "utf-8"

byteit/models/Job.py ADDED Viewed

@@ -0,0 +1,153 @@
+"""Data model for ByteIT Job."""
+from dataclasses import dataclass
+from datetime import datetime
+from typing import Any, Dict, Optional, cast
+from byteit.models.DocumentMetadata import DocumentMetadata
+from byteit.models.ProcessingOptions import ProcessingOptions
+@dataclass
+class Job:
+    """Document processing job.
+    Represents a document parsing job in the ByteIT system, tracking its
+    status, configuration, and results.
+    Attributes:
+        id: Unique job identifier
+        created_at: Job creation timestamp
+        updated_at: Last update timestamp
+        processing_status: Current status (pending, processing, completed, failed)
+        result_format: Output format (txt, json, md, html)
+        owner_user_id: ID of the user who created the job
+        file_data: Original file information
+        file_hash: Hash of the input file
+        nickname: Optional user-defined job name
+        metadata: Document metadata (filename, type, pages, etc.)
+        processing_options: Job configuration options
+        processing_error: Error message if job failed
+        storage_path: Internal storage location
+        result_path: Path to processed result
+        input_connector: Type of input connector used
+        input_connection_data: Input connector configuration
+        output_connector: Type of output connector used
+        output_connection_data: Output connector configuration
+        started_processing_at: Processing start time
+        finished_processing_at: Processing completion time
+    Properties:
+        is_completed: True if processing_status is "completed"
+        is_failed: True if processing_status is "failed"
+        is_processing: True if processing_status is "pending" or "processing"
+    """
+    id: str
+    created_at: datetime
+    updated_at: datetime
+    processing_status: str
+    result_format: str
+    owner_user_id: Optional[str] = None
+    file_data: Optional[str] = None
+    file_hash: Optional[str] = None
+    nickname: Optional[str] = None
+    metadata: Optional[DocumentMetadata] = None
+    processing_options: Optional[ProcessingOptions] = None
+    processing_error: Optional[str] = None
+    storage_path: Optional[str] = None
+    result_path: Optional[str] = None
+    input_connector: Optional[str] = None
+    input_connection_data: Optional[Dict[str, Any]] = None
+    output_connector: Optional[str] = None
+    output_connection_data: Optional[Dict[str, Any]] = None
+    started_processing_at: Optional[datetime] = None
+    finished_processing_at: Optional[datetime] = None
+    @property
+    def is_completed(self) -> bool:
+        """Check if the job is completed."""
+        return self.processing_status == "completed"
+    @property
+    def is_failed(self) -> bool:
+        """Check if the job failed."""
+        return self.processing_status == "failed"
+    @property
+    def is_processing(self) -> bool:
+        """Check if the job is still pending or currently processing."""
+        return self.processing_status in ("pending", "processing")
+    @staticmethod
+    def _parse_timestamp(value: Any) -> Any:
+        """Parse an ISO 8601 string into a datetime; return any other value unchanged."""
+        if isinstance(value, str):
+            # "Z" is rewritten to "+00:00" because datetime.fromisoformat
+            # only understands the "Z" suffix from Python 3.11 on.
+            return datetime.fromisoformat(value.replace("Z", "+00:00"))
+        return value
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "Job":
+        """Create a Job instance from API response data.
+        Args:
+            data: Job dictionary; "id", "processing_status" and
+                "result_format" are required, every other key is optional
+        Returns:
+            Job populated from ``data``
+        Raises:
+            KeyError: If one of the required keys is missing
+            ValueError: If a timestamp string cannot be parsed by
+                datetime.fromisoformat
+        """
+        import logging
+        # created_at / updated_at are mandatory on the dataclass, so a missing
+        # or unusable value falls back to the current time. Values that are
+        # already datetime objects are kept as they are.
+        # NOTE(review): the fallback is a naive local timestamp, whereas parsed
+        # "Z" strings are timezone-aware - confirm which one callers expect.
+        created_at = cls._parse_timestamp(data.get("created_at"))
+        if not isinstance(created_at, datetime):
+            created_at = datetime.now()
+        updated_at = cls._parse_timestamp(data.get("updated_at"))
+        if not isinstance(updated_at, datetime):
+            updated_at = datetime.now()
+        # The processing timestamps are optional: strings are parsed, anything
+        # else (usually None) is passed through untouched.
+        started_processing_at = cls._parse_timestamp(data.get("started_processing_at"))
+        finished_processing_at = cls._parse_timestamp(data.get("finished_processing_at"))
+        # Parse metadata (only a non-empty dict is considered)
+        metadata = None
+        raw_metadata = data.get("metadata")
+        if raw_metadata and isinstance(raw_metadata, dict):
+            metadata_dict = cast(Dict[str, Any], raw_metadata)
+            try:
+                metadata = DocumentMetadata(
+                    original_filename=metadata_dict.get("original_filename", ""),
+                    document_type=metadata_dict.get("document_type", ""),
+                    page_count=metadata_dict.get("page_count"),
+                    language=metadata_dict.get("language", "en"),
+                    encoding=metadata_dict.get("encoding", "utf-8"),
+                )
+            except Exception as e:
+                # Best effort: broken metadata must not prevent building the
+                # Job. Reported through logging rather than print() so that
+                # applications using the library control where it ends up.
+                logging.getLogger(__name__).warning(
+                    "Failed to parse metadata: %s", e
+                )
+                metadata = None
+        # Parse processing options (only a non-empty dict is considered)
+        processing_options = None
+        processing_options_data = data.get("processing_options")
+        if processing_options_data and isinstance(processing_options_data, dict):
+            processing_options = ProcessingOptions.from_dict(processing_options_data)
+        return cls(
+            id=data["id"],
+            created_at=created_at,
+            updated_at=updated_at,
+            processing_status=data["processing_status"],
+            result_format=data["result_format"],
+            owner_user_id=data.get("owner_user_id"),
+            file_data=data.get("file_data"),
+            file_hash=data.get("file_hash"),
+            nickname=data.get("nickname"),
+            metadata=metadata,
+            processing_options=processing_options,
+            processing_error=data.get("processing_error"),
+            storage_path=data.get("storage_path"),
+            result_path=data.get("result_path"),
+            input_connector=data.get("input_connector"),
+            input_connection_data=data.get("input_connection_data"),
+            output_connector=data.get("output_connector"),
+            output_connection_data=data.get("output_connection_data"),
+            started_processing_at=started_processing_at,
+            finished_processing_at=finished_processing_at,
+        )

byteit/models/JobList.py ADDED Viewed

@@ -0,0 +1,21 @@
+"""Data models for ByteIT API responses."""
+from dataclasses import dataclass
+from byteit.models.Job import Job
+@dataclass
+class JobList:
+    """Result container for operations that list several jobs.
+    Attributes:
+        jobs: The Job objects in this listing
+        count: Total number of jobs
+        detail: Additional information or messages
+    """
+    jobs: list[Job]
+    count: int
+    detail: str

byteit/models/OutputFormat.py ADDED Viewed

@@ -0,0 +1,16 @@
+"""Output format enumeration for document processing."""
+from enum import Enum
+class OutputFormat(str, Enum):
+    """Output formats a processed document can be produced in.
+    Members are also str instances, so they compare equal to their raw value.
+    """
+    TXT = "txt"
+    JSON = "json"
+    HTML = "html"
+    MD = "md"
+    def __str__(self) -> str:
+        """Render the member as its raw value (e.g. "txt")."""
+        return str(self.value)

byteit/models/ProcessingOptions.py ADDED Viewed

@@ -0,0 +1,96 @@
+"""Processing options model for document processing."""
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Union
+from byteit.models.OutputFormat import OutputFormat
+def _default_list() -> List[str]:
+    """Build a fresh default language list containing only 'en'."""
+    defaults: List[str] = ["en"]
+    return defaults
+@dataclass
+class ProcessingOptions:
+    """Document processing configuration.
+    Specifies how documents should be processed by ByteIT.
+    Attributes:
+        languages: List of language codes for OCR/parsing (default: ['en'])
+        page_range: Specific pages to process (e.g., '1-5' or '1,3,5')
+        output_format: Desired output format (txt, json, html, md)
+    Note:
+        The output_format is extracted and sent separately in API requests,
+        while languages and page_range are sent as processing_options.
+    """
+    languages: List[str] = field(default_factory=_default_list)
+    page_range: str = field(default="")
+    output_format: Union[OutputFormat, str] = OutputFormat.TXT
+    @staticmethod
+    def _coerce_output_format(value: Any) -> OutputFormat:
+        """Turn a raw value into an OutputFormat or raise a descriptive ValueError."""
+        try:
+            return OutputFormat(value)
+        except ValueError as exc:
+            # Derive the list from the enum so the message cannot go stale.
+            valid = ", ".join(fmt.value for fmt in OutputFormat)
+            raise ValueError(
+                f"Invalid output format: {value}. "
+                f"Valid formats are: {valid}"
+            ) from exc
+    def __post_init__(self) -> None:
+        """Normalize a string output_format to its OutputFormat member.
+        Raises:
+            ValueError: If output_format is a string naming no known format
+        """
+        # OutputFormat subclasses str, so enum members take this branch too;
+        # looking a member up by itself just returns that member.
+        if isinstance(self.output_format, str):
+            self.output_format = self._coerce_output_format(self.output_format)
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Convert ProcessingOptions to dictionary for API communication.
+        Empty languages / page_range values are left out. output_format is
+        always included here but will be extracted by the API client and
+        sent as a top-level parameter.
+        Returns:
+            Dictionary representation suitable for API requests
+        """
+        result: Dict[str, Any] = {}
+        if self.languages:
+            result["languages"] = self.languages
+        if self.page_range:
+            result["page_range"] = self.page_range
+        # Include output_format for extraction by API client. The str()
+        # fallback covers a value assigned after construction.
+        if isinstance(self.output_format, OutputFormat):
+            result["output_format"] = self.output_format.value
+        else:
+            result["output_format"] = str(self.output_format)
+        return result
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "ProcessingOptions":
+        """
+        Create ProcessingOptions from dictionary.
+        Keys that are missing or explicitly None fall back to the defaults
+        (['en'], '' and 'txt').
+        Args:
+            data: Dictionary containing processing options
+        Returns:
+            ProcessingOptions instance
+        Raises:
+            ValueError: If output_format names no known format
+        """
+        languages = data.get("languages")
+        if languages is None:
+            languages = ["en"]
+        page_range = data.get("page_range")
+        if page_range is None:
+            page_range = ""
+        raw_format = data.get("output_format")
+        if raw_format is None:
+            raw_format = "txt"
+        return cls(
+            languages=languages,
+            page_range=page_range,
+            output_format=cls._coerce_output_format(raw_format),
+        )

byteit/validations.py ADDED Viewed

@@ -0,0 +1,42 @@
+"""Validation utilities for ByteIT API requests."""
+from typing import Any, Dict, List, Set
+from .exceptions import ValidationError
+# Keys accepted inside processing_options: only languages and page_range
+VALID_PROCESSING_OPTIONS: Set[str] = {"languages", "page_range"}
+def validate_processing_options(options: Dict[str, Any]) -> None:
+    """
+    Validate processing options dictionary.
+    Only 'languages' and 'page_range' are allowed in processing_options.
+    The 'output_format' should be passed as a top-level parameter, not
+    inside processing_options.
+    Args:
+        options: Processing options dictionary to validate
+    Raises:
+        ValidationError: If any key outside VALID_PROCESSING_OPTIONS is found
+    """
+    # Keys are stringified so that a non-string key is reported through
+    # ValidationError as well, instead of making sorted()/join() raise TypeError.
+    unexpected_fields: List[str] = sorted(
+        str(name) for name in options if name not in VALID_PROCESSING_OPTIONS
+    )
+    if not unexpected_fields:
+        return
+    valid_fields = ", ".join(sorted(VALID_PROCESSING_OPTIONS))
+    unexpected = ", ".join(unexpected_fields)
+    raise ValidationError(
+        f"Unexpected processing option fields: {unexpected}. "
+        f"Valid fields are: {valid_fields}"
+    )