leapocr 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106) hide show
  1. leapocr/__init__.py +91 -0
  2. leapocr/_internal/__init__.py +1 -0
  3. leapocr/_internal/polling.py +130 -0
  4. leapocr/_internal/retry.py +101 -0
  5. leapocr/_internal/upload.py +118 -0
  6. leapocr/_internal/utils.py +51 -0
  7. leapocr/_internal/validation.py +172 -0
  8. leapocr/client.py +96 -0
  9. leapocr/config.py +59 -0
  10. leapocr/errors.py +155 -0
  11. leapocr/generated/.openapi-generator/FILES +89 -0
  12. leapocr/generated/.openapi-generator/VERSION +1 -0
  13. leapocr/generated/.openapi-generator-ignore +23 -0
  14. leapocr/generated/leapocr/__init__.py +0 -0
  15. leapocr/generated/leapocr/generated/__init__.py +202 -0
  16. leapocr/generated/leapocr/generated/api/__init__.py +15 -0
  17. leapocr/generated/leapocr/generated/api/analytics_api.py +868 -0
  18. leapocr/generated/leapocr/generated/api/authentication_api.py +172 -0
  19. leapocr/generated/leapocr/generated/api/credits_api.py +653 -0
  20. leapocr/generated/leapocr/generated/api/health_api.py +161 -0
  21. leapocr/generated/leapocr/generated/api/jobs_api.py +836 -0
  22. leapocr/generated/leapocr/generated/api/models_api.py +161 -0
  23. leapocr/generated/leapocr/generated/api/ocr_api.py +863 -0
  24. leapocr/generated/leapocr/generated/api/sdk_api.py +744 -0
  25. leapocr/generated/leapocr/generated/api/templates_api.py +1006 -0
  26. leapocr/generated/leapocr/generated/api/upload_api.py +465 -0
  27. leapocr/generated/leapocr/generated/api/webhooks_api.py +324 -0
  28. leapocr/generated/leapocr/generated/api_client.py +738 -0
  29. leapocr/generated/leapocr/generated/api_response.py +25 -0
  30. leapocr/generated/leapocr/generated/configuration.py +476 -0
  31. leapocr/generated/leapocr/generated/exceptions.py +167 -0
  32. leapocr/generated/leapocr/generated/models/__init__.py +84 -0
  33. leapocr/generated/leapocr/generated/models/analytics_credits_overview.py +82 -0
  34. leapocr/generated/leapocr/generated/models/analytics_credits_timeseries_point.py +82 -0
  35. leapocr/generated/leapocr/generated/models/analytics_credits_usage_response.py +102 -0
  36. leapocr/generated/leapocr/generated/models/analytics_job_overview.py +94 -0
  37. leapocr/generated/leapocr/generated/models/analytics_job_timeseries_point.py +88 -0
  38. leapocr/generated/leapocr/generated/models/analytics_jobs_timeseries_get200_response.py +86 -0
  39. leapocr/generated/leapocr/generated/models/analytics_model_usage_stat.py +78 -0
  40. leapocr/generated/leapocr/generated/models/analytics_overview_response.py +100 -0
  41. leapocr/generated/leapocr/generated/models/analytics_page_overview.py +82 -0
  42. leapocr/generated/leapocr/generated/models/analytics_page_timeseries_point.py +84 -0
  43. leapocr/generated/leapocr/generated/models/analytics_pages_timeseries_get200_response.py +86 -0
  44. leapocr/generated/leapocr/generated/models/analytics_range.py +80 -0
  45. leapocr/generated/leapocr/generated/models/analytics_template_stat.py +82 -0
  46. leapocr/generated/leapocr/generated/models/analytics_top_templates_response.py +86 -0
  47. leapocr/generated/leapocr/generated/models/analytics_webhook_summary.py +82 -0
  48. leapocr/generated/leapocr/generated/models/auth_auth_response.py +81 -0
  49. leapocr/generated/leapocr/generated/models/credits_active_meter_response.py +82 -0
  50. leapocr/generated/leapocr/generated/models/credits_active_subscription_response.py +88 -0
  51. leapocr/generated/leapocr/generated/models/credits_catalog_benefit.py +74 -0
  52. leapocr/generated/leapocr/generated/models/credits_catalog_price.py +76 -0
  53. leapocr/generated/leapocr/generated/models/credits_catalog_product.py +96 -0
  54. leapocr/generated/leapocr/generated/models/credits_credit_balance_response.py +120 -0
  55. leapocr/generated/leapocr/generated/models/credits_credit_transaction_organization_response.py +102 -0
  56. leapocr/generated/leapocr/generated/models/credits_credit_transaction_project_response.py +102 -0
  57. leapocr/generated/leapocr/generated/models/credits_credit_transactions_response_credits_credit_transaction_organization_response.py +86 -0
  58. leapocr/generated/leapocr/generated/models/credits_credit_transactions_response_credits_credit_transaction_project_response.py +86 -0
  59. leapocr/generated/leapocr/generated/models/credits_granted_benefit_response.py +82 -0
  60. leapocr/generated/leapocr/generated/models/credits_pagination_info.py +76 -0
  61. leapocr/generated/leapocr/generated/models/credits_product_catalog.py +89 -0
  62. leapocr/generated/leapocr/generated/models/health_health_check.py +78 -0
  63. leapocr/generated/leapocr/generated/models/health_health_status.py +97 -0
  64. leapocr/generated/leapocr/generated/models/health_health_summary.py +78 -0
  65. leapocr/generated/leapocr/generated/models/jobs_job_list_item.py +100 -0
  66. leapocr/generated/leapocr/generated/models/jobs_job_management_response.py +82 -0
  67. leapocr/generated/leapocr/generated/models/jobs_job_response.py +102 -0
  68. leapocr/generated/leapocr/generated/models/jobs_job_status_response.py +90 -0
  69. leapocr/generated/leapocr/generated/models/jobs_jobs_list_response.py +86 -0
  70. leapocr/generated/leapocr/generated/models/jobs_pagination_info.py +78 -0
  71. leapocr/generated/leapocr/generated/models/jobs_restart_job_request.py +74 -0
  72. leapocr/generated/leapocr/generated/models/jobs_retry_job_request.py +74 -0
  73. leapocr/generated/leapocr/generated/models/jobs_workflow_job_status_info.py +80 -0
  74. leapocr/generated/leapocr/generated/models/jobs_workflow_progress_info.py +80 -0
  75. leapocr/generated/leapocr/generated/models/jobs_workflow_status_info.py +104 -0
  76. leapocr/generated/leapocr/generated/models/models_list_model_response.py +80 -0
  77. leapocr/generated/leapocr/generated/models/models_list_models_list_response.py +82 -0
  78. leapocr/generated/leapocr/generated/models/models_ocr_result_response.py +106 -0
  79. leapocr/generated/leapocr/generated/models/models_ocr_status_response.py +84 -0
  80. leapocr/generated/leapocr/generated/models/models_page_metadata.py +76 -0
  81. leapocr/generated/leapocr/generated/models/models_page_response.py +84 -0
  82. leapocr/generated/leapocr/generated/models/models_pagination_response.py +78 -0
  83. leapocr/generated/leapocr/generated/models/response_error_message.py +72 -0
  84. leapocr/generated/leapocr/generated/models/response_error_response.py +85 -0
  85. leapocr/generated/leapocr/generated/models/status_response.py +88 -0
  86. leapocr/generated/leapocr/generated/models/templates_create_template_request.py +90 -0
  87. leapocr/generated/leapocr/generated/models/templates_list_templates_response.py +88 -0
  88. leapocr/generated/leapocr/generated/models/templates_template_response.py +104 -0
  89. leapocr/generated/leapocr/generated/models/templates_template_stats_response.py +88 -0
  90. leapocr/generated/leapocr/generated/models/templates_update_template_request.py +88 -0
  91. leapocr/generated/leapocr/generated/models/upload_completed_part.py +74 -0
  92. leapocr/generated/leapocr/generated/models/upload_direct_upload_complete_request.py +80 -0
  93. leapocr/generated/leapocr/generated/models/upload_direct_upload_complete_response.py +78 -0
  94. leapocr/generated/leapocr/generated/models/upload_direct_upload_response.py +94 -0
  95. leapocr/generated/leapocr/generated/models/upload_initiate_direct_upload_request.py +96 -0
  96. leapocr/generated/leapocr/generated/models/upload_multipart_part.py +78 -0
  97. leapocr/generated/leapocr/generated/models/upload_remote_url_upload_request.py +94 -0
  98. leapocr/generated/leapocr/generated/models/upload_remote_url_upload_response.py +78 -0
  99. leapocr/generated/leapocr/generated/models/webhooks_r2_upload_notification.py +76 -0
  100. leapocr/generated/leapocr/generated/rest.py +257 -0
  101. leapocr/generated/leapocr/generated_README.md +205 -0
  102. leapocr/models.py +157 -0
  103. leapocr/ocr.py +376 -0
  104. leapocr-0.0.1.dist-info/METADATA +577 -0
  105. leapocr-0.0.1.dist-info/RECORD +106 -0
  106. leapocr-0.0.1.dist-info/WHEEL +4 -0
leapocr/__init__.py ADDED
@@ -0,0 +1,91 @@
1
+ """LeapOCR Python SDK - Transform documents into structured data using AI-powered OCR.
2
+
3
+ Example:
4
+ >>> import asyncio
5
+ >>> from leapocr import LeapOCR, ProcessOptions, Format
6
+ >>>
7
+ >>> async def main():
8
+ ... async with LeapOCR("your-api-key") as client:
9
+ ... result = await client.ocr.process_and_wait(
10
+ ... "document.pdf",
11
+ ... options=ProcessOptions(format=Format.MARKDOWN)
12
+ ... )
13
+ ... print(f"Processed {result.total_pages} pages")
14
+ >>>
15
+ >>> asyncio.run(main())
16
+ """
17
+
18
# Must match the version of the published distribution (the wheel and its
# dist-info are 0.0.1); the previous "0.1.0" misreported the installed release.
__version__ = "0.0.1"
19
+
20
+ # Core client
21
+ from .client import LeapOCR
22
+
23
+ # Configuration
24
+ from .config import ClientConfig
25
+
26
+ # Errors
27
+ from .errors import (
28
+ APIError,
29
+ AuthenticationError,
30
+ FileError,
31
+ InsufficientCreditsError,
32
+ JobError,
33
+ JobFailedError,
34
+ JobTimeoutError,
35
+ LeapOCRError,
36
+ NetworkError,
37
+ RateLimitError,
38
+ ValidationError,
39
+ )
40
+
41
+ # Models and enums
42
+ from .models import (
43
+ BatchResult,
44
+ Format,
45
+ JobResult,
46
+ JobStatus,
47
+ JobStatusType,
48
+ Model,
49
+ ModelInfo,
50
+ PageMetadata,
51
+ PageResult,
52
+ PaginationInfo,
53
+ PollOptions,
54
+ ProcessOptions,
55
+ ProcessResult,
56
+ )
57
+
58
+ __all__ = [
59
+ # Version
60
+ "__version__",
61
+ # Client
62
+ "LeapOCR",
63
+ # Configuration
64
+ "ClientConfig",
65
+ # Models
66
+ "Format",
67
+ "Model",
68
+ "JobStatusType",
69
+ "ProcessOptions",
70
+ "PollOptions",
71
+ "ProcessResult",
72
+ "JobStatus",
73
+ "JobResult",
74
+ "PageResult",
75
+ "PageMetadata",
76
+ "PaginationInfo",
77
+ "ModelInfo",
78
+ "BatchResult",
79
+ # Errors
80
+ "LeapOCRError",
81
+ "AuthenticationError",
82
+ "RateLimitError",
83
+ "ValidationError",
84
+ "FileError",
85
+ "JobError",
86
+ "JobFailedError",
87
+ "JobTimeoutError",
88
+ "NetworkError",
89
+ "APIError",
90
+ "InsufficientCreditsError",
91
+ ]
@@ -0,0 +1 @@
1
+ """Internal utilities for LeapOCR SDK."""
@@ -0,0 +1,130 @@
1
+ """Status polling utilities for long-running OCR jobs."""
2
+
3
+ import asyncio
4
+ from datetime import datetime, timedelta
5
+ from typing import TYPE_CHECKING, Callable
6
+
7
+ from ..errors import JobFailedError, JobTimeoutError
8
+ from ..models import JobStatusType, PollOptions
9
+
10
+ if TYPE_CHECKING:
11
+ from ..models import JobStatus
12
+
13
+
14
async def poll_until_done(
    get_status_fn: Callable[[str], "JobStatus"],
    job_id: str,
    options: PollOptions | None = None,
) -> None:
    """Poll a job's status at a fixed interval until it completes or fails.

    Args:
        get_status_fn: Callable invoked as ``get_status_fn(job_id)``. Its result
            is awaited, so it must return an awaitable resolving to a JobStatus.
        job_id: Job ID to poll.
        options: Polling options; ``max_wait``, ``poll_interval`` and
            ``on_progress`` are read from it. Defaults to ``PollOptions()``.

    Raises:
        JobTimeoutError: If more than ``max_wait`` seconds have elapsed when a
            new poll is about to start.
        JobFailedError: If the job reports the FAILED status.
    """
    # Function-scope imports keep this fix self-contained within the block.
    import logging
    import time

    opts = options or PollOptions()
    # A monotonic clock is immune to wall-clock jumps (DST, NTP corrections,
    # manual changes) that could otherwise shorten or extend the timeout.
    started_at = time.monotonic()

    while True:
        # The deadline is checked before each poll, so the total wait can
        # overshoot max_wait by at most one poll interval plus one status call.
        if time.monotonic() - started_at > opts.max_wait:
            raise JobTimeoutError(
                f"Job {job_id} did not complete within {opts.max_wait} seconds",
                job_id=job_id,
            )

        status = await get_status_fn(job_id)

        if opts.on_progress:
            try:
                opts.on_progress(status)
            except Exception:
                # Best effort: a faulty callback must not abort polling, but
                # the failure is logged so it does not go unnoticed.
                logging.getLogger(__name__).warning(
                    "on_progress callback raised for job %s; continuing to poll",
                    job_id,
                    exc_info=True,
                )

        if status.status == JobStatusType.COMPLETED:
            return

        if status.status == JobStatusType.FAILED:
            error_msg = status.error_message or "Job processing failed"
            raise JobFailedError(error_msg, job_id=job_id, error_details=status.error_message)

        # Any other status is treated as "still running".
        await asyncio.sleep(opts.poll_interval)
65
+
66
+
67
async def poll_with_backoff(
    get_status_fn: Callable[[str], "JobStatus"],
    job_id: str,
    initial_interval: float = 1.0,
    max_interval: float = 30.0,
    backoff_multiplier: float = 1.5,
    max_wait: float = 300.0,
    on_progress: Callable[["JobStatus"], None] | None = None,
) -> None:
    """Poll a job's status with a growing delay until it completes or fails.

    The delay starts at ``initial_interval`` and is multiplied by
    ``backoff_multiplier`` after every poll, capped at ``max_interval``.

    Args:
        get_status_fn: Callable invoked as ``get_status_fn(job_id)``. Its result
            is awaited, so it must return an awaitable resolving to a JobStatus.
        job_id: Job ID to poll.
        initial_interval: Delay after the first poll, in seconds (default: 1.0).
        max_interval: Upper bound applied to every subsequent delay, in seconds
            (default: 30.0). The first delay is not capped.
        backoff_multiplier: Factor applied to the delay after each poll
            (default: 1.5).
        max_wait: Maximum total wait time in seconds (default: 300.0).
        on_progress: Optional callback invoked with each status that is read.

    Raises:
        JobTimeoutError: If more than ``max_wait`` seconds have elapsed when a
            new poll is about to start.
        JobFailedError: If the job reports the FAILED status.
    """
    # Function-scope imports keep this fix self-contained within the block.
    import logging
    import time

    # A monotonic clock is immune to wall-clock jumps (DST, NTP corrections,
    # manual changes) that could otherwise shorten or extend the timeout.
    started_at = time.monotonic()
    current_interval = initial_interval

    while True:
        if time.monotonic() - started_at > max_wait:
            raise JobTimeoutError(
                f"Job {job_id} did not complete within {max_wait} seconds",
                job_id=job_id,
            )

        status = await get_status_fn(job_id)

        if on_progress:
            try:
                on_progress(status)
            except Exception:
                # Best effort: a faulty callback must not abort polling, but
                # the failure is logged so it does not go unnoticed.
                logging.getLogger(__name__).warning(
                    "on_progress callback raised for job %s; continuing to poll",
                    job_id,
                    exc_info=True,
                )

        if status.status == JobStatusType.COMPLETED:
            return

        if status.status == JobStatusType.FAILED:
            error_msg = status.error_message or "Job processing failed"
            raise JobFailedError(error_msg, job_id=job_id, error_details=status.error_message)

        await asyncio.sleep(current_interval)

        # Grow the delay for the next round, never beyond max_interval.
        current_interval = min(current_interval * backoff_multiplier, max_interval)
@@ -0,0 +1,101 @@
1
+ """Retry logic with exponential backoff for LeapOCR SDK."""
2
+
3
+ import asyncio
4
+ from collections.abc import Awaitable
5
+ from typing import Callable, TypeVar
6
+
7
+ from ..errors import LeapOCRError, NetworkError, RateLimitError
8
+
9
+ T = TypeVar("T")
10
+
11
+
12
def is_retryable_error(error: Exception) -> bool:
    """Decide whether a failed operation is worth attempting again.

    Args:
        error: The exception raised by the operation.

    Returns:
        True for rate-limit errors, network errors, SDK errors carrying a 5xx
        status code, and httpx timeout/network/5xx-status errors; False for
        everything else.
    """
    # Throttling and transport failures are always retried.
    if isinstance(error, (RateLimitError, NetworkError)):
        return True

    # Other SDK errors are retried only when they carry a 5xx status code.
    if isinstance(error, LeapOCRError):
        code = error.status_code
        if code and 500 <= code < 600:
            return True

    # httpx is imported lazily; without it no further checks apply.
    try:
        import httpx
    except ImportError:
        return False

    if isinstance(error, (httpx.TimeoutException, httpx.NetworkError)):
        return True
    if isinstance(error, httpx.HTTPStatusError):
        return error.response.status_code >= 500
    return False
46
+
47
+
48
async def with_retry(
    operation: Callable[[], Awaitable[T]],
    max_retries: int = 3,
    retry_delay: float = 1.0,
    retry_multiplier: float = 2.0,
    is_retryable: Callable[[Exception], bool] | None = None,
) -> T:
    """Run an async operation, retrying retryable failures with backoff.

    The operation is attempted up to ``max_retries + 1`` times. Between
    attempts the delay is ``retry_delay * retry_multiplier ** attempt``, unless
    the failure is a RateLimitError with a truthy ``retry_after``, in which
    case that value is used as the delay.

    Args:
        operation: Zero-argument callable returning an awaitable.
        max_retries: Maximum number of retries after the first attempt
            (default: 3). Must not be negative.
        retry_delay: Delay before the first retry, in seconds (default: 1.0).
        retry_multiplier: Backoff factor applied per attempt (default: 2.0).
        is_retryable: Optional predicate deciding whether an error is retried;
            defaults to ``is_retryable_error``.

    Returns:
        The result of the first successful attempt.

    Raises:
        ValueError: If ``max_retries`` is negative.
        Exception: The error from the operation, re-raised unchanged when it is
            not retryable or when the final attempt fails.
    """
    # A negative budget used to skip the operation entirely and end in a
    # misleading RuntimeError; reject it explicitly instead.
    if max_retries < 0:
        raise ValueError(f"max_retries must be non-negative, got {max_retries}")

    is_retryable_fn = is_retryable or is_retryable_error

    for attempt in range(max_retries + 1):
        try:
            return await operation()
        except Exception as error:
            # Last attempt or non-retryable failure: surface it unchanged.
            # The predicate is not consulted on the last attempt.
            if attempt == max_retries or not is_retryable_fn(error):
                raise

            if isinstance(error, RateLimitError) and error.retry_after:
                # Honour the server-provided wait time when one is available.
                delay = float(error.retry_after)
            else:
                delay = retry_delay * (retry_multiplier**attempt)

        await asyncio.sleep(delay)

    # Unreachable: the final loop iteration always returns or raises.
    raise RuntimeError("Retry loop completed without success or error")
@@ -0,0 +1,118 @@
1
+ """File upload utilities for multipart S3 uploads."""
2
+
3
+ from typing import Any, BinaryIO
4
+
5
+ import httpx
6
+
7
+ from ..errors import FileError, NetworkError
8
+ from .validation import get_file_size, guess_content_type
9
+
10
+
11
class MultipartUploader:
    """Upload file parts to presigned URLs and collect the returned ETags.

    The uploader owns a dedicated ``httpx.AsyncClient`` for the uploads. Use it
    as an async context manager, or call ``close()`` when done, so the client
    is released.
    """

    def __init__(self, timeout: float = 300.0):
        """Create the uploader and its HTTP client.

        Args:
            timeout: Timeout for upload requests in seconds (default: 5 minutes).
        """
        # Separate HTTP client for S3 uploads (no auth needed, different domain)
        self._s3_client = httpx.AsyncClient(timeout=timeout)

    async def close(self) -> None:
        """Close the underlying HTTP client."""
        await self._s3_client.aclose()

    async def __aenter__(self) -> "MultipartUploader":
        """Enter the async context and return the uploader itself."""
        return self

    async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Leave the async context and close the HTTP client."""
        await self.close()

    async def upload_multipart(
        self, file: BinaryIO, parts: list[dict[str, Any]]
    ) -> list[dict[str, Any]]:
        """Upload each part with an HTTP PUT and return the part ETags.

        Parts are uploaded sequentially in the order given.

        Args:
            file: File-like object supporting ``seek`` and ``read``.
            parts: Part descriptors with the keys ``part_number``,
                ``upload_url``, ``start_byte`` and ``end_byte`` (inclusive).

        Returns:
            One ``{"part_number": ..., "etag": ...}`` dict per uploaded part,
            in upload order, with surrounding quotes stripped from the ETag.

        Raises:
            FileError: If reading a chunk fails or yields fewer bytes than the
                part's byte range requires.
            NetworkError: If the upload request fails, returns an error status,
                or the response carries no ETag header.
        """
        completed_parts: list[dict[str, Any]] = []

        for part in parts:
            part_number = part["part_number"]
            upload_url = part["upload_url"]
            start_byte = part["start_byte"]
            end_byte = part["end_byte"]

            # end_byte is inclusive, hence the +1.
            chunk_size = end_byte - start_byte + 1

            try:
                file.seek(start_byte)
                chunk_data = file.read(chunk_size)
            except OSError as e:
                # Chain explicitly so the original I/O error stays the cause.
                raise FileError(
                    f"Failed to read file chunk for part {part_number}: {e}",
                    file_path=getattr(file, "name", None),
                ) from e

            # A short read means the file is smaller than the part layout assumes.
            if len(chunk_data) != chunk_size:
                raise FileError(
                    f"Failed to read expected chunk size for part {part_number}: "
                    f"got {len(chunk_data)} bytes, expected {chunk_size} bytes",
                    file_path=getattr(file, "name", None),
                )

            # Upload to S3 via presigned URL (raw PUT, not multipart/form-data)
            try:
                response = await self._s3_client.put(
                    upload_url,
                    content=chunk_data,
                    headers={
                        "Content-Length": str(len(chunk_data)),
                    },
                )
                response.raise_for_status()
            except httpx.HTTPStatusError as e:
                # 403 is reported separately as an expired or invalid URL.
                if e.response.status_code == 403:
                    raise NetworkError(
                        f"Presigned URL expired or invalid for part {part_number}",
                        cause=e,
                    ) from e
                raise NetworkError(
                    f"Failed to upload part {part_number} to S3: HTTP {e.response.status_code}",
                    cause=e,
                ) from e
            except httpx.RequestError as e:
                raise NetworkError(
                    f"Network error uploading part {part_number}: {e}", cause=e
                ) from e

            # S3 returns ETag with quotes like: "9bb58f26192e4ba00f01e2e7b136bbd8"
            etag = response.headers.get("ETag", "").strip('"')
            if not etag:
                raise NetworkError(f"Missing ETag in S3 response for part {part_number}")

            completed_parts.append({"part_number": part_number, "etag": etag})

        return completed_parts
115
+
116
+
117
+ # Re-export utility functions
118
+ __all__ = ["MultipartUploader", "get_file_size", "guess_content_type"]
@@ -0,0 +1,51 @@
1
+ """Common utility functions for LeapOCR SDK."""
2
+
3
+ from datetime import datetime
4
+
5
+
6
+ def parse_datetime(s: str | None) -> datetime:
7
+ """Parse RFC3339 datetime string.
8
+
9
+ Args:
10
+ s: RFC3339 datetime string (e.g., "2023-12-25T10:30:00Z")
11
+
12
+ Returns:
13
+ datetime object (defaults to epoch if parse fails)
14
+ """
15
+ if s is None:
16
+ # Return epoch as fallback
17
+ return datetime.fromtimestamp(0)
18
+
19
+ # Handle 'Z' timezone suffix by converting to +00:00
20
+ # Python's fromisoformat doesn't support 'Z' directly
21
+ if s.endswith("Z"):
22
+ s = s[:-1] + "+00:00"
23
+
24
+ try:
25
+ return datetime.fromisoformat(s)
26
+ except ValueError:
27
+ # Fallback: try without timezone
28
+ try:
29
+ return datetime.fromisoformat(s.split("+")[0].split("Z")[0])
30
+ except ValueError:
31
+ # Return epoch as last resort
32
+ return datetime.fromtimestamp(0)
33
+
34
+
35
def calculate_progress(status_data: dict) -> float:
    """Calculate progress percentage from status data.

    Args:
        status_data: Dictionary with 'processed_pages' and 'total_pages'.
            Missing keys and explicit ``None`` values are treated as 0.

    Returns:
        Progress percentage clamped to the range 0-100; 0.0 when the total
        page count is zero, negative, or unknown.
    """
    # `or 0` also covers keys that are present with a None value (JSON null),
    # which would otherwise raise TypeError in the arithmetic below.
    processed = status_data.get("processed_pages") or 0
    total = status_data.get("total_pages") or 0

    if total <= 0:
        return 0.0

    progress = (processed / total) * 100.0
    return min(100.0, max(0.0, progress))
@@ -0,0 +1,172 @@
1
+ """Input validation utilities for LeapOCR SDK."""
2
+
3
+ import os
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+ from typing import BinaryIO, Union
7
+
8
+ from ..errors import FileError
9
+
10
# Largest accepted upload, in bytes (100 MB).
MAX_FILE_SIZE = 100 * 1024**2

# Longest accepted instructions text, in characters.
MAX_INSTRUCTIONS_LENGTH = 10_000

# Lower-case file suffixes (including the dot) accepted for upload.
SUPPORTED_EXTENSIONS = {".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".tif"}
25
+
26
+
27
+ @dataclass
28
+ class ValidationResult:
29
+ """Result of file validation."""
30
+
31
+ valid: bool
32
+ error: str | None = None
33
+ warnings: list[str] | None = None
34
+
35
+
36
def validate_file(
    file_path: Union[str, Path],
    max_size: int = MAX_FILE_SIZE,
    allowed_types: set[str] | None = None,
) -> ValidationResult:
    """Check that a local file is suitable for upload.

    The checks run in this order: existence, regular file, readability, size
    (non-empty and at most ``max_size``), then file extension. The first
    failing check determines the reported error.

    Args:
        file_path: Path to the file to validate.
        max_size: Maximum file size in bytes (default: MAX_FILE_SIZE).
        allowed_types: Allowed lower-case suffixes including the dot; None or
            an empty set falls back to SUPPORTED_EXTENSIONS.

    Returns:
        A ValidationResult. On success it is valid, with a warning attached
        when the file is larger than 50 MB; on failure it carries the error.
    """

    def _reject(error: str) -> ValidationResult:
        # Every failure has the same shape: invalid, with a message.
        return ValidationResult(valid=False, error=error)

    path = Path(file_path)
    allowed = allowed_types if allowed_types else SUPPORTED_EXTENSIONS

    if not path.exists():
        return _reject(f"File not found: {path}")

    if not path.is_file():
        return _reject(f"Not a file: {path}")

    if not os.access(path, os.R_OK):
        return _reject(f"File not readable: {path}")

    try:
        file_size = path.stat().st_size
    except OSError as e:
        return _reject(f"Cannot stat file: {e}")

    if file_size == 0:
        return _reject("File is empty")

    if file_size > max_size:
        return _reject(
            f"File size ({file_size:,} bytes) exceeds maximum ({max_size:,} bytes)"
        )

    ext = path.suffix.lower()
    if ext not in allowed:
        return _reject(
            f"Unsupported file type: {ext}. Supported types: {', '.join(sorted(allowed))}"
        )

    # Files above 50 MB pass validation but are flagged with a warning.
    if file_size > 50 * 1024 * 1024:
        return ValidationResult(
            valid=True,
            warnings=[f"Large file ({file_size:,} bytes) will use multipart upload"],
        )

    return ValidationResult(valid=True, warnings=None)
95
+
96
+
97
def get_file_size(file: Union[str, Path, BinaryIO]) -> int:
    """Get file size in bytes.

    Args:
        file: File path, or a file-like object supporting ``seek``/``tell``.

    Returns:
        File size in bytes. For file-like objects the current stream position
        is left where it was.

    Raises:
        FileError: If the size cannot be determined, either because of an OS
            error or because the object is neither a path nor seekable.
    """
    if isinstance(file, (str, Path)):
        try:
            return Path(file).stat().st_size
        except OSError as e:
            # Chain explicitly so the original OS error stays the cause.
            raise FileError(f"Cannot determine file size: {e}", file_path=str(file)) from e

    if hasattr(file, "seek") and hasattr(file, "tell"):
        try:
            current_pos = file.tell()
            try:
                file.seek(0, os.SEEK_END)
                return file.tell()
            finally:
                # Restore the caller's position even if measuring failed midway.
                file.seek(current_pos)
        except OSError as e:
            raise FileError(f"Cannot determine file size: {e}") from e

    raise FileError("Cannot determine file size - unsupported file type")
130
+
131
+
132
def guess_content_type(filename: str) -> str:
    """Map a filename's extension to a MIME type.

    Args:
        filename: Filename (or path) whose suffix is inspected,
            case-insensitively.

    Returns:
        The MIME type for PDF, PNG, JPEG and TIFF suffixes, or
        "application/octet-stream" for anything else.
    """
    mime_by_suffix = {
        ".pdf": "application/pdf",
        ".png": "image/png",
        # Both spellings of each image suffix share one MIME type.
        **dict.fromkeys((".jpg", ".jpeg"), "image/jpeg"),
        **dict.fromkeys((".tiff", ".tif"), "image/tiff"),
    }
    suffix = Path(filename).suffix.lower()
    try:
        return mime_by_suffix[suffix]
    except KeyError:
        return "application/octet-stream"
151
+
152
+
153
def validate_instructions(instructions: str) -> ValidationResult:
    """Check that processing instructions are within the length limit.

    Args:
        instructions: Instructions text to validate; empty input is accepted.

    Returns:
        A valid ValidationResult, unless the text is longer than
        MAX_INSTRUCTIONS_LENGTH characters, in which case the result is
        invalid and carries an error message.
    """
    # Only non-empty text can exceed the limit; everything else is valid.
    if instructions and len(instructions) > MAX_INSTRUCTIONS_LENGTH:
        return ValidationResult(
            valid=False,
            error=f"Instructions too long ({len(instructions)} characters). "
            f"Maximum allowed is {MAX_INSTRUCTIONS_LENGTH} characters.",
        )

    return ValidationResult(valid=True)