leapocr 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- leapocr/__init__.py +91 -0
- leapocr/_internal/__init__.py +1 -0
- leapocr/_internal/polling.py +130 -0
- leapocr/_internal/retry.py +101 -0
- leapocr/_internal/upload.py +118 -0
- leapocr/_internal/utils.py +51 -0
- leapocr/_internal/validation.py +172 -0
- leapocr/client.py +96 -0
- leapocr/config.py +59 -0
- leapocr/errors.py +155 -0
- leapocr/generated/.openapi-generator/FILES +89 -0
- leapocr/generated/.openapi-generator/VERSION +1 -0
- leapocr/generated/.openapi-generator-ignore +23 -0
- leapocr/generated/leapocr/__init__.py +0 -0
- leapocr/generated/leapocr/generated/__init__.py +202 -0
- leapocr/generated/leapocr/generated/api/__init__.py +15 -0
- leapocr/generated/leapocr/generated/api/analytics_api.py +868 -0
- leapocr/generated/leapocr/generated/api/authentication_api.py +172 -0
- leapocr/generated/leapocr/generated/api/credits_api.py +653 -0
- leapocr/generated/leapocr/generated/api/health_api.py +161 -0
- leapocr/generated/leapocr/generated/api/jobs_api.py +836 -0
- leapocr/generated/leapocr/generated/api/models_api.py +161 -0
- leapocr/generated/leapocr/generated/api/ocr_api.py +863 -0
- leapocr/generated/leapocr/generated/api/sdk_api.py +744 -0
- leapocr/generated/leapocr/generated/api/templates_api.py +1006 -0
- leapocr/generated/leapocr/generated/api/upload_api.py +465 -0
- leapocr/generated/leapocr/generated/api/webhooks_api.py +324 -0
- leapocr/generated/leapocr/generated/api_client.py +738 -0
- leapocr/generated/leapocr/generated/api_response.py +25 -0
- leapocr/generated/leapocr/generated/configuration.py +476 -0
- leapocr/generated/leapocr/generated/exceptions.py +167 -0
- leapocr/generated/leapocr/generated/models/__init__.py +84 -0
- leapocr/generated/leapocr/generated/models/analytics_credits_overview.py +82 -0
- leapocr/generated/leapocr/generated/models/analytics_credits_timeseries_point.py +82 -0
- leapocr/generated/leapocr/generated/models/analytics_credits_usage_response.py +102 -0
- leapocr/generated/leapocr/generated/models/analytics_job_overview.py +94 -0
- leapocr/generated/leapocr/generated/models/analytics_job_timeseries_point.py +88 -0
- leapocr/generated/leapocr/generated/models/analytics_jobs_timeseries_get200_response.py +86 -0
- leapocr/generated/leapocr/generated/models/analytics_model_usage_stat.py +78 -0
- leapocr/generated/leapocr/generated/models/analytics_overview_response.py +100 -0
- leapocr/generated/leapocr/generated/models/analytics_page_overview.py +82 -0
- leapocr/generated/leapocr/generated/models/analytics_page_timeseries_point.py +84 -0
- leapocr/generated/leapocr/generated/models/analytics_pages_timeseries_get200_response.py +86 -0
- leapocr/generated/leapocr/generated/models/analytics_range.py +80 -0
- leapocr/generated/leapocr/generated/models/analytics_template_stat.py +82 -0
- leapocr/generated/leapocr/generated/models/analytics_top_templates_response.py +86 -0
- leapocr/generated/leapocr/generated/models/analytics_webhook_summary.py +82 -0
- leapocr/generated/leapocr/generated/models/auth_auth_response.py +81 -0
- leapocr/generated/leapocr/generated/models/credits_active_meter_response.py +82 -0
- leapocr/generated/leapocr/generated/models/credits_active_subscription_response.py +88 -0
- leapocr/generated/leapocr/generated/models/credits_catalog_benefit.py +74 -0
- leapocr/generated/leapocr/generated/models/credits_catalog_price.py +76 -0
- leapocr/generated/leapocr/generated/models/credits_catalog_product.py +96 -0
- leapocr/generated/leapocr/generated/models/credits_credit_balance_response.py +120 -0
- leapocr/generated/leapocr/generated/models/credits_credit_transaction_organization_response.py +102 -0
- leapocr/generated/leapocr/generated/models/credits_credit_transaction_project_response.py +102 -0
- leapocr/generated/leapocr/generated/models/credits_credit_transactions_response_credits_credit_transaction_organization_response.py +86 -0
- leapocr/generated/leapocr/generated/models/credits_credit_transactions_response_credits_credit_transaction_project_response.py +86 -0
- leapocr/generated/leapocr/generated/models/credits_granted_benefit_response.py +82 -0
- leapocr/generated/leapocr/generated/models/credits_pagination_info.py +76 -0
- leapocr/generated/leapocr/generated/models/credits_product_catalog.py +89 -0
- leapocr/generated/leapocr/generated/models/health_health_check.py +78 -0
- leapocr/generated/leapocr/generated/models/health_health_status.py +97 -0
- leapocr/generated/leapocr/generated/models/health_health_summary.py +78 -0
- leapocr/generated/leapocr/generated/models/jobs_job_list_item.py +100 -0
- leapocr/generated/leapocr/generated/models/jobs_job_management_response.py +82 -0
- leapocr/generated/leapocr/generated/models/jobs_job_response.py +102 -0
- leapocr/generated/leapocr/generated/models/jobs_job_status_response.py +90 -0
- leapocr/generated/leapocr/generated/models/jobs_jobs_list_response.py +86 -0
- leapocr/generated/leapocr/generated/models/jobs_pagination_info.py +78 -0
- leapocr/generated/leapocr/generated/models/jobs_restart_job_request.py +74 -0
- leapocr/generated/leapocr/generated/models/jobs_retry_job_request.py +74 -0
- leapocr/generated/leapocr/generated/models/jobs_workflow_job_status_info.py +80 -0
- leapocr/generated/leapocr/generated/models/jobs_workflow_progress_info.py +80 -0
- leapocr/generated/leapocr/generated/models/jobs_workflow_status_info.py +104 -0
- leapocr/generated/leapocr/generated/models/models_list_model_response.py +80 -0
- leapocr/generated/leapocr/generated/models/models_list_models_list_response.py +82 -0
- leapocr/generated/leapocr/generated/models/models_ocr_result_response.py +106 -0
- leapocr/generated/leapocr/generated/models/models_ocr_status_response.py +84 -0
- leapocr/generated/leapocr/generated/models/models_page_metadata.py +76 -0
- leapocr/generated/leapocr/generated/models/models_page_response.py +84 -0
- leapocr/generated/leapocr/generated/models/models_pagination_response.py +78 -0
- leapocr/generated/leapocr/generated/models/response_error_message.py +72 -0
- leapocr/generated/leapocr/generated/models/response_error_response.py +85 -0
- leapocr/generated/leapocr/generated/models/status_response.py +88 -0
- leapocr/generated/leapocr/generated/models/templates_create_template_request.py +90 -0
- leapocr/generated/leapocr/generated/models/templates_list_templates_response.py +88 -0
- leapocr/generated/leapocr/generated/models/templates_template_response.py +104 -0
- leapocr/generated/leapocr/generated/models/templates_template_stats_response.py +88 -0
- leapocr/generated/leapocr/generated/models/templates_update_template_request.py +88 -0
- leapocr/generated/leapocr/generated/models/upload_completed_part.py +74 -0
- leapocr/generated/leapocr/generated/models/upload_direct_upload_complete_request.py +80 -0
- leapocr/generated/leapocr/generated/models/upload_direct_upload_complete_response.py +78 -0
- leapocr/generated/leapocr/generated/models/upload_direct_upload_response.py +94 -0
- leapocr/generated/leapocr/generated/models/upload_initiate_direct_upload_request.py +96 -0
- leapocr/generated/leapocr/generated/models/upload_multipart_part.py +78 -0
- leapocr/generated/leapocr/generated/models/upload_remote_url_upload_request.py +94 -0
- leapocr/generated/leapocr/generated/models/upload_remote_url_upload_response.py +78 -0
- leapocr/generated/leapocr/generated/models/webhooks_r2_upload_notification.py +76 -0
- leapocr/generated/leapocr/generated/rest.py +257 -0
- leapocr/generated/leapocr/generated_README.md +205 -0
- leapocr/models.py +157 -0
- leapocr/ocr.py +376 -0
- leapocr-0.0.1.dist-info/METADATA +577 -0
- leapocr-0.0.1.dist-info/RECORD +106 -0
- leapocr-0.0.1.dist-info/WHEEL +4 -0
leapocr/__init__.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""LeapOCR Python SDK - Transform documents into structured data using AI-powered OCR.
|
|
2
|
+
|
|
3
|
+
Example:
|
|
4
|
+
>>> import asyncio
|
|
5
|
+
>>> from leapocr import LeapOCR, ProcessOptions, Format
|
|
6
|
+
>>>
|
|
7
|
+
>>> async def main():
|
|
8
|
+
... async with LeapOCR("your-api-key") as client:
|
|
9
|
+
... result = await client.ocr.process_and_wait(
|
|
10
|
+
... "document.pdf",
|
|
11
|
+
... options=ProcessOptions(format=Format.MARKDOWN)
|
|
12
|
+
... )
|
|
13
|
+
... print(f"Processed {result.total_pages} pages")
|
|
14
|
+
>>>
|
|
15
|
+
>>> asyncio.run(main())
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
# Must match the version of the published distribution; this wheel and its
# dist-info directory are both 0.0.1.
__version__ = "0.0.1"
|
|
19
|
+
|
|
20
|
+
# Core client
|
|
21
|
+
from .client import LeapOCR
|
|
22
|
+
|
|
23
|
+
# Configuration
|
|
24
|
+
from .config import ClientConfig
|
|
25
|
+
|
|
26
|
+
# Errors
|
|
27
|
+
from .errors import (
|
|
28
|
+
APIError,
|
|
29
|
+
AuthenticationError,
|
|
30
|
+
FileError,
|
|
31
|
+
InsufficientCreditsError,
|
|
32
|
+
JobError,
|
|
33
|
+
JobFailedError,
|
|
34
|
+
JobTimeoutError,
|
|
35
|
+
LeapOCRError,
|
|
36
|
+
NetworkError,
|
|
37
|
+
RateLimitError,
|
|
38
|
+
ValidationError,
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
# Models and enums
|
|
42
|
+
from .models import (
|
|
43
|
+
BatchResult,
|
|
44
|
+
Format,
|
|
45
|
+
JobResult,
|
|
46
|
+
JobStatus,
|
|
47
|
+
JobStatusType,
|
|
48
|
+
Model,
|
|
49
|
+
ModelInfo,
|
|
50
|
+
PageMetadata,
|
|
51
|
+
PageResult,
|
|
52
|
+
PaginationInfo,
|
|
53
|
+
PollOptions,
|
|
54
|
+
ProcessOptions,
|
|
55
|
+
ProcessResult,
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
__all__ = [
|
|
59
|
+
# Version
|
|
60
|
+
"__version__",
|
|
61
|
+
# Client
|
|
62
|
+
"LeapOCR",
|
|
63
|
+
# Configuration
|
|
64
|
+
"ClientConfig",
|
|
65
|
+
# Models
|
|
66
|
+
"Format",
|
|
67
|
+
"Model",
|
|
68
|
+
"JobStatusType",
|
|
69
|
+
"ProcessOptions",
|
|
70
|
+
"PollOptions",
|
|
71
|
+
"ProcessResult",
|
|
72
|
+
"JobStatus",
|
|
73
|
+
"JobResult",
|
|
74
|
+
"PageResult",
|
|
75
|
+
"PageMetadata",
|
|
76
|
+
"PaginationInfo",
|
|
77
|
+
"ModelInfo",
|
|
78
|
+
"BatchResult",
|
|
79
|
+
# Errors
|
|
80
|
+
"LeapOCRError",
|
|
81
|
+
"AuthenticationError",
|
|
82
|
+
"RateLimitError",
|
|
83
|
+
"ValidationError",
|
|
84
|
+
"FileError",
|
|
85
|
+
"JobError",
|
|
86
|
+
"JobFailedError",
|
|
87
|
+
"JobTimeoutError",
|
|
88
|
+
"NetworkError",
|
|
89
|
+
"APIError",
|
|
90
|
+
"InsufficientCreditsError",
|
|
91
|
+
]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Internal utilities for LeapOCR SDK."""
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
"""Status polling utilities for long-running OCR jobs."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
from datetime import datetime, timedelta
|
|
5
|
+
from typing import TYPE_CHECKING, Callable
|
|
6
|
+
|
|
7
|
+
from ..errors import JobFailedError, JobTimeoutError
|
|
8
|
+
from ..models import JobStatusType, PollOptions
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from ..models import JobStatus
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
async def poll_until_done(
    get_status_fn: Callable[[str], "JobStatus"],
    job_id: str,
    options: PollOptions | None = None,
) -> None:
    """Poll job status until completion or failure.

    Args:
        get_status_fn: Async function to get job status (takes job_id, its
            awaited result is a JobStatus)
        job_id: Job ID to poll
        options: Polling options (interval, timeout, progress callback);
            a default ``PollOptions()`` is used when omitted

    Raises:
        JobTimeoutError: If job doesn't complete within ``options.max_wait``
        JobFailedError: If job processing fails
    """
    opts = options or PollOptions()

    # Use the event loop's monotonic clock rather than wall-clock time so that
    # DST transitions or system clock adjustments cannot trigger a spurious
    # timeout or stretch the wait.
    now = asyncio.get_running_loop().time
    started_at = now()

    while True:
        # Check timeout before issuing another status request
        if now() - started_at > opts.max_wait:
            raise JobTimeoutError(
                f"Job {job_id} did not complete within {opts.max_wait} seconds",
                job_id=job_id,
            )

        # Get current status
        status = await get_status_fn(job_id)

        # Call progress callback if provided
        if opts.on_progress:
            try:
                opts.on_progress(status)
            except Exception:
                # Deliberate best-effort: a failing user callback must not
                # stop polling.
                pass

        # Check if job is complete
        if status.status == JobStatusType.COMPLETED:
            return

        # Check if job failed
        if status.status == JobStatusType.FAILED:
            error_msg = status.error_message or "Job processing failed"
            raise JobFailedError(error_msg, job_id=job_id, error_details=status.error_message)

        # Wait before next poll
        await asyncio.sleep(opts.poll_interval)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
async def poll_with_backoff(
    get_status_fn: Callable[[str], "JobStatus"],
    job_id: str,
    initial_interval: float = 1.0,
    max_interval: float = 30.0,
    backoff_multiplier: float = 1.5,
    max_wait: float = 300.0,
    on_progress: Callable[["JobStatus"], None] | None = None,
) -> None:
    """Poll job status with exponential backoff.

    Starts with short intervals and gradually increases delay to reduce
    API load for long-running jobs.

    Args:
        get_status_fn: Async function to get job status (takes job_id)
        job_id: Job ID to poll
        initial_interval: Starting poll interval in seconds (default: 1.0)
        max_interval: Maximum poll interval in seconds (default: 30.0)
        backoff_multiplier: Interval multiplier after each poll (default: 1.5)
        max_wait: Maximum total wait time in seconds (default: 300.0)
        on_progress: Optional callback for progress updates

    Raises:
        JobTimeoutError: If job doesn't complete within max_wait
        JobFailedError: If job processing fails
    """
    # Use the event loop's monotonic clock rather than wall-clock time so that
    # DST transitions or system clock adjustments cannot trigger a spurious
    # timeout or stretch the wait.
    now = asyncio.get_running_loop().time
    started_at = now()
    current_interval = initial_interval

    while True:
        # Check timeout before issuing another status request
        if now() - started_at > max_wait:
            raise JobTimeoutError(
                f"Job {job_id} did not complete within {max_wait} seconds",
                job_id=job_id,
            )

        # Get current status
        status = await get_status_fn(job_id)

        # Call progress callback if provided
        if on_progress:
            try:
                on_progress(status)
            except Exception:
                # Deliberate best-effort: a failing user callback must not
                # stop polling.
                pass

        # Check if job is complete
        if status.status == JobStatusType.COMPLETED:
            return

        # Check if job failed
        if status.status == JobStatusType.FAILED:
            error_msg = status.error_message or "Job processing failed"
            raise JobFailedError(error_msg, job_id=job_id, error_details=status.error_message)

        # Wait with current interval
        await asyncio.sleep(current_interval)

        # Increase interval for next iteration, capped at max_interval
        current_interval = min(current_interval * backoff_multiplier, max_interval)
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
"""Retry logic with exponential backoff for LeapOCR SDK."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
from collections.abc import Awaitable
|
|
5
|
+
from typing import Callable, TypeVar
|
|
6
|
+
|
|
7
|
+
from ..errors import LeapOCRError, NetworkError, RateLimitError
|
|
8
|
+
|
|
9
|
+
T = TypeVar("T")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def is_retryable_error(error: Exception) -> bool:
    """Decide whether a failed operation is worth retrying.

    Args:
        error: The exception raised by the operation

    Returns:
        True for rate-limit errors, network errors, SDK errors carrying a
        5xx status code, and httpx timeout/network/5xx errors; False for
        everything else
    """
    # Rate limiting and network failures are always retryable
    if isinstance(error, (RateLimitError, NetworkError)):
        return True

    # Any other SDK error is retryable only when it carries a 5xx status
    if isinstance(error, LeapOCRError):
        code = error.status_code
        if code and 500 <= code < 600:
            return True

    # Raw httpx errors can only be recognised when httpx is importable
    try:
        import httpx
    except ImportError:
        return False

    if isinstance(error, (httpx.TimeoutException, httpx.NetworkError)):
        return True
    if isinstance(error, httpx.HTTPStatusError):
        return error.response.status_code >= 500
    return False
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
async def with_retry(
    operation: Callable[[], Awaitable[T]],
    max_retries: int = 3,
    retry_delay: float = 1.0,
    retry_multiplier: float = 2.0,
    is_retryable: Callable[[Exception], bool] | None = None,
) -> T:
    """Execute an async operation with exponential backoff retry.

    The operation is attempted once and then retried up to ``max_retries``
    more times while it keeps failing with a retryable error.

    Args:
        operation: Async function to execute
        max_retries: Maximum number of retry attempts (default: 3)
        retry_delay: Initial delay between retries in seconds (default: 1.0)
        retry_multiplier: Multiplier for exponential backoff (default: 2.0)
        is_retryable: Optional function to determine if error is retryable
            (default: ``is_retryable_error``)

    Returns:
        Result from the operation

    Raises:
        ValueError: If ``max_retries`` is negative (previously the operation
            was silently never run and a generic RuntimeError was raised)
        Exception: The last exception if all retries are exhausted, or the
            first non-retryable exception
    """
    if max_retries < 0:
        raise ValueError(f"max_retries must be >= 0, got {max_retries}")

    is_retryable_fn = is_retryable or is_retryable_error
    attempt = 0

    while True:
        try:
            return await operation()
        except Exception as error:
            # Give up on the final attempt (without consulting the predicate)
            # or as soon as the error is not retryable.
            if attempt >= max_retries or not is_retryable_fn(error):
                raise

            if isinstance(error, RateLimitError) and error.retry_after:
                # Use server-provided retry-after if available
                delay = float(error.retry_after)
            else:
                # Exponential backoff: delay * (multiplier ^ attempt)
                delay = retry_delay * (retry_multiplier**attempt)

        # Wait before the next attempt
        await asyncio.sleep(delay)
        attempt += 1
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""File upload utilities for multipart S3 uploads."""
|
|
2
|
+
|
|
3
|
+
from typing import Any, BinaryIO
|
|
4
|
+
|
|
5
|
+
import httpx
|
|
6
|
+
|
|
7
|
+
from ..errors import FileError, NetworkError
|
|
8
|
+
from .validation import get_file_size, guess_content_type
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class MultipartUploader:
    """Handle multipart file uploads to S3 via presigned URLs.

    This class manages the upload of file parts to S3 using presigned URLs
    returned from the LeapOCR API. It owns an ``httpx.AsyncClient`` that is
    released by ``close()`` or by leaving the ``async with`` block.
    """

    def __init__(self, timeout: float = 300.0):
        """Initialize the uploader.

        Args:
            timeout: Timeout for upload requests in seconds (default: 5 minutes)
        """
        # Separate HTTP client for S3 uploads (no auth needed, different domain)
        self._s3_client = httpx.AsyncClient(timeout=timeout)

    async def close(self) -> None:
        """Close the S3 HTTP client."""
        await self._s3_client.aclose()

    async def __aenter__(self) -> "MultipartUploader":
        """Context manager entry."""
        return self

    async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Context manager exit."""
        await self.close()

    async def upload_multipart(
        self, file: BinaryIO, parts: list[dict[str, Any]]
    ) -> list[dict[str, Any]]:
        """Upload file parts to S3 presigned URLs and return ETags.

        Parts are uploaded one after another in the order given.

        Args:
            file: File-like object (must support seek/read)
            parts: List of part dicts with part_number, start_byte, end_byte, upload_url

        Returns:
            List of dicts with part_number and etag for completion request

        Raises:
            FileError: If file reading fails
            NetworkError: If upload fails
        """
        completed_parts: list[dict[str, Any]] = []

        for part in parts:
            # Pull out every field first so a malformed part fails before any
            # file or network I/O is attempted for it.
            part_number = part["part_number"]
            upload_url = part["upload_url"]
            start_byte = part["start_byte"]
            end_byte = part["end_byte"]

            chunk_data = self._read_part(file, part_number, start_byte, end_byte)
            etag = await self._put_part(upload_url, part_number, chunk_data)
            completed_parts.append({"part_number": part_number, "etag": etag})

        return completed_parts

    @staticmethod
    def _read_part(file: BinaryIO, part_number: Any, start_byte: int, end_byte: int) -> bytes:
        """Read the bytes ``start_byte..end_byte`` (inclusive) of one part from ``file``."""
        # Calculate chunk size (end_byte is inclusive)
        chunk_size = end_byte - start_byte + 1

        try:
            file.seek(start_byte)
            chunk_data = file.read(chunk_size)
        except OSError as e:
            raise FileError(
                f"Failed to read file chunk for part {part_number}: {e}",
                file_path=getattr(file, "name", None),
            ) from e

        # A short read means the file does not cover the requested range
        if len(chunk_data) != chunk_size:
            raise FileError(
                f"Failed to read expected chunk size for part {part_number}: "
                f"got {len(chunk_data)} bytes, expected {chunk_size} bytes",
                file_path=getattr(file, "name", None),
            )

        return chunk_data

    async def _put_part(self, upload_url: str, part_number: Any, chunk_data: bytes) -> str:
        """PUT one part to its presigned URL and return the unquoted ETag."""
        # Upload to S3 via presigned URL (raw PUT, not multipart/form-data)
        try:
            response = await self._s3_client.put(
                upload_url,
                content=chunk_data,
                headers={
                    "Content-Length": str(len(chunk_data)),
                },
            )
            response.raise_for_status()
        except httpx.HTTPStatusError as e:
            if e.response.status_code == 403:
                raise NetworkError(
                    f"Presigned URL expired or invalid for part {part_number}",
                    cause=e,
                ) from e
            raise NetworkError(
                f"Failed to upload part {part_number} to S3: HTTP {e.response.status_code}",
                cause=e,
            ) from e
        except httpx.RequestError as e:
            raise NetworkError(f"Network error uploading part {part_number}: {e}", cause=e) from e

        # Extract ETag from response headers
        # S3 returns ETag with quotes like: "9bb58f26192e4ba00f01e2e7b136bbd8"
        etag = response.headers.get("ETag", "").strip('"')
        if not etag:
            raise NetworkError(f"Missing ETag in S3 response for part {part_number}")

        return etag
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
# Re-export utility functions
|
|
118
|
+
__all__ = ["MultipartUploader", "get_file_size", "guess_content_type"]
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""Common utility functions for LeapOCR SDK."""
|
|
2
|
+
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def parse_datetime(s: str | None) -> datetime:
|
|
7
|
+
"""Parse RFC3339 datetime string.
|
|
8
|
+
|
|
9
|
+
Args:
|
|
10
|
+
s: RFC3339 datetime string (e.g., "2023-12-25T10:30:00Z")
|
|
11
|
+
|
|
12
|
+
Returns:
|
|
13
|
+
datetime object (defaults to epoch if parse fails)
|
|
14
|
+
"""
|
|
15
|
+
if s is None:
|
|
16
|
+
# Return epoch as fallback
|
|
17
|
+
return datetime.fromtimestamp(0)
|
|
18
|
+
|
|
19
|
+
# Handle 'Z' timezone suffix by converting to +00:00
|
|
20
|
+
# Python's fromisoformat doesn't support 'Z' directly
|
|
21
|
+
if s.endswith("Z"):
|
|
22
|
+
s = s[:-1] + "+00:00"
|
|
23
|
+
|
|
24
|
+
try:
|
|
25
|
+
return datetime.fromisoformat(s)
|
|
26
|
+
except ValueError:
|
|
27
|
+
# Fallback: try without timezone
|
|
28
|
+
try:
|
|
29
|
+
return datetime.fromisoformat(s.split("+")[0].split("Z")[0])
|
|
30
|
+
except ValueError:
|
|
31
|
+
# Return epoch as last resort
|
|
32
|
+
return datetime.fromtimestamp(0)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def calculate_progress(status_data: dict) -> float:
    """Calculate progress percentage from status data.

    Args:
        status_data: Dictionary with 'processed_pages' and 'total_pages'.
            Missing keys and keys whose value is None are treated as 0.

    Returns:
        Progress percentage clamped to the range 0-100; 0.0 when the total
        page count is zero, negative or unknown
    """
    # ``or 0`` also covers keys that are present with a None value, which
    # ``dict.get(key, 0)`` alone would pass through and crash on comparison.
    processed = status_data.get("processed_pages") or 0
    total = status_data.get("total_pages") or 0

    if total <= 0:
        return 0.0

    progress = (processed / total) * 100.0
    return min(100.0, max(0.0, progress))
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
"""Input validation utilities for LeapOCR SDK."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import BinaryIO, Union
|
|
7
|
+
|
|
8
|
+
from ..errors import FileError
|
|
9
|
+
|
|
10
|
+
# Upload size ceiling in bytes (100MB)
MAX_FILE_SIZE = 100 * 1024**2

# Longest accepted instructions text, measured with len()
MAX_INSTRUCTIONS_LENGTH = 10_000

# File extensions accepted for upload (lower-case, including the leading dot)
SUPPORTED_EXTENSIONS = {".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".tif"}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
|
|
28
|
+
class ValidationResult:
|
|
29
|
+
"""Result of file validation."""
|
|
30
|
+
|
|
31
|
+
valid: bool
|
|
32
|
+
error: str | None = None
|
|
33
|
+
warnings: list[str] | None = None
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def validate_file(
    file_path: Union[str, Path],
    max_size: int = MAX_FILE_SIZE,
    allowed_types: set[str] | None = None,
) -> ValidationResult:
    """Check that a file on disk is acceptable for upload.

    The checks run in this order and stop at the first failure: existence,
    regular file, readability, non-empty, size limit, extension.

    Args:
        file_path: Path to the file to validate
        max_size: Maximum file size in bytes (default: MAX_FILE_SIZE)
        allowed_types: Set of allowed file extensions; SUPPORTED_EXTENSIONS
            is used when this is None or empty

    Returns:
        ValidationResult whose ``error`` describes the first failed check, or
        a valid result that may carry a large-file warning
    """

    def rejected(reason: str) -> ValidationResult:
        # Shorthand for a failed result carrying only an error message
        return ValidationResult(valid=False, error=reason)

    target = Path(file_path)
    permitted = allowed_types if allowed_types else SUPPORTED_EXTENSIONS

    # Existence, file-ness and readability
    if not target.exists():
        return rejected(f"File not found: {target}")
    if not target.is_file():
        return rejected(f"Not a file: {target}")
    if not os.access(target, os.R_OK):
        return rejected(f"File not readable: {target}")

    # Size limits
    try:
        size_bytes = target.stat().st_size
    except OSError as e:
        return rejected(f"Cannot stat file: {e}")

    if size_bytes == 0:
        return rejected("File is empty")
    if size_bytes > max_size:
        return rejected(
            f"File size ({size_bytes:,} bytes) exceeds maximum ({max_size:,} bytes)"
        )

    # Extension, compared case-insensitively
    suffix = target.suffix.lower()
    if suffix not in permitted:
        return rejected(
            f"Unsupported file type: {suffix}. Supported types: {', '.join(sorted(permitted))}"
        )

    # Files above 50MB are accepted but flagged (they will use multipart upload)
    notes: list[str] = []
    if size_bytes > 50 * 1024 * 1024:
        notes.append(f"Large file ({size_bytes:,} bytes) will use multipart upload")

    return ValidationResult(valid=True, warnings=notes or None)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def get_file_size(file: Union[str, Path, BinaryIO]) -> int:
    """Get file size in bytes.

    For file-like objects the size is measured by seeking to the end; the
    stream position is put back where it was before returning.

    Args:
        file: File path or file-like object

    Returns:
        File size in bytes

    Raises:
        FileError: If file size cannot be determined
    """
    if isinstance(file, (str, Path)):
        try:
            return Path(file).stat().st_size
        except OSError as e:
            raise FileError(f"Cannot determine file size: {e}", file_path=str(file)) from e

    # File-like object
    if hasattr(file, "seek") and hasattr(file, "tell"):
        try:
            current_pos = file.tell()
            try:
                file.seek(0, os.SEEK_END)
                return file.tell()
            finally:
                # Restore the caller's position even if measuring failed
                file.seek(current_pos)
        except (OSError, ValueError) as e:
            # ValueError is what closed streams raise on seek/tell
            raise FileError(f"Cannot determine file size: {e}") from e

    raise FileError("Cannot determine file size - unsupported file type")
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def guess_content_type(filename: str) -> str:
    """Map a filename's extension to a MIME type.

    The extension is matched case-insensitively.

    Args:
        filename: Filename to analyze

    Returns:
        MIME type string; "application/octet-stream" for any extension that
        is not PDF, PNG, JPEG or TIFF
    """
    suffix = Path(filename).suffix.lower()

    if suffix == ".pdf":
        return "application/pdf"
    if suffix == ".png":
        return "image/png"
    if suffix in (".jpg", ".jpeg"):
        return "image/jpeg"
    if suffix in (".tiff", ".tif"):
        return "image/tiff"

    return "application/octet-stream"
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def validate_instructions(instructions: str) -> ValidationResult:
    """Check processing instructions against the length limit.

    Args:
        instructions: Instructions text to validate; empty or missing
            instructions are accepted

    Returns:
        ValidationResult that is invalid only when the text is longer than
        MAX_INSTRUCTIONS_LENGTH
    """
    too_long = bool(instructions) and len(instructions) > MAX_INSTRUCTIONS_LENGTH
    if not too_long:
        return ValidationResult(valid=True)

    message = (
        f"Instructions too long ({len(instructions)} characters). "
        f"Maximum allowed is {MAX_INSTRUCTIONS_LENGTH} characters."
    )
    return ValidationResult(valid=False, error=message)
|