sarvamai 0.1.22a4__py3-none-any.whl → 0.1.22a7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sarvamai/__init__.py +62 -3
- sarvamai/client.py +3 -0
- sarvamai/core/client_wrapper.py +2 -2
- sarvamai/doc_digitization_job/__init__.py +4 -0
- sarvamai/doc_digitization_job/client.py +776 -0
- sarvamai/doc_digitization_job/job.py +496 -0
- sarvamai/doc_digitization_job/raw_client.py +1176 -0
- sarvamai/requests/__init__.py +20 -0
- sarvamai/requests/audio_data.py +0 -6
- sarvamai/requests/configure_connection.py +4 -0
- sarvamai/requests/configure_connection_data.py +40 -11
- sarvamai/requests/doc_digitization_create_job_response.py +25 -0
- sarvamai/requests/doc_digitization_download_files_response.py +37 -0
- sarvamai/requests/doc_digitization_error_details.py +21 -0
- sarvamai/requests/doc_digitization_error_message.py +11 -0
- sarvamai/requests/doc_digitization_job_detail.py +64 -0
- sarvamai/requests/doc_digitization_job_parameters.py +21 -0
- sarvamai/requests/doc_digitization_job_status_response.py +65 -0
- sarvamai/requests/doc_digitization_page_error.py +24 -0
- sarvamai/requests/doc_digitization_upload_files_response.py +34 -0
- sarvamai/requests/doc_digitization_webhook_callback.py +19 -0
- sarvamai/requests/speech_to_text_job_parameters.py +43 -2
- sarvamai/requests/speech_to_text_translate_job_parameters.py +4 -1
- sarvamai/speech_to_text/client.py +95 -10
- sarvamai/speech_to_text/raw_client.py +95 -10
- sarvamai/speech_to_text_job/client.py +60 -15
- sarvamai/speech_to_text_streaming/__init__.py +4 -0
- sarvamai/speech_to_text_streaming/client.py +102 -18
- sarvamai/speech_to_text_streaming/raw_client.py +102 -18
- sarvamai/speech_to_text_streaming/types/__init__.py +4 -0
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_input_audio_codec.py +1 -27
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_mode.py +7 -0
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_model.py +5 -0
- sarvamai/speech_to_text_translate_streaming/client.py +20 -12
- sarvamai/speech_to_text_translate_streaming/raw_client.py +20 -12
- sarvamai/speech_to_text_translate_streaming/types/speech_to_text_translate_streaming_input_audio_codec.py +1 -27
- sarvamai/text/client.py +0 -12
- sarvamai/text/raw_client.py +0 -12
- sarvamai/text_to_speech/client.py +116 -14
- sarvamai/text_to_speech/raw_client.py +116 -14
- sarvamai/text_to_speech_streaming/__init__.py +2 -2
- sarvamai/text_to_speech_streaming/client.py +19 -6
- sarvamai/text_to_speech_streaming/raw_client.py +19 -6
- sarvamai/text_to_speech_streaming/types/__init__.py +2 -1
- sarvamai/text_to_speech_streaming/types/text_to_speech_streaming_model.py +5 -0
- sarvamai/types/__init__.py +34 -2
- sarvamai/types/audio_data.py +0 -6
- sarvamai/types/configure_connection.py +4 -0
- sarvamai/types/configure_connection_data.py +40 -11
- sarvamai/types/configure_connection_data_model.py +5 -0
- sarvamai/types/configure_connection_data_speaker.py +35 -1
- sarvamai/types/doc_digitization_create_job_response.py +37 -0
- sarvamai/types/doc_digitization_download_files_response.py +47 -0
- sarvamai/types/doc_digitization_error_code.py +15 -0
- sarvamai/types/doc_digitization_error_details.py +33 -0
- sarvamai/types/doc_digitization_error_message.py +23 -0
- sarvamai/types/doc_digitization_job_detail.py +74 -0
- sarvamai/types/doc_digitization_job_detail_state.py +7 -0
- sarvamai/types/doc_digitization_job_parameters.py +33 -0
- sarvamai/types/doc_digitization_job_state.py +7 -0
- sarvamai/types/doc_digitization_job_status_response.py +75 -0
- sarvamai/types/doc_digitization_output_format.py +5 -0
- sarvamai/types/doc_digitization_page_error.py +36 -0
- sarvamai/types/doc_digitization_supported_language.py +32 -0
- sarvamai/types/doc_digitization_upload_files_response.py +44 -0
- sarvamai/types/doc_digitization_webhook_callback.py +31 -0
- sarvamai/types/mode.py +5 -0
- sarvamai/types/speech_to_text_job_parameters.py +43 -2
- sarvamai/types/speech_to_text_model.py +1 -1
- sarvamai/types/speech_to_text_translate_job_parameters.py +4 -1
- sarvamai/types/text_to_speech_model.py +1 -1
- sarvamai/types/text_to_speech_speaker.py +35 -1
- {sarvamai-0.1.22a4.dist-info → sarvamai-0.1.22a7.dist-info}/METADATA +1 -1
- {sarvamai-0.1.22a4.dist-info → sarvamai-0.1.22a7.dist-info}/RECORD +75 -42
- sarvamai/types/audio_data_input_audio_codec.py +0 -33
- {sarvamai-0.1.22a4.dist-info → sarvamai-0.1.22a7.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,496 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Document Digitization Job helper classes.
|
|
3
|
+
|
|
4
|
+
This module provides high-level abstractions for managing document digitization jobs,
|
|
5
|
+
including file uploads, status polling, and output downloads.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import asyncio
|
|
9
|
+
import mimetypes
|
|
10
|
+
import os
|
|
11
|
+
import time
|
|
12
|
+
import typing
|
|
13
|
+
|
|
14
|
+
import httpx
|
|
15
|
+
from http import HTTPStatus
|
|
16
|
+
|
|
17
|
+
from ..types.doc_digitization_job_status_response import (
|
|
18
|
+
DocDigitizationJobStatusResponse,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
if typing.TYPE_CHECKING:
|
|
22
|
+
from .client import AsyncDocDigitizationJobClient, DocDigitizationJobClient
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class AsyncDocDigitizationJob:
    """
    Async helper class for managing a Document Digitization job.

    Provides high-level methods for uploading files, starting the job,
    polling for completion, and downloading outputs.
    """

    def __init__(self, job_id: str, client: "AsyncDocDigitizationJobClient"):
        """
        Initialize the asynchronous document digitization job.

        Parameters
        ----------
        job_id : str
            The unique identifier for the job.
        client : AsyncDocDigitizationJobClient
            The async client instance to use for API calls.
        """
        self._job_id = job_id
        self._client = client

    @property
    def job_id(self) -> str:
        """Returns the job ID associated with this job instance."""
        return self._job_id

    async def upload_file(self, file_path: str, timeout: float = 120.0) -> bool:
        """
        Upload the input PDF or ZIP file for the document digitization job.

        Parameters
        ----------
        file_path : str
            Path to the PDF or ZIP file to upload.
        timeout : float, default=120.0
            Timeout in seconds for the upload request.

        Returns
        -------
        bool
            True if upload was successful.

        Raises
        ------
        ValueError
            If the file is not a PDF or ZIP file, or no upload URL is returned.
        FileNotFoundError
            If the file does not exist.
        httpx.HTTPStatusError
            If the upload request fails.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        ext = os.path.splitext(file_path)[1].lower()
        if ext not in [".pdf", ".zip"]:
            raise ValueError(f"Only PDF or ZIP files are supported, got: {ext}")

        file_name = os.path.basename(file_path)

        # Ask the API for a pre-signed upload URL for this file.
        upload_response = await self._client.get_upload_links(
            job_id=self._job_id,
            files=[file_name],
        )

        if not upload_response.upload_urls:
            raise ValueError("No upload URL returned from API")

        # Prefer the entry keyed by the file name we requested; fall back to
        # the first entry for responses keyed differently.
        upload_detail = upload_response.upload_urls.get(
            file_name, next(iter(upload_response.upload_urls.values()))
        )
        upload_url = upload_detail.file_url

        # Determine content type from the file extension; default to a generic
        # binary type when it cannot be guessed.
        content_type, _ = mimetypes.guess_type(file_path)
        if content_type is None:
            content_type = "application/octet-stream"

        # Upload the file to the pre-signed URL.
        async with httpx.AsyncClient(timeout=timeout) as http_client:
            with open(file_path, "rb") as f:
                file_content = f.read()

            response = await http_client.put(
                upload_url,
                content=file_content,
                headers={
                    "Content-Type": content_type,
                    # Azure Blob Storage requires this header on PUT uploads.
                    "x-ms-blob-type": "BlockBlob",
                },
            )
            response.raise_for_status()

        return True

    async def start(self) -> "DocDigitizationJobStatusResponse":
        """
        Start the document digitization job processing.

        Returns
        -------
        DocDigitizationJobStatusResponse
            The job status after starting.
        """
        return await self._client.start(job_id=self._job_id)

    async def get_status(self) -> "DocDigitizationJobStatusResponse":
        """
        Retrieve the current status of the job.

        Returns
        -------
        DocDigitizationJobStatusResponse
            The current job status.
        """
        return await self._client.get_status(self._job_id)

    async def wait_until_complete(
        self, poll_interval: float = 5, timeout: float = 1800
    ) -> "DocDigitizationJobStatusResponse":
        """
        Polls job status until it completes, partially completes, or fails.

        Parameters
        ----------
        poll_interval : float, default=5
            Seconds between status checks.
        timeout : float, default=1800
            Maximum seconds to wait before raising TimeoutError.

        Returns
        -------
        DocDigitizationJobStatusResponse
            The final job status.

        Raises
        ------
        TimeoutError
            If the job does not complete within the timeout period.
        """
        start_time = time.time()
        # States are compared case-insensitively against the API's job_state.
        terminal_states = {"completed", "partiallycompleted", "failed"}

        while True:
            status = await self.get_status()

            if status.job_state and status.job_state.lower() in terminal_states:
                return status

            elapsed = time.time() - start_time
            if elapsed >= timeout:
                raise TimeoutError(
                    f"Job {self._job_id} did not complete within {timeout} seconds. "
                    f"Current state: {status.job_state}"
                )

            await asyncio.sleep(poll_interval)

    async def download_output(self, output_path: str, timeout: float = 120.0) -> bool:
        """
        Download the output file (HTML or Markdown) to the specified path.

        Parameters
        ----------
        output_path : str
            Path where the output file should be saved.
        timeout : float, default=120.0
            Timeout in seconds for the download request.

        Returns
        -------
        bool
            True if download was successful.

        Raises
        ------
        ValueError
            If no download URL is available.
        httpx.HTTPStatusError
            If the download request fails.
        """
        download_response = await self._client.get_download_links(job_id=self._job_id)

        if not download_response.download_urls:
            raise ValueError("No download URL returned from API")

        # Only the first output entry is downloaded by this helper.
        download_detail = next(iter(download_response.download_urls.values()))
        download_url = download_detail.file_url

        async with httpx.AsyncClient(timeout=timeout) as http_client:
            response = await http_client.get(download_url)
            response.raise_for_status()

            # Ensure output directory exists before writing.
            os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)

            with open(output_path, "wb") as f:
                f.write(response.content)

        return True

    async def get_page_metrics(self) -> typing.Optional[typing.Dict[str, int]]:
        """
        Get page processing metrics from the job status.

        Returns
        -------
        typing.Optional[typing.Dict[str, int]]
            Dictionary with keys: total_pages, pages_processed, pages_succeeded, pages_failed.
            Returns None if job_detail is not available.
        """
        status = await self.get_status()
        if status.job_details and len(status.job_details) > 0:
            detail = status.job_details[0]
            # Missing/None counters are normalized to 0.
            return {
                "total_pages": detail.total_pages or 0,
                "pages_processed": detail.pages_processed or 0,
                "pages_succeeded": detail.pages_succeeded or 0,
                "pages_failed": detail.pages_failed or 0,
            }
        return None

    async def get_page_errors(self) -> typing.Optional[typing.List[typing.Any]]:
        """
        Get any page-level errors from the job.

        Returns
        -------
        typing.Optional[typing.List[typing.Any]]
            List of page errors, or None if not available.
        """
        status = await self.get_status()
        if status.job_details and len(status.job_details) > 0:
            return status.job_details[0].page_errors
        return None
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
class DocDigitizationJob:
    """
    Sync helper class for managing a Document Digitization job.

    Provides high-level methods for uploading files, starting the job,
    polling for completion, and downloading outputs.
    """

    def __init__(self, job_id: str, client: "DocDigitizationJobClient"):
        """
        Initialize the document digitization job.

        Parameters
        ----------
        job_id : str
            The unique identifier for the job.
        client : DocDigitizationJobClient
            The client instance to use for API calls.
        """
        self._job_id = job_id
        self._client = client

    @property
    def job_id(self) -> str:
        """Returns the job ID associated with this job instance."""
        return self._job_id

    def upload_file(self, file_path: str, timeout: float = 120.0) -> bool:
        """
        Upload the input PDF or ZIP file for the document digitization job.

        Parameters
        ----------
        file_path : str
            Path to the PDF or ZIP file to upload.
        timeout : float, default=120.0
            Timeout in seconds for the upload request.

        Returns
        -------
        bool
            True if upload was successful.

        Raises
        ------
        ValueError
            If the file is not a PDF or ZIP file, or no upload URL is returned.
        FileNotFoundError
            If the file does not exist.
        httpx.HTTPStatusError
            If the upload request fails.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        ext = os.path.splitext(file_path)[1].lower()
        if ext not in [".pdf", ".zip"]:
            raise ValueError(f"Only PDF or ZIP files are supported, got: {ext}")

        file_name = os.path.basename(file_path)

        # Ask the API for a pre-signed upload URL for this file.
        upload_response = self._client.get_upload_links(
            job_id=self._job_id,
            files=[file_name],
        )

        if not upload_response.upload_urls:
            raise ValueError("No upload URL returned from API")

        # Prefer the entry keyed by the file name we requested; fall back to
        # the first entry for responses keyed differently.
        upload_detail = upload_response.upload_urls.get(
            file_name, next(iter(upload_response.upload_urls.values()))
        )
        upload_url = upload_detail.file_url

        # Determine content type from the file extension; default to a generic
        # binary type when it cannot be guessed.
        content_type, _ = mimetypes.guess_type(file_path)
        if content_type is None:
            content_type = "application/octet-stream"

        # Upload the file to the pre-signed URL.
        with httpx.Client(timeout=timeout) as http_client:
            with open(file_path, "rb") as f:
                file_content = f.read()

            response = http_client.put(
                upload_url,
                content=file_content,
                headers={
                    "Content-Type": content_type,
                    # Azure Blob Storage requires this header on PUT uploads.
                    "x-ms-blob-type": "BlockBlob",
                },
            )
            response.raise_for_status()

        return True

    def start(self) -> "DocDigitizationJobStatusResponse":
        """
        Start the document digitization job processing.

        Returns
        -------
        DocDigitizationJobStatusResponse
            The job status after starting.
        """
        return self._client.start(job_id=self._job_id)

    def get_status(self) -> "DocDigitizationJobStatusResponse":
        """
        Retrieve the current status of the job.

        Returns
        -------
        DocDigitizationJobStatusResponse
            The current job status.
        """
        return self._client.get_status(self._job_id)

    def wait_until_complete(
        self, poll_interval: float = 5, timeout: float = 1800
    ) -> "DocDigitizationJobStatusResponse":
        """
        Polls job status until it completes, partially completes, or fails.

        Parameters
        ----------
        poll_interval : float, default=5
            Seconds between status checks.
        timeout : float, default=1800
            Maximum seconds to wait before raising TimeoutError.

        Returns
        -------
        DocDigitizationJobStatusResponse
            The final job status.

        Raises
        ------
        TimeoutError
            If the job does not complete within the timeout period.
        """
        start_time = time.time()
        # States are compared case-insensitively against the API's job_state.
        terminal_states = {"completed", "partiallycompleted", "failed"}

        while True:
            status = self.get_status()

            if status.job_state and status.job_state.lower() in terminal_states:
                return status

            elapsed = time.time() - start_time
            if elapsed >= timeout:
                raise TimeoutError(
                    f"Job {self._job_id} did not complete within {timeout} seconds. "
                    f"Current state: {status.job_state}"
                )

            time.sleep(poll_interval)

    def download_output(self, output_path: str, timeout: float = 120.0) -> bool:
        """
        Download the output file (HTML or Markdown) to the specified path.

        Parameters
        ----------
        output_path : str
            Path where the output file should be saved.
        timeout : float, default=120.0
            Timeout in seconds for the download request.

        Returns
        -------
        bool
            True if download was successful.

        Raises
        ------
        ValueError
            If no download URL is available.
        httpx.HTTPStatusError
            If the download request fails.
        """
        download_response = self._client.get_download_links(job_id=self._job_id)

        if not download_response.download_urls:
            raise ValueError("No download URL returned from API")

        # Only the first output entry is downloaded by this helper.
        download_detail = next(iter(download_response.download_urls.values()))
        download_url = download_detail.file_url

        with httpx.Client(timeout=timeout) as http_client:
            response = http_client.get(download_url)
            response.raise_for_status()

            # Ensure output directory exists before writing.
            os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)

            with open(output_path, "wb") as f:
                f.write(response.content)

        return True

    def get_page_metrics(self) -> typing.Optional[typing.Dict[str, int]]:
        """
        Get page processing metrics from the job status.

        Returns
        -------
        typing.Optional[typing.Dict[str, int]]
            Dictionary with keys: total_pages, pages_processed, pages_succeeded, pages_failed.
            Returns None if job_detail is not available.
        """
        status = self.get_status()
        if status.job_details and len(status.job_details) > 0:
            detail = status.job_details[0]
            # Missing/None counters are normalized to 0.
            return {
                "total_pages": detail.total_pages or 0,
                "pages_processed": detail.pages_processed or 0,
                "pages_succeeded": detail.pages_succeeded or 0,
                "pages_failed": detail.pages_failed or 0,
            }
        return None

    def get_page_errors(self) -> typing.Optional[typing.List[typing.Any]]:
        """
        Get any page-level errors from the job.

        Returns
        -------
        typing.Optional[typing.List[typing.Any]]
            List of page errors, or None if not available.
        """
        status = self.get_status()
        if status.job_details and len(status.job_details) > 0:
            return status.job_details[0].page_errors
        return None
|