sarvamai 0.1.23a7__py3-none-any.whl → 0.1.23a8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sarvamai/core/client_wrapper.py +2 -2
- sarvamai/doc_digitization_job/client.py +223 -17
- sarvamai/doc_digitization_job/job.py +496 -0
- {sarvamai-0.1.23a7.dist-info → sarvamai-0.1.23a8.dist-info}/METADATA +1 -1
- {sarvamai-0.1.23a7.dist-info → sarvamai-0.1.23a8.dist-info}/RECORD +6 -5
- {sarvamai-0.1.23a7.dist-info → sarvamai-0.1.23a8.dist-info}/WHEEL +0 -0
sarvamai/core/client_wrapper.py
CHANGED
|
@@ -23,10 +23,10 @@ class BaseClientWrapper:
|
|
|
23
23
|
|
|
24
24
|
def get_headers(self) -> typing.Dict[str, str]:
|
|
25
25
|
headers: typing.Dict[str, str] = {
|
|
26
|
-
"User-Agent": "sarvamai/0.1.
|
|
26
|
+
"User-Agent": "sarvamai/0.1.23a8",
|
|
27
27
|
"X-Fern-Language": "Python",
|
|
28
28
|
"X-Fern-SDK-Name": "sarvamai",
|
|
29
|
-
"X-Fern-SDK-Version": "0.1.
|
|
29
|
+
"X-Fern-SDK-Version": "0.1.23a8",
|
|
30
30
|
**(self.get_custom_headers() or {}),
|
|
31
31
|
}
|
|
32
32
|
headers["api-subscription-key"] = self.api_subscription_key
|
|
@@ -1,16 +1,32 @@
|
|
|
1
1
|
# This file was auto-generated by Fern from our API Definition.
|
|
2
|
+
# This file was auto-generated by Fern from our API Definition.
|
|
2
3
|
|
|
3
4
|
import typing
|
|
4
5
|
|
|
5
6
|
from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
|
|
6
7
|
from ..core.request_options import RequestOptions
|
|
7
|
-
from ..requests.doc_digitization_job_parameters import
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
from ..
|
|
11
|
-
|
|
12
|
-
|
|
8
|
+
from ..requests.doc_digitization_job_parameters import (
|
|
9
|
+
DocDigitizationJobParametersParams,
|
|
10
|
+
)
|
|
11
|
+
from ..requests.doc_digitization_webhook_callback import (
|
|
12
|
+
DocDigitizationWebhookCallbackParams,
|
|
13
|
+
)
|
|
14
|
+
from ..types.doc_digitization_create_job_response import (
|
|
15
|
+
DocDigitizationCreateJobResponse,
|
|
16
|
+
)
|
|
17
|
+
from ..types.doc_digitization_download_files_response import (
|
|
18
|
+
DocDigitizationDownloadFilesResponse,
|
|
19
|
+
)
|
|
20
|
+
from ..types.doc_digitization_job_status_response import (
|
|
21
|
+
DocDigitizationJobStatusResponse,
|
|
22
|
+
)
|
|
23
|
+
from ..types.doc_digitization_upload_files_response import (
|
|
24
|
+
DocDigitizationUploadFilesResponse,
|
|
25
|
+
)
|
|
26
|
+
from ..types.doc_digitization_supported_language import DocDigitizationSupportedLanguage
|
|
27
|
+
from ..types.doc_digitization_output_format import DocDigitizationOutputFormat
|
|
13
28
|
from .raw_client import AsyncRawDocDigitizationJobClient, RawDocDigitizationJobClient
|
|
29
|
+
from .job import DocDigitizationJob, AsyncDocDigitizationJob
|
|
14
30
|
|
|
15
31
|
# this is used as the default value for optional parameters
|
|
16
32
|
OMIT = typing.cast(typing.Any, ...)
|
|
@@ -99,12 +115,18 @@ class DocDigitizationJobClient:
|
|
|
99
115
|
client.doc_digitization_job.initialise()
|
|
100
116
|
"""
|
|
101
117
|
_response = self._raw_client.initialise(
|
|
102
|
-
job_parameters=job_parameters,
|
|
118
|
+
job_parameters=job_parameters,
|
|
119
|
+
callback=callback,
|
|
120
|
+
request_options=request_options,
|
|
103
121
|
)
|
|
104
122
|
return _response.data
|
|
105
123
|
|
|
106
124
|
def get_upload_links(
|
|
107
|
-
self,
|
|
125
|
+
self,
|
|
126
|
+
*,
|
|
127
|
+
job_id: str,
|
|
128
|
+
files: typing.Sequence[str],
|
|
129
|
+
request_options: typing.Optional[RequestOptions] = None,
|
|
108
130
|
) -> DocDigitizationUploadFilesResponse:
|
|
109
131
|
"""
|
|
110
132
|
Returns presigned URLs for uploading input files.
|
|
@@ -142,7 +164,9 @@ class DocDigitizationJobClient:
|
|
|
142
164
|
files=["files"],
|
|
143
165
|
)
|
|
144
166
|
"""
|
|
145
|
-
_response = self._raw_client.get_upload_links(
|
|
167
|
+
_response = self._raw_client.get_upload_links(
|
|
168
|
+
job_id=job_id, files=files, request_options=request_options
|
|
169
|
+
)
|
|
146
170
|
return _response.data
|
|
147
171
|
|
|
148
172
|
def start(
|
|
@@ -269,13 +293,97 @@ class DocDigitizationJobClient:
|
|
|
269
293
|
job_id="job_id",
|
|
270
294
|
)
|
|
271
295
|
"""
|
|
272
|
-
_response = self._raw_client.get_download_links(
|
|
296
|
+
_response = self._raw_client.get_download_links(
|
|
297
|
+
job_id, request_options=request_options
|
|
298
|
+
)
|
|
273
299
|
return _response.data
|
|
274
300
|
|
|
301
|
+
def create_job(
    self,
    language: DocDigitizationSupportedLanguage = "hi",
    output_format: DocDigitizationOutputFormat = "html",
    callback: typing.Optional[DocDigitizationWebhookCallbackParams] = OMIT,
    request_options: typing.Optional[RequestOptions] = None,
) -> DocDigitizationJob:
    """
    Initialise a Document Digitization job and return a handle bound to it.

    The job parameters are assembled from ``language`` and ``output_format``,
    the job is created through ``initialise``, and the returned job ID is
    wrapped in a ``DocDigitizationJob`` that shares this client.

    Parameters
    ----------
    language : DocDigitizationSupportedLanguage, default="hi"
        ISO language code for the document.

    output_format : DocDigitizationOutputFormat, default="html"
        Output format: "html" for structured HTML or "md" for Markdown.

    callback : typing.Optional[DocDigitizationWebhookCallbackParams], default=OMIT
        Optional webhook configuration for job completion notification.
        Forwarded unchanged to ``initialise``.

    request_options : typing.Optional[RequestOptions], default=None
        Request-specific configuration. Forwarded unchanged to ``initialise``.

    Returns
    -------
    DocDigitizationJob
        Handle for the job that was just created.

    Examples
    --------
    from sarvamai import SarvamAI

    client = SarvamAI(api_subscription_key="YOUR_API_SUBSCRIPTION_KEY")

    # Create the job, then drive it through the returned handle
    job = client.doc_digitization_job.create_job(language="hi", output_format="md")
    job.upload_file("/path/to/document.pdf")
    job.start()
    job.wait_until_complete()
    job.download_output("./output.md")
    """
    job_parameters = DocDigitizationJobParametersParams(
        language=language,
        output_format=output_format,
    )
    created = self.initialise(
        job_parameters=job_parameters,
        callback=callback,
        request_options=request_options,
    )
    return DocDigitizationJob(job_id=created.job_id, client=self)
|
|
354
|
+
|
|
355
|
+
def get_job(self, job_id: str) -> DocDigitizationJob:
    """
    Build a handle for a Document Digitization job that already exists.

    This method itself only pairs ``job_id`` with this client inside a
    ``DocDigitizationJob``.

    Parameters
    ----------
    job_id : str
        The job ID of a previously created Document Digitization job.

    Returns
    -------
    DocDigitizationJob
        Handle that can be used to check status, upload files, or download results.

    Examples
    --------
    from sarvamai import SarvamAI

    client = SarvamAI(api_subscription_key="YOUR_API_SUBSCRIPTION_KEY")

    # Re-attach to an existing job
    job = client.doc_digitization_job.get_job(job_id="your-job-uuid")
    status = job.get_status()
    """
    handle = DocDigitizationJob(job_id=job_id, client=self)
    return handle
|
|
380
|
+
|
|
275
381
|
|
|
276
382
|
class AsyncDocDigitizationJobClient:
|
|
277
383
|
def __init__(self, *, client_wrapper: AsyncClientWrapper):
|
|
278
|
-
self._raw_client = AsyncRawDocDigitizationJobClient(
|
|
384
|
+
self._raw_client = AsyncRawDocDigitizationJobClient(
|
|
385
|
+
client_wrapper=client_wrapper
|
|
386
|
+
)
|
|
279
387
|
|
|
280
388
|
@property
|
|
281
389
|
def with_raw_response(self) -> AsyncRawDocDigitizationJobClient:
|
|
@@ -364,12 +472,18 @@ class AsyncDocDigitizationJobClient:
|
|
|
364
472
|
asyncio.run(main())
|
|
365
473
|
"""
|
|
366
474
|
_response = await self._raw_client.initialise(
|
|
367
|
-
job_parameters=job_parameters,
|
|
475
|
+
job_parameters=job_parameters,
|
|
476
|
+
callback=callback,
|
|
477
|
+
request_options=request_options,
|
|
368
478
|
)
|
|
369
479
|
return _response.data
|
|
370
480
|
|
|
371
481
|
async def get_upload_links(
|
|
372
|
-
self,
|
|
482
|
+
self,
|
|
483
|
+
*,
|
|
484
|
+
job_id: str,
|
|
485
|
+
files: typing.Sequence[str],
|
|
486
|
+
request_options: typing.Optional[RequestOptions] = None,
|
|
373
487
|
) -> DocDigitizationUploadFilesResponse:
|
|
374
488
|
"""
|
|
375
489
|
Returns presigned URLs for uploading input files.
|
|
@@ -415,7 +529,9 @@ class AsyncDocDigitizationJobClient:
|
|
|
415
529
|
|
|
416
530
|
asyncio.run(main())
|
|
417
531
|
"""
|
|
418
|
-
_response = await self._raw_client.get_upload_links(
|
|
532
|
+
_response = await self._raw_client.get_upload_links(
|
|
533
|
+
job_id=job_id, files=files, request_options=request_options
|
|
534
|
+
)
|
|
419
535
|
return _response.data
|
|
420
536
|
|
|
421
537
|
async def start(
|
|
@@ -469,7 +585,9 @@ class AsyncDocDigitizationJobClient:
|
|
|
469
585
|
|
|
470
586
|
asyncio.run(main())
|
|
471
587
|
"""
|
|
472
|
-
_response = await self._raw_client.start(
|
|
588
|
+
_response = await self._raw_client.start(
|
|
589
|
+
job_id, request_options=request_options
|
|
590
|
+
)
|
|
473
591
|
return _response.data
|
|
474
592
|
|
|
475
593
|
async def get_status(
|
|
@@ -521,7 +639,9 @@ class AsyncDocDigitizationJobClient:
|
|
|
521
639
|
|
|
522
640
|
asyncio.run(main())
|
|
523
641
|
"""
|
|
524
|
-
_response = await self._raw_client.get_status(
|
|
642
|
+
_response = await self._raw_client.get_status(
|
|
643
|
+
job_id, request_options=request_options
|
|
644
|
+
)
|
|
525
645
|
return _response.data
|
|
526
646
|
|
|
527
647
|
async def get_download_links(
|
|
@@ -566,5 +686,91 @@ class AsyncDocDigitizationJobClient:
|
|
|
566
686
|
|
|
567
687
|
asyncio.run(main())
|
|
568
688
|
"""
|
|
569
|
-
_response = await self._raw_client.get_download_links(
|
|
689
|
+
_response = await self._raw_client.get_download_links(
|
|
690
|
+
job_id, request_options=request_options
|
|
691
|
+
)
|
|
570
692
|
return _response.data
|
|
693
|
+
|
|
694
|
+
async def create_job(
    self,
    language: DocDigitizationSupportedLanguage = "hi",
    output_format: DocDigitizationOutputFormat = "html",
    callback: typing.Optional[DocDigitizationWebhookCallbackParams] = OMIT,
    request_options: typing.Optional[RequestOptions] = None,
) -> AsyncDocDigitizationJob:
    """
    Initialise a Document Digitization job and return an async handle bound to it.

    The job parameters are assembled from ``language`` and ``output_format``,
    the job is created by awaiting ``initialise``, and the returned job ID is
    wrapped in an ``AsyncDocDigitizationJob`` that shares this client.

    Parameters
    ----------
    language : DocDigitizationSupportedLanguage, default="hi"
        ISO language code for the document.

    output_format : DocDigitizationOutputFormat, default="html"
        Output format: "html" for structured HTML or "md" for Markdown.

    callback : typing.Optional[DocDigitizationWebhookCallbackParams], default=OMIT
        Optional webhook configuration for job completion notification.
        Forwarded unchanged to ``initialise``.

    request_options : typing.Optional[RequestOptions], default=None
        Request-specific configuration. Forwarded unchanged to ``initialise``.

    Returns
    -------
    AsyncDocDigitizationJob
        Handle for the job that was just created.

    Examples
    --------
    import asyncio
    from sarvamai import AsyncSarvamAI

    client = AsyncSarvamAI(api_subscription_key="YOUR_API_SUBSCRIPTION_KEY")

    async def main():
        job = await client.doc_digitization_job.create_job(language="hi", output_format="md")
        await job.upload_file("/path/to/document.pdf")
        await job.start()
        await job.wait_until_complete()
        await job.download_output("./output.md")

    asyncio.run(main())
    """
    job_parameters = DocDigitizationJobParametersParams(
        language=language,
        output_format=output_format,
    )
    created = await self.initialise(
        job_parameters=job_parameters,
        callback=callback,
        request_options=request_options,
    )
    return AsyncDocDigitizationJob(job_id=created.job_id, client=self)
|
|
748
|
+
|
|
749
|
+
def get_job(self, job_id: str) -> AsyncDocDigitizationJob:
    """
    Build an async handle for a Document Digitization job that already exists.

    This is a plain (non-coroutine) method: it only pairs ``job_id`` with this
    client inside an ``AsyncDocDigitizationJob``.

    Parameters
    ----------
    job_id : str
        The job ID of a previously created Document Digitization job.

    Returns
    -------
    AsyncDocDigitizationJob
        Handle that can be used to check status, upload files, or download results.

    Examples
    --------
    import asyncio
    from sarvamai import AsyncSarvamAI

    client = AsyncSarvamAI(api_subscription_key="YOUR_API_SUBSCRIPTION_KEY")

    async def main():
        # get_job is synchronous; only the handle's methods are awaited
        job = client.doc_digitization_job.get_job(job_id="your-job-uuid")
        status = await job.get_status()

    asyncio.run(main())
    """
    handle = AsyncDocDigitizationJob(job_id=job_id, client=self)
    return handle
|
|
@@ -0,0 +1,496 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Document Digitization Job helper classes.
|
|
3
|
+
|
|
4
|
+
This module provides high-level abstractions for managing document digitization jobs,
|
|
5
|
+
including file uploads, status polling, and output downloads.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import asyncio
|
|
9
|
+
import mimetypes
|
|
10
|
+
import os
|
|
11
|
+
import time
|
|
12
|
+
import typing
|
|
13
|
+
|
|
14
|
+
import httpx
|
|
15
|
+
from http import HTTPStatus
|
|
16
|
+
|
|
17
|
+
from ..types.doc_digitization_job_status_response import (
|
|
18
|
+
DocDigitizationJobStatusResponse,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
if typing.TYPE_CHECKING:
|
|
22
|
+
from .client import AsyncDocDigitizationJobClient, DocDigitizationJobClient
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class AsyncDocDigitizationJob:
    """
    Async helper class for managing a Document Digitization job.

    Pairs a job ID with an ``AsyncDocDigitizationJobClient`` and provides
    coroutines for uploading the input file, starting the job, polling for
    completion, and downloading the output.
    """

    # Job states (compared lower-cased) at which polling stops.
    _TERMINAL_STATES = frozenset({"completed", "partiallycompleted", "failed"})
    # File extensions accepted by ``upload_file``.
    _SUPPORTED_EXTENSIONS = (".pdf", ".zip")

    def __init__(self, job_id: str, client: "AsyncDocDigitizationJobClient"):
        """
        Initialize the asynchronous document digitization job.

        Parameters
        ----------
        job_id : str
            The unique identifier for the job.
        client : AsyncDocDigitizationJobClient
            The async client instance to use for API calls.
        """
        self._job_id = job_id
        self._client = client

    @property
    def job_id(self) -> str:
        """Returns the job ID associated with this job instance."""
        return self._job_id

    async def upload_file(self, file_path: str, timeout: float = 120.0) -> bool:
        """
        Upload the input PDF or ZIP file for the document digitization job.

        An upload link is requested for the file's base name, then the file
        content is sent to that link with an HTTP PUT.

        Parameters
        ----------
        file_path : str
            Path to the PDF or ZIP file to upload.
        timeout : float, default=120.0
            Timeout in seconds for the upload request.

        Returns
        -------
        bool
            True if upload was successful.

        Raises
        ------
        ValueError
            If the file is not a PDF or ZIP file, or the API returned no upload URL.
        FileNotFoundError
            If the file does not exist.
        httpx.HTTPStatusError
            If the upload request fails.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        ext = os.path.splitext(file_path)[1].lower()
        if ext not in self._SUPPORTED_EXTENSIONS:
            raise ValueError(f"Only PDF or ZIP files are supported, got: {ext}")

        upload_response = await self._client.get_upload_links(
            job_id=self._job_id,
            files=[os.path.basename(file_path)],
        )
        if not upload_response.upload_urls:
            raise ValueError("No upload URL returned from API")

        # A single file was requested, so the first entry is used as its link.
        upload_url = next(iter(upload_response.upload_urls.values())).file_url

        content_type = mimetypes.guess_type(file_path)[0] or "application/octet-stream"

        # Read the file before opening the HTTP client so a read failure
        # never sets up (and tears down) a connection pool for nothing.
        with open(file_path, "rb") as f:
            file_content = f.read()

        async with httpx.AsyncClient(timeout=timeout) as http_client:
            response = await http_client.put(
                upload_url,
                content=file_content,
                headers={
                    "Content-Type": content_type,
                    # NOTE(review): presumably the upload link targets Azure
                    # Blob Storage, which uses this header -- confirm.
                    "x-ms-blob-type": "BlockBlob",
                },
            )
            response.raise_for_status()

        return True

    async def start(self) -> DocDigitizationJobStatusResponse:
        """
        Start the document digitization job processing.

        Returns
        -------
        DocDigitizationJobStatusResponse
            The job status after starting.
        """
        return await self._client.start(job_id=self._job_id)

    async def get_status(self) -> DocDigitizationJobStatusResponse:
        """
        Retrieve the current status of the job.

        Returns
        -------
        DocDigitizationJobStatusResponse
            The current job status.
        """
        return await self._client.get_status(self._job_id)

    async def wait_until_complete(
        self, poll_interval: int = 5, timeout: int = 1800
    ) -> DocDigitizationJobStatusResponse:
        """
        Polls job status until it completes, partially completes, or fails.

        The status is always fetched at least once. The deadline is tracked
        with a monotonic clock, and the pause between polls is capped at the
        time left, so the call does not overrun ``timeout`` by a whole
        ``poll_interval``.

        Parameters
        ----------
        poll_interval : int, default=5
            Seconds between status checks.
        timeout : int, default=1800
            Maximum seconds to wait before raising TimeoutError.

        Returns
        -------
        DocDigitizationJobStatusResponse
            The final job status.

        Raises
        ------
        TimeoutError
            If the job does not complete within the timeout period.
        """
        # time.monotonic() is immune to system clock adjustments.
        deadline = time.monotonic() + timeout

        while True:
            status = await self.get_status()

            if status.job_state and status.job_state.lower() in self._TERMINAL_STATES:
                return status

            remaining = deadline - time.monotonic()
            if remaining <= 0:
                raise TimeoutError(
                    f"Job {self._job_id} did not complete within {timeout} seconds. "
                    f"Current state: {status.job_state}"
                )

            # Never sleep past the deadline; one last poll follows the sleep.
            await asyncio.sleep(min(poll_interval, remaining))

    async def download_output(self, output_path: str, timeout: float = 120.0) -> bool:
        """
        Download the output file (HTML or Markdown) to the specified path.

        Only the first download link returned for the job is fetched. Missing
        parent directories of ``output_path`` are created.

        Parameters
        ----------
        output_path : str
            Path where the output file should be saved.
        timeout : float, default=120.0
            Timeout in seconds for the download request.

        Returns
        -------
        bool
            True if download was successful.

        Raises
        ------
        ValueError
            If no download URL is available.
        httpx.HTTPStatusError
            If the download request fails.
        """
        download_response = await self._client.get_download_links(job_id=self._job_id)

        if not download_response.download_urls:
            raise ValueError("No download URL returned from API")

        download_url = next(iter(download_response.download_urls.values())).file_url

        async with httpx.AsyncClient(timeout=timeout) as http_client:
            response = await http_client.get(download_url)
            response.raise_for_status()

        # Ensure output directory exists
        os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)

        with open(output_path, "wb") as f:
            f.write(response.content)

        return True

    async def get_page_metrics(self) -> typing.Optional[typing.Dict[str, int]]:
        """
        Get page processing metrics from the job status.

        Fetches the current status and reads the first ``job_details`` entry;
        counters that are missing or falsy are reported as 0.

        Returns
        -------
        typing.Optional[typing.Dict[str, int]]
            Dictionary with keys: total_pages, pages_processed, pages_succeeded, pages_failed.
            Returns None if the status carries no job details.
        """
        status = await self.get_status()
        if not status.job_details:
            return None

        detail = status.job_details[0]
        return {
            "total_pages": detail.total_pages or 0,
            "pages_processed": detail.pages_processed or 0,
            "pages_succeeded": detail.pages_succeeded or 0,
            "pages_failed": detail.pages_failed or 0,
        }

    async def get_page_errors(self) -> typing.Optional[typing.List[typing.Any]]:
        """
        Get any page-level errors from the job.

        Fetches the current status and returns ``page_errors`` of the first
        ``job_details`` entry.

        Returns
        -------
        typing.Optional[typing.List[typing.Any]]
            List of page errors, or None if not available.
        """
        status = await self.get_status()
        if not status.job_details:
            return None
        return status.job_details[0].page_errors
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
class DocDigitizationJob:
    """
    Sync helper class for managing a Document Digitization job.

    Pairs a job ID with a ``DocDigitizationJobClient`` and provides methods
    for uploading the input file, starting the job, polling for completion,
    and downloading the output.
    """

    # Job states (compared lower-cased) at which polling stops.
    _TERMINAL_STATES = frozenset({"completed", "partiallycompleted", "failed"})
    # File extensions accepted by ``upload_file``.
    _SUPPORTED_EXTENSIONS = (".pdf", ".zip")

    def __init__(self, job_id: str, client: "DocDigitizationJobClient"):
        """
        Initialize the document digitization job.

        Parameters
        ----------
        job_id : str
            The unique identifier for the job.
        client : DocDigitizationJobClient
            The client instance to use for API calls.
        """
        self._job_id = job_id
        self._client = client

    @property
    def job_id(self) -> str:
        """Returns the job ID associated with this job instance."""
        return self._job_id

    def upload_file(self, file_path: str, timeout: float = 120.0) -> bool:
        """
        Upload the input PDF or ZIP file for the document digitization job.

        An upload link is requested for the file's base name, then the file
        content is sent to that link with an HTTP PUT.

        Parameters
        ----------
        file_path : str
            Path to the PDF or ZIP file to upload.
        timeout : float, default=120.0
            Timeout in seconds for the upload request.

        Returns
        -------
        bool
            True if upload was successful.

        Raises
        ------
        ValueError
            If the file is not a PDF or ZIP file, or the API returned no upload URL.
        FileNotFoundError
            If the file does not exist.
        httpx.HTTPStatusError
            If the upload request fails.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        ext = os.path.splitext(file_path)[1].lower()
        if ext not in self._SUPPORTED_EXTENSIONS:
            raise ValueError(f"Only PDF or ZIP files are supported, got: {ext}")

        upload_response = self._client.get_upload_links(
            job_id=self._job_id,
            files=[os.path.basename(file_path)],
        )
        if not upload_response.upload_urls:
            raise ValueError("No upload URL returned from API")

        # A single file was requested, so the first entry is used as its link.
        upload_url = next(iter(upload_response.upload_urls.values())).file_url

        content_type = mimetypes.guess_type(file_path)[0] or "application/octet-stream"

        # Read the file before opening the HTTP client so a read failure
        # never sets up (and tears down) a connection pool for nothing.
        with open(file_path, "rb") as f:
            file_content = f.read()

        with httpx.Client(timeout=timeout) as http_client:
            response = http_client.put(
                upload_url,
                content=file_content,
                headers={
                    "Content-Type": content_type,
                    # NOTE(review): presumably the upload link targets Azure
                    # Blob Storage, which uses this header -- confirm.
                    "x-ms-blob-type": "BlockBlob",
                },
            )
            response.raise_for_status()

        return True

    def start(self) -> DocDigitizationJobStatusResponse:
        """
        Start the document digitization job processing.

        Returns
        -------
        DocDigitizationJobStatusResponse
            The job status after starting.
        """
        return self._client.start(job_id=self._job_id)

    def get_status(self) -> DocDigitizationJobStatusResponse:
        """
        Retrieve the current status of the job.

        Returns
        -------
        DocDigitizationJobStatusResponse
            The current job status.
        """
        return self._client.get_status(self._job_id)

    def wait_until_complete(
        self, poll_interval: int = 5, timeout: int = 1800
    ) -> DocDigitizationJobStatusResponse:
        """
        Polls job status until it completes, partially completes, or fails.

        The status is always fetched at least once. The deadline is tracked
        with a monotonic clock, and the pause between polls is capped at the
        time left, so the call does not overrun ``timeout`` by a whole
        ``poll_interval``.

        Parameters
        ----------
        poll_interval : int, default=5
            Seconds between status checks.
        timeout : int, default=1800
            Maximum seconds to wait before raising TimeoutError.

        Returns
        -------
        DocDigitizationJobStatusResponse
            The final job status.

        Raises
        ------
        TimeoutError
            If the job does not complete within the timeout period.
        """
        # time.monotonic() is immune to system clock adjustments.
        deadline = time.monotonic() + timeout

        while True:
            status = self.get_status()

            if status.job_state and status.job_state.lower() in self._TERMINAL_STATES:
                return status

            remaining = deadline - time.monotonic()
            if remaining <= 0:
                raise TimeoutError(
                    f"Job {self._job_id} did not complete within {timeout} seconds. "
                    f"Current state: {status.job_state}"
                )

            # Never sleep past the deadline; one last poll follows the sleep.
            time.sleep(min(poll_interval, remaining))

    def download_output(self, output_path: str, timeout: float = 120.0) -> bool:
        """
        Download the output file (HTML or Markdown) to the specified path.

        Only the first download link returned for the job is fetched. Missing
        parent directories of ``output_path`` are created.

        Parameters
        ----------
        output_path : str
            Path where the output file should be saved.
        timeout : float, default=120.0
            Timeout in seconds for the download request.

        Returns
        -------
        bool
            True if download was successful.

        Raises
        ------
        ValueError
            If no download URL is available.
        httpx.HTTPStatusError
            If the download request fails.
        """
        download_response = self._client.get_download_links(job_id=self._job_id)

        if not download_response.download_urls:
            raise ValueError("No download URL returned from API")

        download_url = next(iter(download_response.download_urls.values())).file_url

        with httpx.Client(timeout=timeout) as http_client:
            response = http_client.get(download_url)
            response.raise_for_status()

        # Ensure output directory exists
        os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)

        with open(output_path, "wb") as f:
            f.write(response.content)

        return True

    def get_page_metrics(self) -> typing.Optional[typing.Dict[str, int]]:
        """
        Get page processing metrics from the job status.

        Fetches the current status and reads the first ``job_details`` entry;
        counters that are missing or falsy are reported as 0.

        Returns
        -------
        typing.Optional[typing.Dict[str, int]]
            Dictionary with keys: total_pages, pages_processed, pages_succeeded, pages_failed.
            Returns None if the status carries no job details.
        """
        status = self.get_status()
        if not status.job_details:
            return None

        detail = status.job_details[0]
        return {
            "total_pages": detail.total_pages or 0,
            "pages_processed": detail.pages_processed or 0,
            "pages_succeeded": detail.pages_succeeded or 0,
            "pages_failed": detail.pages_failed or 0,
        }

    def get_page_errors(self) -> typing.Optional[typing.List[typing.Any]]:
        """
        Get any page-level errors from the job.

        Fetches the current status and returns ``page_errors`` of the first
        ``job_details`` entry.

        Returns
        -------
        typing.Optional[typing.List[typing.Any]]
            List of page errors, or None if not available.
        """
        status = self.get_status()
        if not status.job_details:
            return None
        return status.job_details[0].page_errors
|
|
@@ -5,7 +5,7 @@ sarvamai/chat/raw_client.py,sha256=A2kRuZcVWlJhyYCD7YKgqNkZEp3cYa1731KhRkhirU0,1
|
|
|
5
5
|
sarvamai/client.py,sha256=Ii9ASYuAJXPE4R0byshfXEtb8ivjm2xJjeJNm95rJz0,8505
|
|
6
6
|
sarvamai/core/__init__.py,sha256=YE2CtXeASe1RAbaI39twKWYKCuT4tW5is9HWHhJjR_g,1653
|
|
7
7
|
sarvamai/core/api_error.py,sha256=44vPoTyWN59gonCIZMdzw7M1uspygiLnr3GNFOoVL2Q,614
|
|
8
|
-
sarvamai/core/client_wrapper.py,sha256=
|
|
8
|
+
sarvamai/core/client_wrapper.py,sha256=VT-vH4EOWOIsBhQdXoxlma-fjFBBvEnqYOjuKTSEErc,2570
|
|
9
9
|
sarvamai/core/datetime_utils.py,sha256=nBys2IsYrhPdszxGKCNRPSOCwa-5DWOHG95FB8G9PKo,1047
|
|
10
10
|
sarvamai/core/events.py,sha256=HvKBdSoYcFetk7cgNXb7FxuY-FtY8NtUhZIN7mGVx8U,1159
|
|
11
11
|
sarvamai/core/file.py,sha256=d4NNbX8XvXP32z8KpK2Xovv33nFfruIrpz0QWxlgpZk,2663
|
|
@@ -19,7 +19,8 @@ sarvamai/core/remove_none_from_dict.py,sha256=EU9SGgYidWq7SexuJbNs4-PZ-5Bl3Vppd8
|
|
|
19
19
|
sarvamai/core/request_options.py,sha256=h0QUNCFVdCW_7GclVySCAY2w4NhtXVBUCmHgmzaxpcg,1681
|
|
20
20
|
sarvamai/core/serialization.py,sha256=ECL3bvv_0i7U4uvPidZCNel--MUbA0iq0aGcNKi3kws,9818
|
|
21
21
|
sarvamai/doc_digitization_job/__init__.py,sha256=_VhToAyIt_5axN6CLJwtxg3-CO7THa_23pbUzqhXJa4,85
|
|
22
|
-
sarvamai/doc_digitization_job/client.py,sha256=
|
|
22
|
+
sarvamai/doc_digitization_job/client.py,sha256=3c4V13RIeuZjKl28loxZS1TTTIhI6zyRf-0NXYujrcw,23715
|
|
23
|
+
sarvamai/doc_digitization_job/job.py,sha256=lbQ0N6GxTgNAp_dJ_Q-6YlvpVwcgTdXyCn-ZoE9kbHg,15542
|
|
23
24
|
sarvamai/doc_digitization_job/raw_client.py,sha256=RjjYexOrF0GmaW97pyHXV_4QvtK7LN2O-qAgV-3r2kQ,48300
|
|
24
25
|
sarvamai/environment.py,sha256=hdwTU767BqRgSMLiAOocY_Vpw8V2N_hAy3yhoK8VFS0,402
|
|
25
26
|
sarvamai/errors/__init__.py,sha256=EEGwHETz9DGVcwYvrxxvTpfqaG-tjF-SiYEe6ULeXt4,595
|
|
@@ -256,6 +257,6 @@ sarvamai/types/transliterate_mode.py,sha256=1jSEMlGcoLkWuk12TgoOpSgwifa4rThGKZ1h
|
|
|
256
257
|
sarvamai/types/transliterate_source_language.py,sha256=bSY9wJszF0sg-Cgg6F-YcWC8ly1mIlj9rqa15-jBtx8,283
|
|
257
258
|
sarvamai/types/transliteration_response.py,sha256=yt-lzTbDeJ_ZL4I8kQa6oESxA9ebeJJY7LfFHpdEsmM,815
|
|
258
259
|
sarvamai/version.py,sha256=Qkp3Ee9YH-O9RTix90e0i7iNrFAGN-QDt2AFwGA4n8k,75
|
|
259
|
-
sarvamai-0.1.
|
|
260
|
-
sarvamai-0.1.
|
|
261
|
-
sarvamai-0.1.
|
|
260
|
+
sarvamai-0.1.23a8.dist-info/METADATA,sha256=oZaRA_igtRwhVfU3GPr8jvbkAmdw0N-KdY42VN-8-Xw,26753
|
|
261
|
+
sarvamai-0.1.23a8.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
|
|
262
|
+
sarvamai-0.1.23a8.dist-info/RECORD,,
|
|
File without changes
|