datalab-python-sdk 0.1.5__tar.gz → 0.1.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/.gitignore +1 -0
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/PKG-INFO +2 -1
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/datalab_sdk/client.py +78 -8
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/datalab_sdk/models.py +7 -2
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/datalab_sdk/settings.py +1 -1
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/integration/test_live_api.py +8 -0
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/pyproject.toml +3 -1
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/tests/test_client_methods.py +185 -1
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/uv.lock +13 -2
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/.github/workflows/ci.yml +0 -0
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/.github/workflows/publish.yml +0 -0
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/.pre-commit-config.yaml +0 -0
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/.python-version +0 -0
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/LICENSE +0 -0
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/README.md +0 -0
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/data/08-Lambda-Calculus.pptx +0 -0
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/data/adversarial.pdf +0 -0
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/data/bid_evaluation.docx +0 -0
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/data/book_review.ppt +0 -0
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/data/book_store.xls +0 -0
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/data/chi_hind.png +0 -0
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/data/how_to_read.doc +0 -0
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/data/normandy.epub +0 -0
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/data/sample-1-sheet.xlsx +0 -0
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/data/thinkpython.pdf +0 -0
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/data/vibe.html +0 -0
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/datalab_sdk/__init__.py +0 -0
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/datalab_sdk/cli.py +0 -0
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/datalab_sdk/exceptions.py +0 -0
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/datalab_sdk/mimetypes.py +0 -0
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/integration/README.md +0 -0
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/integration/__init__.py +0 -0
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/integration/test_readme_examples.py +0 -0
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/poetry.lock +0 -0
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/pytest.ini +0 -0
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/tests/__init__.py +0 -0
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/tests/conftest.py +0 -0
- {datalab_python_sdk-0.1.5 → datalab_python_sdk-0.1.6}/tests/test_cli_simple.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datalab-python-sdk
|
|
3
|
-
Version: 0.1.5
|
|
3
|
+
Version: 0.1.6
|
|
4
4
|
Summary: SDK for the Datalab document intelligence API
|
|
5
5
|
Author-email: Datalab Team <hi@datalab.to>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -11,6 +11,7 @@ Requires-Dist: aiohttp>=3.12.14
|
|
|
11
11
|
Requires-Dist: click>=8.2.1
|
|
12
12
|
Requires-Dist: pydantic-settings<3.0.0,>=2.10.1
|
|
13
13
|
Requires-Dist: pydantic<3.0.0,>=2.11.7
|
|
14
|
+
Requires-Dist: tenacity<9.0.0,>=8.2.3
|
|
14
15
|
Description-Content-Type: text/markdown
|
|
15
16
|
|
|
16
17
|
# Datalab SDK
|
|
@@ -5,6 +5,13 @@ Datalab API client - async core with sync wrapper
|
|
|
5
5
|
import asyncio
|
|
6
6
|
import mimetypes
|
|
7
7
|
import aiohttp
|
|
8
|
+
from tenacity import (
|
|
9
|
+
retry,
|
|
10
|
+
retry_if_exception,
|
|
11
|
+
retry_if_exception_type,
|
|
12
|
+
stop_after_attempt,
|
|
13
|
+
wait_exponential_jitter,
|
|
14
|
+
)
|
|
8
15
|
from pathlib import Path
|
|
9
16
|
from typing import Union, Optional, Dict, Any
|
|
10
17
|
|
|
@@ -119,7 +126,7 @@ class AsyncDatalabClient:
|
|
|
119
126
|
)
|
|
120
127
|
|
|
121
128
|
for i in range(max_polls):
|
|
122
|
-
data = await self._make_request("GET", full_url)
|
|
129
|
+
data = await self._poll_get_with_retry(full_url)
|
|
123
130
|
|
|
124
131
|
if data.get("status") == "complete":
|
|
125
132
|
return data
|
|
@@ -135,6 +142,32 @@ class AsyncDatalabClient:
|
|
|
135
142
|
f"Polling timed out after {max_polls * poll_interval} seconds"
|
|
136
143
|
)
|
|
137
144
|
|
|
145
|
+
@retry(
|
|
146
|
+
retry=(
|
|
147
|
+
retry_if_exception_type(DatalabTimeoutError)
|
|
148
|
+
| retry_if_exception(
|
|
149
|
+
lambda e: isinstance(e, DatalabAPIError)
|
|
150
|
+
and (
|
|
151
|
+
# retry request timeout or too many requests
|
|
152
|
+
getattr(e, "status_code", None) in (408, 429)
|
|
153
|
+
or (
|
|
154
|
+
# or if there's a server error
|
|
155
|
+
getattr(e, "status_code", None) is not None
|
|
156
|
+
and getattr(e, "status_code") >= 500
|
|
157
|
+
)
|
|
158
|
+
# or datalab api error without status code (e.g., connection errors)
|
|
159
|
+
or getattr(e, "status_code", None) is None
|
|
160
|
+
)
|
|
161
|
+
)
|
|
162
|
+
),
|
|
163
|
+
stop=stop_after_attempt(2),
|
|
164
|
+
wait=wait_exponential_jitter(max=0.5),
|
|
165
|
+
reraise=True,
|
|
166
|
+
)
|
|
167
|
+
async def _poll_get_with_retry(self, url: str) -> Dict[str, Any]:
|
|
168
|
+
"""GET wrapper for polling with scoped retries for transient failures"""
|
|
169
|
+
return await self._make_request("GET", url)
|
|
170
|
+
|
|
138
171
|
def _prepare_file_data(self, file_path: Union[str, Path]) -> tuple:
|
|
139
172
|
"""Prepare file data for upload"""
|
|
140
173
|
file_path = Path(file_path)
|
|
@@ -156,7 +189,7 @@ class AsyncDatalabClient:
|
|
|
156
189
|
|
|
157
190
|
if file_url and file_path:
|
|
158
191
|
raise ValueError("Either file_path or file_url must be provided, not both.")
|
|
159
|
-
|
|
192
|
+
|
|
160
193
|
# Use either file_url or file upload, not both
|
|
161
194
|
if file_url:
|
|
162
195
|
form_data.add_field("file_url", file_url)
|
|
@@ -184,13 +217,19 @@ class AsyncDatalabClient:
|
|
|
184
217
|
file_url: Optional[str] = None,
|
|
185
218
|
options: Optional[ProcessingOptions] = None,
|
|
186
219
|
save_output: Optional[Union[str, Path]] = None,
|
|
220
|
+
max_polls: int = 300,
|
|
221
|
+
poll_interval: int = 1,
|
|
187
222
|
) -> ConversionResult:
|
|
188
223
|
"""Convert a document using the marker endpoint"""
|
|
189
224
|
if options is None:
|
|
190
225
|
options = ConvertOptions()
|
|
191
226
|
|
|
192
227
|
initial_data = await self._make_request(
|
|
193
|
-
"POST",
|
|
228
|
+
"POST",
|
|
229
|
+
"/api/v1/marker",
|
|
230
|
+
data=self.get_form_params(
|
|
231
|
+
file_path=file_path, file_url=file_url, options=options
|
|
232
|
+
),
|
|
194
233
|
)
|
|
195
234
|
|
|
196
235
|
if not initial_data.get("success"):
|
|
@@ -198,7 +237,11 @@ class AsyncDatalabClient:
|
|
|
198
237
|
f"Request failed: {initial_data.get('error', 'Unknown error')}"
|
|
199
238
|
)
|
|
200
239
|
|
|
201
|
-
result_data = await self._poll_result(
|
|
240
|
+
result_data = await self._poll_result(
|
|
241
|
+
initial_data["request_check_url"],
|
|
242
|
+
max_polls=max_polls,
|
|
243
|
+
poll_interval=poll_interval,
|
|
244
|
+
)
|
|
202
245
|
|
|
203
246
|
result = ConversionResult(
|
|
204
247
|
success=result_data.get("success", False),
|
|
@@ -227,13 +270,17 @@ class AsyncDatalabClient:
|
|
|
227
270
|
file_path: Union[str, Path],
|
|
228
271
|
options: Optional[ProcessingOptions] = None,
|
|
229
272
|
save_output: Optional[Union[str, Path]] = None,
|
|
273
|
+
max_polls: int = 300,
|
|
274
|
+
poll_interval: int = 1,
|
|
230
275
|
) -> OCRResult:
|
|
231
276
|
"""Perform OCR on a document"""
|
|
232
277
|
if options is None:
|
|
233
278
|
options = OCROptions()
|
|
234
279
|
|
|
235
280
|
initial_data = await self._make_request(
|
|
236
|
-
"POST",
|
|
281
|
+
"POST",
|
|
282
|
+
"/api/v1/ocr",
|
|
283
|
+
data=self.get_form_params(file_path=file_path, options=options),
|
|
237
284
|
)
|
|
238
285
|
|
|
239
286
|
if not initial_data.get("success"):
|
|
@@ -241,7 +288,11 @@ class AsyncDatalabClient:
|
|
|
241
288
|
f"Request failed: {initial_data.get('error', 'Unknown error')}"
|
|
242
289
|
)
|
|
243
290
|
|
|
244
|
-
result_data = await self._poll_result(
|
|
291
|
+
result_data = await self._poll_result(
|
|
292
|
+
initial_data["request_check_url"],
|
|
293
|
+
max_polls=max_polls,
|
|
294
|
+
poll_interval=poll_interval,
|
|
295
|
+
)
|
|
245
296
|
|
|
246
297
|
result = OCRResult(
|
|
247
298
|
success=result_data.get("success", False),
|
|
@@ -299,10 +350,19 @@ class DatalabClient:
|
|
|
299
350
|
file_url: Optional[str] = None,
|
|
300
351
|
options: Optional[ProcessingOptions] = None,
|
|
301
352
|
save_output: Optional[Union[str, Path]] = None,
|
|
353
|
+
max_polls: int = 300,
|
|
354
|
+
poll_interval: int = 1,
|
|
302
355
|
) -> ConversionResult:
|
|
303
356
|
"""Convert a document using the marker endpoint (sync version)"""
|
|
304
357
|
return self._run_async(
|
|
305
|
-
self._async_client.convert(
|
|
358
|
+
self._async_client.convert(
|
|
359
|
+
file_path=file_path,
|
|
360
|
+
file_url=file_url,
|
|
361
|
+
options=options,
|
|
362
|
+
save_output=save_output,
|
|
363
|
+
max_polls=max_polls,
|
|
364
|
+
poll_interval=poll_interval,
|
|
365
|
+
)
|
|
306
366
|
)
|
|
307
367
|
|
|
308
368
|
def ocr(
|
|
@@ -310,6 +370,16 @@ class DatalabClient:
|
|
|
310
370
|
file_path: Union[str, Path],
|
|
311
371
|
options: Optional[ProcessingOptions] = None,
|
|
312
372
|
save_output: Optional[Union[str, Path]] = None,
|
|
373
|
+
max_polls: int = 300,
|
|
374
|
+
poll_interval: int = 1,
|
|
313
375
|
) -> OCRResult:
|
|
314
376
|
"""Perform OCR on a document (sync version)"""
|
|
315
|
-
return self._run_async(
|
|
377
|
+
return self._run_async(
|
|
378
|
+
self._async_client.ocr(
|
|
379
|
+
file_path=file_path,
|
|
380
|
+
options=options,
|
|
381
|
+
save_output=save_output,
|
|
382
|
+
max_polls=max_polls,
|
|
383
|
+
poll_interval=poll_interval,
|
|
384
|
+
)
|
|
385
|
+
)
|
|
@@ -47,7 +47,8 @@ class ConvertOptions(ProcessingOptions):
|
|
|
47
47
|
block_correction_prompt: Optional[str] = None
|
|
48
48
|
additional_config: Optional[Dict[str, Any]] = None
|
|
49
49
|
page_schema: Optional[Dict[str, Any]] = None
|
|
50
|
-
output_format: str = "markdown" # markdown, json, html
|
|
50
|
+
output_format: str = "markdown" # markdown, json, html, chunks
|
|
51
|
+
mode: str = "fast" # fast, balanced, accurate
|
|
51
52
|
|
|
52
53
|
|
|
53
54
|
@dataclass
|
|
@@ -91,7 +92,11 @@ class ConversionResult:
|
|
|
91
92
|
json.dump(self.json, f, indent=2)
|
|
92
93
|
|
|
93
94
|
if self.extraction_schema_json:
|
|
94
|
-
with open(
|
|
95
|
+
with open(
|
|
96
|
+
output_path.with_suffix("_extraction_results.json"),
|
|
97
|
+
"w",
|
|
98
|
+
encoding="utf-8",
|
|
99
|
+
) as f:
|
|
95
100
|
f.write(self.extraction_schema_json)
|
|
96
101
|
|
|
97
102
|
# Save images if present
|
|
@@ -62,6 +62,14 @@ class TestMarkerIntegration:
|
|
|
62
62
|
assert len(result.html) > 0
|
|
63
63
|
assert result.output_format == "html"
|
|
64
64
|
|
|
65
|
+
def test_convert_pdf_high_accuracy(self):
|
|
66
|
+
client = DatalabClient()
|
|
67
|
+
pdf_file = DATA_DIR / "adversarial.pdf"
|
|
68
|
+
options = ConvertOptions(mode="accurate", max_pages=1)
|
|
69
|
+
result = client.convert(pdf_file, options=options)
|
|
70
|
+
|
|
71
|
+
assert "subspace" in result.markdown.lower()
|
|
72
|
+
|
|
65
73
|
@pytest.mark.asyncio
|
|
66
74
|
async def test_convert_async_with_json(self):
|
|
67
75
|
"""Test async conversion with JSON output"""
|
|
@@ -7,7 +7,7 @@ readme = "README.md"
|
|
|
7
7
|
license = "MIT"
|
|
8
8
|
repository = "https://github.com/datalab-to/sdk"
|
|
9
9
|
keywords = ["datalab", "sdk", "document-intelligence", "api"]
|
|
10
|
-
version = "0.1.5"
|
|
10
|
+
version = "0.1.6"
|
|
11
11
|
description = "SDK for the Datalab document intelligence API"
|
|
12
12
|
requires-python = ">=3.10"
|
|
13
13
|
dependencies = [
|
|
@@ -15,6 +15,7 @@ dependencies = [
|
|
|
15
15
|
"click>=8.2.1",
|
|
16
16
|
"pydantic>=2.11.7,<3.0.0",
|
|
17
17
|
"pydantic-settings>=2.10.1,<3.0.0",
|
|
18
|
+
"tenacity>=8.2.3,<9.0.0",
|
|
18
19
|
]
|
|
19
20
|
|
|
20
21
|
[project.scripts]
|
|
@@ -27,6 +28,7 @@ test = [
|
|
|
27
28
|
"pytest-mock>=3.11.0",
|
|
28
29
|
"pytest-cov>=4.1.0",
|
|
29
30
|
"aiofiles>=23.2.0",
|
|
31
|
+
"pytest-xdist>=3.8.0",
|
|
30
32
|
]
|
|
31
33
|
|
|
32
34
|
[build-system]
|
|
@@ -8,7 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
from datalab_sdk import DatalabClient, AsyncDatalabClient
|
|
10
10
|
from datalab_sdk.models import ConversionResult, OCRResult, ConvertOptions, OCROptions
|
|
11
|
-
from datalab_sdk.exceptions import
|
|
11
|
+
from datalab_sdk.exceptions import (
|
|
12
|
+
DatalabAPIError,
|
|
13
|
+
DatalabFileError,
|
|
14
|
+
DatalabTimeoutError,
|
|
15
|
+
)
|
|
12
16
|
|
|
13
17
|
|
|
14
18
|
class TestConvertMethod:
|
|
@@ -169,6 +173,50 @@ class TestConvertMethod:
|
|
|
169
173
|
assert result.html == "<h1>Test Document</h1>"
|
|
170
174
|
assert result.output_format == "html"
|
|
171
175
|
|
|
176
|
+
@pytest.mark.asyncio
|
|
177
|
+
async def test_convert_async_respects_polling_params(self, temp_dir):
|
|
178
|
+
"""Verify convert passes max_polls and poll_interval to poller"""
|
|
179
|
+
# Create test file
|
|
180
|
+
pdf_file = temp_dir / "test.pdf"
|
|
181
|
+
pdf_file.write_bytes(b"%PDF-1.4\n%Test PDF content\n%%EOF\n")
|
|
182
|
+
|
|
183
|
+
# Mock API responses
|
|
184
|
+
mock_initial_response = {
|
|
185
|
+
"success": True,
|
|
186
|
+
"request_id": "rid-1",
|
|
187
|
+
"request_check_url": "https://api.datalab.to/api/v1/marker/rid-1",
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
mock_result_response = {
|
|
191
|
+
"success": True,
|
|
192
|
+
"status": "complete",
|
|
193
|
+
"output_format": "markdown",
|
|
194
|
+
"markdown": "ok",
|
|
195
|
+
}
|
|
196
|
+
|
|
197
|
+
async with AsyncDatalabClient(api_key="test-key") as client:
|
|
198
|
+
with patch.object(
|
|
199
|
+
client, "_make_request", new_callable=AsyncMock
|
|
200
|
+
) as mock_req:
|
|
201
|
+
with patch.object(
|
|
202
|
+
client, "_poll_result", new_callable=AsyncMock
|
|
203
|
+
) as mock_poll:
|
|
204
|
+
mock_req.return_value = mock_initial_response
|
|
205
|
+
mock_poll.return_value = mock_result_response
|
|
206
|
+
|
|
207
|
+
max_polls = 7
|
|
208
|
+
poll_interval = 3
|
|
209
|
+
await client.convert(
|
|
210
|
+
pdf_file, max_polls=max_polls, poll_interval=poll_interval
|
|
211
|
+
)
|
|
212
|
+
|
|
213
|
+
mock_poll.assert_awaited_once()
|
|
214
|
+
# Verify kwargs were forwarded
|
|
215
|
+
args, kwargs = mock_poll.await_args
|
|
216
|
+
assert args[0] == mock_initial_response["request_check_url"]
|
|
217
|
+
assert kwargs["max_polls"] == max_polls
|
|
218
|
+
assert kwargs["poll_interval"] == poll_interval
|
|
219
|
+
|
|
172
220
|
|
|
173
221
|
class TestOCRMethod:
|
|
174
222
|
"""Test the ocr method"""
|
|
@@ -356,6 +404,77 @@ class TestOCRMethod:
|
|
|
356
404
|
assert "Page 1 content" in all_text
|
|
357
405
|
assert "Page 2 content" in all_text
|
|
358
406
|
|
|
407
|
+
@pytest.mark.asyncio
|
|
408
|
+
async def test_ocr_async_respects_polling_params(self, temp_dir):
|
|
409
|
+
"""Verify ocr passes max_polls and poll_interval to poller"""
|
|
410
|
+
pdf_file = temp_dir / "test.pdf"
|
|
411
|
+
pdf_file.write_bytes(b"%PDF-1.4\n%Test PDF content\n%%EOF\n")
|
|
412
|
+
|
|
413
|
+
mock_initial_response = {
|
|
414
|
+
"success": True,
|
|
415
|
+
"request_id": "rid-2",
|
|
416
|
+
"request_check_url": "https://api.datalab.to/api/v1/ocr/rid-2",
|
|
417
|
+
}
|
|
418
|
+
|
|
419
|
+
mock_result_response = {
|
|
420
|
+
"success": True,
|
|
421
|
+
"status": "complete",
|
|
422
|
+
"pages": [],
|
|
423
|
+
}
|
|
424
|
+
|
|
425
|
+
async with AsyncDatalabClient(api_key="test-key") as client:
|
|
426
|
+
with patch.object(
|
|
427
|
+
client, "_make_request", new_callable=AsyncMock
|
|
428
|
+
) as mock_req:
|
|
429
|
+
with patch.object(
|
|
430
|
+
client, "_poll_result", new_callable=AsyncMock
|
|
431
|
+
) as mock_poll:
|
|
432
|
+
mock_req.return_value = mock_initial_response
|
|
433
|
+
mock_poll.return_value = mock_result_response
|
|
434
|
+
|
|
435
|
+
max_polls = 11
|
|
436
|
+
poll_interval = 2
|
|
437
|
+
await client.ocr(
|
|
438
|
+
pdf_file, max_polls=max_polls, poll_interval=poll_interval
|
|
439
|
+
)
|
|
440
|
+
|
|
441
|
+
mock_poll.assert_awaited_once()
|
|
442
|
+
args, kwargs = mock_poll.await_args
|
|
443
|
+
assert args[0] == mock_initial_response["request_check_url"]
|
|
444
|
+
assert kwargs["max_polls"] == max_polls
|
|
445
|
+
assert kwargs["poll_interval"] == poll_interval
|
|
446
|
+
|
|
447
|
+
def test_sync_wrappers_forward_polling_params(self, temp_dir):
|
|
448
|
+
"""Ensure sync client forwards polling params to async client"""
|
|
449
|
+
pdf_file = temp_dir / "test.pdf"
|
|
450
|
+
pdf_file.write_bytes(b"%PDF-1.4\n%Test PDF content\n%%EOF\n")
|
|
451
|
+
|
|
452
|
+
client = DatalabClient(api_key="test-key")
|
|
453
|
+
|
|
454
|
+
# Patch async convert/ocr to capture kwargs
|
|
455
|
+
with patch.object(
|
|
456
|
+
client._async_client, "convert", new_callable=AsyncMock
|
|
457
|
+
) as mock_conv:
|
|
458
|
+
with patch.object(
|
|
459
|
+
client._async_client, "ocr", new_callable=AsyncMock
|
|
460
|
+
) as mock_ocr:
|
|
461
|
+
mock_conv.return_value = ConversionResult(
|
|
462
|
+
success=True, output_format="markdown", markdown="ok"
|
|
463
|
+
)
|
|
464
|
+
mock_ocr.return_value = OCRResult(success=True, pages=[])
|
|
465
|
+
|
|
466
|
+
client.convert(pdf_file, max_polls=5, poll_interval=9)
|
|
467
|
+
client.ocr(pdf_file, max_polls=6, poll_interval=4)
|
|
468
|
+
|
|
469
|
+
# Assert called with forwarded kwargs
|
|
470
|
+
_, conv_kwargs = mock_conv.await_args
|
|
471
|
+
assert conv_kwargs["max_polls"] == 5
|
|
472
|
+
assert conv_kwargs["poll_interval"] == 9
|
|
473
|
+
|
|
474
|
+
_, ocr_kwargs = mock_ocr.await_args
|
|
475
|
+
assert ocr_kwargs["max_polls"] == 6
|
|
476
|
+
assert ocr_kwargs["poll_interval"] == 4
|
|
477
|
+
|
|
359
478
|
|
|
360
479
|
class TestClientErrorHandling:
|
|
361
480
|
"""Test error handling in client methods"""
|
|
@@ -416,3 +535,68 @@ class TestClientErrorHandling:
|
|
|
416
535
|
DatalabAPIError, match="Request failed: Processing failed"
|
|
417
536
|
):
|
|
418
537
|
client.convert(pdf_file)
|
|
538
|
+
|
|
539
|
+
def test_convert_timeout_bubbles_up(self, temp_dir):
|
|
540
|
+
"""Polling timeout surfaces as DatalabTimeoutError for sync convert"""
|
|
541
|
+
pdf_file = temp_dir / "test.pdf"
|
|
542
|
+
pdf_file.write_bytes(b"%PDF-1.4\n%Test PDF content\n%%EOF\n")
|
|
543
|
+
|
|
544
|
+
mock_initial_response = {
|
|
545
|
+
"success": True,
|
|
546
|
+
"request_id": "rid-timeout",
|
|
547
|
+
"request_check_url": "https://api.datalab.to/api/v1/marker/rid-timeout",
|
|
548
|
+
}
|
|
549
|
+
|
|
550
|
+
client = DatalabClient(api_key="test-key")
|
|
551
|
+
with patch.object(
|
|
552
|
+
client._async_client, "_make_request", new_callable=AsyncMock
|
|
553
|
+
) as mock_request:
|
|
554
|
+
with patch.object(
|
|
555
|
+
client._async_client, "_poll_result", new_callable=AsyncMock
|
|
556
|
+
) as mock_poll:
|
|
557
|
+
mock_request.return_value = mock_initial_response
|
|
558
|
+
mock_poll.side_effect = DatalabTimeoutError("Polling timed out")
|
|
559
|
+
|
|
560
|
+
with pytest.raises(DatalabTimeoutError, match="Polling timed out"):
|
|
561
|
+
client.convert(pdf_file)
|
|
562
|
+
|
|
563
|
+
|
|
564
|
+
class TestPollingLoop:
|
|
565
|
+
"""Direct tests for the internal polling helper"""
|
|
566
|
+
|
|
567
|
+
@pytest.mark.asyncio
|
|
568
|
+
async def test_poll_result_times_out(self):
|
|
569
|
+
async with AsyncDatalabClient(api_key="test-key") as client:
|
|
570
|
+
with (
|
|
571
|
+
patch.object(
|
|
572
|
+
client, "_make_request", new_callable=AsyncMock
|
|
573
|
+
) as mock_req,
|
|
574
|
+
patch("asyncio.sleep", new_callable=AsyncMock) as mock_sleep,
|
|
575
|
+
):
|
|
576
|
+
# Always return processing so we hit timeout
|
|
577
|
+
mock_req.return_value = {"status": "processing", "success": True}
|
|
578
|
+
|
|
579
|
+
with pytest.raises(DatalabTimeoutError):
|
|
580
|
+
await client._poll_result(
|
|
581
|
+
"https://api.example.com/check", max_polls=3, poll_interval=0
|
|
582
|
+
)
|
|
583
|
+
|
|
584
|
+
assert mock_req.await_count == 3
|
|
585
|
+
assert mock_sleep.await_count >= 1
|
|
586
|
+
|
|
587
|
+
@pytest.mark.asyncio
|
|
588
|
+
async def test_poll_result_raises_on_failed_status(self):
|
|
589
|
+
async with AsyncDatalabClient(api_key="test-key") as client:
|
|
590
|
+
with patch.object(
|
|
591
|
+
client, "_make_request", new_callable=AsyncMock
|
|
592
|
+
) as mock_req:
|
|
593
|
+
mock_req.return_value = {
|
|
594
|
+
"status": "failed",
|
|
595
|
+
"success": False,
|
|
596
|
+
"error": "boom",
|
|
597
|
+
}
|
|
598
|
+
|
|
599
|
+
with pytest.raises(DatalabAPIError, match="Processing failed: boom"):
|
|
600
|
+
await client._poll_result(
|
|
601
|
+
"https://api.example.com/check", max_polls=1, poll_interval=0
|
|
602
|
+
)
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
version = 1
|
|
2
|
-
revision =
|
|
2
|
+
revision = 2
|
|
3
3
|
requires-python = ">=3.10"
|
|
4
4
|
|
|
5
5
|
[[package]]
|
|
@@ -169,13 +169,14 @@ wheels = [
|
|
|
169
169
|
|
|
170
170
|
[[package]]
|
|
171
171
|
name = "datalab-python-sdk"
|
|
172
|
-
version = "0.1.5"
|
|
172
|
+
version = "0.1.6"
|
|
173
173
|
source = { editable = "." }
|
|
174
174
|
dependencies = [
|
|
175
175
|
{ name = "aiohttp" },
|
|
176
176
|
{ name = "click" },
|
|
177
177
|
{ name = "pydantic" },
|
|
178
178
|
{ name = "pydantic-settings" },
|
|
179
|
+
{ name = "tenacity" },
|
|
179
180
|
]
|
|
180
181
|
|
|
181
182
|
[package.dev-dependencies]
|
|
@@ -195,6 +196,7 @@ requires-dist = [
|
|
|
195
196
|
{ name = "click", specifier = ">=8.2.1" },
|
|
196
197
|
{ name = "pydantic", specifier = ">=2.11.7,<3.0.0" },
|
|
197
198
|
{ name = "pydantic-settings", specifier = ">=2.10.1,<3.0.0" },
|
|
199
|
+
{ name = "tenacity", specifier = ">=8.2.3,<9.0.0" },
|
|
198
200
|
]
|
|
199
201
|
|
|
200
202
|
[package.metadata.requires-dev]
|
|
@@ -857,6 +859,15 @@ wheels = [
|
|
|
857
859
|
{ url = "https://files.pythonhosted.org/packages/e0/30/f3eaf6563c637b6e66238ed6535f6775480db973c836336e4122161986fc/ruff-0.12.3-py3-none-win_arm64.whl", hash = "sha256:5f9c7c9c8f84c2d7f27e93674d27136fbf489720251544c4da7fb3d742e011b1", size = 10805855, upload-time = "2025-07-11T13:21:13.547Z" },
|
|
858
860
|
]
|
|
859
861
|
|
|
862
|
+
[[package]]
|
|
863
|
+
name = "tenacity"
|
|
864
|
+
version = "8.5.0"
|
|
865
|
+
source = { registry = "https://pypi.org/simple" }
|
|
866
|
+
sdist = { url = "https://files.pythonhosted.org/packages/a3/4d/6a19536c50b849338fcbe9290d562b52cbdcf30d8963d3588a68a4107df1/tenacity-8.5.0.tar.gz", hash = "sha256:8bc6c0c8a09b31e6cad13c47afbed1a567518250a9a171418582ed8d9c20ca78", size = 47309, upload-time = "2024-07-05T07:25:31.836Z" }
|
|
867
|
+
wheels = [
|
|
868
|
+
{ url = "https://files.pythonhosted.org/packages/d2/3f/8ba87d9e287b9d385a02a7114ddcef61b26f86411e121c9003eb509a1773/tenacity-8.5.0-py3-none-any.whl", hash = "sha256:b594c2a5945830c267ce6b79a166228323ed52718f30302c1359836112346687", size = 28165, upload-time = "2024-07-05T07:25:29.591Z" },
|
|
869
|
+
]
|
|
870
|
+
|
|
860
871
|
[[package]]
|
|
861
872
|
name = "tomli"
|
|
862
873
|
version = "2.2.1"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|