datalab-python-sdk 0.1.6__tar.gz → 0.1.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/PKG-INFO +1 -1
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/datalab_sdk/cli.py +25 -3
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/datalab_sdk/client.py +1 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/datalab_sdk/models.py +6 -1
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/pyproject.toml +1 -1
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/tests/test_client_methods.py +6 -1
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/uv.lock +2 -2
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/.github/workflows/ci.yml +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/.github/workflows/publish.yml +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/.gitignore +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/.pre-commit-config.yaml +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/.python-version +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/LICENSE +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/README.md +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/data/08-Lambda-Calculus.pptx +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/data/adversarial.pdf +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/data/bid_evaluation.docx +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/data/book_review.ppt +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/data/book_store.xls +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/data/chi_hind.png +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/data/how_to_read.doc +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/data/normandy.epub +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/data/sample-1-sheet.xlsx +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/data/thinkpython.pdf +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/data/vibe.html +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/datalab_sdk/__init__.py +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/datalab_sdk/exceptions.py +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/datalab_sdk/mimetypes.py +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/datalab_sdk/settings.py +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/integration/README.md +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/integration/__init__.py +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/integration/test_live_api.py +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/integration/test_readme_examples.py +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/poetry.lock +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/pytest.ini +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/tests/__init__.py +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/tests/conftest.py +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/tests/test_cli_simple.py +0 -0
|
@@ -42,6 +42,12 @@ def common_options(func):
|
|
|
42
42
|
func = click.option("--skip_cache", help="Skip the cache when running inference")(
|
|
43
43
|
func
|
|
44
44
|
)
|
|
45
|
+
func = click.option(
|
|
46
|
+
"--max_polls", default=300, type=int, help="Maximum number of polling attempts"
|
|
47
|
+
)(func)
|
|
48
|
+
func = click.option(
|
|
49
|
+
"--poll_interval", default=1, type=int, help="Polling interval in seconds"
|
|
50
|
+
)(func)
|
|
45
51
|
return func
|
|
46
52
|
|
|
47
53
|
|
|
@@ -51,7 +57,7 @@ def marker_options(func):
|
|
|
51
57
|
"--format",
|
|
52
58
|
"output_format",
|
|
53
59
|
default="markdown",
|
|
54
|
-
type=click.Choice(["markdown", "html", "json"]),
|
|
60
|
+
type=click.Choice(["markdown", "html", "json", "chunks"]),
|
|
55
61
|
help="Output format",
|
|
56
62
|
)(func)
|
|
57
63
|
func = click.option("--force_ocr", is_flag=True, help="Force OCR on every page")(
|
|
@@ -106,6 +112,8 @@ async def process_files_async(
|
|
|
106
112
|
max_concurrent: int = 5,
|
|
107
113
|
api_key: str | None = None,
|
|
108
114
|
base_url: str | None = None,
|
|
115
|
+
max_polls: int = 300,
|
|
116
|
+
poll_interval: int = 1,
|
|
109
117
|
) -> List[dict]:
|
|
110
118
|
"""Process files asynchronously"""
|
|
111
119
|
semaphore = asyncio.Semaphore(max_concurrent)
|
|
@@ -124,11 +132,13 @@ async def process_files_async(
|
|
|
124
132
|
) as client:
|
|
125
133
|
if method == "convert":
|
|
126
134
|
result = await client.convert(
|
|
127
|
-
file_path, options=options, save_output=output_path
|
|
135
|
+
file_path, options=options, save_output=output_path,
|
|
136
|
+
max_polls=max_polls, poll_interval=poll_interval
|
|
128
137
|
)
|
|
129
138
|
else: # method == 'ocr'
|
|
130
139
|
result = await client.ocr(
|
|
131
|
-
file_path, options=options, save_output=output_path
|
|
140
|
+
file_path, options=options, save_output=output_path,
|
|
141
|
+
max_polls=max_polls, poll_interval=poll_interval
|
|
132
142
|
)
|
|
133
143
|
|
|
134
144
|
return {
|
|
@@ -218,6 +228,8 @@ def process_documents(
|
|
|
218
228
|
base_url: str,
|
|
219
229
|
page_range: Optional[str],
|
|
220
230
|
skip_cache: bool,
|
|
231
|
+
max_polls: int,
|
|
232
|
+
poll_interval: int,
|
|
221
233
|
# Convert-specific options
|
|
222
234
|
output_format: Optional[str] = None,
|
|
223
235
|
force_ocr: bool = False,
|
|
@@ -288,6 +300,8 @@ def process_documents(
|
|
|
288
300
|
max_concurrent=max_concurrent,
|
|
289
301
|
api_key=api_key,
|
|
290
302
|
base_url=base_url,
|
|
303
|
+
max_polls=max_polls,
|
|
304
|
+
poll_interval=poll_interval,
|
|
291
305
|
)
|
|
292
306
|
)
|
|
293
307
|
|
|
@@ -320,6 +334,8 @@ def convert(
|
|
|
320
334
|
base_url: str,
|
|
321
335
|
page_range: Optional[str],
|
|
322
336
|
skip_cache: bool,
|
|
337
|
+
max_polls: int,
|
|
338
|
+
poll_interval: int,
|
|
323
339
|
output_format: str,
|
|
324
340
|
force_ocr: bool,
|
|
325
341
|
format_lines: bool,
|
|
@@ -342,6 +358,8 @@ def convert(
|
|
|
342
358
|
base_url=base_url,
|
|
343
359
|
page_range=page_range,
|
|
344
360
|
skip_cache=skip_cache,
|
|
361
|
+
max_polls=max_polls,
|
|
362
|
+
poll_interval=poll_interval,
|
|
345
363
|
output_format=output_format,
|
|
346
364
|
force_ocr=force_ocr,
|
|
347
365
|
format_lines=format_lines,
|
|
@@ -367,6 +385,8 @@ def ocr(
|
|
|
367
385
|
base_url: str,
|
|
368
386
|
page_range: Optional[str],
|
|
369
387
|
skip_cache: bool,
|
|
388
|
+
max_polls: int,
|
|
389
|
+
poll_interval: int,
|
|
370
390
|
):
|
|
371
391
|
"""Perform OCR on documents"""
|
|
372
392
|
process_documents(
|
|
@@ -380,6 +400,8 @@ def ocr(
|
|
|
380
400
|
base_url=base_url,
|
|
381
401
|
page_range=page_range,
|
|
382
402
|
skip_cache=skip_cache,
|
|
403
|
+
max_polls=max_polls,
|
|
404
|
+
poll_interval=poll_interval,
|
|
383
405
|
)
|
|
384
406
|
|
|
385
407
|
|
|
@@ -249,6 +249,7 @@ class AsyncDatalabClient:
|
|
|
249
249
|
markdown=result_data.get("markdown"),
|
|
250
250
|
html=result_data.get("html"),
|
|
251
251
|
json=result_data.get("json"),
|
|
252
|
+
chunks=result_data.get("chunks"),
|
|
252
253
|
extraction_schema_json=result_data.get("extraction_schema_json"),
|
|
253
254
|
images=result_data.get("images"),
|
|
254
255
|
metadata=result_data.get("metadata"),
|
|
@@ -13,7 +13,7 @@ import base64
|
|
|
13
13
|
class ProcessingOptions:
|
|
14
14
|
# Common options
|
|
15
15
|
max_pages: Optional[int] = None
|
|
16
|
-
skip_cache: bool =
|
|
16
|
+
skip_cache: bool = False
|
|
17
17
|
page_range: Optional[str] = None
|
|
18
18
|
|
|
19
19
|
def to_form_data(self) -> Dict[str, Any]:
|
|
@@ -65,6 +65,7 @@ class ConversionResult:
|
|
|
65
65
|
markdown: Optional[str] = None
|
|
66
66
|
html: Optional[str] = None
|
|
67
67
|
json: Optional[Dict[str, Any]] = None
|
|
68
|
+
chunks: Optional[Dict[str, Any]] = None
|
|
68
69
|
extraction_schema_json: Optional[str] = None
|
|
69
70
|
images: Optional[Dict[str, str]] = None
|
|
70
71
|
metadata: Optional[Dict[str, Any]] = None
|
|
@@ -91,6 +92,10 @@ class ConversionResult:
|
|
|
91
92
|
with open(output_path.with_suffix(".json"), "w", encoding="utf-8") as f:
|
|
92
93
|
json.dump(self.json, f, indent=2)
|
|
93
94
|
|
|
95
|
+
if self.chunks:
|
|
96
|
+
with open(output_path.with_suffix(".chunks.json"), "w", encoding="utf-8") as f:
|
|
97
|
+
json.dump(self.chunks, f, indent=2)
|
|
98
|
+
|
|
94
99
|
if self.extraction_schema_json:
|
|
95
100
|
with open(
|
|
96
101
|
output_path.with_suffix("_extraction_results.json"),
|
|
@@ -7,7 +7,7 @@ readme = "README.md"
|
|
|
7
7
|
license = "MIT"
|
|
8
8
|
repository = "https://github.com/datalab-to/sdk"
|
|
9
9
|
keywords = ["datalab", "sdk", "document-intelligence", "api"]
|
|
10
|
-
version = "0.1.6"
|
|
10
|
+
version = "0.1.8"
|
|
11
11
|
description = "SDK for the Datalab document intelligence API"
|
|
12
12
|
requires-python = ">=3.10"
|
|
13
13
|
dependencies = [
|
|
@@ -89,6 +89,7 @@ class TestConvertMethod:
|
|
|
89
89
|
"markdown": "# Test Document\n\nThis is a test document.",
|
|
90
90
|
"html": None,
|
|
91
91
|
"json": None,
|
|
92
|
+
"chunks": {"some_content": True},
|
|
92
93
|
"images": {},
|
|
93
94
|
"metadata": {"pages": 1},
|
|
94
95
|
"error": "",
|
|
@@ -114,13 +115,17 @@ class TestConvertMethod:
|
|
|
114
115
|
# Verify result
|
|
115
116
|
assert result.success is True
|
|
116
117
|
|
|
117
|
-
# Verify file was saved
|
|
118
|
+
# Verify Markdown file was saved
|
|
118
119
|
assert (output_path.with_suffix(".md")).exists()
|
|
119
120
|
saved_content = (output_path.with_suffix(".md")).read_text()
|
|
120
121
|
assert (
|
|
121
122
|
saved_content == "# Test Document\n\nThis is a test document."
|
|
122
123
|
)
|
|
123
124
|
|
|
125
|
+
assert (output_path.with_suffix(".chunks.json")).exists()
|
|
126
|
+
saved_chunks = json.loads((output_path.with_suffix(".chunks.json")).read_text())
|
|
127
|
+
assert saved_chunks == {"some_content": True}
|
|
128
|
+
|
|
124
129
|
def test_convert_sync_with_processing_options(self, temp_dir):
|
|
125
130
|
"""Test synchronous conversion with processing options"""
|
|
126
131
|
# Create test file
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
version = 1
|
|
2
|
-
revision = 2
|
|
2
|
+
revision = 3
|
|
3
3
|
requires-python = ">=3.10"
|
|
4
4
|
|
|
5
5
|
[[package]]
|
|
@@ -169,7 +169,7 @@ wheels = [
|
|
|
169
169
|
|
|
170
170
|
[[package]]
|
|
171
171
|
name = "datalab-python-sdk"
|
|
172
|
-
version = "0.1.6"
|
|
172
|
+
version = "0.1.8"
|
|
173
173
|
source = { editable = "." }
|
|
174
174
|
dependencies = [
|
|
175
175
|
{ name = "aiohttp" },
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|