datalab-python-sdk 0.1.2__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datalab_python_sdk-0.1.2 → datalab_python_sdk-0.1.4}/.github/workflows/ci.yml +1 -1
- datalab_python_sdk-0.1.4/PKG-INFO +68 -0
- datalab_python_sdk-0.1.4/README.md +53 -0
- {datalab_python_sdk-0.1.2 → datalab_python_sdk-0.1.4}/datalab_sdk/__init__.py +2 -1
- datalab_python_sdk-0.1.4/datalab_sdk/cli.py +391 -0
- {datalab_python_sdk-0.1.2 → datalab_python_sdk-0.1.4}/datalab_sdk/client.py +4 -5
- {datalab_python_sdk-0.1.2 → datalab_python_sdk-0.1.4}/datalab_sdk/settings.py +1 -0
- {datalab_python_sdk-0.1.2 → datalab_python_sdk-0.1.4}/pyproject.toml +15 -7
- {datalab_python_sdk-0.1.2 → datalab_python_sdk-0.1.4}/tests/test_cli_simple.py +41 -95
- {datalab_python_sdk-0.1.2 → datalab_python_sdk-0.1.4}/uv.lock +8 -127
- datalab_python_sdk-0.1.2/PKG-INFO +0 -17
- datalab_python_sdk-0.1.2/README.md +0 -178
- datalab_python_sdk-0.1.2/datalab_sdk/cli.py +0 -440
- {datalab_python_sdk-0.1.2 → datalab_python_sdk-0.1.4}/.github/workflows/publish.yml +0 -0
- {datalab_python_sdk-0.1.2 → datalab_python_sdk-0.1.4}/.gitignore +0 -0
- {datalab_python_sdk-0.1.2 → datalab_python_sdk-0.1.4}/.pre-commit-config.yaml +0 -0
- {datalab_python_sdk-0.1.2 → datalab_python_sdk-0.1.4}/.python-version +0 -0
- {datalab_python_sdk-0.1.2 → datalab_python_sdk-0.1.4}/LICENSE +0 -0
- {datalab_python_sdk-0.1.2 → datalab_python_sdk-0.1.4}/data/08-Lambda-Calculus.pptx +0 -0
- {datalab_python_sdk-0.1.2 → datalab_python_sdk-0.1.4}/data/adversarial.pdf +0 -0
- {datalab_python_sdk-0.1.2 → datalab_python_sdk-0.1.4}/data/bid_evaluation.docx +0 -0
- {datalab_python_sdk-0.1.2 → datalab_python_sdk-0.1.4}/data/book_review.ppt +0 -0
- {datalab_python_sdk-0.1.2 → datalab_python_sdk-0.1.4}/data/book_store.xls +0 -0
- {datalab_python_sdk-0.1.2 → datalab_python_sdk-0.1.4}/data/chi_hind.png +0 -0
- {datalab_python_sdk-0.1.2 → datalab_python_sdk-0.1.4}/data/how_to_read.doc +0 -0
- {datalab_python_sdk-0.1.2 → datalab_python_sdk-0.1.4}/data/normandy.epub +0 -0
- {datalab_python_sdk-0.1.2 → datalab_python_sdk-0.1.4}/data/sample-1-sheet.xlsx +0 -0
- {datalab_python_sdk-0.1.2 → datalab_python_sdk-0.1.4}/data/thinkpython.pdf +0 -0
- {datalab_python_sdk-0.1.2 → datalab_python_sdk-0.1.4}/data/vibe.html +0 -0
- {datalab_python_sdk-0.1.2 → datalab_python_sdk-0.1.4}/datalab_sdk/exceptions.py +0 -0
- {datalab_python_sdk-0.1.2 → datalab_python_sdk-0.1.4}/datalab_sdk/mimetypes.py +0 -0
- {datalab_python_sdk-0.1.2 → datalab_python_sdk-0.1.4}/datalab_sdk/models.py +0 -0
- {datalab_python_sdk-0.1.2 → datalab_python_sdk-0.1.4}/integration/README.md +0 -0
- {datalab_python_sdk-0.1.2 → datalab_python_sdk-0.1.4}/integration/__init__.py +0 -0
- {datalab_python_sdk-0.1.2 → datalab_python_sdk-0.1.4}/integration/test_live_api.py +0 -0
- {datalab_python_sdk-0.1.2 → datalab_python_sdk-0.1.4}/integration/test_readme_examples.py +0 -0
- {datalab_python_sdk-0.1.2 → datalab_python_sdk-0.1.4}/poetry.lock +0 -0
- {datalab_python_sdk-0.1.2 → datalab_python_sdk-0.1.4}/pytest.ini +0 -0
- {datalab_python_sdk-0.1.2 → datalab_python_sdk-0.1.4}/tests/__init__.py +0 -0
- {datalab_python_sdk-0.1.2 → datalab_python_sdk-0.1.4}/tests/conftest.py +0 -0
- {datalab_python_sdk-0.1.2 → datalab_python_sdk-0.1.4}/tests/test_client_methods.py +0 -0
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datalab-python-sdk
|
|
3
|
+
Version: 0.1.4
|
|
4
|
+
Summary: SDK for the Datalab document intelligence API
|
|
5
|
+
Author-email: Datalab Team <hi@datalab.to>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Keywords: api,datalab,document-intelligence,sdk
|
|
9
|
+
Requires-Python: >=3.10
|
|
10
|
+
Requires-Dist: aiohttp>=3.12.14
|
|
11
|
+
Requires-Dist: click>=8.2.1
|
|
12
|
+
Requires-Dist: pydantic-settings<3.0.0,>=2.10.1
|
|
13
|
+
Requires-Dist: pydantic<3.0.0,>=2.11.7
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
|
|
16
|
+
# Datalab SDK
|
|
17
|
+
|
|
18
|
+
A Python SDK for the [Datalab API](https://www.datalab.to) - a document intelligence platform powered by [marker](https://github.com/VikParuchuri/marker) and [surya](https://github.com/VikParuchuri/surya).
|
|
19
|
+
|
|
20
|
+
See the full documentation at [https://documentation.datalab.to](https://documentation.datalab.to).
|
|
21
|
+
|
|
22
|
+
## Installation
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install datalab-python-sdk
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Quick Start
|
|
29
|
+
|
|
30
|
+
### Authentication
|
|
31
|
+
|
|
32
|
+
Get your API key from [https://www.datalab.to/app/keys](https://www.datalab.to/app/keys):
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
export DATALAB_API_KEY="your_api_key_here"
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
### Basic Usage
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from datalab_sdk import DatalabClient
|
|
42
|
+
|
|
43
|
+
client = DatalabClient() # use env var from above, or pass api_key="your_api_key_here"
|
|
44
|
+
|
|
45
|
+
# Convert PDF to markdown
|
|
46
|
+
result = client.convert("document.pdf")
|
|
47
|
+
print(result.markdown)
|
|
48
|
+
|
|
49
|
+
# OCR a document
|
|
50
|
+
ocr_result = client.ocr("document.pdf")
|
|
51
|
+
print(ocr_result.pages) # Per-page OCR results
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## CLI Usage
|
|
55
|
+
|
|
56
|
+
The SDK includes a command-line interface:
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
# Convert document to markdown
|
|
60
|
+
datalab convert document.pdf
|
|
61
|
+
|
|
62
|
+
# OCR with JSON output
|
|
63
|
+
datalab ocr document.pdf --output-format json
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## License
|
|
67
|
+
|
|
68
|
+
MIT License
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# Datalab SDK
|
|
2
|
+
|
|
3
|
+
A Python SDK for the [Datalab API](https://www.datalab.to) - a document intelligence platform powered by [marker](https://github.com/VikParuchuri/marker) and [surya](https://github.com/VikParuchuri/surya).
|
|
4
|
+
|
|
5
|
+
See the full documentation at [https://documentation.datalab.to](https://documentation.datalab.to).
|
|
6
|
+
|
|
7
|
+
## Installation
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install datalab-python-sdk
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## Quick Start
|
|
14
|
+
|
|
15
|
+
### Authentication
|
|
16
|
+
|
|
17
|
+
Get your API key from [https://www.datalab.to/app/keys](https://www.datalab.to/app/keys):
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
export DATALAB_API_KEY="your_api_key_here"
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
### Basic Usage
|
|
24
|
+
|
|
25
|
+
```python
|
|
26
|
+
from datalab_sdk import DatalabClient
|
|
27
|
+
|
|
28
|
+
client = DatalabClient() # use env var from above, or pass api_key="your_api_key_here"
|
|
29
|
+
|
|
30
|
+
# Convert PDF to markdown
|
|
31
|
+
result = client.convert("document.pdf")
|
|
32
|
+
print(result.markdown)
|
|
33
|
+
|
|
34
|
+
# OCR a document
|
|
35
|
+
ocr_result = client.ocr("document.pdf")
|
|
36
|
+
print(ocr_result.pages) # Per-page OCR results
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## CLI Usage
|
|
40
|
+
|
|
41
|
+
The SDK includes a command-line interface:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
# Convert document to markdown
|
|
45
|
+
datalab convert document.pdf
|
|
46
|
+
|
|
47
|
+
# OCR with JSON output
|
|
48
|
+
datalab ocr document.pdf --output-format json
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## License
|
|
52
|
+
|
|
53
|
+
MIT License
|
|
@@ -8,8 +8,9 @@ supporting document conversion, OCR, layout analysis, and table recognition.
|
|
|
8
8
|
from .client import DatalabClient, AsyncDatalabClient
|
|
9
9
|
from .exceptions import DatalabError, DatalabAPIError, DatalabTimeoutError
|
|
10
10
|
from .models import ConversionResult, OCRResult, ConvertOptions, OCROptions
|
|
11
|
+
from .settings import settings
|
|
11
12
|
|
|
12
|
-
__version__ =
|
|
13
|
+
__version__ = settings.VERSION
|
|
13
14
|
__all__ = [
|
|
14
15
|
"DatalabClient",
|
|
15
16
|
"AsyncDatalabClient",
|
|
@@ -0,0 +1,391 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Datalab SDK Command Line Interface
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import os
|
|
7
|
+
import sys
|
|
8
|
+
import asyncio
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Optional, List
|
|
11
|
+
import click
|
|
12
|
+
|
|
13
|
+
from datalab_sdk.client import AsyncDatalabClient
|
|
14
|
+
from datalab_sdk.mimetypes import SUPPORTED_EXTENSIONS
|
|
15
|
+
from datalab_sdk.models import OCROptions, ConvertOptions, ProcessingOptions
|
|
16
|
+
from datalab_sdk.exceptions import DatalabError
|
|
17
|
+
from datalab_sdk.settings import settings
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Common CLI options
|
|
21
|
+
def common_options(func):
    """Attach the options shared by every CLI command to ``func``.

    Args:
        func: The command callback to decorate.

    Returns:
        ``func`` wrapped with the api key, output directory, page limit,
        extension filter, concurrency, base URL, page range and cache options.
    """
    func = click.option("--api_key", required=False, help="Datalab API key")(func)
    func = click.option(
        "--output_dir", "-o", required=False, type=click.Path(), help="Output directory"
    )(func)
    func = click.option(
        "--max_pages", type=int, help="Maximum number of pages to process"
    )(func)
    func = click.option(
        "--extensions", help="Comma-separated list of file extensions (for directories)"
    )(func)
    func = click.option(
        "--max_concurrent",
        default=5,
        # This value sizes an asyncio.Semaphore: 0 would block every task
        # forever and a negative value raises, so reject both up front.
        type=click.IntRange(min=1),
        help="Maximum concurrent requests",
    )(func)
    func = click.option(
        "--base_url", default=settings.DATALAB_HOST, help="API base URL"
    )(func)
    func = click.option(
        "--page_range", help='Page range to process (e.g., "0-2" or "0,1,2")'
    )(func)
    # Declared as a flag so that a bare ``--skip_cache`` yields a bool, matching
    # how the commands annotate and forward it; without ``is_flag`` click
    # expects a string value after the option.
    func = click.option(
        "--skip_cache", is_flag=True, help="Skip the cache when running inference"
    )(func)
    return func
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def marker_options(func):
    """Attach the options that only the marker/convert command accepts."""
    # Listed in the exact order in which they are applied to ``func``.
    decorators = (
        click.option(
            "--format",
            "output_format",
            default="markdown",
            type=click.Choice(["markdown", "html", "json"]),
            help="Output format",
        ),
        click.option("--force_ocr", is_flag=True, help="Force OCR on every page"),
        click.option(
            "--format_lines",
            is_flag=True,
            help="Partially OCR lines for better formatting",
        ),
        click.option("--paginate", is_flag=True, help="Add page delimiters to output"),
        click.option("--use_llm", is_flag=True, help="Use LLM to enhance accuracy"),
        click.option(
            "--strip_existing_ocr",
            is_flag=True,
            help="Remove existing OCR text and redo OCR",
        ),
        click.option(
            "--disable_image_extraction",
            is_flag=True,
            help="Disable extraction of images",
        ),
        click.option(
            "--block_correction_prompt", help="Custom prompt for block correction"
        ),
        click.option("--page_schema", help="Schema to set to do structured extraction"),
    )
    for decorate in decorators:
        func = decorate(func)
    return func
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def find_files_in_directory(
    directory: Path, extensions: Optional[List[str]] = None
) -> List[Path]:
    """Recursively collect the files under ``directory`` with a wanted suffix.

    Args:
        directory: Root directory to walk.
        extensions: Suffixes to keep, including the leading dot (e.g. ``".pdf"``).
            Defaults to ``SUPPORTED_EXTENSIONS`` when ``None``.

    Returns:
        The matching file paths, in the order ``Path.rglob`` yields them.
    """
    if extensions is None:
        extensions = SUPPORTED_EXTENSIONS

    # File suffixes are lower-cased before comparison, so normalise the filter
    # the same way; otherwise an entry such as ".PDF" could never match.
    wanted = {ext.lower() for ext in extensions}

    return [
        candidate
        for candidate in directory.rglob("*")
        if candidate.is_file() and candidate.suffix.lower() in wanted
    ]
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
async def process_files_async(
    files: List[Path],
    output_dir: Path,
    method: str,
    options: Optional[ProcessingOptions] = None,
    max_concurrent: int = 5,
    api_key: str | None = None,
    base_url: str | None = None,
) -> List[dict]:
    """Run ``convert`` or ``ocr`` over ``files`` with bounded concurrency.

    Each file gets its own client and an output path of
    ``output_dir/<stem>/<stem>``. Any exception raised while handling a file
    is captured in that file's result entry instead of propagating.

    Returns:
        One dict per input file with the keys ``file_path``, ``output_path``,
        ``success``, ``error`` and ``page_count``.
    """
    limiter = asyncio.Semaphore(max_concurrent)

    async def process_single_file(file_path: Path) -> dict:
        async with limiter:
            try:
                # Every document is written into a folder named after itself
                stem = file_path.stem
                output_path = output_dir / stem / stem

                async with AsyncDatalabClient(
                    api_key=api_key, base_url=base_url
                ) as client:
                    # Anything other than "convert" is treated as OCR
                    run = client.convert if method == "convert" else client.ocr
                    result = await run(
                        file_path, options=options, save_output=output_path
                    )

                return {
                    "file_path": str(file_path),
                    "output_path": str(output_path),
                    "success": result.success,
                    "error": result.error,
                    "page_count": result.page_count,
                }
            except Exception as e:
                return {
                    "file_path": str(file_path),
                    "output_path": None,
                    "success": False,
                    "error": str(e),
                    "page_count": None,
                }

    # Launch everything at once; the semaphore caps how many run concurrently
    return await asyncio.gather(*[process_single_file(item) for item in files])
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def setup_output_directory(output_dir: Optional[str]) -> Path:
    """Return the output directory as a ``Path``, creating it when missing.

    The current working directory is used when ``output_dir`` is ``None``.
    """
    target = Path(os.getcwd() if output_dir is None else output_dir)
    target.mkdir(parents=True, exist_ok=True)
    return target
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def parse_extensions(extensions: Optional[str]) -> Optional[List[str]]:
    """Parse a comma-separated extension list into normalised suffixes.

    Each entry is stripped, lower-cased and given a leading dot. Empty entries
    (for example from a trailing comma) are dropped.

    Args:
        extensions: Raw option value such as ``"pdf, .DOCX"``, or ``None``.

    Returns:
        The normalised suffixes (e.g. ``[".pdf", ".docx"]``), or ``None`` when
        no usable extension was supplied.
    """
    if not extensions:
        return None

    parsed = []
    for raw in extensions.split(","):
        # Callers compare against lower-cased file suffixes, so lower-case here
        ext = raw.strip().lower()
        if not ext or ext == ".":
            # Skip blanks so they do not become a bogus "." suffix
            continue
        parsed.append(ext if ext.startswith(".") else f".{ext}")

    return parsed or None
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def get_files_to_process(
    path: Path, file_extensions: Optional[List[str]]
) -> List[Path]:
    """Resolve ``path`` to the list of files that should be processed.

    Anything that is not a regular file is searched as a directory. A single
    file is returned as-is unless an extension filter is given and the file's
    lower-cased suffix is not in it, in which case the process exits with
    status 1.
    """
    if not path.is_file():
        # Directory processing
        return find_files_in_directory(path, file_extensions)

    # Single file processing
    rejected = bool(file_extensions) and path.suffix.lower() not in file_extensions
    if rejected:
        click.echo(f"❌ Skipping {path}: unsupported file type", err=True)
        sys.exit(1)
    return [path]
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def show_results(results: List[dict], operation: str, output_dir: Path):
    """Print a summary of a processing run.

    Reports how many entries succeeded, lists each failed file with its error
    message, and finally prints the output directory.
    """
    failures = [result for result in results if not result["success"]]
    failed = len(failures)
    successful = len(results) - failed

    click.echo(f"\n📊 {operation} Summary:")
    click.echo(f" ✅ Successfully processed: {successful} files")

    if failures:
        click.echo(f" ❌ Failed: {failed} files")

        # Show failed files
        click.echo("\n Failed files:")
        for result in failures:
            click.echo(f" - {result['file_path']}: {result['error']}")

    click.echo(f"\n📁 Output saved to: {output_dir}")
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def process_documents(
    path: str,
    method: str,
    api_key: Optional[str],
    output_dir: Optional[str],
    max_pages: Optional[int],
    extensions: Optional[str],
    max_concurrent: int,
    base_url: str,
    page_range: Optional[str],
    skip_cache: bool,
    # Convert-specific options
    output_format: Optional[str] = None,
    force_ocr: bool = False,
    format_lines: bool = False,
    paginate: bool = False,
    use_llm: bool = False,
    strip_existing_ocr: bool = False,
    disable_image_extraction: bool = False,
    block_correction_prompt: Optional[str] = None,
    page_schema: Optional[str] = None,
):
    """Shared implementation behind the ``convert`` and ``ocr`` commands.

    Resolves the API key and base URL (falling back to ``settings``), prepares
    the output directory, gathers the input files, builds the options object
    for ``method``, runs the async processing and prints a summary.

    Exits with status 1 when no files are found or a ``DatalabError`` is
    raised, including when no API key is available.
    """
    try:
        # Validate inputs
        if api_key is None:
            api_key = settings.DATALAB_API_KEY
        if api_key is None:
            raise DatalabError(
                "You must either pass in an api key via --api_key or set the DATALAB_API_KEY env variable."
            )
        if base_url is None:
            base_url = settings.DATALAB_HOST

        output_dir = setup_output_directory(output_dir)

        # Get files to process
        path = Path(path)
        to_process = get_files_to_process(path, parse_extensions(extensions))
        if not to_process:
            click.echo(f"❌ No supported files found in {path}", err=True)
            sys.exit(1)

        click.echo(f"📂 Found {len(to_process)} files to process")

        # Options that both methods understand
        shared = dict(max_pages=max_pages, page_range=page_range, skip_cache=skip_cache)

        # Create processing options based on method
        if method == "convert":
            operation = "Conversion"
            options = ConvertOptions(
                output_format=output_format,
                force_ocr=force_ocr,
                format_lines=format_lines,
                paginate=paginate,
                use_llm=use_llm,
                strip_existing_ocr=strip_existing_ocr,
                disable_image_extraction=disable_image_extraction,
                block_correction_prompt=block_correction_prompt,
                page_schema=page_schema,
                **shared,
            )
        else:  # method == "ocr"
            operation = "OCR"
            options = OCROptions(**shared)

        results = asyncio.run(
            process_files_async(
                to_process,
                output_dir,
                method,
                options=options,
                max_concurrent=max_concurrent,
                api_key=api_key,
                base_url=base_url,
            )
        )

        # Show results
        show_results(results, operation, output_dir)

    except DatalabError as e:
        click.echo(f"❌ Error: {e}", err=True)
        sys.exit(1)
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
@click.group()
@click.version_option(version=settings.VERSION)
def cli():
    """Command-line interface for the Datalab document intelligence API."""
    # The group carries no logic of its own; the docstring gives click a
    # description to display as the group's help text.
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
@click.command()
@click.argument("path", type=click.Path(exists=True))
@common_options
@marker_options
def convert(
    path: str,
    api_key: str,
    output_dir: str,
    max_pages: Optional[int],
    extensions: Optional[str],
    max_concurrent: int,
    base_url: str,
    page_range: Optional[str],
    skip_cache: bool,
    output_format: str,
    force_ocr: bool,
    format_lines: bool,
    paginate: bool,
    use_llm: bool,
    strip_existing_ocr: bool,
    disable_image_extraction: bool,
    block_correction_prompt: Optional[str],
    page_schema: Optional[str],
):
    """Convert documents to markdown, HTML, or JSON"""
    # Values from the options every command shares
    common = dict(
        path=path,
        api_key=api_key,
        output_dir=output_dir,
        max_pages=max_pages,
        extensions=extensions,
        max_concurrent=max_concurrent,
        base_url=base_url,
        page_range=page_range,
        skip_cache=skip_cache,
    )
    # Values from the convert-only options
    marker = dict(
        output_format=output_format,
        force_ocr=force_ocr,
        format_lines=format_lines,
        paginate=paginate,
        use_llm=use_llm,
        strip_existing_ocr=strip_existing_ocr,
        disable_image_extraction=disable_image_extraction,
        block_correction_prompt=block_correction_prompt,
        page_schema=page_schema,
    )
    process_documents(method="convert", **common, **marker)
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
@click.command()
@click.argument("path", type=click.Path(exists=True))
@common_options
def ocr(
    path: str,
    api_key: str,
    output_dir: str,
    max_pages: Optional[int],
    extensions: Optional[str],
    max_concurrent: int,
    base_url: str,
    page_range: Optional[str],
    skip_cache: bool,
):
    """Perform OCR on documents"""
    # Forward every shared option unchanged; only the method name is fixed here
    forwarded = dict(
        path=path,
        api_key=api_key,
        output_dir=output_dir,
        max_pages=max_pages,
        extensions=extensions,
        max_concurrent=max_concurrent,
        base_url=base_url,
        page_range=page_range,
        skip_cache=skip_cache,
    )
    process_documents(method="ocr", **forwarded)
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
# Register each subcommand on the top-level CLI group
for _subcommand in (convert, ocr):
    cli.add_command(_subcommand)

# Allow running the module directly as a script
if __name__ == "__main__":
    cli()
|
|
@@ -68,7 +68,7 @@ class AsyncDatalabClient:
|
|
|
68
68
|
timeout=timeout,
|
|
69
69
|
headers={
|
|
70
70
|
"X-Api-Key": self.api_key,
|
|
71
|
-
"User-Agent": "datalab-python-sdk/
|
|
71
|
+
"User-Agent": f"datalab-python-sdk/{settings.VERSION}",
|
|
72
72
|
},
|
|
73
73
|
)
|
|
74
74
|
|
|
@@ -271,11 +271,10 @@ class DatalabClient:
|
|
|
271
271
|
"""Run async coroutine in sync context"""
|
|
272
272
|
try:
|
|
273
273
|
loop = asyncio.get_event_loop()
|
|
274
|
+
return loop.run_until_complete(self._async_wrapper(coro))
|
|
274
275
|
except RuntimeError:
|
|
275
|
-
loop
|
|
276
|
-
asyncio.
|
|
277
|
-
|
|
278
|
-
return loop.run_until_complete(self._async_wrapper(coro))
|
|
276
|
+
# No event loop exists, create and clean up
|
|
277
|
+
return asyncio.run(self._async_wrapper(coro))
|
|
279
278
|
|
|
280
279
|
async def _async_wrapper(self, coro):
|
|
281
280
|
"""Wrapper to ensure session management"""
|
|
@@ -1,24 +1,29 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "datalab-python-sdk"
|
|
3
|
-
|
|
3
|
+
authors = [
|
|
4
|
+
{name = "Datalab Team", email = "hi@datalab.to"}
|
|
5
|
+
]
|
|
6
|
+
readme = "README.md"
|
|
7
|
+
license = "MIT"
|
|
8
|
+
repository = "https://github.com/datalab-to/sdk"
|
|
9
|
+
keywords = ["datalab", "sdk", "document-intelligence", "api"]
|
|
10
|
+
version = "0.1.4"
|
|
4
11
|
description = "SDK for the Datalab document intelligence API"
|
|
5
12
|
requires-python = ">=3.10"
|
|
6
13
|
dependencies = [
|
|
7
14
|
"aiohttp>=3.12.14",
|
|
8
15
|
"click>=8.2.1",
|
|
9
|
-
"pydantic
|
|
10
|
-
"pydantic-settings
|
|
11
|
-
"pytest-asyncio>=1.0.0",
|
|
16
|
+
"pydantic>=2.11.7,<3.0.0",
|
|
17
|
+
"pydantic-settings>=2.10.1,<3.0.0",
|
|
12
18
|
]
|
|
13
19
|
|
|
14
|
-
|
|
15
20
|
[project.scripts]
|
|
16
21
|
datalab = "datalab_sdk.cli:cli"
|
|
17
22
|
|
|
18
|
-
[project.
|
|
23
|
+
[project.dev-dependencies]
|
|
19
24
|
test = [
|
|
20
25
|
"pytest>=7.4.0",
|
|
21
|
-
"pytest-asyncio>=0.
|
|
26
|
+
"pytest-asyncio>=1.0.0",
|
|
22
27
|
"pytest-mock>=3.11.0",
|
|
23
28
|
"pytest-cov>=4.1.0",
|
|
24
29
|
"aiofiles>=23.2.0",
|
|
@@ -33,8 +38,11 @@ packages = ["datalab_sdk"]
|
|
|
33
38
|
|
|
34
39
|
[dependency-groups]
|
|
35
40
|
dev = [
|
|
41
|
+
"aiohttp>=3.12.14",
|
|
42
|
+
"click>=8.2.1",
|
|
36
43
|
"pre-commit>=4.2.0",
|
|
37
44
|
"pytest>=8.4.1",
|
|
45
|
+
"pytest-asyncio>=1.0.0",
|
|
38
46
|
"pytest-xdist>=3.8.0",
|
|
39
47
|
"ruff>=0.12.2",
|
|
40
48
|
]
|