datalab-python-sdk 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datalab_python_sdk-0.1.0.dist-info/METADATA +17 -0
- datalab_python_sdk-0.1.0.dist-info/RECORD +12 -0
- datalab_python_sdk-0.1.0.dist-info/WHEEL +4 -0
- datalab_python_sdk-0.1.0.dist-info/entry_points.txt +2 -0
- datalab_python_sdk-0.1.0.dist-info/licenses/LICENSE +18 -0
- datalab_sdk/__init__.py +26 -0
- datalab_sdk/cli.py +416 -0
- datalab_sdk/client.py +301 -0
- datalab_sdk/exceptions.py +38 -0
- datalab_sdk/mimetypes.py +18 -0
- datalab_sdk/models.py +155 -0
- datalab_sdk/settings.py +15 -0
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datalab-python-sdk
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Auto-generated SDK for Datalab API
|
|
5
|
+
License-File: LICENSE
|
|
6
|
+
Requires-Python: >=3.10
|
|
7
|
+
Requires-Dist: aiohttp>=3.12.14
|
|
8
|
+
Requires-Dist: click>=8.2.1
|
|
9
|
+
Requires-Dist: pydantic-settings<3.0.0,>=2.10.1
|
|
10
|
+
Requires-Dist: pydantic<3.0.0,>=2.11.7
|
|
11
|
+
Requires-Dist: pytest-asyncio>=1.0.0
|
|
12
|
+
Provides-Extra: test
|
|
13
|
+
Requires-Dist: aiofiles>=23.2.0; extra == 'test'
|
|
14
|
+
Requires-Dist: pytest-asyncio>=0.21.0; extra == 'test'
|
|
15
|
+
Requires-Dist: pytest-cov>=4.1.0; extra == 'test'
|
|
16
|
+
Requires-Dist: pytest-mock>=3.11.0; extra == 'test'
|
|
17
|
+
Requires-Dist: pytest>=7.4.0; extra == 'test'
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
datalab_sdk/__init__.py,sha256=Ou4cGIih1julnRVMmXLipBiQibYGGK_f4DtQggM286E,647
|
|
2
|
+
datalab_sdk/cli.py,sha256=aoEunI4h7hOfS3lnGeC55sIEfMz8avKG6rHtjpH4KJw,13191
|
|
3
|
+
datalab_sdk/client.py,sha256=UoP8cZxZOxzuzaRFekHdwKofYVOqQXk2sKRCOTgj1qk,10065
|
|
4
|
+
datalab_sdk/exceptions.py,sha256=MDfNd-l152OUVhZSfEsE95McJq-IuOsrEsEQ4CRMbEI,785
|
|
5
|
+
datalab_sdk/mimetypes.py,sha256=kgAM5ZRIcbjwBgDeMs4LsjtlaYpbDyRLDkZGy8vpruc,698
|
|
6
|
+
datalab_sdk/models.py,sha256=e2_wHonABi_-TVrfe77gniK1Hh-EjepTrItmbRRQMlY,4954
|
|
7
|
+
datalab_sdk/settings.py,sha256=kVa9mlfkbOHX0VyX154UmeN_um1BPNaVBgRQ-DkaUNQ,338
|
|
8
|
+
datalab_python_sdk-0.1.0.dist-info/METADATA,sha256=Z0qLU5jMAdtZnm9lsA6XxZsEvHMo0YMcKaAOqaEFQIQ,609
|
|
9
|
+
datalab_python_sdk-0.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
10
|
+
datalab_python_sdk-0.1.0.dist-info/entry_points.txt,sha256=rqWxO21UV57JxxyeF20iBTxHpzBYOA69hCHIqwdn-Ok,48
|
|
11
|
+
datalab_python_sdk-0.1.0.dist-info/licenses/LICENSE,sha256=XGjT-Uz6t7UZzr3JCZfilsGcZEgMm_1aONcRunohxWg,1064
|
|
12
|
+
datalab_python_sdk-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
Copyright 2025 Endless Labs Inc
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
|
4
|
+
this software and associated documentation files (the "Software"), to deal in
|
|
5
|
+
the Software without restriction, including without limitation the rights to
|
|
6
|
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
|
7
|
+
the Software, and to permit persons to whom the Software is furnished to do so,
|
|
8
|
+
subject to the following conditions:
|
|
9
|
+
|
|
10
|
+
The above copyright notice and this permission notice shall be included in all
|
|
11
|
+
copies or substantial portions of the Software.
|
|
12
|
+
|
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
|
15
|
+
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
|
16
|
+
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
|
17
|
+
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
|
18
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
datalab_sdk/__init__.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Datalab SDK - Python client for Datalab API
|
|
3
|
+
|
|
4
|
+
This SDK provides both synchronous and asynchronous interfaces to the Datalab API,
|
|
5
|
+
supporting document conversion, OCR, layout analysis, and table recognition.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from .client import DatalabClient, AsyncDatalabClient
|
|
9
|
+
from .exceptions import DatalabError, DatalabAPIError, DatalabTimeoutError
|
|
10
|
+
from .models import (
|
|
11
|
+
ConversionResult,
|
|
12
|
+
OCRResult,
|
|
13
|
+
ProcessingOptions,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
__version__ = "1.0.0"
|
|
17
|
+
__all__ = [
|
|
18
|
+
"DatalabClient",
|
|
19
|
+
"AsyncDatalabClient",
|
|
20
|
+
"DatalabError",
|
|
21
|
+
"DatalabAPIError",
|
|
22
|
+
"DatalabTimeoutError",
|
|
23
|
+
"ConversionResult",
|
|
24
|
+
"OCRResult",
|
|
25
|
+
"ProcessingOptions",
|
|
26
|
+
]
|
datalab_sdk/cli.py
ADDED
|
@@ -0,0 +1,416 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Datalab SDK Command Line Interface
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import os
|
|
7
|
+
import sys
|
|
8
|
+
import asyncio
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Optional, List
|
|
11
|
+
import click
|
|
12
|
+
|
|
13
|
+
from datalab_sdk.client import DatalabClient, AsyncDatalabClient
|
|
14
|
+
from datalab_sdk.models import ProcessingOptions
|
|
15
|
+
from datalab_sdk.exceptions import DatalabError
|
|
16
|
+
from datalab_sdk.settings import settings
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def get_supported_extensions() -> List[str]:
    """Return every file extension the CLI knows how to process."""
    # Grouped by document family; concatenation preserves the original order.
    image_types = [".png", ".jpg", ".jpeg", ".gif", ".tiff", ".webp"]
    office_types = [".docx", ".doc", ".xlsx", ".xls", ".pptx", ".ppt"]
    markup_types = [".html", ".epub"]
    return [".pdf", *image_types, *office_types, *markup_types]
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def find_files_in_directory(
    directory: Path, extensions: Optional[List[str]] = None
) -> List[Path]:
    """Recursively collect files under *directory* with a supported suffix.

    Suffix matching is case-insensitive; when *extensions* is None the full
    set of supported extensions is used.
    """
    allowed = get_supported_extensions() if extensions is None else extensions
    return [
        candidate
        for candidate in directory.rglob("*")
        if candidate.is_file() and candidate.suffix.lower() in allowed
    ]
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
async def process_files_async(
    files: List[Path],
    output_dir: Path,
    method: str,
    options: Optional[ProcessingOptions] = None,
    max_pages: Optional[int] = None,
    max_concurrent: int = 5,
) -> List[dict]:
    """Process files asynchronously.

    Spawns one task per file, capped at *max_concurrent* in-flight requests
    via a semaphore. `method` selects `client.convert` ("convert") or
    `client.ocr` (anything else). Each task yields a summary dict with keys
    file_path, output_path, success, error, page_count; per-file failures are
    captured in that dict rather than aborting the whole batch.
    """
    semaphore = asyncio.Semaphore(max_concurrent)

    async def process_single_file(file_path: Path) -> dict:
        async with semaphore:
            try:
                # Create output path: <output_dir>/<stem> (suffix added by the
                # client's save logic).
                # NOTE(review): files from different subdirectories that share
                # a stem collide on the same output path — confirm intended.
                relative_path = file_path.name
                output_path = output_dir / Path(relative_path).stem

                # A fresh client (and HTTP session) is opened per file; the
                # client reads its key/host from process-global settings.
                async with AsyncDatalabClient() as client:
                    if method == "convert":
                        result = await client.convert(
                            file_path, options=options, save_output=output_path
                        )
                    else:  # method == 'ocr'
                        result = await client.ocr(
                            file_path, max_pages=max_pages, save_output=output_path
                        )

                return {
                    "file_path": str(file_path),
                    "output_path": str(output_path),
                    "success": result.success,
                    "error": result.error,
                    "page_count": result.page_count,
                }
            except Exception as e:
                # Swallow the exception into the per-file summary so one bad
                # file does not cancel the remaining tasks.
                return {
                    "file_path": str(file_path),
                    "output_path": None,
                    "success": False,
                    "error": str(e),
                    "page_count": None,
                }

    # Process all files concurrently
    tasks = [process_single_file(file_path) for file_path in files]
    results = await asyncio.gather(*tasks)

    return results
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def process_single_file_sync(
    file_path: Path,
    output_dir: Path,
    method: str,
    options: Optional[ProcessingOptions] = None,
    max_pages: Optional[int] = None,
) -> dict:
    """Process a single file synchronously.

    `method` selects `client.convert` ("convert") or `client.ocr` (anything
    else). Returns a summary dict (file_path, output_path, success, error,
    page_count); any exception is captured into the dict instead of raised.
    """
    try:
        # Output base is <output_dir>/<stem>, matching the async path in
        # process_files_async. The previous code saved to the doubly-nested
        # <output_dir>/<stem>/<stem> while *reporting* <output_dir>/<stem>,
        # so the printed output path never matched the file on disk.
        output_path = output_dir / file_path.stem

        client = DatalabClient()
        if method == "convert":
            result = client.convert(file_path, options=options, save_output=output_path)
        else:  # method == 'ocr'
            result = client.ocr(file_path, max_pages=max_pages, save_output=output_path)

        return {
            "file_path": str(file_path),
            "output_path": str(output_path),
            "success": result.success,
            "error": result.error,
            "page_count": result.page_count,
        }
    except Exception as e:
        # Report the failure in the summary; callers inspect "success".
        return {
            "file_path": str(file_path),
            "output_path": None,
            "success": False,
            "error": str(e),
            "page_count": None,
        }
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
@click.group()
# Report the real package version (wheel metadata is 0.1.0); the previous
# hard-coded "1.0.0" misreported it via `--version`.
@click.version_option(version="0.1.0")
def cli():
    """Datalab SDK - Command line interface for document processing"""
    pass
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
@click.command()
@click.argument("path", type=click.Path(exists=True))
@click.option("--api_key", required=False, help="Datalab API key")
@click.option(
    "--output_dir", "-o", required=False, type=click.Path(), help="Output directory"
)
@click.option(
    "--format",
    "output_format",
    default="markdown",
    type=click.Choice(["markdown", "html", "json"]),
    help="Output format",
)
@click.option("--max_pages", type=int, help="Maximum number of pages to process")
@click.option("--force_ocr", is_flag=True, help="Force OCR on every page")
@click.option(
    "--format_lines", is_flag=True, help="Partially OCR lines for better formatting"
)
@click.option("--paginate", is_flag=True, help="Add page delimiters to output")
@click.option("--use_llm", is_flag=True, help="Use LLM to enhance accuracy")
@click.option("--page_range", help='Page range to process (e.g., "0-2" or "0,1,2")')
@click.option(
    "--extensions", help="Comma-separated list of file extensions (for directories)"
)
@click.option(
    "--max_concurrent", default=5, type=int, help="Maximum concurrent requests"
)
@click.option("--base_url", default=settings.DATALAB_HOST, help="API base URL")
def convert(
    path: str,
    api_key: str,
    output_dir: str,
    output_format: str,
    max_pages: Optional[int],
    force_ocr: bool,
    format_lines: bool,
    paginate: bool,
    use_llm: bool,
    page_range: Optional[str],
    extensions: Optional[str],
    max_concurrent: int,
    base_url: str,
):
    """Convert documents to markdown, HTML, or JSON"""

    # Resolve the API key: explicit flag wins, else the env-backed setting.
    if api_key is None:
        api_key = settings.DATALAB_API_KEY
    if api_key is None:
        raise DatalabError(
            "You must either pass in an api key via --api_key or set the DATALAB_API_KEY env variable."
        )

    path = Path(path)

    # Default the output directory to the current working directory.
    if output_dir is None:
        output_dir = os.getcwd()

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Parse extensions: normalize "pdf, png" / ".pdf,.png" to [".pdf", ".png"].
    file_extensions = None
    if extensions:
        file_extensions = [ext.strip() for ext in extensions.split(",")]
        file_extensions = [
            ext if ext.startswith(".") else f".{ext}" for ext in file_extensions
        ]

    # Create processing options (max_pages also travels inside the options,
    # so the extra max_pages argument below only matters for the OCR path).
    options = ProcessingOptions(
        output_format=output_format,
        max_pages=max_pages,
        force_ocr=force_ocr,
        format_lines=format_lines,
        paginate=paginate,
        use_llm=use_llm,
        page_range=page_range,
    )

    try:
        # Set API key and base URL in client
        # NOTE(review): this mutates process-global settings that the client
        # constructors read — concurrent use of different keys is unsupported.
        settings.DATALAB_API_KEY = api_key
        settings.DATALAB_HOST = base_url

        if path.is_file():
            # Single file processing
            if file_extensions and path.suffix.lower() not in file_extensions:
                click.echo(f"❌ Skipping {path}: unsupported file type", err=True)
                sys.exit(1)

            result = process_single_file_sync(path, output_dir, "convert", options)

            if result["success"]:
                click.echo(f"✅ Successfully converted {result['file_path']}")
                if result["page_count"]:
                    click.echo(f"   📄 Processed {result['page_count']} pages")
                if result["output_path"]:
                    click.echo(f"   📁 Output saved to: {result['output_path']}")
            else:
                click.echo(
                    f"❌ Failed to convert {result['file_path']}: {result['error']}",
                    err=True,
                )
                sys.exit(1)
        else:
            # Directory processing
            files = find_files_in_directory(path, file_extensions)

            if not files:
                click.echo(f"❌ No supported files found in {path}", err=True)
                sys.exit(1)

            click.echo(f"📂 Found {len(files)} files to process")

            # Process files asynchronously
            results = asyncio.run(
                process_files_async(
                    files, output_dir, "convert", options, max_pages, max_concurrent
                )
            )

            # Show results
            successful = sum(1 for r in results if r["success"])
            failed = len(results) - successful

            click.echo("\n📊 Conversion Summary:")
            click.echo(f"   ✅ Successfully converted: {successful} files")
            if failed > 0:
                click.echo(f"   ❌ Failed: {failed} files")

                # Show failed files
                click.echo("\n   Failed files:")
                for result in results:
                    if not result["success"]:
                        click.echo(f"   - {result['file_path']}: {result['error']}")

            click.echo(f"\n📁 Output saved to: {output_dir}")

    except DatalabError as e:
        click.echo(f"❌ Error: {e}", err=True)
        sys.exit(1)
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
@click.command()
@click.argument("path", type=click.Path(exists=True))
@click.option("--api_key", required=False, help="Datalab API key")
@click.option(
    "--output_dir", "-o", required=False, type=click.Path(), help="Output directory"
)
@click.option("--max_pages", type=int, help="Maximum number of pages to process")
@click.option(
    "--extensions", help="Comma-separated list of file extensions (for directories)"
)
@click.option(
    "--max_concurrent", default=5, type=int, help="Maximum concurrent requests"
)
@click.option("--base_url", default=settings.DATALAB_HOST, help="API base URL")
def ocr(
    path: str,
    api_key: str,
    output_dir: str,
    max_pages: Optional[int],
    extensions: Optional[str],
    max_concurrent: int,
    base_url: str,
):
    """Perform OCR on documents"""

    # Resolve the API key: explicit flag wins, else the env-backed setting.
    if api_key is None:
        api_key = settings.DATALAB_API_KEY
    if api_key is None:
        raise DatalabError(
            "You must either pass in an api key via --api_key or set the DATALAB_API_KEY env variable."
        )

    path = Path(path)

    # Default the output directory to the current working directory.
    if output_dir is None:
        output_dir = os.getcwd()

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Parse extensions: normalize "pdf, png" / ".pdf,.png" to [".pdf", ".png"].
    file_extensions = None
    if extensions:
        file_extensions = [ext.strip() for ext in extensions.split(",")]
        file_extensions = [
            ext if ext.startswith(".") else f".{ext}" for ext in file_extensions
        ]

    try:
        # Set API key and base URL in client
        # NOTE(review): this mutates process-global settings that the client
        # constructors read — concurrent use of different keys is unsupported.
        settings.DATALAB_API_KEY = api_key
        settings.DATALAB_HOST = base_url

        if path.is_file():
            # Single file processing
            if file_extensions and path.suffix.lower() not in file_extensions:
                click.echo(f"❌ Skipping {path}: unsupported file type", err=True)
                sys.exit(1)

            result = process_single_file_sync(
                path, output_dir, "ocr", max_pages=max_pages
            )

            if result["success"]:
                click.echo(f"✅ Successfully performed OCR on {result['file_path']}")
                if result["page_count"]:
                    click.echo(f"   📄 Processed {result['page_count']} pages")
                if result["output_path"]:
                    click.echo(f"   📁 Output saved to: {result['output_path']}")
            else:
                click.echo(
                    f"❌ Failed OCR on {result['file_path']}: {result['error']}",
                    err=True,
                )
                sys.exit(1)
        else:
            # Directory processing
            files = find_files_in_directory(path, file_extensions)

            if not files:
                click.echo(f"❌ No supported files found in {path}", err=True)
                sys.exit(1)

            click.echo(f"📂 Found {len(files)} files to process")

            # Process files asynchronously
            results = asyncio.run(
                process_files_async(
                    files,
                    output_dir,
                    "ocr",
                    max_pages=max_pages,
                    max_concurrent=max_concurrent,
                )
            )

            # Show results
            successful = sum(1 for r in results if r["success"])
            failed = len(results) - successful

            click.echo("\n📊 OCR Summary:")
            click.echo(f"   ✅ Successfully processed: {successful} files")
            if failed > 0:
                click.echo(f"   ❌ Failed: {failed} files")

                # Show failed files
                click.echo("\n   Failed files:")
                for result in results:
                    if not result["success"]:
                        click.echo(f"   - {result['file_path']}: {result['error']}")

            click.echo(f"\n📁 Output saved to: {output_dir}")

    except DatalabError as e:
        click.echo(f"❌ Error: {e}", err=True)
        sys.exit(1)
|
|
408
|
+
|
|
409
|
+
|
|
410
|
+
# Add commands to CLI group
cli.add_command(convert)
cli.add_command(ocr)


# Script entry point: invoking the module directly dispatches to the click
# group (the wheel also exposes this via an entry_points console script).
if __name__ == "__main__":
    cli()
|
datalab_sdk/client.py
ADDED
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Datalab API client - async core with sync wrapper
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import mimetypes
|
|
7
|
+
import aiohttp
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Union, Optional, Dict, Any
|
|
10
|
+
|
|
11
|
+
from datalab_sdk.exceptions import (
|
|
12
|
+
DatalabAPIError,
|
|
13
|
+
DatalabTimeoutError,
|
|
14
|
+
DatalabFileError,
|
|
15
|
+
)
|
|
16
|
+
from datalab_sdk.mimetypes import MIMETYPE_MAP
|
|
17
|
+
from datalab_sdk.models import ConversionResult, OCRResult, ProcessingOptions
|
|
18
|
+
from datalab_sdk.settings import settings
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class AsyncDatalabClient:
    """Asynchronous client for the Datalab API.

    Wraps a lazily-created aiohttp session. Prefer using it as an async
    context manager so the session is opened and closed deterministically:

        async with AsyncDatalabClient() as client:
            result = await client.convert("doc.pdf")
    """

    def __init__(
        self,
        api_key: str | None = None,
        base_url: str = settings.DATALAB_HOST,
        timeout: int = 300,
    ):
        """
        Initialize the async Datalab client

        Args:
            api_key: Your Datalab API key (falls back to settings.DATALAB_API_KEY)
            base_url: Base URL for the API (default: https://www.datalab.to)
            timeout: Default timeout for requests in seconds

        Raises:
            DatalabAPIError: if no API key is given or configured.
        """
        if api_key is None:
            api_key = settings.DATALAB_API_KEY
        if api_key is None:
            raise DatalabAPIError("You must pass in an api_key or set DATALAB_API_KEY.")

        self.api_key = api_key
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self._session = None  # lazily created aiohttp.ClientSession

    async def __aenter__(self):
        """Async context manager entry"""
        await self._ensure_session()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit"""
        await self.close()

    async def _ensure_session(self):
        """Create the aiohttp session on first use (idempotent)."""
        if self._session is None:
            timeout = aiohttp.ClientTimeout(total=self.timeout)
            self._session = aiohttp.ClientSession(
                timeout=timeout,
                headers={
                    "X-Api-Key": self.api_key,
                    "User-Agent": "datalab-python-sdk/0.1.0",
                },
            )

    async def close(self):
        """Close the aiohttp session"""
        if self._session:
            await self._session.close()
            self._session = None

    async def _make_request(
        self, method: str, endpoint: str, **kwargs
    ) -> Dict[str, Any]:
        """Make an async request to the API and return the decoded JSON body.

        Args:
            method: HTTP method ("GET", "POST", ...).
            endpoint: absolute URL, or a path resolved against base_url.

        Raises:
            DatalabTimeoutError: if the request exceeds the client timeout.
            DatalabAPIError: on an HTTP error status or transport failure.
        """
        await self._ensure_session()

        url = endpoint
        if not endpoint.startswith("http"):
            url = f"{self.base_url}/{endpoint.lstrip('/')}"

        try:
            async with self._session.request(method, url, **kwargs) as response:
                if response.status >= 400:
                    # Read the error payload while the connection is still
                    # open. (The old code called raise_for_status() and then
                    # read response.json() from the except handler, after the
                    # context manager had released the response — so the
                    # server's error detail was effectively always lost.)
                    try:
                        error_data = await response.json()
                        error_message = error_data.get(
                            "error", f"HTTP {response.status}: {response.reason}"
                        )
                    except Exception:
                        error_data = None
                        error_message = f"HTTP {response.status}: {response.reason}"
                    raise DatalabAPIError(error_message, response.status, error_data)
                return await response.json()
        except asyncio.TimeoutError:
            raise DatalabTimeoutError(f"Request timed out after {self.timeout} seconds")
        except aiohttp.ClientError as e:
            # Transport-level failure (DNS, connection reset, ...).
            raise DatalabAPIError(f"Request failed: {str(e)}")

    async def _poll_result(
        self, check_url: str, max_polls: int = 300, poll_interval: int = 1
    ) -> Dict[str, Any]:
        """Poll *check_url* until the server reports completion.

        Args:
            check_url: absolute or API-relative URL returned by the submit call.
            max_polls: maximum number of polls before giving up.
            poll_interval: seconds to sleep between polls.

        Raises:
            DatalabAPIError: if the server reports a failed job.
            DatalabTimeoutError: if still pending after max_polls attempts.
        """
        full_url = (
            check_url
            if check_url.startswith("http")
            else f"{self.base_url}/{check_url.lstrip('/')}"
        )

        for i in range(max_polls):
            data = await self._make_request("GET", full_url)

            if data.get("status") == "complete":
                return data

            # A response that is neither successful nor still processing
            # means the job failed server-side.
            if not data.get("success", True) and not data.get("status") == "processing":
                raise DatalabAPIError(
                    f"Processing failed: {data.get('error', 'Unknown error')}"
                )

            await asyncio.sleep(poll_interval)

        raise DatalabTimeoutError(
            f"Polling timed out after {max_polls * poll_interval} seconds"
        )

    def _prepare_file_data(self, file_path: Union[str, Path]) -> tuple:
        """Return (filename, file bytes, mime_type) for a multipart upload.

        Falls back to the SDK's extension map, then application/octet-stream,
        when the stdlib cannot guess a MIME type.

        Raises:
            DatalabFileError: if the path does not exist.
        """
        file_path = Path(file_path)

        if not file_path.exists():
            raise DatalabFileError(f"File not found: {file_path}")

        # Determine MIME type
        mime_type, _ = mimetypes.guess_type(str(file_path))
        if not mime_type:
            # Try to detect from extension
            extension = file_path.suffix.lower()
            mime_type = MIMETYPE_MAP.get(extension, "application/octet-stream")

        return file_path.name, file_path.read_bytes(), mime_type

    # Convenient endpoint-specific methods
    async def convert(
        self,
        file_path: Union[str, Path],
        options: Optional[ProcessingOptions] = None,
        save_output: Optional[Union[str, Path]] = None,
    ) -> ConversionResult:
        """Convert a document using the marker endpoint.

        Args:
            file_path: document to upload.
            options: processing options (defaults to ProcessingOptions()).
            save_output: if given and the job succeeds, write output here.

        Raises:
            DatalabAPIError / DatalabTimeoutError / DatalabFileError.
        """
        if options is None:
            options = ProcessingOptions()

        filename, file_data, mime_type = self._prepare_file_data(file_path)

        form_data = aiohttp.FormData()
        form_data.add_field(
            "file", file_data, filename=filename, content_type=mime_type
        )

        # Flatten the options into form fields; tuple values carry the
        # payload in their second element.
        for key, value in options.to_form_data().items():
            if isinstance(value, tuple):
                form_data.add_field(key, str(value[1]))
            else:
                form_data.add_field(key, str(value))

        initial_data = await self._make_request(
            "POST", "/api/v1/marker", data=form_data
        )

        if not initial_data.get("success"):
            raise DatalabAPIError(
                f"Request failed: {initial_data.get('error', 'Unknown error')}"
            )

        # Submission is async server-side: poll the check URL until done.
        result_data = await self._poll_result(initial_data["request_check_url"])

        result = ConversionResult(
            success=result_data.get("success", False),
            output_format=result_data.get("output_format", options.output_format),
            markdown=result_data.get("markdown"),
            html=result_data.get("html"),
            json=result_data.get("json"),
            images=result_data.get("images"),
            metadata=result_data.get("metadata"),
            error=result_data.get("error"),
            page_count=result_data.get("page_count"),
            status=result_data.get("status", "complete"),
        )

        # Save output if requested
        if save_output and result.success:
            output_path = Path(save_output)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            result.save_output(output_path)

        return result

    async def ocr(
        self,
        file_path: Union[str, Path],
        max_pages: Optional[int] = None,
        save_output: Optional[Union[str, Path]] = None,
    ) -> OCRResult:
        """Perform OCR on a document.

        Args:
            file_path: document to upload.
            max_pages: optional cap on the number of pages processed.
            save_output: if given and the job succeeds, write output here.

        Raises:
            DatalabAPIError / DatalabTimeoutError / DatalabFileError.
        """
        filename, file_data, mime_type = self._prepare_file_data(file_path)

        form_data = aiohttp.FormData()
        form_data.add_field(
            "file", file_data, filename=filename, content_type=mime_type
        )

        if max_pages is not None:
            form_data.add_field("max_pages", str(max_pages))

        initial_data = await self._make_request("POST", "/api/v1/ocr", data=form_data)

        if not initial_data.get("success"):
            raise DatalabAPIError(
                f"Request failed: {initial_data.get('error', 'Unknown error')}"
            )

        # Submission is async server-side: poll the check URL until done.
        result_data = await self._poll_result(initial_data["request_check_url"])

        result = OCRResult(
            success=result_data.get("success", False),
            pages=result_data.get("pages", []),
            error=result_data.get("error"),
            page_count=result_data.get("page_count"),
            status=result_data.get("status", "complete"),
        )

        # Save output if requested
        if save_output and result.success:
            output_path = Path(save_output)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            result.save_output(output_path)

        return result
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
class DatalabClient:
    """Synchronous wrapper around AsyncDatalabClient.

    Each call opens the underlying async client's session, runs the
    coroutine on a fresh event loop, and closes the session again. It must
    not be called from inside an already-running event loop — use
    AsyncDatalabClient directly in async code.
    """

    def __init__(
        self,
        api_key: str | None = None,
        base_url: str = settings.DATALAB_HOST,
        timeout: int = 300,
    ):
        """
        Initialize the Datalab client

        Args:
            api_key: Your Datalab API key
            base_url: Base URL for the API (default: https://www.datalab.to)
            timeout: Default timeout for requests in seconds
        """
        self._async_client = AsyncDatalabClient(api_key, base_url, timeout)

    def _run_async(self, coro):
        """Run an async coroutine to completion from synchronous code.

        Uses asyncio.run(), which creates and tears down a dedicated event
        loop per call. The previous get_event_loop()/run_until_complete
        dance is deprecated since Python 3.10 and could install and leave
        behind a stray loop.
        """
        return asyncio.run(self._async_wrapper(coro))

    async def _async_wrapper(self, coro):
        """Open and close the async client's session around a single call."""
        async with self._async_client:
            return await coro

    def convert(
        self,
        file_path: Union[str, Path],
        options: Optional[ProcessingOptions] = None,
        save_output: Optional[Union[str, Path]] = None,
    ) -> ConversionResult:
        """Convert a document using the marker endpoint (sync version)"""
        return self._run_async(
            self._async_client.convert(file_path, options, save_output)
        )

    def ocr(
        self,
        file_path: Union[str, Path],
        max_pages: Optional[int] = None,
        save_output: Optional[Union[str, Path]] = None,
    ) -> OCRResult:
        """Perform OCR on a document (sync version)"""
        return self._run_async(
            self._async_client.ocr(file_path, max_pages, save_output)
        )
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Datalab SDK exceptions
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class DatalabError(Exception):
    """Base class for every error raised by the Datalab SDK."""
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class DatalabAPIError(DatalabError):
    """Exception raised when the API returns an error response.

    Attributes:
        status_code: HTTP status code of the failed response, if known.
        response_data: decoded JSON error payload, if available.
    """

    def __init__(
        self,
        message: str,
        # Explicit Optional annotations: the old `int = None` / `dict = None`
        # relied on deprecated implicit-Optional typing and was simply wrong.
        status_code: int | None = None,
        response_data: dict | None = None,
    ):
        super().__init__(message)
        self.status_code = status_code
        self.response_data = response_data
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class DatalabTimeoutError(DatalabError):
    """Raised when a request to the API does not complete in time."""
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class DatalabFileError(DatalabError):
    """Raised when a local file cannot be read, written, or recognized."""
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class DatalabValidationError(DatalabError):
    """Raised when user-supplied input fails validation before a request."""
|
datalab_sdk/mimetypes.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# Mapping of supported file extensions to the MIME type reported on upload.
MIMETYPE_MAP = {
    ".pdf": "application/pdf",
    ".png": "image/png",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".gif": "image/gif",
    ".tiff": "image/tiff",
    ".webp": "image/webp",
    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ".doc": "application/msword",
    ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    ".xls": "application/vnd.ms-excel",
    ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    ".ppt": "application/vnd.ms-powerpoint",
    ".html": "text/html",
    ".epub": "application/epub+zip",
}

# Extensions accepted for upload, derived from the map above.
# Iterating a dict yields its keys directly; `.keys()` was redundant.
SUPPORTED_EXTENSIONS = list(MIMETYPE_MAP)
|
datalab_sdk/models.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Datalab SDK data models
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Dict, List, Optional, Any, Union
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
import json
|
|
9
|
+
import base64
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class ProcessingOptions:
    """Options controlling how a document is processed."""

    # Options shared by all endpoints
    max_pages: Optional[int] = None
    output_format: str = "markdown"  # one of: markdown, json, html
    skip_cache: bool = True

    # Options specific to the marker endpoint
    force_ocr: bool = False
    format_lines: bool = False
    paginate: bool = False
    use_llm: bool = False
    strip_existing_ocr: bool = False
    disable_image_extraction: bool = False
    page_range: Optional[str] = None
    block_correction_prompt: Optional[str] = None
    additional_config: Optional[Dict[str, Any]] = None
    page_schema: Optional[Dict[str, Any]] = None

    # Table recognition tuning
    skip_table_detection: bool = False
    detect_cell_boxes: bool = False

    def to_form_data(self) -> Dict[str, Any]:
        """Serialize the set options as multipart form-data tuples.

        ``None`` values are omitted entirely; dict/list values are
        JSON-encoded. Every entry is a ``(filename, value)`` pair with a
        ``None`` filename, as expected by the HTTP layer.
        """
        payload: Dict[str, Any] = {}
        for name, value in vars(self).items():
            if value is None:
                continue
            # Structured values travel as JSON text; everything else
            # (strings, ints, bools) is passed through unchanged.
            if isinstance(value, (dict, list)):
                value = json.dumps(value, indent=2)
            payload[name] = (None, value)
        return payload
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@dataclass
class ConversionResult:
    """Result returned by the marker (document conversion) endpoint."""

    success: bool
    output_format: str
    markdown: Optional[str] = None
    html: Optional[str] = None
    json: Optional[Dict[str, Any]] = None
    images: Optional[Dict[str, str]] = None
    metadata: Optional[Dict[str, Any]] = None
    error: Optional[str] = None
    page_count: Optional[int] = None
    status: str = "complete"

    def save_output(
        self, output_path: Union[str, Path], save_images: bool = True
    ) -> None:
        """Persist the conversion output next to *output_path*.

        Writes one file per populated representation (``.md``, ``.html``,
        ``.json``), decodes base64 images into the parent directory when
        *save_images* is true, and dumps metadata to ``.metadata.json``.
        """
        target = Path(output_path)

        # Main content — each format goes to its own suffixed sibling file.
        if self.markdown:
            target.with_suffix(".md").write_text(self.markdown, encoding="utf-8")
        if self.html:
            target.with_suffix(".html").write_text(self.html, encoding="utf-8")
        if self.json:
            # Bare `json` resolves to the module here, not the field.
            target.with_suffix(".json").write_text(
                json.dumps(self.json, indent=2), encoding="utf-8"
            )

        # Images arrive base64-encoded, keyed by filename.
        if save_images and self.images:
            image_dir = target.parent
            image_dir.mkdir(exist_ok=True)
            for filename, encoded in self.images.items():
                (image_dir / filename).write_bytes(base64.b64decode(encoded))

        # Conversion metadata, when the API supplied any.
        if self.metadata:
            target.with_suffix(".metadata.json").write_text(
                json.dumps(self.metadata, indent=2), encoding="utf-8"
            )
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
@dataclass
class OCRResult:
    """Result returned by the OCR endpoint."""

    success: bool
    pages: List[Dict[str, Any]]
    error: Optional[str] = None
    page_count: Optional[int] = None
    status: str = "complete"

    def get_text(self, page_num: Optional[int] = None) -> str:
        """Return recognized text for one page, or for all pages joined.

        Pages are joined with a blank line; a missing *page_num* match
        yields the empty string.
        """

        def lines_of(page: Dict[str, Any]) -> str:
            # A page's text is the newline-joined "text" of its text_lines.
            return "\n".join(line["text"] for line in page.get("text_lines", []))

        if page_num is None:
            return "\n\n".join(lines_of(page) for page in self.pages)

        for page in self.pages:
            if page.get("page") == page_num:
                return lines_of(page)
        return ""

    def save_output(self, output_path: Union[str, Path]) -> None:
        """Write the plain text (``.txt``) and the full payload (``.ocr.json``)."""
        target = Path(output_path)

        # Human-readable transcript.
        target.with_suffix(".txt").write_text(self.get_text(), encoding="utf-8")

        # Complete structured OCR data for downstream tooling.
        payload = {
            "success": self.success,
            "pages": self.pages,
            "error": self.error,
            "page_count": self.page_count,
            "status": self.status,
        }
        target.with_suffix(".ocr.json").write_text(
            json.dumps(payload, indent=2), encoding="utf-8"
        )
|
datalab_sdk/settings.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from pydantic_settings import BaseSettings
|
|
2
|
+
import os
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class Settings(BaseSettings):
    """SDK configuration; pydantic-settings loads overrides from environment variables."""

    # Paths
    # Repository/package root: two directory levels above this file.
    BASE_DIR: str = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    LOGLEVEL: str = "DEBUG"

    # Base settings
    # API key; picked up from the DATALAB_API_KEY environment variable when set.
    DATALAB_API_KEY: str | None = None
    DATALAB_HOST: str = "https://www.datalab.to"


# Module-level singleton shared across the SDK.
settings = Settings()
|