datalab-python-sdk 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,17 @@
1
+ Metadata-Version: 2.4
2
+ Name: datalab-python-sdk
3
+ Version: 0.1.0
4
+ Summary: Auto-generated SDK for Datalab API
5
+ License-File: LICENSE
6
+ Requires-Python: >=3.10
7
+ Requires-Dist: aiohttp>=3.12.14
8
+ Requires-Dist: click>=8.2.1
9
+ Requires-Dist: pydantic-settings<3.0.0,>=2.10.1
10
+ Requires-Dist: pydantic<3.0.0,>=2.11.7
11
+ Requires-Dist: pytest-asyncio>=1.0.0
12
+ Provides-Extra: test
13
+ Requires-Dist: aiofiles>=23.2.0; extra == 'test'
14
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == 'test'
15
+ Requires-Dist: pytest-cov>=4.1.0; extra == 'test'
16
+ Requires-Dist: pytest-mock>=3.11.0; extra == 'test'
17
+ Requires-Dist: pytest>=7.4.0; extra == 'test'
@@ -0,0 +1,12 @@
1
+ datalab_sdk/__init__.py,sha256=Ou4cGIih1julnRVMmXLipBiQibYGGK_f4DtQggM286E,647
2
+ datalab_sdk/cli.py,sha256=aoEunI4h7hOfS3lnGeC55sIEfMz8avKG6rHtjpH4KJw,13191
3
+ datalab_sdk/client.py,sha256=UoP8cZxZOxzuzaRFekHdwKofYVOqQXk2sKRCOTgj1qk,10065
4
+ datalab_sdk/exceptions.py,sha256=MDfNd-l152OUVhZSfEsE95McJq-IuOsrEsEQ4CRMbEI,785
5
+ datalab_sdk/mimetypes.py,sha256=kgAM5ZRIcbjwBgDeMs4LsjtlaYpbDyRLDkZGy8vpruc,698
6
+ datalab_sdk/models.py,sha256=e2_wHonABi_-TVrfe77gniK1Hh-EjepTrItmbRRQMlY,4954
7
+ datalab_sdk/settings.py,sha256=kVa9mlfkbOHX0VyX154UmeN_um1BPNaVBgRQ-DkaUNQ,338
8
+ datalab_python_sdk-0.1.0.dist-info/METADATA,sha256=Z0qLU5jMAdtZnm9lsA6XxZsEvHMo0YMcKaAOqaEFQIQ,609
9
+ datalab_python_sdk-0.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
10
+ datalab_python_sdk-0.1.0.dist-info/entry_points.txt,sha256=rqWxO21UV57JxxyeF20iBTxHpzBYOA69hCHIqwdn-Ok,48
11
+ datalab_python_sdk-0.1.0.dist-info/licenses/LICENSE,sha256=XGjT-Uz6t7UZzr3JCZfilsGcZEgMm_1aONcRunohxWg,1064
12
+ datalab_python_sdk-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.27.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ datalab = datalab_sdk.cli:cli
@@ -0,0 +1,18 @@
1
+ Copyright 2025 Endless Labs Inc
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
4
+ this software and associated documentation files (the “Software”), to deal in
5
+ the Software without restriction, including without limitation the rights to
6
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
7
+ the Software, and to permit persons to whom the Software is furnished to do so,
8
+ subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in all
11
+ copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
15
+ FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
16
+ COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
17
+ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,26 @@
1
+ """
2
+ Datalab SDK - Python client for Datalab API
3
+
4
+ This SDK provides both synchronous and asynchronous interfaces to the Datalab API,
5
+ supporting document conversion, OCR, layout analysis, and table recognition.
6
+ """
7
+
8
+ from .client import DatalabClient, AsyncDatalabClient
9
+ from .exceptions import DatalabError, DatalabAPIError, DatalabTimeoutError
10
+ from .models import (
11
+ ConversionResult,
12
+ OCRResult,
13
+ ProcessingOptions,
14
+ )
15
+
16
+ __version__ = "1.0.0"
17
+ __all__ = [
18
+ "DatalabClient",
19
+ "AsyncDatalabClient",
20
+ "DatalabError",
21
+ "DatalabAPIError",
22
+ "DatalabTimeoutError",
23
+ "ConversionResult",
24
+ "OCRResult",
25
+ "ProcessingOptions",
26
+ ]
datalab_sdk/cli.py ADDED
@@ -0,0 +1,416 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Datalab SDK Command Line Interface
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ import asyncio
9
+ from pathlib import Path
10
+ from typing import Optional, List
11
+ import click
12
+
13
+ from datalab_sdk.client import DatalabClient, AsyncDatalabClient
14
+ from datalab_sdk.models import ProcessingOptions
15
+ from datalab_sdk.exceptions import DatalabError
16
+ from datalab_sdk.settings import settings
17
+
18
+
19
def get_supported_extensions() -> List[str]:
    """Return every file extension (with leading dot) the CLI accepts."""
    image_exts = [".png", ".jpg", ".jpeg", ".gif", ".tiff", ".webp"]
    office_exts = [".docx", ".doc", ".xlsx", ".xls", ".pptx", ".ppt"]
    markup_exts = [".html", ".epub"]
    return [".pdf"] + image_exts + office_exts + markup_exts
38
+
39
+
40
def find_files_in_directory(
    directory: Path, extensions: Optional[List[str]] = None
) -> List[Path]:
    """Recursively collect files under *directory* with a supported extension.

    Args:
        directory: Root directory, searched recursively via ``rglob``.
        extensions: Extensions to accept (leading dot expected); defaults to
            ``get_supported_extensions()``. Matching is case-insensitive.

    Returns:
        Matching file paths, sorted so the output order is deterministic
        (``Path.rglob`` order is filesystem-dependent).
    """
    if extensions is None:
        extensions = get_supported_extensions()

    # Set membership is O(1) per file; the old list scan was O(len(extensions))
    # for every file in the tree. Lowercasing both sides makes matching fully
    # case-insensitive even if a caller passes ".PDF".
    allowed = {ext.lower() for ext in extensions}

    return sorted(
        p for p in directory.rglob("*") if p.is_file() and p.suffix.lower() in allowed
    )
53
+
54
+
55
async def process_files_async(
    files: List[Path],
    output_dir: Path,
    method: str,
    options: Optional[ProcessingOptions] = None,
    max_pages: Optional[int] = None,
    max_concurrent: int = 5,
) -> List[dict]:
    """Process *files* concurrently against the Datalab API.

    Args:
        files: Input documents to process.
        output_dir: Directory where per-file outputs are written.
        method: ``"convert"`` (marker endpoint) or ``"ocr"``.
        options: Conversion options (only used by ``"convert"``).
        max_pages: Page limit (only used by ``"ocr"``; convert carries it
            inside *options*).
        max_concurrent: Upper bound on in-flight requests.

    Returns:
        One dict per input file with keys ``file_path``, ``output_path``,
        ``success``, ``error`` and ``page_count``. Per-file exceptions are
        captured in the dict rather than aborting the whole batch.
    """
    semaphore = asyncio.Semaphore(max_concurrent)

    async def process_single_file(client: AsyncDatalabClient, file_path: Path) -> dict:
        async with semaphore:
            try:
                # NOTE(review): outputs are keyed by the file's stem only, so
                # same-named files in different subdirectories overwrite each
                # other's output — confirm this is acceptable.
                output_path = output_dir / file_path.stem

                if method == "convert":
                    result = await client.convert(
                        file_path, options=options, save_output=output_path
                    )
                else:  # method == 'ocr'
                    result = await client.ocr(
                        file_path, max_pages=max_pages, save_output=output_path
                    )

                return {
                    "file_path": str(file_path),
                    "output_path": str(output_path),
                    "success": result.success,
                    "error": result.error,
                    "page_count": result.page_count,
                }
            except Exception as e:
                return {
                    "file_path": str(file_path),
                    "output_path": None,
                    "success": False,
                    "error": str(e),
                    "page_count": None,
                }

    # Fix: share one client (and HTTP session/connection pool) across the
    # whole batch. The previous code opened a fresh AsyncDatalabClient per
    # file, paying session setup/teardown for every document.
    async with AsyncDatalabClient() as client:
        tasks = [process_single_file(client, file_path) for file_path in files]
        return await asyncio.gather(*tasks)
104
+
105
+
106
def process_single_file_sync(
    file_path: Path,
    output_dir: Path,
    method: str,
    options: Optional[ProcessingOptions] = None,
    max_pages: Optional[int] = None,
) -> dict:
    """Process one file through the Datalab API (blocking).

    Mirrors ``process_files_async`` for a single document; returns the same
    result-dict shape (``file_path``, ``output_path``, ``success``, ``error``,
    ``page_count``).
    """
    try:
        # Output basename is <output_dir>/<stem>; the result object appends
        # the proper suffix (.md/.html/.json/.txt) when saving.
        # Fix: the previous code computed <output_dir>/<stem>/<stem>, which
        # was inconsistent with the async batch path and nested outputs one
        # directory too deep.
        output_path = output_dir / file_path.stem

        client = DatalabClient()
        if method == "convert":
            result = client.convert(file_path, options=options, save_output=output_path)
        else:  # method == 'ocr'
            result = client.ocr(file_path, max_pages=max_pages, save_output=output_path)

        return {
            "file_path": str(file_path),
            "output_path": str(output_path),
            "success": result.success,
            "error": result.error,
            "page_count": result.page_count,
        }
    except Exception as e:
        return {
            "file_path": str(file_path),
            "output_path": None,
            "success": False,
            "error": str(e),
            "page_count": None,
        }
140
+
141
+
142
@click.group()
# Fix: report the actual package version (wheel metadata says 0.1.0, the
# client User-Agent says 0.1.0 — "1.0.0" here was inconsistent).
@click.version_option(version="0.1.0")
def cli():
    """Datalab SDK - Command line interface for document processing"""
    pass
147
+
148
+
149
@click.command()
@click.argument("path", type=click.Path(exists=True))
@click.option("--api_key", required=False, help="Datalab API key")
@click.option(
    "--output_dir", "-o", required=False, type=click.Path(), help="Output directory"
)
@click.option(
    "--format",
    "output_format",
    default="markdown",
    type=click.Choice(["markdown", "html", "json"]),
    help="Output format",
)
@click.option("--max_pages", type=int, help="Maximum number of pages to process")
@click.option("--force_ocr", is_flag=True, help="Force OCR on every page")
@click.option(
    "--format_lines", is_flag=True, help="Partially OCR lines for better formatting"
)
@click.option("--paginate", is_flag=True, help="Add page delimiters to output")
@click.option("--use_llm", is_flag=True, help="Use LLM to enhance accuracy")
@click.option("--page_range", help='Page range to process (e.g., "0-2" or "0,1,2")')
@click.option(
    "--extensions", help="Comma-separated list of file extensions (for directories)"
)
@click.option(
    "--max_concurrent", default=5, type=int, help="Maximum concurrent requests"
)
@click.option("--base_url", default=settings.DATALAB_HOST, help="API base URL")
def convert(
    path: str,
    api_key: Optional[str],
    output_dir: Optional[str],
    output_format: str,
    max_pages: Optional[int],
    force_ocr: bool,
    format_lines: bool,
    paginate: bool,
    use_llm: bool,
    page_range: Optional[str],
    extensions: Optional[str],
    max_concurrent: int,
    base_url: str,
):
    """Convert documents to markdown, HTML, or JSON.

    PATH may be a single file (processed synchronously) or a directory
    (all supported files processed concurrently via asyncio).
    """

    # Resolve the API key: the CLI flag wins, then the settings/env value.
    if api_key is None:
        api_key = settings.DATALAB_API_KEY
    if api_key is None:
        # NOTE(review): raised outside the try/except below, so a missing key
        # surfaces as a traceback rather than the formatted "❌ Error" message
        # — confirm whether that is intended.
        raise DatalabError(
            "You must either pass in an api key via --api_key or set the DATALAB_API_KEY env variable."
        )

    path = Path(path)

    # Default the output directory to the current working directory.
    if output_dir is None:
        output_dir = os.getcwd()

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Parse extensions: normalize "pdf,png" into [".pdf", ".png"].
    file_extensions = None
    if extensions:
        file_extensions = [ext.strip() for ext in extensions.split(",")]
        file_extensions = [
            ext if ext.startswith(".") else f".{ext}" for ext in file_extensions
        ]

    # Bundle all marker-endpoint options; max_pages travels inside options
    # for the convert flow.
    options = ProcessingOptions(
        output_format=output_format,
        max_pages=max_pages,
        force_ocr=force_ocr,
        format_lines=format_lines,
        paginate=paginate,
        use_llm=use_llm,
        page_range=page_range,
    )

    try:
        # Publish the key/host on the shared settings object; the clients
        # constructed downstream read their defaults from it.
        settings.DATALAB_API_KEY = api_key
        settings.DATALAB_HOST = base_url

        if path.is_file():
            # Single file: synchronous processing.
            if file_extensions and path.suffix.lower() not in file_extensions:
                click.echo(f"❌ Skipping {path}: unsupported file type", err=True)
                sys.exit(1)

            result = process_single_file_sync(path, output_dir, "convert", options)

            if result["success"]:
                click.echo(f"✅ Successfully converted {result['file_path']}")
                if result["page_count"]:
                    click.echo(f" 📄 Processed {result['page_count']} pages")
                if result["output_path"]:
                    click.echo(f" 📁 Output saved to: {result['output_path']}")
            else:
                click.echo(
                    f"❌ Failed to convert {result['file_path']}: {result['error']}",
                    err=True,
                )
                sys.exit(1)
        else:
            # Directory: discover supported files, then process concurrently.
            files = find_files_in_directory(path, file_extensions)

            if not files:
                click.echo(f"❌ No supported files found in {path}", err=True)
                sys.exit(1)

            click.echo(f"📂 Found {len(files)} files to process")

            # Process files asynchronously
            results = asyncio.run(
                process_files_async(
                    files, output_dir, "convert", options, max_pages, max_concurrent
                )
            )

            # Summarize successes/failures for the user.
            successful = sum(1 for r in results if r["success"])
            failed = len(results) - successful

            click.echo("\n📊 Conversion Summary:")
            click.echo(f" ✅ Successfully converted: {successful} files")
            if failed > 0:
                click.echo(f" ❌ Failed: {failed} files")

                # List each failed file with its captured error message.
                click.echo("\n Failed files:")
                for result in results:
                    if not result["success"]:
                        click.echo(f" - {result['file_path']}: {result['error']}")

            click.echo(f"\n📁 Output saved to: {output_dir}")

    except DatalabError as e:
        click.echo(f"❌ Error: {e}", err=True)
        sys.exit(1)
290
+
291
+
292
@click.command()
@click.argument("path", type=click.Path(exists=True))
@click.option("--api_key", required=False, help="Datalab API key")
@click.option(
    "--output_dir", "-o", required=False, type=click.Path(), help="Output directory"
)
@click.option("--max_pages", type=int, help="Maximum number of pages to process")
@click.option(
    "--extensions", help="Comma-separated list of file extensions (for directories)"
)
@click.option(
    "--max_concurrent", default=5, type=int, help="Maximum concurrent requests"
)
@click.option("--base_url", default=settings.DATALAB_HOST, help="API base URL")
def ocr(
    path: str,
    api_key: Optional[str],
    output_dir: Optional[str],
    max_pages: Optional[int],
    extensions: Optional[str],
    max_concurrent: int,
    base_url: str,
):
    """Perform OCR on documents.

    PATH may be a single file (processed synchronously) or a directory
    (all supported files processed concurrently via asyncio).
    """

    # Resolve the API key: the CLI flag wins, then the settings/env value.
    if api_key is None:
        api_key = settings.DATALAB_API_KEY
    if api_key is None:
        # NOTE(review): raised outside the try/except below, so a missing key
        # surfaces as a traceback rather than the formatted "❌ Error" message
        # — confirm whether that is intended.
        raise DatalabError(
            "You must either pass in an api key via --api_key or set the DATALAB_API_KEY env variable."
        )

    path = Path(path)

    # Default the output directory to the current working directory.
    if output_dir is None:
        output_dir = os.getcwd()

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Parse extensions: normalize "pdf,png" into [".pdf", ".png"].
    file_extensions = None
    if extensions:
        file_extensions = [ext.strip() for ext in extensions.split(",")]
        file_extensions = [
            ext if ext.startswith(".") else f".{ext}" for ext in file_extensions
        ]

    try:
        # Publish the key/host on the shared settings object; the clients
        # constructed downstream read their defaults from it.
        settings.DATALAB_API_KEY = api_key
        settings.DATALAB_HOST = base_url

        if path.is_file():
            # Single file: synchronous processing.
            if file_extensions and path.suffix.lower() not in file_extensions:
                click.echo(f"❌ Skipping {path}: unsupported file type", err=True)
                sys.exit(1)

            result = process_single_file_sync(
                path, output_dir, "ocr", max_pages=max_pages
            )

            if result["success"]:
                click.echo(f"✅ Successfully performed OCR on {result['file_path']}")
                if result["page_count"]:
                    click.echo(f" 📄 Processed {result['page_count']} pages")
                if result["output_path"]:
                    click.echo(f" 📁 Output saved to: {result['output_path']}")
            else:
                click.echo(
                    f"❌ Failed OCR on {result['file_path']}: {result['error']}",
                    err=True,
                )
                sys.exit(1)
        else:
            # Directory: discover supported files, then process concurrently.
            files = find_files_in_directory(path, file_extensions)

            if not files:
                click.echo(f"❌ No supported files found in {path}", err=True)
                sys.exit(1)

            click.echo(f"📂 Found {len(files)} files to process")

            # Process files asynchronously
            results = asyncio.run(
                process_files_async(
                    files,
                    output_dir,
                    "ocr",
                    max_pages=max_pages,
                    max_concurrent=max_concurrent,
                )
            )

            # Summarize successes/failures for the user.
            successful = sum(1 for r in results if r["success"])
            failed = len(results) - successful

            click.echo("\n📊 OCR Summary:")
            click.echo(f" ✅ Successfully processed: {successful} files")
            if failed > 0:
                click.echo(f" ❌ Failed: {failed} files")

                # List each failed file with its captured error message.
                click.echo("\n Failed files:")
                for result in results:
                    if not result["success"]:
                        click.echo(f" - {result['file_path']}: {result['error']}")

            click.echo(f"\n📁 Output saved to: {output_dir}")

    except DatalabError as e:
        click.echo(f"❌ Error: {e}", err=True)
        sys.exit(1)
408
+
409
+
410
# Add commands to CLI group
cli.add_command(convert)
cli.add_command(ocr)


# Allow running the module directly; the wheel also exposes the `datalab`
# console script pointing at `cli` (see entry_points.txt).
if __name__ == "__main__":
    cli()
datalab_sdk/client.py ADDED
@@ -0,0 +1,301 @@
1
+ """
2
+ Datalab API client - async core with sync wrapper
3
+ """
4
+
5
+ import asyncio
6
+ import mimetypes
7
+ import aiohttp
8
+ from pathlib import Path
9
+ from typing import Union, Optional, Dict, Any
10
+
11
+ from datalab_sdk.exceptions import (
12
+ DatalabAPIError,
13
+ DatalabTimeoutError,
14
+ DatalabFileError,
15
+ )
16
+ from datalab_sdk.mimetypes import MIMETYPE_MAP
17
+ from datalab_sdk.models import ConversionResult, OCRResult, ProcessingOptions
18
+ from datalab_sdk.settings import settings
19
+
20
+
21
class AsyncDatalabClient:
    """Asynchronous client for Datalab API.

    Lazily manages a single ``aiohttp.ClientSession`` and exposes the
    ``convert`` (marker) and ``ocr`` endpoints, polling the API until each
    request completes.
    """

    def __init__(
        self,
        api_key: str | None = None,
        base_url: str = settings.DATALAB_HOST,
        timeout: int = 300,
    ):
        """
        Initialize the async Datalab client

        Args:
            api_key: Your Datalab API key (falls back to settings.DATALAB_API_KEY)
            base_url: Base URL for the API (default: https://www.datalab.to)
            timeout: Default timeout for requests in seconds

        Raises:
            DatalabAPIError: If no API key is supplied or configured.
        """
        if api_key is None:
            api_key = settings.DATALAB_API_KEY
        if api_key is None:
            raise DatalabAPIError("You must pass in an api_key or set DATALAB_API_KEY.")

        self.api_key = api_key
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self._session = None  # created lazily by _ensure_session()

    async def __aenter__(self):
        """Async context manager entry"""
        await self._ensure_session()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit"""
        await self.close()

    async def _ensure_session(self):
        """Create the aiohttp session on first use."""
        if self._session is None:
            timeout = aiohttp.ClientTimeout(total=self.timeout)
            self._session = aiohttp.ClientSession(
                timeout=timeout,
                headers={
                    "X-Api-Key": self.api_key,
                    "User-Agent": "datalab-python-sdk/0.1.0",
                },
            )

    async def close(self):
        """Close the aiohttp session"""
        if self._session:
            await self._session.close()
            self._session = None

    async def _make_request(
        self, method: str, endpoint: str, **kwargs
    ) -> Dict[str, Any]:
        """Make an async request and return the decoded JSON body.

        Raises:
            DatalabAPIError: On HTTP error responses or connection failures.
            DatalabTimeoutError: If the request exceeds the client timeout.
        """
        await self._ensure_session()

        url = endpoint
        if not endpoint.startswith("http"):
            url = f"{self.base_url}/{endpoint.lstrip('/')}"

        try:
            async with self._session.request(method, url, **kwargs) as response:
                if response.status >= 400:
                    # Fix: read the error payload while the response is still
                    # open. The previous implementation awaited
                    # response.json() inside the except-handler, after the
                    # response context had exited and the connection was
                    # released, which could fail and hide the server's
                    # actual error message.
                    error_data = None
                    error_message = f"HTTP {response.status}: {response.reason}"
                    try:
                        error_data = await response.json()
                        error_message = error_data.get("error", error_message)
                    except Exception:
                        pass  # non-JSON error body; keep the generic message
                    raise DatalabAPIError(error_message, response.status, error_data)
                return await response.json()
        except asyncio.TimeoutError:
            raise DatalabTimeoutError(f"Request timed out after {self.timeout} seconds")
        except aiohttp.ClientError as e:
            raise DatalabAPIError(f"Request failed: {str(e)}")

    async def _poll_result(
        self, check_url: str, max_polls: int = 300, poll_interval: int = 1
    ) -> Dict[str, Any]:
        """Poll *check_url* until the API reports the request complete.

        Raises:
            DatalabAPIError: If the API reports a processing failure.
            DatalabTimeoutError: After max_polls * poll_interval seconds.
        """
        full_url = (
            check_url
            if check_url.startswith("http")
            else f"{self.base_url}/{check_url.lstrip('/')}"
        )

        for _ in range(max_polls):
            data = await self._make_request("GET", full_url)

            if data.get("status") == "complete":
                return data

            # A non-success payload that is not still "processing" means the
            # server gave up on the request.
            if not data.get("success", True) and not data.get("status") == "processing":
                raise DatalabAPIError(
                    f"Processing failed: {data.get('error', 'Unknown error')}"
                )

            await asyncio.sleep(poll_interval)

        raise DatalabTimeoutError(
            f"Polling timed out after {max_polls * poll_interval} seconds"
        )

    def _prepare_file_data(self, file_path: Union[str, Path]) -> tuple:
        """Return (filename, raw bytes, MIME type) for an upload.

        Raises:
            DatalabFileError: If the file does not exist.
        """
        file_path = Path(file_path)

        if not file_path.exists():
            raise DatalabFileError(f"File not found: {file_path}")

        # Determine MIME type; fall back to the SDK's own extension map,
        # then to a generic binary type.
        mime_type, _ = mimetypes.guess_type(str(file_path))
        if not mime_type:
            extension = file_path.suffix.lower()
            mime_type = MIMETYPE_MAP.get(extension, "application/octet-stream")

        return file_path.name, file_path.read_bytes(), mime_type

    # Convenient endpoint-specific methods
    async def convert(
        self,
        file_path: Union[str, Path],
        options: Optional[ProcessingOptions] = None,
        save_output: Optional[Union[str, Path]] = None,
    ) -> ConversionResult:
        """Convert a document using the marker endpoint.

        Args:
            file_path: Document to upload.
            options: Processing options (defaults to ProcessingOptions()).
            save_output: Basename to save results to (suffixes appended).
        """
        if options is None:
            options = ProcessingOptions()

        filename, file_data, mime_type = self._prepare_file_data(file_path)

        form_data = aiohttp.FormData()
        form_data.add_field(
            "file", file_data, filename=filename, content_type=mime_type
        )

        # ProcessingOptions.to_form_data yields (None, value) tuples;
        # unwrap and stringify for aiohttp.
        for key, value in options.to_form_data().items():
            if isinstance(value, tuple):
                form_data.add_field(key, str(value[1]))
            else:
                form_data.add_field(key, str(value))

        initial_data = await self._make_request(
            "POST", "/api/v1/marker", data=form_data
        )

        if not initial_data.get("success"):
            raise DatalabAPIError(
                f"Request failed: {initial_data.get('error', 'Unknown error')}"
            )

        result_data = await self._poll_result(initial_data["request_check_url"])

        result = ConversionResult(
            success=result_data.get("success", False),
            output_format=result_data.get("output_format", options.output_format),
            markdown=result_data.get("markdown"),
            html=result_data.get("html"),
            json=result_data.get("json"),
            images=result_data.get("images"),
            metadata=result_data.get("metadata"),
            error=result_data.get("error"),
            page_count=result_data.get("page_count"),
            status=result_data.get("status", "complete"),
        )

        # Save output if requested
        if save_output and result.success:
            output_path = Path(save_output)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            result.save_output(output_path)

        return result

    async def ocr(
        self,
        file_path: Union[str, Path],
        max_pages: Optional[int] = None,
        save_output: Optional[Union[str, Path]] = None,
    ) -> OCRResult:
        """Perform OCR on a document.

        Args:
            file_path: Document to upload.
            max_pages: Optional page-count limit.
            save_output: Basename to save results to (suffixes appended).
        """
        filename, file_data, mime_type = self._prepare_file_data(file_path)

        form_data = aiohttp.FormData()
        form_data.add_field(
            "file", file_data, filename=filename, content_type=mime_type
        )

        if max_pages is not None:
            form_data.add_field("max_pages", str(max_pages))

        initial_data = await self._make_request("POST", "/api/v1/ocr", data=form_data)

        if not initial_data.get("success"):
            raise DatalabAPIError(
                f"Request failed: {initial_data.get('error', 'Unknown error')}"
            )

        result_data = await self._poll_result(initial_data["request_check_url"])

        result = OCRResult(
            success=result_data.get("success", False),
            pages=result_data.get("pages", []),
            error=result_data.get("error"),
            page_count=result_data.get("page_count"),
            status=result_data.get("status", "complete"),
        )

        # Save output if requested
        if save_output and result.success:
            output_path = Path(save_output)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            result.save_output(output_path)

        return result
245
+
246
+
247
class DatalabClient:
    """Synchronous wrapper around AsyncDatalabClient.

    Each call opens the underlying session, runs the request to completion
    on its own event loop, and closes the session again.
    """

    def __init__(
        self,
        api_key: str | None = None,
        base_url: str = settings.DATALAB_HOST,
        timeout: int = 300,
    ):
        """
        Initialize the Datalab client

        Args:
            api_key: Your Datalab API key
            base_url: Base URL for the API (default: https://www.datalab.to)
            timeout: Default timeout for requests in seconds
        """
        self._async_client = AsyncDatalabClient(api_key, base_url, timeout)

    def _run_async(self, coro):
        """Drive *coro* to completion from synchronous code.

        Fix: uses ``asyncio.run`` instead of the deprecated
        ``get_event_loop()`` / ``run_until_complete`` pattern, which warns
        on Python 3.10+ and leaked a never-closed event loop. Must not be
        called from inside a running event loop — use AsyncDatalabClient
        directly in async code.
        """
        return asyncio.run(self._async_wrapper(coro))

    async def _async_wrapper(self, coro):
        """Open/close the underlying client session around one call."""
        async with self._async_client:
            return await coro

    def convert(
        self,
        file_path: Union[str, Path],
        options: Optional[ProcessingOptions] = None,
        save_output: Optional[Union[str, Path]] = None,
    ) -> ConversionResult:
        """Convert a document using the marker endpoint (sync version)"""
        return self._run_async(
            self._async_client.convert(file_path, options, save_output)
        )

    def ocr(
        self,
        file_path: Union[str, Path],
        max_pages: Optional[int] = None,
        save_output: Optional[Union[str, Path]] = None,
    ) -> OCRResult:
        """Perform OCR on a document (sync version)"""
        return self._run_async(
            self._async_client.ocr(file_path, max_pages, save_output)
        )
@@ -0,0 +1,38 @@
1
+ """
2
+ Datalab SDK exceptions
3
+ """
4
+
5
+
6
class DatalabError(Exception):
    """Base class for every exception raised by the Datalab SDK."""
10
+
11
+
12
class DatalabAPIError(DatalabError):
    """Exception raised when the API returns an error response.

    Attributes:
        status_code: HTTP status code of the failed request, if known.
        response_data: Decoded JSON error body, if one was available.
    """

    def __init__(
        self, message: str, status_code: int | None = None, response_data: dict | None = None
    ):
        super().__init__(message)
        # Both may be None when the failure happened before an HTTP
        # response was received (e.g. a connection error).
        self.status_code = status_code
        self.response_data = response_data
21
+
22
+
23
class DatalabTimeoutError(DatalabError):
    """Raised when a request or result-polling loop exceeds its time budget."""
27
+
28
+
29
class DatalabFileError(DatalabError):
    """Raised for file-related problems (e.g. a missing input file)."""
33
+
34
+
35
class DatalabValidationError(DatalabError):
    """Raised when input validation fails."""
@@ -0,0 +1,18 @@
1
# Maps lowercase file extensions to the MIME type sent on upload.
# Used as a fallback when the stdlib `mimetypes.guess_type` cannot identify
# a file (see AsyncDatalabClient._prepare_file_data).
MIMETYPE_MAP = {
    ".pdf": "application/pdf",
    ".png": "image/png",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".gif": "image/gif",
    ".tiff": "image/tiff",
    ".webp": "image/webp",
    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ".doc": "application/msword",
    ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    ".xls": "application/vnd.ms-excel",
    ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    ".ppt": "application/vnd.ms-powerpoint",
    ".html": "text/html",
    ".epub": "application/epub+zip",
}
# Every extension the SDK knows how to upload.
SUPPORTED_EXTENSIONS = list(MIMETYPE_MAP.keys())
datalab_sdk/models.py ADDED
@@ -0,0 +1,155 @@
1
+ """
2
+ Datalab SDK data models
3
+ """
4
+
5
+ from dataclasses import dataclass
6
+ from typing import Dict, List, Optional, Any, Union
7
+ from pathlib import Path
8
+ import json
9
+ import base64
10
+
11
+
12
@dataclass
class ProcessingOptions:
    """Options for document processing, serialized into the upload form."""

    # Common options
    max_pages: Optional[int] = None  # limit on pages processed; None = all
    output_format: str = "markdown"  # markdown, json, html
    skip_cache: bool = True  # bypass server-side result caching

    # Marker specific options
    force_ocr: bool = False  # OCR every page
    format_lines: bool = False  # partially OCR lines for better formatting
    paginate: bool = False  # add page delimiters to output
    use_llm: bool = False  # LLM post-processing for accuracy
    strip_existing_ocr: bool = False
    disable_image_extraction: bool = False
    page_range: Optional[str] = None  # e.g. "0-2" or "0,1,2"
    block_correction_prompt: Optional[str] = None
    additional_config: Optional[Dict[str, Any]] = None
    page_schema: Optional[Dict[str, Any]] = None

    # Table recognition options
    skip_table_detection: bool = False
    detect_cell_boxes: bool = False

    def to_form_data(self) -> Dict[str, Any]:
        """Convert to form data format for API requests.

        Returns a dict mapping each non-None field to a ``(None, value)``
        form tuple. dict/list values are JSON-encoded; booleans and scalars
        are passed through unchanged (the client stringifies them when
        building the multipart request).
        """
        form_data = {}

        # Add non-None values
        for key, value in self.__dict__.items():
            if value is not None:
                if isinstance(value, bool):
                    form_data[key] = (None, value)
                elif isinstance(value, (dict, list)):
                    # Structured options travel as JSON strings.
                    form_data[key] = (None, json.dumps(value, indent=2))
                else:
                    form_data[key] = (None, value)

        return form_data
52
+
53
+
54
@dataclass
class ConversionResult:
    """Result from document conversion (marker endpoint)"""

    success: bool
    output_format: str
    markdown: Optional[str] = None
    html: Optional[str] = None
    json: Optional[Dict[str, Any]] = None
    images: Optional[Dict[str, str]] = None
    metadata: Optional[Dict[str, Any]] = None
    error: Optional[str] = None
    page_count: Optional[int] = None
    status: str = "complete"

    def save_output(
        self, output_path: Union[str, Path], save_images: bool = True
    ) -> None:
        """Write the conversion artifacts derived from *output_path*.

        Main content goes to ``<output_path>.md`` / ``.html`` / ``.json``
        depending on which fields are populated; base64-encoded images are
        decoded into the parent directory, and metadata is written to
        ``<output_path>.metadata.json``.
        """
        target = Path(output_path)

        # Plain-text payloads keyed by the suffix they should be saved under.
        for suffix, text in ((".md", self.markdown), (".html", self.html)):
            if text:
                target.with_suffix(suffix).write_text(text, encoding="utf-8")

        if self.json:
            target.with_suffix(".json").write_text(
                json.dumps(self.json, indent=2), encoding="utf-8"
            )

        # Decode and write extracted images alongside the main output.
        if save_images and self.images:
            image_dir = target.parent
            image_dir.mkdir(exist_ok=True)
            for name, encoded in self.images.items():
                (image_dir / name).write_bytes(base64.b64decode(encoded))

        if self.metadata:
            target.with_suffix(".metadata.json").write_text(
                json.dumps(self.metadata, indent=2), encoding="utf-8"
            )
104
+
105
+
106
@dataclass
class OCRResult:
    """Result from OCR processing"""

    success: bool
    pages: List[Dict[str, Any]]
    error: Optional[str] = None
    page_count: Optional[int] = None
    status: str = "complete"

    def get_text(self, page_num: Optional[int] = None) -> str:
        """Return extracted text, for one page or the whole document.

        Lines within a page are joined with a newline; pages are joined with
        a blank line. An unknown *page_num* yields the empty string.
        """

        def lines_of(page: Dict[str, Any]) -> str:
            return "\n".join(line["text"] for line in page.get("text_lines", []))

        if page_num is None:
            return "\n\n".join(lines_of(page) for page in self.pages)

        for page in self.pages:
            if page.get("page") == page_num:
                return lines_of(page)
        return ""

    def save_output(self, output_path: Union[str, Path]) -> None:
        """Write plain text (``.txt``) and the full payload (``.ocr.json``)."""
        target = Path(output_path)

        # Human-readable text dump.
        target.with_suffix(".txt").write_text(self.get_text(), encoding="utf-8")

        # Complete machine-readable OCR payload.
        payload = {
            "success": self.success,
            "pages": self.pages,
            "error": self.error,
            "page_count": self.page_count,
            "status": self.status,
        }
        target.with_suffix(".ocr.json").write_text(
            json.dumps(payload, indent=2), encoding="utf-8"
        )
@@ -0,0 +1,15 @@
1
+ from pydantic_settings import BaseSettings
2
+ import os
3
+
4
+
5
class Settings(BaseSettings):
    """SDK runtime configuration.

    Field values can be overridden via environment variables of the same
    name (pydantic-settings behavior). The CLI also mutates
    ``DATALAB_API_KEY`` / ``DATALAB_HOST`` at runtime before constructing
    clients.
    """

    # Paths
    # Package root: two directory levels above this file.
    BASE_DIR: str = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    LOGLEVEL: str = "DEBUG"

    # Base settings
    # None until supplied via the environment or assigned by the CLI.
    DATALAB_API_KEY: str | None = None
    DATALAB_HOST: str = "https://www.datalab.to"


# Shared singleton imported throughout the SDK.
settings = Settings()