PyPI - datalab-python-sdk - Versions diffs - 0.2.0__tar.gz → 0.2.2__tar.gz - Mend

datalab-python-sdk 0.2.0 (tar.gz) → 0.2.2 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52) hide show

{datalab_python_sdk-0.2.0 → datalab_python_sdk-0.2.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datalab-python-sdk
-Version: 0.2.0
+Version: 0.2.2
 Summary: SDK for the Datalab document intelligence API
 Author-email: Datalab Team <hi@datalab.to>
 License-Expression: MIT
@@ -12,6 +12,7 @@ Requires-Dist: click>=8.2.1
 Requires-Dist: pydantic-settings>=2.10.1
 Requires-Dist: pydantic>=2.11.7
 Requires-Dist: tenacity>=8.2.3
+Requires-Dist: tqdm>=4.66.0
 Description-Content-Type: text/markdown
 # Datalab SDK
@@ -54,7 +55,7 @@ Workflows allow you to chain multiple document processing steps together. Each w
 **Note:** All workflow operations require authentication. Make sure you have set your `DATALAB_API_KEY` environment variable or pass `api_key` when creating the client (see [Authentication](#authentication) section above).
-For more Workflow tips, see our [examples here](./examples/README.md) and [documentation](https://documentation.datalab.to/docs/recipes/workflows/workflow-concepts).
+For more Workflow tips, see our [documentation](https://documentation.datalab.to/docs/recipes/workflows/workflow-concepts).
 ## CLI Usage
@@ -64,9 +65,6 @@ The SDK includes a command-line interface:
 # Convert document to markdown
 datalab convert document.pdf
-# OCR with JSON output
-datalab ocr document.pdf --output-format json
 # Workflow commands
 datalab create-workflow --help
 datalab execute-workflow --help

{datalab_python_sdk-0.2.0 → datalab_python_sdk-0.2.2}/README.md RENAMED Viewed

@@ -38,7 +38,7 @@ Workflows allow you to chain multiple document processing steps together. Each w
 **Note:** All workflow operations require authentication. Make sure you have set your `DATALAB_API_KEY` environment variable or pass `api_key` when creating the client (see [Authentication](#authentication) section above).
-For more Workflow tips, see our [examples here](./examples/README.md) and [documentation](https://documentation.datalab.to/docs/recipes/workflows/workflow-concepts).
+For more Workflow tips, see our [documentation](https://documentation.datalab.to/docs/recipes/workflows/workflow-concepts).
 ## CLI Usage
@@ -48,9 +48,6 @@ The SDK includes a command-line interface:
 # Convert document to markdown
 datalab convert document.pdf
-# OCR with JSON output
-datalab ocr document.pdf --output-format json
 # Workflow commands
 datalab create-workflow --help
 datalab execute-workflow --help

{datalab_python_sdk-0.2.0 → datalab_python_sdk-0.2.2}/datalab_sdk/cli.py RENAMED Viewed

@@ -9,6 +9,7 @@ import asyncio
 from pathlib import Path
 from typing import Optional, List
 import click
+from tqdm import tqdm
 from datalab_sdk.client import AsyncDatalabClient, DatalabClient
 from datalab_sdk.mimetypes import SUPPORTED_EXTENSIONS
@@ -122,6 +123,25 @@ async def process_files_async(
     """Process files asynchronously"""
     semaphore = asyncio.Semaphore(max_concurrent)
+    async def call_api(client, file_path, output_path):
+        """Make API call - client handles retries for rate limits"""
+        if method == "convert":
+            return await client.convert(
+                file_path,
+                options=options,
+                save_output=output_path,
+                max_polls=max_polls,
+                poll_interval=poll_interval,
+            )
+        else:  # method == 'ocr'
+            return await client.ocr(
+                file_path,
+                options=options,
+                save_output=output_path,
+                max_polls=max_polls,
+                poll_interval=poll_interval,
+            )
     async def process_single_file(file_path: Path) -> dict:
         async with semaphore:
             try:
@@ -134,22 +154,7 @@ async def process_files_async(
                 async with AsyncDatalabClient(
                     api_key=api_key, base_url=base_url
                 ) as client:
-                    if method == "convert":
-                        result = await client.convert(
-                            file_path,
-                            options=options,
-                            save_output=output_path,
-                            max_polls=max_polls,
-                            poll_interval=poll_interval,
-                        )
-                    else:  # method == 'ocr'
-                        result = await client.ocr(
-                            file_path,
-                            options=options,
-                            save_output=output_path,
-                            max_polls=max_polls,
-                            poll_interval=poll_interval,
-                        )
+                    result = await call_api(client, file_path, output_path)
                 return {
                     "file_path": str(file_path),
@@ -167,9 +172,19 @@ async def process_files_async(
                     "page_count": None,
                 }
-    # Process all files concurrently
-    tasks = [process_single_file(file_path) for file_path in files]
-    results = await asyncio.gather(*tasks)
+    # Process all files concurrently with progress bar
+    tasks = [asyncio.create_task(process_single_file(file_path)) for file_path in files]
+    results = []
+    with tqdm(total=len(tasks), desc="Processing", unit="file") as pbar:
+        for coro in asyncio.as_completed(tasks):
+            result = await coro
+            results.append(result)
+            # Update progress bar description with current file
+            filename = Path(result["file_path"]).name
+            status = "✓" if result["success"] else "✗"
+            pbar.set_postfix_str(f"{status} {filename[:30]}")
+            pbar.update(1)
     return results

{datalab_python_sdk-0.2.0 → datalab_python_sdk-0.2.2}/datalab_sdk/client.py RENAMED Viewed

@@ -112,7 +112,9 @@ class AsyncDatalabClient:
             try:
                 error_data = await response.json()
                 # FastAPI returns errors in "detail" field, but some APIs use "error"
-                error_message = error_data.get("detail") or error_data.get("error") or str(e)
+                error_message = (
+                    error_data.get("detail") or error_data.get("error") or str(e)
+                )
             except Exception:
                 error_message = str(e)
             raise DatalabAPIError(
@@ -123,6 +125,19 @@ class AsyncDatalabClient:
         except aiohttp.ClientError as e:
             raise DatalabAPIError(f"Request failed: {str(e)}")
+    @retry(
+        retry=retry_if_exception(
+            lambda e: isinstance(e, DatalabAPIError)
+            and getattr(e, "status_code", None) == 429
+        ),
+        stop=stop_after_attempt(10),
+        wait=wait_exponential_jitter(initial=5, max=120),
+        reraise=True,
+    )
+    async def _submit_with_retry(self, endpoint: str, data) -> Dict[str, Any]:
+        """POST submission with retry for rate limits (429)"""
+        return await self._make_request("POST", endpoint, data=data)
     async def _poll_result(
         self, check_url: str, max_polls: int = 300, poll_interval: int = 1
     ) -> Dict[str, Any]:
@@ -168,8 +183,8 @@ class AsyncDatalabClient:
                 )
             )
         ),
-        stop=stop_after_attempt(2),
-        wait=wait_exponential_jitter(max=0.5),
+        stop=stop_after_attempt(10),
+        wait=wait_exponential_jitter(initial=5, max=120),
         reraise=True,
     )
     async def _poll_get_with_retry(self, url: str) -> Dict[str, Any]:
@@ -185,7 +200,7 @@ class AsyncDatalabClient:
         # Read file content
         file_data = file_path.read_bytes()
         # Check if file is empty
         if not file_data:
             raise DatalabFileError(
@@ -252,8 +267,7 @@ class AsyncDatalabClient:
         if options is None:
             options = ConvertOptions()
-        initial_data = await self._make_request(
-            "POST",
+        initial_data = await self._submit_with_retry(
             "/api/v1/marker",
             data=self.get_form_params(
                 file_path=file_path, file_url=file_url, options=options
@@ -283,6 +297,7 @@ class AsyncDatalabClient:
             images=result_data.get("images"),
             metadata=result_data.get("metadata"),
             error=result_data.get("error"),
+            error_in=result_data.get("error_in"),
             page_count=result_data.get("page_count"),
             status=result_data.get("status", "complete"),
             checkpoint_id=result_data.get("checkpoint_id"),
@@ -312,8 +327,7 @@ class AsyncDatalabClient:
         if options is None:
             options = OCROptions()
-        initial_data = await self._make_request(
-            "POST",
+        initial_data = await self._submit_with_retry(
             "/api/v1/ocr",
             data=self.get_form_params(file_path=file_path, options=options),
         )
@@ -370,8 +384,7 @@ class AsyncDatalabClient:
         if options is None:
             raise ValueError("options must be provided with field_data")
-        initial_data = await self._make_request(
-            "POST",
+        initial_data = await self._submit_with_retry(
             "/api/v1/fill",
             data=self.get_form_params(
                 file_path=file_path, file_url=file_url, options=options
@@ -393,6 +406,7 @@ class AsyncDatalabClient:
             status=result_data.get("status", "complete"),
             success=result_data.get("success"),
             error=result_data.get("error"),
+            error_in=result_data.get("error_in"),
             output_format=result_data.get("output_format"),
             output_base64=result_data.get("output_base64"),
             fields_filled=result_data.get("fields_filled"),
@@ -570,7 +584,9 @@ class AsyncDatalabClient:
         return {
             "success": response.get("success", True),
-            "message": response.get("message", f"Workflow {workflow_id} deleted successfully"),
+            "message": response.get(
+                "message", f"Workflow {workflow_id} deleted successfully"
+            ),
         }
     async def execute_workflow(
@@ -1236,9 +1252,7 @@ class DatalabClient:
         Returns:
             UploadedFileMetadata object with file information
         """
-        return self._run_async(
-            self._async_client.get_file_metadata(file_id=file_id)
-        )
+        return self._run_async(self._async_client.get_file_metadata(file_id=file_id))
     def get_file_download_url(
         self,
@@ -1260,7 +1274,9 @@ class DatalabClient:
                 - original_filename: Original filename
         """
         return self._run_async(
-            self._async_client.get_file_download_url(file_id=file_id, expires_in=expires_in)
+            self._async_client.get_file_download_url(
+                file_id=file_id, expires_in=expires_in
+            )
         )
     def delete_file(
@@ -1280,9 +1296,7 @@ class DatalabClient:
                 - success: Whether the deletion was successful
                 - message: Confirmation message
         """
-        return self._run_async(
-            self._async_client.delete_file(file_id=file_id)
-        )
+        return self._run_async(self._async_client.delete_file(file_id=file_id))
     def delete_workflow(self, workflow_id: int) -> Dict[str, Any]:
         """

{datalab_python_sdk-0.2.0 → datalab_python_sdk-0.2.2}/datalab_sdk/models.py RENAMED Viewed

@@ -3,7 +3,7 @@ Datalab SDK data models
 """
 from dataclasses import dataclass, field
-from typing import Dict, List, Optional, Any, Union
+from typing import Dict, List, Optional, Any, Union, Literal
 from pathlib import Path
 import json
 import base64
@@ -41,19 +41,20 @@ class ConvertOptions(ProcessingOptions):
     paginate: bool = False
     disable_image_extraction: bool = False
     disable_image_captions: bool = False
+    fence_synthetic_captions: bool = False
     additional_config: Optional[Dict[str, Any]] = None
     page_schema: Optional[Dict[str, Any]] = None
     segmentation_schema: Optional[str] = None  # JSON string for document segmentation
     save_checkpoint: bool = False
-    extras: Optional[str] = (
-        None  # Comma-separated list: 'track_changes', 'chart_understanding'
-    )
     output_format: str = "markdown"  # markdown, json, html, chunks
     mode: str = "balanced"  # fast, balanced, accurate
     keep_spreadsheet_formatting: bool = False
     webhook_url: Optional[str] = None
-    extras: Optional[str] = None  # comma-separated extras
+    # Comma-separated list of extra features: 'track_changes', 'chart_understanding',
+    # 'table_row_bboxes', 'extract_links', 'infographic', 'new_block_types'
+    extras: Optional[str] = None
     add_block_ids: bool = False  # add block IDs to HTML output
+    include_markdown_in_chunks: bool = False  # include markdown field in chunks/JSON output
     def to_form_data(self) -> Dict[str, Any]:
         """Convert to form data format for API requests"""
@@ -121,6 +122,7 @@ class ConversionResult:
     images: Optional[Dict[str, str]] = None
     metadata: Optional[Dict[str, Any]] = None
     error: Optional[str] = None
+    error_in: Optional[Literal["VALIDATION", "INFERENCE", "OTHER"]] = None
     page_count: Optional[int] = None
     status: str = "complete"
     checkpoint_id: Optional[str] = None
@@ -375,6 +377,7 @@ class FormFillingResult:
     status: str
     success: Optional[bool] = None
     error: Optional[str] = None
+    error_in: Optional[Literal["VALIDATION", "INFERENCE", "OTHER"]] = None
     output_format: Optional[str] = None  # "pdf" or "png"
     output_base64: Optional[str] = None  # Base64-encoded filled form
     fields_filled: Optional[List[str]] = (

{datalab_python_sdk-0.2.0 → datalab_python_sdk-0.2.2}/pyproject.toml RENAMED Viewed

@@ -7,7 +7,7 @@ readme = "README.md"
 license = "MIT"
 repository = "https://github.com/datalab-to/sdk"
 keywords = ["datalab", "sdk", "document-intelligence", "api"]
-version = "0.2.0"
+version = "0.2.2"
 description = "SDK for the Datalab document intelligence API"
 requires-python = ">=3.10"
 dependencies = [
@@ -16,6 +16,7 @@ dependencies = [
     "pydantic>=2.11.7",
     "pydantic-settings>=2.10.1",
     "tenacity>=8.2.3",
+    "tqdm>=4.66.0",
 ]
 [project.scripts]

{datalab_python_sdk-0.2.0 → datalab_python_sdk-0.2.2}/tests/test_client_methods.py RENAMED Viewed

@@ -504,13 +504,13 @@ class TestClientErrorHandling:
             with patch.object(
                 client, "_make_request", new_callable=AsyncMock
             ) as mock_request:
-                # Setup mock to raise API error
+                # Setup mock to raise API error (use 400 since 429 is retried)
                 mock_request.side_effect = DatalabAPIError(
-                    "API rate limit exceeded", status_code=429
+                    "Bad request", status_code=400
                 )
                 # Test that error is propagated
-                with pytest.raises(DatalabAPIError, match="API rate limit exceeded"):
+                with pytest.raises(DatalabAPIError, match="Bad request"):
                     await client.ocr(pdf_file)
     def test_convert_unsuccessful_response(self, temp_dir):

{datalab_python_sdk-0.2.0 → datalab_python_sdk-0.2.2}/uv.lock RENAMED Viewed

@@ -212,7 +212,7 @@ wheels = [
 [[package]]
 name = "datalab-python-sdk"
-version = "0.2.0"
+version = "0.2.2"
 source = { editable = "." }
 dependencies = [
     { name = "aiohttp" },
@@ -220,6 +220,7 @@ dependencies = [
     { name = "pydantic" },
     { name = "pydantic-settings" },
     { name = "tenacity" },
+    { name = "tqdm" },
 ]
 [package.dev-dependencies]
@@ -240,6 +241,7 @@ requires-dist = [
     { name = "pydantic", specifier = ">=2.11.7" },
     { name = "pydantic-settings", specifier = ">=2.10.1" },
     { name = "tenacity", specifier = ">=8.2.3" },
+    { name = "tqdm", specifier = ">=4.66.0" },
 ]
 [package.metadata.requires-dev]
@@ -1102,6 +1104,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/77/b8/0135fadc89e73be292b473cb820b4f5a08197779206b33191e801feeae40/tomli-2.3.0-py3-none-any.whl", hash = "sha256:e95b1af3c5b07d9e643909b5abbec77cd9f1217e6d0bca72b0234736b9fb1f1b", size = 14408, upload-time = "2025-10-08T22:01:46.04Z" },
 ]
+[[package]]
+name = "tqdm"
+version = "4.67.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama", marker = "sys_platform == 'win32'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/09/a9/6ba95a270c6f1fbcd8dac228323f2777d886cb206987444e4bce66338dd4/tqdm-4.67.3.tar.gz", hash = "sha256:7d825f03f89244ef73f1d4ce193cb1774a8179fd96f31d7e1dcde62092b960bb", size = 169598, upload-time = "2026-02-03T17:35:53.048Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/16/e1/3079a9ff9b8e11b846c6ac5c8b5bfb7ff225eee721825310c91b3b50304f/tqdm-4.67.3-py3-none-any.whl", hash = "sha256:ee1e4c0e59148062281c49d80b25b67771a127c85fc9676d3be5f243206826bf", size = 78374, upload-time = "2026-02-03T17:35:50.982Z" },
+]
 [[package]]
 name = "typing-extensions"
 version = "4.15.0"