datalab-python-sdk 0.1.6__tar.gz → 0.1.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/PKG-INFO +1 -1
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/datalab_sdk/cli.py +25 -3
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/datalab_sdk/client.py +1 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/datalab_sdk/models.py +6 -1
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/pyproject.toml +1 -1
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/tests/test_client_methods.py +6 -1
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/uv.lock +2 -2
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/.github/workflows/ci.yml +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/.github/workflows/publish.yml +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/.gitignore +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/.pre-commit-config.yaml +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/.python-version +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/LICENSE +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/README.md +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/data/08-Lambda-Calculus.pptx +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/data/adversarial.pdf +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/data/bid_evaluation.docx +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/data/book_review.ppt +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/data/book_store.xls +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/data/chi_hind.png +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/data/how_to_read.doc +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/data/normandy.epub +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/data/sample-1-sheet.xlsx +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/data/thinkpython.pdf +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/data/vibe.html +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/datalab_sdk/__init__.py +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/datalab_sdk/exceptions.py +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/datalab_sdk/mimetypes.py +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/datalab_sdk/settings.py +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/integration/README.md +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/integration/__init__.py +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/integration/test_live_api.py +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/integration/test_readme_examples.py +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/poetry.lock +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/pytest.ini +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/tests/__init__.py +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/tests/conftest.py +0 -0
- {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/tests/test_cli_simple.py +0 -0
|
@@ -42,6 +42,12 @@ def common_options(func):
|
|
|
42
42
|
func = click.option("--skip_cache", help="Skip the cache when running inference")(
|
|
43
43
|
func
|
|
44
44
|
)
|
|
45
|
+
func = click.option(
|
|
46
|
+
"--max_polls", default=300, type=int, help="Maximum number of polling attempts"
|
|
47
|
+
)(func)
|
|
48
|
+
func = click.option(
|
|
49
|
+
"--poll_interval", default=1, type=int, help="Polling interval in seconds"
|
|
50
|
+
)(func)
|
|
45
51
|
return func
|
|
46
52
|
|
|
47
53
|
|
|
@@ -51,7 +57,7 @@ def marker_options(func):
|
|
|
51
57
|
"--format",
|
|
52
58
|
"output_format",
|
|
53
59
|
default="markdown",
|
|
54
|
-
type=click.Choice(["markdown", "html", "json"]),
|
|
60
|
+
type=click.Choice(["markdown", "html", "json", "chunks"]),
|
|
55
61
|
help="Output format",
|
|
56
62
|
)(func)
|
|
57
63
|
func = click.option("--force_ocr", is_flag=True, help="Force OCR on every page")(
|
|
@@ -106,6 +112,8 @@ async def process_files_async(
|
|
|
106
112
|
max_concurrent: int = 5,
|
|
107
113
|
api_key: str | None = None,
|
|
108
114
|
base_url: str | None = None,
|
|
115
|
+
max_polls: int = 300,
|
|
116
|
+
poll_interval: int = 1,
|
|
109
117
|
) -> List[dict]:
|
|
110
118
|
"""Process files asynchronously"""
|
|
111
119
|
semaphore = asyncio.Semaphore(max_concurrent)
|
|
@@ -124,11 +132,13 @@ async def process_files_async(
|
|
|
124
132
|
) as client:
|
|
125
133
|
if method == "convert":
|
|
126
134
|
result = await client.convert(
|
|
127
|
-
file_path, options=options, save_output=output_path
|
|
135
|
+
file_path, options=options, save_output=output_path,
|
|
136
|
+
max_polls=max_polls, poll_interval=poll_interval
|
|
128
137
|
)
|
|
129
138
|
else: # method == 'ocr'
|
|
130
139
|
result = await client.ocr(
|
|
131
|
-
file_path, options=options, save_output=output_path
|
|
140
|
+
file_path, options=options, save_output=output_path,
|
|
141
|
+
max_polls=max_polls, poll_interval=poll_interval
|
|
132
142
|
)
|
|
133
143
|
|
|
134
144
|
return {
|
|
@@ -218,6 +228,8 @@ def process_documents(
|
|
|
218
228
|
base_url: str,
|
|
219
229
|
page_range: Optional[str],
|
|
220
230
|
skip_cache: bool,
|
|
231
|
+
max_polls: int,
|
|
232
|
+
poll_interval: int,
|
|
221
233
|
# Convert-specific options
|
|
222
234
|
output_format: Optional[str] = None,
|
|
223
235
|
force_ocr: bool = False,
|
|
@@ -288,6 +300,8 @@ def process_documents(
|
|
|
288
300
|
max_concurrent=max_concurrent,
|
|
289
301
|
api_key=api_key,
|
|
290
302
|
base_url=base_url,
|
|
303
|
+
max_polls=max_polls,
|
|
304
|
+
poll_interval=poll_interval,
|
|
291
305
|
)
|
|
292
306
|
)
|
|
293
307
|
|
|
@@ -320,6 +334,8 @@ def convert(
|
|
|
320
334
|
base_url: str,
|
|
321
335
|
page_range: Optional[str],
|
|
322
336
|
skip_cache: bool,
|
|
337
|
+
max_polls: int,
|
|
338
|
+
poll_interval: int,
|
|
323
339
|
output_format: str,
|
|
324
340
|
force_ocr: bool,
|
|
325
341
|
format_lines: bool,
|
|
@@ -342,6 +358,8 @@ def convert(
|
|
|
342
358
|
base_url=base_url,
|
|
343
359
|
page_range=page_range,
|
|
344
360
|
skip_cache=skip_cache,
|
|
361
|
+
max_polls=max_polls,
|
|
362
|
+
poll_interval=poll_interval,
|
|
345
363
|
output_format=output_format,
|
|
346
364
|
force_ocr=force_ocr,
|
|
347
365
|
format_lines=format_lines,
|
|
@@ -367,6 +385,8 @@ def ocr(
|
|
|
367
385
|
base_url: str,
|
|
368
386
|
page_range: Optional[str],
|
|
369
387
|
skip_cache: bool,
|
|
388
|
+
max_polls: int,
|
|
389
|
+
poll_interval: int,
|
|
370
390
|
):
|
|
371
391
|
"""Perform OCR on documents"""
|
|
372
392
|
process_documents(
|
|
@@ -380,6 +400,8 @@ def ocr(
|
|
|
380
400
|
base_url=base_url,
|
|
381
401
|
page_range=page_range,
|
|
382
402
|
skip_cache=skip_cache,
|
|
403
|
+
max_polls=max_polls,
|
|
404
|
+
poll_interval=poll_interval,
|
|
383
405
|
)
|
|
384
406
|
|
|
385
407
|
|
|
@@ -249,6 +249,7 @@ class AsyncDatalabClient:
|
|
|
249
249
|
markdown=result_data.get("markdown"),
|
|
250
250
|
html=result_data.get("html"),
|
|
251
251
|
json=result_data.get("json"),
|
|
252
|
+
chunks=result_data.get("chunks"),
|
|
252
253
|
extraction_schema_json=result_data.get("extraction_schema_json"),
|
|
253
254
|
images=result_data.get("images"),
|
|
254
255
|
metadata=result_data.get("metadata"),
|
|
@@ -13,7 +13,7 @@ import base64
|
|
|
13
13
|
class ProcessingOptions:
|
|
14
14
|
# Common options
|
|
15
15
|
max_pages: Optional[int] = None
|
|
16
|
-
skip_cache: bool =
|
|
16
|
+
skip_cache: bool = False
|
|
17
17
|
page_range: Optional[str] = None
|
|
18
18
|
|
|
19
19
|
def to_form_data(self) -> Dict[str, Any]:
|
|
@@ -65,6 +65,7 @@ class ConversionResult:
|
|
|
65
65
|
markdown: Optional[str] = None
|
|
66
66
|
html: Optional[str] = None
|
|
67
67
|
json: Optional[Dict[str, Any]] = None
|
|
68
|
+
chunks: Optional[Dict[str, Any]] = None
|
|
68
69
|
extraction_schema_json: Optional[str] = None
|
|
69
70
|
images: Optional[Dict[str, str]] = None
|
|
70
71
|
metadata: Optional[Dict[str, Any]] = None
|
|
@@ -91,6 +92,10 @@ class ConversionResult:
|
|
|
91
92
|
with open(output_path.with_suffix(".json"), "w", encoding="utf-8") as f:
|
|
92
93
|
json.dump(self.json, f, indent=2)
|
|
93
94
|
|
|
95
|
+
if self.chunks:
|
|
96
|
+
with open(output_path.with_suffix(".chunks.json"), "w", encoding="utf-8") as f:
|
|
97
|
+
json.dump(self.chunks, f, indent=2)
|
|
98
|
+
|
|
94
99
|
if self.extraction_schema_json:
|
|
95
100
|
with open(
|
|
96
101
|
output_path.with_suffix("_extraction_results.json"),
|
|
@@ -7,7 +7,7 @@ readme = "README.md"
|
|
|
7
7
|
license = "MIT"
|
|
8
8
|
repository = "https://github.com/datalab-to/sdk"
|
|
9
9
|
keywords = ["datalab", "sdk", "document-intelligence", "api"]
|
|
10
|
-
version = "0.1.6"
|
|
10
|
+
version = "0.1.8"
|
|
11
11
|
description = "SDK for the Datalab document intelligence API"
|
|
12
12
|
requires-python = ">=3.10"
|
|
13
13
|
dependencies = [
|
|
@@ -89,6 +89,7 @@ class TestConvertMethod:
|
|
|
89
89
|
"markdown": "# Test Document\n\nThis is a test document.",
|
|
90
90
|
"html": None,
|
|
91
91
|
"json": None,
|
|
92
|
+
"chunks": {"some_content": True},
|
|
92
93
|
"images": {},
|
|
93
94
|
"metadata": {"pages": 1},
|
|
94
95
|
"error": "",
|
|
@@ -114,13 +115,17 @@ class TestConvertMethod:
|
|
|
114
115
|
# Verify result
|
|
115
116
|
assert result.success is True
|
|
116
117
|
|
|
117
|
-
# Verify file was saved
|
|
118
|
+
# Verify Markdown file was saved
|
|
118
119
|
assert (output_path.with_suffix(".md")).exists()
|
|
119
120
|
saved_content = (output_path.with_suffix(".md")).read_text()
|
|
120
121
|
assert (
|
|
121
122
|
saved_content == "# Test Document\n\nThis is a test document."
|
|
122
123
|
)
|
|
123
124
|
|
|
125
|
+
assert (output_path.with_suffix(".chunks.json")).exists()
|
|
126
|
+
saved_chunks = json.loads((output_path.with_suffix(".chunks.json")).read_text())
|
|
127
|
+
assert saved_chunks == {"some_content": True}
|
|
128
|
+
|
|
124
129
|
def test_convert_sync_with_processing_options(self, temp_dir):
|
|
125
130
|
"""Test synchronous conversion with processing options"""
|
|
126
131
|
# Create test file
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
version = 1
|
|
2
|
-
revision = 2
|
|
2
|
+
revision = 3
|
|
3
3
|
requires-python = ">=3.10"
|
|
4
4
|
|
|
5
5
|
[[package]]
|
|
@@ -169,7 +169,7 @@ wheels = [
|
|
|
169
169
|
|
|
170
170
|
[[package]]
|
|
171
171
|
name = "datalab-python-sdk"
|
|
172
|
-
version = "0.1.6"
|
|
172
|
+
version = "0.1.8"
|
|
173
173
|
source = { editable = "." }
|
|
174
174
|
dependencies = [
|
|
175
175
|
{ name = "aiohttp" },
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|