PyPI - datalab-python-sdk - Versions diffs - 0.1.10__tar.gz → 0.1.11__tar.gz - Mend

datalab-python-sdk 0.1.10.tar.gz → 0.1.11.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52) hide show

{datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datalab-python-sdk
-Version: 0.1.10
+Version: 0.1.11
 Summary: SDK for the Datalab document intelligence API
 Author-email: Datalab Team <hi@datalab.to>
 License-Expression: MIT

{datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/datalab_sdk/cli.py RENAMED Viewed

@@ -197,7 +197,7 @@ def get_files_to_process(
     if path.is_file():
         # Single file processing
         if file_extensions and path.suffix.lower() not in file_extensions:
-            click.echo(f"❌ Skipping {path}: unsupported file type", err=True)
+            click.echo(f"Skipping {path}: unsupported file type", err=True)
             sys.exit(1)
         return [path]
     else:
@@ -210,10 +210,10 @@ def show_results(results: List[dict], operation: str, output_dir: Path):
     successful = sum(1 for r in results if r["success"])
     failed = len(results) - successful
-    click.echo(f"\n📊 {operation} Summary:")
-    click.echo(f"   ✅ Successfully processed: {successful} files")
+    click.echo(f"\n{operation} Summary:")
+    click.echo(f"   Successfully processed: {successful} files")
     if failed > 0:
-        click.echo(f"   ❌ Failed: {failed} files")
+        click.echo(f"   Failed: {failed} files")
         # Show failed files
         click.echo("\n   Failed files:")
@@ -221,7 +221,7 @@ def show_results(results: List[dict], operation: str, output_dir: Path):
             if not result["success"]:
                 click.echo(f"      - {result['file_path']}: {result['error']}")
-    click.echo(f"\n📁 Output saved to: {output_dir}")
+    click.echo(f"\nOutput saved to: {output_dir}")
 def process_documents(
@@ -270,10 +270,10 @@ def process_documents(
         to_process = get_files_to_process(path, file_extensions)
         if not to_process:
-            click.echo(f"❌ No supported files found in {path}", err=True)
+            click.echo(f"No supported files found in {path}", err=True)
             sys.exit(1)
-        click.echo(f"📂 Found {len(to_process)} files to process")
+        click.echo(f"Found {len(to_process)} files to process")
         # Create processing options based on method
         if method == "convert":
@@ -317,7 +317,7 @@ def process_documents(
         show_results(results, operation, output_dir)
     except DatalabError as e:
-        click.echo(f"❌ Error: {e}", err=True)
+        click.echo(f"Error: {e}", err=True)
         sys.exit(1)
@@ -465,17 +465,17 @@ def create_workflow(
             name=name, team_id=team_id, steps=workflow_steps
         )
-        click.echo(f"✅ Workflow created successfully!")
+        click.echo(f"Workflow created successfully!")
         click.echo(f"   ID: {workflow.id}")
         click.echo(f"   Name: {workflow.name}")
         click.echo(f"   Team ID: {workflow.team_id}")
         click.echo(f"   Steps: {len(workflow.steps)}")
     except DatalabError as e:
-        click.echo(f"❌ Error: {e}", err=True)
+        click.echo(f"Error: {e}", err=True)
         sys.exit(1)
     except Exception as e:
-        click.echo(f"❌ Error: {e}", err=True)
+        click.echo(f"Error: {e}", err=True)
         sys.exit(1)
@@ -497,7 +497,7 @@ def get_workflow(workflow_id: int, api_key: Optional[str], base_url: str):
         client = DatalabClient(api_key=api_key, base_url=base_url)
         workflow = client.get_workflow(workflow_id)
-        click.echo(f"📋 Workflow Details:")
+        click.echo(f"Workflow Details:")
         click.echo(f"   ID: {workflow.id}")
         click.echo(f"   Name: {workflow.name}")
         click.echo(f"   Team ID: {workflow.team_id}")
@@ -512,7 +512,48 @@ def get_workflow(workflow_id: int, api_key: Optional[str], base_url: str):
                 click.echo(f"      Depends on: {', '.join(step.depends_on)}")
     except DatalabError as e:
-        click.echo(f"❌ Error: {e}", err=True)
+        click.echo(f"Error: {e}", err=True)
+        sys.exit(1)
+@click.command()
+@click.option("--api_key", required=False, help="Datalab API key")
+@click.option("--base_url", default=settings.DATALAB_HOST, help="API base URL")
+def get_step_types(api_key: Optional[str], base_url: str):
+    """Get all available workflow step types"""
+    try:
+        if api_key is None:
+            api_key = settings.DATALAB_API_KEY
+        if api_key is None:
+            raise DatalabError(
+                "You must either pass in an api key via --api_key or set the DATALAB_API_KEY env variable."
+            )
+        client = DatalabClient(api_key=api_key, base_url=base_url)
+        response = client.get_step_types()
+        step_types = response.get("step_types", [])
+        if not step_types:
+            click.echo("No step types found.")
+            return
+        click.echo(f"Found {len(step_types)} step type(s):\n")
+        for step_type in step_types:
+            click.echo("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
+            click.echo(f"Key:         {step_type.get('type')}")
+            click.echo(f"Version:     {step_type.get('version')}")
+            click.echo(f"Name:        {step_type.get('name')}")
+            if step_type.get("description"):
+                click.echo(f"Description: {step_type['description']}")
+            if step_type.get("settings_schema"):
+                click.echo("\nSettings Schema:")
+                click.echo(json.dumps(step_type["settings_schema"], indent=2))
+            click.echo("")
+    except DatalabError as e:
+        click.echo(f"Error: {e}", err=True)
         sys.exit(1)
@@ -537,7 +578,7 @@ def list_workflows(api_key: Optional[str], base_url: str):
             click.echo("No workflows found.")
             return
-        click.echo(f"📋 Found {len(workflows)} workflow(s):\n")
+        click.echo(f"Found {len(workflows)} workflow(s):\n")
         for workflow in workflows:
             click.echo(f"   ID: {workflow.id}")
             click.echo(f"   Name: {workflow.name}")
@@ -547,7 +588,7 @@ def list_workflows(api_key: Optional[str], base_url: str):
             click.echo("")
     except DatalabError as e:
-        click.echo(f"❌ Error: {e}", err=True)
+        click.echo(f"Error: {e}", err=True)
         sys.exit(1)
@@ -595,25 +636,25 @@ def execute_workflow(
         client = DatalabClient(api_key=api_key, base_url=base_url)
-        click.echo(f"🚀 Triggering workflow execution for workflow {workflow_id}...")
+        click.echo(f"Triggering workflow execution for workflow {workflow_id}...")
         execution = client.execute_workflow(
             workflow_id=workflow_id,
             input_config=input_cfg,
         )
-        click.echo(f"\n✅ Successfully triggered workflow execution!")
+        click.echo(f"\nSuccessfully triggered workflow execution!")
         click.echo(f"   Execution ID: {execution.id}")
         click.echo(f"   Status: {execution.status}")
-        click.echo(f"\n💡 To check the status, run:")
+        click.echo(f"\nTo check the status, run:")
         click.echo(f"   datalab get-execution-status --execution_id {execution.id}")
         click.echo(f"\n   Or poll until complete:")
         click.echo(f"   datalab get-execution-status --execution_id {execution.id} --max_polls 300 --poll_interval 2")
     except DatalabError as e:
-        click.echo(f"❌ Error: {e}", err=True)
+        click.echo(f"Error: {e}", err=True)
         sys.exit(1)
     except Exception as e:
-        click.echo(f"❌ Error: {e}", err=True)
+        click.echo(f"Error: {e}", err=True)
         sys.exit(1)
@@ -663,7 +704,7 @@ def get_execution_status(
             download_results=download,
         )
-        click.echo(f"📊 Execution Status:")
+        click.echo(f"Execution Status:")
         click.echo(f"   Execution ID: {execution.id}")
         click.echo(f"   Workflow ID: {execution.workflow_id}")
         click.echo(f"   Status: {execution.status}")
@@ -683,7 +724,7 @@ def get_execution_status(
                     if "output_url" in step_data and not download:
                         click.echo(f"      Status: {step_data.get('status', 'N/A')}")
                         click.echo(f"      Output URL: {step_data.get('output_url')}")
-                        click.echo(f"      💡 Use --download to fetch actual results")
+                        click.echo(f"      Use --download to fetch actual results")
                     else:
                         click.echo(f"      {json.dumps(step_data, indent=8)}")
                 else:
@@ -694,10 +735,10 @@ def get_execution_status(
             output_path = Path(output)
             output_path.parent.mkdir(parents=True, exist_ok=True)
             execution.save_output(output_path)
-            click.echo(f"\n📁 Results saved to: {output_path}")
+            click.echo(f"\nResults saved to: {output_path}")
     except DatalabError as e:
-        click.echo(f"❌ Error: {e}", err=True)
+        click.echo(f"Error: {e}", err=True)
         sys.exit(1)
@@ -722,7 +763,7 @@ def visualize_workflow(definition: str):
         steps = workflow_def.get("steps", [])
         if not steps:
-            click.echo("⚠️  No steps found in workflow definition")
+            click.echo("No steps found in workflow definition")
             return
         # Build dependency graph
@@ -761,13 +802,13 @@ def visualize_workflow(definition: str):
         click.echo(f"\nTotal steps: {len(steps)}")
     except json.JSONDecodeError as e:
-        click.echo(f"❌ Invalid JSON: {e}", err=True)
+        click.echo(f"Invalid JSON: {e}", err=True)
         sys.exit(1)
     except KeyError as e:
-        click.echo(f"❌ Missing required field in workflow definition: {e}", err=True)
+        click.echo(f"Missing required field in workflow definition: {e}", err=True)
         sys.exit(1)
     except Exception as e:
-        click.echo(f"❌ Error: {e}", err=True)
+        click.echo(f"Error: {e}", err=True)
         sys.exit(1)
@@ -814,6 +855,7 @@ cli.add_command(convert)
 cli.add_command(ocr)
 cli.add_command(create_workflow)
 cli.add_command(get_workflow)
+cli.add_command(get_step_types)
 cli.add_command(list_workflows)
 cli.add_command(execute_workflow)
 cli.add_command(get_execution_status)

{datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/datalab_sdk/client.py RENAMED Viewed

@@ -345,13 +345,10 @@ class AsyncDatalabClient:
         # Parse response into Workflow object
         workflow_steps = [
             WorkflowStep(
-                step_key=step["step_key"],
                 unique_name=step["unique_name"],
                 settings=step["settings"],
                 depends_on=step.get("depends_on", []),
-                id=step.get("id"),
-                version=step.get("version"),
-                name=step.get("name"),
+                id=step.get("id")
             )
             for step in response.get("steps", [])
         ]
@@ -402,6 +399,19 @@ class AsyncDatalabClient:
             updated=response.get("updated"),
         )
+    async def get_step_types(self) -> dict:
+        """
+        Get all available workflow step types
+        Returns:
+            Dictionary containing step_types list with their schemas
+        """
+        response = await self._make_request(
+            "GET",
+            "/api/v1/workflows/step-types",
+        )
+        return response
     async def list_workflows(self) -> list[Workflow]:
         """
         List all workflows for the authenticated user's team
@@ -694,6 +704,10 @@ class DatalabClient:
         """Get a workflow by ID (sync version)"""
         return self._run_async(self._async_client.get_workflow(workflow_id))
+    def get_step_types(self) -> dict:
+        """Get all available workflow step types (sync version)"""
+        return self._run_async(self._async_client.get_step_types())
     def list_workflows(self) -> list[Workflow]:
         """List all workflows (sync version)"""
         return self._run_async(self._async_client.list_workflows())

{datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/pyproject.toml RENAMED Viewed

@@ -7,7 +7,7 @@ readme = "README.md"
 license = "MIT"
 repository = "https://github.com/datalab-to/sdk"
 keywords = ["datalab", "sdk", "document-intelligence", "api"]
-version = "0.1.10"
+version = "0.1.11"
 description = "SDK for the Datalab document intelligence API"
 requires-python = ">=3.10"
 dependencies = [

{datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/recipes/workflows/end_to_end_workflow.py RENAMED Viewed

@@ -42,33 +42,6 @@ def load_workflow_definition(definition_path: str, replacements: dict = None) ->
     return workflow_def
-def create_simple_workflow_definition() -> dict:
-    """Create a simple default workflow definition"""
-    return {
-        "name": "Simple Parse and Extract",
-        "steps": [
-            {
-                "step_key": "marker_parse",
-                "unique_name": "parse_document",
-                "settings": {"max_pages": 10, "output_format": "json"},
-                "depends_on": []
-            },
-            {
-                "step_key": "marker_extract",
-                "unique_name": "extract_metadata",
-                "settings": {
-                    "page_schema": {
-                        "title": {"type": "string"},
-                        "author": {"type": "string"},
-                        "summary": {"type": "string"}
-                    }
-                },
-                "depends_on": ["parse_document"]
-            }
-        ]
-    }
 def main():
     parser = argparse.ArgumentParser(
         description="Run any workflow end-to-end from definition to results"
@@ -126,14 +99,10 @@ def main():
     # Step 1: Load or create workflow definition
     print("📄 Loading workflow definition...")
-    if args.definition:
-        # Build replacements dict
-        replacements = dict(args.replace) if args.replace else None
-        workflow_def = load_workflow_definition(args.definition, replacements)
-        print(f"   Source: {args.definition}")
-    else:
-        workflow_def = create_simple_workflow_definition()
-        print(f"   Source: Built-in simple workflow")
+    # Build replacements dict
+    replacements = dict(args.replace) if args.replace else None
+    workflow_def = load_workflow_definition(args.definition, replacements)
+    print(f"   Source: {args.definition}")
     print(f"   Name: {workflow_def['name']}")
     print(f"   Steps: {len(workflow_def['steps'])}\n")

{datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/recipes/workflows/workflow_definitions/README.md RENAMED Viewed

@@ -4,6 +4,7 @@ This directory contains JSON workflow definitions that can be loaded and execute
 ## Available Workflows
+- [Parse and Segment (Simple)](#parse-and-segment-simple) - Do a straightforward parse -> segment to run one or more files through
 - [Eval Segmentation Across Providers](#eval-segmentation-across-providers) - Compare Marker vs Reducto segmentation in parallel
 - [Parallel Extract Large SEC Filings](#parallel-extract-large-sec-filings) - Parse → Segment → Extract from multiple sections in parallel
 - [Slack Alert Workflow](#slack-alert-workflow) - Full pipeline with parallel extraction, aggregation, and Slack notification
@@ -33,6 +34,35 @@ For a full list of `settings` to use for `marker` related steps, visit our [API
 ## Available Workflow Definitions
+### Parse and Segment (Simple)
+**What it does:**
+Simple workflow that does `marker_parse` -> `marker_segment`. You can pass in one or more `file_urls` when triggering your workflow.
+Once you get results, you can process them to run your own custom evaluations.
+**Structure:**
+- **Marker branch**: Parse → Segment
+**Visualize:**
+```bash
+datalab visualize-workflow --definition recipes/workflows/workflow_definitions/parse_segment.json
+```
+**Execute:**
+```bash
+# Using end-to-end runner
+python recipes/workflows/end_to_end_workflow.py \
+    --definition recipes/workflows/workflow_definitions/parse_segment.json \
+    --file_url https://www.novonordisk.com/content/dam/nncorp/global/en/investors/irmaterial/annual_report/2024/novo-nordisk-form-20-f-2023.pdf
+# Or step-by-step
+python recipes/workflows/workflow_api_tutorial/create_workflow.py \
+    --definition recipes/workflows/workflow_definitions/parse_segment.json
+```
+---
 ### Eval Segmentation Across Providers
 **What it does:**
@@ -46,21 +76,21 @@ Once you get results, you can process them to run your own custom evaluations.
 **Visualize:**
 ```bash
-datalab visualize-workflow --definition workflow_definitions/eval_segmentation.json
+datalab visualize-workflow --definition recipes/workflows/workflow_definitions/eval_segmentation.json
 ```
 **Execute:**
 ```bash
 # Using end-to-end runner
 python recipes/workflows/end_to_end_workflow.py \
-    --definition workflow_definitions/eval_segmentation.json \
+    --definition recipes/workflows/workflow_definitions/eval_segmentation.json \
     --file_url https://example.com/doc.pdf \
     --replace YOUR_REDUCTO_API_KEY your_key_here \
     --save results.json
 # Or step-by-step
 python recipes/workflows/workflow_api_tutorial/create_workflow.py \
-    --definition workflow_definitions/eval_segmentation.json \
+    --definition recipes/workflows/workflow_definitions/eval_segmentation.json \
     --replace YOUR_REDUCTO_API_KEY your_key_here
 ```
@@ -86,19 +116,19 @@ Without this, you might have a long, dense schema that applies on the entire doc
 **Visualize:**
 ```bash
-datalab visualize-workflow --definition workflow_definitions/segment_parallel_extract.json
+datalab visualize-workflow --definition recipes/workflows/workflow_definitions/segment_parallel_extract.json
 ```
 **Execute:**
 ```bash
 # Using end-to-end runner
 python recipes/workflows/end_to_end_workflow.py \
-    --definition workflow_definitions/segment_parallel_extraction.json \
+    --definition recipes/workflows/workflow_definitions/segment_parallel_extract.json \
     --file_url https://www.novonordisk.com/content/dam/nncorp/global/en/investors/irmaterial/annual_report/2024/novo-nordisk-form-20-f-2023.pdf
 # Or step-by-step
 python recipes/workflows/workflow_api_tutorial/create_workflow.py \
-    --definition workflow_definitions/segment_parallel_extraction.json
+    --definition recipes/workflows/workflow_definitions/segment_parallel_extract.json
 ```
 ---
@@ -119,14 +149,14 @@ Complete pipeline that parses documents, segments into sections, extracts struct
 **Visualize:**
 ```bash
-datalab visualize-workflow --definition workflow_definitions/slack_alert.json
+datalab visualize-workflow --definition recipes/workflows/workflow_definitions/slack_alert.json
 ```
 **Execute:**
 ```bash
 # Using end-to-end runner with multiple files
 python recipes/workflows/end_to_end_workflow.py \
-    --definition workflow_definitions/slack_alert.json \
+    --definition recipes/workflows/workflow_definitions/slack_alert.json \
     --file_url https://www.novonordisk.com/content/dam/nncorp/global/en/investors/irmaterial/annual_report/2024/novo-nordisk-form-20-f-2023.pdf \
     --replace YOUR_SLACK_BOT_TOKEN xoxb-your-token \
     --replace YOUR_SLACK_CHANNEL_ID <YOUR_CHANNEL_ID> \
@@ -134,7 +164,7 @@ python recipes/workflows/end_to_end_workflow.py \
 # Or step-by-step
 python recipes/workflows/workflow_api_tutorial/create_workflow.py \
-    --definition workflow_definitions/slack_alert.json \
+    --definition recipes/workflows/workflow_definitions/slack_alert.json \
     --replace YOUR_SLACK_BOT_TOKEN xoxb-your-token \
     --replace YOUR_SLACK_CHANNEL_ID <YOUR_CHANNEL_ID>
 ```

datalab_python_sdk-0.1.11/recipes/workflows/workflow_definitions/parse_segment.json ADDED Viewed

@@ -0,0 +1,22 @@
+{
+  "name": "Parse and Segment",
+  "description": "Simple Parse and Segment Workflow",
+  "steps": [
+    {
+      "step_key": "marker_parse",
+      "unique_name": "marker_parse",
+      "settings": {},
+      "depends_on": []
+    },
+    {
+      "step_key": "marker_segment",
+      "unique_name": "marker_segment",
+      "settings": {
+        "segmentation_schema": {
+          "segmentation_strategy": "document_boundary"
+        }
+      },
+      "depends_on": ["marker_parse"]
+    }
+  ]
+}

{datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/recipes/workflows/workflow_definitions/slack_alert.json RENAMED Viewed

@@ -1,6 +1,6 @@
 {
-  "name": "Parse, Segment, Extract, Aggregate, then Post to Slack",
-  "description": "Full pipeline: Parse documents, segment into sections, extract data from multiple segments in parallel, aggregate results, and post to Slack",
+  "name": "Parse then Post to Slack",
+  "description": "Parse documents and post to Slack",
   "steps": [
     {
       "step_key": "marker_parse",

{datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/tests/test_cli_simple.py RENAMED Viewed

@@ -54,7 +54,7 @@ class TestConvertCommand:
                 )
                 assert result.exit_code == 0
-                assert "✅ Successfully processed" in result.output
+                assert "Successfully processed" in result.output
                 # Verify client was called correctly
                 mock_client_class.assert_called_once()
@@ -79,7 +79,7 @@ class TestConvertCommand:
                 settings.DATALAB_API_KEY = None
                 assert result.exit_code == 0
-                assert "✅ Successfully processed" in result.output
+                assert "Successfully processed" in result.output
             finally:
                 os.unlink(tmp_file.name)
@@ -131,7 +131,7 @@ class TestOCRCommand:
                 )
                 assert result.exit_code == 0
-                assert "✅ Successfully processed: 2 files" in result.output
+                assert "Successfully processed: 2 files" in result.output
                 # Verify client was called correctly
                 mock_client_class.assert_called_once()
@@ -163,7 +163,7 @@ class TestOCRCommand:
                 )
                 assert result.exit_code == 0
-                assert "✅ Successfully processed: 2 files" in result.output
+                assert "Successfully processed: 2 files" in result.output
             finally:
                 os.unlink(tmp_file.name)
@@ -193,5 +193,5 @@ class TestOCRCommand:
             )
             assert result.exit_code == 0
-            assert "📊 OCR Summary:" in result.output
-            assert "✅ Successfully processed: 2 files" in result.output
+            assert "OCR Summary:" in result.output
+            assert "Successfully processed: 2 files" in result.output

{datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/tests/test_workflows.py RENAMED Viewed

@@ -76,7 +76,7 @@ class TestWorkflowMethods:
                 assert workflow.name == "Test Workflow"
                 assert workflow.team_id == 12
                 assert len(workflow.steps) == 2
-                assert workflow.steps[0].step_key == "marker_parse"
+                assert workflow.steps[0].unique_name == "marker_parse"
                 assert workflow.steps[1].depends_on == ["marker_parse"]
     @pytest.mark.asyncio

{datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/uv.lock RENAMED Viewed

@@ -169,7 +169,7 @@ wheels = [
 [[package]]
 name = "datalab-python-sdk"
-version = "0.1.10"
+version = "0.1.11"
 source = { editable = "." }
 dependencies = [
     { name = "aiohttp" },