datalab-python-sdk 0.1.10__tar.gz → 0.1.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/PKG-INFO +1 -1
  2. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/datalab_sdk/cli.py +70 -28
  3. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/datalab_sdk/client.py +18 -4
  4. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/pyproject.toml +1 -1
  5. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/recipes/workflows/end_to_end_workflow.py +4 -35
  6. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/recipes/workflows/workflow_definitions/README.md +39 -9
  7. datalab_python_sdk-0.1.11/recipes/workflows/workflow_definitions/parse_segment.json +22 -0
  8. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/recipes/workflows/workflow_definitions/slack_alert.json +2 -2
  9. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/tests/test_cli_simple.py +6 -6
  10. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/tests/test_workflows.py +1 -1
  11. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/uv.lock +1 -1
  12. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/.github/workflows/ci.yml +0 -0
  13. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/.github/workflows/publish.yml +0 -0
  14. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/.gitignore +0 -0
  15. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/.pre-commit-config.yaml +0 -0
  16. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/.python-version +0 -0
  17. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/LICENSE +0 -0
  18. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/README.md +0 -0
  19. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/data/08-Lambda-Calculus.pptx +0 -0
  20. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/data/adversarial.pdf +0 -0
  21. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/data/bid_evaluation.docx +0 -0
  22. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/data/book_review.ppt +0 -0
  23. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/data/book_store.xls +0 -0
  24. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/data/chi_hind.png +0 -0
  25. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/data/how_to_read.doc +0 -0
  26. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/data/normandy.epub +0 -0
  27. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/data/sample-1-sheet.xlsx +0 -0
  28. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/data/thinkpython.pdf +0 -0
  29. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/data/vibe.html +0 -0
  30. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/datalab_sdk/__init__.py +0 -0
  31. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/datalab_sdk/exceptions.py +0 -0
  32. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/datalab_sdk/mimetypes.py +0 -0
  33. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/datalab_sdk/models.py +0 -0
  34. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/datalab_sdk/settings.py +0 -0
  35. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/integration/README.md +0 -0
  36. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/integration/__init__.py +0 -0
  37. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/integration/test_live_api.py +0 -0
  38. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/integration/test_readme_examples.py +0 -0
  39. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/poetry.lock +0 -0
  40. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/pytest.ini +0 -0
  41. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/recipes/workflows/README.md +0 -0
  42. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/recipes/workflows/workflow_api_tutorial/1_get_step_types.py +0 -0
  43. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/recipes/workflows/workflow_api_tutorial/2_get_workflows.py +0 -0
  44. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/recipes/workflows/workflow_api_tutorial/3_create_workflow.py +0 -0
  45. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/recipes/workflows/workflow_api_tutorial/4_execute_workflow.py +0 -0
  46. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/recipes/workflows/workflow_api_tutorial/5_poll_workflow.py +0 -0
  47. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/recipes/workflows/workflow_api_tutorial/README.md +0 -0
  48. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/recipes/workflows/workflow_definitions/eval_segmentation.json +0 -0
  49. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/recipes/workflows/workflow_definitions/segment_parallel_extract.json +0 -0
  50. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/tests/__init__.py +0 -0
  51. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/tests/conftest.py +0 -0
  52. {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/tests/test_client_methods.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datalab-python-sdk
3
- Version: 0.1.10
3
+ Version: 0.1.11
4
4
  Summary: SDK for the Datalab document intelligence API
5
5
  Author-email: Datalab Team <hi@datalab.to>
6
6
  License-Expression: MIT
@@ -197,7 +197,7 @@ def get_files_to_process(
197
197
  if path.is_file():
198
198
  # Single file processing
199
199
  if file_extensions and path.suffix.lower() not in file_extensions:
200
- click.echo(f"Skipping {path}: unsupported file type", err=True)
200
+ click.echo(f"Skipping {path}: unsupported file type", err=True)
201
201
  sys.exit(1)
202
202
  return [path]
203
203
  else:
@@ -210,10 +210,10 @@ def show_results(results: List[dict], operation: str, output_dir: Path):
210
210
  successful = sum(1 for r in results if r["success"])
211
211
  failed = len(results) - successful
212
212
 
213
- click.echo(f"\n📊 {operation} Summary:")
214
- click.echo(f" Successfully processed: {successful} files")
213
+ click.echo(f"\n{operation} Summary:")
214
+ click.echo(f" Successfully processed: {successful} files")
215
215
  if failed > 0:
216
- click.echo(f" Failed: {failed} files")
216
+ click.echo(f" Failed: {failed} files")
217
217
 
218
218
  # Show failed files
219
219
  click.echo("\n Failed files:")
@@ -221,7 +221,7 @@ def show_results(results: List[dict], operation: str, output_dir: Path):
221
221
  if not result["success"]:
222
222
  click.echo(f" - {result['file_path']}: {result['error']}")
223
223
 
224
- click.echo(f"\n📁 Output saved to: {output_dir}")
224
+ click.echo(f"\nOutput saved to: {output_dir}")
225
225
 
226
226
 
227
227
  def process_documents(
@@ -270,10 +270,10 @@ def process_documents(
270
270
  to_process = get_files_to_process(path, file_extensions)
271
271
 
272
272
  if not to_process:
273
- click.echo(f"No supported files found in {path}", err=True)
273
+ click.echo(f"No supported files found in {path}", err=True)
274
274
  sys.exit(1)
275
275
 
276
- click.echo(f"📂 Found {len(to_process)} files to process")
276
+ click.echo(f"Found {len(to_process)} files to process")
277
277
 
278
278
  # Create processing options based on method
279
279
  if method == "convert":
@@ -317,7 +317,7 @@ def process_documents(
317
317
  show_results(results, operation, output_dir)
318
318
 
319
319
  except DatalabError as e:
320
- click.echo(f"Error: {e}", err=True)
320
+ click.echo(f"Error: {e}", err=True)
321
321
  sys.exit(1)
322
322
 
323
323
 
@@ -465,17 +465,17 @@ def create_workflow(
465
465
  name=name, team_id=team_id, steps=workflow_steps
466
466
  )
467
467
 
468
- click.echo(f"Workflow created successfully!")
468
+ click.echo(f"Workflow created successfully!")
469
469
  click.echo(f" ID: {workflow.id}")
470
470
  click.echo(f" Name: {workflow.name}")
471
471
  click.echo(f" Team ID: {workflow.team_id}")
472
472
  click.echo(f" Steps: {len(workflow.steps)}")
473
473
 
474
474
  except DatalabError as e:
475
- click.echo(f"Error: {e}", err=True)
475
+ click.echo(f"Error: {e}", err=True)
476
476
  sys.exit(1)
477
477
  except Exception as e:
478
- click.echo(f"Error: {e}", err=True)
478
+ click.echo(f"Error: {e}", err=True)
479
479
  sys.exit(1)
480
480
 
481
481
 
@@ -497,7 +497,7 @@ def get_workflow(workflow_id: int, api_key: Optional[str], base_url: str):
497
497
  client = DatalabClient(api_key=api_key, base_url=base_url)
498
498
  workflow = client.get_workflow(workflow_id)
499
499
 
500
- click.echo(f"📋 Workflow Details:")
500
+ click.echo(f"Workflow Details:")
501
501
  click.echo(f" ID: {workflow.id}")
502
502
  click.echo(f" Name: {workflow.name}")
503
503
  click.echo(f" Team ID: {workflow.team_id}")
@@ -512,7 +512,48 @@ def get_workflow(workflow_id: int, api_key: Optional[str], base_url: str):
512
512
  click.echo(f" Depends on: {', '.join(step.depends_on)}")
513
513
 
514
514
  except DatalabError as e:
515
- click.echo(f"Error: {e}", err=True)
515
+ click.echo(f"Error: {e}", err=True)
516
+ sys.exit(1)
517
+
518
+
519
+ @click.command()
520
+ @click.option("--api_key", required=False, help="Datalab API key")
521
+ @click.option("--base_url", default=settings.DATALAB_HOST, help="API base URL")
522
+ def get_step_types(api_key: Optional[str], base_url: str):
523
+ """Get all available workflow step types"""
524
+ try:
525
+ if api_key is None:
526
+ api_key = settings.DATALAB_API_KEY
527
+
528
+ if api_key is None:
529
+ raise DatalabError(
530
+ "You must either pass in an api key via --api_key or set the DATALAB_API_KEY env variable."
531
+ )
532
+
533
+ client = DatalabClient(api_key=api_key, base_url=base_url)
534
+ response = client.get_step_types()
535
+
536
+ step_types = response.get("step_types", [])
537
+ if not step_types:
538
+ click.echo("No step types found.")
539
+ return
540
+
541
+ click.echo(f"Found {len(step_types)} step type(s):\n")
542
+ for step_type in step_types:
543
+ click.echo("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
544
+ click.echo(f"Key: {step_type.get('type')}")
545
+ click.echo(f"Version: {step_type.get('version')}")
546
+ click.echo(f"Name: {step_type.get('name')}")
547
+ if step_type.get("description"):
548
+ click.echo(f"Description: {step_type['description']}")
549
+
550
+ if step_type.get("settings_schema"):
551
+ click.echo("\nSettings Schema:")
552
+ click.echo(json.dumps(step_type["settings_schema"], indent=2))
553
+ click.echo("")
554
+
555
+ except DatalabError as e:
556
+ click.echo(f"Error: {e}", err=True)
516
557
  sys.exit(1)
517
558
 
518
559
 
@@ -537,7 +578,7 @@ def list_workflows(api_key: Optional[str], base_url: str):
537
578
  click.echo("No workflows found.")
538
579
  return
539
580
 
540
- click.echo(f"📋 Found {len(workflows)} workflow(s):\n")
581
+ click.echo(f"Found {len(workflows)} workflow(s):\n")
541
582
  for workflow in workflows:
542
583
  click.echo(f" ID: {workflow.id}")
543
584
  click.echo(f" Name: {workflow.name}")
@@ -547,7 +588,7 @@ def list_workflows(api_key: Optional[str], base_url: str):
547
588
  click.echo("")
548
589
 
549
590
  except DatalabError as e:
550
- click.echo(f"Error: {e}", err=True)
591
+ click.echo(f"Error: {e}", err=True)
551
592
  sys.exit(1)
552
593
 
553
594
 
@@ -595,25 +636,25 @@ def execute_workflow(
595
636
 
596
637
  client = DatalabClient(api_key=api_key, base_url=base_url)
597
638
 
598
- click.echo(f"🚀 Triggering workflow execution for workflow {workflow_id}...")
639
+ click.echo(f"Triggering workflow execution for workflow {workflow_id}...")
599
640
  execution = client.execute_workflow(
600
641
  workflow_id=workflow_id,
601
642
  input_config=input_cfg,
602
643
  )
603
644
 
604
- click.echo(f"\n✅ Successfully triggered workflow execution!")
645
+ click.echo(f"\nSuccessfully triggered workflow execution!")
605
646
  click.echo(f" Execution ID: {execution.id}")
606
647
  click.echo(f" Status: {execution.status}")
607
- click.echo(f"\n💡 To check the status, run:")
648
+ click.echo(f"\nTo check the status, run:")
608
649
  click.echo(f" datalab get-execution-status --execution_id {execution.id}")
609
650
  click.echo(f"\n Or poll until complete:")
610
651
  click.echo(f" datalab get-execution-status --execution_id {execution.id} --max_polls 300 --poll_interval 2")
611
652
 
612
653
  except DatalabError as e:
613
- click.echo(f"Error: {e}", err=True)
654
+ click.echo(f"Error: {e}", err=True)
614
655
  sys.exit(1)
615
656
  except Exception as e:
616
- click.echo(f"Error: {e}", err=True)
657
+ click.echo(f"Error: {e}", err=True)
617
658
  sys.exit(1)
618
659
 
619
660
 
@@ -663,7 +704,7 @@ def get_execution_status(
663
704
  download_results=download,
664
705
  )
665
706
 
666
- click.echo(f"📊 Execution Status:")
707
+ click.echo(f"Execution Status:")
667
708
  click.echo(f" Execution ID: {execution.id}")
668
709
  click.echo(f" Workflow ID: {execution.workflow_id}")
669
710
  click.echo(f" Status: {execution.status}")
@@ -683,7 +724,7 @@ def get_execution_status(
683
724
  if "output_url" in step_data and not download:
684
725
  click.echo(f" Status: {step_data.get('status', 'N/A')}")
685
726
  click.echo(f" Output URL: {step_data.get('output_url')}")
686
- click.echo(f" 💡 Use --download to fetch actual results")
727
+ click.echo(f" Use --download to fetch actual results")
687
728
  else:
688
729
  click.echo(f" {json.dumps(step_data, indent=8)}")
689
730
  else:
@@ -694,10 +735,10 @@ def get_execution_status(
694
735
  output_path = Path(output)
695
736
  output_path.parent.mkdir(parents=True, exist_ok=True)
696
737
  execution.save_output(output_path)
697
- click.echo(f"\n📁 Results saved to: {output_path}")
738
+ click.echo(f"\nResults saved to: {output_path}")
698
739
 
699
740
  except DatalabError as e:
700
- click.echo(f"Error: {e}", err=True)
741
+ click.echo(f"Error: {e}", err=True)
701
742
  sys.exit(1)
702
743
 
703
744
 
@@ -722,7 +763,7 @@ def visualize_workflow(definition: str):
722
763
  steps = workflow_def.get("steps", [])
723
764
 
724
765
  if not steps:
725
- click.echo("⚠️ No steps found in workflow definition")
766
+ click.echo("No steps found in workflow definition")
726
767
  return
727
768
 
728
769
  # Build dependency graph
@@ -761,13 +802,13 @@ def visualize_workflow(definition: str):
761
802
  click.echo(f"\nTotal steps: {len(steps)}")
762
803
 
763
804
  except json.JSONDecodeError as e:
764
- click.echo(f"Invalid JSON: {e}", err=True)
805
+ click.echo(f"Invalid JSON: {e}", err=True)
765
806
  sys.exit(1)
766
807
  except KeyError as e:
767
- click.echo(f"Missing required field in workflow definition: {e}", err=True)
808
+ click.echo(f"Missing required field in workflow definition: {e}", err=True)
768
809
  sys.exit(1)
769
810
  except Exception as e:
770
- click.echo(f"Error: {e}", err=True)
811
+ click.echo(f"Error: {e}", err=True)
771
812
  sys.exit(1)
772
813
 
773
814
 
@@ -814,6 +855,7 @@ cli.add_command(convert)
814
855
  cli.add_command(ocr)
815
856
  cli.add_command(create_workflow)
816
857
  cli.add_command(get_workflow)
858
+ cli.add_command(get_step_types)
817
859
  cli.add_command(list_workflows)
818
860
  cli.add_command(execute_workflow)
819
861
  cli.add_command(get_execution_status)
@@ -345,13 +345,10 @@ class AsyncDatalabClient:
345
345
  # Parse response into Workflow object
346
346
  workflow_steps = [
347
347
  WorkflowStep(
348
- step_key=step["step_key"],
349
348
  unique_name=step["unique_name"],
350
349
  settings=step["settings"],
351
350
  depends_on=step.get("depends_on", []),
352
- id=step.get("id"),
353
- version=step.get("version"),
354
- name=step.get("name"),
351
+ id=step.get("id")
355
352
  )
356
353
  for step in response.get("steps", [])
357
354
  ]
@@ -402,6 +399,19 @@ class AsyncDatalabClient:
402
399
  updated=response.get("updated"),
403
400
  )
404
401
 
402
+ async def get_step_types(self) -> dict:
403
+ """
404
+ Get all available workflow step types
405
+
406
+ Returns:
407
+ Dictionary containing step_types list with their schemas
408
+ """
409
+ response = await self._make_request(
410
+ "GET",
411
+ "/api/v1/workflows/step-types",
412
+ )
413
+ return response
414
+
405
415
  async def list_workflows(self) -> list[Workflow]:
406
416
  """
407
417
  List all workflows for the authenticated user's team
@@ -694,6 +704,10 @@ class DatalabClient:
694
704
  """Get a workflow by ID (sync version)"""
695
705
  return self._run_async(self._async_client.get_workflow(workflow_id))
696
706
 
707
+ def get_step_types(self) -> dict:
708
+ """Get all available workflow step types (sync version)"""
709
+ return self._run_async(self._async_client.get_step_types())
710
+
697
711
  def list_workflows(self) -> list[Workflow]:
698
712
  """List all workflows (sync version)"""
699
713
  return self._run_async(self._async_client.list_workflows())
@@ -7,7 +7,7 @@ readme = "README.md"
7
7
  license = "MIT"
8
8
  repository = "https://github.com/datalab-to/sdk"
9
9
  keywords = ["datalab", "sdk", "document-intelligence", "api"]
10
- version = "0.1.10"
10
+ version = "0.1.11"
11
11
  description = "SDK for the Datalab document intelligence API"
12
12
  requires-python = ">=3.10"
13
13
  dependencies = [
@@ -42,33 +42,6 @@ def load_workflow_definition(definition_path: str, replacements: dict = None) ->
42
42
  return workflow_def
43
43
 
44
44
 
45
- def create_simple_workflow_definition() -> dict:
46
- """Create a simple default workflow definition"""
47
- return {
48
- "name": "Simple Parse and Extract",
49
- "steps": [
50
- {
51
- "step_key": "marker_parse",
52
- "unique_name": "parse_document",
53
- "settings": {"max_pages": 10, "output_format": "json"},
54
- "depends_on": []
55
- },
56
- {
57
- "step_key": "marker_extract",
58
- "unique_name": "extract_metadata",
59
- "settings": {
60
- "page_schema": {
61
- "title": {"type": "string"},
62
- "author": {"type": "string"},
63
- "summary": {"type": "string"}
64
- }
65
- },
66
- "depends_on": ["parse_document"]
67
- }
68
- ]
69
- }
70
-
71
-
72
45
  def main():
73
46
  parser = argparse.ArgumentParser(
74
47
  description="Run any workflow end-to-end from definition to results"
@@ -126,14 +99,10 @@ def main():
126
99
  # Step 1: Load or create workflow definition
127
100
  print("📄 Loading workflow definition...")
128
101
 
129
- if args.definition:
130
- # Build replacements dict
131
- replacements = dict(args.replace) if args.replace else None
132
- workflow_def = load_workflow_definition(args.definition, replacements)
133
- print(f" Source: {args.definition}")
134
- else:
135
- workflow_def = create_simple_workflow_definition()
136
- print(f" Source: Built-in simple workflow")
102
+ # Build replacements dict
103
+ replacements = dict(args.replace) if args.replace else None
104
+ workflow_def = load_workflow_definition(args.definition, replacements)
105
+ print(f" Source: {args.definition}")
137
106
 
138
107
  print(f" Name: {workflow_def['name']}")
139
108
  print(f" Steps: {len(workflow_def['steps'])}\n")
@@ -4,6 +4,7 @@ This directory contains JSON workflow definitions that can be loaded and execute
4
4
 
5
5
  ## Available Workflows
6
6
 
7
+ - [Parse and Segment (Simple)](#parse-and-segment-simple) - Do a straightforward parse -> segment to run one or more files through
7
8
  - [Eval Segmentation Across Providers](#eval-segmentation-across-providers) - Compare Marker vs Reducto segmentation in parallel
8
9
  - [Parallel Extract Large SEC Filings](#parallel-extract-large-sec-filings) - Parse → Segment → Extract from multiple sections in parallel
9
10
  - [Slack Alert Workflow](#slack-alert-workflow) - Full pipeline with parallel extraction, aggregation, and Slack notification
@@ -33,6 +34,35 @@ For a full list of `settings` to use for `marker` related steps, visit our [API
33
34
 
34
35
  ## Available Workflow Definitions
35
36
 
37
+ ### Parse and Segment (Simple)
38
+
39
+ **What it does:**
40
+ Simple workflow that does `marker_parse` -> `marker_segment`. You can pass in one or more `file_urls` when triggering your workflow.
41
+
42
+ Once you get results, you can process them to run your own custom evaluations.
43
+
44
+ **Structure:**
45
+ - **Marker branch**: Parse → Segment
46
+
47
+ **Visualize:**
48
+ ```bash
49
+ datalab visualize-workflow --definition recipes/workflows/workflow_definitions/parse_segment.json
50
+ ```
51
+
52
+ **Execute:**
53
+ ```bash
54
+ # Using end-to-end runner
55
+ python recipes/workflows/end_to_end_workflow.py \
56
+ --definition recipes/workflows/workflow_definitions/parse_segment.json \
57
+ --file_url https://www.novonordisk.com/content/dam/nncorp/global/en/investors/irmaterial/annual_report/2024/novo-nordisk-form-20-f-2023.pdf
58
+
59
+ # Or step-by-step
60
+ python recipes/workflows/workflow_api_tutorial/create_workflow.py \
61
+ --definition recipes/workflows/workflow_definitions/parse_segment.json
62
+ ```
63
+
64
+ ---
65
+
36
66
  ### Eval Segmentation Across Providers
37
67
 
38
68
  **What it does:**
@@ -46,21 +76,21 @@ Once you get results, you can process them to run your own custom evaluations.
46
76
 
47
77
  **Visualize:**
48
78
  ```bash
49
- datalab visualize-workflow --definition workflow_definitions/eval_segmentation.json
79
+ datalab visualize-workflow --definition recipes/workflows/workflow_definitions/eval_segmentation.json
50
80
  ```
51
81
 
52
82
  **Execute:**
53
83
  ```bash
54
84
  # Using end-to-end runner
55
85
  python recipes/workflows/end_to_end_workflow.py \
56
- --definition workflow_definitions/eval_segmentation.json \
86
+ --definition recipes/workflows/workflow_definitions/eval_segmentation.json \
57
87
  --file_url https://example.com/doc.pdf \
58
88
  --replace YOUR_REDUCTO_API_KEY your_key_here \
59
89
  --save results.json
60
90
 
61
91
  # Or step-by-step
62
92
  python recipes/workflows/workflow_api_tutorial/create_workflow.py \
63
- --definition workflow_definitions/eval_segmentation.json \
93
+ --definition recipes/workflows/workflow_definitions/eval_segmentation.json \
64
94
  --replace YOUR_REDUCTO_API_KEY your_key_here
65
95
  ```
66
96
 
@@ -86,19 +116,19 @@ Without this, you might have a long, dense schema that applies on the entire doc
86
116
 
87
117
  **Visualize:**
88
118
  ```bash
89
- datalab visualize-workflow --definition workflow_definitions/segment_parallel_extract.json
119
+ datalab visualize-workflow --definition recipes/workflows/workflow_definitions/segment_parallel_extract.json
90
120
  ```
91
121
 
92
122
  **Execute:**
93
123
  ```bash
94
124
  # Using end-to-end runner
95
125
  python recipes/workflows/end_to_end_workflow.py \
96
- --definition workflow_definitions/segment_parallel_extraction.json \
126
+ --definition recipes/workflows/workflow_definitions/segment_parallel_extract.json \
97
127
  --file_url https://www.novonordisk.com/content/dam/nncorp/global/en/investors/irmaterial/annual_report/2024/novo-nordisk-form-20-f-2023.pdf
98
128
 
99
129
  # Or step-by-step
100
130
  python recipes/workflows/workflow_api_tutorial/create_workflow.py \
101
- --definition workflow_definitions/segment_parallel_extraction.json
131
+ --definition recipes/workflows/workflow_definitions/segment_parallel_extract.json
102
132
  ```
103
133
 
104
134
  ---
@@ -119,14 +149,14 @@ Complete pipeline that parses documents, segments into sections, extracts struct
119
149
 
120
150
  **Visualize:**
121
151
  ```bash
122
- datalab visualize-workflow --definition workflow_definitions/slack_alert.json
152
+ datalab visualize-workflow --definition recipes/workflows/workflow_definitions/slack_alert.json
123
153
  ```
124
154
 
125
155
  **Execute:**
126
156
  ```bash
127
157
  # Using end-to-end runner with multiple files
128
158
  python recipes/workflows/end_to_end_workflow.py \
129
- --definition workflow_definitions/slack_alert.json \
159
+ --definition recipes/workflows/workflow_definitions/slack_alert.json \
130
160
  --file_url https://www.novonordisk.com/content/dam/nncorp/global/en/investors/irmaterial/annual_report/2024/novo-nordisk-form-20-f-2023.pdf \
131
161
  --replace YOUR_SLACK_BOT_TOKEN xoxb-your-token \
132
162
  --replace YOUR_SLACK_CHANNEL_ID <YOUR_CHANNEL_ID> \
@@ -134,7 +164,7 @@ python recipes/workflows/end_to_end_workflow.py \
134
164
 
135
165
  # Or step-by-step
136
166
  python recipes/workflows/workflow_api_tutorial/create_workflow.py \
137
- --definition workflow_definitions/slack_alert.json \
167
+ --definition recipes/workflows/workflow_definitions/slack_alert.json \
138
168
  --replace YOUR_SLACK_BOT_TOKEN xoxb-your-token \
139
169
  --replace YOUR_SLACK_CHANNEL_ID <YOUR_CHANNEL_ID>
140
170
  ```
@@ -0,0 +1,22 @@
1
+ {
2
+ "name": "Parse and Segment",
3
+ "description": "Simple Parse and Segment Workflow",
4
+ "steps": [
5
+ {
6
+ "step_key": "marker_parse",
7
+ "unique_name": "marker_parse",
8
+ "settings": {},
9
+ "depends_on": []
10
+ },
11
+ {
12
+ "step_key": "marker_segment",
13
+ "unique_name": "marker_segment",
14
+ "settings": {
15
+ "segmentation_schema": {
16
+ "segmentation_strategy": "document_boundary"
17
+ }
18
+ },
19
+ "depends_on": ["marker_parse"]
20
+ }
21
+ ]
22
+ }
@@ -1,6 +1,6 @@
1
1
  {
2
- "name": "Parse, Segment, Extract, Aggregate, then Post to Slack",
3
- "description": "Full pipeline: Parse documents, segment into sections, extract data from multiple segments in parallel, aggregate results, and post to Slack",
2
+ "name": "Parse then Post to Slack",
3
+ "description": "Parse documents and post to Slack",
4
4
  "steps": [
5
5
  {
6
6
  "step_key": "marker_parse",
@@ -54,7 +54,7 @@ class TestConvertCommand:
54
54
  )
55
55
 
56
56
  assert result.exit_code == 0
57
- assert "Successfully processed" in result.output
57
+ assert "Successfully processed" in result.output
58
58
 
59
59
  # Verify client was called correctly
60
60
  mock_client_class.assert_called_once()
@@ -79,7 +79,7 @@ class TestConvertCommand:
79
79
  settings.DATALAB_API_KEY = None
80
80
 
81
81
  assert result.exit_code == 0
82
- assert "Successfully processed" in result.output
82
+ assert "Successfully processed" in result.output
83
83
 
84
84
  finally:
85
85
  os.unlink(tmp_file.name)
@@ -131,7 +131,7 @@ class TestOCRCommand:
131
131
  )
132
132
 
133
133
  assert result.exit_code == 0
134
- assert "Successfully processed: 2 files" in result.output
134
+ assert "Successfully processed: 2 files" in result.output
135
135
 
136
136
  # Verify client was called correctly
137
137
  mock_client_class.assert_called_once()
@@ -163,7 +163,7 @@ class TestOCRCommand:
163
163
  )
164
164
 
165
165
  assert result.exit_code == 0
166
- assert "Successfully processed: 2 files" in result.output
166
+ assert "Successfully processed: 2 files" in result.output
167
167
 
168
168
  finally:
169
169
  os.unlink(tmp_file.name)
@@ -193,5 +193,5 @@ class TestOCRCommand:
193
193
  )
194
194
 
195
195
  assert result.exit_code == 0
196
- assert "📊 OCR Summary:" in result.output
197
- assert "Successfully processed: 2 files" in result.output
196
+ assert "OCR Summary:" in result.output
197
+ assert "Successfully processed: 2 files" in result.output
@@ -76,7 +76,7 @@ class TestWorkflowMethods:
76
76
  assert workflow.name == "Test Workflow"
77
77
  assert workflow.team_id == 12
78
78
  assert len(workflow.steps) == 2
79
- assert workflow.steps[0].step_key == "marker_parse"
79
+ assert workflow.steps[0].unique_name == "marker_parse"
80
80
  assert workflow.steps[1].depends_on == ["marker_parse"]
81
81
 
82
82
  @pytest.mark.asyncio
@@ -169,7 +169,7 @@ wheels = [
169
169
 
170
170
  [[package]]
171
171
  name = "datalab-python-sdk"
172
- version = "0.1.10"
172
+ version = "0.1.11"
173
173
  source = { editable = "." }
174
174
  dependencies = [
175
175
  { name = "aiohttp" },