datalab-python-sdk 0.1.10__tar.gz → 0.1.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/PKG-INFO +1 -1
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/datalab_sdk/cli.py +70 -28
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/datalab_sdk/client.py +18 -4
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/pyproject.toml +1 -1
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/recipes/workflows/end_to_end_workflow.py +4 -35
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/recipes/workflows/workflow_definitions/README.md +39 -9
- datalab_python_sdk-0.1.11/recipes/workflows/workflow_definitions/parse_segment.json +22 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/recipes/workflows/workflow_definitions/slack_alert.json +2 -2
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/tests/test_cli_simple.py +6 -6
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/tests/test_workflows.py +1 -1
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/uv.lock +1 -1
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/.github/workflows/ci.yml +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/.github/workflows/publish.yml +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/.gitignore +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/.pre-commit-config.yaml +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/.python-version +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/LICENSE +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/README.md +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/data/08-Lambda-Calculus.pptx +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/data/adversarial.pdf +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/data/bid_evaluation.docx +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/data/book_review.ppt +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/data/book_store.xls +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/data/chi_hind.png +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/data/how_to_read.doc +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/data/normandy.epub +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/data/sample-1-sheet.xlsx +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/data/thinkpython.pdf +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/data/vibe.html +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/datalab_sdk/__init__.py +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/datalab_sdk/exceptions.py +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/datalab_sdk/mimetypes.py +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/datalab_sdk/models.py +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/datalab_sdk/settings.py +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/integration/README.md +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/integration/__init__.py +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/integration/test_live_api.py +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/integration/test_readme_examples.py +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/poetry.lock +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/pytest.ini +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/recipes/workflows/README.md +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/recipes/workflows/workflow_api_tutorial/1_get_step_types.py +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/recipes/workflows/workflow_api_tutorial/2_get_workflows.py +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/recipes/workflows/workflow_api_tutorial/3_create_workflow.py +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/recipes/workflows/workflow_api_tutorial/4_execute_workflow.py +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/recipes/workflows/workflow_api_tutorial/5_poll_workflow.py +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/recipes/workflows/workflow_api_tutorial/README.md +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/recipes/workflows/workflow_definitions/eval_segmentation.json +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/recipes/workflows/workflow_definitions/segment_parallel_extract.json +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/tests/__init__.py +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/tests/conftest.py +0 -0
- {datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/tests/test_client_methods.py +0 -0
|
@@ -197,7 +197,7 @@ def get_files_to_process(
|
|
|
197
197
|
if path.is_file():
|
|
198
198
|
# Single file processing
|
|
199
199
|
if file_extensions and path.suffix.lower() not in file_extensions:
|
|
200
|
-
click.echo(f"
|
|
200
|
+
click.echo(f"Skipping {path}: unsupported file type", err=True)
|
|
201
201
|
sys.exit(1)
|
|
202
202
|
return [path]
|
|
203
203
|
else:
|
|
@@ -210,10 +210,10 @@ def show_results(results: List[dict], operation: str, output_dir: Path):
|
|
|
210
210
|
successful = sum(1 for r in results if r["success"])
|
|
211
211
|
failed = len(results) - successful
|
|
212
212
|
|
|
213
|
-
click.echo(f"\n
|
|
214
|
-
click.echo(f"
|
|
213
|
+
click.echo(f"\n{operation} Summary:")
|
|
214
|
+
click.echo(f" Successfully processed: {successful} files")
|
|
215
215
|
if failed > 0:
|
|
216
|
-
click.echo(f"
|
|
216
|
+
click.echo(f" Failed: {failed} files")
|
|
217
217
|
|
|
218
218
|
# Show failed files
|
|
219
219
|
click.echo("\n Failed files:")
|
|
@@ -221,7 +221,7 @@ def show_results(results: List[dict], operation: str, output_dir: Path):
|
|
|
221
221
|
if not result["success"]:
|
|
222
222
|
click.echo(f" - {result['file_path']}: {result['error']}")
|
|
223
223
|
|
|
224
|
-
click.echo(f"\
|
|
224
|
+
click.echo(f"\nOutput saved to: {output_dir}")
|
|
225
225
|
|
|
226
226
|
|
|
227
227
|
def process_documents(
|
|
@@ -270,10 +270,10 @@ def process_documents(
|
|
|
270
270
|
to_process = get_files_to_process(path, file_extensions)
|
|
271
271
|
|
|
272
272
|
if not to_process:
|
|
273
|
-
click.echo(f"
|
|
273
|
+
click.echo(f"No supported files found in {path}", err=True)
|
|
274
274
|
sys.exit(1)
|
|
275
275
|
|
|
276
|
-
click.echo(f"
|
|
276
|
+
click.echo(f"Found {len(to_process)} files to process")
|
|
277
277
|
|
|
278
278
|
# Create processing options based on method
|
|
279
279
|
if method == "convert":
|
|
@@ -317,7 +317,7 @@ def process_documents(
|
|
|
317
317
|
show_results(results, operation, output_dir)
|
|
318
318
|
|
|
319
319
|
except DatalabError as e:
|
|
320
|
-
click.echo(f"
|
|
320
|
+
click.echo(f"Error: {e}", err=True)
|
|
321
321
|
sys.exit(1)
|
|
322
322
|
|
|
323
323
|
|
|
@@ -465,17 +465,17 @@ def create_workflow(
|
|
|
465
465
|
name=name, team_id=team_id, steps=workflow_steps
|
|
466
466
|
)
|
|
467
467
|
|
|
468
|
-
click.echo(f"
|
|
468
|
+
click.echo(f"Workflow created successfully!")
|
|
469
469
|
click.echo(f" ID: {workflow.id}")
|
|
470
470
|
click.echo(f" Name: {workflow.name}")
|
|
471
471
|
click.echo(f" Team ID: {workflow.team_id}")
|
|
472
472
|
click.echo(f" Steps: {len(workflow.steps)}")
|
|
473
473
|
|
|
474
474
|
except DatalabError as e:
|
|
475
|
-
click.echo(f"
|
|
475
|
+
click.echo(f"Error: {e}", err=True)
|
|
476
476
|
sys.exit(1)
|
|
477
477
|
except Exception as e:
|
|
478
|
-
click.echo(f"
|
|
478
|
+
click.echo(f"Error: {e}", err=True)
|
|
479
479
|
sys.exit(1)
|
|
480
480
|
|
|
481
481
|
|
|
@@ -497,7 +497,7 @@ def get_workflow(workflow_id: int, api_key: Optional[str], base_url: str):
|
|
|
497
497
|
client = DatalabClient(api_key=api_key, base_url=base_url)
|
|
498
498
|
workflow = client.get_workflow(workflow_id)
|
|
499
499
|
|
|
500
|
-
click.echo(f"
|
|
500
|
+
click.echo(f"Workflow Details:")
|
|
501
501
|
click.echo(f" ID: {workflow.id}")
|
|
502
502
|
click.echo(f" Name: {workflow.name}")
|
|
503
503
|
click.echo(f" Team ID: {workflow.team_id}")
|
|
@@ -512,7 +512,48 @@ def get_workflow(workflow_id: int, api_key: Optional[str], base_url: str):
|
|
|
512
512
|
click.echo(f" Depends on: {', '.join(step.depends_on)}")
|
|
513
513
|
|
|
514
514
|
except DatalabError as e:
|
|
515
|
-
click.echo(f"
|
|
515
|
+
click.echo(f"Error: {e}", err=True)
|
|
516
|
+
sys.exit(1)
|
|
517
|
+
|
|
518
|
+
|
|
519
|
+
@click.command()
|
|
520
|
+
@click.option("--api_key", required=False, help="Datalab API key")
|
|
521
|
+
@click.option("--base_url", default=settings.DATALAB_HOST, help="API base URL")
|
|
522
|
+
def get_step_types(api_key: Optional[str], base_url: str):
|
|
523
|
+
"""Get all available workflow step types"""
|
|
524
|
+
try:
|
|
525
|
+
if api_key is None:
|
|
526
|
+
api_key = settings.DATALAB_API_KEY
|
|
527
|
+
|
|
528
|
+
if api_key is None:
|
|
529
|
+
raise DatalabError(
|
|
530
|
+
"You must either pass in an api key via --api_key or set the DATALAB_API_KEY env variable."
|
|
531
|
+
)
|
|
532
|
+
|
|
533
|
+
client = DatalabClient(api_key=api_key, base_url=base_url)
|
|
534
|
+
response = client.get_step_types()
|
|
535
|
+
|
|
536
|
+
step_types = response.get("step_types", [])
|
|
537
|
+
if not step_types:
|
|
538
|
+
click.echo("No step types found.")
|
|
539
|
+
return
|
|
540
|
+
|
|
541
|
+
click.echo(f"Found {len(step_types)} step type(s):\n")
|
|
542
|
+
for step_type in step_types:
|
|
543
|
+
click.echo("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
|
|
544
|
+
click.echo(f"Key: {step_type.get('type')}")
|
|
545
|
+
click.echo(f"Version: {step_type.get('version')}")
|
|
546
|
+
click.echo(f"Name: {step_type.get('name')}")
|
|
547
|
+
if step_type.get("description"):
|
|
548
|
+
click.echo(f"Description: {step_type['description']}")
|
|
549
|
+
|
|
550
|
+
if step_type.get("settings_schema"):
|
|
551
|
+
click.echo("\nSettings Schema:")
|
|
552
|
+
click.echo(json.dumps(step_type["settings_schema"], indent=2))
|
|
553
|
+
click.echo("")
|
|
554
|
+
|
|
555
|
+
except DatalabError as e:
|
|
556
|
+
click.echo(f"Error: {e}", err=True)
|
|
516
557
|
sys.exit(1)
|
|
517
558
|
|
|
518
559
|
|
|
@@ -537,7 +578,7 @@ def list_workflows(api_key: Optional[str], base_url: str):
|
|
|
537
578
|
click.echo("No workflows found.")
|
|
538
579
|
return
|
|
539
580
|
|
|
540
|
-
click.echo(f"
|
|
581
|
+
click.echo(f"Found {len(workflows)} workflow(s):\n")
|
|
541
582
|
for workflow in workflows:
|
|
542
583
|
click.echo(f" ID: {workflow.id}")
|
|
543
584
|
click.echo(f" Name: {workflow.name}")
|
|
@@ -547,7 +588,7 @@ def list_workflows(api_key: Optional[str], base_url: str):
|
|
|
547
588
|
click.echo("")
|
|
548
589
|
|
|
549
590
|
except DatalabError as e:
|
|
550
|
-
click.echo(f"
|
|
591
|
+
click.echo(f"Error: {e}", err=True)
|
|
551
592
|
sys.exit(1)
|
|
552
593
|
|
|
553
594
|
|
|
@@ -595,25 +636,25 @@ def execute_workflow(
|
|
|
595
636
|
|
|
596
637
|
client = DatalabClient(api_key=api_key, base_url=base_url)
|
|
597
638
|
|
|
598
|
-
click.echo(f"
|
|
639
|
+
click.echo(f"Triggering workflow execution for workflow {workflow_id}...")
|
|
599
640
|
execution = client.execute_workflow(
|
|
600
641
|
workflow_id=workflow_id,
|
|
601
642
|
input_config=input_cfg,
|
|
602
643
|
)
|
|
603
644
|
|
|
604
|
-
click.echo(f"\
|
|
645
|
+
click.echo(f"\nSuccessfully triggered workflow execution!")
|
|
605
646
|
click.echo(f" Execution ID: {execution.id}")
|
|
606
647
|
click.echo(f" Status: {execution.status}")
|
|
607
|
-
click.echo(f"\
|
|
648
|
+
click.echo(f"\nTo check the status, run:")
|
|
608
649
|
click.echo(f" datalab get-execution-status --execution_id {execution.id}")
|
|
609
650
|
click.echo(f"\n Or poll until complete:")
|
|
610
651
|
click.echo(f" datalab get-execution-status --execution_id {execution.id} --max_polls 300 --poll_interval 2")
|
|
611
652
|
|
|
612
653
|
except DatalabError as e:
|
|
613
|
-
click.echo(f"
|
|
654
|
+
click.echo(f"Error: {e}", err=True)
|
|
614
655
|
sys.exit(1)
|
|
615
656
|
except Exception as e:
|
|
616
|
-
click.echo(f"
|
|
657
|
+
click.echo(f"Error: {e}", err=True)
|
|
617
658
|
sys.exit(1)
|
|
618
659
|
|
|
619
660
|
|
|
@@ -663,7 +704,7 @@ def get_execution_status(
|
|
|
663
704
|
download_results=download,
|
|
664
705
|
)
|
|
665
706
|
|
|
666
|
-
click.echo(f"
|
|
707
|
+
click.echo(f"Execution Status:")
|
|
667
708
|
click.echo(f" Execution ID: {execution.id}")
|
|
668
709
|
click.echo(f" Workflow ID: {execution.workflow_id}")
|
|
669
710
|
click.echo(f" Status: {execution.status}")
|
|
@@ -683,7 +724,7 @@ def get_execution_status(
|
|
|
683
724
|
if "output_url" in step_data and not download:
|
|
684
725
|
click.echo(f" Status: {step_data.get('status', 'N/A')}")
|
|
685
726
|
click.echo(f" Output URL: {step_data.get('output_url')}")
|
|
686
|
-
click.echo(f"
|
|
727
|
+
click.echo(f" Use --download to fetch actual results")
|
|
687
728
|
else:
|
|
688
729
|
click.echo(f" {json.dumps(step_data, indent=8)}")
|
|
689
730
|
else:
|
|
@@ -694,10 +735,10 @@ def get_execution_status(
|
|
|
694
735
|
output_path = Path(output)
|
|
695
736
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
696
737
|
execution.save_output(output_path)
|
|
697
|
-
click.echo(f"\
|
|
738
|
+
click.echo(f"\nResults saved to: {output_path}")
|
|
698
739
|
|
|
699
740
|
except DatalabError as e:
|
|
700
|
-
click.echo(f"
|
|
741
|
+
click.echo(f"Error: {e}", err=True)
|
|
701
742
|
sys.exit(1)
|
|
702
743
|
|
|
703
744
|
|
|
@@ -722,7 +763,7 @@ def visualize_workflow(definition: str):
|
|
|
722
763
|
steps = workflow_def.get("steps", [])
|
|
723
764
|
|
|
724
765
|
if not steps:
|
|
725
|
-
click.echo("
|
|
766
|
+
click.echo("No steps found in workflow definition")
|
|
726
767
|
return
|
|
727
768
|
|
|
728
769
|
# Build dependency graph
|
|
@@ -761,13 +802,13 @@ def visualize_workflow(definition: str):
|
|
|
761
802
|
click.echo(f"\nTotal steps: {len(steps)}")
|
|
762
803
|
|
|
763
804
|
except json.JSONDecodeError as e:
|
|
764
|
-
click.echo(f"
|
|
805
|
+
click.echo(f"Invalid JSON: {e}", err=True)
|
|
765
806
|
sys.exit(1)
|
|
766
807
|
except KeyError as e:
|
|
767
|
-
click.echo(f"
|
|
808
|
+
click.echo(f"Missing required field in workflow definition: {e}", err=True)
|
|
768
809
|
sys.exit(1)
|
|
769
810
|
except Exception as e:
|
|
770
|
-
click.echo(f"
|
|
811
|
+
click.echo(f"Error: {e}", err=True)
|
|
771
812
|
sys.exit(1)
|
|
772
813
|
|
|
773
814
|
|
|
@@ -814,6 +855,7 @@ cli.add_command(convert)
|
|
|
814
855
|
cli.add_command(ocr)
|
|
815
856
|
cli.add_command(create_workflow)
|
|
816
857
|
cli.add_command(get_workflow)
|
|
858
|
+
cli.add_command(get_step_types)
|
|
817
859
|
cli.add_command(list_workflows)
|
|
818
860
|
cli.add_command(execute_workflow)
|
|
819
861
|
cli.add_command(get_execution_status)
|
|
@@ -345,13 +345,10 @@ class AsyncDatalabClient:
|
|
|
345
345
|
# Parse response into Workflow object
|
|
346
346
|
workflow_steps = [
|
|
347
347
|
WorkflowStep(
|
|
348
|
-
step_key=step["step_key"],
|
|
349
348
|
unique_name=step["unique_name"],
|
|
350
349
|
settings=step["settings"],
|
|
351
350
|
depends_on=step.get("depends_on", []),
|
|
352
|
-
id=step.get("id")
|
|
353
|
-
version=step.get("version"),
|
|
354
|
-
name=step.get("name"),
|
|
351
|
+
id=step.get("id")
|
|
355
352
|
)
|
|
356
353
|
for step in response.get("steps", [])
|
|
357
354
|
]
|
|
@@ -402,6 +399,19 @@ class AsyncDatalabClient:
|
|
|
402
399
|
updated=response.get("updated"),
|
|
403
400
|
)
|
|
404
401
|
|
|
402
|
+
async def get_step_types(self) -> dict:
|
|
403
|
+
"""
|
|
404
|
+
Get all available workflow step types
|
|
405
|
+
|
|
406
|
+
Returns:
|
|
407
|
+
Dictionary containing step_types list with their schemas
|
|
408
|
+
"""
|
|
409
|
+
response = await self._make_request(
|
|
410
|
+
"GET",
|
|
411
|
+
"/api/v1/workflows/step-types",
|
|
412
|
+
)
|
|
413
|
+
return response
|
|
414
|
+
|
|
405
415
|
async def list_workflows(self) -> list[Workflow]:
|
|
406
416
|
"""
|
|
407
417
|
List all workflows for the authenticated user's team
|
|
@@ -694,6 +704,10 @@ class DatalabClient:
|
|
|
694
704
|
"""Get a workflow by ID (sync version)"""
|
|
695
705
|
return self._run_async(self._async_client.get_workflow(workflow_id))
|
|
696
706
|
|
|
707
|
+
def get_step_types(self) -> dict:
|
|
708
|
+
"""Get all available workflow step types (sync version)"""
|
|
709
|
+
return self._run_async(self._async_client.get_step_types())
|
|
710
|
+
|
|
697
711
|
def list_workflows(self) -> list[Workflow]:
|
|
698
712
|
"""List all workflows (sync version)"""
|
|
699
713
|
return self._run_async(self._async_client.list_workflows())
|
|
@@ -7,7 +7,7 @@ readme = "README.md"
|
|
|
7
7
|
license = "MIT"
|
|
8
8
|
repository = "https://github.com/datalab-to/sdk"
|
|
9
9
|
keywords = ["datalab", "sdk", "document-intelligence", "api"]
|
|
10
|
-
version = "0.1.
|
|
10
|
+
version = "0.1.11"
|
|
11
11
|
description = "SDK for the Datalab document intelligence API"
|
|
12
12
|
requires-python = ">=3.10"
|
|
13
13
|
dependencies = [
|
{datalab_python_sdk-0.1.10 → datalab_python_sdk-0.1.11}/recipes/workflows/end_to_end_workflow.py
RENAMED
|
@@ -42,33 +42,6 @@ def load_workflow_definition(definition_path: str, replacements: dict = None) ->
|
|
|
42
42
|
return workflow_def
|
|
43
43
|
|
|
44
44
|
|
|
45
|
-
def create_simple_workflow_definition() -> dict:
|
|
46
|
-
"""Create a simple default workflow definition"""
|
|
47
|
-
return {
|
|
48
|
-
"name": "Simple Parse and Extract",
|
|
49
|
-
"steps": [
|
|
50
|
-
{
|
|
51
|
-
"step_key": "marker_parse",
|
|
52
|
-
"unique_name": "parse_document",
|
|
53
|
-
"settings": {"max_pages": 10, "output_format": "json"},
|
|
54
|
-
"depends_on": []
|
|
55
|
-
},
|
|
56
|
-
{
|
|
57
|
-
"step_key": "marker_extract",
|
|
58
|
-
"unique_name": "extract_metadata",
|
|
59
|
-
"settings": {
|
|
60
|
-
"page_schema": {
|
|
61
|
-
"title": {"type": "string"},
|
|
62
|
-
"author": {"type": "string"},
|
|
63
|
-
"summary": {"type": "string"}
|
|
64
|
-
}
|
|
65
|
-
},
|
|
66
|
-
"depends_on": ["parse_document"]
|
|
67
|
-
}
|
|
68
|
-
]
|
|
69
|
-
}
|
|
70
|
-
|
|
71
|
-
|
|
72
45
|
def main():
|
|
73
46
|
parser = argparse.ArgumentParser(
|
|
74
47
|
description="Run any workflow end-to-end from definition to results"
|
|
@@ -126,14 +99,10 @@ def main():
|
|
|
126
99
|
# Step 1: Load or create workflow definition
|
|
127
100
|
print("📄 Loading workflow definition...")
|
|
128
101
|
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
print(f" Source: {args.definition}")
|
|
134
|
-
else:
|
|
135
|
-
workflow_def = create_simple_workflow_definition()
|
|
136
|
-
print(f" Source: Built-in simple workflow")
|
|
102
|
+
# Build replacements dict
|
|
103
|
+
replacements = dict(args.replace) if args.replace else None
|
|
104
|
+
workflow_def = load_workflow_definition(args.definition, replacements)
|
|
105
|
+
print(f" Source: {args.definition}")
|
|
137
106
|
|
|
138
107
|
print(f" Name: {workflow_def['name']}")
|
|
139
108
|
print(f" Steps: {len(workflow_def['steps'])}\n")
|
|
@@ -4,6 +4,7 @@ This directory contains JSON workflow definitions that can be loaded and execute
|
|
|
4
4
|
|
|
5
5
|
## Available Workflows
|
|
6
6
|
|
|
7
|
+
- [Parse and Segment (Simple)](#parse-and-segment-simple) - Run one or more files through a straightforward Parse → Segment pipeline
|
|
7
8
|
- [Eval Segmentation Across Providers](#eval-segmentation-across-providers) - Compare Marker vs Reducto segmentation in parallel
|
|
8
9
|
- [Parallel Extract Large SEC Filings](#parallel-extract-large-sec-filings) - Parse → Segment → Extract from multiple sections in parallel
|
|
9
10
|
- [Slack Alert Workflow](#slack-alert-workflow) - Full pipeline with parallel extraction, aggregation, and Slack notification
|
|
@@ -33,6 +34,35 @@ For a full list of `settings` to use for `marker` related steps, visit our [API
|
|
|
33
34
|
|
|
34
35
|
## Available Workflow Definitions
|
|
35
36
|
|
|
37
|
+
### Parse and Segment (Simple)
|
|
38
|
+
|
|
39
|
+
**What it does:**
|
|
40
|
+
A simple workflow that runs `marker_parse` followed by `marker_segment`. You can pass in one or more `file_urls` when triggering your workflow.
|
|
41
|
+
|
|
42
|
+
Once you get results, you can process them to run your own custom evaluations.
|
|
43
|
+
|
|
44
|
+
**Structure:**
|
|
45
|
+
- **Marker branch**: Parse → Segment
|
|
46
|
+
|
|
47
|
+
**Visualize:**
|
|
48
|
+
```bash
|
|
49
|
+
datalab visualize-workflow --definition recipes/workflows/workflow_definitions/parse_segment.json
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
**Execute:**
|
|
53
|
+
```bash
|
|
54
|
+
# Using end-to-end runner
|
|
55
|
+
python recipes/workflows/end_to_end_workflow.py \
|
|
56
|
+
--definition recipes/workflows/workflow_definitions/parse_segment.json \
|
|
57
|
+
--file_url https://www.novonordisk.com/content/dam/nncorp/global/en/investors/irmaterial/annual_report/2024/novo-nordisk-form-20-f-2023.pdf
|
|
58
|
+
|
|
59
|
+
# Or step-by-step
|
|
60
|
+
python recipes/workflows/workflow_api_tutorial/3_create_workflow.py \
|
|
61
|
+
--definition recipes/workflows/workflow_definitions/parse_segment.json
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
---
|
|
65
|
+
|
|
36
66
|
### Eval Segmentation Across Providers
|
|
37
67
|
|
|
38
68
|
**What it does:**
|
|
@@ -46,21 +76,21 @@ Once you get results, you can process them to run your own custom evaluations.
|
|
|
46
76
|
|
|
47
77
|
**Visualize:**
|
|
48
78
|
```bash
|
|
49
|
-
datalab visualize-workflow --definition workflow_definitions/eval_segmentation.json
|
|
79
|
+
datalab visualize-workflow --definition recipes/workflows/workflow_definitions/eval_segmentation.json
|
|
50
80
|
```
|
|
51
81
|
|
|
52
82
|
**Execute:**
|
|
53
83
|
```bash
|
|
54
84
|
# Using end-to-end runner
|
|
55
85
|
python recipes/workflows/end_to_end_workflow.py \
|
|
56
|
-
--definition workflow_definitions/eval_segmentation.json \
|
|
86
|
+
--definition recipes/workflows/workflow_definitions/eval_segmentation.json \
|
|
57
87
|
--file_url https://example.com/doc.pdf \
|
|
58
88
|
--replace YOUR_REDUCTO_API_KEY your_key_here \
|
|
59
89
|
--save results.json
|
|
60
90
|
|
|
61
91
|
# Or step-by-step
|
|
62
92
|
python recipes/workflows/workflow_api_tutorial/3_create_workflow.py \
|
|
63
|
-
--definition workflow_definitions/eval_segmentation.json \
|
|
93
|
+
--definition recipes/workflows/workflow_definitions/eval_segmentation.json \
|
|
64
94
|
--replace YOUR_REDUCTO_API_KEY your_key_here
|
|
65
95
|
```
|
|
66
96
|
|
|
@@ -86,19 +116,19 @@ Without this, you might have a long, dense schema that applies on the entire doc
|
|
|
86
116
|
|
|
87
117
|
**Visualize:**
|
|
88
118
|
```bash
|
|
89
|
-
datalab visualize-workflow --definition workflow_definitions/segment_parallel_extract.json
|
|
119
|
+
datalab visualize-workflow --definition recipes/workflows/workflow_definitions/segment_parallel_extract.json
|
|
90
120
|
```
|
|
91
121
|
|
|
92
122
|
**Execute:**
|
|
93
123
|
```bash
|
|
94
124
|
# Using end-to-end runner
|
|
95
125
|
python recipes/workflows/end_to_end_workflow.py \
|
|
96
|
-
--definition workflow_definitions/
|
|
126
|
+
--definition recipes/workflows/workflow_definitions/segment_parallel_extract.json \
|
|
97
127
|
--file_url https://www.novonordisk.com/content/dam/nncorp/global/en/investors/irmaterial/annual_report/2024/novo-nordisk-form-20-f-2023.pdf
|
|
98
128
|
|
|
99
129
|
# Or step-by-step
|
|
100
130
|
python recipes/workflows/workflow_api_tutorial/3_create_workflow.py \
|
|
101
|
-
--definition workflow_definitions/
|
|
131
|
+
--definition recipes/workflows/workflow_definitions/segment_parallel_extract.json
|
|
102
132
|
```
|
|
103
133
|
|
|
104
134
|
---
|
|
@@ -119,14 +149,14 @@ Complete pipeline that parses documents, segments into sections, extracts struct
|
|
|
119
149
|
|
|
120
150
|
**Visualize:**
|
|
121
151
|
```bash
|
|
122
|
-
datalab visualize-workflow --definition workflow_definitions/slack_alert.json
|
|
152
|
+
datalab visualize-workflow --definition recipes/workflows/workflow_definitions/slack_alert.json
|
|
123
153
|
```
|
|
124
154
|
|
|
125
155
|
**Execute:**
|
|
126
156
|
```bash
|
|
127
157
|
# Using end-to-end runner with multiple files
|
|
128
158
|
python recipes/workflows/end_to_end_workflow.py \
|
|
129
|
-
--definition workflow_definitions/slack_alert.json \
|
|
159
|
+
--definition recipes/workflows/workflow_definitions/slack_alert.json \
|
|
130
160
|
--file_url https://www.novonordisk.com/content/dam/nncorp/global/en/investors/irmaterial/annual_report/2024/novo-nordisk-form-20-f-2023.pdf \
|
|
131
161
|
--replace YOUR_SLACK_BOT_TOKEN xoxb-your-token \
|
|
132
162
|
--replace YOUR_SLACK_CHANNEL_ID <YOUR_CHANNEL_ID> \
|
|
@@ -134,7 +164,7 @@ python recipes/workflows/end_to_end_workflow.py \
|
|
|
134
164
|
|
|
135
165
|
# Or step-by-step
|
|
136
166
|
python recipes/workflows/workflow_api_tutorial/3_create_workflow.py \
|
|
137
|
-
--definition workflow_definitions/slack_alert.json \
|
|
167
|
+
--definition recipes/workflows/workflow_definitions/slack_alert.json \
|
|
138
168
|
--replace YOUR_SLACK_BOT_TOKEN xoxb-your-token \
|
|
139
169
|
--replace YOUR_SLACK_CHANNEL_ID <YOUR_CHANNEL_ID>
|
|
140
170
|
```
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "Parse and Segment",
|
|
3
|
+
"description": "Simple Parse and Segment Workflow",
|
|
4
|
+
"steps": [
|
|
5
|
+
{
|
|
6
|
+
"step_key": "marker_parse",
|
|
7
|
+
"unique_name": "marker_parse",
|
|
8
|
+
"settings": {},
|
|
9
|
+
"depends_on": []
|
|
10
|
+
},
|
|
11
|
+
{
|
|
12
|
+
"step_key": "marker_segment",
|
|
13
|
+
"unique_name": "marker_segment",
|
|
14
|
+
"settings": {
|
|
15
|
+
"segmentation_schema": {
|
|
16
|
+
"segmentation_strategy": "document_boundary"
|
|
17
|
+
}
|
|
18
|
+
},
|
|
19
|
+
"depends_on": ["marker_parse"]
|
|
20
|
+
}
|
|
21
|
+
]
|
|
22
|
+
}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
|
-
"name": "Parse
|
|
3
|
-
"description": "
|
|
2
|
+
"name": "Parse then Post to Slack",
|
|
3
|
+
"description": "Parse documents and post to Slack",
|
|
4
4
|
"steps": [
|
|
5
5
|
{
|
|
6
6
|
"step_key": "marker_parse",
|
|
@@ -54,7 +54,7 @@ class TestConvertCommand:
|
|
|
54
54
|
)
|
|
55
55
|
|
|
56
56
|
assert result.exit_code == 0
|
|
57
|
-
assert "
|
|
57
|
+
assert "Successfully processed" in result.output
|
|
58
58
|
|
|
59
59
|
# Verify client was called correctly
|
|
60
60
|
mock_client_class.assert_called_once()
|
|
@@ -79,7 +79,7 @@ class TestConvertCommand:
|
|
|
79
79
|
settings.DATALAB_API_KEY = None
|
|
80
80
|
|
|
81
81
|
assert result.exit_code == 0
|
|
82
|
-
assert "
|
|
82
|
+
assert "Successfully processed" in result.output
|
|
83
83
|
|
|
84
84
|
finally:
|
|
85
85
|
os.unlink(tmp_file.name)
|
|
@@ -131,7 +131,7 @@ class TestOCRCommand:
|
|
|
131
131
|
)
|
|
132
132
|
|
|
133
133
|
assert result.exit_code == 0
|
|
134
|
-
assert "
|
|
134
|
+
assert "Successfully processed: 2 files" in result.output
|
|
135
135
|
|
|
136
136
|
# Verify client was called correctly
|
|
137
137
|
mock_client_class.assert_called_once()
|
|
@@ -163,7 +163,7 @@ class TestOCRCommand:
|
|
|
163
163
|
)
|
|
164
164
|
|
|
165
165
|
assert result.exit_code == 0
|
|
166
|
-
assert "
|
|
166
|
+
assert "Successfully processed: 2 files" in result.output
|
|
167
167
|
|
|
168
168
|
finally:
|
|
169
169
|
os.unlink(tmp_file.name)
|
|
@@ -193,5 +193,5 @@ class TestOCRCommand:
|
|
|
193
193
|
)
|
|
194
194
|
|
|
195
195
|
assert result.exit_code == 0
|
|
196
|
-
assert "
|
|
197
|
-
assert "
|
|
196
|
+
assert "OCR Summary:" in result.output
|
|
197
|
+
assert "Successfully processed: 2 files" in result.output
|
|
@@ -76,7 +76,7 @@ class TestWorkflowMethods:
|
|
|
76
76
|
assert workflow.name == "Test Workflow"
|
|
77
77
|
assert workflow.team_id == 12
|
|
78
78
|
assert len(workflow.steps) == 2
|
|
79
|
-
assert workflow.steps[0].
|
|
79
|
+
assert workflow.steps[0].unique_name == "marker_parse"
|
|
80
80
|
assert workflow.steps[1].depends_on == ["marker_parse"]
|
|
81
81
|
|
|
82
82
|
@pytest.mark.asyncio
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|