remdb-0.3.118-py3-none-any.whl → remdb-0.3.141-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of remdb might be problematic.

Files changed (40)
  1. rem/agentic/agents/sse_simulator.py +2 -0
  2. rem/agentic/context.py +23 -3
  3. rem/agentic/mcp/tool_wrapper.py +126 -15
  4. rem/agentic/otel/setup.py +1 -0
  5. rem/agentic/providers/phoenix.py +371 -108
  6. rem/agentic/providers/pydantic_ai.py +122 -43
  7. rem/agentic/schema.py +4 -1
  8. rem/api/mcp_router/tools.py +13 -2
  9. rem/api/routers/chat/completions.py +250 -4
  10. rem/api/routers/chat/models.py +81 -7
  11. rem/api/routers/chat/otel_utils.py +33 -0
  12. rem/api/routers/chat/sse_events.py +17 -1
  13. rem/api/routers/chat/streaming.py +35 -1
  14. rem/api/routers/feedback.py +134 -14
  15. rem/cli/commands/cluster.py +590 -82
  16. rem/cli/commands/configure.py +3 -4
  17. rem/cli/commands/experiments.py +436 -30
  18. rem/cli/commands/session.py +336 -0
  19. rem/cli/dreaming.py +2 -2
  20. rem/cli/main.py +2 -0
  21. rem/config.py +8 -1
  22. rem/models/core/experiment.py +54 -0
  23. rem/models/entities/ontology.py +1 -1
  24. rem/models/entities/ontology_config.py +1 -1
  25. rem/schemas/agents/examples/contract-analyzer.yaml +1 -1
  26. rem/schemas/agents/examples/contract-extractor.yaml +1 -1
  27. rem/schemas/agents/examples/cv-parser.yaml +1 -1
  28. rem/services/phoenix/client.py +59 -18
  29. rem/services/session/compression.py +7 -0
  30. rem/settings.py +236 -13
  31. rem/sql/migrations/002_install_models.sql +91 -91
  32. rem/sql/migrations/004_cache_system.sql +1 -1
  33. rem/utils/schema_loader.py +94 -3
  34. rem/utils/vision.py +1 -1
  35. rem/workers/__init__.py +2 -1
  36. rem/workers/db_listener.py +579 -0
  37. {remdb-0.3.118.dist-info → remdb-0.3.141.dist-info}/METADATA +156 -144
  38. {remdb-0.3.118.dist-info → remdb-0.3.141.dist-info}/RECORD +40 -37
  39. {remdb-0.3.118.dist-info → remdb-0.3.141.dist-info}/WHEEL +0 -0
  40. {remdb-0.3.118.dist-info → remdb-0.3.141.dist-info}/entry_points.txt +0 -0
rem/cli/commands/configure.py

@@ -110,7 +110,7 @@ def prompt_llm_config(use_defaults: bool = False) -> dict:
     config = {}
 
     # Default values
-    default_model = "anthropic:claude-sonnet-4-5-20250929"
+    default_model = "openai:gpt-4.1"
     default_temperature = 0.5
 
     if use_defaults:
@@ -124,9 +124,9 @@ def prompt_llm_config(use_defaults: bool = False) -> dict:
     # Default model
    click.echo("\nDefault LLM model (format: provider:model-id)")
     click.echo("Examples:")
+    click.echo("  - openai:gpt-4.1")
     click.echo("  - anthropic:claude-sonnet-4-5-20250929")
-    click.echo("  - openai:gpt-4o")
-    click.echo("  - openai:gpt-4o-mini")
+    click.echo("  - openai:gpt-4.1-mini")
 
     config["default_model"] = click.prompt(
         "Default model", default=default_model
@@ -422,7 +422,6 @@ def configure_command(install: bool, claude_desktop: bool, show: bool, edit: boo
 
     try:
         import shutil
-        from pathlib import Path
         from fastmcp.mcp_config import update_config_file, StdioMCPServer
 
         # Find Claude Desktop config path
rem/cli/commands/experiments.py

@@ -63,6 +63,7 @@ def experiments():
 @experiments.command("create")
 @click.argument("name")
 @click.option("--agent", "-a", required=True, help="Agent schema name (e.g., 'cv-parser')")
+@click.option("--task", "-t", default="general", help="Task name for organizing experiments (e.g., 'risk-assessment')")
 @click.option("--evaluator", "-e", default="default", help="Evaluator schema name (default: 'default')")
 @click.option("--description", "-d", help="Experiment description")
 @click.option("--dataset-location", type=click.Choice(["git", "s3", "hybrid"]), default="git",
@@ -74,6 +75,7 @@ def experiments():
 def create(
     name: str,
     agent: str,
+    task: str,
     evaluator: str,
     description: Optional[str],
     dataset_location: str,
@@ -170,7 +172,8 @@ def create(
     # Create experiment config
     config = ExperimentConfig(
         name=name,
-        description=description or f"Evaluation experiment for {agent} agent",
+        task=task,
+        description=description or f"Evaluation experiment for {agent} agent ({task} task)",
         agent_schema_ref=SchemaReference(
             name=agent,
             version=None,  # Use latest by default
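
The new --task option (default "general") flows through create() into ExperimentConfig and, further down this diff, into the S3 export layout. A hypothetical invocation (experiment and schema names are illustrative):

    rem experiments create contract-risk --agent contract-analyzer --task risk-assessment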
@@ -514,6 +517,159 @@ def show(name: str, base_path: Optional[str]):
         raise click.Abort()
 
 
+# =============================================================================
+# VIBES MODE HELPER
+# =============================================================================
+
+
+def _run_vibes_mode(
+    config: Any,
+    dataset_df: Any,
+    task_fn: Any,
+    base_path: str,
+    limit: Optional[int],
+    evaluator_schema_path: Path,
+) -> None:
+    """Run experiment in vibes mode - execute agent and export for AI evaluation.
+
+    Vibes mode runs the agent on each example and saves results to a JSONL file.
+    The AI assistant (e.g., Claude Code) then acts as the judge using the
+    evaluator schema to evaluate results.
+
+    Args:
+        config: ExperimentConfig object
+        dataset_df: Polars DataFrame with ground truth examples
+        task_fn: Function to run agent on each example
+        base_path: Base directory for experiments
+        limit: Optional limit on number of examples to process
+        evaluator_schema_path: Path to the evaluator schema YAML file
+    """
+    from rem.utils.date_utils import format_timestamp_for_experiment, utc_now, to_iso
+    import json
+
+    # Apply limit if specified
+    if limit:
+        dataset_df = dataset_df.head(limit)
+        click.echo(f"  (Limited to {limit} examples)")
+
+    # Create results directory
+    timestamp = format_timestamp_for_experiment()
+    results_dir = Path(base_path) / config.name / "results" / timestamp
+    results_dir.mkdir(parents=True, exist_ok=True)
+
+    click.echo(f"\n⏳ Running agent on {len(dataset_df)} examples...")
+    click.echo(f"  Results will be saved to: {results_dir}")
+    click.echo()
+
+    # Run agent on each example and collect results
+    results = []
+    records = dataset_df.to_dicts()
+
+    for i, record in enumerate(records, 1):
+        example_id = record.get("id", i)
+        click.echo(f"  [{i}/{len(records)}] Processing example {example_id}...", nl=False)
+
+        try:
+            # Prepare input for agent
+            input_text = record.get("text", record.get("input", record.get("query", "")))
+            example_input = {"query": input_text} if isinstance(input_text, str) else input_text
+
+            # Run agent
+            output = task_fn({"input": example_input})
+
+            result = {
+                "id": example_id,
+                "input": input_text,
+                "ground_truth": record.get("ground_truth", record.get("expected_output", "")),
+                "category": record.get("category", ""),
+                "agent_output": output,
+                "status": "success",
+            }
+            click.echo(" ✓")
+
+        except Exception as e:
+            result = {
+                "id": example_id,
+                "input": record.get("text", record.get("input", "")),
+                "ground_truth": record.get("ground_truth", record.get("expected_output", "")),
+                "category": record.get("category", ""),
+                "agent_output": None,
+                "status": "error",
+                "error": str(e),
+            }
+            click.echo(f" ✗ ({e})")
+
+        results.append(result)
+
+    # Save results to JSONL
+    results_file = results_dir / "vibes-results.jsonl"
+    with open(results_file, "w") as f:
+        for result in results:
+            f.write(json.dumps(result) + "\n")
+
+    # Copy evaluator schema to results dir for easy reference
+    import shutil
+    evaluator_copy = results_dir / "evaluator-schema.yaml"
+    shutil.copy(evaluator_schema_path, evaluator_copy)
+
+    # Save run metadata
+    run_info = {
+        "experiment": config.name,
+        "agent": config.agent_schema_ref.name,
+        "evaluator": config.evaluator_schema_ref.name,
+        "mode": "vibes",
+        "timestamp": timestamp,
+        "total_examples": len(records),
+        "successful": len([r for r in results if r["status"] == "success"]),
+        "failed": len([r for r in results if r["status"] == "error"]),
+        "completed_at": to_iso(utc_now()),
+    }
+
+    run_info_file = results_dir / "run-info.json"
+    with open(run_info_file, "w") as f:
+        json.dump(run_info, f, indent=2)
+
+    # Print summary and instructions
+    success_count = run_info["successful"]
+    fail_count = run_info["failed"]
+
+    click.echo(f"\n{'=' * 60}")
+    click.echo(f"VIBES MODE COMPLETE")
+    click.echo(f"{'=' * 60}")
+    click.echo(f"\nResults: {success_count} successful, {fail_count} failed")
+    click.echo(f"\nFiles saved to: {results_dir}/")
+    click.echo(f"  - vibes-results.jsonl (agent outputs)")
+    click.echo(f"  - evaluator-schema.yaml (evaluation criteria)")
+    click.echo(f"  - run-info.json (run metadata)")
+
+    click.echo(f"\n{'=' * 60}")
+    click.echo(f"NEXT STEP: Ask your AI assistant to evaluate")
+    click.echo(f"{'=' * 60}")
+    click.echo(f"""
+Copy this prompt to Claude Code or your AI assistant:
+
+Please evaluate the experiment results in:
+{results_dir}/
+
+Read the vibes-results.jsonl file and evaluate each example
+using the evaluator schema in evaluator-schema.yaml.
+
+For each example, provide:
+1. extracted_classification
+2. exact_match (vs ground_truth)
+3. semantic_match
+4. reasoning_quality_score
+5. overall_score
+6. pass/fail
+
+Then provide summary metrics:
+- Exact match accuracy
+- Semantic match accuracy
+- Average overall score
+- Pass rate
+""")
+
+
 # =============================================================================
 # RUN COMMAND
 # =============================================================================
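
For orientation, and not part of the diff: a minimal Python sketch of consuming the vibes-results.jsonl that _run_vibes_mode writes, assuming only the record fields visible above (status, agent_output, ground_truth); the results path is illustrative.

    import json
    from pathlib import Path

    # Illustrative run directory; real runs are timestamped by _run_vibes_mode
    results_dir = Path(".experiments/my-experiment/results/20241203-143022")

    with open(results_dir / "vibes-results.jsonl") as f:
        records = [json.loads(line) for line in f]

    # Mirror the summary metrics the vibes prompt asks the AI judge to report
    ok = [r for r in records if r["status"] == "success"]
    exact = [r for r in ok if str(r["agent_output"]).strip() == str(r["ground_truth"]).strip()]
    print(f"successful: {len(ok)}/{len(records)}")
    print(f"exact-match accuracy: {len(exact) / max(len(ok), 1):.2%}")

This only approximates exact match; semantic match and reasoning-quality scoring are left to the AI judge, per the prompt printed above.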
@@ -524,6 +680,8 @@ def show(name: str, base_path: Optional[str]):
 @click.option("--base-path", help="Base directory for experiments (default: EXPERIMENTS_HOME or 'experiments')")
 @click.option("--version", help="Git tag version to load (e.g., 'experiments/my-exp/v1.0.0')")
 @click.option("--dry-run", is_flag=True, help="Test on small subset without saving")
+@click.option("--only-vibes", is_flag=True, help="Run agent locally, export results for AI evaluation (no Phoenix)")
+@click.option("--limit", "-n", type=int, help="Limit number of examples to evaluate (useful with --only-vibes)")
 @click.option("--update-prompts", is_flag=True, help="Update prompts in Phoenix before running")
 @click.option("--phoenix-url", help="Phoenix server URL (overrides PHOENIX_BASE_URL env var)")
 @click.option("--phoenix-api-key", help="Phoenix API key (overrides PHOENIX_API_KEY env var)")
@@ -532,14 +690,45 @@ def run(
     base_path: Optional[str],
     version: Optional[str],
     dry_run: bool,
+    only_vibes: bool,
+    limit: Optional[int],
     update_prompts: bool,
     phoenix_url: Optional[str],
     phoenix_api_key: Optional[str],
 ):
-    """Run an experiment using Phoenix provider.
+    """Run an experiment using Phoenix provider or local vibes mode.
 
     Loads configuration, executes agent and evaluator, saves results.
 
+    Vibes Mode (--only-vibes):
+        Run agent locally without Phoenix infrastructure. Agent outputs are saved
+        to a JSONL file along with the evaluator schema. Your AI assistant (e.g.,
+        Claude Code) then acts as the judge to evaluate results.
+
+        This enables seamless switching between:
+        - Local evaluation: Quick iteration with AI-as-judge
+        - Phoenix evaluation: Production metrics and dashboards
+
+        Usage:
+            rem experiments run my-experiment --only-vibes
+            rem experiments run my-experiment --only-vibes --limit 5
+
+        The command will:
+        1. Run the agent on each ground-truth example
+        2. Save results to results/{timestamp}/vibes-results.jsonl
+        3. Print the evaluator prompt and schema
+        4. Instruct you to ask your AI assistant to evaluate
+
+        Example workflow with Claude Code:
+            $ rem experiments run mental-health-classifier --only-vibes --limit 3
+            # ... agent runs ...
+            # Results saved to: .experiments/mental-health-classifier/results/20241203-143022/
+
+            # Then ask Claude Code:
+            "Please evaluate the experiment results in
+            .experiments/mental-health-classifier/results/20241203-143022/
+            using the evaluator schema provided"
+
     Phoenix Connection:
         Commands respect PHOENIX_BASE_URL and PHOENIX_API_KEY environment variables.
         Defaults to localhost:6006 for local development.
@@ -562,6 +751,12 @@ def run(
     # Run experiment with latest schemas
     rem experiments run hello-world-validation
 
+    # Quick local evaluation (vibes mode)
+    rem experiments run hello-world-validation --only-vibes
+
+    # Vibes mode with limited examples
+    rem experiments run hello-world-validation --only-vibes --limit 5
+
     # Run specific version
     rem experiments run hello-world-validation \\
         --version experiments/hello-world-validation/v1.0.0
@@ -674,40 +869,67 @@ def run(
 
     click.echo(f"Loading evaluator: {evaluator_name} for agent {agent_name}")
 
-    # Try multiple evaluator path patterns (agent-specific, then generic)
-    evaluator_paths_to_try = [
-        f"{agent_name}/{evaluator_name}",  # e.g., hello-world/default
-        f"{agent_name}-{evaluator_name}",  # e.g., hello-world-default
-        evaluator_name,  # e.g., default (generic)
-    ]
+    # Find evaluator schema file path
+    from rem.utils.schema_loader import get_evaluator_schema_path
+
+    evaluator_schema_path = get_evaluator_schema_path(evaluator_name)
+    if not evaluator_schema_path or not evaluator_schema_path.exists():
+        click.echo(f"Error: Could not find evaluator schema '{evaluator_name}'")
+        raise click.Abort()
 
+    click.echo(f"✓ Found evaluator schema: {evaluator_schema_path}")
+
+    # For Phoenix mode, also load evaluator function
     evaluator_fn = None
-    evaluator_load_error = None
+    if not only_vibes:
+        # Try multiple evaluator path patterns (agent-specific, then generic)
+        evaluator_paths_to_try = [
+            f"{agent_name}/{evaluator_name}",  # e.g., hello-world/default
+            f"{agent_name}-{evaluator_name}",  # e.g., hello-world-default
+            evaluator_name,  # e.g., default (generic)
+        ]
 
-    for evaluator_path in evaluator_paths_to_try:
-        try:
-            evaluator_fn = create_evaluator_from_schema(
-                evaluator_schema_path=evaluator_path,
-                model_name=None,  # Use default from schema
-            )
-            click.echo(f"✓ Loaded evaluator schema: {evaluator_path}")
-            break
-        except FileNotFoundError as e:
-            evaluator_load_error = e
-            logger.debug(f"Evaluator not found at {evaluator_path}: {e}")
-            continue
-        except Exception as e:
-            evaluator_load_error = e
-            logger.warning(f"Failed to load evaluator from {evaluator_path}: {e}")
-            continue
+        evaluator_load_error = None
 
-    if evaluator_fn is None:
-        click.echo(f"Error: Could not load evaluator schema '{evaluator_name}'")
+        for evaluator_path in evaluator_paths_to_try:
+            try:
+                evaluator_fn = create_evaluator_from_schema(
+                    evaluator_schema_path=evaluator_path,
+                    model_name=None,  # Use default from schema
+                )
+                click.echo(f"✓ Loaded evaluator function: {evaluator_path}")
+                break
+            except FileNotFoundError as e:
+                evaluator_load_error = e
+                logger.debug(f"Evaluator not found at {evaluator_path}: {e}")
+                continue
+            except Exception as e:
+                evaluator_load_error = e
+                logger.warning(f"Failed to load evaluator from {evaluator_path}: {e}")
+                continue
+
+    if evaluator_fn is None and not only_vibes:
+        click.echo(f"Error: Could not load evaluator function '{evaluator_name}'")
         click.echo(f"  Tried paths: {evaluator_paths_to_try}")
         if evaluator_load_error:
             click.echo(f"  Last error: {evaluator_load_error}")
         raise click.Abort()
 
+    # Validate evaluator credentials before running expensive agent tasks
+    if evaluator_fn is not None and not only_vibes:
+        from rem.agentic.providers.phoenix import validate_evaluator_credentials
+
+        click.echo("Validating evaluator credentials...")
+        is_valid, error_msg = validate_evaluator_credentials()
+        if not is_valid:
+            click.echo(click.style(f"\n⚠️ Evaluator validation failed: {error_msg}", fg="yellow"))
+            click.echo("\nOptions:")
+            click.echo("  1. Fix the credentials issue and re-run")
+            click.echo("  2. Run with --only-vibes to skip LLM evaluation")
+            click.echo("  3. Use --evaluator-model to specify a different model")
+            raise click.Abort()
+        click.echo("✓ Evaluator credentials validated")
+
     # Load dataset using Polars
     import polars as pl
 
@@ -769,6 +991,18 @@ def run(
     # TODO: Implement prompt updating
     click.echo("⚠ --update-prompts not yet implemented")
 
+    # Vibes mode: run agent and export for AI evaluation
+    if only_vibes:
+        _run_vibes_mode(
+            config=config,
+            dataset_df=dataset_df,
+            task_fn=task_fn,
+            base_path=base_path,
+            limit=limit,
+            evaluator_schema_path=evaluator_schema_path,
+        )
+        return
+
     # Run experiment via Phoenix
     if not dry_run:
         # Create Phoenix client with optional overrides
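
As wired above, --only-vibes returns before any Phoenix client is constructed, so the same experiment config drives both paths (flags from this diff; the experiment name is illustrative):

    rem experiments run my-experiment --only-vibes --limit 5   # local run, AI-as-judge
    rem experiments run my-experiment                          # full Phoenix run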
@@ -1067,7 +1301,7 @@ def prompt():
 @click.option("--system-prompt", "-s", required=True, help="System prompt text")
 @click.option("--description", "-d", help="Prompt description")
 @click.option("--model-provider", default="OPENAI", help="Model provider (OPENAI, ANTHROPIC)")
-@click.option("--model-name", "-m", help="Model name (e.g., gpt-4o, claude-sonnet-4-5)")
+@click.option("--model-name", "-m", help="Model name (e.g., gpt-4.1, claude-sonnet-4-5)")
 @click.option("--type", "-t", "prompt_type", default="Agent", help="Prompt type (Agent or Evaluator)")
 def prompt_create(
     name: str,
@@ -1083,7 +1317,7 @@ def prompt_create(
     # Create agent prompt
     rem experiments prompt create hello-world \\
         --system-prompt "You are a helpful assistant." \\
-        --model-name gpt-4o
+        --model-name gpt-4.1
 
     # Create evaluator prompt
     rem experiments prompt create correctness-evaluator \\
@@ -1101,7 +1335,7 @@ def prompt_create(
     try:
         # Set default model if not specified
         if not model_name:
-            model_name = "gpt-4o" if model_provider == "OPENAI" else "claude-sonnet-4-5-20250929"
+            model_name = "gpt-4.1" if model_provider == "OPENAI" else "claude-sonnet-4-5-20250929"
 
         # Get config
         phoenix_client = PhoenixClient()
@@ -1304,3 +1538,175 @@ def trace_list(
         logger.error(f"Failed to list traces: {e}")
         click.echo(f"Error: {e}", err=True)
         raise click.Abort()
+
+
+# =============================================================================
+# EXPORT COMMAND
+# =============================================================================
+
+
+@experiments.command("export")
+@click.argument("name")
+@click.option("--base-path", help="Base directory for experiments (default: EXPERIMENTS_HOME or 'experiments')")
+@click.option("--bucket", "-b", help="S3 bucket name (default: DATA_LAKE__BUCKET_NAME)")
+@click.option("--version", "-v", default="v0", help="Data lake version prefix (default: v0)")
+@click.option("--plan", is_flag=True, help="Show what would be exported without uploading")
+@click.option("--include-results", is_flag=True, help="Include results directory in export")
+def export(
+    name: str,
+    base_path: Optional[str],
+    bucket: Optional[str],
+    version: str,
+    plan: bool,
+    include_results: bool,
+):
+    """Export experiment to S3 data lake.
+
+    Exports experiment configuration, ground truth, and optionally results
+    to the S3 data lake following the convention:
+
+        s3://{bucket}/{version}/datasets/calibration/experiments/{agent}/{task}/
+
+    The export includes:
+    - experiment.yaml (configuration)
+    - README.md (documentation)
+    - ground-truth/ (evaluation datasets)
+    - seed-data/ (optional seed data)
+    - results/ (optional, with --include-results)
+
+    Examples:
+        # Preview what would be exported
+        rem experiments export my-experiment --plan
+
+        # Export to configured data lake bucket
+        rem experiments export my-experiment
+
+        # Export to specific bucket
+        rem experiments export my-experiment --bucket siggy-data
+
+        # Include results in export
+        rem experiments export my-experiment --include-results
+
+        # Export with custom version prefix
+        rem experiments export my-experiment --version v1
+    """
+    from rem.models.core.experiment import ExperimentConfig
+    from rem.settings import settings
+    from rem.services.fs.s3_provider import S3Provider
+    import os
+    import json
+
+    try:
+        # Resolve base path
+        if base_path is None:
+            base_path = os.getenv("EXPERIMENTS_HOME", "experiments")
+
+        # Load experiment configuration
+        config_path = Path(base_path) / name / "experiment.yaml"
+        if not config_path.exists():
+            click.echo(f"Experiment not found: {name}")
+            click.echo(f"  Looked in: {config_path}")
+            raise click.Abort()
+
+        config = ExperimentConfig.from_yaml(config_path)
+        click.echo(f"✓ Loaded experiment: {name}")
+
+        # Resolve bucket
+        if bucket is None:
+            bucket = settings.data_lake.bucket_name
+        if bucket is None:
+            click.echo("Error: No S3 bucket configured.")
+            click.echo("  Set DATA_LAKE__BUCKET_NAME environment variable or use --bucket option")
+            raise click.Abort()
+
+        # Build S3 paths
+        s3_base = config.get_s3_export_path(bucket, version)
+        exp_dir = config.get_experiment_dir(base_path)
+
+        # Collect files to export
+        files_to_export = []
+
+        # Always include these files
+        required_files = [
+            ("experiment.yaml", exp_dir / "experiment.yaml"),
+            ("README.md", exp_dir / "README.md"),
+        ]
+
+        for s3_name, local_path in required_files:
+            if local_path.exists():
+                files_to_export.append((s3_name, local_path))
+
+        # Include ground-truth directory
+        ground_truth_dir = exp_dir / "ground-truth"
+        if ground_truth_dir.exists():
+            for f in ground_truth_dir.rglob("*"):
+                if f.is_file():
+                    relative = f.relative_to(exp_dir)
+                    files_to_export.append((str(relative), f))
+
+        # Include seed-data directory
+        seed_data_dir = exp_dir / "seed-data"
+        if seed_data_dir.exists():
+            for f in seed_data_dir.rglob("*"):
+                if f.is_file():
+                    relative = f.relative_to(exp_dir)
+                    files_to_export.append((str(relative), f))
+
+        # Optionally include results
+        if include_results:
+            results_dir = exp_dir / "results"
+            if results_dir.exists():
+                for f in results_dir.rglob("*"):
+                    if f.is_file():
+                        relative = f.relative_to(exp_dir)
+                        files_to_export.append((str(relative), f))
+
+        # Display export plan
+        click.echo(f"\n{'=' * 60}")
+        click.echo(f"EXPORT {'PLAN' if plan else 'TO S3'}")
+        click.echo(f"{'=' * 60}")
+        click.echo(f"\nExperiment: {config.name}")
+        click.echo(f"Agent: {config.agent_schema_ref.name}")
+        click.echo(f"Task: {config.task}")
+        click.echo(f"Evaluator file: {config.get_evaluator_filename()}")
+        click.echo(f"\nDestination: {s3_base}/")
+        click.echo(f"\nFiles to export ({len(files_to_export)}):")
+
+        for s3_name, local_path in files_to_export:
+            s3_uri = f"{s3_base}/{s3_name}"
+            if plan:
+                click.echo(f"  {local_path}")
+                click.echo(f"    → {s3_uri}")
+            else:
+                click.echo(f"  {s3_name}")
+
+        if plan:
+            click.echo(f"\n[PLAN MODE] No files were uploaded.")
+            click.echo(f"Run without --plan to execute the export.")
+            return
+
+        # Execute export
+        click.echo(f"\n⏳ Uploading to S3...")
+        s3 = S3Provider()
+
+        uploaded = 0
+        for s3_name, local_path in files_to_export:
+            s3_uri = f"{s3_base}/{s3_name}"
+            try:
+                s3.copy(str(local_path), s3_uri)
+                uploaded += 1
+                click.echo(f"  ✓ {s3_name}")
+            except Exception as e:
+                click.echo(f"  ✗ {s3_name}: {e}")
+
+        click.echo(f"\n✓ Exported {uploaded}/{len(files_to_export)} files to {s3_base}/")
+
+        # Show next steps
+        click.echo(f"\nNext steps:")
+        click.echo(f"  - View in S3: aws s3 ls {s3_base}/ --recursive")
+        click.echo(f"  - Download: aws s3 sync {s3_base}/ ./{config.agent_schema_ref.name}/{config.task}/")
+
+    except Exception as e:
+        logger.error(f"Failed to export experiment: {e}")
+        click.echo(f"Error: {e}", err=True)
+        raise click.Abort()
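
Following the convention documented in the export docstring, a default export (bucket, agent, and task values illustrative) would land under:

    s3://my-bucket/v0/datasets/calibration/experiments/contract-analyzer/general/
        experiment.yaml
        README.md
        ground-truth/...
        seed-data/...
        results/...   (only with --include-results)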