remdb 0.3.114__py3-none-any.whl → 0.3.172__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of remdb might be problematic. Click here for more details.

Files changed (83) hide show
  1. rem/agentic/agents/__init__.py +16 -0
  2. rem/agentic/agents/agent_manager.py +311 -0
  3. rem/agentic/agents/sse_simulator.py +2 -0
  4. rem/agentic/context.py +103 -5
  5. rem/agentic/context_builder.py +36 -9
  6. rem/agentic/mcp/tool_wrapper.py +161 -18
  7. rem/agentic/otel/setup.py +1 -0
  8. rem/agentic/providers/phoenix.py +371 -108
  9. rem/agentic/providers/pydantic_ai.py +172 -30
  10. rem/agentic/schema.py +8 -4
  11. rem/api/deps.py +3 -5
  12. rem/api/main.py +26 -4
  13. rem/api/mcp_router/resources.py +15 -10
  14. rem/api/mcp_router/server.py +11 -3
  15. rem/api/mcp_router/tools.py +418 -4
  16. rem/api/middleware/tracking.py +5 -5
  17. rem/api/routers/admin.py +218 -1
  18. rem/api/routers/auth.py +349 -6
  19. rem/api/routers/chat/completions.py +255 -7
  20. rem/api/routers/chat/models.py +81 -7
  21. rem/api/routers/chat/otel_utils.py +33 -0
  22. rem/api/routers/chat/sse_events.py +17 -1
  23. rem/api/routers/chat/streaming.py +126 -19
  24. rem/api/routers/feedback.py +134 -14
  25. rem/api/routers/messages.py +24 -15
  26. rem/api/routers/query.py +6 -3
  27. rem/auth/__init__.py +13 -3
  28. rem/auth/jwt.py +352 -0
  29. rem/auth/middleware.py +115 -10
  30. rem/auth/providers/__init__.py +4 -1
  31. rem/auth/providers/email.py +215 -0
  32. rem/cli/commands/README.md +42 -0
  33. rem/cli/commands/cluster.py +617 -168
  34. rem/cli/commands/configure.py +4 -7
  35. rem/cli/commands/db.py +66 -22
  36. rem/cli/commands/experiments.py +468 -76
  37. rem/cli/commands/schema.py +6 -5
  38. rem/cli/commands/session.py +336 -0
  39. rem/cli/dreaming.py +2 -2
  40. rem/cli/main.py +2 -0
  41. rem/config.py +8 -1
  42. rem/models/core/experiment.py +58 -14
  43. rem/models/entities/__init__.py +4 -0
  44. rem/models/entities/ontology.py +1 -1
  45. rem/models/entities/ontology_config.py +1 -1
  46. rem/models/entities/subscriber.py +175 -0
  47. rem/models/entities/user.py +1 -0
  48. rem/schemas/agents/core/agent-builder.yaml +235 -0
  49. rem/schemas/agents/examples/contract-analyzer.yaml +1 -1
  50. rem/schemas/agents/examples/contract-extractor.yaml +1 -1
  51. rem/schemas/agents/examples/cv-parser.yaml +1 -1
  52. rem/services/__init__.py +3 -1
  53. rem/services/content/service.py +4 -3
  54. rem/services/email/__init__.py +10 -0
  55. rem/services/email/service.py +513 -0
  56. rem/services/email/templates.py +360 -0
  57. rem/services/phoenix/client.py +59 -18
  58. rem/services/postgres/README.md +38 -0
  59. rem/services/postgres/diff_service.py +127 -6
  60. rem/services/postgres/pydantic_to_sqlalchemy.py +45 -13
  61. rem/services/postgres/repository.py +5 -4
  62. rem/services/postgres/schema_generator.py +205 -4
  63. rem/services/session/compression.py +120 -50
  64. rem/services/session/reload.py +14 -7
  65. rem/services/user_service.py +41 -9
  66. rem/settings.py +442 -23
  67. rem/sql/migrations/001_install.sql +156 -0
  68. rem/sql/migrations/002_install_models.sql +1951 -88
  69. rem/sql/migrations/004_cache_system.sql +548 -0
  70. rem/sql/migrations/005_schema_update.sql +145 -0
  71. rem/utils/README.md +45 -0
  72. rem/utils/__init__.py +18 -0
  73. rem/utils/files.py +157 -1
  74. rem/utils/schema_loader.py +139 -10
  75. rem/utils/sql_paths.py +146 -0
  76. rem/utils/vision.py +1 -1
  77. rem/workers/__init__.py +3 -1
  78. rem/workers/db_listener.py +579 -0
  79. rem/workers/unlogged_maintainer.py +463 -0
  80. {remdb-0.3.114.dist-info → remdb-0.3.172.dist-info}/METADATA +218 -180
  81. {remdb-0.3.114.dist-info → remdb-0.3.172.dist-info}/RECORD +83 -68
  82. {remdb-0.3.114.dist-info → remdb-0.3.172.dist-info}/WHEEL +0 -0
  83. {remdb-0.3.114.dist-info → remdb-0.3.172.dist-info}/entry_points.txt +0 -0
@@ -63,6 +63,7 @@ def experiments():
63
63
  @experiments.command("create")
64
64
  @click.argument("name")
65
65
  @click.option("--agent", "-a", required=True, help="Agent schema name (e.g., 'cv-parser')")
66
+ @click.option("--task", "-t", default="general", help="Task name for organizing experiments (e.g., 'risk-assessment')")
66
67
  @click.option("--evaluator", "-e", default="default", help="Evaluator schema name (default: 'default')")
67
68
  @click.option("--description", "-d", help="Experiment description")
68
69
  @click.option("--dataset-location", type=click.Choice(["git", "s3", "hybrid"]), default="git",
@@ -74,6 +75,7 @@ def experiments():
74
75
  def create(
75
76
  name: str,
76
77
  agent: str,
78
+ task: str,
77
79
  evaluator: str,
78
80
  description: Optional[str],
79
81
  dataset_location: str,
@@ -123,19 +125,17 @@ def create(
123
125
  # Resolve base path: CLI arg > EXPERIMENTS_HOME env var > default "experiments"
124
126
  if base_path is None:
125
127
  base_path = os.getenv("EXPERIMENTS_HOME", "experiments")
126
- # Build dataset reference
128
+ # Build dataset reference (format auto-detected from file extension)
127
129
  if dataset_location == "git":
128
130
  dataset_ref = DatasetReference(
129
131
  location=DatasetLocation.GIT,
130
132
  path="ground-truth/dataset.csv",
131
- format="csv",
132
133
  description="Ground truth Q&A dataset for evaluation"
133
134
  )
134
135
  else: # s3 or hybrid
135
136
  dataset_ref = DatasetReference(
136
137
  location=DatasetLocation(dataset_location),
137
138
  path=f"s3://rem-experiments/{name}/datasets/ground_truth.parquet",
138
- format="parquet",
139
139
  schema_path="datasets/schema.yaml" if dataset_location == "hybrid" else None,
140
140
  description="Ground truth dataset for evaluation"
141
141
  )
@@ -170,7 +170,8 @@ def create(
170
170
  # Create experiment config
171
171
  config = ExperimentConfig(
172
172
  name=name,
173
- description=description or f"Evaluation experiment for {agent} agent",
173
+ task=task,
174
+ description=description or f"Evaluation experiment for {agent} agent ({task} task)",
174
175
  agent_schema_ref=SchemaReference(
175
176
  name=agent,
176
177
  version=None, # Use latest by default
@@ -514,6 +515,159 @@ def show(name: str, base_path: Optional[str]):
514
515
  raise click.Abort()
515
516
 
516
517
 
518
+ # =============================================================================
519
+ # VIBES MODE HELPER
520
+ # =============================================================================
521
+
522
+
523
def _run_vibes_mode(
    config: Any,
    dataset_df: Any,
    task_fn: Any,
    base_path: str,
    limit: Optional[int],
    evaluator_schema_path: Path,
) -> None:
    """Run experiment in vibes mode - execute agent and export for AI evaluation.

    Vibes mode runs the agent on each example and saves results to a JSONL file.
    The AI assistant (e.g., Claude Code) then acts as the judge using the
    evaluator schema to evaluate results.

    Three files are written to ``<base_path>/<config.name>/results/<timestamp>/``:
    ``vibes-results.jsonl`` (one JSON object per example), ``evaluator-schema.yaml``
    (a copy of ``evaluator_schema_path``) and ``run-info.json`` (run metadata).

    Args:
        config: ExperimentConfig object; ``name``, ``agent_schema_ref.name`` and
            ``evaluator_schema_ref.name`` are read from it.
        dataset_df: Polars DataFrame with ground truth examples (must support
            ``head()``, ``len()`` and ``to_dicts()``).
        task_fn: Function to run agent on each example; called with
            ``{"input": <example input>}``.
        base_path: Base directory for experiments.
        limit: Optional limit on number of examples to process. A falsy value
            (``None`` or ``0``) means no limit.
        evaluator_schema_path: Path to the evaluator schema YAML file.
    """
    import json
    import shutil

    from rem.utils.date_utils import format_timestamp_for_experiment, utc_now, to_iso

    # Apply limit if specified
    if limit:
        dataset_df = dataset_df.head(limit)
        click.echo(f" (Limited to {limit} examples)")

    # Create results directory
    timestamp = format_timestamp_for_experiment()
    results_dir = Path(base_path) / config.name / "results" / timestamp
    results_dir.mkdir(parents=True, exist_ok=True)

    click.echo(f"\n⏳ Running agent on {len(dataset_df)} examples...")
    click.echo(f" Results will be saved to: {results_dir}")
    click.echo()

    records = dataset_df.to_dicts()
    total = len(records)
    success_count = 0
    fail_count = 0

    # Stream each result to disk as soon as it is produced so that an
    # interrupted or crashing run keeps the (expensive) outputs gathered so far.
    results_file = results_dir / "vibes-results.jsonl"
    with open(results_file, "w", encoding="utf-8") as out:
        for i, record in enumerate(records, 1):
            example_id = record.get("id", i)
            click.echo(f" [{i}/{total}] Processing example {example_id}...", nl=False)

            # Resolve the input once, outside the try, so the success and error
            # records report the same value (including the "query" fallback).
            input_text = record.get("text", record.get("input", record.get("query", "")))
            result = {
                "id": example_id,
                "input": input_text,
                "ground_truth": record.get("ground_truth", record.get("expected_output", "")),
                "category": record.get("category", ""),
            }

            try:
                # Plain strings are wrapped as a query; structured inputs pass through
                example_input = {"query": input_text} if isinstance(input_text, str) else input_text
                output = task_fn({"input": example_input})
            except Exception as e:
                # Best-effort: one failing example must not stop the whole run
                result["agent_output"] = None
                result["status"] = "error"
                result["error"] = str(e)
                fail_count += 1
                click.echo(f" ✗ ({e})")
            else:
                result["agent_output"] = output
                result["status"] = "success"
                success_count += 1
                click.echo(" ✓")

            # default=str is deliberate: an agent output that is not JSON-native
            # is stringified rather than aborting the export of the whole run.
            out.write(json.dumps(result, default=str) + "\n")
            out.flush()

    # Copy evaluator schema to results dir for easy reference
    evaluator_copy = results_dir / "evaluator-schema.yaml"
    shutil.copy(evaluator_schema_path, evaluator_copy)

    # Save run metadata
    run_info = {
        "experiment": config.name,
        "agent": config.agent_schema_ref.name,
        "evaluator": config.evaluator_schema_ref.name,
        "mode": "vibes",
        "timestamp": timestamp,
        "total_examples": total,
        "successful": success_count,
        "failed": fail_count,
        "completed_at": to_iso(utc_now()),
    }

    run_info_file = results_dir / "run-info.json"
    with open(run_info_file, "w", encoding="utf-8") as f:
        json.dump(run_info, f, indent=2)

    # Print summary and instructions
    separator = "=" * 60

    click.echo(f"\n{separator}")
    click.echo("VIBES MODE COMPLETE")
    click.echo(separator)
    click.echo(f"\nResults: {success_count} successful, {fail_count} failed")
    click.echo(f"\nFiles saved to: {results_dir}/")
    click.echo(" - vibes-results.jsonl (agent outputs)")
    click.echo(" - evaluator-schema.yaml (evaluation criteria)")
    click.echo(" - run-info.json (run metadata)")

    click.echo(f"\n{separator}")
    click.echo("NEXT STEP: Ask your AI assistant to evaluate")
    click.echo(separator)
    click.echo(f"""
Copy this prompt to Claude Code or your AI assistant:

  Please evaluate the experiment results in:
  {results_dir}/

  Read the vibes-results.jsonl file and evaluate each example
  using the evaluator schema in evaluator-schema.yaml.

  For each example, provide:
  1. extracted_classification
  2. exact_match (vs ground_truth)
  3. semantic_match
  4. reasoning_quality_score
  5. overall_score
  6. pass/fail

  Then provide summary metrics:
  - Exact match accuracy
  - Semantic match accuracy
  - Average overall score
  - Pass rate
""")
669
+
670
+
517
671
  # =============================================================================
518
672
  # RUN COMMAND
519
673
  # =============================================================================
@@ -524,6 +678,8 @@ def show(name: str, base_path: Optional[str]):
524
678
  @click.option("--base-path", help="Base directory for experiments (default: EXPERIMENTS_HOME or 'experiments')")
525
679
  @click.option("--version", help="Git tag version to load (e.g., 'experiments/my-exp/v1.0.0')")
526
680
  @click.option("--dry-run", is_flag=True, help="Test on small subset without saving")
681
+ @click.option("--only-vibes", is_flag=True, help="Run agent locally, export results for AI evaluation (no Phoenix)")
682
+ @click.option("--limit", "-n", type=int, help="Limit number of examples to evaluate (useful with --only-vibes)")
527
683
  @click.option("--update-prompts", is_flag=True, help="Update prompts in Phoenix before running")
528
684
  @click.option("--phoenix-url", help="Phoenix server URL (overrides PHOENIX_BASE_URL env var)")
529
685
  @click.option("--phoenix-api-key", help="Phoenix API key (overrides PHOENIX_API_KEY env var)")
@@ -532,14 +688,45 @@ def run(
532
688
  base_path: Optional[str],
533
689
  version: Optional[str],
534
690
  dry_run: bool,
691
+ only_vibes: bool,
692
+ limit: Optional[int],
535
693
  update_prompts: bool,
536
694
  phoenix_url: Optional[str],
537
695
  phoenix_api_key: Optional[str],
538
696
  ):
539
- """Run an experiment using Phoenix provider.
697
+ """Run an experiment using Phoenix provider or local vibes mode.
540
698
 
541
699
  Loads configuration, executes agent and evaluator, saves results.
542
700
 
701
+ Vibes Mode (--only-vibes):
702
+ Run agent locally without Phoenix infrastructure. Agent outputs are saved
703
+ to a JSONL file along with the evaluator schema. Your AI assistant (e.g.,
704
+ Claude Code) then acts as the judge to evaluate results.
705
+
706
+ This enables seamless switching between:
707
+ - Local evaluation: Quick iteration with AI-as-judge
708
+ - Phoenix evaluation: Production metrics and dashboards
709
+
710
+ Usage:
711
+ rem experiments run my-experiment --only-vibes
712
+ rem experiments run my-experiment --only-vibes --limit 5
713
+
714
+ The command will:
715
+ 1. Run the agent on each ground-truth example
716
+ 2. Save results to results/{timestamp}/vibes-results.jsonl
717
+ 3. Print the evaluator prompt and schema
718
+ 4. Instruct you to ask your AI assistant to evaluate
719
+
720
+ Example workflow with Claude Code:
721
+ $ rem experiments run mental-health-classifier --only-vibes --limit 3
722
+ # ... agent runs ...
723
+ # Results saved to: .experiments/mental-health-classifier/results/20241203-143022/
724
+
725
+ # Then ask Claude Code:
726
+ "Please evaluate the experiment results in
727
+ .experiments/mental-health-classifier/results/20241203-143022/
728
+ using the evaluator schema provided"
729
+
543
730
  Phoenix Connection:
544
731
  Commands respect PHOENIX_BASE_URL and PHOENIX_API_KEY environment variables.
545
732
  Defaults to localhost:6006 for local development.
@@ -562,6 +749,12 @@ def run(
562
749
  # Run experiment with latest schemas
563
750
  rem experiments run hello-world-validation
564
751
 
752
+ # Quick local evaluation (vibes mode)
753
+ rem experiments run hello-world-validation --only-vibes
754
+
755
+ # Vibes mode with limited examples
756
+ rem experiments run hello-world-validation --only-vibes --limit 5
757
+
565
758
  # Run specific version
566
759
  rem experiments run hello-world-validation \\
567
760
  --version experiments/hello-world-validation/v1.0.0
@@ -674,92 +867,107 @@ def run(
674
867
 
675
868
  click.echo(f"Loading evaluator: {evaluator_name} for agent {agent_name}")
676
869
 
677
- # Try multiple evaluator path patterns (agent-specific, then generic)
678
- evaluator_paths_to_try = [
679
- f"{agent_name}/{evaluator_name}", # e.g., hello-world/default
680
- f"{agent_name}-{evaluator_name}", # e.g., hello-world-default
681
- evaluator_name, # e.g., default (generic)
682
- ]
870
+ # Find evaluator schema file path
871
+ from rem.utils.schema_loader import get_evaluator_schema_path
872
+
873
+ evaluator_schema_path = get_evaluator_schema_path(evaluator_name)
874
+ if not evaluator_schema_path or not evaluator_schema_path.exists():
875
+ click.echo(f"Error: Could not find evaluator schema '{evaluator_name}'")
876
+ raise click.Abort()
683
877
 
878
+ click.echo(f"✓ Found evaluator schema: {evaluator_schema_path}")
879
+
880
+ # For Phoenix mode, also load evaluator function
684
881
  evaluator_fn = None
685
- evaluator_load_error = None
882
+ if not only_vibes:
883
+ # Try multiple evaluator path patterns (agent-specific, then generic)
884
+ evaluator_paths_to_try = [
885
+ f"{agent_name}/{evaluator_name}", # e.g., hello-world/default
886
+ f"{agent_name}-{evaluator_name}", # e.g., hello-world-default
887
+ evaluator_name, # e.g., default (generic)
888
+ ]
686
889
 
687
- for evaluator_path in evaluator_paths_to_try:
688
- try:
689
- evaluator_fn = create_evaluator_from_schema(
690
- evaluator_schema_path=evaluator_path,
691
- model_name=None, # Use default from schema
692
- )
693
- click.echo(f"✓ Loaded evaluator schema: {evaluator_path}")
694
- break
695
- except FileNotFoundError as e:
696
- evaluator_load_error = e
697
- logger.debug(f"Evaluator not found at {evaluator_path}: {e}")
698
- continue
699
- except Exception as e:
700
- evaluator_load_error = e
701
- logger.warning(f"Failed to load evaluator from {evaluator_path}: {e}")
702
- continue
890
+ evaluator_load_error = None
891
+
892
+ for evaluator_path in evaluator_paths_to_try:
893
+ try:
894
+ evaluator_fn = create_evaluator_from_schema(
895
+ evaluator_schema_path=evaluator_path,
896
+ model_name=None, # Use default from schema
897
+ )
898
+ click.echo(f"✓ Loaded evaluator function: {evaluator_path}")
899
+ break
900
+ except FileNotFoundError as e:
901
+ evaluator_load_error = e
902
+ logger.debug(f"Evaluator not found at {evaluator_path}: {e}")
903
+ continue
904
+ except Exception as e:
905
+ evaluator_load_error = e
906
+ logger.warning(f"Failed to load evaluator from {evaluator_path}: {e}")
907
+ continue
703
908
 
704
- if evaluator_fn is None:
705
- click.echo(f"Error: Could not load evaluator schema '{evaluator_name}'")
909
+ if evaluator_fn is None and not only_vibes:
910
+ click.echo(f"Error: Could not load evaluator function '{evaluator_name}'")
706
911
  click.echo(f" Tried paths: {evaluator_paths_to_try}")
707
912
  if evaluator_load_error:
708
913
  click.echo(f" Last error: {evaluator_load_error}")
709
914
  raise click.Abort()
710
915
 
711
- # Load dataset using Polars
712
- import polars as pl
916
+ # Validate evaluator credentials before running expensive agent tasks
917
+ if evaluator_fn is not None and not only_vibes:
918
+ from rem.agentic.providers.phoenix import validate_evaluator_credentials
919
+
920
+ click.echo("Validating evaluator credentials...")
921
+ is_valid, error_msg = validate_evaluator_credentials()
922
+ if not is_valid:
923
+ click.echo(click.style(f"\n⚠️ Evaluator validation failed: {error_msg}", fg="yellow"))
924
+ click.echo("\nOptions:")
925
+ click.echo(" 1. Fix the credentials issue and re-run")
926
+ click.echo(" 2. Run with --only-vibes to skip LLM evaluation")
927
+ click.echo(" 3. Use --evaluator-model to specify a different model")
928
+ raise click.Abort()
929
+ click.echo("✓ Evaluator credentials validated")
930
+
931
+ # Load dataset using read_dataframe utility (auto-detects format from extension)
932
+ from rem.utils.files import read_dataframe
713
933
 
714
934
  click.echo(f"Loading dataset: {list(config.datasets.keys())[0]}")
715
935
  dataset_ref = list(config.datasets.values())[0]
716
936
 
717
- if dataset_ref.location.value == "git":
718
- # Load from Git (local filesystem)
719
- dataset_path = Path(base_path) / name / dataset_ref.path
720
- if not dataset_path.exists():
721
- click.echo(f"Error: Dataset not found: {dataset_path}")
722
- raise click.Abort()
723
-
724
- if dataset_ref.format == "csv":
725
- dataset_df = pl.read_csv(dataset_path)
726
- elif dataset_ref.format == "parquet":
727
- dataset_df = pl.read_parquet(dataset_path)
728
- elif dataset_ref.format == "jsonl":
729
- dataset_df = pl.read_ndjson(dataset_path)
730
- else:
731
- click.echo(f"Error: Format '{dataset_ref.format}' not yet supported")
732
- raise click.Abort()
733
- elif dataset_ref.location.value in ["s3", "hybrid"]:
734
- # Load from S3 using FS provider
735
- from rem.services.fs import FS
736
- from io import BytesIO
937
+ try:
938
+ if dataset_ref.location.value == "git":
939
+ # Load from Git (local filesystem)
940
+ dataset_path = Path(base_path) / name / dataset_ref.path
941
+ if not dataset_path.exists():
942
+ click.echo(f"Error: Dataset not found: {dataset_path}")
943
+ raise click.Abort()
737
944
 
738
- fs = FS()
945
+ dataset_df = read_dataframe(dataset_path)
739
946
 
740
- try:
741
- if dataset_ref.format == "csv":
742
- content = fs.read(dataset_ref.path)
743
- dataset_df = pl.read_csv(BytesIO(content.encode() if isinstance(content, str) else content))
744
- elif dataset_ref.format == "parquet":
745
- content_bytes = fs.read(dataset_ref.path)
746
- dataset_df = pl.read_parquet(BytesIO(content_bytes if isinstance(content_bytes, bytes) else content_bytes.encode()))
747
- elif dataset_ref.format == "jsonl":
748
- content = fs.read(dataset_ref.path)
749
- dataset_df = pl.read_ndjson(BytesIO(content.encode() if isinstance(content, str) else content))
750
- else:
751
- click.echo(f"Error: Format '{dataset_ref.format}' not yet supported")
752
- raise click.Abort()
947
+ elif dataset_ref.location.value in ["s3", "hybrid"]:
948
+ # Load from S3 using FS provider
949
+ from rem.services.fs import FS
753
950
 
951
+ fs = FS()
952
+ content = fs.read(dataset_ref.path)
953
+ # Ensure we have bytes
954
+ if isinstance(content, str):
955
+ content = content.encode()
956
+ dataset_df = read_dataframe(content, filename=dataset_ref.path)
754
957
  click.echo(f"✓ Loaded dataset from S3")
755
- except Exception as e:
756
- logger.error(f"Failed to load dataset from S3: {e}")
757
- click.echo(f"Error: Could not load dataset from S3")
758
- click.echo(f" Path: {dataset_ref.path}")
759
- click.echo(f" Format: {dataset_ref.format}")
958
+
959
+ else:
960
+ click.echo(f"Error: Unknown dataset location: {dataset_ref.location.value}")
760
961
  raise click.Abort()
761
- else:
762
- click.echo(f"Error: Unknown dataset location: {dataset_ref.location.value}")
962
+
963
+ except ValueError as e:
964
+ # Unsupported format error from read_dataframe
965
+ click.echo(f"Error: {e}")
966
+ raise click.Abort()
967
+ except Exception as e:
968
+ logger.error(f"Failed to load dataset: {e}")
969
+ click.echo(f"Error: Could not load dataset")
970
+ click.echo(f" Path: {dataset_ref.path}")
763
971
  raise click.Abort()
764
972
 
765
973
  click.echo(f"✓ Loaded dataset: {len(dataset_df)} examples")
@@ -769,6 +977,18 @@ def run(
769
977
  # TODO: Implement prompt updating
770
978
  click.echo("⚠ --update-prompts not yet implemented")
771
979
 
980
+ # Vibes mode: run agent and export for AI evaluation
981
+ if only_vibes:
982
+ _run_vibes_mode(
983
+ config=config,
984
+ dataset_df=dataset_df,
985
+ task_fn=task_fn,
986
+ base_path=base_path,
987
+ limit=limit,
988
+ evaluator_schema_path=evaluator_schema_path,
989
+ )
990
+ return
991
+
772
992
  # Run experiment via Phoenix
773
993
  if not dry_run:
774
994
  # Create Phoenix client with optional overrides
@@ -1067,7 +1287,7 @@ def prompt():
1067
1287
  @click.option("--system-prompt", "-s", required=True, help="System prompt text")
1068
1288
  @click.option("--description", "-d", help="Prompt description")
1069
1289
  @click.option("--model-provider", default="OPENAI", help="Model provider (OPENAI, ANTHROPIC)")
1070
- @click.option("--model-name", "-m", help="Model name (e.g., gpt-4o, claude-sonnet-4-5)")
1290
+ @click.option("--model-name", "-m", help="Model name (e.g., gpt-4.1, claude-sonnet-4-5)")
1071
1291
  @click.option("--type", "-t", "prompt_type", default="Agent", help="Prompt type (Agent or Evaluator)")
1072
1292
  def prompt_create(
1073
1293
  name: str,
@@ -1083,7 +1303,7 @@ def prompt_create(
1083
1303
  # Create agent prompt
1084
1304
  rem experiments prompt create hello-world \\
1085
1305
  --system-prompt "You are a helpful assistant." \\
1086
- --model-name gpt-4o
1306
+ --model-name gpt-4.1
1087
1307
 
1088
1308
  # Create evaluator prompt
1089
1309
  rem experiments prompt create correctness-evaluator \\
@@ -1101,7 +1321,7 @@ def prompt_create(
1101
1321
  try:
1102
1322
  # Set default model if not specified
1103
1323
  if not model_name:
1104
- model_name = "gpt-4o" if model_provider == "OPENAI" else "claude-sonnet-4-5-20250929"
1324
+ model_name = "gpt-4.1" if model_provider == "OPENAI" else "claude-sonnet-4-5-20250929"
1105
1325
 
1106
1326
  # Get config
1107
1327
  phoenix_client = PhoenixClient()
@@ -1304,3 +1524,175 @@ def trace_list(
1304
1524
  logger.error(f"Failed to list traces: {e}")
1305
1525
  click.echo(f"Error: {e}", err=True)
1306
1526
  raise click.Abort()
1527
+
1528
+
1529
+ # =============================================================================
1530
+ # EXPORT COMMAND
1531
+ # =============================================================================
1532
+
1533
+
1534
@experiments.command("export")
@click.argument("name")
@click.option("--base-path", help="Base directory for experiments (default: EXPERIMENTS_HOME or 'experiments')")
@click.option("--bucket", "-b", help="S3 bucket name (default: DATA_LAKE__BUCKET_NAME)")
@click.option("--version", "-v", default="v0", help="Data lake version prefix (default: v0)")
@click.option("--plan", is_flag=True, help="Show what would be exported without uploading")
@click.option("--include-results", is_flag=True, help="Include results directory in export")
def export(
    name: str,
    base_path: Optional[str],
    bucket: Optional[str],
    version: str,
    plan: bool,
    include_results: bool,
):
    """Export experiment to S3 data lake.

    Exports experiment configuration, ground truth, and optionally results
    to the S3 data lake following the convention:

    s3://{bucket}/{version}/datasets/calibration/experiments/{agent}/{task}/

    The export includes:
    - experiment.yaml (configuration)
    - README.md (documentation)
    - ground-truth/ (evaluation datasets)
    - seed-data/ (optional seed data)
    - results/ (optional, with --include-results)

    Examples:
    # Preview what would be exported
    rem experiments export my-experiment --plan

    # Export to configured data lake bucket
    rem experiments export my-experiment

    # Export to specific bucket
    rem experiments export my-experiment --bucket siggy-data

    # Include results in export
    rem experiments export my-experiment --include-results

    # Export with custom version prefix
    rem experiments export my-experiment --version v1
    """
    import os

    from rem.models.core.experiment import ExperimentConfig
    from rem.services.fs.s3_provider import S3Provider
    from rem.settings import settings

    try:
        # Resolve base path: CLI arg > EXPERIMENTS_HOME env var > default "experiments"
        if base_path is None:
            base_path = os.getenv("EXPERIMENTS_HOME", "experiments")

        # Load experiment configuration
        config_path = Path(base_path) / name / "experiment.yaml"
        if not config_path.exists():
            click.echo(f"Experiment not found: {name}")
            click.echo(f" Looked in: {config_path}")
            raise click.Abort()

        config = ExperimentConfig.from_yaml(config_path)
        click.echo(f"✓ Loaded experiment: {name}")

        # Resolve bucket: CLI option first, then configured data lake bucket
        if bucket is None:
            bucket = settings.data_lake.bucket_name
        if bucket is None:
            click.echo("Error: No S3 bucket configured.")
            click.echo(" Set DATA_LAKE__BUCKET_NAME environment variable or use --bucket option")
            raise click.Abort()

        # Build S3 paths
        s3_base = config.get_s3_export_path(bucket, version)
        exp_dir = config.get_experiment_dir(base_path)

        # Top-level files are exported whenever they exist
        files_to_export = [
            (file_name, exp_dir / file_name)
            for file_name in ("experiment.yaml", "README.md")
            if (exp_dir / file_name).exists()
        ]

        # Directories are exported recursively; results/ only on request
        export_dirs = ["ground-truth", "seed-data"]
        if include_results:
            export_dirs.append("results")

        for dir_name in export_dirs:
            source_dir = exp_dir / dir_name
            if not source_dir.exists():
                continue
            # sorted() keeps the plan/upload order stable across runs
            for f in sorted(source_dir.rglob("*")):
                if f.is_file():
                    # as_posix(): S3 keys always use "/" even when run on Windows
                    files_to_export.append((f.relative_to(exp_dir).as_posix(), f))

        # Display export plan
        separator = "=" * 60
        total = len(files_to_export)

        click.echo(f"\n{separator}")
        click.echo(f"EXPORT {'PLAN' if plan else 'TO S3'}")
        click.echo(separator)
        click.echo(f"\nExperiment: {config.name}")
        click.echo(f"Agent: {config.agent_schema_ref.name}")
        click.echo(f"Task: {config.task}")
        click.echo(f"Evaluator file: {config.get_evaluator_filename()}")
        click.echo(f"\nDestination: {s3_base}/")
        click.echo(f"\nFiles to export ({total}):")

        for s3_name, local_path in files_to_export:
            if plan:
                click.echo(f" {local_path}")
                click.echo(f" → {s3_base}/{s3_name}")
            else:
                click.echo(f" {s3_name}")

        if plan:
            click.echo("\n[PLAN MODE] No files were uploaded.")
            click.echo("Run without --plan to execute the export.")
            return

        # Execute export
        click.echo("\n⏳ Uploading to S3...")
        s3 = S3Provider()

        uploaded = 0
        for s3_name, local_path in files_to_export:
            s3_uri = f"{s3_base}/{s3_name}"
            try:
                s3.copy(str(local_path), s3_uri)
            except Exception as e:
                # Keep going so one bad file does not block the remaining uploads
                click.echo(f" ✗ {s3_name}: {e}")
            else:
                uploaded += 1
                click.echo(f" ✓ {s3_name}")

        if uploaded < total:
            # A partial export must not look like success to scripts or CI
            click.echo(
                f"\n✗ Exported {uploaded}/{total} files to {s3_base}/ "
                f"({total - uploaded} failed)",
                err=True,
            )
            raise click.Abort()

        click.echo(f"\n✓ Exported {uploaded}/{total} files to {s3_base}/")

        # Show next steps
        click.echo("\nNext steps:")
        click.echo(f" - View in S3: aws s3 ls {s3_base}/ --recursive")
        click.echo(f" - Download: aws s3 sync {s3_base}/ ./{config.agent_schema_ref.name}/{config.task}/")

    except click.Abort:
        # click.Abort subclasses RuntimeError; let deliberate aborts pass through
        # instead of being re-reported below as an error with an empty message.
        raise
    except Exception as e:
        logger.error(f"Failed to export experiment: {e}")
        click.echo(f"Error: {e}", err=True)
        raise click.Abort() from e