remdb 0.3.7__py3-none-any.whl → 0.3.133__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. rem/__init__.py +129 -2
  2. rem/agentic/README.md +76 -0
  3. rem/agentic/__init__.py +15 -0
  4. rem/agentic/agents/__init__.py +16 -2
  5. rem/agentic/agents/sse_simulator.py +502 -0
  6. rem/agentic/context.py +51 -25
  7. rem/agentic/llm_provider_models.py +301 -0
  8. rem/agentic/mcp/tool_wrapper.py +112 -17
  9. rem/agentic/otel/setup.py +93 -4
  10. rem/agentic/providers/phoenix.py +314 -132
  11. rem/agentic/providers/pydantic_ai.py +215 -26
  12. rem/agentic/schema.py +361 -21
  13. rem/agentic/tools/rem_tools.py +3 -3
  14. rem/api/README.md +238 -1
  15. rem/api/deps.py +255 -0
  16. rem/api/main.py +154 -37
  17. rem/api/mcp_router/resources.py +1 -1
  18. rem/api/mcp_router/server.py +26 -5
  19. rem/api/mcp_router/tools.py +465 -7
  20. rem/api/middleware/tracking.py +172 -0
  21. rem/api/routers/admin.py +494 -0
  22. rem/api/routers/auth.py +124 -0
  23. rem/api/routers/chat/completions.py +402 -20
  24. rem/api/routers/chat/models.py +88 -10
  25. rem/api/routers/chat/otel_utils.py +33 -0
  26. rem/api/routers/chat/sse_events.py +542 -0
  27. rem/api/routers/chat/streaming.py +642 -45
  28. rem/api/routers/dev.py +81 -0
  29. rem/api/routers/feedback.py +268 -0
  30. rem/api/routers/messages.py +473 -0
  31. rem/api/routers/models.py +78 -0
  32. rem/api/routers/query.py +360 -0
  33. rem/api/routers/shared_sessions.py +406 -0
  34. rem/auth/middleware.py +126 -27
  35. rem/cli/commands/README.md +237 -64
  36. rem/cli/commands/ask.py +13 -10
  37. rem/cli/commands/cluster.py +1808 -0
  38. rem/cli/commands/configure.py +5 -6
  39. rem/cli/commands/db.py +396 -139
  40. rem/cli/commands/experiments.py +469 -74
  41. rem/cli/commands/process.py +22 -15
  42. rem/cli/commands/scaffold.py +47 -0
  43. rem/cli/commands/schema.py +97 -50
  44. rem/cli/main.py +29 -6
  45. rem/config.py +10 -3
  46. rem/models/core/core_model.py +7 -1
  47. rem/models/core/experiment.py +54 -0
  48. rem/models/core/rem_query.py +5 -2
  49. rem/models/entities/__init__.py +21 -0
  50. rem/models/entities/domain_resource.py +38 -0
  51. rem/models/entities/feedback.py +123 -0
  52. rem/models/entities/message.py +30 -1
  53. rem/models/entities/session.py +83 -0
  54. rem/models/entities/shared_session.py +180 -0
  55. rem/models/entities/user.py +10 -3
  56. rem/registry.py +373 -0
  57. rem/schemas/agents/rem.yaml +7 -3
  58. rem/services/content/providers.py +92 -133
  59. rem/services/content/service.py +92 -20
  60. rem/services/dreaming/affinity_service.py +2 -16
  61. rem/services/dreaming/moment_service.py +2 -15
  62. rem/services/embeddings/api.py +24 -17
  63. rem/services/embeddings/worker.py +16 -16
  64. rem/services/phoenix/EXPERIMENT_DESIGN.md +3 -3
  65. rem/services/phoenix/client.py +302 -28
  66. rem/services/postgres/README.md +159 -15
  67. rem/services/postgres/__init__.py +2 -1
  68. rem/services/postgres/diff_service.py +531 -0
  69. rem/services/postgres/pydantic_to_sqlalchemy.py +427 -129
  70. rem/services/postgres/repository.py +132 -0
  71. rem/services/postgres/schema_generator.py +291 -9
  72. rem/services/postgres/service.py +6 -6
  73. rem/services/rate_limit.py +113 -0
  74. rem/services/rem/README.md +14 -0
  75. rem/services/rem/parser.py +44 -9
  76. rem/services/rem/service.py +36 -2
  77. rem/services/session/compression.py +24 -1
  78. rem/services/session/reload.py +1 -1
  79. rem/services/user_service.py +98 -0
  80. rem/settings.py +399 -29
  81. rem/sql/background_indexes.sql +21 -16
  82. rem/sql/migrations/001_install.sql +387 -54
  83. rem/sql/migrations/002_install_models.sql +2320 -393
  84. rem/sql/migrations/003_optional_extensions.sql +326 -0
  85. rem/sql/migrations/004_cache_system.sql +548 -0
  86. rem/utils/__init__.py +18 -0
  87. rem/utils/constants.py +97 -0
  88. rem/utils/date_utils.py +228 -0
  89. rem/utils/embeddings.py +17 -4
  90. rem/utils/files.py +167 -0
  91. rem/utils/mime_types.py +158 -0
  92. rem/utils/model_helpers.py +156 -1
  93. rem/utils/schema_loader.py +282 -35
  94. rem/utils/sql_paths.py +146 -0
  95. rem/utils/sql_types.py +3 -1
  96. rem/utils/vision.py +9 -14
  97. rem/workers/README.md +14 -14
  98. rem/workers/__init__.py +3 -1
  99. rem/workers/db_listener.py +579 -0
  100. rem/workers/db_maintainer.py +74 -0
  101. rem/workers/unlogged_maintainer.py +463 -0
  102. {remdb-0.3.7.dist-info → remdb-0.3.133.dist-info}/METADATA +460 -303
  103. {remdb-0.3.7.dist-info → remdb-0.3.133.dist-info}/RECORD +105 -74
  104. {remdb-0.3.7.dist-info → remdb-0.3.133.dist-info}/WHEEL +1 -1
  105. rem/sql/002_install_models.sql +0 -1068
  106. rem/sql/install_models.sql +0 -1038
  107. {remdb-0.3.7.dist-info → remdb-0.3.133.dist-info}/entry_points.txt +0 -0
@@ -63,6 +63,7 @@ def experiments():
63
63
  @experiments.command("create")
64
64
  @click.argument("name")
65
65
  @click.option("--agent", "-a", required=True, help="Agent schema name (e.g., 'cv-parser')")
66
+ @click.option("--task", "-t", default="general", help="Task name for organizing experiments (e.g., 'risk-assessment')")
66
67
  @click.option("--evaluator", "-e", default="default", help="Evaluator schema name (default: 'default')")
67
68
  @click.option("--description", "-d", help="Experiment description")
68
69
  @click.option("--dataset-location", type=click.Choice(["git", "s3", "hybrid"]), default="git",
@@ -74,6 +75,7 @@ def experiments():
74
75
  def create(
75
76
  name: str,
76
77
  agent: str,
78
+ task: str,
77
79
  evaluator: str,
78
80
  description: Optional[str],
79
81
  dataset_location: str,
@@ -170,7 +172,8 @@ def create(
170
172
  # Create experiment config
171
173
  config = ExperimentConfig(
172
174
  name=name,
173
- description=description or f"Evaluation experiment for {agent} agent",
175
+ task=task,
176
+ description=description or f"Evaluation experiment for {agent} agent ({task} task)",
174
177
  agent_schema_ref=SchemaReference(
175
178
  name=agent,
176
179
  version=None, # Use latest by default
@@ -514,6 +517,159 @@ def show(name: str, base_path: Optional[str]):
514
517
  raise click.Abort()
515
518
 
516
519
 
520
+ # =============================================================================
521
+ # VIBES MODE HELPER
522
+ # =============================================================================
523
+
524
+
525
def _run_vibes_mode(
    config: Any,
    dataset_df: Any,
    task_fn: Any,
    base_path: str,
    limit: Optional[int],
    evaluator_schema_path: Path,
) -> None:
    """Run experiment in vibes mode - execute agent and export for AI evaluation.

    Vibes mode runs the agent on each example and saves results to a JSONL file.
    The AI assistant (e.g., Claude Code) then acts as the judge using the
    evaluator schema to evaluate results.

    Three files are written to ``<base_path>/<config.name>/results/<timestamp>/``:
    ``vibes-results.jsonl`` (one JSON object per example), ``evaluator-schema.yaml``
    (a copy of ``evaluator_schema_path``) and ``run-info.json`` (run metadata).

    A failure of ``task_fn`` on one example is recorded as a result with
    ``status == "error"`` and does not stop the run.

    Args:
        config: ExperimentConfig object
        dataset_df: Polars DataFrame with ground truth examples
        task_fn: Function to run agent on each example
        base_path: Base directory for experiments
        limit: Optional limit on number of examples to process
        evaluator_schema_path: Path to the evaluator schema YAML file
    """
    from rem.utils.date_utils import format_timestamp_for_experiment, utc_now, to_iso
    import json
    import shutil

    # Apply limit if specified
    if limit:
        dataset_df = dataset_df.head(limit)
        click.echo(f" (Limited to {limit} examples)")

    # Create results directory
    timestamp = format_timestamp_for_experiment()
    results_dir = Path(base_path) / config.name / "results" / timestamp
    results_dir.mkdir(parents=True, exist_ok=True)

    click.echo(f"\n⏳ Running agent on {len(dataset_df)} examples...")
    click.echo(f" Results will be saved to: {results_dir}")
    click.echo()

    # Run agent on each example and collect results
    results = []
    records = dataset_df.to_dicts()

    for i, record in enumerate(records, 1):
        example_id = record.get("id", i)
        click.echo(f" [{i}/{len(records)}] Processing example {example_id}...", nl=False)

        # Resolve the input once, outside the try, so the success and the
        # error record report the same value (including the "query" fallback).
        input_text = record.get("text", record.get("input", record.get("query", "")))

        result = {
            "id": example_id,
            "input": input_text,
            "ground_truth": record.get("ground_truth", record.get("expected_output", "")),
            "category": record.get("category", ""),
        }

        try:
            # Plain strings are wrapped as {"query": ...}; anything else is
            # passed to the agent unchanged.
            example_input = {"query": input_text} if isinstance(input_text, str) else input_text
            output = task_fn({"input": example_input})
        except Exception as e:
            # Best effort: one failing example must not abort the whole run.
            result["agent_output"] = None
            result["status"] = "error"
            result["error"] = str(e)
            click.echo(f" ✗ ({e})")
        else:
            result["agent_output"] = output
            result["status"] = "success"
            click.echo(" ✓")

        results.append(result)

    # Save results to JSONL.
    # default=str is deliberate: agent outputs may contain values json cannot
    # encode natively, and failing here would discard every result collected
    # above. Such values are stored as their string representation instead.
    results_file = results_dir / "vibes-results.jsonl"
    with open(results_file, "w", encoding="utf-8") as f:
        for result in results:
            f.write(json.dumps(result, default=str) + "\n")

    # Copy evaluator schema to results dir for easy reference
    evaluator_copy = results_dir / "evaluator-schema.yaml"
    shutil.copy(evaluator_schema_path, evaluator_copy)

    # Save run metadata
    success_count = sum(1 for r in results if r["status"] == "success")
    fail_count = sum(1 for r in results if r["status"] == "error")
    run_info = {
        "experiment": config.name,
        "agent": config.agent_schema_ref.name,
        "evaluator": config.evaluator_schema_ref.name,
        "mode": "vibes",
        "timestamp": timestamp,
        "total_examples": len(records),
        "successful": success_count,
        "failed": fail_count,
        "completed_at": to_iso(utc_now()),
    }

    run_info_file = results_dir / "run-info.json"
    with open(run_info_file, "w", encoding="utf-8") as f:
        json.dump(run_info, f, indent=2)

    # Print summary and instructions
    separator = "=" * 60

    click.echo(f"\n{separator}")
    click.echo("VIBES MODE COMPLETE")
    click.echo(separator)
    click.echo(f"\nResults: {success_count} successful, {fail_count} failed")
    click.echo(f"\nFiles saved to: {results_dir}/")
    click.echo(" - vibes-results.jsonl (agent outputs)")
    click.echo(" - evaluator-schema.yaml (evaluation criteria)")
    click.echo(" - run-info.json (run metadata)")

    click.echo(f"\n{separator}")
    click.echo("NEXT STEP: Ask your AI assistant to evaluate")
    click.echo(separator)

    # The leading and trailing empty entries reproduce the blank line before
    # and after the prompt text.
    prompt_lines = [
        "",
        "Copy this prompt to Claude Code or your AI assistant:",
        "",
        "Please evaluate the experiment results in:",
        f"{results_dir}/",
        "",
        "Read the vibes-results.jsonl file and evaluate each example",
        "using the evaluator schema in evaluator-schema.yaml.",
        "",
        "For each example, provide:",
        "1. extracted_classification",
        "2. exact_match (vs ground_truth)",
        "3. semantic_match",
        "4. reasoning_quality_score",
        "5. overall_score",
        "6. pass/fail",
        "",
        "Then provide summary metrics:",
        "- Exact match accuracy",
        "- Semantic match accuracy",
        "- Average overall score",
        "- Pass rate",
        "",
    ]
    click.echo("\n".join(prompt_lines))
671
+
672
+
517
673
  # =============================================================================
518
674
  # RUN COMMAND
519
675
  # =============================================================================
@@ -524,6 +680,8 @@ def show(name: str, base_path: Optional[str]):
524
680
  @click.option("--base-path", help="Base directory for experiments (default: EXPERIMENTS_HOME or 'experiments')")
525
681
  @click.option("--version", help="Git tag version to load (e.g., 'experiments/my-exp/v1.0.0')")
526
682
  @click.option("--dry-run", is_flag=True, help="Test on small subset without saving")
683
+ @click.option("--only-vibes", is_flag=True, help="Run agent locally, export results for AI evaluation (no Phoenix)")
684
+ @click.option("--limit", "-n", type=int, help="Limit number of examples to evaluate (useful with --only-vibes)")
527
685
  @click.option("--update-prompts", is_flag=True, help="Update prompts in Phoenix before running")
528
686
  @click.option("--phoenix-url", help="Phoenix server URL (overrides PHOENIX_BASE_URL env var)")
529
687
  @click.option("--phoenix-api-key", help="Phoenix API key (overrides PHOENIX_API_KEY env var)")
@@ -532,14 +690,45 @@ def run(
532
690
  base_path: Optional[str],
533
691
  version: Optional[str],
534
692
  dry_run: bool,
693
+ only_vibes: bool,
694
+ limit: Optional[int],
535
695
  update_prompts: bool,
536
696
  phoenix_url: Optional[str],
537
697
  phoenix_api_key: Optional[str],
538
698
  ):
539
- """Run an experiment using Phoenix provider.
699
+ """Run an experiment using Phoenix provider or local vibes mode.
540
700
 
541
701
  Loads configuration, executes agent and evaluator, saves results.
542
702
 
703
+ Vibes Mode (--only-vibes):
704
+ Run agent locally without Phoenix infrastructure. Agent outputs are saved
705
+ to a JSONL file along with the evaluator schema. Your AI assistant (e.g.,
706
+ Claude Code) then acts as the judge to evaluate results.
707
+
708
+ This enables seamless switching between:
709
+ - Local evaluation: Quick iteration with AI-as-judge
710
+ - Phoenix evaluation: Production metrics and dashboards
711
+
712
+ Usage:
713
+ rem experiments run my-experiment --only-vibes
714
+ rem experiments run my-experiment --only-vibes --limit 5
715
+
716
+ The command will:
717
+ 1. Run the agent on each ground-truth example
718
+ 2. Save results to results/{timestamp}/vibes-results.jsonl
719
+ 3. Print the evaluator prompt and schema
720
+ 4. Instruct you to ask your AI assistant to evaluate
721
+
722
+ Example workflow with Claude Code:
723
+ $ rem experiments run mental-health-classifier --only-vibes --limit 3
724
+ # ... agent runs ...
725
+ # Results saved to: .experiments/mental-health-classifier/results/20241203-143022/
726
+
727
+ # Then ask Claude Code:
728
+ "Please evaluate the experiment results in
729
+ .experiments/mental-health-classifier/results/20241203-143022/
730
+ using the evaluator schema provided"
731
+
543
732
  Phoenix Connection:
544
733
  Commands respect PHOENIX_BASE_URL and PHOENIX_API_KEY environment variables.
545
734
  Defaults to localhost:6006 for local development.
@@ -562,6 +751,12 @@ def run(
562
751
  # Run experiment with latest schemas
563
752
  rem experiments run hello-world-validation
564
753
 
754
+ # Quick local evaluation (vibes mode)
755
+ rem experiments run hello-world-validation --only-vibes
756
+
757
+ # Vibes mode with limited examples
758
+ rem experiments run hello-world-validation --only-vibes --limit 5
759
+
565
760
  # Run specific version
566
761
  rem experiments run hello-world-validation \\
567
762
  --version experiments/hello-world-validation/v1.0.0
@@ -578,8 +773,7 @@ def run(
578
773
  from rem.services.git import GitService
579
774
  from rem.services.phoenix import PhoenixClient
580
775
  from rem.agentic.providers.phoenix import create_evaluator_from_schema
581
- from datetime import datetime
582
- import pandas as pd
776
+ from rem.utils.date_utils import utc_now, to_iso, format_timestamp_for_experiment
583
777
  import os
584
778
 
585
779
  try:
@@ -615,36 +809,22 @@ def run(
615
809
  click.echo(f" Mode: DRY RUN (no data will be saved)")
616
810
  click.echo()
617
811
 
618
- # Load agent schema from Git or filesystem
812
+ # Load agent schema using centralized schema loader
619
813
  agent_name = config.agent_schema_ref.name
620
814
  agent_version = config.agent_schema_ref.version
621
815
 
622
816
  click.echo(f"Loading agent schema: {agent_name} (version: {agent_version or 'latest'})")
623
817
 
624
- # Try Git first, fallback to filesystem
625
- agent_schema = None
626
- try:
627
- git_svc = GitService()
628
- agent_schema = git_svc.load_schema(agent_name, version=agent_version)
629
- click.echo(f"✓ Loaded agent schema from Git")
630
- except Exception as e:
631
- logger.debug(f"Git not available, trying filesystem: {e}")
632
-
633
- # Fallback to local filesystem
634
- from rem.services.fs import FS
635
- fs = FS()
818
+ from rem.utils.schema_loader import load_agent_schema
636
819
 
637
- schema_path = f"schemas/agents/{agent_name}.yaml"
638
- try:
639
- agent_schema = fs.read(schema_path)
640
- click.echo(f"✓ Loaded agent schema from filesystem")
641
- except Exception as fs_error:
642
- logger.error(f"Failed to load agent schema: Git: {e}, FS: {fs_error}")
643
- click.echo(f"Error: Could not load agent schema '{agent_name}'")
644
- click.echo(f" Tried Git: {e}")
645
- click.echo(f" Tried filesystem: {schema_path}")
646
- click.echo(f" Make sure the schema exists")
647
- raise click.Abort()
820
+ try:
821
+ agent_schema = load_agent_schema(agent_name)
822
+ click.echo(f"✓ Loaded agent schema: {agent_name}")
823
+ except FileNotFoundError as e:
824
+ logger.error(f"Failed to load agent schema: {e}")
825
+ click.echo(f"Error: Could not load agent schema '{agent_name}'")
826
+ click.echo(f" {e}")
827
+ raise click.Abort()
648
828
 
649
829
  # Create agent function from schema
650
830
  from rem.agentic.providers.pydantic_ai import create_agent
@@ -683,73 +863,97 @@ def run(
683
863
  return {"output": serialized}
684
864
  return serialized if isinstance(serialized, dict) else {"output": str(serialized)}
685
865
 
686
- # Load evaluator schema
866
+ # Load evaluator schema using centralized schema loader
687
867
  evaluator_name = config.evaluator_schema_ref.name
688
868
  evaluator_version = config.evaluator_schema_ref.version
689
869
 
690
- # Resolve evaluator path (evaluators are organized by agent name)
691
- evaluator_schema_path = f"rem/schemas/evaluators/{agent_name}/{evaluator_name}.yaml"
692
-
693
870
  click.echo(f"Loading evaluator: {evaluator_name} for agent {agent_name}")
694
871
 
695
- try:
696
- evaluator_fn = create_evaluator_from_schema(
697
- evaluator_schema_path=evaluator_schema_path,
698
- model_name=None, # Use default from schema
699
- )
700
- click.echo(f" Loaded evaluator schema")
701
- except Exception as e:
702
- logger.warning(f"Failed to load evaluator: {e}")
703
- click.echo(f"Error: Could not load evaluator schema")
704
- click.echo(f" Path: {evaluator_schema_path}")
705
- click.echo(f" Make sure the schema exists")
872
+ # Find evaluator schema file path
873
+ from rem.utils.schema_loader import get_evaluator_schema_path
874
+
875
+ evaluator_schema_path = get_evaluator_schema_path(evaluator_name)
876
+ if not evaluator_schema_path or not evaluator_schema_path.exists():
877
+ click.echo(f"Error: Could not find evaluator schema '{evaluator_name}'")
878
+ raise click.Abort()
879
+
880
+ click.echo(f" Found evaluator schema: {evaluator_schema_path}")
881
+
882
+ # For Phoenix mode, also load evaluator function
883
+ evaluator_fn = None
884
+ if not only_vibes:
885
+ # Try multiple evaluator path patterns (agent-specific, then generic)
886
+ evaluator_paths_to_try = [
887
+ f"{agent_name}/{evaluator_name}", # e.g., hello-world/default
888
+ f"{agent_name}-{evaluator_name}", # e.g., hello-world-default
889
+ evaluator_name, # e.g., default (generic)
890
+ ]
891
+
892
+ evaluator_load_error = None
893
+
894
+ for evaluator_path in evaluator_paths_to_try:
895
+ try:
896
+ evaluator_fn = create_evaluator_from_schema(
897
+ evaluator_schema_path=evaluator_path,
898
+ model_name=None, # Use default from schema
899
+ )
900
+ click.echo(f"✓ Loaded evaluator function: {evaluator_path}")
901
+ break
902
+ except FileNotFoundError as e:
903
+ evaluator_load_error = e
904
+ logger.debug(f"Evaluator not found at {evaluator_path}: {e}")
905
+ continue
906
+ except Exception as e:
907
+ evaluator_load_error = e
908
+ logger.warning(f"Failed to load evaluator from {evaluator_path}: {e}")
909
+ continue
910
+
911
+ if evaluator_fn is None and not only_vibes:
912
+ click.echo(f"Error: Could not load evaluator function '{evaluator_name}'")
913
+ click.echo(f" Tried paths: {evaluator_paths_to_try}")
914
+ if evaluator_load_error:
915
+ click.echo(f" Last error: {evaluator_load_error}")
706
916
  raise click.Abort()
707
917
 
708
- # Load dataset
918
+ # Load dataset using Polars
919
+ import polars as pl
920
+
709
921
  click.echo(f"Loading dataset: {list(config.datasets.keys())[0]}")
710
922
  dataset_ref = list(config.datasets.values())[0]
711
923
 
712
924
  if dataset_ref.location.value == "git":
713
- # Load from Git
925
+ # Load from Git (local filesystem)
714
926
  dataset_path = Path(base_path) / name / dataset_ref.path
715
927
  if not dataset_path.exists():
716
928
  click.echo(f"Error: Dataset not found: {dataset_path}")
717
929
  raise click.Abort()
718
930
 
719
931
  if dataset_ref.format == "csv":
720
- dataset_df = pd.read_csv(dataset_path)
932
+ dataset_df = pl.read_csv(dataset_path)
721
933
  elif dataset_ref.format == "parquet":
722
- dataset_df = pd.read_parquet(dataset_path)
934
+ dataset_df = pl.read_parquet(dataset_path)
723
935
  elif dataset_ref.format == "jsonl":
724
- dataset_df = pd.read_json(dataset_path, lines=True)
936
+ dataset_df = pl.read_ndjson(dataset_path)
725
937
  else:
726
938
  click.echo(f"Error: Format '{dataset_ref.format}' not yet supported")
727
939
  raise click.Abort()
728
940
  elif dataset_ref.location.value in ["s3", "hybrid"]:
729
941
  # Load from S3 using FS provider
730
942
  from rem.services.fs import FS
943
+ from io import BytesIO
731
944
 
732
945
  fs = FS()
733
946
 
734
947
  try:
735
948
  if dataset_ref.format == "csv":
736
949
  content = fs.read(dataset_ref.path)
737
- from io import StringIO
738
- dataset_df = pd.read_csv(StringIO(content))
950
+ dataset_df = pl.read_csv(BytesIO(content.encode() if isinstance(content, str) else content))
739
951
  elif dataset_ref.format == "parquet":
740
- # For parquet, we need binary read
741
- import tempfile
742
- with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp:
743
- tmp_path = tmp.name
744
- # Download via FS
745
- content_bytes = fs.read(dataset_ref.path)
746
- tmp.write(content_bytes)
747
- dataset_df = pd.read_parquet(tmp_path)
748
- Path(tmp_path).unlink() # Clean up temp file
952
+ content_bytes = fs.read(dataset_ref.path)
953
+ dataset_df = pl.read_parquet(BytesIO(content_bytes if isinstance(content_bytes, bytes) else content_bytes.encode()))
749
954
  elif dataset_ref.format == "jsonl":
750
955
  content = fs.read(dataset_ref.path)
751
- from io import StringIO
752
- dataset_df = pd.read_json(StringIO(content), lines=True)
956
+ dataset_df = pl.read_ndjson(BytesIO(content.encode() if isinstance(content, str) else content))
753
957
  else:
754
958
  click.echo(f"Error: Format '{dataset_ref.format}' not yet supported")
755
959
  raise click.Abort()
@@ -772,6 +976,18 @@ def run(
772
976
  # TODO: Implement prompt updating
773
977
  click.echo("⚠ --update-prompts not yet implemented")
774
978
 
979
+ # Vibes mode: run agent and export for AI evaluation
980
+ if only_vibes:
981
+ _run_vibes_mode(
982
+ config=config,
983
+ dataset_df=dataset_df,
984
+ task_fn=task_fn,
985
+ base_path=base_path,
986
+ limit=limit,
987
+ evaluator_schema_path=evaluator_schema_path,
988
+ )
989
+ return
990
+
775
991
  # Run experiment via Phoenix
776
992
  if not dry_run:
777
993
  # Create Phoenix client with optional overrides
@@ -793,13 +1009,13 @@ def run(
793
1009
 
794
1010
  client = PhoenixClient(config=phoenix_config)
795
1011
 
796
- experiment_name = f"{config.name}-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
1012
+ experiment_name = f"{config.name}-{format_timestamp_for_experiment()}"
797
1013
 
798
1014
  click.echo(f"\n⏳ Running experiment: {experiment_name}")
799
1015
  click.echo(f" This may take several minutes...")
800
1016
 
801
1017
  experiment = client.run_experiment(
802
- dataset=dataset_df, # type: ignore[arg-type]
1018
+ dataset=dataset_df,
803
1019
  task=task_fn,
804
1020
  evaluators=[evaluator_fn],
805
1021
  experiment_name=experiment_name,
@@ -809,12 +1025,15 @@ def run(
809
1025
  "evaluator": config.evaluator_schema_ref.name,
810
1026
  "experiment_config": config.name,
811
1027
  **config.metadata
812
- }
1028
+ },
1029
+ # Smart column detection for DataFrame -> Phoenix Dataset conversion
1030
+ input_keys=["input"] if "input" in dataset_df.columns else None,
1031
+ output_keys=["expected_output"] if "expected_output" in dataset_df.columns else None,
813
1032
  )
814
1033
 
815
1034
  # Update experiment status
816
1035
  config.status = ExperimentStatus.COMPLETED
817
- config.last_run_at = datetime.now()
1036
+ config.last_run_at = utc_now()
818
1037
  if not version: # Only save if not loading from Git
819
1038
  config.save(base_path)
820
1039
 
@@ -835,7 +1054,7 @@ def run(
835
1054
  "agent": config.agent_schema_ref.name,
836
1055
  "evaluator": config.evaluator_schema_ref.name,
837
1056
  "dataset_size": len(dataset_df),
838
- "completed_at": datetime.now().isoformat(),
1057
+ "completed_at": to_iso(utc_now()),
839
1058
  "phoenix_url": getattr(experiment, "url", None),
840
1059
  "task_runs": len(exp_data.get("task_runs", [])),
841
1060
  }
@@ -1015,20 +1234,24 @@ def dataset_add(
1015
1234
  --output-keys expected_label,expected_type
1016
1235
  """
1017
1236
  from rem.services.phoenix import PhoenixClient
1018
- import pandas as pd
1237
+ import polars as pl
1019
1238
 
1020
1239
  try:
1021
1240
  client = PhoenixClient()
1022
1241
 
1023
- # Load CSV
1024
- df = pd.read_csv(from_csv)
1242
+ # Load CSV with Polars
1243
+ df = pl.read_csv(from_csv)
1244
+ records = df.to_dicts()
1025
1245
 
1026
1246
  # Extract data
1027
- inputs = cast(list[dict[str, Any]], df[input_keys.split(",")].to_dict("records"))
1028
- outputs = cast(list[dict[str, Any]], df[output_keys.split(",")].to_dict("records"))
1247
+ input_cols = input_keys.split(",")
1248
+ output_cols = output_keys.split(",")
1249
+ inputs = [{k: row.get(k) for k in input_cols} for row in records]
1250
+ outputs = [{k: row.get(k) for k in output_cols} for row in records]
1029
1251
  metadata = None
1030
1252
  if metadata_keys:
1031
- metadata = cast(list[dict[str, Any]], df[metadata_keys.split(",")].to_dict("records"))
1253
+ meta_cols = metadata_keys.split(",")
1254
+ metadata = [{k: row.get(k) for k in meta_cols} for row in records]
1032
1255
 
1033
1256
  # Add to dataset
1034
1257
  dataset = client.add_examples_to_dataset(
@@ -1269,12 +1492,12 @@ def trace_list(
1269
1492
  rem experiments trace list --project rem-agents --days 7 --limit 50
1270
1493
  """
1271
1494
  from rem.services.phoenix import PhoenixClient
1272
- from datetime import datetime, timedelta
1495
+ from rem.utils.date_utils import days_ago
1273
1496
 
1274
1497
  try:
1275
1498
  client = PhoenixClient()
1276
1499
 
1277
- start_time = datetime.now() - timedelta(days=days)
1500
+ start_time = days_ago(days)
1278
1501
 
1279
1502
  traces_df = client.get_traces(
1280
1503
  project_name=project,
@@ -1300,3 +1523,175 @@ def trace_list(
1300
1523
  logger.error(f"Failed to list traces: {e}")
1301
1524
  click.echo(f"Error: {e}", err=True)
1302
1525
  raise click.Abort()
1526
+
1527
+
1528
+ # =============================================================================
1529
+ # EXPORT COMMAND
1530
+ # =============================================================================
1531
+
1532
+
1533
@experiments.command("export")
@click.argument("name")
@click.option("--base-path", help="Base directory for experiments (default: EXPERIMENTS_HOME or 'experiments')")
@click.option("--bucket", "-b", help="S3 bucket name (default: DATA_LAKE__BUCKET_NAME)")
@click.option("--version", "-v", default="v0", help="Data lake version prefix (default: v0)")
@click.option("--plan", is_flag=True, help="Show what would be exported without uploading")
@click.option("--include-results", is_flag=True, help="Include results directory in export")
def export(
    name: str,
    base_path: Optional[str],
    bucket: Optional[str],
    version: str,
    plan: bool,
    include_results: bool,
):
    """Export experiment to S3 data lake.

    Exports experiment configuration, ground truth, and optionally results
    to the S3 data lake following the convention:

        s3://{bucket}/{version}/datasets/calibration/experiments/{agent}/{task}/

    The export includes:
    - experiment.yaml (configuration)
    - README.md (documentation)
    - ground-truth/ (evaluation datasets)
    - seed-data/ (optional seed data)
    - results/ (optional, with --include-results)

    Examples:
        # Preview what would be exported
        rem experiments export my-experiment --plan

        # Export to configured data lake bucket
        rem experiments export my-experiment

        # Export to specific bucket
        rem experiments export my-experiment --bucket siggy-data

        # Include results in export
        rem experiments export my-experiment --include-results

        # Export with custom version prefix
        rem experiments export my-experiment --version v1
    """
    from rem.models.core.experiment import ExperimentConfig
    from rem.settings import settings
    from rem.services.fs.s3_provider import S3Provider
    import os

    try:
        # Resolve base path
        if base_path is None:
            base_path = os.getenv("EXPERIMENTS_HOME", "experiments")

        # Load experiment configuration
        config_path = Path(base_path) / name / "experiment.yaml"
        if not config_path.exists():
            click.echo(f"Experiment not found: {name}")
            click.echo(f" Looked in: {config_path}")
            raise click.Abort()

        config = ExperimentConfig.from_yaml(config_path)
        click.echo(f"✓ Loaded experiment: {name}")

        # Resolve bucket. An empty string is treated like "not configured"
        # so it cannot produce a malformed s3:// destination.
        if not bucket:
            bucket = settings.data_lake.bucket_name
        if not bucket:
            click.echo("Error: No S3 bucket configured.")
            click.echo(" Set DATA_LAKE__BUCKET_NAME environment variable or use --bucket option")
            raise click.Abort()

        # Build S3 paths
        s3_base = config.get_s3_export_path(bucket, version)
        exp_dir = config.get_experiment_dir(base_path)

        # Collect (s3 key relative to s3_base, local path) pairs.
        # Top-level files are exported whenever they exist.
        files_to_export = [
            (file_name, exp_dir / file_name)
            for file_name in ("experiment.yaml", "README.md")
            if (exp_dir / file_name).exists()
        ]

        # Directories are exported recursively; results/ only on request.
        export_dirs = ["ground-truth", "seed-data"]
        if include_results:
            export_dirs.append("results")

        for dir_name in export_dirs:
            directory = exp_dir / dir_name
            if not directory.exists():
                continue
            # sorted() gives a stable listing; as_posix() keeps "/" separators
            # in the S3 key even when running on Windows.
            for f in sorted(directory.rglob("*")):
                if f.is_file():
                    files_to_export.append((f.relative_to(exp_dir).as_posix(), f))

        # Display export plan
        click.echo(f"\n{'=' * 60}")
        click.echo(f"EXPORT {'PLAN' if plan else 'TO S3'}")
        click.echo(f"{'=' * 60}")
        click.echo(f"\nExperiment: {config.name}")
        click.echo(f"Agent: {config.agent_schema_ref.name}")
        click.echo(f"Task: {config.task}")
        click.echo(f"Evaluator file: {config.get_evaluator_filename()}")
        click.echo(f"\nDestination: {s3_base}/")
        click.echo(f"\nFiles to export ({len(files_to_export)}):")

        for s3_name, local_path in files_to_export:
            if plan:
                click.echo(f" {local_path}")
                click.echo(f" → {s3_base}/{s3_name}")
            else:
                click.echo(f" {s3_name}")

        if plan:
            click.echo("\n[PLAN MODE] No files were uploaded.")
            click.echo("Run without --plan to execute the export.")
            return

        # Execute export
        click.echo("\n⏳ Uploading to S3...")
        s3 = S3Provider()

        uploaded = 0
        for s3_name, local_path in files_to_export:
            s3_uri = f"{s3_base}/{s3_name}"
            try:
                s3.copy(str(local_path), s3_uri)
            except Exception as e:
                # Best effort: report the failure and continue with the rest.
                logger.warning(f"Failed to upload {local_path} to {s3_uri}: {e}")
                click.echo(f" ✗ {s3_name}: {e}")
            else:
                uploaded += 1
                click.echo(f" ✓ {s3_name}")

        click.echo(f"\n✓ Exported {uploaded}/{len(files_to_export)} files to {s3_base}/")

        failed = len(files_to_export) - uploaded
        if failed:
            # Make an incomplete export visible after the per-file output.
            click.echo(f"⚠ {failed} file(s) failed to upload; see errors above.", err=True)

        # Show next steps
        click.echo("\nNext steps:")
        click.echo(f" - View in S3: aws s3 ls {s3_base}/ --recursive")
        click.echo(f" - Download: aws s3 sync {s3_base}/ ./{config.agent_schema_ref.name}/{config.task}/")

    except click.Abort:
        # click.Abort is a RuntimeError subclass; let deliberate aborts pass
        # through without being reported as an unexpected failure with an
        # empty message.
        raise
    except Exception as e:
        logger.error(f"Failed to export experiment: {e}")
        click.echo(f"Error: {e}", err=True)
        raise click.Abort()