remdb 0.3.114__py3-none-any.whl → 0.3.172__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of remdb might be problematic. Click here for more details.
- rem/agentic/agents/__init__.py +16 -0
- rem/agentic/agents/agent_manager.py +311 -0
- rem/agentic/agents/sse_simulator.py +2 -0
- rem/agentic/context.py +103 -5
- rem/agentic/context_builder.py +36 -9
- rem/agentic/mcp/tool_wrapper.py +161 -18
- rem/agentic/otel/setup.py +1 -0
- rem/agentic/providers/phoenix.py +371 -108
- rem/agentic/providers/pydantic_ai.py +172 -30
- rem/agentic/schema.py +8 -4
- rem/api/deps.py +3 -5
- rem/api/main.py +26 -4
- rem/api/mcp_router/resources.py +15 -10
- rem/api/mcp_router/server.py +11 -3
- rem/api/mcp_router/tools.py +418 -4
- rem/api/middleware/tracking.py +5 -5
- rem/api/routers/admin.py +218 -1
- rem/api/routers/auth.py +349 -6
- rem/api/routers/chat/completions.py +255 -7
- rem/api/routers/chat/models.py +81 -7
- rem/api/routers/chat/otel_utils.py +33 -0
- rem/api/routers/chat/sse_events.py +17 -1
- rem/api/routers/chat/streaming.py +126 -19
- rem/api/routers/feedback.py +134 -14
- rem/api/routers/messages.py +24 -15
- rem/api/routers/query.py +6 -3
- rem/auth/__init__.py +13 -3
- rem/auth/jwt.py +352 -0
- rem/auth/middleware.py +115 -10
- rem/auth/providers/__init__.py +4 -1
- rem/auth/providers/email.py +215 -0
- rem/cli/commands/README.md +42 -0
- rem/cli/commands/cluster.py +617 -168
- rem/cli/commands/configure.py +4 -7
- rem/cli/commands/db.py +66 -22
- rem/cli/commands/experiments.py +468 -76
- rem/cli/commands/schema.py +6 -5
- rem/cli/commands/session.py +336 -0
- rem/cli/dreaming.py +2 -2
- rem/cli/main.py +2 -0
- rem/config.py +8 -1
- rem/models/core/experiment.py +58 -14
- rem/models/entities/__init__.py +4 -0
- rem/models/entities/ontology.py +1 -1
- rem/models/entities/ontology_config.py +1 -1
- rem/models/entities/subscriber.py +175 -0
- rem/models/entities/user.py +1 -0
- rem/schemas/agents/core/agent-builder.yaml +235 -0
- rem/schemas/agents/examples/contract-analyzer.yaml +1 -1
- rem/schemas/agents/examples/contract-extractor.yaml +1 -1
- rem/schemas/agents/examples/cv-parser.yaml +1 -1
- rem/services/__init__.py +3 -1
- rem/services/content/service.py +4 -3
- rem/services/email/__init__.py +10 -0
- rem/services/email/service.py +513 -0
- rem/services/email/templates.py +360 -0
- rem/services/phoenix/client.py +59 -18
- rem/services/postgres/README.md +38 -0
- rem/services/postgres/diff_service.py +127 -6
- rem/services/postgres/pydantic_to_sqlalchemy.py +45 -13
- rem/services/postgres/repository.py +5 -4
- rem/services/postgres/schema_generator.py +205 -4
- rem/services/session/compression.py +120 -50
- rem/services/session/reload.py +14 -7
- rem/services/user_service.py +41 -9
- rem/settings.py +442 -23
- rem/sql/migrations/001_install.sql +156 -0
- rem/sql/migrations/002_install_models.sql +1951 -88
- rem/sql/migrations/004_cache_system.sql +548 -0
- rem/sql/migrations/005_schema_update.sql +145 -0
- rem/utils/README.md +45 -0
- rem/utils/__init__.py +18 -0
- rem/utils/files.py +157 -1
- rem/utils/schema_loader.py +139 -10
- rem/utils/sql_paths.py +146 -0
- rem/utils/vision.py +1 -1
- rem/workers/__init__.py +3 -1
- rem/workers/db_listener.py +579 -0
- rem/workers/unlogged_maintainer.py +463 -0
- {remdb-0.3.114.dist-info → remdb-0.3.172.dist-info}/METADATA +218 -180
- {remdb-0.3.114.dist-info → remdb-0.3.172.dist-info}/RECORD +83 -68
- {remdb-0.3.114.dist-info → remdb-0.3.172.dist-info}/WHEEL +0 -0
- {remdb-0.3.114.dist-info → remdb-0.3.172.dist-info}/entry_points.txt +0 -0
rem/cli/commands/experiments.py
CHANGED
|
@@ -63,6 +63,7 @@ def experiments():
|
|
|
63
63
|
@experiments.command("create")
|
|
64
64
|
@click.argument("name")
|
|
65
65
|
@click.option("--agent", "-a", required=True, help="Agent schema name (e.g., 'cv-parser')")
|
|
66
|
+
@click.option("--task", "-t", default="general", help="Task name for organizing experiments (e.g., 'risk-assessment')")
|
|
66
67
|
@click.option("--evaluator", "-e", default="default", help="Evaluator schema name (default: 'default')")
|
|
67
68
|
@click.option("--description", "-d", help="Experiment description")
|
|
68
69
|
@click.option("--dataset-location", type=click.Choice(["git", "s3", "hybrid"]), default="git",
|
|
@@ -74,6 +75,7 @@ def experiments():
|
|
|
74
75
|
def create(
|
|
75
76
|
name: str,
|
|
76
77
|
agent: str,
|
|
78
|
+
task: str,
|
|
77
79
|
evaluator: str,
|
|
78
80
|
description: Optional[str],
|
|
79
81
|
dataset_location: str,
|
|
@@ -123,19 +125,17 @@ def create(
|
|
|
123
125
|
# Resolve base path: CLI arg > EXPERIMENTS_HOME env var > default "experiments"
|
|
124
126
|
if base_path is None:
|
|
125
127
|
base_path = os.getenv("EXPERIMENTS_HOME", "experiments")
|
|
126
|
-
# Build dataset reference
|
|
128
|
+
# Build dataset reference (format auto-detected from file extension)
|
|
127
129
|
if dataset_location == "git":
|
|
128
130
|
dataset_ref = DatasetReference(
|
|
129
131
|
location=DatasetLocation.GIT,
|
|
130
132
|
path="ground-truth/dataset.csv",
|
|
131
|
-
format="csv",
|
|
132
133
|
description="Ground truth Q&A dataset for evaluation"
|
|
133
134
|
)
|
|
134
135
|
else: # s3 or hybrid
|
|
135
136
|
dataset_ref = DatasetReference(
|
|
136
137
|
location=DatasetLocation(dataset_location),
|
|
137
138
|
path=f"s3://rem-experiments/{name}/datasets/ground_truth.parquet",
|
|
138
|
-
format="parquet",
|
|
139
139
|
schema_path="datasets/schema.yaml" if dataset_location == "hybrid" else None,
|
|
140
140
|
description="Ground truth dataset for evaluation"
|
|
141
141
|
)
|
|
@@ -170,7 +170,8 @@ def create(
|
|
|
170
170
|
# Create experiment config
|
|
171
171
|
config = ExperimentConfig(
|
|
172
172
|
name=name,
|
|
173
|
-
|
|
173
|
+
task=task,
|
|
174
|
+
description=description or f"Evaluation experiment for {agent} agent ({task} task)",
|
|
174
175
|
agent_schema_ref=SchemaReference(
|
|
175
176
|
name=agent,
|
|
176
177
|
version=None, # Use latest by default
|
|
@@ -514,6 +515,159 @@ def show(name: str, base_path: Optional[str]):
|
|
|
514
515
|
raise click.Abort()
|
|
515
516
|
|
|
516
517
|
|
|
518
|
+
# =============================================================================
|
|
519
|
+
# VIBES MODE HELPER
|
|
520
|
+
# =============================================================================
|
|
521
|
+
|
|
522
|
+
|
|
523
|
+
def _run_vibes_mode(
    config: Any,
    dataset_df: Any,
    task_fn: Any,
    base_path: str,
    limit: Optional[int],
    evaluator_schema_path: Path,
) -> None:
    """Run experiment in vibes mode - execute agent and export for AI evaluation.

    Vibes mode runs the agent on each example and saves results to a JSONL file.
    The AI assistant (e.g., Claude Code) then acts as the judge using the
    evaluator schema to evaluate results.

    Three files are written to ``<base_path>/<config.name>/results/<timestamp>/``:
    ``vibes-results.jsonl`` (one JSON object per example), ``evaluator-schema.yaml``
    (a copy of ``evaluator_schema_path``) and ``run-info.json`` (run metadata).

    Each result line is written as soon as its example finishes, so an
    interrupted run keeps the examples already processed.

    Args:
        config: ExperimentConfig object; ``name``, ``agent_schema_ref.name`` and
            ``evaluator_schema_ref.name`` are read from it.
        dataset_df: Polars DataFrame with ground truth examples (only ``head``
            and ``to_dicts`` are used).
        task_fn: Function to run agent on each example; it is called with
            ``{"input": ...}`` and its return value is stored as ``agent_output``.
        base_path: Base directory for experiments
        limit: Optional limit on number of examples to process; a falsy value
            (``None`` or ``0``) means no limit.
        evaluator_schema_path: Path to the evaluator schema YAML file
    """
    import json
    import shutil

    from rem.utils.date_utils import format_timestamp_for_experiment, utc_now, to_iso

    # Apply limit if specified
    if limit:
        dataset_df = dataset_df.head(limit)
        click.echo(f" (Limited to {limit} examples)")

    # Create results directory
    timestamp = format_timestamp_for_experiment()
    results_dir = Path(base_path) / config.name / "results" / timestamp
    results_dir.mkdir(parents=True, exist_ok=True)

    records = dataset_df.to_dicts()
    total = len(records)

    click.echo(f"\n⏳ Running agent on {total} examples...")
    click.echo(f" Results will be saved to: {results_dir}")
    click.echo()

    # Run agent on each example, streaming every result to the JSONL file so
    # that completed work survives a crash or Ctrl-C later in the run.
    results_file = results_dir / "vibes-results.jsonl"
    success_count = 0
    with open(results_file, "w", encoding="utf-8") as f:
        for i, record in enumerate(records, 1):
            example_id = record.get("id", i)
            click.echo(f" [{i}/{total}] Processing example {example_id}...", nl=False)

            # Resolve the input once so success and error rows report the same value.
            input_value = _vibes_input(record)
            result = {
                "id": example_id,
                "input": input_value,
                "ground_truth": record.get("ground_truth", record.get("expected_output", "")),
                "category": record.get("category", ""),
            }

            try:
                # Plain strings are wrapped as a query; anything else is passed through.
                example_input = {"query": input_value} if isinstance(input_value, str) else input_value
                agent_output = task_fn({"input": example_input})
            except Exception as e:
                # One failing example must not stop the run; record it and continue.
                result["agent_output"] = None
                result["status"] = "error"
                result["error"] = str(e)
                click.echo(f" ✗ ({e})")
            else:
                result["agent_output"] = agent_output
                result["status"] = "success"
                success_count += 1
                click.echo(" ✓")

            # default=str is a deliberate fallback: dataset cells (e.g. dates) or
            # agent outputs that are not JSON-native are stringified instead of
            # aborting the export after the agent work has already been done.
            f.write(json.dumps(result, default=str) + "\n")
            f.flush()

    fail_count = total - success_count

    # Copy evaluator schema to results dir for easy reference
    shutil.copy(evaluator_schema_path, results_dir / "evaluator-schema.yaml")

    # Save run metadata
    run_info = {
        "experiment": config.name,
        "agent": config.agent_schema_ref.name,
        "evaluator": config.evaluator_schema_ref.name,
        "mode": "vibes",
        "timestamp": timestamp,
        "total_examples": total,
        "successful": success_count,
        "failed": fail_count,
        "completed_at": to_iso(utc_now()),
    }
    with open(results_dir / "run-info.json", "w", encoding="utf-8") as f:
        json.dump(run_info, f, indent=2)

    # Print summary and instructions
    separator = "=" * 60
    click.echo(f"\n{separator}")
    click.echo("VIBES MODE COMPLETE")
    click.echo(separator)
    click.echo(f"\nResults: {success_count} successful, {fail_count} failed")
    click.echo(f"\nFiles saved to: {results_dir}/")
    click.echo(" - vibes-results.jsonl (agent outputs)")
    click.echo(" - evaluator-schema.yaml (evaluation criteria)")
    click.echo(" - run-info.json (run metadata)")

    click.echo(f"\n{separator}")
    click.echo("NEXT STEP: Ask your AI assistant to evaluate")
    click.echo(separator)

    # The leading and trailing empty entries reproduce the blank line before
    # and after the prompt text.
    prompt_lines = [
        "",
        "Copy this prompt to Claude Code or your AI assistant:",
        "",
        "Please evaluate the experiment results in:",
        f"{results_dir}/",
        "",
        "Read the vibes-results.jsonl file and evaluate each example",
        "using the evaluator schema in evaluator-schema.yaml.",
        "",
        "For each example, provide:",
        "1. extracted_classification",
        "2. exact_match (vs ground_truth)",
        "3. semantic_match",
        "4. reasoning_quality_score",
        "5. overall_score",
        "6. pass/fail",
        "",
        "Then provide summary metrics:",
        "- Exact match accuracy",
        "- Semantic match accuracy",
        "- Average overall score",
        "- Pass rate",
        "",
    ]
    click.echo("\n".join(prompt_lines))


def _vibes_input(record: dict) -> Any:
    """Return the agent input of a dataset row: first of 'text', 'input', 'query', else ''."""
    for key in ("text", "input", "query"):
        if key in record:
            return record[key]
    return ""
|
|
669
|
+
|
|
670
|
+
|
|
517
671
|
# =============================================================================
|
|
518
672
|
# RUN COMMAND
|
|
519
673
|
# =============================================================================
|
|
@@ -524,6 +678,8 @@ def show(name: str, base_path: Optional[str]):
|
|
|
524
678
|
@click.option("--base-path", help="Base directory for experiments (default: EXPERIMENTS_HOME or 'experiments')")
|
|
525
679
|
@click.option("--version", help="Git tag version to load (e.g., 'experiments/my-exp/v1.0.0')")
|
|
526
680
|
@click.option("--dry-run", is_flag=True, help="Test on small subset without saving")
|
|
681
|
+
@click.option("--only-vibes", is_flag=True, help="Run agent locally, export results for AI evaluation (no Phoenix)")
|
|
682
|
+
@click.option("--limit", "-n", type=int, help="Limit number of examples to evaluate (useful with --only-vibes)")
|
|
527
683
|
@click.option("--update-prompts", is_flag=True, help="Update prompts in Phoenix before running")
|
|
528
684
|
@click.option("--phoenix-url", help="Phoenix server URL (overrides PHOENIX_BASE_URL env var)")
|
|
529
685
|
@click.option("--phoenix-api-key", help="Phoenix API key (overrides PHOENIX_API_KEY env var)")
|
|
@@ -532,14 +688,45 @@ def run(
|
|
|
532
688
|
base_path: Optional[str],
|
|
533
689
|
version: Optional[str],
|
|
534
690
|
dry_run: bool,
|
|
691
|
+
only_vibes: bool,
|
|
692
|
+
limit: Optional[int],
|
|
535
693
|
update_prompts: bool,
|
|
536
694
|
phoenix_url: Optional[str],
|
|
537
695
|
phoenix_api_key: Optional[str],
|
|
538
696
|
):
|
|
539
|
-
"""Run an experiment using Phoenix provider.
|
|
697
|
+
"""Run an experiment using Phoenix provider or local vibes mode.
|
|
540
698
|
|
|
541
699
|
Loads configuration, executes agent and evaluator, saves results.
|
|
542
700
|
|
|
701
|
+
Vibes Mode (--only-vibes):
|
|
702
|
+
Run agent locally without Phoenix infrastructure. Agent outputs are saved
|
|
703
|
+
to a JSONL file along with the evaluator schema. Your AI assistant (e.g.,
|
|
704
|
+
Claude Code) then acts as the judge to evaluate results.
|
|
705
|
+
|
|
706
|
+
This enables seamless switching between:
|
|
707
|
+
- Local evaluation: Quick iteration with AI-as-judge
|
|
708
|
+
- Phoenix evaluation: Production metrics and dashboards
|
|
709
|
+
|
|
710
|
+
Usage:
|
|
711
|
+
rem experiments run my-experiment --only-vibes
|
|
712
|
+
rem experiments run my-experiment --only-vibes --limit 5
|
|
713
|
+
|
|
714
|
+
The command will:
|
|
715
|
+
1. Run the agent on each ground-truth example
|
|
716
|
+
2. Save results to results/{timestamp}/vibes-results.jsonl
|
|
717
|
+
3. Print the evaluator prompt and schema
|
|
718
|
+
4. Instruct you to ask your AI assistant to evaluate
|
|
719
|
+
|
|
720
|
+
Example workflow with Claude Code:
|
|
721
|
+
$ rem experiments run mental-health-classifier --only-vibes --limit 3
|
|
722
|
+
# ... agent runs ...
|
|
723
|
+
# Results saved to: .experiments/mental-health-classifier/results/20241203-143022/
|
|
724
|
+
|
|
725
|
+
# Then ask Claude Code:
|
|
726
|
+
"Please evaluate the experiment results in
|
|
727
|
+
.experiments/mental-health-classifier/results/20241203-143022/
|
|
728
|
+
using the evaluator schema provided"
|
|
729
|
+
|
|
543
730
|
Phoenix Connection:
|
|
544
731
|
Commands respect PHOENIX_BASE_URL and PHOENIX_API_KEY environment variables.
|
|
545
732
|
Defaults to localhost:6006 for local development.
|
|
@@ -562,6 +749,12 @@ def run(
|
|
|
562
749
|
# Run experiment with latest schemas
|
|
563
750
|
rem experiments run hello-world-validation
|
|
564
751
|
|
|
752
|
+
# Quick local evaluation (vibes mode)
|
|
753
|
+
rem experiments run hello-world-validation --only-vibes
|
|
754
|
+
|
|
755
|
+
# Vibes mode with limited examples
|
|
756
|
+
rem experiments run hello-world-validation --only-vibes --limit 5
|
|
757
|
+
|
|
565
758
|
# Run specific version
|
|
566
759
|
rem experiments run hello-world-validation \\
|
|
567
760
|
--version experiments/hello-world-validation/v1.0.0
|
|
@@ -674,92 +867,107 @@ def run(
|
|
|
674
867
|
|
|
675
868
|
click.echo(f"Loading evaluator: {evaluator_name} for agent {agent_name}")
|
|
676
869
|
|
|
677
|
-
#
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
870
|
+
# Find evaluator schema file path
|
|
871
|
+
from rem.utils.schema_loader import get_evaluator_schema_path
|
|
872
|
+
|
|
873
|
+
evaluator_schema_path = get_evaluator_schema_path(evaluator_name)
|
|
874
|
+
if not evaluator_schema_path or not evaluator_schema_path.exists():
|
|
875
|
+
click.echo(f"Error: Could not find evaluator schema '{evaluator_name}'")
|
|
876
|
+
raise click.Abort()
|
|
683
877
|
|
|
878
|
+
click.echo(f"✓ Found evaluator schema: {evaluator_schema_path}")
|
|
879
|
+
|
|
880
|
+
# For Phoenix mode, also load evaluator function
|
|
684
881
|
evaluator_fn = None
|
|
685
|
-
|
|
882
|
+
if not only_vibes:
|
|
883
|
+
# Try multiple evaluator path patterns (agent-specific, then generic)
|
|
884
|
+
evaluator_paths_to_try = [
|
|
885
|
+
f"{agent_name}/{evaluator_name}", # e.g., hello-world/default
|
|
886
|
+
f"{agent_name}-{evaluator_name}", # e.g., hello-world-default
|
|
887
|
+
evaluator_name, # e.g., default (generic)
|
|
888
|
+
]
|
|
686
889
|
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
890
|
+
evaluator_load_error = None
|
|
891
|
+
|
|
892
|
+
for evaluator_path in evaluator_paths_to_try:
|
|
893
|
+
try:
|
|
894
|
+
evaluator_fn = create_evaluator_from_schema(
|
|
895
|
+
evaluator_schema_path=evaluator_path,
|
|
896
|
+
model_name=None, # Use default from schema
|
|
897
|
+
)
|
|
898
|
+
click.echo(f"✓ Loaded evaluator function: {evaluator_path}")
|
|
899
|
+
break
|
|
900
|
+
except FileNotFoundError as e:
|
|
901
|
+
evaluator_load_error = e
|
|
902
|
+
logger.debug(f"Evaluator not found at {evaluator_path}: {e}")
|
|
903
|
+
continue
|
|
904
|
+
except Exception as e:
|
|
905
|
+
evaluator_load_error = e
|
|
906
|
+
logger.warning(f"Failed to load evaluator from {evaluator_path}: {e}")
|
|
907
|
+
continue
|
|
703
908
|
|
|
704
|
-
if evaluator_fn is None:
|
|
705
|
-
click.echo(f"Error: Could not load evaluator
|
|
909
|
+
if evaluator_fn is None and not only_vibes:
|
|
910
|
+
click.echo(f"Error: Could not load evaluator function '{evaluator_name}'")
|
|
706
911
|
click.echo(f" Tried paths: {evaluator_paths_to_try}")
|
|
707
912
|
if evaluator_load_error:
|
|
708
913
|
click.echo(f" Last error: {evaluator_load_error}")
|
|
709
914
|
raise click.Abort()
|
|
710
915
|
|
|
711
|
-
#
|
|
712
|
-
|
|
916
|
+
# Validate evaluator credentials before running expensive agent tasks
|
|
917
|
+
if evaluator_fn is not None and not only_vibes:
|
|
918
|
+
from rem.agentic.providers.phoenix import validate_evaluator_credentials
|
|
919
|
+
|
|
920
|
+
click.echo("Validating evaluator credentials...")
|
|
921
|
+
is_valid, error_msg = validate_evaluator_credentials()
|
|
922
|
+
if not is_valid:
|
|
923
|
+
click.echo(click.style(f"\n⚠️ Evaluator validation failed: {error_msg}", fg="yellow"))
|
|
924
|
+
click.echo("\nOptions:")
|
|
925
|
+
click.echo(" 1. Fix the credentials issue and re-run")
|
|
926
|
+
click.echo(" 2. Run with --only-vibes to skip LLM evaluation")
|
|
927
|
+
click.echo(" 3. Use --evaluator-model to specify a different model")
|
|
928
|
+
raise click.Abort()
|
|
929
|
+
click.echo("✓ Evaluator credentials validated")
|
|
930
|
+
|
|
931
|
+
# Load dataset using read_dataframe utility (auto-detects format from extension)
|
|
932
|
+
from rem.utils.files import read_dataframe
|
|
713
933
|
|
|
714
934
|
click.echo(f"Loading dataset: {list(config.datasets.keys())[0]}")
|
|
715
935
|
dataset_ref = list(config.datasets.values())[0]
|
|
716
936
|
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
if dataset_ref.format == "csv":
|
|
725
|
-
dataset_df = pl.read_csv(dataset_path)
|
|
726
|
-
elif dataset_ref.format == "parquet":
|
|
727
|
-
dataset_df = pl.read_parquet(dataset_path)
|
|
728
|
-
elif dataset_ref.format == "jsonl":
|
|
729
|
-
dataset_df = pl.read_ndjson(dataset_path)
|
|
730
|
-
else:
|
|
731
|
-
click.echo(f"Error: Format '{dataset_ref.format}' not yet supported")
|
|
732
|
-
raise click.Abort()
|
|
733
|
-
elif dataset_ref.location.value in ["s3", "hybrid"]:
|
|
734
|
-
# Load from S3 using FS provider
|
|
735
|
-
from rem.services.fs import FS
|
|
736
|
-
from io import BytesIO
|
|
937
|
+
try:
|
|
938
|
+
if dataset_ref.location.value == "git":
|
|
939
|
+
# Load from Git (local filesystem)
|
|
940
|
+
dataset_path = Path(base_path) / name / dataset_ref.path
|
|
941
|
+
if not dataset_path.exists():
|
|
942
|
+
click.echo(f"Error: Dataset not found: {dataset_path}")
|
|
943
|
+
raise click.Abort()
|
|
737
944
|
|
|
738
|
-
|
|
945
|
+
dataset_df = read_dataframe(dataset_path)
|
|
739
946
|
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
dataset_df = pl.read_csv(BytesIO(content.encode() if isinstance(content, str) else content))
|
|
744
|
-
elif dataset_ref.format == "parquet":
|
|
745
|
-
content_bytes = fs.read(dataset_ref.path)
|
|
746
|
-
dataset_df = pl.read_parquet(BytesIO(content_bytes if isinstance(content_bytes, bytes) else content_bytes.encode()))
|
|
747
|
-
elif dataset_ref.format == "jsonl":
|
|
748
|
-
content = fs.read(dataset_ref.path)
|
|
749
|
-
dataset_df = pl.read_ndjson(BytesIO(content.encode() if isinstance(content, str) else content))
|
|
750
|
-
else:
|
|
751
|
-
click.echo(f"Error: Format '{dataset_ref.format}' not yet supported")
|
|
752
|
-
raise click.Abort()
|
|
947
|
+
elif dataset_ref.location.value in ["s3", "hybrid"]:
|
|
948
|
+
# Load from S3 using FS provider
|
|
949
|
+
from rem.services.fs import FS
|
|
753
950
|
|
|
951
|
+
fs = FS()
|
|
952
|
+
content = fs.read(dataset_ref.path)
|
|
953
|
+
# Ensure we have bytes
|
|
954
|
+
if isinstance(content, str):
|
|
955
|
+
content = content.encode()
|
|
956
|
+
dataset_df = read_dataframe(content, filename=dataset_ref.path)
|
|
754
957
|
click.echo(f"✓ Loaded dataset from S3")
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
click.echo(f"Error:
|
|
758
|
-
click.echo(f" Path: {dataset_ref.path}")
|
|
759
|
-
click.echo(f" Format: {dataset_ref.format}")
|
|
958
|
+
|
|
959
|
+
else:
|
|
960
|
+
click.echo(f"Error: Unknown dataset location: {dataset_ref.location.value}")
|
|
760
961
|
raise click.Abort()
|
|
761
|
-
|
|
762
|
-
|
|
962
|
+
|
|
963
|
+
except ValueError as e:
|
|
964
|
+
# Unsupported format error from read_dataframe
|
|
965
|
+
click.echo(f"Error: {e}")
|
|
966
|
+
raise click.Abort()
|
|
967
|
+
except Exception as e:
|
|
968
|
+
logger.error(f"Failed to load dataset: {e}")
|
|
969
|
+
click.echo(f"Error: Could not load dataset")
|
|
970
|
+
click.echo(f" Path: {dataset_ref.path}")
|
|
763
971
|
raise click.Abort()
|
|
764
972
|
|
|
765
973
|
click.echo(f"✓ Loaded dataset: {len(dataset_df)} examples")
|
|
@@ -769,6 +977,18 @@ def run(
|
|
|
769
977
|
# TODO: Implement prompt updating
|
|
770
978
|
click.echo("⚠ --update-prompts not yet implemented")
|
|
771
979
|
|
|
980
|
+
# Vibes mode: run agent and export for AI evaluation
|
|
981
|
+
if only_vibes:
|
|
982
|
+
_run_vibes_mode(
|
|
983
|
+
config=config,
|
|
984
|
+
dataset_df=dataset_df,
|
|
985
|
+
task_fn=task_fn,
|
|
986
|
+
base_path=base_path,
|
|
987
|
+
limit=limit,
|
|
988
|
+
evaluator_schema_path=evaluator_schema_path,
|
|
989
|
+
)
|
|
990
|
+
return
|
|
991
|
+
|
|
772
992
|
# Run experiment via Phoenix
|
|
773
993
|
if not dry_run:
|
|
774
994
|
# Create Phoenix client with optional overrides
|
|
@@ -1067,7 +1287,7 @@ def prompt():
|
|
|
1067
1287
|
@click.option("--system-prompt", "-s", required=True, help="System prompt text")
|
|
1068
1288
|
@click.option("--description", "-d", help="Prompt description")
|
|
1069
1289
|
@click.option("--model-provider", default="OPENAI", help="Model provider (OPENAI, ANTHROPIC)")
|
|
1070
|
-
@click.option("--model-name", "-m", help="Model name (e.g., gpt-
|
|
1290
|
+
@click.option("--model-name", "-m", help="Model name (e.g., gpt-4.1, claude-sonnet-4-5)")
|
|
1071
1291
|
@click.option("--type", "-t", "prompt_type", default="Agent", help="Prompt type (Agent or Evaluator)")
|
|
1072
1292
|
def prompt_create(
|
|
1073
1293
|
name: str,
|
|
@@ -1083,7 +1303,7 @@ def prompt_create(
|
|
|
1083
1303
|
# Create agent prompt
|
|
1084
1304
|
rem experiments prompt create hello-world \\
|
|
1085
1305
|
--system-prompt "You are a helpful assistant." \\
|
|
1086
|
-
--model-name gpt-
|
|
1306
|
+
--model-name gpt-4.1
|
|
1087
1307
|
|
|
1088
1308
|
# Create evaluator prompt
|
|
1089
1309
|
rem experiments prompt create correctness-evaluator \\
|
|
@@ -1101,7 +1321,7 @@ def prompt_create(
|
|
|
1101
1321
|
try:
|
|
1102
1322
|
# Set default model if not specified
|
|
1103
1323
|
if not model_name:
|
|
1104
|
-
model_name = "gpt-
|
|
1324
|
+
model_name = "gpt-4.1" if model_provider == "OPENAI" else "claude-sonnet-4-5-20250929"
|
|
1105
1325
|
|
|
1106
1326
|
# Get config
|
|
1107
1327
|
phoenix_client = PhoenixClient()
|
|
@@ -1304,3 +1524,175 @@ def trace_list(
|
|
|
1304
1524
|
logger.error(f"Failed to list traces: {e}")
|
|
1305
1525
|
click.echo(f"Error: {e}", err=True)
|
|
1306
1526
|
raise click.Abort()
|
|
1527
|
+
|
|
1528
|
+
|
|
1529
|
+
# =============================================================================
|
|
1530
|
+
# EXPORT COMMAND
|
|
1531
|
+
# =============================================================================
|
|
1532
|
+
|
|
1533
|
+
|
|
1534
|
+
@experiments.command("export")
@click.argument("name")
@click.option("--base-path", help="Base directory for experiments (default: EXPERIMENTS_HOME or 'experiments')")
@click.option("--bucket", "-b", help="S3 bucket name (default: DATA_LAKE__BUCKET_NAME)")
@click.option("--version", "-v", default="v0", help="Data lake version prefix (default: v0)")
@click.option("--plan", is_flag=True, help="Show what would be exported without uploading")
@click.option("--include-results", is_flag=True, help="Include results directory in export")
def export(
    name: str,
    base_path: Optional[str],
    bucket: Optional[str],
    version: str,
    plan: bool,
    include_results: bool,
):
    """Export experiment to S3 data lake.

    Exports experiment configuration, ground truth, and optionally results
    to the S3 data lake following the convention:

        s3://{bucket}/{version}/datasets/calibration/experiments/{agent}/{task}/

    The export includes:
        - experiment.yaml (configuration)
        - README.md (documentation)
        - ground-truth/ (evaluation datasets)
        - seed-data/ (optional seed data)
        - results/ (optional, with --include-results)

    Examples:
        # Preview what would be exported
        rem experiments export my-experiment --plan

        # Export to configured data lake bucket
        rem experiments export my-experiment

        # Export to specific bucket
        rem experiments export my-experiment --bucket siggy-data

        # Include results in export
        rem experiments export my-experiment --include-results

        # Export with custom version prefix
        rem experiments export my-experiment --version v1
    """
    # NOTE: the docstring above is the CLI help text, so implementation notes
    # live in comments.
    #
    # Flow: resolve base path and bucket -> collect (s3_key, local_path) pairs
    # -> print the plan -> upload each file unless --plan was given.
    # Every failure path ends in click.Abort so the command exits non-zero.
    import os

    from rem.models.core.experiment import ExperimentConfig
    from rem.services.fs.s3_provider import S3Provider
    from rem.settings import settings

    try:
        # Resolve base path: CLI arg > EXPERIMENTS_HOME env var > "experiments"
        if base_path is None:
            base_path = os.getenv("EXPERIMENTS_HOME", "experiments")

        # Load experiment configuration
        config_path = Path(base_path) / name / "experiment.yaml"
        if not config_path.exists():
            click.echo(f"Experiment not found: {name}")
            click.echo(f" Looked in: {config_path}")
            raise click.Abort()

        config = ExperimentConfig.from_yaml(config_path)
        click.echo(f"✓ Loaded experiment: {name}")

        # Resolve bucket: CLI arg > settings.data_lake.bucket_name
        if bucket is None:
            bucket = settings.data_lake.bucket_name
        if bucket is None:
            click.echo("Error: No S3 bucket configured.")
            click.echo(" Set DATA_LAKE__BUCKET_NAME environment variable or use --bucket option")
            raise click.Abort()

        # Build S3 paths
        s3_base = config.get_s3_export_path(bucket, version)
        exp_dir = config.get_experiment_dir(base_path)

        # Top-level files are exported only when they exist locally.
        files_to_export = [
            (s3_name, local_path)
            for s3_name, local_path in (
                ("experiment.yaml", exp_dir / "experiment.yaml"),
                ("README.md", exp_dir / "README.md"),
            )
            if local_path.exists()
        ]

        # Directory trees: always ground truth and seed data, results on request.
        subdirs = ["ground-truth", "seed-data"]
        if include_results:
            subdirs.append("results")
        for subdir in subdirs:
            files_to_export.extend(_collect_export_files(exp_dir, subdir))

        # Display export plan
        separator = "=" * 60
        click.echo(f"\n{separator}")
        click.echo(f"EXPORT {'PLAN' if plan else 'TO S3'}")
        click.echo(separator)
        click.echo(f"\nExperiment: {config.name}")
        click.echo(f"Agent: {config.agent_schema_ref.name}")
        click.echo(f"Task: {config.task}")
        click.echo(f"Evaluator file: {config.get_evaluator_filename()}")
        click.echo(f"\nDestination: {s3_base}/")
        click.echo(f"\nFiles to export ({len(files_to_export)}):")

        for s3_name, local_path in files_to_export:
            if plan:
                click.echo(f" {local_path}")
                click.echo(f" → {s3_base}/{s3_name}")
            else:
                click.echo(f" {s3_name}")

        if plan:
            click.echo("\n[PLAN MODE] No files were uploaded.")
            click.echo("Run without --plan to execute the export.")
            return

        # Execute export. Uploads are best-effort per file: one failure does not
        # stop the remaining files, but it is reported and fails the command.
        click.echo("\n⏳ Uploading to S3...")
        s3 = S3Provider()

        uploaded = 0
        for s3_name, local_path in files_to_export:
            try:
                s3.copy(str(local_path), f"{s3_base}/{s3_name}")
            except Exception as e:
                click.echo(f" ✗ {s3_name}: {e}")
            else:
                uploaded += 1
                click.echo(f" ✓ {s3_name}")

        failed = len(files_to_export) - uploaded
        if failed:
            # Do not report success (or exit 0) for an incomplete export.
            click.echo(
                f"\n✗ Exported {uploaded}/{len(files_to_export)} files to {s3_base}/ ({failed} failed)",
                err=True,
            )
            raise click.Abort()

        click.echo(f"\n✓ Exported {uploaded}/{len(files_to_export)} files to {s3_base}/")

        # Show next steps
        click.echo("\nNext steps:")
        click.echo(f" - View in S3: aws s3 ls {s3_base}/ --recursive")
        click.echo(f" - Download: aws s3 sync {s3_base}/ ./{config.agent_schema_ref.name}/{config.task}/")

    except click.Abort:
        # click.Abort derives from RuntimeError; without this clause the handler
        # below would catch the deliberate aborts above and print an empty
        # "Error: " line after the real message.
        raise
    except Exception as e:
        logger.error(f"Failed to export experiment: {e}")
        click.echo(f"Error: {e}", err=True)
        raise click.Abort() from e


def _collect_export_files(exp_dir: Path, subdir: str) -> list:
    """Return (s3_key, local_path) pairs for every file under ``exp_dir/subdir``.

    Keys are relative to ``exp_dir`` and always use forward slashes, so the
    S3 layout is the same regardless of the local path separator. A missing
    directory yields an empty list. Paths are sorted for a stable listing.
    """
    root = exp_dir / subdir
    if not root.exists():
        return []
    return [
        (path.relative_to(exp_dir).as_posix(), path)
        for path in sorted(root.rglob("*"))
        if path.is_file()
    ]
|