remdb 0.3.7__py3-none-any.whl → 0.3.133__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rem/__init__.py +129 -2
- rem/agentic/README.md +76 -0
- rem/agentic/__init__.py +15 -0
- rem/agentic/agents/__init__.py +16 -2
- rem/agentic/agents/sse_simulator.py +502 -0
- rem/agentic/context.py +51 -25
- rem/agentic/llm_provider_models.py +301 -0
- rem/agentic/mcp/tool_wrapper.py +112 -17
- rem/agentic/otel/setup.py +93 -4
- rem/agentic/providers/phoenix.py +314 -132
- rem/agentic/providers/pydantic_ai.py +215 -26
- rem/agentic/schema.py +361 -21
- rem/agentic/tools/rem_tools.py +3 -3
- rem/api/README.md +238 -1
- rem/api/deps.py +255 -0
- rem/api/main.py +154 -37
- rem/api/mcp_router/resources.py +1 -1
- rem/api/mcp_router/server.py +26 -5
- rem/api/mcp_router/tools.py +465 -7
- rem/api/middleware/tracking.py +172 -0
- rem/api/routers/admin.py +494 -0
- rem/api/routers/auth.py +124 -0
- rem/api/routers/chat/completions.py +402 -20
- rem/api/routers/chat/models.py +88 -10
- rem/api/routers/chat/otel_utils.py +33 -0
- rem/api/routers/chat/sse_events.py +542 -0
- rem/api/routers/chat/streaming.py +642 -45
- rem/api/routers/dev.py +81 -0
- rem/api/routers/feedback.py +268 -0
- rem/api/routers/messages.py +473 -0
- rem/api/routers/models.py +78 -0
- rem/api/routers/query.py +360 -0
- rem/api/routers/shared_sessions.py +406 -0
- rem/auth/middleware.py +126 -27
- rem/cli/commands/README.md +237 -64
- rem/cli/commands/ask.py +13 -10
- rem/cli/commands/cluster.py +1808 -0
- rem/cli/commands/configure.py +5 -6
- rem/cli/commands/db.py +396 -139
- rem/cli/commands/experiments.py +469 -74
- rem/cli/commands/process.py +22 -15
- rem/cli/commands/scaffold.py +47 -0
- rem/cli/commands/schema.py +97 -50
- rem/cli/main.py +29 -6
- rem/config.py +10 -3
- rem/models/core/core_model.py +7 -1
- rem/models/core/experiment.py +54 -0
- rem/models/core/rem_query.py +5 -2
- rem/models/entities/__init__.py +21 -0
- rem/models/entities/domain_resource.py +38 -0
- rem/models/entities/feedback.py +123 -0
- rem/models/entities/message.py +30 -1
- rem/models/entities/session.py +83 -0
- rem/models/entities/shared_session.py +180 -0
- rem/models/entities/user.py +10 -3
- rem/registry.py +373 -0
- rem/schemas/agents/rem.yaml +7 -3
- rem/services/content/providers.py +92 -133
- rem/services/content/service.py +92 -20
- rem/services/dreaming/affinity_service.py +2 -16
- rem/services/dreaming/moment_service.py +2 -15
- rem/services/embeddings/api.py +24 -17
- rem/services/embeddings/worker.py +16 -16
- rem/services/phoenix/EXPERIMENT_DESIGN.md +3 -3
- rem/services/phoenix/client.py +302 -28
- rem/services/postgres/README.md +159 -15
- rem/services/postgres/__init__.py +2 -1
- rem/services/postgres/diff_service.py +531 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +427 -129
- rem/services/postgres/repository.py +132 -0
- rem/services/postgres/schema_generator.py +291 -9
- rem/services/postgres/service.py +6 -6
- rem/services/rate_limit.py +113 -0
- rem/services/rem/README.md +14 -0
- rem/services/rem/parser.py +44 -9
- rem/services/rem/service.py +36 -2
- rem/services/session/compression.py +24 -1
- rem/services/session/reload.py +1 -1
- rem/services/user_service.py +98 -0
- rem/settings.py +399 -29
- rem/sql/background_indexes.sql +21 -16
- rem/sql/migrations/001_install.sql +387 -54
- rem/sql/migrations/002_install_models.sql +2320 -393
- rem/sql/migrations/003_optional_extensions.sql +326 -0
- rem/sql/migrations/004_cache_system.sql +548 -0
- rem/utils/__init__.py +18 -0
- rem/utils/constants.py +97 -0
- rem/utils/date_utils.py +228 -0
- rem/utils/embeddings.py +17 -4
- rem/utils/files.py +167 -0
- rem/utils/mime_types.py +158 -0
- rem/utils/model_helpers.py +156 -1
- rem/utils/schema_loader.py +282 -35
- rem/utils/sql_paths.py +146 -0
- rem/utils/sql_types.py +3 -1
- rem/utils/vision.py +9 -14
- rem/workers/README.md +14 -14
- rem/workers/__init__.py +3 -1
- rem/workers/db_listener.py +579 -0
- rem/workers/db_maintainer.py +74 -0
- rem/workers/unlogged_maintainer.py +463 -0
- {remdb-0.3.7.dist-info → remdb-0.3.133.dist-info}/METADATA +460 -303
- {remdb-0.3.7.dist-info → remdb-0.3.133.dist-info}/RECORD +105 -74
- {remdb-0.3.7.dist-info → remdb-0.3.133.dist-info}/WHEEL +1 -1
- rem/sql/002_install_models.sql +0 -1068
- rem/sql/install_models.sql +0 -1038
- {remdb-0.3.7.dist-info → remdb-0.3.133.dist-info}/entry_points.txt +0 -0
rem/cli/commands/experiments.py
CHANGED
|
@@ -63,6 +63,7 @@ def experiments():
|
|
|
63
63
|
@experiments.command("create")
|
|
64
64
|
@click.argument("name")
|
|
65
65
|
@click.option("--agent", "-a", required=True, help="Agent schema name (e.g., 'cv-parser')")
|
|
66
|
+
@click.option("--task", "-t", default="general", help="Task name for organizing experiments (e.g., 'risk-assessment')")
|
|
66
67
|
@click.option("--evaluator", "-e", default="default", help="Evaluator schema name (default: 'default')")
|
|
67
68
|
@click.option("--description", "-d", help="Experiment description")
|
|
68
69
|
@click.option("--dataset-location", type=click.Choice(["git", "s3", "hybrid"]), default="git",
|
|
@@ -74,6 +75,7 @@ def experiments():
|
|
|
74
75
|
def create(
|
|
75
76
|
name: str,
|
|
76
77
|
agent: str,
|
|
78
|
+
task: str,
|
|
77
79
|
evaluator: str,
|
|
78
80
|
description: Optional[str],
|
|
79
81
|
dataset_location: str,
|
|
@@ -170,7 +172,8 @@ def create(
|
|
|
170
172
|
# Create experiment config
|
|
171
173
|
config = ExperimentConfig(
|
|
172
174
|
name=name,
|
|
173
|
-
|
|
175
|
+
task=task,
|
|
176
|
+
description=description or f"Evaluation experiment for {agent} agent ({task} task)",
|
|
174
177
|
agent_schema_ref=SchemaReference(
|
|
175
178
|
name=agent,
|
|
176
179
|
version=None, # Use latest by default
|
|
@@ -514,6 +517,159 @@ def show(name: str, base_path: Optional[str]):
|
|
|
514
517
|
raise click.Abort()
|
|
515
518
|
|
|
516
519
|
|
|
520
|
+
# =============================================================================
|
|
521
|
+
# VIBES MODE HELPER
|
|
522
|
+
# =============================================================================
|
|
523
|
+
|
|
524
|
+
|
|
525
|
+
def _run_vibes_mode(
|
|
526
|
+
config: Any,
|
|
527
|
+
dataset_df: Any,
|
|
528
|
+
task_fn: Any,
|
|
529
|
+
base_path: str,
|
|
530
|
+
limit: Optional[int],
|
|
531
|
+
evaluator_schema_path: Path,
|
|
532
|
+
) -> None:
|
|
533
|
+
"""Run experiment in vibes mode - execute agent and export for AI evaluation.
|
|
534
|
+
|
|
535
|
+
Vibes mode runs the agent on each example and saves results to a JSONL file.
|
|
536
|
+
The AI assistant (e.g., Claude Code) then acts as the judge using the
|
|
537
|
+
evaluator schema to evaluate results.
|
|
538
|
+
|
|
539
|
+
Args:
|
|
540
|
+
config: ExperimentConfig object
|
|
541
|
+
dataset_df: Polars DataFrame with ground truth examples
|
|
542
|
+
task_fn: Function to run agent on each example
|
|
543
|
+
base_path: Base directory for experiments
|
|
544
|
+
limit: Optional limit on number of examples to process
|
|
545
|
+
evaluator_schema_path: Path to the evaluator schema YAML file
|
|
546
|
+
"""
|
|
547
|
+
from rem.utils.date_utils import format_timestamp_for_experiment, utc_now, to_iso
|
|
548
|
+
import json
|
|
549
|
+
|
|
550
|
+
# Apply limit if specified
|
|
551
|
+
if limit:
|
|
552
|
+
dataset_df = dataset_df.head(limit)
|
|
553
|
+
click.echo(f" (Limited to {limit} examples)")
|
|
554
|
+
|
|
555
|
+
# Create results directory
|
|
556
|
+
timestamp = format_timestamp_for_experiment()
|
|
557
|
+
results_dir = Path(base_path) / config.name / "results" / timestamp
|
|
558
|
+
results_dir.mkdir(parents=True, exist_ok=True)
|
|
559
|
+
|
|
560
|
+
click.echo(f"\n⏳ Running agent on {len(dataset_df)} examples...")
|
|
561
|
+
click.echo(f" Results will be saved to: {results_dir}")
|
|
562
|
+
click.echo()
|
|
563
|
+
|
|
564
|
+
# Run agent on each example and collect results
|
|
565
|
+
results = []
|
|
566
|
+
records = dataset_df.to_dicts()
|
|
567
|
+
|
|
568
|
+
for i, record in enumerate(records, 1):
|
|
569
|
+
example_id = record.get("id", i)
|
|
570
|
+
click.echo(f" [{i}/{len(records)}] Processing example {example_id}...", nl=False)
|
|
571
|
+
|
|
572
|
+
try:
|
|
573
|
+
# Prepare input for agent
|
|
574
|
+
input_text = record.get("text", record.get("input", record.get("query", "")))
|
|
575
|
+
example_input = {"query": input_text} if isinstance(input_text, str) else input_text
|
|
576
|
+
|
|
577
|
+
# Run agent
|
|
578
|
+
output = task_fn({"input": example_input})
|
|
579
|
+
|
|
580
|
+
result = {
|
|
581
|
+
"id": example_id,
|
|
582
|
+
"input": input_text,
|
|
583
|
+
"ground_truth": record.get("ground_truth", record.get("expected_output", "")),
|
|
584
|
+
"category": record.get("category", ""),
|
|
585
|
+
"agent_output": output,
|
|
586
|
+
"status": "success",
|
|
587
|
+
}
|
|
588
|
+
click.echo(" ✓")
|
|
589
|
+
|
|
590
|
+
except Exception as e:
|
|
591
|
+
result = {
|
|
592
|
+
"id": example_id,
|
|
593
|
+
"input": record.get("text", record.get("input", "")),
|
|
594
|
+
"ground_truth": record.get("ground_truth", record.get("expected_output", "")),
|
|
595
|
+
"category": record.get("category", ""),
|
|
596
|
+
"agent_output": None,
|
|
597
|
+
"status": "error",
|
|
598
|
+
"error": str(e),
|
|
599
|
+
}
|
|
600
|
+
click.echo(f" ✗ ({e})")
|
|
601
|
+
|
|
602
|
+
results.append(result)
|
|
603
|
+
|
|
604
|
+
# Save results to JSONL
|
|
605
|
+
results_file = results_dir / "vibes-results.jsonl"
|
|
606
|
+
with open(results_file, "w") as f:
|
|
607
|
+
for result in results:
|
|
608
|
+
f.write(json.dumps(result) + "\n")
|
|
609
|
+
|
|
610
|
+
# Copy evaluator schema to results dir for easy reference
|
|
611
|
+
import shutil
|
|
612
|
+
evaluator_copy = results_dir / "evaluator-schema.yaml"
|
|
613
|
+
shutil.copy(evaluator_schema_path, evaluator_copy)
|
|
614
|
+
|
|
615
|
+
# Save run metadata
|
|
616
|
+
run_info = {
|
|
617
|
+
"experiment": config.name,
|
|
618
|
+
"agent": config.agent_schema_ref.name,
|
|
619
|
+
"evaluator": config.evaluator_schema_ref.name,
|
|
620
|
+
"mode": "vibes",
|
|
621
|
+
"timestamp": timestamp,
|
|
622
|
+
"total_examples": len(records),
|
|
623
|
+
"successful": len([r for r in results if r["status"] == "success"]),
|
|
624
|
+
"failed": len([r for r in results if r["status"] == "error"]),
|
|
625
|
+
"completed_at": to_iso(utc_now()),
|
|
626
|
+
}
|
|
627
|
+
|
|
628
|
+
run_info_file = results_dir / "run-info.json"
|
|
629
|
+
with open(run_info_file, "w") as f:
|
|
630
|
+
json.dump(run_info, f, indent=2)
|
|
631
|
+
|
|
632
|
+
# Print summary and instructions
|
|
633
|
+
success_count = run_info["successful"]
|
|
634
|
+
fail_count = run_info["failed"]
|
|
635
|
+
|
|
636
|
+
click.echo(f"\n{'=' * 60}")
|
|
637
|
+
click.echo(f"VIBES MODE COMPLETE")
|
|
638
|
+
click.echo(f"{'=' * 60}")
|
|
639
|
+
click.echo(f"\nResults: {success_count} successful, {fail_count} failed")
|
|
640
|
+
click.echo(f"\nFiles saved to: {results_dir}/")
|
|
641
|
+
click.echo(f" - vibes-results.jsonl (agent outputs)")
|
|
642
|
+
click.echo(f" - evaluator-schema.yaml (evaluation criteria)")
|
|
643
|
+
click.echo(f" - run-info.json (run metadata)")
|
|
644
|
+
|
|
645
|
+
click.echo(f"\n{'=' * 60}")
|
|
646
|
+
click.echo(f"NEXT STEP: Ask your AI assistant to evaluate")
|
|
647
|
+
click.echo(f"{'=' * 60}")
|
|
648
|
+
click.echo(f"""
|
|
649
|
+
Copy this prompt to Claude Code or your AI assistant:
|
|
650
|
+
|
|
651
|
+
Please evaluate the experiment results in:
|
|
652
|
+
{results_dir}/
|
|
653
|
+
|
|
654
|
+
Read the vibes-results.jsonl file and evaluate each example
|
|
655
|
+
using the evaluator schema in evaluator-schema.yaml.
|
|
656
|
+
|
|
657
|
+
For each example, provide:
|
|
658
|
+
1. extracted_classification
|
|
659
|
+
2. exact_match (vs ground_truth)
|
|
660
|
+
3. semantic_match
|
|
661
|
+
4. reasoning_quality_score
|
|
662
|
+
5. overall_score
|
|
663
|
+
6. pass/fail
|
|
664
|
+
|
|
665
|
+
Then provide summary metrics:
|
|
666
|
+
- Exact match accuracy
|
|
667
|
+
- Semantic match accuracy
|
|
668
|
+
- Average overall score
|
|
669
|
+
- Pass rate
|
|
670
|
+
""")
|
|
671
|
+
|
|
672
|
+
|
|
517
673
|
# =============================================================================
|
|
518
674
|
# RUN COMMAND
|
|
519
675
|
# =============================================================================
|
|
@@ -524,6 +680,8 @@ def show(name: str, base_path: Optional[str]):
|
|
|
524
680
|
@click.option("--base-path", help="Base directory for experiments (default: EXPERIMENTS_HOME or 'experiments')")
|
|
525
681
|
@click.option("--version", help="Git tag version to load (e.g., 'experiments/my-exp/v1.0.0')")
|
|
526
682
|
@click.option("--dry-run", is_flag=True, help="Test on small subset without saving")
|
|
683
|
+
@click.option("--only-vibes", is_flag=True, help="Run agent locally, export results for AI evaluation (no Phoenix)")
|
|
684
|
+
@click.option("--limit", "-n", type=int, help="Limit number of examples to evaluate (useful with --only-vibes)")
|
|
527
685
|
@click.option("--update-prompts", is_flag=True, help="Update prompts in Phoenix before running")
|
|
528
686
|
@click.option("--phoenix-url", help="Phoenix server URL (overrides PHOENIX_BASE_URL env var)")
|
|
529
687
|
@click.option("--phoenix-api-key", help="Phoenix API key (overrides PHOENIX_API_KEY env var)")
|
|
@@ -532,14 +690,45 @@ def run(
|
|
|
532
690
|
base_path: Optional[str],
|
|
533
691
|
version: Optional[str],
|
|
534
692
|
dry_run: bool,
|
|
693
|
+
only_vibes: bool,
|
|
694
|
+
limit: Optional[int],
|
|
535
695
|
update_prompts: bool,
|
|
536
696
|
phoenix_url: Optional[str],
|
|
537
697
|
phoenix_api_key: Optional[str],
|
|
538
698
|
):
|
|
539
|
-
"""Run an experiment using Phoenix provider.
|
|
699
|
+
"""Run an experiment using Phoenix provider or local vibes mode.
|
|
540
700
|
|
|
541
701
|
Loads configuration, executes agent and evaluator, saves results.
|
|
542
702
|
|
|
703
|
+
Vibes Mode (--only-vibes):
|
|
704
|
+
Run agent locally without Phoenix infrastructure. Agent outputs are saved
|
|
705
|
+
to a JSONL file along with the evaluator schema. Your AI assistant (e.g.,
|
|
706
|
+
Claude Code) then acts as the judge to evaluate results.
|
|
707
|
+
|
|
708
|
+
This enables seamless switching between:
|
|
709
|
+
- Local evaluation: Quick iteration with AI-as-judge
|
|
710
|
+
- Phoenix evaluation: Production metrics and dashboards
|
|
711
|
+
|
|
712
|
+
Usage:
|
|
713
|
+
rem experiments run my-experiment --only-vibes
|
|
714
|
+
rem experiments run my-experiment --only-vibes --limit 5
|
|
715
|
+
|
|
716
|
+
The command will:
|
|
717
|
+
1. Run the agent on each ground-truth example
|
|
718
|
+
2. Save results to results/{timestamp}/vibes-results.jsonl
|
|
719
|
+
3. Print the evaluator prompt and schema
|
|
720
|
+
4. Instruct you to ask your AI assistant to evaluate
|
|
721
|
+
|
|
722
|
+
Example workflow with Claude Code:
|
|
723
|
+
$ rem experiments run mental-health-classifier --only-vibes --limit 3
|
|
724
|
+
# ... agent runs ...
|
|
725
|
+
# Results saved to: .experiments/mental-health-classifier/results/20241203-143022/
|
|
726
|
+
|
|
727
|
+
# Then ask Claude Code:
|
|
728
|
+
"Please evaluate the experiment results in
|
|
729
|
+
.experiments/mental-health-classifier/results/20241203-143022/
|
|
730
|
+
using the evaluator schema provided"
|
|
731
|
+
|
|
543
732
|
Phoenix Connection:
|
|
544
733
|
Commands respect PHOENIX_BASE_URL and PHOENIX_API_KEY environment variables.
|
|
545
734
|
Defaults to localhost:6006 for local development.
|
|
@@ -562,6 +751,12 @@ def run(
|
|
|
562
751
|
# Run experiment with latest schemas
|
|
563
752
|
rem experiments run hello-world-validation
|
|
564
753
|
|
|
754
|
+
# Quick local evaluation (vibes mode)
|
|
755
|
+
rem experiments run hello-world-validation --only-vibes
|
|
756
|
+
|
|
757
|
+
# Vibes mode with limited examples
|
|
758
|
+
rem experiments run hello-world-validation --only-vibes --limit 5
|
|
759
|
+
|
|
565
760
|
# Run specific version
|
|
566
761
|
rem experiments run hello-world-validation \\
|
|
567
762
|
--version experiments/hello-world-validation/v1.0.0
|
|
@@ -578,8 +773,7 @@ def run(
|
|
|
578
773
|
from rem.services.git import GitService
|
|
579
774
|
from rem.services.phoenix import PhoenixClient
|
|
580
775
|
from rem.agentic.providers.phoenix import create_evaluator_from_schema
|
|
581
|
-
from
|
|
582
|
-
import pandas as pd
|
|
776
|
+
from rem.utils.date_utils import utc_now, to_iso, format_timestamp_for_experiment
|
|
583
777
|
import os
|
|
584
778
|
|
|
585
779
|
try:
|
|
@@ -615,36 +809,22 @@ def run(
|
|
|
615
809
|
click.echo(f" Mode: DRY RUN (no data will be saved)")
|
|
616
810
|
click.echo()
|
|
617
811
|
|
|
618
|
-
# Load agent schema
|
|
812
|
+
# Load agent schema using centralized schema loader
|
|
619
813
|
agent_name = config.agent_schema_ref.name
|
|
620
814
|
agent_version = config.agent_schema_ref.version
|
|
621
815
|
|
|
622
816
|
click.echo(f"Loading agent schema: {agent_name} (version: {agent_version or 'latest'})")
|
|
623
817
|
|
|
624
|
-
|
|
625
|
-
agent_schema = None
|
|
626
|
-
try:
|
|
627
|
-
git_svc = GitService()
|
|
628
|
-
agent_schema = git_svc.load_schema(agent_name, version=agent_version)
|
|
629
|
-
click.echo(f"✓ Loaded agent schema from Git")
|
|
630
|
-
except Exception as e:
|
|
631
|
-
logger.debug(f"Git not available, trying filesystem: {e}")
|
|
632
|
-
|
|
633
|
-
# Fallback to local filesystem
|
|
634
|
-
from rem.services.fs import FS
|
|
635
|
-
fs = FS()
|
|
818
|
+
from rem.utils.schema_loader import load_agent_schema
|
|
636
819
|
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
click.echo(f" Tried filesystem: {schema_path}")
|
|
646
|
-
click.echo(f" Make sure the schema exists")
|
|
647
|
-
raise click.Abort()
|
|
820
|
+
try:
|
|
821
|
+
agent_schema = load_agent_schema(agent_name)
|
|
822
|
+
click.echo(f"✓ Loaded agent schema: {agent_name}")
|
|
823
|
+
except FileNotFoundError as e:
|
|
824
|
+
logger.error(f"Failed to load agent schema: {e}")
|
|
825
|
+
click.echo(f"Error: Could not load agent schema '{agent_name}'")
|
|
826
|
+
click.echo(f" {e}")
|
|
827
|
+
raise click.Abort()
|
|
648
828
|
|
|
649
829
|
# Create agent function from schema
|
|
650
830
|
from rem.agentic.providers.pydantic_ai import create_agent
|
|
@@ -683,73 +863,97 @@ def run(
|
|
|
683
863
|
return {"output": serialized}
|
|
684
864
|
return serialized if isinstance(serialized, dict) else {"output": str(serialized)}
|
|
685
865
|
|
|
686
|
-
# Load evaluator schema
|
|
866
|
+
# Load evaluator schema using centralized schema loader
|
|
687
867
|
evaluator_name = config.evaluator_schema_ref.name
|
|
688
868
|
evaluator_version = config.evaluator_schema_ref.version
|
|
689
869
|
|
|
690
|
-
# Resolve evaluator path (evaluators are organized by agent name)
|
|
691
|
-
evaluator_schema_path = f"rem/schemas/evaluators/{agent_name}/{evaluator_name}.yaml"
|
|
692
|
-
|
|
693
870
|
click.echo(f"Loading evaluator: {evaluator_name} for agent {agent_name}")
|
|
694
871
|
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
click.echo(f"
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
872
|
+
# Find evaluator schema file path
|
|
873
|
+
from rem.utils.schema_loader import get_evaluator_schema_path
|
|
874
|
+
|
|
875
|
+
evaluator_schema_path = get_evaluator_schema_path(evaluator_name)
|
|
876
|
+
if not evaluator_schema_path or not evaluator_schema_path.exists():
|
|
877
|
+
click.echo(f"Error: Could not find evaluator schema '{evaluator_name}'")
|
|
878
|
+
raise click.Abort()
|
|
879
|
+
|
|
880
|
+
click.echo(f"✓ Found evaluator schema: {evaluator_schema_path}")
|
|
881
|
+
|
|
882
|
+
# For Phoenix mode, also load evaluator function
|
|
883
|
+
evaluator_fn = None
|
|
884
|
+
if not only_vibes:
|
|
885
|
+
# Try multiple evaluator path patterns (agent-specific, then generic)
|
|
886
|
+
evaluator_paths_to_try = [
|
|
887
|
+
f"{agent_name}/{evaluator_name}", # e.g., hello-world/default
|
|
888
|
+
f"{agent_name}-{evaluator_name}", # e.g., hello-world-default
|
|
889
|
+
evaluator_name, # e.g., default (generic)
|
|
890
|
+
]
|
|
891
|
+
|
|
892
|
+
evaluator_load_error = None
|
|
893
|
+
|
|
894
|
+
for evaluator_path in evaluator_paths_to_try:
|
|
895
|
+
try:
|
|
896
|
+
evaluator_fn = create_evaluator_from_schema(
|
|
897
|
+
evaluator_schema_path=evaluator_path,
|
|
898
|
+
model_name=None, # Use default from schema
|
|
899
|
+
)
|
|
900
|
+
click.echo(f"✓ Loaded evaluator function: {evaluator_path}")
|
|
901
|
+
break
|
|
902
|
+
except FileNotFoundError as e:
|
|
903
|
+
evaluator_load_error = e
|
|
904
|
+
logger.debug(f"Evaluator not found at {evaluator_path}: {e}")
|
|
905
|
+
continue
|
|
906
|
+
except Exception as e:
|
|
907
|
+
evaluator_load_error = e
|
|
908
|
+
logger.warning(f"Failed to load evaluator from {evaluator_path}: {e}")
|
|
909
|
+
continue
|
|
910
|
+
|
|
911
|
+
if evaluator_fn is None and not only_vibes:
|
|
912
|
+
click.echo(f"Error: Could not load evaluator function '{evaluator_name}'")
|
|
913
|
+
click.echo(f" Tried paths: {evaluator_paths_to_try}")
|
|
914
|
+
if evaluator_load_error:
|
|
915
|
+
click.echo(f" Last error: {evaluator_load_error}")
|
|
706
916
|
raise click.Abort()
|
|
707
917
|
|
|
708
|
-
# Load dataset
|
|
918
|
+
# Load dataset using Polars
|
|
919
|
+
import polars as pl
|
|
920
|
+
|
|
709
921
|
click.echo(f"Loading dataset: {list(config.datasets.keys())[0]}")
|
|
710
922
|
dataset_ref = list(config.datasets.values())[0]
|
|
711
923
|
|
|
712
924
|
if dataset_ref.location.value == "git":
|
|
713
|
-
# Load from Git
|
|
925
|
+
# Load from Git (local filesystem)
|
|
714
926
|
dataset_path = Path(base_path) / name / dataset_ref.path
|
|
715
927
|
if not dataset_path.exists():
|
|
716
928
|
click.echo(f"Error: Dataset not found: {dataset_path}")
|
|
717
929
|
raise click.Abort()
|
|
718
930
|
|
|
719
931
|
if dataset_ref.format == "csv":
|
|
720
|
-
dataset_df =
|
|
932
|
+
dataset_df = pl.read_csv(dataset_path)
|
|
721
933
|
elif dataset_ref.format == "parquet":
|
|
722
|
-
dataset_df =
|
|
934
|
+
dataset_df = pl.read_parquet(dataset_path)
|
|
723
935
|
elif dataset_ref.format == "jsonl":
|
|
724
|
-
dataset_df =
|
|
936
|
+
dataset_df = pl.read_ndjson(dataset_path)
|
|
725
937
|
else:
|
|
726
938
|
click.echo(f"Error: Format '{dataset_ref.format}' not yet supported")
|
|
727
939
|
raise click.Abort()
|
|
728
940
|
elif dataset_ref.location.value in ["s3", "hybrid"]:
|
|
729
941
|
# Load from S3 using FS provider
|
|
730
942
|
from rem.services.fs import FS
|
|
943
|
+
from io import BytesIO
|
|
731
944
|
|
|
732
945
|
fs = FS()
|
|
733
946
|
|
|
734
947
|
try:
|
|
735
948
|
if dataset_ref.format == "csv":
|
|
736
949
|
content = fs.read(dataset_ref.path)
|
|
737
|
-
|
|
738
|
-
dataset_df = pd.read_csv(StringIO(content))
|
|
950
|
+
dataset_df = pl.read_csv(BytesIO(content.encode() if isinstance(content, str) else content))
|
|
739
951
|
elif dataset_ref.format == "parquet":
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp:
|
|
743
|
-
tmp_path = tmp.name
|
|
744
|
-
# Download via FS
|
|
745
|
-
content_bytes = fs.read(dataset_ref.path)
|
|
746
|
-
tmp.write(content_bytes)
|
|
747
|
-
dataset_df = pd.read_parquet(tmp_path)
|
|
748
|
-
Path(tmp_path).unlink() # Clean up temp file
|
|
952
|
+
content_bytes = fs.read(dataset_ref.path)
|
|
953
|
+
dataset_df = pl.read_parquet(BytesIO(content_bytes if isinstance(content_bytes, bytes) else content_bytes.encode()))
|
|
749
954
|
elif dataset_ref.format == "jsonl":
|
|
750
955
|
content = fs.read(dataset_ref.path)
|
|
751
|
-
|
|
752
|
-
dataset_df = pd.read_json(StringIO(content), lines=True)
|
|
956
|
+
dataset_df = pl.read_ndjson(BytesIO(content.encode() if isinstance(content, str) else content))
|
|
753
957
|
else:
|
|
754
958
|
click.echo(f"Error: Format '{dataset_ref.format}' not yet supported")
|
|
755
959
|
raise click.Abort()
|
|
@@ -772,6 +976,18 @@ def run(
|
|
|
772
976
|
# TODO: Implement prompt updating
|
|
773
977
|
click.echo("⚠ --update-prompts not yet implemented")
|
|
774
978
|
|
|
979
|
+
# Vibes mode: run agent and export for AI evaluation
|
|
980
|
+
if only_vibes:
|
|
981
|
+
_run_vibes_mode(
|
|
982
|
+
config=config,
|
|
983
|
+
dataset_df=dataset_df,
|
|
984
|
+
task_fn=task_fn,
|
|
985
|
+
base_path=base_path,
|
|
986
|
+
limit=limit,
|
|
987
|
+
evaluator_schema_path=evaluator_schema_path,
|
|
988
|
+
)
|
|
989
|
+
return
|
|
990
|
+
|
|
775
991
|
# Run experiment via Phoenix
|
|
776
992
|
if not dry_run:
|
|
777
993
|
# Create Phoenix client with optional overrides
|
|
@@ -793,13 +1009,13 @@ def run(
|
|
|
793
1009
|
|
|
794
1010
|
client = PhoenixClient(config=phoenix_config)
|
|
795
1011
|
|
|
796
|
-
experiment_name = f"{config.name}-{
|
|
1012
|
+
experiment_name = f"{config.name}-{format_timestamp_for_experiment()}"
|
|
797
1013
|
|
|
798
1014
|
click.echo(f"\n⏳ Running experiment: {experiment_name}")
|
|
799
1015
|
click.echo(f" This may take several minutes...")
|
|
800
1016
|
|
|
801
1017
|
experiment = client.run_experiment(
|
|
802
|
-
dataset=dataset_df,
|
|
1018
|
+
dataset=dataset_df,
|
|
803
1019
|
task=task_fn,
|
|
804
1020
|
evaluators=[evaluator_fn],
|
|
805
1021
|
experiment_name=experiment_name,
|
|
@@ -809,12 +1025,15 @@ def run(
|
|
|
809
1025
|
"evaluator": config.evaluator_schema_ref.name,
|
|
810
1026
|
"experiment_config": config.name,
|
|
811
1027
|
**config.metadata
|
|
812
|
-
}
|
|
1028
|
+
},
|
|
1029
|
+
# Smart column detection for DataFrame -> Phoenix Dataset conversion
|
|
1030
|
+
input_keys=["input"] if "input" in dataset_df.columns else None,
|
|
1031
|
+
output_keys=["expected_output"] if "expected_output" in dataset_df.columns else None,
|
|
813
1032
|
)
|
|
814
1033
|
|
|
815
1034
|
# Update experiment status
|
|
816
1035
|
config.status = ExperimentStatus.COMPLETED
|
|
817
|
-
config.last_run_at =
|
|
1036
|
+
config.last_run_at = utc_now()
|
|
818
1037
|
if not version: # Only save if not loading from Git
|
|
819
1038
|
config.save(base_path)
|
|
820
1039
|
|
|
@@ -835,7 +1054,7 @@ def run(
|
|
|
835
1054
|
"agent": config.agent_schema_ref.name,
|
|
836
1055
|
"evaluator": config.evaluator_schema_ref.name,
|
|
837
1056
|
"dataset_size": len(dataset_df),
|
|
838
|
-
"completed_at":
|
|
1057
|
+
"completed_at": to_iso(utc_now()),
|
|
839
1058
|
"phoenix_url": getattr(experiment, "url", None),
|
|
840
1059
|
"task_runs": len(exp_data.get("task_runs", [])),
|
|
841
1060
|
}
|
|
@@ -1015,20 +1234,24 @@ def dataset_add(
|
|
|
1015
1234
|
--output-keys expected_label,expected_type
|
|
1016
1235
|
"""
|
|
1017
1236
|
from rem.services.phoenix import PhoenixClient
|
|
1018
|
-
import
|
|
1237
|
+
import polars as pl
|
|
1019
1238
|
|
|
1020
1239
|
try:
|
|
1021
1240
|
client = PhoenixClient()
|
|
1022
1241
|
|
|
1023
|
-
# Load CSV
|
|
1024
|
-
df =
|
|
1242
|
+
# Load CSV with Polars
|
|
1243
|
+
df = pl.read_csv(from_csv)
|
|
1244
|
+
records = df.to_dicts()
|
|
1025
1245
|
|
|
1026
1246
|
# Extract data
|
|
1027
|
-
|
|
1028
|
-
|
|
1247
|
+
input_cols = input_keys.split(",")
|
|
1248
|
+
output_cols = output_keys.split(",")
|
|
1249
|
+
inputs = [{k: row.get(k) for k in input_cols} for row in records]
|
|
1250
|
+
outputs = [{k: row.get(k) for k in output_cols} for row in records]
|
|
1029
1251
|
metadata = None
|
|
1030
1252
|
if metadata_keys:
|
|
1031
|
-
|
|
1253
|
+
meta_cols = metadata_keys.split(",")
|
|
1254
|
+
metadata = [{k: row.get(k) for k in meta_cols} for row in records]
|
|
1032
1255
|
|
|
1033
1256
|
# Add to dataset
|
|
1034
1257
|
dataset = client.add_examples_to_dataset(
|
|
@@ -1269,12 +1492,12 @@ def trace_list(
|
|
|
1269
1492
|
rem experiments trace list --project rem-agents --days 7 --limit 50
|
|
1270
1493
|
"""
|
|
1271
1494
|
from rem.services.phoenix import PhoenixClient
|
|
1272
|
-
from
|
|
1495
|
+
from rem.utils.date_utils import days_ago
|
|
1273
1496
|
|
|
1274
1497
|
try:
|
|
1275
1498
|
client = PhoenixClient()
|
|
1276
1499
|
|
|
1277
|
-
start_time =
|
|
1500
|
+
start_time = days_ago(days)
|
|
1278
1501
|
|
|
1279
1502
|
traces_df = client.get_traces(
|
|
1280
1503
|
project_name=project,
|
|
@@ -1300,3 +1523,175 @@ def trace_list(
|
|
|
1300
1523
|
logger.error(f"Failed to list traces: {e}")
|
|
1301
1524
|
click.echo(f"Error: {e}", err=True)
|
|
1302
1525
|
raise click.Abort()
|
|
1526
|
+
|
|
1527
|
+
|
|
1528
|
+
# =============================================================================
|
|
1529
|
+
# EXPORT COMMAND
|
|
1530
|
+
# =============================================================================
|
|
1531
|
+
|
|
1532
|
+
|
|
1533
|
+
@experiments.command("export")
|
|
1534
|
+
@click.argument("name")
|
|
1535
|
+
@click.option("--base-path", help="Base directory for experiments (default: EXPERIMENTS_HOME or 'experiments')")
|
|
1536
|
+
@click.option("--bucket", "-b", help="S3 bucket name (default: DATA_LAKE__BUCKET_NAME)")
|
|
1537
|
+
@click.option("--version", "-v", default="v0", help="Data lake version prefix (default: v0)")
|
|
1538
|
+
@click.option("--plan", is_flag=True, help="Show what would be exported without uploading")
|
|
1539
|
+
@click.option("--include-results", is_flag=True, help="Include results directory in export")
|
|
1540
|
+
def export(
    name: str,
    base_path: Optional[str],
    bucket: Optional[str],
    version: str,
    plan: bool,
    include_results: bool,
):
    """Export experiment to S3 data lake.

    Exports experiment configuration, ground truth, and optionally results
    to the S3 data lake following the convention:

        s3://{bucket}/{version}/datasets/calibration/experiments/{agent}/{task}/

    The export includes:
    - experiment.yaml (configuration)
    - README.md (documentation)
    - ground-truth/ (evaluation datasets)
    - seed-data/ (optional seed data)
    - results/ (optional, with --include-results)

    Examples:
        # Preview what would be exported
        rem experiments export my-experiment --plan

        # Export to configured data lake bucket
        rem experiments export my-experiment

        # Export to specific bucket
        rem experiments export my-experiment --bucket siggy-data

        # Include results in export
        rem experiments export my-experiment --include-results

        # Export with custom version prefix
        rem experiments export my-experiment --version v1
    """
    # Parameter notes live here rather than in the docstring because click
    # uses a command's docstring as its --help text:
    #   name            - experiment directory name, looked up under base_path
    #   base_path       - experiments root; defaults to $EXPERIMENTS_HOME,
    #                     then to "experiments"
    #   bucket          - destination bucket; defaults to
    #                     settings.data_lake.bucket_name
    #   version         - forwarded with the bucket to
    #                     config.get_s3_export_path()
    #   plan            - print the local -> S3 mapping and upload nothing
    #   include_results - also export every file under results/
    # Raises click.Abort when the experiment or bucket cannot be resolved,
    # when any upload fails, or when an unexpected error occurs.
    from rem.models.core.experiment import ExperimentConfig
    from rem.settings import settings
    from rem.services.fs.s3_provider import S3Provider
    import os

    try:
        # Resolve base path
        if base_path is None:
            base_path = os.getenv("EXPERIMENTS_HOME", "experiments")

        # Load experiment configuration
        config_path = Path(base_path) / name / "experiment.yaml"
        if not config_path.exists():
            click.echo(f"Experiment not found: {name}")
            click.echo(f" Looked in: {config_path}")
            raise click.Abort()

        config = ExperimentConfig.from_yaml(config_path)
        click.echo(f"✓ Loaded experiment: {name}")

        # Resolve bucket. An empty string is treated like "not provided" so
        # we never build an "s3:///..." destination.
        if not bucket:
            bucket = settings.data_lake.bucket_name
        if not bucket:
            click.echo("Error: No S3 bucket configured.")
            click.echo(" Set DATA_LAKE__BUCKET_NAME environment variable or use --bucket option")
            raise click.Abort()

        # Build S3 paths
        s3_base = config.get_s3_export_path(bucket, version)
        exp_dir = config.get_experiment_dir(base_path)

        # Collect (s3 key relative to s3_base, local path) pairs.
        files_to_export = []

        # Top-level files are exported whenever they exist.
        for filename in ("experiment.yaml", "README.md"):
            local_path = exp_dir / filename
            if local_path.exists():
                files_to_export.append((filename, local_path))

        # Directories are exported recursively; results/ only on request.
        subdirs = ["ground-truth", "seed-data"]
        if include_results:
            subdirs.append("results")

        for subdir in subdirs:
            root = exp_dir / subdir
            if not root.exists():
                continue
            for f in root.rglob("*"):
                if f.is_file():
                    # as_posix() keeps S3 keys "/"-separated on Windows,
                    # where str(path) would use backslashes.
                    files_to_export.append((f.relative_to(exp_dir).as_posix(), f))

        # Display export plan
        click.echo(f"\n{'=' * 60}")
        click.echo(f"EXPORT {'PLAN' if plan else 'TO S3'}")
        click.echo(f"{'=' * 60}")
        click.echo(f"\nExperiment: {config.name}")
        click.echo(f"Agent: {config.agent_schema_ref.name}")
        click.echo(f"Task: {config.task}")
        click.echo(f"Evaluator file: {config.get_evaluator_filename()}")
        click.echo(f"\nDestination: {s3_base}/")
        click.echo(f"\nFiles to export ({len(files_to_export)}):")

        for s3_name, local_path in files_to_export:
            if plan:
                click.echo(f" {local_path}")
                click.echo(f" → {s3_base}/{s3_name}")
            else:
                click.echo(f" {s3_name}")

        if plan:
            click.echo("\n[PLAN MODE] No files were uploaded.")
            click.echo("Run without --plan to execute the export.")
            return

        # Execute export
        click.echo("\n⏳ Uploading to S3...")
        s3 = S3Provider()

        uploaded = 0
        for s3_name, local_path in files_to_export:
            s3_uri = f"{s3_base}/{s3_name}"
            try:
                s3.copy(str(local_path), s3_uri)
            except Exception as e:
                # Best effort: report this file and keep uploading the rest.
                click.echo(f" ✗ {s3_name}: {e}")
            else:
                uploaded += 1
                click.echo(f" ✓ {s3_name}")

        failed = len(files_to_export) - uploaded
        status = "✗" if failed else "✓"
        click.echo(f"\n{status} Exported {uploaded}/{len(files_to_export)} files to {s3_base}/")

        if failed:
            # A partial export must not look like success to scripts/CI.
            click.echo(f"Error: {failed} file(s) failed to upload.", err=True)
            raise click.Abort()

        # Show next steps
        click.echo("\nNext steps:")
        click.echo(f" - View in S3: aws s3 ls {s3_base}/ --recursive")
        click.echo(f" - Download: aws s3 sync {s3_base}/ ./{config.agent_schema_ref.name}/{config.task}/")

    except click.Abort:
        # click.Abort subclasses RuntimeError, so without this clause the
        # handler below would catch our own deliberate aborts and print an
        # empty "Error: " line. The cause was already reported above.
        raise
    except Exception as e:
        logger.error(f"Failed to export experiment: {e}")
        click.echo(f"Error: {e}", err=True)
        raise click.Abort() from e
|