remdb-0.3.242-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- rem/__init__.py +129 -0
- rem/agentic/README.md +760 -0
- rem/agentic/__init__.py +54 -0
- rem/agentic/agents/README.md +155 -0
- rem/agentic/agents/__init__.py +38 -0
- rem/agentic/agents/agent_manager.py +311 -0
- rem/agentic/agents/sse_simulator.py +502 -0
- rem/agentic/context.py +425 -0
- rem/agentic/context_builder.py +360 -0
- rem/agentic/llm_provider_models.py +301 -0
- rem/agentic/mcp/__init__.py +0 -0
- rem/agentic/mcp/tool_wrapper.py +273 -0
- rem/agentic/otel/__init__.py +5 -0
- rem/agentic/otel/setup.py +240 -0
- rem/agentic/providers/phoenix.py +926 -0
- rem/agentic/providers/pydantic_ai.py +854 -0
- rem/agentic/query.py +117 -0
- rem/agentic/query_helper.py +89 -0
- rem/agentic/schema.py +737 -0
- rem/agentic/serialization.py +245 -0
- rem/agentic/tools/__init__.py +5 -0
- rem/agentic/tools/rem_tools.py +242 -0
- rem/api/README.md +657 -0
- rem/api/deps.py +253 -0
- rem/api/main.py +460 -0
- rem/api/mcp_router/prompts.py +182 -0
- rem/api/mcp_router/resources.py +820 -0
- rem/api/mcp_router/server.py +243 -0
- rem/api/mcp_router/tools.py +1605 -0
- rem/api/middleware/tracking.py +172 -0
- rem/api/routers/admin.py +520 -0
- rem/api/routers/auth.py +898 -0
- rem/api/routers/chat/__init__.py +5 -0
- rem/api/routers/chat/child_streaming.py +394 -0
- rem/api/routers/chat/completions.py +702 -0
- rem/api/routers/chat/json_utils.py +76 -0
- rem/api/routers/chat/models.py +202 -0
- rem/api/routers/chat/otel_utils.py +33 -0
- rem/api/routers/chat/sse_events.py +546 -0
- rem/api/routers/chat/streaming.py +950 -0
- rem/api/routers/chat/streaming_utils.py +327 -0
- rem/api/routers/common.py +18 -0
- rem/api/routers/dev.py +87 -0
- rem/api/routers/feedback.py +276 -0
- rem/api/routers/messages.py +620 -0
- rem/api/routers/models.py +86 -0
- rem/api/routers/query.py +362 -0
- rem/api/routers/shared_sessions.py +422 -0
- rem/auth/README.md +258 -0
- rem/auth/__init__.py +36 -0
- rem/auth/jwt.py +367 -0
- rem/auth/middleware.py +318 -0
- rem/auth/providers/__init__.py +16 -0
- rem/auth/providers/base.py +376 -0
- rem/auth/providers/email.py +215 -0
- rem/auth/providers/google.py +163 -0
- rem/auth/providers/microsoft.py +237 -0
- rem/cli/README.md +517 -0
- rem/cli/__init__.py +8 -0
- rem/cli/commands/README.md +299 -0
- rem/cli/commands/__init__.py +3 -0
- rem/cli/commands/ask.py +549 -0
- rem/cli/commands/cluster.py +1808 -0
- rem/cli/commands/configure.py +495 -0
- rem/cli/commands/db.py +828 -0
- rem/cli/commands/dreaming.py +324 -0
- rem/cli/commands/experiments.py +1698 -0
- rem/cli/commands/mcp.py +66 -0
- rem/cli/commands/process.py +388 -0
- rem/cli/commands/query.py +109 -0
- rem/cli/commands/scaffold.py +47 -0
- rem/cli/commands/schema.py +230 -0
- rem/cli/commands/serve.py +106 -0
- rem/cli/commands/session.py +453 -0
- rem/cli/dreaming.py +363 -0
- rem/cli/main.py +123 -0
- rem/config.py +244 -0
- rem/mcp_server.py +41 -0
- rem/models/core/__init__.py +49 -0
- rem/models/core/core_model.py +70 -0
- rem/models/core/engram.py +333 -0
- rem/models/core/experiment.py +672 -0
- rem/models/core/inline_edge.py +132 -0
- rem/models/core/rem_query.py +246 -0
- rem/models/entities/__init__.py +68 -0
- rem/models/entities/domain_resource.py +38 -0
- rem/models/entities/feedback.py +123 -0
- rem/models/entities/file.py +57 -0
- rem/models/entities/image_resource.py +88 -0
- rem/models/entities/message.py +64 -0
- rem/models/entities/moment.py +123 -0
- rem/models/entities/ontology.py +181 -0
- rem/models/entities/ontology_config.py +131 -0
- rem/models/entities/resource.py +95 -0
- rem/models/entities/schema.py +87 -0
- rem/models/entities/session.py +84 -0
- rem/models/entities/shared_session.py +180 -0
- rem/models/entities/subscriber.py +175 -0
- rem/models/entities/user.py +93 -0
- rem/py.typed +0 -0
- rem/registry.py +373 -0
- rem/schemas/README.md +507 -0
- rem/schemas/__init__.py +6 -0
- rem/schemas/agents/README.md +92 -0
- rem/schemas/agents/core/agent-builder.yaml +235 -0
- rem/schemas/agents/core/moment-builder.yaml +178 -0
- rem/schemas/agents/core/rem-query-agent.yaml +226 -0
- rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
- rem/schemas/agents/core/simple-assistant.yaml +19 -0
- rem/schemas/agents/core/user-profile-builder.yaml +163 -0
- rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
- rem/schemas/agents/examples/contract-extractor.yaml +134 -0
- rem/schemas/agents/examples/cv-parser.yaml +263 -0
- rem/schemas/agents/examples/hello-world.yaml +37 -0
- rem/schemas/agents/examples/query.yaml +54 -0
- rem/schemas/agents/examples/simple.yaml +21 -0
- rem/schemas/agents/examples/test.yaml +29 -0
- rem/schemas/agents/rem.yaml +132 -0
- rem/schemas/evaluators/hello-world/default.yaml +77 -0
- rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
- rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
- rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
- rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
- rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
- rem/services/__init__.py +18 -0
- rem/services/audio/INTEGRATION.md +308 -0
- rem/services/audio/README.md +376 -0
- rem/services/audio/__init__.py +15 -0
- rem/services/audio/chunker.py +354 -0
- rem/services/audio/transcriber.py +259 -0
- rem/services/content/README.md +1269 -0
- rem/services/content/__init__.py +5 -0
- rem/services/content/providers.py +760 -0
- rem/services/content/service.py +762 -0
- rem/services/dreaming/README.md +230 -0
- rem/services/dreaming/__init__.py +53 -0
- rem/services/dreaming/affinity_service.py +322 -0
- rem/services/dreaming/moment_service.py +251 -0
- rem/services/dreaming/ontology_service.py +54 -0
- rem/services/dreaming/user_model_service.py +297 -0
- rem/services/dreaming/utils.py +39 -0
- rem/services/email/__init__.py +10 -0
- rem/services/email/service.py +522 -0
- rem/services/email/templates.py +360 -0
- rem/services/embeddings/__init__.py +11 -0
- rem/services/embeddings/api.py +127 -0
- rem/services/embeddings/worker.py +435 -0
- rem/services/fs/README.md +662 -0
- rem/services/fs/__init__.py +62 -0
- rem/services/fs/examples.py +206 -0
- rem/services/fs/examples_paths.py +204 -0
- rem/services/fs/git_provider.py +935 -0
- rem/services/fs/local_provider.py +760 -0
- rem/services/fs/parsing-hooks-examples.md +172 -0
- rem/services/fs/paths.py +276 -0
- rem/services/fs/provider.py +460 -0
- rem/services/fs/s3_provider.py +1042 -0
- rem/services/fs/service.py +186 -0
- rem/services/git/README.md +1075 -0
- rem/services/git/__init__.py +17 -0
- rem/services/git/service.py +469 -0
- rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
- rem/services/phoenix/README.md +453 -0
- rem/services/phoenix/__init__.py +46 -0
- rem/services/phoenix/client.py +960 -0
- rem/services/phoenix/config.py +88 -0
- rem/services/phoenix/prompt_labels.py +477 -0
- rem/services/postgres/README.md +757 -0
- rem/services/postgres/__init__.py +49 -0
- rem/services/postgres/diff_service.py +599 -0
- rem/services/postgres/migration_service.py +427 -0
- rem/services/postgres/programmable_diff_service.py +635 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +562 -0
- rem/services/postgres/register_type.py +353 -0
- rem/services/postgres/repository.py +481 -0
- rem/services/postgres/schema_generator.py +661 -0
- rem/services/postgres/service.py +802 -0
- rem/services/postgres/sql_builder.py +355 -0
- rem/services/rate_limit.py +113 -0
- rem/services/rem/README.md +318 -0
- rem/services/rem/__init__.py +23 -0
- rem/services/rem/exceptions.py +71 -0
- rem/services/rem/executor.py +293 -0
- rem/services/rem/parser.py +180 -0
- rem/services/rem/queries.py +196 -0
- rem/services/rem/query.py +371 -0
- rem/services/rem/service.py +608 -0
- rem/services/session/README.md +374 -0
- rem/services/session/__init__.py +13 -0
- rem/services/session/compression.py +488 -0
- rem/services/session/pydantic_messages.py +310 -0
- rem/services/session/reload.py +85 -0
- rem/services/user_service.py +130 -0
- rem/settings.py +1877 -0
- rem/sql/background_indexes.sql +52 -0
- rem/sql/migrations/001_install.sql +983 -0
- rem/sql/migrations/002_install_models.sql +3157 -0
- rem/sql/migrations/003_optional_extensions.sql +326 -0
- rem/sql/migrations/004_cache_system.sql +282 -0
- rem/sql/migrations/005_schema_update.sql +145 -0
- rem/sql/migrations/migrate_session_id_to_uuid.sql +45 -0
- rem/utils/AGENTIC_CHUNKING.md +597 -0
- rem/utils/README.md +628 -0
- rem/utils/__init__.py +61 -0
- rem/utils/agentic_chunking.py +622 -0
- rem/utils/batch_ops.py +343 -0
- rem/utils/chunking.py +108 -0
- rem/utils/clip_embeddings.py +276 -0
- rem/utils/constants.py +97 -0
- rem/utils/date_utils.py +228 -0
- rem/utils/dict_utils.py +98 -0
- rem/utils/embeddings.py +436 -0
- rem/utils/examples/embeddings_example.py +305 -0
- rem/utils/examples/sql_types_example.py +202 -0
- rem/utils/files.py +323 -0
- rem/utils/markdown.py +16 -0
- rem/utils/mime_types.py +158 -0
- rem/utils/model_helpers.py +492 -0
- rem/utils/schema_loader.py +649 -0
- rem/utils/sql_paths.py +146 -0
- rem/utils/sql_types.py +350 -0
- rem/utils/user_id.py +81 -0
- rem/utils/vision.py +325 -0
- rem/workers/README.md +506 -0
- rem/workers/__init__.py +7 -0
- rem/workers/db_listener.py +579 -0
- rem/workers/db_maintainer.py +74 -0
- rem/workers/dreaming.py +502 -0
- rem/workers/engram_processor.py +312 -0
- rem/workers/sqs_file_processor.py +193 -0
- rem/workers/unlogged_maintainer.py +463 -0
- remdb-0.3.242.dist-info/METADATA +1632 -0
- remdb-0.3.242.dist-info/RECORD +235 -0
- remdb-0.3.242.dist-info/WHEEL +4 -0
- remdb-0.3.242.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,1698 @@
"""
Experiment management CLI commands.

Experiments use ExperimentConfig (rem/models/core/experiment.py) for configuration
and support Git+S3 hybrid storage. Includes dataset, prompt, and trace management.

Directory Structure:
    experiments/{experiment-name}/
    ├── experiment.yaml        # ExperimentConfig (metadata, agent ref, evaluator ref)
    ├── README.md              # Auto-generated documentation
    ├── ground-truth/          # Evaluation datasets (Q&A pairs)
    │   ├── dataset.csv        # Input/output pairs for evaluation
    │   └── dataset.yaml       # Alternative YAML format
    ├── seed-data/             # Data to seed REM before running experiments
    │   └── data.yaml          # Users, resources, moments in REM format
    └── results/               # Experiment results and metrics
        └── {run-timestamp}/   # Each run gets its own timestamped folder
            ├── metrics.json   # Summary metrics
            └── run_info.json  # Run metadata (eval framework URLs, etc)

Environment Variables:
    EXPERIMENTS_HOME: Override default experiment directory (default: "experiments")

Commands:
    # Experiment lifecycle
    rem experiments create <name> --agent <agent> --evaluator <evaluator>
    rem experiments list
    rem experiments show <name>
    rem experiments run <name> [--version <tag>]

    # Dataset management
    rem experiments dataset list
    rem experiments dataset create <name> --from-csv data.csv
    rem experiments dataset add <name> --from-csv data.csv

    # Prompt management
    rem experiments prompt list
    rem experiments prompt create <name> --system-prompt "..."

    # Trace retrieval
    rem experiments trace list --project <name>
"""

import asyncio
from pathlib import Path
from typing import Any, Optional, cast

import click
from loguru import logger


@click.group()
def experiments():
    """Experiment configuration and execution commands."""
    pass


# =============================================================================
# CREATE COMMAND
# =============================================================================


@experiments.command("create")
@click.argument("name")
@click.option("--agent", "-a", required=True, help="Agent schema name (e.g., 'cv-parser')")
@click.option("--task", "-t", default="general", help="Task name for organizing experiments (e.g., 'risk-assessment')")
@click.option("--evaluator", "-e", default="default", help="Evaluator schema name (default: 'default')")
@click.option("--description", "-d", help="Experiment description")
@click.option("--dataset-location", type=click.Choice(["git", "s3", "hybrid"]), default="git",
              help="Where to store datasets")
@click.option("--results-location", type=click.Choice(["git", "s3", "hybrid"]), default="git",
              help="Where to store results")
@click.option("--tags", help="Comma-separated tags (e.g., 'production,cv-parser')")
@click.option("--base-path", help="Base directory for experiments (default: EXPERIMENTS_HOME or 'experiments')")
def create(
    name: str,
    agent: str,
    task: str,
    evaluator: str,
    description: Optional[str],
    dataset_location: str,
    results_location: str,
    tags: Optional[str],
    base_path: Optional[str],
):
    """Create a new experiment configuration.

    Creates directory structure and generates experiment.yaml and README.md.

    The experiment directory will contain:
    - ground-truth/: Q&A pairs for evaluation
    - seed-data/: REM data (users, resources, moments) to load before running
    - results/: Timestamped run results

    Examples:
        # Small experiment (Git-only)
        rem experiments create hello-world-validation \\
            --agent hello-world \\
            --evaluator default \\
            --description "Smoke test for hello-world agent"

        # Large experiment (Hybrid storage)
        rem experiments create cv-parser-production \\
            --agent cv-parser \\
            --evaluator default \\
            --description "Production CV parser evaluation" \\
            --dataset-location s3 \\
            --results-location hybrid \\
            --tags "production,cv-parser,weekly"

        # Custom location
        EXPERIMENTS_HOME=/path/to/experiments rem experiments create my-test --agent my-agent
    """
    from rem.models.core.experiment import (
        ExperimentConfig,
        DatasetLocation,
        DatasetReference,
        SchemaReference,
        ResultsConfig,
        ExperimentStatus,
    )
    import os

    try:
        # Resolve base path: CLI arg > EXPERIMENTS_HOME env var > default "experiments"
        if base_path is None:
            base_path = os.getenv("EXPERIMENTS_HOME", "experiments")
        # Build dataset reference (format auto-detected from file extension)
        if dataset_location == "git":
            dataset_ref = DatasetReference(
                location=DatasetLocation.GIT,
                path="ground-truth/dataset.csv",
                description="Ground truth Q&A dataset for evaluation"
            )
        else:  # s3 or hybrid
            dataset_ref = DatasetReference(
                location=DatasetLocation(dataset_location),
                path=f"s3://rem-experiments/{name}/datasets/ground_truth.parquet",
                schema_path="datasets/schema.yaml" if dataset_location == "hybrid" else None,
                description="Ground truth dataset for evaluation"
            )

        # Build results config
        if results_location == "git":
            results_config = ResultsConfig(
                location=DatasetLocation.GIT,
                base_path="results/",
                save_traces=False,
                save_metrics_summary=True
            )
        elif results_location == "s3":
            results_config = ResultsConfig(
                location=DatasetLocation.S3,
                base_path=f"s3://rem-experiments/{name}/results/",
                save_traces=True,
                save_metrics_summary=False
            )
        else:  # hybrid
            results_config = ResultsConfig(
                location=DatasetLocation.HYBRID,
                base_path=f"s3://rem-experiments/{name}/results/",
                save_traces=True,
                save_metrics_summary=True,
                metrics_file="metrics.json"
            )

        # Parse tags
        tag_list = [t.strip() for t in tags.split(",")] if tags else []

        # Create experiment config
        config = ExperimentConfig(
            name=name,
            task=task,
            description=description or f"Evaluation experiment for {agent} agent ({task} task)",
            agent_schema_ref=SchemaReference(
                name=agent,
                version=None,  # Use latest by default
                type="agent"
            ),
            evaluator_schema_ref=SchemaReference(
                name=evaluator,
                type="evaluator"
            ),
            datasets={"ground_truth": dataset_ref},
            results=results_config,
            status=ExperimentStatus.DRAFT,
            tags=tag_list
        )

        # Save configuration
        config_path = config.save(base_path)
        readme_path = config.save_readme(base_path)

        # Create new directory structure
        exp_dir = config.get_experiment_dir(base_path)

        # Create ground-truth directory
        ground_truth_dir = exp_dir / "ground-truth"
        ground_truth_dir.mkdir(parents=True, exist_ok=True)

        # Create seed-data directory
        seed_data_dir = exp_dir / "seed-data"
        seed_data_dir.mkdir(parents=True, exist_ok=True)

        # Create results directory if Git-based
        if results_location == "git":
            results_dir = exp_dir / "results"
            results_dir.mkdir(parents=True, exist_ok=True)

        # Create placeholder files with documentation
        ground_truth_readme = ground_truth_dir / "README.md"
        ground_truth_readme.write_text("""# Ground Truth Dataset

This directory contains Q&A pairs for evaluating the agent.

## Format

**CSV format** (`dataset.csv`):
```csv
input,expected_output,metadata
"What is the capital of France?","Paris","{\"difficulty\": \"easy\"}"
```

**YAML format** (`dataset.yaml`):
```yaml
- input: "What is the capital of France?"
  expected_output: "Paris"
  metadata:
    difficulty: easy
```

## Generating Ground Truth

### Using AI Assistants

AI coding assistants (like Claude, GPT-4, etc.) can help generate comprehensive ground-truth datasets:

1. **Generate from existing examples**: Show the assistant examples from your domain and ask it to create similar Q&A pairs
2. **Create challenging questions**: Ask the assistant to act as a judge and generate HARD questions that test edge cases
3. **Vary difficulty levels**: Request a mix of easy, medium, and hard questions with appropriate metadata tags

Example prompt:
```
Based on these example documents about [your domain], generate 20 Q&A pairs
for evaluating an agent. Include:
- 5 easy factual questions
- 10 medium questions requiring reasoning
- 5 hard questions with edge cases
Format as CSV with difficulty and category metadata.
```

### Ground Truth as Judge

**Important**: Keep ground-truth data **separate** from the agent being tested:
- Ground truth should be hidden from the agent during evaluation
- The agent should only see the `input` field
- The evaluator compares agent output against `expected_output`
- This ensures unbiased evaluation

### Quality Guidelines

1. **Diverse Coverage**: Include various question types and difficulty levels
2. **Domain-Specific**: Use terminology and scenarios from your actual use case
3. **Metadata Tags**: Add difficulty, category, priority for analysis
4. **SME Review**: Have domain experts validate expected outputs

## Usage

These datasets can be:
- Loaded into evaluation frameworks (Arize Phoenix, etc.)
- Used for regression testing
- Converted to different formats as needed

The experiment runner will automatically use this data for evaluation.
""")

        seed_data_readme = seed_data_dir / "README.md"
        seed_data_readme.write_text("""# Seed Data

This directory contains REM data to load before running the experiment.

## Format

Use standard REM YAML format:

```yaml
users:
  - id: test-user-001
    user_id: experiment-test
    email: test@example.com

resources:
  - id: resource-001
    user_id: experiment-test
    label: example-document
    content: "Document content here..."

moments:
  - id: moment-001
    user_id: experiment-test
    label: example-meeting
    starts_timestamp: "2024-01-15T14:00:00"
```

## Generating Seed Data

### Using AI Assistants

AI coding assistants can help generate realistic seed data for your experiments:

1. **From existing datasets**: Reference examples from the `datasets/` directory
2. **Domain-specific scenarios**: Describe your use case and ask for appropriate test data
3. **Anonymized versions**: Ask to create fictional data based on real patterns

Example prompt:
```
Based on the recruitment dataset examples in datasets/domains/recruitment/,
generate seed data for testing a CV parser agent. Include:
- 3 test users
- 5 CV documents (resources) with varied experience levels
- 2 interview moment entries
Use fictional names and anonymize all content.
```

### Best Practices

1. **Minimal**: Only include data necessary for the ground-truth questions to be answerable
2. **Anonymized**: Always use fictional names, companies, and content
3. **Relevant**: Seed data should provide context for evaluation questions
4. **Versioned**: Track changes to seed data in Git for reproducibility

## Usage

Load this data before running experiments:
```bash
rem db load --file seed-data/data.yaml --user-id experiment-test
```

This ensures your agent has the necessary context for evaluation.
""")

        click.echo(f"\n✓ Created experiment: {name}")
        click.echo(f"  Configuration: {config_path}")
        click.echo(f"  Documentation: {readme_path}")
        click.echo(f"  Ground Truth: {ground_truth_dir}")
        click.echo(f"  Seed Data: {seed_data_dir}")
        if results_location == "git":
            click.echo(f"  Results: {results_dir}")
        click.echo(f"\nNext steps:")
        click.echo(f"  1. Add ground truth Q&A to {ground_truth_dir}/dataset.csv")
        click.echo(f"  2. Add seed data to {seed_data_dir}/data.yaml (optional)")
        click.echo(f"  3. Review configuration: {config_path}")
        click.echo(f"  4. Run experiment: rem experiments run {name}")
        click.echo(f"  5. Commit to Git: git add {base_path}/{name}/ && git commit")

    except Exception as e:
        logger.error(f"Failed to create experiment: {e}")
        click.echo(f"Error: {e}", err=True)
        raise click.Abort()
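
# Editor's sketch (not part of the original module): loading a saved experiment
# back programmatically, mirroring what the list/show commands below do with
# ExperimentConfig.from_yaml. The path layout follows the structure created by
# `create` above; treat the helper itself as an illustrative assumption, not
# package API.
def _load_experiment_sketch(name: str, base_path: str = "experiments"):
    from rem.models.core.experiment import ExperimentConfig
    return ExperimentConfig.from_yaml(Path(base_path) / name / "experiment.yaml")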


# =============================================================================
# LIST COMMAND
# =============================================================================


@experiments.command("list")
@click.option("--base-path", help="Base directory for experiments (default: EXPERIMENTS_HOME or 'experiments')")
@click.option("--status", help="Filter by status (draft, ready, completed, etc.)")
@click.option("--tags", help="Filter by tags (comma-separated)")
def list_experiments(
    base_path: Optional[str],
    status: Optional[str],
    tags: Optional[str],
):
    """List all experiments.

    Examples:
        rem experiments list
        rem experiments list --status ready
        rem experiments list --tags production,cv-parser
    """
    from rem.models.core.experiment import ExperimentConfig, ExperimentStatus
    import os

    try:
        # Resolve base path
        if base_path is None:
            base_path = os.getenv("EXPERIMENTS_HOME", "experiments")

        experiments_dir = Path(base_path)
        if not experiments_dir.exists():
            click.echo(f"No experiments directory found at {base_path}")
            return

        # Find all experiment.yaml files
        configs = []
        for exp_dir in experiments_dir.iterdir():
            if not exp_dir.is_dir() or exp_dir.name.startswith("."):
                continue

            config_file = exp_dir / "experiment.yaml"
            if config_file.exists():
                try:
                    config = ExperimentConfig.from_yaml(config_file)
                    configs.append(config)
                except Exception as e:
                    logger.warning(f"Failed to load {config_file}: {e}")

        # Apply filters
        if status:
            status_enum = ExperimentStatus(status)
            configs = [c for c in configs if c.status == status_enum]

        if tags:
            filter_tags = set(t.strip().lower() for t in tags.split(","))
            configs = [c for c in configs if filter_tags & set(c.tags)]

        if not configs:
            click.echo("No experiments found")
            return

        # Sort by updated_at descending
        configs.sort(key=lambda c: c.updated_at, reverse=True)

        # Display table
        click.echo(f"\nExperiments ({len(configs)} total):\n")
        click.echo(f"{'Name':<30} {'Status':<12} {'Agent':<20} {'Updated':<12}")
        click.echo("-" * 75)

        for config in configs:
            name = config.name[:30]
            status_str = config.status.value[:12]
            agent = config.agent_schema_ref.name[:20]
            updated = config.updated_at.strftime("%Y-%m-%d")
            click.echo(f"{name:<30} {status_str:<12} {agent:<20} {updated:<12}")

    except Exception as e:
        logger.error(f"Failed to list experiments: {e}")
        click.echo(f"Error: {e}", err=True)
        raise click.Abort()
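
# Editor's note (observation on the tag filter above, not a fix): --tags values
# are lowercased before matching, while config.tags are compared verbatim, so a
# tag stored with uppercase letters never matches a filter. The create command's
# examples all use lowercase tags, which sidesteps the asymmetry.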


# =============================================================================
# SHOW COMMAND
# =============================================================================


@experiments.command("show")
@click.argument("name")
@click.option("--base-path", help="Base directory for experiments (default: EXPERIMENTS_HOME or 'experiments')")
def show(name: str, base_path: Optional[str]):
    """Show experiment details.

    Examples:
        rem experiments show hello-world-validation
    """
    from rem.models.core.experiment import ExperimentConfig
    import os

    try:
        # Resolve base path
        if base_path is None:
            base_path = os.getenv("EXPERIMENTS_HOME", "experiments")

        config_path = Path(base_path) / name / "experiment.yaml"
        if not config_path.exists():
            click.echo(f"Experiment not found: {name}")
            click.echo(f"  Looked in: {config_path}")
            raise click.Abort()

        config = ExperimentConfig.from_yaml(config_path)

        click.echo(f"\nExperiment: {config.name}")
        click.echo(f"{'=' * 60}\n")
        click.echo(f"Description: {config.description}")
        click.echo(f"Status: {config.status.value}")
        if config.tags:
            click.echo(f"Tags: {', '.join(config.tags)}")

        click.echo(f"\nAgent Schema:")
        click.echo(f"  Name: {config.agent_schema_ref.name}")
        click.echo(f"  Version: {config.agent_schema_ref.version or 'latest'}")

        click.echo(f"\nEvaluator Schema:")
        click.echo(f"  Name: {config.evaluator_schema_ref.name}")

        click.echo(f"\nDatasets:")
        for ds_name, ds_ref in config.datasets.items():
            click.echo(f"  {ds_name}:")
            click.echo(f"    Location: {ds_ref.location.value}")
            click.echo(f"    Path: {ds_ref.path}")
            click.echo(f"    Format: {ds_ref.format}")

        click.echo(f"\nResults:")
        click.echo(f"  Location: {config.results.location.value}")
        click.echo(f"  Base Path: {config.results.base_path}")
        click.echo(f"  Save Traces: {config.results.save_traces}")
        click.echo(f"  Metrics File: {config.results.metrics_file}")

        click.echo(f"\nTimestamps:")
        click.echo(f"  Created: {config.created_at.isoformat()}")
        click.echo(f"  Updated: {config.updated_at.isoformat()}")
        if config.last_run_at:
            click.echo(f"  Last Run: {config.last_run_at.isoformat()}")

        if config.metadata:
            click.echo(f"\nMetadata:")
            for key, value in config.metadata.items():
                click.echo(f"  {key}: {value}")

    except Exception as e:
        logger.error(f"Failed to show experiment: {e}")
        click.echo(f"Error: {e}", err=True)
        raise click.Abort()


# =============================================================================
# VIBES MODE HELPER
# =============================================================================


def _run_vibes_mode(
    config: Any,
    dataset_df: Any,
    task_fn: Any,
    base_path: str,
    limit: Optional[int],
    evaluator_schema_path: Path,
) -> None:
    """Run experiment in vibes mode - execute agent and export for AI evaluation.

    Vibes mode runs the agent on each example and saves results to a JSONL file.
    The AI assistant (e.g., Claude Code) then acts as the judge using the
    evaluator schema to evaluate results.

    Args:
        config: ExperimentConfig object
        dataset_df: Polars DataFrame with ground truth examples
        task_fn: Function to run agent on each example
        base_path: Base directory for experiments
        limit: Optional limit on number of examples to process
        evaluator_schema_path: Path to the evaluator schema YAML file
    """
    from rem.utils.date_utils import format_timestamp_for_experiment, utc_now, to_iso
    import json

    # Apply limit if specified
    if limit:
        dataset_df = dataset_df.head(limit)
        click.echo(f"  (Limited to {limit} examples)")

    # Create results directory
    timestamp = format_timestamp_for_experiment()
    results_dir = Path(base_path) / config.name / "results" / timestamp
    results_dir.mkdir(parents=True, exist_ok=True)

    click.echo(f"\n⏳ Running agent on {len(dataset_df)} examples...")
    click.echo(f"  Results will be saved to: {results_dir}")
    click.echo()

    # Run agent on each example and collect results
    results = []
    records = dataset_df.to_dicts()

    for i, record in enumerate(records, 1):
        example_id = record.get("id", i)
        click.echo(f"  [{i}/{len(records)}] Processing example {example_id}...", nl=False)

        try:
            # Prepare input for agent
            input_text = record.get("text", record.get("input", record.get("query", "")))
            example_input = {"query": input_text} if isinstance(input_text, str) else input_text

            # Run agent
            output = task_fn({"input": example_input})

            result = {
                "id": example_id,
                "input": input_text,
                "ground_truth": record.get("ground_truth", record.get("expected_output", "")),
                "category": record.get("category", ""),
                "agent_output": output,
                "status": "success",
            }
            click.echo(" ✓")

        except Exception as e:
            result = {
                "id": example_id,
                "input": record.get("text", record.get("input", "")),
                "ground_truth": record.get("ground_truth", record.get("expected_output", "")),
                "category": record.get("category", ""),
                "agent_output": None,
                "status": "error",
                "error": str(e),
            }
            click.echo(f" ✗ ({e})")

        results.append(result)

    # Save results to JSONL
    results_file = results_dir / "vibes-results.jsonl"
    with open(results_file, "w") as f:
        for result in results:
            f.write(json.dumps(result) + "\n")

    # Copy evaluator schema to results dir for easy reference
    import shutil
    evaluator_copy = results_dir / "evaluator-schema.yaml"
    shutil.copy(evaluator_schema_path, evaluator_copy)

    # Save run metadata
    run_info = {
        "experiment": config.name,
        "agent": config.agent_schema_ref.name,
        "evaluator": config.evaluator_schema_ref.name,
        "mode": "vibes",
        "timestamp": timestamp,
        "total_examples": len(records),
        "successful": len([r for r in results if r["status"] == "success"]),
        "failed": len([r for r in results if r["status"] == "error"]),
        "completed_at": to_iso(utc_now()),
    }

    run_info_file = results_dir / "run-info.json"
    with open(run_info_file, "w") as f:
        json.dump(run_info, f, indent=2)

    # Print summary and instructions
    success_count = run_info["successful"]
    fail_count = run_info["failed"]

    click.echo(f"\n{'=' * 60}")
    click.echo(f"VIBES MODE COMPLETE")
    click.echo(f"{'=' * 60}")
    click.echo(f"\nResults: {success_count} successful, {fail_count} failed")
    click.echo(f"\nFiles saved to: {results_dir}/")
    click.echo(f"  - vibes-results.jsonl (agent outputs)")
    click.echo(f"  - evaluator-schema.yaml (evaluation criteria)")
    click.echo(f"  - run-info.json (run metadata)")

    click.echo(f"\n{'=' * 60}")
    click.echo(f"NEXT STEP: Ask your AI assistant to evaluate")
    click.echo(f"{'=' * 60}")
    click.echo(f"""
Copy this prompt to Claude Code or your AI assistant:

    Please evaluate the experiment results in:
    {results_dir}/

    Read the vibes-results.jsonl file and evaluate each example
    using the evaluator schema in evaluator-schema.yaml.

    For each example, provide:
    1. extracted_classification
    2. exact_match (vs ground_truth)
    3. semantic_match
    4. reasoning_quality_score
    5. overall_score
    6. pass/fail

    Then provide summary metrics:
    - Exact match accuracy
    - Semantic match accuracy
    - Average overall score
    - Pass rate
""")
|
|
669
|
+
|
|
670
|
+
|
|
671
|
+
# =============================================================================
|
|
672
|
+
# RUN COMMAND
|
|
673
|
+
# =============================================================================
|
|
674
|
+
|
|
675
|
+
|
|
676
|
+
@experiments.command("run")
|
|
677
|
+
@click.argument("name")
|
|
678
|
+
@click.option("--base-path", help="Base directory for experiments (default: EXPERIMENTS_HOME or 'experiments')")
|
|
679
|
+
@click.option("--version", help="Git tag version to load (e.g., 'experiments/my-exp/v1.0.0')")
|
|
680
|
+
@click.option("--dry-run", is_flag=True, help="Test on small subset without saving")
|
|
681
|
+
@click.option("--only-vibes", is_flag=True, help="Run agent locally, export results for AI evaluation (no Phoenix)")
|
|
682
|
+
@click.option("--limit", "-n", type=int, help="Limit number of examples to evaluate (useful with --only-vibes)")
|
|
683
|
+
@click.option("--update-prompts", is_flag=True, help="Update prompts in Phoenix before running")
|
|
684
|
+
@click.option("--phoenix-url", help="Phoenix server URL (overrides PHOENIX_BASE_URL env var)")
|
|
685
|
+
@click.option("--phoenix-api-key", help="Phoenix API key (overrides PHOENIX_API_KEY env var)")
|
|
686
|
+
def run(
|
|
687
|
+
name: str,
|
|
688
|
+
base_path: Optional[str],
|
|
689
|
+
version: Optional[str],
|
|
690
|
+
dry_run: bool,
|
|
691
|
+
only_vibes: bool,
|
|
692
|
+
limit: Optional[int],
|
|
693
|
+
update_prompts: bool,
|
|
694
|
+
phoenix_url: Optional[str],
|
|
695
|
+
phoenix_api_key: Optional[str],
|
|
696
|
+
):
|
|
697
|
+
"""Run an experiment using Phoenix provider or local vibes mode.
|
|
698
|
+
|
|
699
|
+
Loads configuration, executes agent and evaluator, saves results.
|
|
700
|
+
|
|
701
|
+
Vibes Mode (--only-vibes):
|
|
702
|
+
Run agent locally without Phoenix infrastructure. Agent outputs are saved
|
|
703
|
+
to a JSONL file along with the evaluator schema. Your AI assistant (e.g.,
|
|
704
|
+
Claude Code) then acts as the judge to evaluate results.
|
|
705
|
+
|
|
706
|
+
This enables seamless switching between:
|
|
707
|
+
- Local evaluation: Quick iteration with AI-as-judge
|
|
708
|
+
- Phoenix evaluation: Production metrics and dashboards
|
|
709
|
+
|
|
710
|
+
Usage:
|
|
711
|
+
rem experiments run my-experiment --only-vibes
|
|
712
|
+
rem experiments run my-experiment --only-vibes --limit 5
|
|
713
|
+
|
|
714
|
+
The command will:
|
|
715
|
+
1. Run the agent on each ground-truth example
|
|
716
|
+
2. Save results to results/{timestamp}/vibes-results.jsonl
|
|
717
|
+
3. Print the evaluator prompt and schema
|
|
718
|
+
4. Instruct you to ask your AI assistant to evaluate
|
|
719
|
+
|
|
720
|
+
Example workflow with Claude Code:
|
|
721
|
+
$ rem experiments run mental-health-classifier --only-vibes --limit 3
|
|
722
|
+
# ... agent runs ...
|
|
723
|
+
# Results saved to: .experiments/mental-health-classifier/results/20241203-143022/
|
|
724
|
+
|
|
725
|
+
# Then ask Claude Code:
|
|
726
|
+
"Please evaluate the experiment results in
|
|
727
|
+
.experiments/mental-health-classifier/results/20241203-143022/
|
|
728
|
+
using the evaluator schema provided"
|
|
729
|
+
|
|
730
|
+
Phoenix Connection:
|
|
731
|
+
Commands respect PHOENIX_BASE_URL and PHOENIX_API_KEY environment variables.
|
|
732
|
+
Defaults to localhost:6006 for local development.
|
|
733
|
+
|
|
734
|
+
Production (on cluster):
|
|
735
|
+
export PHOENIX_BASE_URL=http://phoenix-svc.observability.svc.cluster.local:6006
|
|
736
|
+
export PHOENIX_API_KEY=<your-key>
|
|
737
|
+
kubectl exec -it deployment/rem-api -- rem experiments run my-experiment
|
|
738
|
+
|
|
739
|
+
Development (port-forward):
|
|
740
|
+
kubectl port-forward -n observability svc/phoenix-svc 6006:6006
|
|
741
|
+
export PHOENIX_API_KEY=<your-key>
|
|
742
|
+
rem experiments run my-experiment
|
|
743
|
+
|
|
744
|
+
Local (local Phoenix):
|
|
745
|
+
python -m phoenix.server.main serve
|
|
746
|
+
rem experiments run my-experiment
|
|
747
|
+
|
|
748
|
+
Examples:
|
|
749
|
+
# Run experiment with latest schemas
|
|
750
|
+
rem experiments run hello-world-validation
|
|
751
|
+
|
|
752
|
+
# Quick local evaluation (vibes mode)
|
|
753
|
+
rem experiments run hello-world-validation --only-vibes
|
|
754
|
+
|
|
755
|
+
# Vibes mode with limited examples
|
|
756
|
+
rem experiments run hello-world-validation --only-vibes --limit 5
|
|
757
|
+
|
|
758
|
+
# Run specific version
|
|
759
|
+
rem experiments run hello-world-validation \\
|
|
760
|
+
--version experiments/hello-world-validation/v1.0.0
|
|
761
|
+
|
|
762
|
+
# Dry run (test without saving)
|
|
763
|
+
rem experiments run cv-parser-production --dry-run
|
|
764
|
+
|
|
765
|
+
# Override Phoenix connection
|
|
766
|
+
rem experiments run my-experiment \\
|
|
767
|
+
--phoenix-url http://phoenix.example.com:6006 \\
|
|
768
|
+
--phoenix-api-key <key>
|
|
769
|
+
"""
|
|
770
|
+
from rem.models.core.experiment import ExperimentConfig, ExperimentStatus
|
|
771
|
+
from rem.services.git import GitService
|
|
772
|
+
from rem.services.phoenix import PhoenixClient
|
|
773
|
+
from rem.agentic.providers.phoenix import create_evaluator_from_schema
|
|
774
|
+
from rem.utils.date_utils import utc_now, to_iso, format_timestamp_for_experiment
|
|
775
|
+
import os
|
|
776
|
+
|
|
777
|
+
try:
|
|
778
|
+
# Resolve base path
|
|
779
|
+
if base_path is None:
|
|
780
|
+
base_path = os.getenv("EXPERIMENTS_HOME", "experiments")
|
|
781
|
+
|
|
782
|
+
# Load experiment configuration
|
|
783
|
+
if version:
|
|
784
|
+
# Load from Git at specific version
|
|
785
|
+
git_svc = GitService()
|
|
786
|
+
config_yaml = git_svc.fs.read(
|
|
787
|
+
f"git://rem/.experiments/{name}/experiment.yaml?ref={version}"
|
|
788
|
+
)
|
|
789
|
+
config = ExperimentConfig(**config_yaml)
|
|
790
|
+
click.echo(f"✓ Loaded experiment from Git: {version}")
|
|
791
|
+
else:
|
|
792
|
+
# Load from local filesystem
|
|
793
|
+
config_path = Path(base_path) / name / "experiment.yaml"
|
|
794
|
+
if not config_path.exists():
|
|
795
|
+
click.echo(f"Experiment not found: {name}")
|
|
796
|
+
click.echo(f" Looked in: {config_path}")
|
|
797
|
+
raise click.Abort()
|
|
798
|
+
config = ExperimentConfig.from_yaml(config_path)
|
|
799
|
+
click.echo(f"✓ Loaded experiment: {name}")
|
|
800
|
+
|
|
801
|
+
# Display experiment info
|
|
802
|
+
click.echo(f"\nExperiment: {config.name}")
|
|
803
|
+
click.echo(f" Agent: {config.agent_schema_ref.name} (version: {config.agent_schema_ref.version or 'latest'})")
|
|
804
|
+
click.echo(f" Evaluator: {config.evaluator_schema_ref.name}")
|
|
805
|
+
click.echo(f" Status: {config.status.value}")
|
|
806
|
+
if dry_run:
|
|
807
|
+
click.echo(f" Mode: DRY RUN (no data will be saved)")
|
|
808
|
+
click.echo()
|
|
809
|
+
|
|
810
|
+
# Load agent schema using centralized schema loader
|
|
811
|
+
agent_name = config.agent_schema_ref.name
|
|
812
|
+
agent_version = config.agent_schema_ref.version
|
|
813
|
+
|
|
814
|
+
click.echo(f"Loading agent schema: {agent_name} (version: {agent_version or 'latest'})")
|
|
815
|
+
|
|
816
|
+
from rem.utils.schema_loader import load_agent_schema
|
|
817
|
+
|
|
818
|
+
try:
|
|
819
|
+
agent_schema = load_agent_schema(agent_name)
|
|
820
|
+
click.echo(f"✓ Loaded agent schema: {agent_name}")
|
|
821
|
+
except FileNotFoundError as e:
|
|
822
|
+
logger.error(f"Failed to load agent schema: {e}")
|
|
823
|
+
click.echo(f"Error: Could not load agent schema '{agent_name}'")
|
|
824
|
+
click.echo(f" {e}")
|
|
825
|
+
raise click.Abort()
|
|
826
|
+
|
|
827
|
+
# Create agent function from schema
|
|
828
|
+
from rem.agentic.providers.pydantic_ai import create_agent
|
|
829
|
+
from rem.agentic.context import AgentContext
|
|
830
|
+
|
|
831
|
+
# Create agent context
|
|
832
|
+
context = AgentContext(
|
|
833
|
+
user_id="experiment-runner",
|
|
834
|
+
tenant_id="experiments",
|
|
835
|
+
session_id=f"experiment-{config.name}",
|
|
836
|
+
)
|
|
837
|
+
|
|
838
|
+
agent_runtime = asyncio.run(create_agent(
|
|
839
|
+
context=context,
|
|
840
|
+
agent_schema_override=agent_schema
|
|
841
|
+
))
|
|
842
|
+
|
|
843
|
+
def task_fn(example: dict[str, Any]) -> dict[str, Any]:
|
|
844
|
+
"""Run agent on example."""
|
|
845
|
+
input_data = example.get("input", {})
|
|
846
|
+
|
|
847
|
+
# Extract query from input
|
|
848
|
+
query = input_data.get("query", "")
|
|
849
|
+
if not query:
|
|
850
|
+
# Try other common input keys
|
|
851
|
+
query = input_data.get("text", input_data.get("prompt", str(input_data)))
|
|
852
|
+
|
|
853
|
+
# Run agent
|
|
854
|
+
result = asyncio.run(agent_runtime.run(query))
|
|
855
|
+
|
|
856
|
+
# Serialize result (critical for Pydantic models!)
|
|
857
|
+
from rem.agentic.serialization import serialize_agent_result
|
|
858
|
+
serialized = serialize_agent_result(result)
|
|
859
|
+
# Ensure we return a dict (Phoenix expects dict output)
|
|
860
|
+
if isinstance(serialized, str):
|
|
861
|
+
return {"output": serialized}
|
|
862
|
+
return serialized if isinstance(serialized, dict) else {"output": str(serialized)}
|
|
863
|
+
|
|
864
|
+
# Load evaluator schema using centralized schema loader
|
|
865
|
+
evaluator_name = config.evaluator_schema_ref.name
|
|
866
|
+
evaluator_version = config.evaluator_schema_ref.version
|
|
867
|
+
|
|
868
|
+
click.echo(f"Loading evaluator: {evaluator_name} for agent {agent_name}")
|
|
869
|
+
|
|
870
|
+
# Find evaluator schema file path
|
|
871
|
+
from rem.utils.schema_loader import get_evaluator_schema_path
|
|
872
|
+
|
|
873
|
+
evaluator_schema_path = get_evaluator_schema_path(evaluator_name)
|
|
874
|
+
if not evaluator_schema_path or not evaluator_schema_path.exists():
|
|
875
|
+
click.echo(f"Error: Could not find evaluator schema '{evaluator_name}'")
|
|
876
|
+
raise click.Abort()
|
|
877
|
+
|
|
878
|
+
click.echo(f"✓ Found evaluator schema: {evaluator_schema_path}")
|
|
879
|
+
|
|
880
|
+
# For Phoenix mode, also load evaluator function
|
|
881
|
+
evaluator_fn = None
|
|
882
|
+
if not only_vibes:
|
|
883
|
+
# Try multiple evaluator path patterns (agent-specific, then generic)
|
|
884
|
+
evaluator_paths_to_try = [
|
|
885
|
+
f"{agent_name}/{evaluator_name}", # e.g., hello-world/default
|
|
886
|
+
f"{agent_name}-{evaluator_name}", # e.g., hello-world-default
|
|
887
|
+
evaluator_name, # e.g., default (generic)
|
|
888
|
+
]
|
|
889
|
+
|
|
890
|
+
evaluator_load_error = None
|
|
891
|
+
|
|
892
|
+
for evaluator_path in evaluator_paths_to_try:
|
|
893
|
+
try:
|
|
894
|
+
evaluator_fn = create_evaluator_from_schema(
|
|
895
|
+
evaluator_schema_path=evaluator_path,
|
|
896
|
+
model_name=None, # Use default from schema
|
|
897
|
+
)
|
|
898
|
+
click.echo(f"✓ Loaded evaluator function: {evaluator_path}")
|
|
899
|
+
break
|
|
900
|
+
except FileNotFoundError as e:
|
|
901
|
+
evaluator_load_error = e
|
|
902
|
+
logger.debug(f"Evaluator not found at {evaluator_path}: {e}")
|
|
903
|
+
continue
|
|
904
|
+
except Exception as e:
|
|
905
|
+
evaluator_load_error = e
|
|
906
|
+
logger.warning(f"Failed to load evaluator from {evaluator_path}: {e}")
|
|
907
|
+
continue
|
|
908
|
+
|
|
909
|
+
if evaluator_fn is None and not only_vibes:
|
|
910
|
+
click.echo(f"Error: Could not load evaluator function '{evaluator_name}'")
|
|
911
|
+
click.echo(f" Tried paths: {evaluator_paths_to_try}")
|
|
912
|
+
if evaluator_load_error:
|
|
913
|
+
click.echo(f" Last error: {evaluator_load_error}")
|
|
914
|
+
raise click.Abort()
|
|
915
|
+
|
|
916
|
+
# Validate evaluator credentials before running expensive agent tasks
|
|
917
|
+
if evaluator_fn is not None and not only_vibes:
|
|
918
|
+
from rem.agentic.providers.phoenix import validate_evaluator_credentials
|
|
919
|
+
|
|
920
|
+
click.echo("Validating evaluator credentials...")
|
|
921
|
+
is_valid, error_msg = validate_evaluator_credentials()
|
|
922
|
+
if not is_valid:
|
|
923
|
+
click.echo(click.style(f"\n⚠️ Evaluator validation failed: {error_msg}", fg="yellow"))
|
|
924
|
+
click.echo("\nOptions:")
|
|
925
|
+
click.echo(" 1. Fix the credentials issue and re-run")
|
|
926
|
+
click.echo(" 2. Run with --only-vibes to skip LLM evaluation")
|
|
927
|
+
click.echo(" 3. Use --evaluator-model to specify a different model")
|
|
928
|
+
raise click.Abort()
|
|
929
|
+
click.echo("✓ Evaluator credentials validated")
|
|
930
|
+
|
|
931
|
+
# Load dataset using read_dataframe utility (auto-detects format from extension)
|
|
932
|
+
from rem.utils.files import read_dataframe
|
|
933
|
+
|
|
934
|
+
click.echo(f"Loading dataset: {list(config.datasets.keys())[0]}")
|
|
935
|
+
dataset_ref = list(config.datasets.values())[0]
|
|
936
|
+
|
|
937
|
+
try:
|
|
938
|
+
if dataset_ref.location.value == "git":
|
|
939
|
+
# Load from Git (local filesystem)
|
|
940
|
+
dataset_path = Path(base_path) / name / dataset_ref.path
|
|
941
|
+
if not dataset_path.exists():
|
|
942
|
+
click.echo(f"Error: Dataset not found: {dataset_path}")
|
|
943
|
+
raise click.Abort()
|
|
944
|
+
|
|
945
|
+
dataset_df = read_dataframe(dataset_path)
|
|
946
|
+
|
|
947
|
+
elif dataset_ref.location.value in ["s3", "hybrid"]:
|
|
948
|
+
# Load from S3 using FS provider
|
|
949
|
+
from rem.services.fs import FS
|
|
950
|
+
|
|
951
|
+
fs = FS()
|
|
952
|
+
content = fs.read(dataset_ref.path)
|
|
953
|
+
# Ensure we have bytes
|
|
954
|
+
if isinstance(content, str):
|
|
955
|
+
content = content.encode()
|
|
956
|
+
dataset_df = read_dataframe(content, filename=dataset_ref.path)
|
|
957
|
+
click.echo(f"✓ Loaded dataset from S3")
|
|
958
|
+
|
|
959
|
+
else:
|
|
960
|
+
click.echo(f"Error: Unknown dataset location: {dataset_ref.location.value}")
|
|
961
|
+
raise click.Abort()
|
|
962
|
+
|
|
963
|
+
except ValueError as e:
|
|
964
|
+
# Unsupported format error from read_dataframe
|
|
965
|
+
click.echo(f"Error: {e}")
|
|
966
|
+
raise click.Abort()
|
|
967
|
+
except Exception as e:
|
|
968
|
+
logger.error(f"Failed to load dataset: {e}")
|
|
969
|
+
click.echo(f"Error: Could not load dataset")
|
|
970
|
+
click.echo(f" Path: {dataset_ref.path}")
|
|
971
|
+
raise click.Abort()
|
|
972
|
+
|
|
973
|
+
click.echo(f"✓ Loaded dataset: {len(dataset_df)} examples")
|
|
974
|
+
|
|
975
|
+
# Update prompts in Phoenix if requested
|
|
976
|
+
if update_prompts:
|
|
977
|
+
# TODO: Implement prompt updating
|
|
978
|
+
click.echo("⚠ --update-prompts not yet implemented")
|
|
979
|
+
|
|
980
|
+
# Vibes mode: run agent and export for AI evaluation
|
|
981
|
+
if only_vibes:
|
|
982
|
+
_run_vibes_mode(
|
|
983
|
+
config=config,
|
|
984
|
+
dataset_df=dataset_df,
|
|
985
|
+
task_fn=task_fn,
|
|
986
|
+
base_path=base_path,
|
|
987
|
+
limit=limit,
|
|
988
|
+
evaluator_schema_path=evaluator_schema_path,
|
|
989
|
+
)
|
|
990
|
+
return
|
|
991
|
+
|
|
992
|
+
# Run experiment via Phoenix
|
|
993
|
+
if not dry_run:
|
|
994
|
+
# Create Phoenix client with optional overrides
|
|
995
|
+
from rem.services.phoenix.config import PhoenixConfig
|
|
996
|
+
import os
|
|
997
|
+
|
|
998
|
+
phoenix_config = PhoenixConfig(
|
|
999
|
+
base_url=phoenix_url or os.getenv("PHOENIX_BASE_URL"),
|
|
1000
|
+
api_key=phoenix_api_key or os.getenv("PHOENIX_API_KEY")
|
|
1001
|
+
)
|
|
1002
|
+
|
|
1003
|
+
# Display Phoenix connection info
|
|
1004
|
+
phoenix_display_url = phoenix_config.base_url
|
|
1005
|
+
phoenix_has_key = "Yes" if phoenix_config.api_key else "No"
|
|
1006
|
+
click.echo(f"\nPhoenix Connection:")
|
|
1007
|
+
click.echo(f" URL: {phoenix_display_url}")
|
|
1008
|
+
click.echo(f" API Key: {phoenix_has_key}")
|
|
1009
|
+
click.echo()
|
|
1010
|
+
|
|
1011
|
+
client = PhoenixClient(config=phoenix_config)
|
|
1012
|
+
|
|
1013
|
+
experiment_name = f"{config.name}-{format_timestamp_for_experiment()}"
|
|
1014
|
+
|
|
1015
|
+
click.echo(f"\n⏳ Running experiment: {experiment_name}")
|
|
1016
|
+
click.echo(f" This may take several minutes...")
|
|
1017
|
+
|
|
1018
|
+
experiment = client.run_experiment(
|
|
1019
|
+
dataset=dataset_df,
|
|
1020
|
+
task=task_fn,
|
|
1021
|
+
evaluators=[evaluator_fn],
|
|
1022
|
+
experiment_name=experiment_name,
|
|
1023
|
+
experiment_description=config.description,
|
|
1024
|
+
experiment_metadata={
|
|
1025
|
+
"agent": config.agent_schema_ref.name,
|
|
1026
|
+
"evaluator": config.evaluator_schema_ref.name,
|
|
1027
|
+
"experiment_config": config.name,
|
|
1028
|
+
**config.metadata
|
|
1029
|
+
},
|
|
1030
|
+
# Smart column detection for DataFrame -> Phoenix Dataset conversion
|
|
1031
|
+
input_keys=["input"] if "input" in dataset_df.columns else None,
|
|
1032
|
+
output_keys=["expected_output"] if "expected_output" in dataset_df.columns else None,
|
|
1033
|
+
)
```python
            # Update experiment status
            config.status = ExperimentStatus.COMPLETED
            config.last_run_at = utc_now()
            if not version:  # Only save if not loading from Git
                config.save(base_path)

            click.echo("\n✓ Experiment complete!")
            if hasattr(experiment, "url"):
                click.echo(f"  View results: {experiment.url}")  # type: ignore[attr-defined]

            # Save results according to config.results settings
            if config.results.save_metrics_summary:
                # Get experiment data
                try:
                    exp_data = client.get_experiment(experiment.id)  # type: ignore[attr-defined]

                    # Build metrics summary
                    metrics = {
                        "experiment_id": experiment.id,  # type: ignore[attr-defined]
                        "experiment_name": experiment_name,
                        "agent": config.agent_schema_ref.name,
                        "evaluator": config.evaluator_schema_ref.name,
                        "dataset_size": len(dataset_df),
                        "completed_at": to_iso(utc_now()),
                        "phoenix_url": getattr(experiment, "url", None),
                        "task_runs": len(exp_data.get("task_runs", [])),
                    }

                    # Save metrics
                    if config.results.location.value in ("git", "hybrid"):
                        # Save to Git
                        metrics_path = Path(base_path) / name / "results" / (config.results.metrics_file or "metrics.json")
                        metrics_path.parent.mkdir(parents=True, exist_ok=True)

                        import json
                        with open(metrics_path, "w") as f:
                            json.dump(metrics, f, indent=2)

                        click.echo(f"\n✓ Saved metrics summary: {metrics_path}")

                    if config.results.location.value in ("s3", "hybrid"):
                        # Save to S3
                        from rem.services.fs import FS
                        fs = FS()

                        s3_metrics_path = config.results.base_path.rstrip("/") + "/" + (config.results.metrics_file or "metrics.json")

                        import json
                        fs.write(s3_metrics_path, json.dumps(metrics, indent=2))

                        click.echo(f"✓ Saved metrics summary to S3: {s3_metrics_path}")

                except Exception as e:
                    logger.warning(f"Failed to save metrics: {e}")
                    click.echo(f"⚠ Could not save metrics summary: {e}")
        else:  # closes the dry-run branch opened earlier in the command (above this excerpt)
            click.echo("\n✓ Dry run complete (no data saved)")

    except Exception as e:
        logger.error(f"Failed to run experiment: {e}")
        click.echo(f"Error: {e}", err=True)
        raise click.Abort()
```
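For reference, a metrics summary produced by the block above would look roughly like this; all values are illustrative, only the keys come from the `metrics` dict in the code:

```python
import json

# Illustrative contents of results/metrics.json after a run
# (every value here is made up; the keys mirror the metrics dict above).
metrics = {
    "experiment_id": "RXhwZXJpbWVudDox",
    "experiment_name": "my-experiment-20250101-120000",
    "agent": "rem-agent",
    "evaluator": "correctness-evaluator",
    "dataset_size": 25,
    "completed_at": "2025-01-01T12:05:00+00:00",
    "phoenix_url": "http://localhost:6006/experiments/1",
    "task_runs": 25,
}
print(json.dumps(metrics, indent=2))
```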
```python
# =============================================================================
# DATASET COMMANDS
# =============================================================================


@experiments.group()
def dataset():
    """Dataset management commands."""
    pass


@dataset.command("list")
def dataset_list():
    """List all datasets.

    Example:
        rem experiments dataset list
    """
    from rem.services.phoenix import PhoenixClient

    try:
        client = PhoenixClient()
        datasets = client.list_datasets()

        if not datasets:
            click.echo("No datasets found")
            return

        click.echo(f"\nDatasets ({len(datasets)} total):\n")
        click.echo(f"{'Name':<40} {'Examples':>10} {'Created':<12}")
        click.echo("-" * 65)

        for ds in datasets:
            name = ds.get("name", "")[:40]
            count = ds.get("example_count", 0)
            created = ds.get("created_at", "")[:10]
            click.echo(f"{name:<40} {count:>10} {created:<12}")

    except Exception as e:
        logger.error(f"Failed to list datasets: {e}")
        click.echo(f"Error: {e}", err=True)
        raise click.Abort()
```
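`dataset_list` assumes `list_datasets()` returns plain dicts with `name`, `example_count`, and `created_at` keys (those are the fields the `.get()` calls read). A minimal sketch of that shape, with made-up values:

```python
# Assumed return shape of client.list_datasets(); values are illustrative.
datasets = [
    {
        "name": "rem-lookup-golden",
        "example_count": 120,
        "created_at": "2025-01-01T12:00:00+00:00",
    },
]

for ds in datasets:
    # Same truncation/alignment as the command's table output.
    print(f"{ds.get('name', '')[:40]:<40} {ds.get('example_count', 0):>10} {ds.get('created_at', '')[:10]:<12}")
```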
```python
@dataset.command("create")
@click.argument("name")
@click.option("--from-csv", type=click.Path(exists=True, path_type=Path), help="Create from CSV file")
@click.option("--input-keys", help="Comma-separated input column names")
@click.option("--output-keys", help="Comma-separated output column names (reference/ground truth)")
@click.option("--metadata-keys", help="Comma-separated metadata column names (difficulty, type, etc.)")
@click.option("--description", help="Dataset description")
def dataset_create(
    name: str,
    from_csv: Optional[Path],
    input_keys: Optional[str],
    output_keys: Optional[str],
    metadata_keys: Optional[str],
    description: Optional[str],
):
    """Create a dataset (golden set).

    Two modes:
    1. From CSV: --from-csv golden.csv --input-keys query --output-keys expected
    2. Manual: creates an empty dataset to populate later

    Examples:
        # From CSV (SME golden set)
        rem experiments dataset create rem-lookup-golden \\
            --from-csv golden-lookup.csv \\
            --input-keys query \\
            --output-keys expected_label,expected_type \\
            --metadata-keys difficulty,query_type

        # Empty dataset (populate later)
        rem experiments dataset create rem-test --description "Test dataset"
    """
    from rem.services.phoenix import PhoenixClient

    try:
        client = PhoenixClient()

        if from_csv:
            # Create from CSV
            if not input_keys or not output_keys:
                click.echo("Error: --input-keys and --output-keys required for CSV", err=True)
                raise click.Abort()

            dataset = client.create_dataset_from_csv(
                name=name,
                csv_file_path=from_csv,
                input_keys=input_keys.split(","),
                output_keys=output_keys.split(","),
                metadata_keys=metadata_keys.split(",") if metadata_keys else None,
                description=description,
            )

            click.echo(f"✓ Created dataset '{dataset.name}' from CSV with {len(dataset)} examples")

        else:
            # Create empty dataset
            dataset = client.create_dataset_from_data(
                name=name,
                inputs=[],
                outputs=[],
                description=description,
            )

            click.echo(f"✓ Created empty dataset '{dataset.name}'")
            click.echo("  Use 'rem experiments dataset add' to add examples")

    except Exception as e:
        logger.error(f"Failed to create dataset: {e}")
        click.echo(f"Error: {e}", err=True)
        raise click.Abort()
```
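A golden-set CSV matching the docstring example above could look like this; the header row supplies the column names passed to `--input-keys`, `--output-keys`, and `--metadata-keys`, and the data rows are illustrative:

```
query,expected_label,expected_type,difficulty,query_type
"what is REM?",definition,concept,easy,lookup
"list engrams from last week",engram_list,collection,medium,temporal
```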
```python
@dataset.command("add")
@click.argument("dataset_name")
@click.option("--from-csv", type=click.Path(exists=True, path_type=Path), required=True,
              help="CSV file with examples")
@click.option("--input-keys", required=True, help="Comma-separated input column names")
@click.option("--output-keys", required=True, help="Comma-separated output column names")
@click.option("--metadata-keys", help="Comma-separated metadata column names")
def dataset_add(
    dataset_name: str,
    from_csv: Path,
    input_keys: str,
    output_keys: str,
    metadata_keys: Optional[str],
):
    """Add examples to an existing dataset.

    Example:
        rem experiments dataset add rem-lookup-golden \\
            --from-csv new-examples.csv \\
            --input-keys query \\
            --output-keys expected_label,expected_type
    """
    from rem.services.phoenix import PhoenixClient
    import polars as pl

    try:
        client = PhoenixClient()

        # Load CSV with Polars
        df = pl.read_csv(from_csv)
        records = df.to_dicts()

        # Extract data
        input_cols = input_keys.split(",")
        output_cols = output_keys.split(",")
        inputs = [{k: row.get(k) for k in input_cols} for row in records]
        outputs = [{k: row.get(k) for k in output_cols} for row in records]
        metadata = None
        if metadata_keys:
            meta_cols = metadata_keys.split(",")
            metadata = [{k: row.get(k) for k in meta_cols} for row in records]

        # Add to dataset
        dataset = client.add_examples_to_dataset(
            dataset=dataset_name,
            inputs=inputs,
            outputs=outputs,
            metadata=metadata,
        )

        click.echo(f"✓ Added {len(inputs)} examples to dataset '{dataset.name}'")
        click.echo(f"  Total examples: {len(dataset)}")

    except Exception as e:
        logger.error(f"Failed to add examples: {e}")
        click.echo(f"Error: {e}", err=True)
        raise click.Abort()
```
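The Polars round-trip in `dataset_add` reduces to plain dict comprehensions over `to_dicts()`. A self-contained sketch with inline data in place of a CSV file:

```python
import polars as pl

# Inline stand-in for pl.read_csv(from_csv); column names are illustrative.
df = pl.DataFrame({
    "query": ["what is REM?", "list recent engrams"],
    "expected_label": ["definition", "engram_list"],
    "difficulty": ["easy", "medium"],
})
records = df.to_dicts()  # list of per-row dicts

inputs = [{k: row.get(k) for k in ["query"]} for row in records]
outputs = [{k: row.get(k) for k in ["expected_label"]} for row in records]
metadata = [{k: row.get(k) for k in ["difficulty"]} for row in records]

print(inputs[0])    # {'query': 'what is REM?'}
print(outputs[0])   # {'expected_label': 'definition'}
print(metadata[0])  # {'difficulty': 'easy'}
```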
```python
# =============================================================================
# PROMPT COMMANDS
# =============================================================================


@experiments.group()
def prompt():
    """Prompt management commands."""
    pass


@prompt.command("create")
@click.argument("name")
@click.option("--system-prompt", "-s", required=True, help="System prompt text")
@click.option("--description", "-d", help="Prompt description")
@click.option("--model-provider", default="OPENAI", help="Model provider (OPENAI, ANTHROPIC)")
@click.option("--model-name", "-m", help="Model name (e.g., gpt-4.1, claude-sonnet-4-5)")
@click.option("--type", "-t", "prompt_type", default="Agent", help="Prompt type (Agent or Evaluator)")
def prompt_create(
    name: str,
    system_prompt: str,
    description: Optional[str],
    model_provider: str,
    model_name: Optional[str],
    prompt_type: str,
):
    """Create a prompt.

    Examples:
        # Create agent prompt
        rem experiments prompt create hello-world \\
            --system-prompt "You are a helpful assistant." \\
            --model-name gpt-4.1

        # Create evaluator prompt
        rem experiments prompt create correctness-evaluator \\
            --system-prompt "Evaluate the correctness of responses." \\
            --type Evaluator \\
            --model-provider ANTHROPIC \\
            --model-name claude-sonnet-4-5
    """
    from rem.services.phoenix import PhoenixClient
    from rem.services.phoenix.prompt_labels import PhoenixPromptLabels
    from phoenix.client import Client
    from phoenix.client.types.prompts import PromptVersion
    from phoenix.client.__generated__ import v1

    try:
        # Set default model if not specified
        if not model_name:
            model_name = "gpt-4.1" if model_provider == "OPENAI" else "claude-sonnet-4-5-20250929"

        # Get config
        phoenix_client = PhoenixClient()
        config = phoenix_client.config

        # Create client
        client = Client(
            base_url=config.base_url,
            api_key=config.api_key,
        )

        # Create prompt messages
        messages = [
            v1.PromptMessage(
                role="system",
                content=system_prompt,
            )
        ]

        # Create PromptVersion
        version = PromptVersion(
            messages,
            model_name=model_name,
            description="v1.0",
            model_provider=model_provider,  # type: ignore[arg-type]
        )

        # Create the prompt
        result = client.prompts.create(
            name=name,
            version=version,
            prompt_description=description or f"{prompt_type} prompt: {name}",
        )

        click.echo(f"✓ Created prompt '{name}' (ID: {result.id})")

        # Try to get the prompt ID for label assignment
        try:
            import httpx
            query = """
            query {
              prompts(first: 1, filterBy: {name: {equals: "%s"}}) {
                edges {
                  node {
                    id
                    name
                  }
                }
              }
            }
            """ % name

            response = httpx.post(
                f"{config.base_url}/graphql",
                json={"query": query},
                headers={"authorization": f"Bearer {config.api_key}"},
                timeout=10,
            )
            graphql_result = response.json()
            prompts = graphql_result.get("data", {}).get("prompts", {}).get("edges", [])

            if prompts:
                prompt_id = prompts[0]["node"]["id"]

                # Assign labels
                if not config.base_url:
                    raise ValueError("Phoenix base_url is required")
                labels_helper = PhoenixPromptLabels(
                    base_url=config.base_url, api_key=config.api_key
                )

                # Assign REM + type label
                label_names = ["REM", prompt_type]
                labels_helper.assign_prompt_labels(prompt_id, label_names)
                click.echo(f"✓ Assigned labels: {', '.join(label_names)}")
        except Exception as e:
            click.echo(f"⚠ Warning: Could not assign labels: {e}")

        click.echo(f"\nView in UI: {config.base_url}")

    except Exception as e:
        logger.error(f"Failed to create prompt: {e}")
        click.echo(f"Error: {e}", err=True)
        raise click.Abort()
```
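The prompt-ID lookup above splices `name` into the GraphQL document with `%`-formatting, which breaks if a prompt name contains a double quote. A sketch of the same query using standard GraphQL variables instead, assuming the Phoenix endpoint accepts them (URL and key are placeholders):

```python
import httpx

# Same lookup as above, but the name travels as a GraphQL variable
# rather than being %-interpolated into the query string.
query = """
query ($name: String!) {
  prompts(first: 1, filterBy: {name: {equals: $name}}) {
    edges { node { id name } }
  }
}
"""

response = httpx.post(
    "http://localhost:6006/graphql",  # placeholder base_url
    json={"query": query, "variables": {"name": "hello-world"}},
    headers={"authorization": "Bearer <PHOENIX_API_KEY>"},  # placeholder key
    timeout=10,
)
edges = response.json().get("data", {}).get("prompts", {}).get("edges", [])
prompt_id = edges[0]["node"]["id"] if edges else None
```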
```python
@prompt.command("list")
def prompt_list():
    """List all prompts.

    Example:
        rem experiments prompt list
    """
    import httpx
    from rem.services.phoenix import PhoenixClient

    try:
        phoenix_client = PhoenixClient()
        config = phoenix_client.config

        query = """
        query {
          prompts(first: 100) {
            edges {
              node {
                id
                name
                description
                createdAt
              }
            }
          }
        }
        """

        response = httpx.post(
            f"{config.base_url}/graphql",
            json={"query": query},
            headers={"authorization": f"Bearer {config.api_key}"},
            timeout=10,
        )

        result = response.json()
        prompts = result.get("data", {}).get("prompts", {}).get("edges", [])

        if not prompts:
            click.echo("No prompts found")
            return

        click.echo(f"\nPrompts ({len(prompts)} total):\n")
        click.echo(f"{'Name':<40} {'Created':<20}")
        click.echo("-" * 65)

        for edge in prompts:
            node = edge["node"]
            name = node.get("name", "")[:40]
            created = node.get("createdAt", "")[:19]
            click.echo(f"{name:<40} {created:<20}")

    except Exception as e:
        logger.error(f"Failed to list prompts: {e}")
        click.echo(f"Error: {e}", err=True)
        raise click.Abort()


# =============================================================================
# TRACE COMMANDS
# =============================================================================


@experiments.group()
def trace():
    """Trace retrieval commands."""
    pass


@trace.command("list")
@click.option("--project", "-p", help="Filter by project name")
@click.option("--days", "-d", default=7, help="Number of days to look back")
@click.option("--limit", "-l", default=20, help="Maximum traces to return")
def trace_list(
    project: Optional[str],
    days: int,
    limit: int,
):
    """List recent traces.

    Example:
        rem experiments trace list --project rem-agents --days 7 --limit 50
    """
    from rem.services.phoenix import PhoenixClient
    from rem.utils.date_utils import days_ago

    try:
        client = PhoenixClient()

        start_time = days_ago(days)

        traces_df = client.get_traces(
            project_name=project,
            start_time=start_time,
            limit=limit,
        )

        if len(traces_df) == 0:
            click.echo("No traces found")
            return

        click.echo(f"\nRecent Traces ({len(traces_df)} results):\n")
        click.echo(f"{'Span ID':<15} {'Name':<30} {'Start Time':<20}")
        click.echo("-" * 70)

        for _, row in traces_df.head(limit).iterrows():
            span_id = str(row.get("context.span_id", ""))[:12]
            name = str(row.get("name", ""))[:30]
            start = str(row.get("start_time", ""))[:19]
            click.echo(f"{span_id:<15} {name:<30} {start:<20}")

    except Exception as e:
        logger.error(f"Failed to list traces: {e}")
        click.echo(f"Error: {e}", err=True)
        raise click.Abort()
```
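`days_ago` is imported from `rem.utils.date_utils`, which is outside this excerpt; a plausible equivalent, assuming it returns a timezone-aware UTC datetime, would be:

```python
from datetime import datetime, timedelta, timezone

def days_ago(days: int) -> datetime:
    # Assumed behavior of rem.utils.date_utils.days_ago:
    # "now" in UTC minus the requested number of days.
    return datetime.now(timezone.utc) - timedelta(days=days)

print(days_ago(7).isoformat())
```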
```python
# =============================================================================
# EXPORT COMMAND
# =============================================================================


@experiments.command("export")
@click.argument("name")
@click.option("--base-path", help="Base directory for experiments (default: EXPERIMENTS_HOME or 'experiments')")
@click.option("--bucket", "-b", help="S3 bucket name (default: DATA_LAKE__BUCKET_NAME)")
@click.option("--version", "-v", default="v0", help="Data lake version prefix (default: v0)")
@click.option("--plan", is_flag=True, help="Show what would be exported without uploading")
@click.option("--include-results", is_flag=True, help="Include results directory in export")
def export(
    name: str,
    base_path: Optional[str],
    bucket: Optional[str],
    version: str,
    plan: bool,
    include_results: bool,
):
    """Export experiment to S3 data lake.

    Exports experiment configuration, ground truth, and optionally results
    to the S3 data lake following the convention:

        s3://{bucket}/{version}/datasets/calibration/experiments/{agent}/{task}/

    The export includes:
    - experiment.yaml (configuration)
    - README.md (documentation)
    - ground-truth/ (evaluation datasets)
    - seed-data/ (optional seed data)
    - results/ (optional, with --include-results)

    Examples:
        # Preview what would be exported
        rem experiments export my-experiment --plan

        # Export to configured data lake bucket
        rem experiments export my-experiment

        # Export to specific bucket
        rem experiments export my-experiment --bucket siggy-data

        # Include results in export
        rem experiments export my-experiment --include-results

        # Export with custom version prefix
        rem experiments export my-experiment --version v1
    """
    from rem.models.core.experiment import ExperimentConfig
    from rem.settings import settings
    from rem.services.fs.s3_provider import S3Provider
    import os
    import json

    try:
        # Resolve base path
        if base_path is None:
            base_path = os.getenv("EXPERIMENTS_HOME", "experiments")

        # Load experiment configuration
        config_path = Path(base_path) / name / "experiment.yaml"
        if not config_path.exists():
            click.echo(f"Experiment not found: {name}")
            click.echo(f"  Looked in: {config_path}")
            raise click.Abort()

        config = ExperimentConfig.from_yaml(config_path)
        click.echo(f"✓ Loaded experiment: {name}")

        # Resolve bucket
        if bucket is None:
            bucket = settings.data_lake.bucket_name
        if bucket is None:
            click.echo("Error: No S3 bucket configured.")
            click.echo("  Set DATA_LAKE__BUCKET_NAME environment variable or use --bucket option")
            raise click.Abort()

        # Build S3 paths
        s3_base = config.get_s3_export_path(bucket, version)
        exp_dir = config.get_experiment_dir(base_path)

        # Collect files to export
        files_to_export = []

        # Always include these files
        required_files = [
            ("experiment.yaml", exp_dir / "experiment.yaml"),
            ("README.md", exp_dir / "README.md"),
        ]

        for s3_name, local_path in required_files:
            if local_path.exists():
                files_to_export.append((s3_name, local_path))

        # Include ground-truth directory
        ground_truth_dir = exp_dir / "ground-truth"
        if ground_truth_dir.exists():
            for f in ground_truth_dir.rglob("*"):
                if f.is_file():
                    relative = f.relative_to(exp_dir)
                    files_to_export.append((str(relative), f))

        # Include seed-data directory
        seed_data_dir = exp_dir / "seed-data"
        if seed_data_dir.exists():
            for f in seed_data_dir.rglob("*"):
                if f.is_file():
                    relative = f.relative_to(exp_dir)
                    files_to_export.append((str(relative), f))

        # Optionally include results
        if include_results:
            results_dir = exp_dir / "results"
            if results_dir.exists():
                for f in results_dir.rglob("*"):
                    if f.is_file():
                        relative = f.relative_to(exp_dir)
                        files_to_export.append((str(relative), f))

        # Display export plan
        click.echo(f"\n{'=' * 60}")
        click.echo(f"EXPORT {'PLAN' if plan else 'TO S3'}")
        click.echo(f"{'=' * 60}")
        click.echo(f"\nExperiment: {config.name}")
        click.echo(f"Agent: {config.agent_schema_ref.name}")
        click.echo(f"Task: {config.task}")
        click.echo(f"Evaluator file: {config.get_evaluator_filename()}")
        click.echo(f"\nDestination: {s3_base}/")
        click.echo(f"\nFiles to export ({len(files_to_export)}):")

        for s3_name, local_path in files_to_export:
            s3_uri = f"{s3_base}/{s3_name}"
            if plan:
                click.echo(f"  {local_path}")
                click.echo(f"    → {s3_uri}")
            else:
                click.echo(f"  {s3_name}")

        if plan:
            click.echo("\n[PLAN MODE] No files were uploaded.")
            click.echo("Run without --plan to execute the export.")
            return

        # Execute export
        click.echo("\n⏳ Uploading to S3...")
        s3 = S3Provider()

        uploaded = 0
        for s3_name, local_path in files_to_export:
            s3_uri = f"{s3_base}/{s3_name}"
            try:
                s3.copy(str(local_path), s3_uri)
                uploaded += 1
                click.echo(f"  ✓ {s3_name}")
            except Exception as e:
                click.echo(f"  ✗ {s3_name}: {e}")

        click.echo(f"\n✓ Exported {uploaded}/{len(files_to_export)} files to {s3_base}/")

        # Show next steps
        click.echo("\nNext steps:")
        click.echo(f"  - View in S3: aws s3 ls {s3_base}/ --recursive")
        click.echo(f"  - Download: aws s3 sync {s3_base}/ ./{config.agent_schema_ref.name}/{config.task}/")

    except Exception as e:
        logger.error(f"Failed to export experiment: {e}")
        click.echo(f"Error: {e}", err=True)
        raise click.Abort()
```
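Putting the export convention together: for a hypothetical experiment with agent `rem-agent` and task `lookup`, exported to the `siggy-data` bucket from the docstring example, the uploaded keys would land roughly here (file names illustrative):

```
s3://siggy-data/v0/datasets/calibration/experiments/rem-agent/lookup/experiment.yaml
s3://siggy-data/v0/datasets/calibration/experiments/rem-agent/lookup/README.md
s3://siggy-data/v0/datasets/calibration/experiments/rem-agent/lookup/ground-truth/golden-lookup.csv
s3://siggy-data/v0/datasets/calibration/experiments/rem-agent/lookup/results/metrics.json   (only with --include-results)
```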