remdb 0.2.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of remdb might be problematic; see the advisory details for more information.
- rem/__init__.py +2 -0
- rem/agentic/README.md +650 -0
- rem/agentic/__init__.py +39 -0
- rem/agentic/agents/README.md +155 -0
- rem/agentic/agents/__init__.py +8 -0
- rem/agentic/context.py +148 -0
- rem/agentic/context_builder.py +329 -0
- rem/agentic/mcp/__init__.py +0 -0
- rem/agentic/mcp/tool_wrapper.py +107 -0
- rem/agentic/otel/__init__.py +5 -0
- rem/agentic/otel/setup.py +151 -0
- rem/agentic/providers/phoenix.py +674 -0
- rem/agentic/providers/pydantic_ai.py +572 -0
- rem/agentic/query.py +117 -0
- rem/agentic/query_helper.py +89 -0
- rem/agentic/schema.py +396 -0
- rem/agentic/serialization.py +245 -0
- rem/agentic/tools/__init__.py +5 -0
- rem/agentic/tools/rem_tools.py +231 -0
- rem/api/README.md +420 -0
- rem/api/main.py +324 -0
- rem/api/mcp_router/prompts.py +182 -0
- rem/api/mcp_router/resources.py +536 -0
- rem/api/mcp_router/server.py +213 -0
- rem/api/mcp_router/tools.py +584 -0
- rem/api/routers/auth.py +229 -0
- rem/api/routers/chat/__init__.py +5 -0
- rem/api/routers/chat/completions.py +281 -0
- rem/api/routers/chat/json_utils.py +76 -0
- rem/api/routers/chat/models.py +124 -0
- rem/api/routers/chat/streaming.py +185 -0
- rem/auth/README.md +258 -0
- rem/auth/__init__.py +26 -0
- rem/auth/middleware.py +100 -0
- rem/auth/providers/__init__.py +13 -0
- rem/auth/providers/base.py +376 -0
- rem/auth/providers/google.py +163 -0
- rem/auth/providers/microsoft.py +237 -0
- rem/cli/README.md +455 -0
- rem/cli/__init__.py +8 -0
- rem/cli/commands/README.md +126 -0
- rem/cli/commands/__init__.py +3 -0
- rem/cli/commands/ask.py +565 -0
- rem/cli/commands/configure.py +423 -0
- rem/cli/commands/db.py +493 -0
- rem/cli/commands/dreaming.py +324 -0
- rem/cli/commands/experiments.py +1124 -0
- rem/cli/commands/mcp.py +66 -0
- rem/cli/commands/process.py +245 -0
- rem/cli/commands/schema.py +183 -0
- rem/cli/commands/serve.py +106 -0
- rem/cli/dreaming.py +363 -0
- rem/cli/main.py +88 -0
- rem/config.py +237 -0
- rem/mcp_server.py +41 -0
- rem/models/core/__init__.py +49 -0
- rem/models/core/core_model.py +64 -0
- rem/models/core/engram.py +333 -0
- rem/models/core/experiment.py +628 -0
- rem/models/core/inline_edge.py +132 -0
- rem/models/core/rem_query.py +243 -0
- rem/models/entities/__init__.py +43 -0
- rem/models/entities/file.py +57 -0
- rem/models/entities/image_resource.py +88 -0
- rem/models/entities/message.py +35 -0
- rem/models/entities/moment.py +123 -0
- rem/models/entities/ontology.py +191 -0
- rem/models/entities/ontology_config.py +131 -0
- rem/models/entities/resource.py +95 -0
- rem/models/entities/schema.py +87 -0
- rem/models/entities/user.py +85 -0
- rem/py.typed +0 -0
- rem/schemas/README.md +507 -0
- rem/schemas/__init__.py +6 -0
- rem/schemas/agents/README.md +92 -0
- rem/schemas/agents/core/moment-builder.yaml +178 -0
- rem/schemas/agents/core/rem-query-agent.yaml +226 -0
- rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
- rem/schemas/agents/core/simple-assistant.yaml +19 -0
- rem/schemas/agents/core/user-profile-builder.yaml +163 -0
- rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
- rem/schemas/agents/examples/contract-extractor.yaml +134 -0
- rem/schemas/agents/examples/cv-parser.yaml +263 -0
- rem/schemas/agents/examples/hello-world.yaml +37 -0
- rem/schemas/agents/examples/query.yaml +54 -0
- rem/schemas/agents/examples/simple.yaml +21 -0
- rem/schemas/agents/examples/test.yaml +29 -0
- rem/schemas/agents/rem.yaml +128 -0
- rem/schemas/evaluators/hello-world/default.yaml +77 -0
- rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
- rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
- rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
- rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
- rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
- rem/services/__init__.py +16 -0
- rem/services/audio/INTEGRATION.md +308 -0
- rem/services/audio/README.md +376 -0
- rem/services/audio/__init__.py +15 -0
- rem/services/audio/chunker.py +354 -0
- rem/services/audio/transcriber.py +259 -0
- rem/services/content/README.md +1269 -0
- rem/services/content/__init__.py +5 -0
- rem/services/content/providers.py +806 -0
- rem/services/content/service.py +657 -0
- rem/services/dreaming/README.md +230 -0
- rem/services/dreaming/__init__.py +53 -0
- rem/services/dreaming/affinity_service.py +336 -0
- rem/services/dreaming/moment_service.py +264 -0
- rem/services/dreaming/ontology_service.py +54 -0
- rem/services/dreaming/user_model_service.py +297 -0
- rem/services/dreaming/utils.py +39 -0
- rem/services/embeddings/__init__.py +11 -0
- rem/services/embeddings/api.py +120 -0
- rem/services/embeddings/worker.py +421 -0
- rem/services/fs/README.md +662 -0
- rem/services/fs/__init__.py +62 -0
- rem/services/fs/examples.py +206 -0
- rem/services/fs/examples_paths.py +204 -0
- rem/services/fs/git_provider.py +935 -0
- rem/services/fs/local_provider.py +760 -0
- rem/services/fs/parsing-hooks-examples.md +172 -0
- rem/services/fs/paths.py +276 -0
- rem/services/fs/provider.py +460 -0
- rem/services/fs/s3_provider.py +1042 -0
- rem/services/fs/service.py +186 -0
- rem/services/git/README.md +1075 -0
- rem/services/git/__init__.py +17 -0
- rem/services/git/service.py +469 -0
- rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
- rem/services/phoenix/README.md +453 -0
- rem/services/phoenix/__init__.py +46 -0
- rem/services/phoenix/client.py +686 -0
- rem/services/phoenix/config.py +88 -0
- rem/services/phoenix/prompt_labels.py +477 -0
- rem/services/postgres/README.md +575 -0
- rem/services/postgres/__init__.py +23 -0
- rem/services/postgres/migration_service.py +427 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +232 -0
- rem/services/postgres/register_type.py +352 -0
- rem/services/postgres/repository.py +337 -0
- rem/services/postgres/schema_generator.py +379 -0
- rem/services/postgres/service.py +802 -0
- rem/services/postgres/sql_builder.py +354 -0
- rem/services/rem/README.md +304 -0
- rem/services/rem/__init__.py +23 -0
- rem/services/rem/exceptions.py +71 -0
- rem/services/rem/executor.py +293 -0
- rem/services/rem/parser.py +145 -0
- rem/services/rem/queries.py +196 -0
- rem/services/rem/query.py +371 -0
- rem/services/rem/service.py +527 -0
- rem/services/session/README.md +374 -0
- rem/services/session/__init__.py +6 -0
- rem/services/session/compression.py +360 -0
- rem/services/session/reload.py +77 -0
- rem/settings.py +1235 -0
- rem/sql/002_install_models.sql +1068 -0
- rem/sql/background_indexes.sql +42 -0
- rem/sql/install_models.sql +1038 -0
- rem/sql/migrations/001_install.sql +503 -0
- rem/sql/migrations/002_install_models.sql +1202 -0
- rem/utils/AGENTIC_CHUNKING.md +597 -0
- rem/utils/README.md +583 -0
- rem/utils/__init__.py +43 -0
- rem/utils/agentic_chunking.py +622 -0
- rem/utils/batch_ops.py +343 -0
- rem/utils/chunking.py +108 -0
- rem/utils/clip_embeddings.py +276 -0
- rem/utils/dict_utils.py +98 -0
- rem/utils/embeddings.py +423 -0
- rem/utils/examples/embeddings_example.py +305 -0
- rem/utils/examples/sql_types_example.py +202 -0
- rem/utils/markdown.py +16 -0
- rem/utils/model_helpers.py +236 -0
- rem/utils/schema_loader.py +229 -0
- rem/utils/sql_types.py +348 -0
- rem/utils/user_id.py +81 -0
- rem/utils/vision.py +330 -0
- rem/workers/README.md +506 -0
- rem/workers/__init__.py +5 -0
- rem/workers/dreaming.py +502 -0
- rem/workers/engram_processor.py +312 -0
- rem/workers/sqs_file_processor.py +193 -0
- remdb-0.2.6.dist-info/METADATA +1191 -0
- remdb-0.2.6.dist-info/RECORD +187 -0
- remdb-0.2.6.dist-info/WHEEL +4 -0
- remdb-0.2.6.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,1124 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Experiment management CLI commands.
|
|
3
|
+
|
|
4
|
+
Experiments use ExperimentConfig (rem/models/core/experiment.py) for configuration
|
|
5
|
+
and support Git+S3 hybrid storage. Includes dataset, prompt, and trace management.
|
|
6
|
+
|
|
7
|
+
Directory Structure:
|
|
8
|
+
.experiments/{experiment-name}/
|
|
9
|
+
├── experiment.yaml # ExperimentConfig
|
|
10
|
+
├── README.md # Auto-generated docs
|
|
11
|
+
├── datasets/ # Optional: small datasets
|
|
12
|
+
└── results/ # Optional: metrics summaries
|
|
13
|
+
|
|
14
|
+
Commands:
|
|
15
|
+
# Experiment lifecycle
|
|
16
|
+
rem experiments create <name> --agent <agent> --evaluator <evaluator>
|
|
17
|
+
rem experiments list
|
|
18
|
+
rem experiments show <name>
|
|
19
|
+
rem experiments run <name> [--version <tag>]
|
|
20
|
+
|
|
21
|
+
# Dataset management
|
|
22
|
+
rem experiments dataset list
|
|
23
|
+
rem experiments dataset create <name> --from-csv data.csv
|
|
24
|
+
rem experiments dataset add <name> --from-csv data.csv
|
|
25
|
+
|
|
26
|
+
# Prompt management
|
|
27
|
+
rem experiments prompt list
|
|
28
|
+
rem experiments prompt create <name> --system-prompt "..."
|
|
29
|
+
|
|
30
|
+
# Trace retrieval
|
|
31
|
+
rem experiments trace list --project <name>
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
import asyncio
|
|
35
|
+
from pathlib import Path
|
|
36
|
+
from typing import Any, Optional, cast
|
|
37
|
+
|
|
38
|
+
import click
|
|
39
|
+
from loguru import logger
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@click.group()
def experiments():
    """Experiment configuration and execution commands."""
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# =============================================================================
|
|
49
|
+
# CREATE COMMAND
|
|
50
|
+
# =============================================================================
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@experiments.command("create")
@click.argument("name")
@click.option("--agent", "-a", required=True, help="Agent schema name (e.g., 'cv-parser')")
@click.option("--evaluator", "-e", default="default", help="Evaluator schema name (default: 'default')")
@click.option("--description", "-d", help="Experiment description")
@click.option("--dataset-location", type=click.Choice(["git", "s3", "hybrid"]), default="git",
              help="Where to store datasets")
@click.option("--results-location", type=click.Choice(["git", "s3", "hybrid"]), default="git",
              help="Where to store results")
@click.option("--tags", help="Comma-separated tags (e.g., 'production,cv-parser')")
@click.option("--base-path", default=".experiments", help="Base directory for experiments")
def create(
    name: str,
    agent: str,
    evaluator: str,
    description: Optional[str],
    dataset_location: str,
    results_location: str,
    tags: Optional[str],
    base_path: str,
):
    """Create a new experiment configuration.

    Creates directory structure and generates experiment.yaml and README.md.

    Examples:
        # Small experiment (Git-only)
        rem experiments create hello-world-validation \\
            --agent hello-world \\
            --evaluator default \\
            --description "Smoke test for hello-world agent"

        # Large experiment (Hybrid storage)
        rem experiments create cv-parser-production \\
            --agent cv-parser \\
            --evaluator default \\
            --description "Production CV parser evaluation" \\
            --dataset-location s3 \\
            --results-location hybrid \\
            --tags "production,cv-parser,weekly"
    """
    from rem.models.core.experiment import (
        ExperimentConfig,
        DatasetLocation,
        DatasetReference,
        SchemaReference,
        ResultsConfig,
        ExperimentStatus,
    )

    try:
        # Ground-truth dataset reference: Git storage keeps a small CSV
        # in-repo; s3/hybrid point at a parquet object in the bucket.
        if dataset_location == "git":
            ground_truth_ref = DatasetReference(
                location=DatasetLocation.GIT,
                path="datasets/ground_truth.csv",
                format="csv",
                description="Ground truth dataset for evaluation",
            )
        else:  # s3 or hybrid
            ground_truth_ref = DatasetReference(
                location=DatasetLocation(dataset_location),
                path=f"s3://rem-experiments/{name}/datasets/ground_truth.parquet",
                format="parquet",
                schema_path="datasets/schema.yaml" if dataset_location == "hybrid" else None,
                description="Ground truth dataset for evaluation",
            )

        # Results configuration differs per backend: git keeps only the
        # metrics summary, s3 keeps full traces, hybrid keeps both.
        if results_location == "git":
            results_cfg = ResultsConfig(
                location=DatasetLocation.GIT,
                base_path="results/",
                save_traces=False,
                save_metrics_summary=True,
            )
        elif results_location == "s3":
            results_cfg = ResultsConfig(
                location=DatasetLocation.S3,
                base_path=f"s3://rem-experiments/{name}/results/",
                save_traces=True,
                save_metrics_summary=False,
            )
        else:  # hybrid
            results_cfg = ResultsConfig(
                location=DatasetLocation.HYBRID,
                base_path=f"s3://rem-experiments/{name}/results/",
                save_traces=True,
                save_metrics_summary=True,
                metrics_file="metrics.json",
            )

        # Comma-separated tag string -> cleaned list (empty when omitted).
        parsed_tags = [item.strip() for item in tags.split(",")] if tags else []

        # Assemble the full experiment definition; new experiments start
        # in DRAFT status until they are reviewed and run.
        experiment_cfg = ExperimentConfig(
            name=name,
            description=description or f"Evaluation experiment for {agent} agent",
            agent_schema_ref=SchemaReference(
                name=agent,
                version=None,  # Use latest by default
                type="agent",
            ),
            evaluator_schema_ref=SchemaReference(
                name=evaluator,
                type="evaluator",
            ),
            datasets={"ground_truth": ground_truth_ref},
            results=results_cfg,
            status=ExperimentStatus.DRAFT,
            tags=parsed_tags,
        )

        # Persist experiment.yaml plus the auto-generated README.md.
        config_path = experiment_cfg.save(base_path)
        readme_path = experiment_cfg.save_readme(base_path)

        # Always create the datasets directory so users have a drop target.
        datasets_dir = experiment_cfg.get_experiment_dir(base_path) / "datasets"
        datasets_dir.mkdir(parents=True, exist_ok=True)

        # Results live in-repo only for Git-backed storage.
        if results_location == "git":
            results_dir = experiment_cfg.get_experiment_dir(base_path) / "results"
            results_dir.mkdir(parents=True, exist_ok=True)

        click.echo(f"\n✓ Created experiment: {name}")
        click.echo(f" Configuration: {config_path}")
        click.echo(f" Documentation: {readme_path}")
        click.echo(f" Datasets: {datasets_dir}")
        if results_location == "git":
            click.echo(f" Results: {results_dir}")
        click.echo("\nNext steps:")
        click.echo(f" 1. Add dataset to {datasets_dir}/")
        click.echo(f" 2. Review configuration: {config_path}")
        click.echo(f" 3. Run experiment: rem experiments run {name}")
        click.echo(f" 4. Commit to Git: git add .experiments/{name}/ && git commit")

    except Exception as e:
        logger.error(f"Failed to create experiment: {e}")
        click.echo(f"Error: {e}", err=True)
        raise click.Abort()
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
# =============================================================================
|
|
199
|
+
# LIST COMMAND
|
|
200
|
+
# =============================================================================
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
@experiments.command("list")
@click.option("--base-path", default=".experiments", help="Base directory for experiments")
@click.option("--status", help="Filter by status (draft, ready, completed, etc.)")
@click.option("--tags", help="Filter by tags (comma-separated)")
def list_experiments(
    base_path: str,
    status: Optional[str],
    tags: Optional[str],
):
    """List all experiments.

    Examples:
        rem experiments list
        rem experiments list --status ready
        rem experiments list --tags production,cv-parser
    """
    from rem.models.core.experiment import ExperimentConfig, ExperimentStatus

    try:
        experiments_dir = Path(base_path)
        if not experiments_dir.exists():
            click.echo(f"No experiments directory found at {base_path}")
            return

        # Collect every loadable experiment.yaml — one subdirectory per
        # experiment; files and hidden dirs (e.g. .git) are skipped.
        configs = []
        for exp_dir in experiments_dir.iterdir():
            if not exp_dir.is_dir() or exp_dir.name.startswith("."):
                continue

            config_file = exp_dir / "experiment.yaml"
            if config_file.exists():
                try:
                    config = ExperimentConfig.from_yaml(config_file)
                    configs.append(config)
                except Exception as e:
                    # A corrupt config must not break listing the rest.
                    logger.warning(f"Failed to load {config_file}: {e}")

        # Optional status filter (ValueError on an unknown status is
        # surfaced via the outer handler).
        if status:
            status_enum = ExperimentStatus(status)
            configs = [c for c in configs if c.status == status_enum]

        # Optional tag filter. BUGFIX: the user's tags were lowercased but
        # the config's tags were not, so tags stored with any uppercase
        # (e.g. "Production") could never match. Compare case-insensitively
        # on both sides.
        if tags:
            filter_tags = set(t.strip().lower() for t in tags.split(","))
            configs = [
                c for c in configs
                if filter_tags & {t.lower() for t in c.tags}
            ]

        if not configs:
            click.echo("No experiments found")
            return

        # Most recently updated first.
        configs.sort(key=lambda c: c.updated_at, reverse=True)

        # Display table
        click.echo(f"\nExperiments ({len(configs)} total):\n")
        click.echo(f"{'Name':<30} {'Status':<12} {'Agent':<20} {'Updated':<12}")
        click.echo("-" * 75)

        for config in configs:
            name = config.name[:30]
            status_str = config.status.value[:12]
            agent = config.agent_schema_ref.name[:20]
            updated = config.updated_at.strftime("%Y-%m-%d")
            click.echo(f"{name:<30} {status_str:<12} {agent:<20} {updated:<12}")

    except Exception as e:
        logger.error(f"Failed to list experiments: {e}")
        click.echo(f"Error: {e}", err=True)
        raise click.Abort()
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
# =============================================================================
|
|
276
|
+
# SHOW COMMAND
|
|
277
|
+
# =============================================================================
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
@experiments.command("show")
@click.argument("name")
@click.option("--base-path", default=".experiments", help="Base directory for experiments")
def show(name: str, base_path: str):
    """Show experiment details.

    Examples:
        rem experiments show hello-world-validation
    """
    from rem.models.core.experiment import ExperimentConfig

    try:
        cfg_file = Path(base_path) / name / "experiment.yaml"
        if not cfg_file.exists():
            click.echo(f"Experiment not found: {name}")
            click.echo(f" Looked in: {cfg_file}")
            raise click.Abort()

        cfg = ExperimentConfig.from_yaml(cfg_file)

        # Render each section of the config in a fixed, human-readable order.
        echo = click.echo
        echo(f"\nExperiment: {cfg.name}")
        echo(f"{'=' * 60}\n")
        echo(f"Description: {cfg.description}")
        echo(f"Status: {cfg.status.value}")
        if cfg.tags:
            echo(f"Tags: {', '.join(cfg.tags)}")

        echo("\nAgent Schema:")
        echo(f" Name: {cfg.agent_schema_ref.name}")
        echo(f" Version: {cfg.agent_schema_ref.version or 'latest'}")

        echo("\nEvaluator Schema:")
        echo(f" Name: {cfg.evaluator_schema_ref.name}")

        echo("\nDatasets:")
        for ds_name, ds_ref in cfg.datasets.items():
            echo(f" {ds_name}:")
            echo(f" Location: {ds_ref.location.value}")
            echo(f" Path: {ds_ref.path}")
            echo(f" Format: {ds_ref.format}")

        echo("\nResults:")
        echo(f" Location: {cfg.results.location.value}")
        echo(f" Base Path: {cfg.results.base_path}")
        echo(f" Save Traces: {cfg.results.save_traces}")
        echo(f" Metrics File: {cfg.results.metrics_file}")

        echo("\nTimestamps:")
        echo(f" Created: {cfg.created_at.isoformat()}")
        echo(f" Updated: {cfg.updated_at.isoformat()}")
        # last_run_at is only set once the experiment has been executed.
        if cfg.last_run_at:
            echo(f" Last Run: {cfg.last_run_at.isoformat()}")

        if cfg.metadata:
            echo("\nMetadata:")
            for key, value in cfg.metadata.items():
                echo(f" {key}: {value}")

    except Exception as e:
        logger.error(f"Failed to show experiment: {e}")
        click.echo(f"Error: {e}", err=True)
        raise click.Abort()
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
# =============================================================================
|
|
345
|
+
# RUN COMMAND
|
|
346
|
+
# =============================================================================
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
@experiments.command("run")
|
|
350
|
+
@click.argument("name")
|
|
351
|
+
@click.option("--base-path", default=".experiments", help="Base directory for experiments")
|
|
352
|
+
@click.option("--version", help="Git tag version to load (e.g., 'experiments/my-exp/v1.0.0')")
|
|
353
|
+
@click.option("--dry-run", is_flag=True, help="Test on small subset without saving")
|
|
354
|
+
@click.option("--update-prompts", is_flag=True, help="Update prompts in Phoenix before running")
|
|
355
|
+
@click.option("--phoenix-url", help="Phoenix server URL (overrides PHOENIX_BASE_URL env var)")
|
|
356
|
+
@click.option("--phoenix-api-key", help="Phoenix API key (overrides PHOENIX_API_KEY env var)")
|
|
357
|
+
def run(
|
|
358
|
+
name: str,
|
|
359
|
+
base_path: str,
|
|
360
|
+
version: Optional[str],
|
|
361
|
+
dry_run: bool,
|
|
362
|
+
update_prompts: bool,
|
|
363
|
+
phoenix_url: Optional[str],
|
|
364
|
+
phoenix_api_key: Optional[str],
|
|
365
|
+
):
|
|
366
|
+
"""Run an experiment using Phoenix provider.
|
|
367
|
+
|
|
368
|
+
Loads configuration, executes agent and evaluator, saves results.
|
|
369
|
+
|
|
370
|
+
Phoenix Connection:
|
|
371
|
+
Commands respect PHOENIX_BASE_URL and PHOENIX_API_KEY environment variables.
|
|
372
|
+
Defaults to localhost:6006 for local development.
|
|
373
|
+
|
|
374
|
+
Production (on cluster):
|
|
375
|
+
export PHOENIX_BASE_URL=http://phoenix-svc.observability.svc.cluster.local:6006
|
|
376
|
+
export PHOENIX_API_KEY=<your-key>
|
|
377
|
+
kubectl exec -it deployment/rem-api -- rem experiments run my-experiment
|
|
378
|
+
|
|
379
|
+
Development (port-forward):
|
|
380
|
+
kubectl port-forward -n observability svc/phoenix-svc 6006:6006
|
|
381
|
+
export PHOENIX_API_KEY=<your-key>
|
|
382
|
+
rem experiments run my-experiment
|
|
383
|
+
|
|
384
|
+
Local (local Phoenix):
|
|
385
|
+
python -m phoenix.server.main serve
|
|
386
|
+
rem experiments run my-experiment
|
|
387
|
+
|
|
388
|
+
Examples:
|
|
389
|
+
# Run experiment with latest schemas
|
|
390
|
+
rem experiments run hello-world-validation
|
|
391
|
+
|
|
392
|
+
# Run specific version
|
|
393
|
+
rem experiments run hello-world-validation \\
|
|
394
|
+
--version experiments/hello-world-validation/v1.0.0
|
|
395
|
+
|
|
396
|
+
# Dry run (test without saving)
|
|
397
|
+
rem experiments run cv-parser-production --dry-run
|
|
398
|
+
|
|
399
|
+
# Override Phoenix connection
|
|
400
|
+
rem experiments run my-experiment \\
|
|
401
|
+
--phoenix-url http://phoenix.example.com:6006 \\
|
|
402
|
+
--phoenix-api-key <key>
|
|
403
|
+
"""
|
|
404
|
+
from rem.models.core.experiment import ExperimentConfig, ExperimentStatus
|
|
405
|
+
from rem.services.git import GitService
|
|
406
|
+
from rem.services.phoenix import PhoenixClient
|
|
407
|
+
from rem.agentic.providers.phoenix import create_evaluator_from_schema
|
|
408
|
+
from datetime import datetime
|
|
409
|
+
import pandas as pd
|
|
410
|
+
|
|
411
|
+
try:
|
|
412
|
+
# Load experiment configuration
|
|
413
|
+
if version:
|
|
414
|
+
# Load from Git at specific version
|
|
415
|
+
git_svc = GitService()
|
|
416
|
+
config_yaml = git_svc.fs.read(
|
|
417
|
+
f"git://rem/.experiments/{name}/experiment.yaml?ref={version}"
|
|
418
|
+
)
|
|
419
|
+
config = ExperimentConfig(**config_yaml)
|
|
420
|
+
click.echo(f"✓ Loaded experiment from Git: {version}")
|
|
421
|
+
else:
|
|
422
|
+
# Load from local filesystem
|
|
423
|
+
config_path = Path(base_path) / name / "experiment.yaml"
|
|
424
|
+
if not config_path.exists():
|
|
425
|
+
click.echo(f"Experiment not found: {name}")
|
|
426
|
+
click.echo(f" Looked in: {config_path}")
|
|
427
|
+
raise click.Abort()
|
|
428
|
+
config = ExperimentConfig.from_yaml(config_path)
|
|
429
|
+
click.echo(f"✓ Loaded experiment: {name}")
|
|
430
|
+
|
|
431
|
+
# Display experiment info
|
|
432
|
+
click.echo(f"\nExperiment: {config.name}")
|
|
433
|
+
click.echo(f" Agent: {config.agent_schema_ref.name} (version: {config.agent_schema_ref.version or 'latest'})")
|
|
434
|
+
click.echo(f" Evaluator: {config.evaluator_schema_ref.name}")
|
|
435
|
+
click.echo(f" Status: {config.status.value}")
|
|
436
|
+
if dry_run:
|
|
437
|
+
click.echo(f" Mode: DRY RUN (no data will be saved)")
|
|
438
|
+
click.echo()
|
|
439
|
+
|
|
440
|
+
# Load agent schema from Git or filesystem
|
|
441
|
+
agent_name = config.agent_schema_ref.name
|
|
442
|
+
agent_version = config.agent_schema_ref.version
|
|
443
|
+
|
|
444
|
+
click.echo(f"Loading agent schema: {agent_name} (version: {agent_version or 'latest'})")
|
|
445
|
+
|
|
446
|
+
# Try Git first, fallback to filesystem
|
|
447
|
+
agent_schema = None
|
|
448
|
+
try:
|
|
449
|
+
git_svc = GitService()
|
|
450
|
+
agent_schema = git_svc.load_schema(agent_name, version=agent_version)
|
|
451
|
+
click.echo(f"✓ Loaded agent schema from Git")
|
|
452
|
+
except Exception as e:
|
|
453
|
+
logger.debug(f"Git not available, trying filesystem: {e}")
|
|
454
|
+
|
|
455
|
+
# Fallback to local filesystem
|
|
456
|
+
from rem.services.fs import FS
|
|
457
|
+
fs = FS()
|
|
458
|
+
|
|
459
|
+
schema_path = f"schemas/agents/{agent_name}.yaml"
|
|
460
|
+
try:
|
|
461
|
+
agent_schema = fs.read(schema_path)
|
|
462
|
+
click.echo(f"✓ Loaded agent schema from filesystem")
|
|
463
|
+
except Exception as fs_error:
|
|
464
|
+
logger.error(f"Failed to load agent schema: Git: {e}, FS: {fs_error}")
|
|
465
|
+
click.echo(f"Error: Could not load agent schema '{agent_name}'")
|
|
466
|
+
click.echo(f" Tried Git: {e}")
|
|
467
|
+
click.echo(f" Tried filesystem: {schema_path}")
|
|
468
|
+
click.echo(f" Make sure the schema exists")
|
|
469
|
+
raise click.Abort()
|
|
470
|
+
|
|
471
|
+
# Create agent function from schema
|
|
472
|
+
from rem.agentic.providers.pydantic_ai import create_agent
|
|
473
|
+
from rem.agentic.context import AgentContext
|
|
474
|
+
|
|
475
|
+
# Create agent context
|
|
476
|
+
context = AgentContext(
|
|
477
|
+
user_id="experiment-runner",
|
|
478
|
+
tenant_id="experiments",
|
|
479
|
+
session_id=f"experiment-{config.name}",
|
|
480
|
+
)
|
|
481
|
+
|
|
482
|
+
agent_runtime = asyncio.run(create_agent(
|
|
483
|
+
context=context,
|
|
484
|
+
agent_schema_override=agent_schema
|
|
485
|
+
))
|
|
486
|
+
|
|
487
|
+
def task_fn(example: dict[str, Any]) -> dict[str, Any]:
|
|
488
|
+
"""Run agent on example."""
|
|
489
|
+
input_data = example.get("input", {})
|
|
490
|
+
|
|
491
|
+
# Extract query from input
|
|
492
|
+
query = input_data.get("query", "")
|
|
493
|
+
if not query:
|
|
494
|
+
# Try other common input keys
|
|
495
|
+
query = input_data.get("text", input_data.get("prompt", str(input_data)))
|
|
496
|
+
|
|
497
|
+
# Run agent
|
|
498
|
+
result = asyncio.run(agent_runtime.run(query))
|
|
499
|
+
|
|
500
|
+
# Serialize result (critical for Pydantic models!)
|
|
501
|
+
from rem.agentic.serialization import serialize_agent_result
|
|
502
|
+
serialized = serialize_agent_result(result)
|
|
503
|
+
# Ensure we return a dict (Phoenix expects dict output)
|
|
504
|
+
if isinstance(serialized, str):
|
|
505
|
+
return {"output": serialized}
|
|
506
|
+
return serialized if isinstance(serialized, dict) else {"output": str(serialized)}
|
|
507
|
+
|
|
508
|
+
# Load evaluator schema
|
|
509
|
+
evaluator_name = config.evaluator_schema_ref.name
|
|
510
|
+
evaluator_version = config.evaluator_schema_ref.version
|
|
511
|
+
|
|
512
|
+
# Resolve evaluator path (evaluators are organized by agent name)
|
|
513
|
+
evaluator_schema_path = f"rem/schemas/evaluators/{agent_name}/{evaluator_name}.yaml"
|
|
514
|
+
|
|
515
|
+
click.echo(f"Loading evaluator: {evaluator_name} for agent {agent_name}")
|
|
516
|
+
|
|
517
|
+
try:
|
|
518
|
+
evaluator_fn = create_evaluator_from_schema(
|
|
519
|
+
evaluator_schema_path=evaluator_schema_path,
|
|
520
|
+
model_name=None, # Use default from schema
|
|
521
|
+
)
|
|
522
|
+
click.echo(f"✓ Loaded evaluator schema")
|
|
523
|
+
except Exception as e:
|
|
524
|
+
logger.warning(f"Failed to load evaluator: {e}")
|
|
525
|
+
click.echo(f"Error: Could not load evaluator schema")
|
|
526
|
+
click.echo(f" Path: {evaluator_schema_path}")
|
|
527
|
+
click.echo(f" Make sure the schema exists")
|
|
528
|
+
raise click.Abort()
|
|
529
|
+
|
|
530
|
+
# Load dataset
|
|
531
|
+
click.echo(f"Loading dataset: {list(config.datasets.keys())[0]}")
|
|
532
|
+
dataset_ref = list(config.datasets.values())[0]
|
|
533
|
+
|
|
534
|
+
if dataset_ref.location.value == "git":
|
|
535
|
+
# Load from Git
|
|
536
|
+
dataset_path = Path(base_path) / name / dataset_ref.path
|
|
537
|
+
if not dataset_path.exists():
|
|
538
|
+
click.echo(f"Error: Dataset not found: {dataset_path}")
|
|
539
|
+
raise click.Abort()
|
|
540
|
+
|
|
541
|
+
if dataset_ref.format == "csv":
|
|
542
|
+
dataset_df = pd.read_csv(dataset_path)
|
|
543
|
+
elif dataset_ref.format == "parquet":
|
|
544
|
+
dataset_df = pd.read_parquet(dataset_path)
|
|
545
|
+
elif dataset_ref.format == "jsonl":
|
|
546
|
+
dataset_df = pd.read_json(dataset_path, lines=True)
|
|
547
|
+
else:
|
|
548
|
+
click.echo(f"Error: Format '{dataset_ref.format}' not yet supported")
|
|
549
|
+
raise click.Abort()
|
|
550
|
+
elif dataset_ref.location.value in ["s3", "hybrid"]:
|
|
551
|
+
# Load from S3 using FS provider
|
|
552
|
+
from rem.services.fs import FS
|
|
553
|
+
|
|
554
|
+
fs = FS()
|
|
555
|
+
|
|
556
|
+
try:
|
|
557
|
+
if dataset_ref.format == "csv":
|
|
558
|
+
content = fs.read(dataset_ref.path)
|
|
559
|
+
from io import StringIO
|
|
560
|
+
dataset_df = pd.read_csv(StringIO(content))
|
|
561
|
+
elif dataset_ref.format == "parquet":
|
|
562
|
+
# For parquet, we need binary read
|
|
563
|
+
import tempfile
|
|
564
|
+
with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp:
|
|
565
|
+
tmp_path = tmp.name
|
|
566
|
+
# Download via FS
|
|
567
|
+
content_bytes = fs.read(dataset_ref.path)
|
|
568
|
+
tmp.write(content_bytes)
|
|
569
|
+
dataset_df = pd.read_parquet(tmp_path)
|
|
570
|
+
Path(tmp_path).unlink() # Clean up temp file
|
|
571
|
+
elif dataset_ref.format == "jsonl":
|
|
572
|
+
content = fs.read(dataset_ref.path)
|
|
573
|
+
from io import StringIO
|
|
574
|
+
dataset_df = pd.read_json(StringIO(content), lines=True)
|
|
575
|
+
else:
|
|
576
|
+
click.echo(f"Error: Format '{dataset_ref.format}' not yet supported")
|
|
577
|
+
raise click.Abort()
|
|
578
|
+
|
|
579
|
+
click.echo(f"✓ Loaded dataset from S3")
|
|
580
|
+
except Exception as e:
|
|
581
|
+
logger.error(f"Failed to load dataset from S3: {e}")
|
|
582
|
+
click.echo(f"Error: Could not load dataset from S3")
|
|
583
|
+
click.echo(f" Path: {dataset_ref.path}")
|
|
584
|
+
click.echo(f" Format: {dataset_ref.format}")
|
|
585
|
+
raise click.Abort()
|
|
586
|
+
else:
|
|
587
|
+
click.echo(f"Error: Unknown dataset location: {dataset_ref.location.value}")
|
|
588
|
+
raise click.Abort()
|
|
589
|
+
|
|
590
|
+
click.echo(f"✓ Loaded dataset: {len(dataset_df)} examples")
|
|
591
|
+
|
|
592
|
+
# Update prompts in Phoenix if requested
|
|
593
|
+
if update_prompts:
|
|
594
|
+
# TODO: Implement prompt updating
|
|
595
|
+
click.echo("⚠ --update-prompts not yet implemented")
|
|
596
|
+
|
|
597
|
+
# Run experiment via Phoenix
|
|
598
|
+
if not dry_run:
|
|
599
|
+
# Create Phoenix client with optional overrides
|
|
600
|
+
from rem.services.phoenix.config import PhoenixConfig
|
|
601
|
+
import os
|
|
602
|
+
|
|
603
|
+
phoenix_config = PhoenixConfig(
|
|
604
|
+
base_url=phoenix_url or os.getenv("PHOENIX_BASE_URL"),
|
|
605
|
+
api_key=phoenix_api_key or os.getenv("PHOENIX_API_KEY")
|
|
606
|
+
)
|
|
607
|
+
|
|
608
|
+
# Display Phoenix connection info
|
|
609
|
+
phoenix_display_url = phoenix_config.base_url
|
|
610
|
+
phoenix_has_key = "Yes" if phoenix_config.api_key else "No"
|
|
611
|
+
click.echo(f"\nPhoenix Connection:")
|
|
612
|
+
click.echo(f" URL: {phoenix_display_url}")
|
|
613
|
+
click.echo(f" API Key: {phoenix_has_key}")
|
|
614
|
+
click.echo()
|
|
615
|
+
|
|
616
|
+
client = PhoenixClient(config=phoenix_config)
|
|
617
|
+
|
|
618
|
+
experiment_name = f"{config.name}-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
|
|
619
|
+
|
|
620
|
+
click.echo(f"\n⏳ Running experiment: {experiment_name}")
|
|
621
|
+
click.echo(f" This may take several minutes...")
|
|
622
|
+
|
|
623
|
+
experiment = client.run_experiment(
|
|
624
|
+
dataset=dataset_df, # type: ignore[arg-type]
|
|
625
|
+
task=task_fn,
|
|
626
|
+
evaluators=[evaluator_fn],
|
|
627
|
+
experiment_name=experiment_name,
|
|
628
|
+
experiment_description=config.description,
|
|
629
|
+
experiment_metadata={
|
|
630
|
+
"agent": config.agent_schema_ref.name,
|
|
631
|
+
"evaluator": config.evaluator_schema_ref.name,
|
|
632
|
+
"experiment_config": config.name,
|
|
633
|
+
**config.metadata
|
|
634
|
+
}
|
|
635
|
+
)
|
|
636
|
+
|
|
637
|
+
# Update experiment status
|
|
638
|
+
config.status = ExperimentStatus.COMPLETED
|
|
639
|
+
config.last_run_at = datetime.now()
|
|
640
|
+
if not version: # Only save if not loading from Git
|
|
641
|
+
config.save(base_path)
|
|
642
|
+
|
|
643
|
+
click.echo(f"\n✓ Experiment complete!")
|
|
644
|
+
if hasattr(experiment, "url"):
|
|
645
|
+
click.echo(f" View results: {experiment.url}") # type: ignore[attr-defined]
|
|
646
|
+
|
|
647
|
+
# Save results according to config.results settings
|
|
648
|
+
if config.results.save_metrics_summary:
|
|
649
|
+
# Get experiment data
|
|
650
|
+
try:
|
|
651
|
+
exp_data = client.get_experiment(experiment.id) # type: ignore[attr-defined]
|
|
652
|
+
|
|
653
|
+
# Build metrics summary
|
|
654
|
+
metrics = {
|
|
655
|
+
"experiment_id": experiment.id, # type: ignore[attr-defined]
|
|
656
|
+
"experiment_name": experiment_name,
|
|
657
|
+
"agent": config.agent_schema_ref.name,
|
|
658
|
+
"evaluator": config.evaluator_schema_ref.name,
|
|
659
|
+
"dataset_size": len(dataset_df),
|
|
660
|
+
"completed_at": datetime.now().isoformat(),
|
|
661
|
+
"phoenix_url": getattr(experiment, "url", None),
|
|
662
|
+
"task_runs": len(exp_data.get("task_runs", [])),
|
|
663
|
+
}
|
|
664
|
+
|
|
665
|
+
# Save metrics
|
|
666
|
+
if config.results.location.value == "git" or config.results.location.value == "hybrid":
|
|
667
|
+
# Save to Git
|
|
668
|
+
metrics_path = Path(base_path) / name / "results" / (config.results.metrics_file or "metrics.json")
|
|
669
|
+
metrics_path.parent.mkdir(parents=True, exist_ok=True)
|
|
670
|
+
|
|
671
|
+
import json
|
|
672
|
+
with open(metrics_path, "w") as f:
|
|
673
|
+
json.dump(metrics, f, indent=2)
|
|
674
|
+
|
|
675
|
+
click.echo(f"\n✓ Saved metrics summary: {metrics_path}")
|
|
676
|
+
|
|
677
|
+
if config.results.location.value == "s3" or config.results.location.value == "hybrid":
|
|
678
|
+
# Save to S3
|
|
679
|
+
from rem.services.fs import FS
|
|
680
|
+
fs = FS()
|
|
681
|
+
|
|
682
|
+
s3_metrics_path = config.results.base_path.rstrip("/") + "/" + (config.results.metrics_file or "metrics.json")
|
|
683
|
+
|
|
684
|
+
import json
|
|
685
|
+
fs.write(s3_metrics_path, json.dumps(metrics, indent=2))
|
|
686
|
+
|
|
687
|
+
click.echo(f"✓ Saved metrics summary to S3: {s3_metrics_path}")
|
|
688
|
+
|
|
689
|
+
except Exception as e:
|
|
690
|
+
logger.warning(f"Failed to save metrics: {e}")
|
|
691
|
+
click.echo(f"⚠ Could not save metrics summary: {e}")
|
|
692
|
+
else:
|
|
693
|
+
click.echo("\n✓ Dry run complete (no data saved)")
|
|
694
|
+
|
|
695
|
+
except Exception as e:
|
|
696
|
+
logger.error(f"Failed to run experiment: {e}")
|
|
697
|
+
click.echo(f"Error: {e}", err=True)
|
|
698
|
+
raise click.Abort()
|
|
699
|
+
|
|
700
|
+
|
|
701
|
+
# =============================================================================
|
|
702
|
+
# DATASET COMMANDS
|
|
703
|
+
# =============================================================================
|
|
704
|
+
|
|
705
|
+
|
|
706
|
+
# Click sub-group: `rem experiments dataset <command>` (list / create / add).
# The docstring doubles as the CLI help text shown by click.
@experiments.group()
def dataset():
    """Dataset management commands."""
    pass
|
|
710
|
+
|
|
711
|
+
|
|
712
|
+
@dataset.command("list")
def dataset_list():
    """List all datasets.

    Example:
        rem experiments dataset list
    """
    from rem.services.phoenix import PhoenixClient

    try:
        phoenix = PhoenixClient()
        all_datasets = phoenix.list_datasets()

        if not all_datasets:
            click.echo("No datasets found")
            return

        # Fixed-width table header.
        click.echo(f"\nDatasets ({len(all_datasets)} total):\n")
        click.echo(f"{'Name':<40} {'Examples':>10} {'Created':<12}")
        click.echo("-" * 65)

        for entry in all_datasets:
            # Truncate long names/dates so columns stay aligned.
            row = (
                f"{entry.get('name', '')[:40]:<40} "
                f"{entry.get('example_count', 0):>10} "
                f"{entry.get('created_at', '')[:10]:<12}"
            )
            click.echo(row)

    except Exception as e:
        logger.error(f"Failed to list datasets: {e}")
        click.echo(f"Error: {e}", err=True)
        raise click.Abort()
|
|
743
|
+
|
|
744
|
+
|
|
745
|
+
@dataset.command("create")
@click.argument("name")
@click.option("--from-csv", type=click.Path(exists=True, path_type=Path), help="Create from CSV file")
@click.option("--input-keys", help="Comma-separated input column names")
@click.option("--output-keys", help="Comma-separated output column names (reference/ground truth)")
@click.option("--metadata-keys", help="Comma-separated metadata column names (difficulty, type, etc.)")
@click.option("--description", help="Dataset description")
def dataset_create(
    name: str,
    from_csv: Optional[Path],
    input_keys: Optional[str],
    output_keys: Optional[str],
    metadata_keys: Optional[str],
    description: Optional[str],
):
    """Create a dataset (golden set).

    Two modes:
    1. From CSV: --from-csv golden.csv --input-keys query --output-keys expected
    2. Manual (empty): Will create empty dataset to populate later

    Examples:
        # From CSV (SME golden set)
        rem experiments dataset create rem-lookup-golden \\
            --from-csv golden-lookup.csv \\
            --input-keys query \\
            --output-keys expected_label,expected_type \\
            --metadata-keys difficulty,query_type

        # Empty dataset (populate later)
        rem experiments dataset create rem-test --description "Test dataset"
    """
    from rem.services.phoenix import PhoenixClient

    try:
        phoenix = PhoenixClient()

        if not from_csv:
            # No CSV supplied: register an empty dataset to be populated later
            # via `rem experiments dataset add`.
            created = phoenix.create_dataset_from_data(
                name=name,
                inputs=[],
                outputs=[],
                description=description,
            )

            click.echo(f"✓ Created empty dataset '{created.name}'")
            click.echo(" Use 'rem experiments dataset add' to add examples")
            return

        # CSV mode requires both column mappings up front.
        if not input_keys or not output_keys:
            click.echo("Error: --input-keys and --output-keys required for CSV", err=True)
            raise click.Abort()

        created = phoenix.create_dataset_from_csv(
            name=name,
            csv_file_path=from_csv,
            input_keys=input_keys.split(","),
            output_keys=output_keys.split(","),
            metadata_keys=metadata_keys.split(",") if metadata_keys else None,
            description=description,
        )

        click.echo(f"✓ Created dataset '{created.name}' from CSV with {len(created)} examples")

    except Exception as e:
        logger.error(f"Failed to create dataset: {e}")
        click.echo(f"Error: {e}", err=True)
        raise click.Abort()
|
|
815
|
+
|
|
816
|
+
|
|
817
|
+
@dataset.command("add")
@click.argument("dataset_name")
@click.option("--from-csv", type=click.Path(exists=True, path_type=Path), required=True,
              help="CSV file with examples")
@click.option("--input-keys", required=True, help="Comma-separated input column names")
@click.option("--output-keys", required=True, help="Comma-separated output column names")
@click.option("--metadata-keys", help="Comma-separated metadata column names")
def dataset_add(
    dataset_name: str,
    from_csv: Path,
    input_keys: str,
    output_keys: str,
    metadata_keys: Optional[str],
):
    """Add examples to an existing dataset.

    Example:
        rem experiments dataset add rem-lookup-golden \\
            --from-csv new-examples.csv \\
            --input-keys query \\
            --output-keys expected_label,expected_type
    """
    from rem.services.phoenix import PhoenixClient
    import pandas as pd

    try:
        frame = pd.read_csv(from_csv)

        def _records(keys: str) -> list[dict[str, Any]]:
            # Project the named CSV columns and emit one dict per row.
            return cast(list[dict[str, Any]], frame[keys.split(",")].to_dict("records"))

        example_inputs = _records(input_keys)
        example_outputs = _records(output_keys)
        example_metadata = _records(metadata_keys) if metadata_keys else None

        updated = PhoenixClient().add_examples_to_dataset(
            dataset=dataset_name,
            inputs=example_inputs,
            outputs=example_outputs,
            metadata=example_metadata,
        )

        click.echo(f"✓ Added {len(example_inputs)} examples to dataset '{updated.name}'")
        click.echo(f" Total examples: {len(updated)}")

    except Exception as e:
        logger.error(f"Failed to add examples: {e}")
        click.echo(f"Error: {e}", err=True)
        raise click.Abort()
|
|
870
|
+
|
|
871
|
+
|
|
872
|
+
# =============================================================================
|
|
873
|
+
# PROMPT COMMANDS
|
|
874
|
+
# =============================================================================
|
|
875
|
+
|
|
876
|
+
|
|
877
|
+
# Click sub-group: `rem experiments prompt <command>` (create / list).
# The docstring doubles as the CLI help text shown by click.
@experiments.group()
def prompt():
    """Prompt management commands."""
    pass
|
|
881
|
+
|
|
882
|
+
|
|
883
|
+
@prompt.command("create")
@click.argument("name")
@click.option("--system-prompt", "-s", required=True, help="System prompt text")
@click.option("--description", "-d", help="Prompt description")
@click.option("--model-provider", default="OPENAI", help="Model provider (OPENAI, ANTHROPIC)")
@click.option("--model-name", "-m", help="Model name (e.g., gpt-4o, claude-sonnet-4-5)")
@click.option("--type", "-t", "prompt_type", default="Agent", help="Prompt type (Agent or Evaluator)")
def prompt_create(
    name: str,
    system_prompt: str,
    description: Optional[str],
    model_provider: str,
    model_name: Optional[str],
    prompt_type: str,
):
    """Create a prompt.

    Examples:
        # Create agent prompt
        rem experiments prompt create hello-world \\
            --system-prompt "You are a helpful assistant." \\
            --model-name gpt-4o

        # Create evaluator prompt
        rem experiments prompt create correctness-evaluator \\
            --system-prompt "Evaluate the correctness of responses." \\
            --type Evaluator \\
            --model-provider ANTHROPIC \\
            --model-name claude-sonnet-4-5
    """
    from rem.services.phoenix import PhoenixClient
    from rem.services.phoenix.prompt_labels import PhoenixPromptLabels
    from phoenix.client import Client
    from phoenix.client.types.prompts import PromptVersion
    from phoenix.client.__generated__ import v1

    try:
        # Pick a provider-appropriate default model if none was given.
        if not model_name:
            model_name = "gpt-4o" if model_provider == "OPENAI" else "claude-sonnet-4-5-20250929"

        # Reuse REM's Phoenix configuration (base URL + API key).
        phoenix_client = PhoenixClient()
        config = phoenix_client.config

        # Low-level Phoenix client used for prompt creation.
        client = Client(
            base_url=config.base_url,
            api_key=config.api_key
        )

        # A prompt version is a list of messages; we seed only the system message.
        messages = [
            v1.PromptMessage(
                role="system",
                content=system_prompt
            )
        ]

        version = PromptVersion(
            messages,
            model_name=model_name,
            description="v1.0",
            model_provider=model_provider  # type: ignore[arg-type]
        )

        result = client.prompts.create(
            name=name,
            version=version,
            prompt_description=description or f"{prompt_type} prompt: {name}"
        )

        click.echo(f"✓ Created prompt '{name}' (ID: {result.id})")

        # Best-effort: resolve the prompt's GraphQL node ID so labels can be
        # attached. Failures here are non-fatal (the prompt already exists).
        try:
            import httpx
            import json

            # Embed the name via json.dumps so quotes/backslashes in the
            # prompt name cannot break out of (or inject into) the GraphQL
            # string literal. GraphQL string syntax is JSON-compatible.
            query = """
            query {
              prompts(first: 1, filterBy: {name: {equals: %s}}) {
                edges {
                  node {
                    id
                    name
                  }
                }
              }
            }
            """ % json.dumps(name)

            response = httpx.post(
                f"{config.base_url}/graphql",
                json={"query": query},
                headers={"authorization": f"Bearer {config.api_key}"},
                timeout=10,
            )
            graphql_result = response.json()
            prompts = graphql_result.get("data", {}).get("prompts", {}).get("edges", [])

            if prompts:
                prompt_id = prompts[0]["node"]["id"]

                # PhoenixPromptLabels requires a concrete base URL.
                if not config.base_url:
                    raise ValueError("Phoenix base_url is required")
                labels_helper = PhoenixPromptLabels(
                    base_url=config.base_url, api_key=config.api_key
                )

                # Tag with the project label plus the prompt's role (Agent/Evaluator).
                label_names = ["REM", prompt_type]
                labels_helper.assign_prompt_labels(prompt_id, label_names)
                click.echo(f"✓ Assigned labels: {', '.join(label_names)}")
        except Exception as e:
            click.echo(f"⚠ Warning: Could not assign labels: {e}")

        click.echo(f"\nView in UI: {config.base_url}")

    except Exception as e:
        logger.error(f"Failed to create prompt: {e}")
        click.echo(f"Error: {e}", err=True)
        raise click.Abort()
|
|
1007
|
+
|
|
1008
|
+
|
|
1009
|
+
@prompt.command("list")
def prompt_list():
    """List all prompts.

    Example:
        rem experiments prompt list
    """
    import httpx
    from rem.services.phoenix import PhoenixClient

    try:
        config = PhoenixClient().config

        # Static GraphQL query; no user input is interpolated here.
        query = """
        query {
          prompts(first: 100) {
            edges {
              node {
                id
                name
                description
                createdAt
              }
            }
          }
        }
        """

        reply = httpx.post(
            f"{config.base_url}/graphql",
            json={"query": query},
            headers={"authorization": f"Bearer {config.api_key}"},
            timeout=10,
        )

        edges = reply.json().get("data", {}).get("prompts", {}).get("edges", [])

        if not edges:
            click.echo("No prompts found")
            return

        # Fixed-width table header.
        click.echo(f"\nPrompts ({len(edges)} total):\n")
        click.echo(f"{'Name':<40} {'Created':<20}")
        click.echo("-" * 65)

        for edge in edges:
            node = edge["node"]
            # Truncate name/timestamp so columns stay aligned.
            click.echo(f"{node.get('name', '')[:40]:<40} {node.get('createdAt', '')[:19]:<20}")

    except Exception as e:
        logger.error(f"Failed to list prompts: {e}")
        click.echo(f"Error: {e}", err=True)
        raise click.Abort()
|
|
1066
|
+
|
|
1067
|
+
|
|
1068
|
+
# =============================================================================
|
|
1069
|
+
# TRACE COMMANDS
|
|
1070
|
+
# =============================================================================
|
|
1071
|
+
|
|
1072
|
+
|
|
1073
|
+
# Click sub-group: `rem experiments trace <command>` (list).
# The docstring doubles as the CLI help text shown by click.
@experiments.group()
def trace():
    """Trace retrieval commands."""
    pass
|
|
1077
|
+
|
|
1078
|
+
|
|
1079
|
+
@trace.command("list")
@click.option("--project", "-p", help="Filter by project name")
@click.option("--days", "-d", default=7, help="Number of days to look back")
@click.option("--limit", "-l", default=20, help="Maximum traces to return")
def trace_list(
    project: Optional[str],
    days: int,
    limit: int,
):
    """List recent traces.

    Example:
        rem experiments trace list --project rem-agents --days 7 --limit 50
    """
    from rem.services.phoenix import PhoenixClient
    from datetime import datetime, timedelta

    try:
        lookback_start = datetime.now() - timedelta(days=days)

        frame = PhoenixClient().get_traces(
            project_name=project,
            start_time=lookback_start,
            limit=limit,
        )

        if len(frame) == 0:
            click.echo("No traces found")
            return

        # Fixed-width table header.
        click.echo(f"\nRecent Traces ({len(frame)} results):\n")
        click.echo(f"{'Span ID':<15} {'Name':<30} {'Start Time':<20}")
        click.echo("-" * 70)

        # .head(limit) is defensive; get_traces was already asked to cap rows.
        for _, span_row in frame.head(limit).iterrows():
            short_id = str(span_row.get("context.span_id", ""))[:12]
            span_name = str(span_row.get("name", ""))[:30]
            started = str(span_row.get("start_time", ""))[:19]
            click.echo(f"{short_id:<15} {span_name:<30} {started:<20}")

    except Exception as e:
        logger.error(f"Failed to list traces: {e}")
        click.echo(f"Error: {e}", err=True)
        raise click.Abort()
|