remdb-0.3.242-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of remdb might be problematic.

Files changed (235)
  1. rem/__init__.py +129 -0
  2. rem/agentic/README.md +760 -0
  3. rem/agentic/__init__.py +54 -0
  4. rem/agentic/agents/README.md +155 -0
  5. rem/agentic/agents/__init__.py +38 -0
  6. rem/agentic/agents/agent_manager.py +311 -0
  7. rem/agentic/agents/sse_simulator.py +502 -0
  8. rem/agentic/context.py +425 -0
  9. rem/agentic/context_builder.py +360 -0
  10. rem/agentic/llm_provider_models.py +301 -0
  11. rem/agentic/mcp/__init__.py +0 -0
  12. rem/agentic/mcp/tool_wrapper.py +273 -0
  13. rem/agentic/otel/__init__.py +5 -0
  14. rem/agentic/otel/setup.py +240 -0
  15. rem/agentic/providers/phoenix.py +926 -0
  16. rem/agentic/providers/pydantic_ai.py +854 -0
  17. rem/agentic/query.py +117 -0
  18. rem/agentic/query_helper.py +89 -0
  19. rem/agentic/schema.py +737 -0
  20. rem/agentic/serialization.py +245 -0
  21. rem/agentic/tools/__init__.py +5 -0
  22. rem/agentic/tools/rem_tools.py +242 -0
  23. rem/api/README.md +657 -0
  24. rem/api/deps.py +253 -0
  25. rem/api/main.py +460 -0
  26. rem/api/mcp_router/prompts.py +182 -0
  27. rem/api/mcp_router/resources.py +820 -0
  28. rem/api/mcp_router/server.py +243 -0
  29. rem/api/mcp_router/tools.py +1605 -0
  30. rem/api/middleware/tracking.py +172 -0
  31. rem/api/routers/admin.py +520 -0
  32. rem/api/routers/auth.py +898 -0
  33. rem/api/routers/chat/__init__.py +5 -0
  34. rem/api/routers/chat/child_streaming.py +394 -0
  35. rem/api/routers/chat/completions.py +702 -0
  36. rem/api/routers/chat/json_utils.py +76 -0
  37. rem/api/routers/chat/models.py +202 -0
  38. rem/api/routers/chat/otel_utils.py +33 -0
  39. rem/api/routers/chat/sse_events.py +546 -0
  40. rem/api/routers/chat/streaming.py +950 -0
  41. rem/api/routers/chat/streaming_utils.py +327 -0
  42. rem/api/routers/common.py +18 -0
  43. rem/api/routers/dev.py +87 -0
  44. rem/api/routers/feedback.py +276 -0
  45. rem/api/routers/messages.py +620 -0
  46. rem/api/routers/models.py +86 -0
  47. rem/api/routers/query.py +362 -0
  48. rem/api/routers/shared_sessions.py +422 -0
  49. rem/auth/README.md +258 -0
  50. rem/auth/__init__.py +36 -0
  51. rem/auth/jwt.py +367 -0
  52. rem/auth/middleware.py +318 -0
  53. rem/auth/providers/__init__.py +16 -0
  54. rem/auth/providers/base.py +376 -0
  55. rem/auth/providers/email.py +215 -0
  56. rem/auth/providers/google.py +163 -0
  57. rem/auth/providers/microsoft.py +237 -0
  58. rem/cli/README.md +517 -0
  59. rem/cli/__init__.py +8 -0
  60. rem/cli/commands/README.md +299 -0
  61. rem/cli/commands/__init__.py +3 -0
  62. rem/cli/commands/ask.py +549 -0
  63. rem/cli/commands/cluster.py +1808 -0
  64. rem/cli/commands/configure.py +495 -0
  65. rem/cli/commands/db.py +828 -0
  66. rem/cli/commands/dreaming.py +324 -0
  67. rem/cli/commands/experiments.py +1698 -0
  68. rem/cli/commands/mcp.py +66 -0
  69. rem/cli/commands/process.py +388 -0
  70. rem/cli/commands/query.py +109 -0
  71. rem/cli/commands/scaffold.py +47 -0
  72. rem/cli/commands/schema.py +230 -0
  73. rem/cli/commands/serve.py +106 -0
  74. rem/cli/commands/session.py +453 -0
  75. rem/cli/dreaming.py +363 -0
  76. rem/cli/main.py +123 -0
  77. rem/config.py +244 -0
  78. rem/mcp_server.py +41 -0
  79. rem/models/core/__init__.py +49 -0
  80. rem/models/core/core_model.py +70 -0
  81. rem/models/core/engram.py +333 -0
  82. rem/models/core/experiment.py +672 -0
  83. rem/models/core/inline_edge.py +132 -0
  84. rem/models/core/rem_query.py +246 -0
  85. rem/models/entities/__init__.py +68 -0
  86. rem/models/entities/domain_resource.py +38 -0
  87. rem/models/entities/feedback.py +123 -0
  88. rem/models/entities/file.py +57 -0
  89. rem/models/entities/image_resource.py +88 -0
  90. rem/models/entities/message.py +64 -0
  91. rem/models/entities/moment.py +123 -0
  92. rem/models/entities/ontology.py +181 -0
  93. rem/models/entities/ontology_config.py +131 -0
  94. rem/models/entities/resource.py +95 -0
  95. rem/models/entities/schema.py +87 -0
  96. rem/models/entities/session.py +84 -0
  97. rem/models/entities/shared_session.py +180 -0
  98. rem/models/entities/subscriber.py +175 -0
  99. rem/models/entities/user.py +93 -0
  100. rem/py.typed +0 -0
  101. rem/registry.py +373 -0
  102. rem/schemas/README.md +507 -0
  103. rem/schemas/__init__.py +6 -0
  104. rem/schemas/agents/README.md +92 -0
  105. rem/schemas/agents/core/agent-builder.yaml +235 -0
  106. rem/schemas/agents/core/moment-builder.yaml +178 -0
  107. rem/schemas/agents/core/rem-query-agent.yaml +226 -0
  108. rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
  109. rem/schemas/agents/core/simple-assistant.yaml +19 -0
  110. rem/schemas/agents/core/user-profile-builder.yaml +163 -0
  111. rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
  112. rem/schemas/agents/examples/contract-extractor.yaml +134 -0
  113. rem/schemas/agents/examples/cv-parser.yaml +263 -0
  114. rem/schemas/agents/examples/hello-world.yaml +37 -0
  115. rem/schemas/agents/examples/query.yaml +54 -0
  116. rem/schemas/agents/examples/simple.yaml +21 -0
  117. rem/schemas/agents/examples/test.yaml +29 -0
  118. rem/schemas/agents/rem.yaml +132 -0
  119. rem/schemas/evaluators/hello-world/default.yaml +77 -0
  120. rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
  121. rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
  122. rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
  123. rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
  124. rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
  125. rem/services/__init__.py +18 -0
  126. rem/services/audio/INTEGRATION.md +308 -0
  127. rem/services/audio/README.md +376 -0
  128. rem/services/audio/__init__.py +15 -0
  129. rem/services/audio/chunker.py +354 -0
  130. rem/services/audio/transcriber.py +259 -0
  131. rem/services/content/README.md +1269 -0
  132. rem/services/content/__init__.py +5 -0
  133. rem/services/content/providers.py +760 -0
  134. rem/services/content/service.py +762 -0
  135. rem/services/dreaming/README.md +230 -0
  136. rem/services/dreaming/__init__.py +53 -0
  137. rem/services/dreaming/affinity_service.py +322 -0
  138. rem/services/dreaming/moment_service.py +251 -0
  139. rem/services/dreaming/ontology_service.py +54 -0
  140. rem/services/dreaming/user_model_service.py +297 -0
  141. rem/services/dreaming/utils.py +39 -0
  142. rem/services/email/__init__.py +10 -0
  143. rem/services/email/service.py +522 -0
  144. rem/services/email/templates.py +360 -0
  145. rem/services/embeddings/__init__.py +11 -0
  146. rem/services/embeddings/api.py +127 -0
  147. rem/services/embeddings/worker.py +435 -0
  148. rem/services/fs/README.md +662 -0
  149. rem/services/fs/__init__.py +62 -0
  150. rem/services/fs/examples.py +206 -0
  151. rem/services/fs/examples_paths.py +204 -0
  152. rem/services/fs/git_provider.py +935 -0
  153. rem/services/fs/local_provider.py +760 -0
  154. rem/services/fs/parsing-hooks-examples.md +172 -0
  155. rem/services/fs/paths.py +276 -0
  156. rem/services/fs/provider.py +460 -0
  157. rem/services/fs/s3_provider.py +1042 -0
  158. rem/services/fs/service.py +186 -0
  159. rem/services/git/README.md +1075 -0
  160. rem/services/git/__init__.py +17 -0
  161. rem/services/git/service.py +469 -0
  162. rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
  163. rem/services/phoenix/README.md +453 -0
  164. rem/services/phoenix/__init__.py +46 -0
  165. rem/services/phoenix/client.py +960 -0
  166. rem/services/phoenix/config.py +88 -0
  167. rem/services/phoenix/prompt_labels.py +477 -0
  168. rem/services/postgres/README.md +757 -0
  169. rem/services/postgres/__init__.py +49 -0
  170. rem/services/postgres/diff_service.py +599 -0
  171. rem/services/postgres/migration_service.py +427 -0
  172. rem/services/postgres/programmable_diff_service.py +635 -0
  173. rem/services/postgres/pydantic_to_sqlalchemy.py +562 -0
  174. rem/services/postgres/register_type.py +353 -0
  175. rem/services/postgres/repository.py +481 -0
  176. rem/services/postgres/schema_generator.py +661 -0
  177. rem/services/postgres/service.py +802 -0
  178. rem/services/postgres/sql_builder.py +355 -0
  179. rem/services/rate_limit.py +113 -0
  180. rem/services/rem/README.md +318 -0
  181. rem/services/rem/__init__.py +23 -0
  182. rem/services/rem/exceptions.py +71 -0
  183. rem/services/rem/executor.py +293 -0
  184. rem/services/rem/parser.py +180 -0
  185. rem/services/rem/queries.py +196 -0
  186. rem/services/rem/query.py +371 -0
  187. rem/services/rem/service.py +608 -0
  188. rem/services/session/README.md +374 -0
  189. rem/services/session/__init__.py +13 -0
  190. rem/services/session/compression.py +488 -0
  191. rem/services/session/pydantic_messages.py +310 -0
  192. rem/services/session/reload.py +85 -0
  193. rem/services/user_service.py +130 -0
  194. rem/settings.py +1877 -0
  195. rem/sql/background_indexes.sql +52 -0
  196. rem/sql/migrations/001_install.sql +983 -0
  197. rem/sql/migrations/002_install_models.sql +3157 -0
  198. rem/sql/migrations/003_optional_extensions.sql +326 -0
  199. rem/sql/migrations/004_cache_system.sql +282 -0
  200. rem/sql/migrations/005_schema_update.sql +145 -0
  201. rem/sql/migrations/migrate_session_id_to_uuid.sql +45 -0
  202. rem/utils/AGENTIC_CHUNKING.md +597 -0
  203. rem/utils/README.md +628 -0
  204. rem/utils/__init__.py +61 -0
  205. rem/utils/agentic_chunking.py +622 -0
  206. rem/utils/batch_ops.py +343 -0
  207. rem/utils/chunking.py +108 -0
  208. rem/utils/clip_embeddings.py +276 -0
  209. rem/utils/constants.py +97 -0
  210. rem/utils/date_utils.py +228 -0
  211. rem/utils/dict_utils.py +98 -0
  212. rem/utils/embeddings.py +436 -0
  213. rem/utils/examples/embeddings_example.py +305 -0
  214. rem/utils/examples/sql_types_example.py +202 -0
  215. rem/utils/files.py +323 -0
  216. rem/utils/markdown.py +16 -0
  217. rem/utils/mime_types.py +158 -0
  218. rem/utils/model_helpers.py +492 -0
  219. rem/utils/schema_loader.py +649 -0
  220. rem/utils/sql_paths.py +146 -0
  221. rem/utils/sql_types.py +350 -0
  222. rem/utils/user_id.py +81 -0
  223. rem/utils/vision.py +325 -0
  224. rem/workers/README.md +506 -0
  225. rem/workers/__init__.py +7 -0
  226. rem/workers/db_listener.py +579 -0
  227. rem/workers/db_maintainer.py +74 -0
  228. rem/workers/dreaming.py +502 -0
  229. rem/workers/engram_processor.py +312 -0
  230. rem/workers/sqs_file_processor.py +193 -0
  231. rem/workers/unlogged_maintainer.py +463 -0
  232. remdb-0.3.242.dist-info/METADATA +1632 -0
  233. remdb-0.3.242.dist-info/RECORD +235 -0
  234. remdb-0.3.242.dist-info/WHEEL +4 -0
  235. remdb-0.3.242.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,1698 @@
+"""
+Experiment management CLI commands.
+
+Experiments use ExperimentConfig (rem/models/core/experiment.py) for configuration
+and support Git+S3 hybrid storage. Includes dataset, prompt, and trace management.
+
+Directory Structure:
+    experiments/{experiment-name}/
+    ├── experiment.yaml        # ExperimentConfig (metadata, agent ref, evaluator ref)
+    ├── README.md              # Auto-generated documentation
+    ├── ground-truth/          # Evaluation datasets (Q&A pairs)
+    │   ├── dataset.csv        # Input/output pairs for evaluation
+    │   └── dataset.yaml       # Alternative YAML format
+    ├── seed-data/             # Data to seed REM before running experiments
+    │   └── data.yaml          # Users, resources, moments in REM format
+    └── results/               # Experiment results and metrics
+        └── {run-timestamp}/   # Each run gets its own timestamped folder
+            ├── metrics.json   # Summary metrics
+            └── run_info.json  # Run metadata (eval framework URLs, etc.)
+
+Environment Variables:
+    EXPERIMENTS_HOME: Override default experiment directory (default: "experiments")
+
+Commands:
+    # Experiment lifecycle
+    rem experiments create <name> --agent <agent> --evaluator <evaluator>
+    rem experiments list
+    rem experiments show <name>
+    rem experiments run <name> [--version <tag>]
+
+    # Dataset management
+    rem experiments dataset list
+    rem experiments dataset create <name> --from-csv data.csv
+    rem experiments dataset add <name> --from-csv data.csv
+
+    # Prompt management
+    rem experiments prompt list
+    rem experiments prompt create <name> --system-prompt "..."
+
+    # Trace retrieval
+    rem experiments trace list --project <name>
+"""
+
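All the commands in this module resolve their working directory the same way, so it is worth seeing in isolation. A minimal sketch of that precedence chain (a hypothetical standalone helper; the commands below inline this logic rather than calling a shared function):

```python
import os
from pathlib import Path

def resolve_experiments_home(cli_value: str | None = None) -> Path:
    # Precedence: --base-path CLI option > EXPERIMENTS_HOME env var > "experiments"
    return Path(cli_value or os.getenv("EXPERIMENTS_HOME", "experiments"))
```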
+import asyncio
+from pathlib import Path
+from typing import Any, Optional, cast
+
+import click
+from loguru import logger
+
+
+@click.group()
+def experiments():
+    """Experiment configuration and execution commands."""
+    pass
+
+
+# =============================================================================
+# CREATE COMMAND
+# =============================================================================
+
+
+@experiments.command("create")
+@click.argument("name")
+@click.option("--agent", "-a", required=True, help="Agent schema name (e.g., 'cv-parser')")
+@click.option("--task", "-t", default="general", help="Task name for organizing experiments (e.g., 'risk-assessment')")
+@click.option("--evaluator", "-e", default="default", help="Evaluator schema name (default: 'default')")
+@click.option("--description", "-d", help="Experiment description")
+@click.option("--dataset-location", type=click.Choice(["git", "s3", "hybrid"]), default="git",
+              help="Where to store datasets")
+@click.option("--results-location", type=click.Choice(["git", "s3", "hybrid"]), default="git",
+              help="Where to store results")
+@click.option("--tags", help="Comma-separated tags (e.g., 'production,cv-parser')")
+@click.option("--base-path", help="Base directory for experiments (default: EXPERIMENTS_HOME or 'experiments')")
+def create(
+    name: str,
+    agent: str,
+    task: str,
+    evaluator: str,
+    description: Optional[str],
+    dataset_location: str,
+    results_location: str,
+    tags: Optional[str],
+    base_path: Optional[str],
+):
+    """Create a new experiment configuration.
+
+    Creates directory structure and generates experiment.yaml and README.md.
+
+    The experiment directory will contain:
+    - ground-truth/: Q&A pairs for evaluation
+    - seed-data/: REM data (users, resources, moments) to load before running
+    - results/: Timestamped run results
+
+    Examples:
+        # Small experiment (Git-only)
+        rem experiments create hello-world-validation \\
+            --agent hello-world \\
+            --evaluator default \\
+            --description "Smoke test for hello-world agent"
+
+        # Large experiment (Hybrid storage)
+        rem experiments create cv-parser-production \\
+            --agent cv-parser \\
+            --evaluator default \\
+            --description "Production CV parser evaluation" \\
+            --dataset-location s3 \\
+            --results-location hybrid \\
+            --tags "production,cv-parser,weekly"
+
+        # Custom location
+        EXPERIMENTS_HOME=/path/to/experiments rem experiments create my-test --agent my-agent
+    """
+    from rem.models.core.experiment import (
+        ExperimentConfig,
+        DatasetLocation,
+        DatasetReference,
+        SchemaReference,
+        ResultsConfig,
+        ExperimentStatus,
+    )
+    import os
+
+    try:
+        # Resolve base path: CLI arg > EXPERIMENTS_HOME env var > default "experiments"
+        if base_path is None:
+            base_path = os.getenv("EXPERIMENTS_HOME", "experiments")
+        # Build dataset reference (format auto-detected from file extension)
+        if dataset_location == "git":
+            dataset_ref = DatasetReference(
+                location=DatasetLocation.GIT,
+                path="ground-truth/dataset.csv",
+                description="Ground truth Q&A dataset for evaluation"
+            )
+        else:  # s3 or hybrid
+            dataset_ref = DatasetReference(
+                location=DatasetLocation(dataset_location),
+                path=f"s3://rem-experiments/{name}/datasets/ground_truth.parquet",
+                schema_path="datasets/schema.yaml" if dataset_location == "hybrid" else None,
+                description="Ground truth dataset for evaluation"
+            )
+
+        # Build results config
+        if results_location == "git":
+            results_config = ResultsConfig(
+                location=DatasetLocation.GIT,
+                base_path="results/",
+                save_traces=False,
+                save_metrics_summary=True
+            )
+        elif results_location == "s3":
+            results_config = ResultsConfig(
+                location=DatasetLocation.S3,
+                base_path=f"s3://rem-experiments/{name}/results/",
+                save_traces=True,
+                save_metrics_summary=False
+            )
+        else:  # hybrid
+            results_config = ResultsConfig(
+                location=DatasetLocation.HYBRID,
+                base_path=f"s3://rem-experiments/{name}/results/",
+                save_traces=True,
+                save_metrics_summary=True,
+                metrics_file="metrics.json"
+            )
+
+        # Parse tags
+        tag_list = [t.strip() for t in tags.split(",")] if tags else []
+
+        # Create experiment config
+        config = ExperimentConfig(
+            name=name,
+            task=task,
+            description=description or f"Evaluation experiment for {agent} agent ({task} task)",
+            agent_schema_ref=SchemaReference(
+                name=agent,
+                version=None,  # Use latest by default
+                type="agent"
+            ),
+            evaluator_schema_ref=SchemaReference(
+                name=evaluator,
+                type="evaluator"
+            ),
+            datasets={"ground_truth": dataset_ref},
+            results=results_config,
+            status=ExperimentStatus.DRAFT,
+            tags=tag_list
+        )
+
+        # Save configuration
+        config_path = config.save(base_path)
+        readme_path = config.save_readme(base_path)
+
+        # Create new directory structure
+        exp_dir = config.get_experiment_dir(base_path)
+
+        # Create ground-truth directory
+        ground_truth_dir = exp_dir / "ground-truth"
+        ground_truth_dir.mkdir(parents=True, exist_ok=True)
+
+        # Create seed-data directory
+        seed_data_dir = exp_dir / "seed-data"
+        seed_data_dir.mkdir(parents=True, exist_ok=True)
+
+        # Create results directory if Git-based
+        if results_location == "git":
+            results_dir = exp_dir / "results"
+            results_dir.mkdir(parents=True, exist_ok=True)
+
+        # Create placeholder files with documentation
+        ground_truth_readme = ground_truth_dir / "README.md"
+        ground_truth_readme.write_text("""# Ground Truth Dataset
+
+This directory contains Q&A pairs for evaluating the agent.
+
+## Format
+
+**CSV format** (`dataset.csv`):
+```csv
+input,expected_output,metadata
+"What is the capital of France?","Paris","{\"difficulty\": \"easy\"}"
+```
+
+**YAML format** (`dataset.yaml`):
+```yaml
+- input: "What is the capital of France?"
+  expected_output: "Paris"
+  metadata:
+    difficulty: easy
+```
+
+## Generating Ground Truth
+
+### Using AI Assistants
+
+AI coding assistants (like Claude, GPT-4, etc.) can help generate comprehensive ground-truth datasets:
+
+1. **Generate from existing examples**: Show the assistant examples from your domain and ask it to create similar Q&A pairs
+2. **Create challenging questions**: Ask the assistant to act as a judge and generate HARD questions that test edge cases
+3. **Vary difficulty levels**: Request a mix of easy, medium, and hard questions with appropriate metadata tags
+
+Example prompt:
+```
+Based on these example documents about [your domain], generate 20 Q&A pairs
+for evaluating an agent. Include:
+- 5 easy factual questions
+- 10 medium questions requiring reasoning
+- 5 hard questions with edge cases
+Format as CSV with difficulty and category metadata.
+```
+
+### Ground Truth as Judge
+
+**Important**: Keep ground-truth data **separate** from the agent being tested:
+- Ground truth should be hidden from the agent during evaluation
+- The agent should only see the `input` field
+- The evaluator compares agent output against `expected_output`
+- This ensures unbiased evaluation
+
+### Quality Guidelines
+
+1. **Diverse Coverage**: Include various question types and difficulty levels
+2. **Domain-Specific**: Use terminology and scenarios from your actual use case
+3. **Metadata Tags**: Add difficulty, category, priority for analysis
+4. **SME Review**: Have domain experts validate expected outputs
+
+## Usage
+
+These datasets can be:
+- Loaded into evaluation frameworks (Arize Phoenix, etc.)
+- Used for regression testing
+- Converted to different formats as needed
+
+The experiment runner will automatically use this data for evaluation.
+""")
+
+        seed_data_readme = seed_data_dir / "README.md"
+        seed_data_readme.write_text("""# Seed Data
+
+This directory contains REM data to load before running the experiment.
+
+## Format
+
+Use standard REM YAML format:
+
+```yaml
+users:
+  - id: test-user-001
+    user_id: experiment-test
+    email: test@example.com
+
+resources:
+  - id: resource-001
+    user_id: experiment-test
+    label: example-document
+    content: "Document content here..."
+
+moments:
+  - id: moment-001
+    user_id: experiment-test
+    label: example-meeting
+    starts_timestamp: "2024-01-15T14:00:00"
+```
+
+## Generating Seed Data
+
+### Using AI Assistants
+
+AI coding assistants can help generate realistic seed data for your experiments:
+
+1. **From existing datasets**: Reference examples from the `datasets/` directory
+2. **Domain-specific scenarios**: Describe your use case and ask for appropriate test data
+3. **Anonymized versions**: Ask to create fictional data based on real patterns
+
+Example prompt:
+```
+Based on the recruitment dataset examples in datasets/domains/recruitment/,
+generate seed data for testing a CV parser agent. Include:
+- 3 test users
+- 5 CV documents (resources) with varied experience levels
+- 2 interview moment entries
+Use fictional names and anonymize all content.
+```
+
+### Best Practices
+
+1. **Minimal**: Only include data necessary for the ground-truth questions to be answerable
+2. **Anonymized**: Always use fictional names, companies, and content
+3. **Relevant**: Seed data should provide context for evaluation questions
+4. **Versioned**: Track changes to seed data in Git for reproducibility
+
+## Usage
+
+Load this data before running experiments:
+```bash
+rem db load --file seed-data/data.yaml --user-id experiment-test
+```
+
+This ensures your agent has the necessary context for evaluation.
+""")
+
+        click.echo(f"\n✓ Created experiment: {name}")
+        click.echo(f" Configuration: {config_path}")
+        click.echo(f" Documentation: {readme_path}")
+        click.echo(f" Ground Truth: {ground_truth_dir}")
+        click.echo(f" Seed Data: {seed_data_dir}")
+        if results_location == "git":
+            click.echo(f" Results: {results_dir}")
+        click.echo(f"\nNext steps:")
+        click.echo(f" 1. Add ground truth Q&A to {ground_truth_dir}/dataset.csv")
+        click.echo(f" 2. Add seed data to {seed_data_dir}/data.yaml (optional)")
+        click.echo(f" 3. Review configuration: {config_path}")
+        click.echo(f" 4. Run experiment: rem experiments run {name}")
+        click.echo(f" 5. Commit to Git: git add {base_path}/{name}/ && git commit")
+
+    except Exception as e:
+        logger.error(f"Failed to create experiment: {e}")
+        click.echo(f"Error: {e}", err=True)
+        raise click.Abort()
+
+
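The experiment.yaml written by `create` round-trips through `ExperimentConfig.from_yaml`, which is how `list`, `show`, and `run` below read it back. A minimal sketch of that round trip (the experiment name and path are illustrative):

```python
from pathlib import Path
from rem.models.core.experiment import ExperimentConfig

config = ExperimentConfig.from_yaml(
    Path("experiments") / "hello-world-validation" / "experiment.yaml"
)
print(config.name, config.status.value, config.agent_schema_ref.name)
```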
+# =============================================================================
+# LIST COMMAND
+# =============================================================================
+
+
+@experiments.command("list")
+@click.option("--base-path", help="Base directory for experiments (default: EXPERIMENTS_HOME or 'experiments')")
+@click.option("--status", help="Filter by status (draft, ready, completed, etc.)")
+@click.option("--tags", help="Filter by tags (comma-separated)")
+def list_experiments(
+    base_path: Optional[str],
+    status: Optional[str],
+    tags: Optional[str],
+):
+    """List all experiments.
+
+    Examples:
+        rem experiments list
+        rem experiments list --status ready
+        rem experiments list --tags production,cv-parser
+    """
+    from rem.models.core.experiment import ExperimentConfig, ExperimentStatus
+    import os
+
+    try:
+        # Resolve base path
+        if base_path is None:
+            base_path = os.getenv("EXPERIMENTS_HOME", "experiments")
+
+        experiments_dir = Path(base_path)
+        if not experiments_dir.exists():
+            click.echo(f"No experiments directory found at {base_path}")
+            return
+
+        # Find all experiment.yaml files
+        configs = []
+        for exp_dir in experiments_dir.iterdir():
+            if not exp_dir.is_dir() or exp_dir.name.startswith("."):
+                continue
+
+            config_file = exp_dir / "experiment.yaml"
+            if config_file.exists():
+                try:
+                    config = ExperimentConfig.from_yaml(config_file)
+                    configs.append(config)
+                except Exception as e:
+                    logger.warning(f"Failed to load {config_file}: {e}")
+
+        # Apply filters
+        if status:
+            status_enum = ExperimentStatus(status)
+            configs = [c for c in configs if c.status == status_enum]
+
+        if tags:
+            filter_tags = set(t.strip().lower() for t in tags.split(","))
+            configs = [c for c in configs if filter_tags & set(c.tags)]
+
+        if not configs:
+            click.echo("No experiments found")
+            return
+
+        # Sort by updated_at descending
+        configs.sort(key=lambda c: c.updated_at, reverse=True)
+
+        # Display table
+        click.echo(f"\nExperiments ({len(configs)} total):\n")
+        click.echo(f"{'Name':<30} {'Status':<12} {'Agent':<20} {'Updated':<12}")
+        click.echo("-" * 75)
+
+        for config in configs:
+            name = config.name[:30]
+            status_str = config.status.value[:12]
+            agent = config.agent_schema_ref.name[:20]
+            updated = config.updated_at.strftime("%Y-%m-%d")
+            click.echo(f"{name:<30} {status_str:<12} {agent:<20} {updated:<12}")
+
+    except Exception as e:
+        logger.error(f"Failed to list experiments: {e}")
+        click.echo(f"Error: {e}", err=True)
+        raise click.Abort()
+
+
+# =============================================================================
+# SHOW COMMAND
+# =============================================================================
+
+
+@experiments.command("show")
+@click.argument("name")
+@click.option("--base-path", help="Base directory for experiments (default: EXPERIMENTS_HOME or 'experiments')")
+def show(name: str, base_path: Optional[str]):
+    """Show experiment details.
+
+    Examples:
+        rem experiments show hello-world-validation
+    """
+    from rem.models.core.experiment import ExperimentConfig
+    import os
+
+    try:
+        # Resolve base path
+        if base_path is None:
+            base_path = os.getenv("EXPERIMENTS_HOME", "experiments")
+
+        config_path = Path(base_path) / name / "experiment.yaml"
+        if not config_path.exists():
+            click.echo(f"Experiment not found: {name}")
+            click.echo(f" Looked in: {config_path}")
+            raise click.Abort()
+
+        config = ExperimentConfig.from_yaml(config_path)
+
+        click.echo(f"\nExperiment: {config.name}")
+        click.echo(f"{'=' * 60}\n")
+        click.echo(f"Description: {config.description}")
+        click.echo(f"Status: {config.status.value}")
+        if config.tags:
+            click.echo(f"Tags: {', '.join(config.tags)}")
+
+        click.echo(f"\nAgent Schema:")
+        click.echo(f" Name: {config.agent_schema_ref.name}")
+        click.echo(f" Version: {config.agent_schema_ref.version or 'latest'}")
+
+        click.echo(f"\nEvaluator Schema:")
+        click.echo(f" Name: {config.evaluator_schema_ref.name}")
+
+        click.echo(f"\nDatasets:")
+        for ds_name, ds_ref in config.datasets.items():
+            click.echo(f" {ds_name}:")
+            click.echo(f" Location: {ds_ref.location.value}")
+            click.echo(f" Path: {ds_ref.path}")
+            click.echo(f" Format: {ds_ref.format}")
+
+        click.echo(f"\nResults:")
+        click.echo(f" Location: {config.results.location.value}")
+        click.echo(f" Base Path: {config.results.base_path}")
+        click.echo(f" Save Traces: {config.results.save_traces}")
+        click.echo(f" Metrics File: {config.results.metrics_file}")
+
+        click.echo(f"\nTimestamps:")
+        click.echo(f" Created: {config.created_at.isoformat()}")
+        click.echo(f" Updated: {config.updated_at.isoformat()}")
+        if config.last_run_at:
+            click.echo(f" Last Run: {config.last_run_at.isoformat()}")
+
+        if config.metadata:
+            click.echo(f"\nMetadata:")
+            for key, value in config.metadata.items():
+                click.echo(f" {key}: {value}")
+
+    except Exception as e:
+        logger.error(f"Failed to show experiment: {e}")
+        click.echo(f"Error: {e}", err=True)
+        raise click.Abort()
+
+
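Note that the tag filter in `list_experiments` keeps an experiment when any requested tag overlaps its tags (set intersection), while the status filter is an exact enum match. A quick illustration of the intersection semantics:

```python
filter_tags = {"production", "cv-parser"}

assert filter_tags & {"production", "weekly"}         # overlap -> experiment kept
assert not (filter_tags & {"staging", "smoke-test"})  # no overlap -> dropped
```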
+# =============================================================================
+# VIBES MODE HELPER
+# =============================================================================
+
+
+def _run_vibes_mode(
+    config: Any,
+    dataset_df: Any,
+    task_fn: Any,
+    base_path: str,
+    limit: Optional[int],
+    evaluator_schema_path: Path,
+) -> None:
+    """Run experiment in vibes mode - execute agent and export for AI evaluation.
+
+    Vibes mode runs the agent on each example and saves results to a JSONL file.
+    The AI assistant (e.g., Claude Code) then acts as the judge using the
+    evaluator schema to evaluate results.
+
+    Args:
+        config: ExperimentConfig object
+        dataset_df: Polars DataFrame with ground truth examples
+        task_fn: Function to run agent on each example
+        base_path: Base directory for experiments
+        limit: Optional limit on number of examples to process
+        evaluator_schema_path: Path to the evaluator schema YAML file
+    """
+    from rem.utils.date_utils import format_timestamp_for_experiment, utc_now, to_iso
+    import json
+
+    # Apply limit if specified
+    if limit:
+        dataset_df = dataset_df.head(limit)
+        click.echo(f" (Limited to {limit} examples)")
+
+    # Create results directory
+    timestamp = format_timestamp_for_experiment()
+    results_dir = Path(base_path) / config.name / "results" / timestamp
+    results_dir.mkdir(parents=True, exist_ok=True)
+
+    click.echo(f"\n⏳ Running agent on {len(dataset_df)} examples...")
+    click.echo(f" Results will be saved to: {results_dir}")
+    click.echo()
+
+    # Run agent on each example and collect results
+    results = []
+    records = dataset_df.to_dicts()
+
+    for i, record in enumerate(records, 1):
+        example_id = record.get("id", i)
+        click.echo(f" [{i}/{len(records)}] Processing example {example_id}...", nl=False)
+
+        try:
+            # Prepare input for agent
+            input_text = record.get("text", record.get("input", record.get("query", "")))
+            example_input = {"query": input_text} if isinstance(input_text, str) else input_text
+
+            # Run agent
+            output = task_fn({"input": example_input})
+
+            result = {
+                "id": example_id,
+                "input": input_text,
+                "ground_truth": record.get("ground_truth", record.get("expected_output", "")),
+                "category": record.get("category", ""),
+                "agent_output": output,
+                "status": "success",
+            }
+            click.echo(" ✓")
+
+        except Exception as e:
+            result = {
+                "id": example_id,
+                "input": record.get("text", record.get("input", "")),
+                "ground_truth": record.get("ground_truth", record.get("expected_output", "")),
+                "category": record.get("category", ""),
+                "agent_output": None,
+                "status": "error",
+                "error": str(e),
+            }
+            click.echo(f" ✗ ({e})")
+
+        results.append(result)
+
+    # Save results to JSONL
+    results_file = results_dir / "vibes-results.jsonl"
+    with open(results_file, "w") as f:
+        for result in results:
+            f.write(json.dumps(result) + "\n")
+
+    # Copy evaluator schema to results dir for easy reference
+    import shutil
+    evaluator_copy = results_dir / "evaluator-schema.yaml"
+    shutil.copy(evaluator_schema_path, evaluator_copy)
+
+    # Save run metadata
+    run_info = {
+        "experiment": config.name,
+        "agent": config.agent_schema_ref.name,
+        "evaluator": config.evaluator_schema_ref.name,
+        "mode": "vibes",
+        "timestamp": timestamp,
+        "total_examples": len(records),
+        "successful": len([r for r in results if r["status"] == "success"]),
+        "failed": len([r for r in results if r["status"] == "error"]),
+        "completed_at": to_iso(utc_now()),
+    }
+
+    run_info_file = results_dir / "run-info.json"
+    with open(run_info_file, "w") as f:
+        json.dump(run_info, f, indent=2)
+
+    # Print summary and instructions
+    success_count = run_info["successful"]
+    fail_count = run_info["failed"]
+
+    click.echo(f"\n{'=' * 60}")
+    click.echo(f"VIBES MODE COMPLETE")
+    click.echo(f"{'=' * 60}")
+    click.echo(f"\nResults: {success_count} successful, {fail_count} failed")
+    click.echo(f"\nFiles saved to: {results_dir}/")
+    click.echo(f" - vibes-results.jsonl (agent outputs)")
+    click.echo(f" - evaluator-schema.yaml (evaluation criteria)")
+    click.echo(f" - run-info.json (run metadata)")
+
+    click.echo(f"\n{'=' * 60}")
+    click.echo(f"NEXT STEP: Ask your AI assistant to evaluate")
+    click.echo(f"{'=' * 60}")
+    click.echo(f"""
+Copy this prompt to Claude Code or your AI assistant:
+
+Please evaluate the experiment results in:
+{results_dir}/
+
+Read the vibes-results.jsonl file and evaluate each example
+using the evaluator schema in evaluator-schema.yaml.
+
+For each example, provide:
+1. extracted_classification
+2. exact_match (vs ground_truth)
+3. semantic_match
+4. reasoning_quality_score
+5. overall_score
+6. pass/fail
+
+Then provide summary metrics:
+- Exact match accuracy
+- Semantic match accuracy
+- Average overall score
+- Pass rate
+""")
+
+
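The vibes export is one JSON object per line with the keys assembled above (`id`, `input`, `ground_truth`, `category`, `agent_output`, `status`). A minimal sketch of consuming it outside the CLI (the run directory is illustrative):

```python
import json
from pathlib import Path

results_dir = Path("experiments/my-exp/results/20241203-143022")  # example run folder

with open(results_dir / "vibes-results.jsonl") as f:
    results = [json.loads(line) for line in f]

ok = [r for r in results if r["status"] == "success"]
print(f"{len(ok)}/{len(results)} examples succeeded")
```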
+# =============================================================================
+# RUN COMMAND
+# =============================================================================
+
+
+@experiments.command("run")
+@click.argument("name")
+@click.option("--base-path", help="Base directory for experiments (default: EXPERIMENTS_HOME or 'experiments')")
+@click.option("--version", help="Git tag version to load (e.g., 'experiments/my-exp/v1.0.0')")
+@click.option("--dry-run", is_flag=True, help="Test on small subset without saving")
+@click.option("--only-vibes", is_flag=True, help="Run agent locally, export results for AI evaluation (no Phoenix)")
+@click.option("--limit", "-n", type=int, help="Limit number of examples to evaluate (useful with --only-vibes)")
+@click.option("--update-prompts", is_flag=True, help="Update prompts in Phoenix before running")
+@click.option("--phoenix-url", help="Phoenix server URL (overrides PHOENIX_BASE_URL env var)")
+@click.option("--phoenix-api-key", help="Phoenix API key (overrides PHOENIX_API_KEY env var)")
+def run(
+    name: str,
+    base_path: Optional[str],
+    version: Optional[str],
+    dry_run: bool,
+    only_vibes: bool,
+    limit: Optional[int],
+    update_prompts: bool,
+    phoenix_url: Optional[str],
+    phoenix_api_key: Optional[str],
+):
+    """Run an experiment using the Phoenix provider or local vibes mode.
+
+    Loads configuration, executes agent and evaluator, saves results.
+
+    Vibes Mode (--only-vibes):
+        Run the agent locally without Phoenix infrastructure. Agent outputs are saved
+        to a JSONL file along with the evaluator schema. Your AI assistant (e.g.,
+        Claude Code) then acts as the judge to evaluate results.
+
+        This enables seamless switching between:
+        - Local evaluation: Quick iteration with AI-as-judge
+        - Phoenix evaluation: Production metrics and dashboards
+
+        Usage:
+            rem experiments run my-experiment --only-vibes
+            rem experiments run my-experiment --only-vibes --limit 5
+
+        The command will:
+        1. Run the agent on each ground-truth example
+        2. Save results to results/{timestamp}/vibes-results.jsonl
+        3. Print the evaluator prompt and schema
+        4. Instruct you to ask your AI assistant to evaluate
+
+        Example workflow with Claude Code:
+            $ rem experiments run mental-health-classifier --only-vibes --limit 3
+            # ... agent runs ...
+            # Results saved to: .experiments/mental-health-classifier/results/20241203-143022/
+
+            # Then ask Claude Code:
+            "Please evaluate the experiment results in
+            .experiments/mental-health-classifier/results/20241203-143022/
+            using the evaluator schema provided"
+
+    Phoenix Connection:
+        Commands respect PHOENIX_BASE_URL and PHOENIX_API_KEY environment variables.
+        Defaults to localhost:6006 for local development.
+
+        Production (on cluster):
+            export PHOENIX_BASE_URL=http://phoenix-svc.observability.svc.cluster.local:6006
+            export PHOENIX_API_KEY=<your-key>
+            kubectl exec -it deployment/rem-api -- rem experiments run my-experiment
+
+        Development (port-forward):
+            kubectl port-forward -n observability svc/phoenix-svc 6006:6006
+            export PHOENIX_API_KEY=<your-key>
+            rem experiments run my-experiment
+
+        Local (local Phoenix):
+            python -m phoenix.server.main serve
+            rem experiments run my-experiment
+
+    Examples:
+        # Run experiment with latest schemas
+        rem experiments run hello-world-validation
+
+        # Quick local evaluation (vibes mode)
+        rem experiments run hello-world-validation --only-vibes
+
+        # Vibes mode with limited examples
+        rem experiments run hello-world-validation --only-vibes --limit 5
+
+        # Run specific version
+        rem experiments run hello-world-validation \\
+            --version experiments/hello-world-validation/v1.0.0
+
+        # Dry run (test without saving)
+        rem experiments run cv-parser-production --dry-run
+
+        # Override Phoenix connection
+        rem experiments run my-experiment \\
+            --phoenix-url http://phoenix.example.com:6006 \\
+            --phoenix-api-key <key>
+    """
+    from rem.models.core.experiment import ExperimentConfig, ExperimentStatus
+    from rem.services.git import GitService
+    from rem.services.phoenix import PhoenixClient
+    from rem.agentic.providers.phoenix import create_evaluator_from_schema
+    from rem.utils.date_utils import utc_now, to_iso, format_timestamp_for_experiment
+    import os
+
+    try:
+        # Resolve base path
+        if base_path is None:
+            base_path = os.getenv("EXPERIMENTS_HOME", "experiments")
+
+        # Load experiment configuration
+        if version:
+            # Load from Git at specific version
+            git_svc = GitService()
+            config_yaml = git_svc.fs.read(
+                f"git://rem/.experiments/{name}/experiment.yaml?ref={version}"
+            )
+            config = ExperimentConfig(**config_yaml)
+            click.echo(f"✓ Loaded experiment from Git: {version}")
+        else:
+            # Load from local filesystem
+            config_path = Path(base_path) / name / "experiment.yaml"
+            if not config_path.exists():
+                click.echo(f"Experiment not found: {name}")
+                click.echo(f" Looked in: {config_path}")
+                raise click.Abort()
+            config = ExperimentConfig.from_yaml(config_path)
+            click.echo(f"✓ Loaded experiment: {name}")
+
+        # Display experiment info
+        click.echo(f"\nExperiment: {config.name}")
+        click.echo(f" Agent: {config.agent_schema_ref.name} (version: {config.agent_schema_ref.version or 'latest'})")
+        click.echo(f" Evaluator: {config.evaluator_schema_ref.name}")
+        click.echo(f" Status: {config.status.value}")
+        if dry_run:
+            click.echo(f" Mode: DRY RUN (no data will be saved)")
+        click.echo()
+
+        # Load agent schema using centralized schema loader
+        agent_name = config.agent_schema_ref.name
+        agent_version = config.agent_schema_ref.version
+
+        click.echo(f"Loading agent schema: {agent_name} (version: {agent_version or 'latest'})")
+
+        from rem.utils.schema_loader import load_agent_schema
+
+        try:
+            agent_schema = load_agent_schema(agent_name)
+            click.echo(f"✓ Loaded agent schema: {agent_name}")
+        except FileNotFoundError as e:
+            logger.error(f"Failed to load agent schema: {e}")
+            click.echo(f"Error: Could not load agent schema '{agent_name}'")
+            click.echo(f" {e}")
+            raise click.Abort()
+
+        # Create agent function from schema
+        from rem.agentic.providers.pydantic_ai import create_agent
+        from rem.agentic.context import AgentContext
+
+        # Create agent context
+        context = AgentContext(
+            user_id="experiment-runner",
+            tenant_id="experiments",
+            session_id=f"experiment-{config.name}",
+        )
+
+        agent_runtime = asyncio.run(create_agent(
+            context=context,
+            agent_schema_override=agent_schema
+        ))
+
+        def task_fn(example: dict[str, Any]) -> dict[str, Any]:
+            """Run agent on example."""
+            input_data = example.get("input", {})
+
+            # Extract query from input
+            query = input_data.get("query", "")
+            if not query:
+                # Try other common input keys
+                query = input_data.get("text", input_data.get("prompt", str(input_data)))
+
+            # Run agent
+            result = asyncio.run(agent_runtime.run(query))
+
+            # Serialize result (critical for Pydantic models!)
+            from rem.agentic.serialization import serialize_agent_result
+            serialized = serialize_agent_result(result)
+            # Ensure we return a dict (Phoenix expects dict output)
+            if isinstance(serialized, str):
+                return {"output": serialized}
+            return serialized if isinstance(serialized, dict) else {"output": str(serialized)}
+
+        # Load evaluator schema using centralized schema loader
+        evaluator_name = config.evaluator_schema_ref.name
+        evaluator_version = config.evaluator_schema_ref.version
+
+        click.echo(f"Loading evaluator: {evaluator_name} for agent {agent_name}")
+
+        # Find evaluator schema file path
+        from rem.utils.schema_loader import get_evaluator_schema_path
+
+        evaluator_schema_path = get_evaluator_schema_path(evaluator_name)
+        if not evaluator_schema_path or not evaluator_schema_path.exists():
+            click.echo(f"Error: Could not find evaluator schema '{evaluator_name}'")
+            raise click.Abort()
+
+        click.echo(f"✓ Found evaluator schema: {evaluator_schema_path}")
+
+        # For Phoenix mode, also load evaluator function
+        evaluator_fn = None
+        if not only_vibes:
+            # Try multiple evaluator path patterns (agent-specific, then generic)
+            evaluator_paths_to_try = [
+                f"{agent_name}/{evaluator_name}",  # e.g., hello-world/default
+                f"{agent_name}-{evaluator_name}",  # e.g., hello-world-default
+                evaluator_name,  # e.g., default (generic)
+            ]
+
+            evaluator_load_error = None
+
+            for evaluator_path in evaluator_paths_to_try:
+                try:
+                    evaluator_fn = create_evaluator_from_schema(
+                        evaluator_schema_path=evaluator_path,
+                        model_name=None,  # Use default from schema
+                    )
+                    click.echo(f"✓ Loaded evaluator function: {evaluator_path}")
+                    break
+                except FileNotFoundError as e:
+                    evaluator_load_error = e
+                    logger.debug(f"Evaluator not found at {evaluator_path}: {e}")
+                    continue
+                except Exception as e:
+                    evaluator_load_error = e
+                    logger.warning(f"Failed to load evaluator from {evaluator_path}: {e}")
+                    continue
+
+        if evaluator_fn is None and not only_vibes:
+            click.echo(f"Error: Could not load evaluator function '{evaluator_name}'")
+            click.echo(f" Tried paths: {evaluator_paths_to_try}")
+            if evaluator_load_error:
+                click.echo(f" Last error: {evaluator_load_error}")
+            raise click.Abort()
+
+        # Validate evaluator credentials before running expensive agent tasks
+        if evaluator_fn is not None and not only_vibes:
+            from rem.agentic.providers.phoenix import validate_evaluator_credentials
+
+            click.echo("Validating evaluator credentials...")
+            is_valid, error_msg = validate_evaluator_credentials()
+            if not is_valid:
+                click.echo(click.style(f"\n⚠️ Evaluator validation failed: {error_msg}", fg="yellow"))
+                click.echo("\nOptions:")
+                click.echo(" 1. Fix the credentials issue and re-run")
+                click.echo(" 2. Run with --only-vibes to skip LLM evaluation")
+                click.echo(" 3. Use --evaluator-model to specify a different model")
+                raise click.Abort()
+            click.echo("✓ Evaluator credentials validated")
+
+        # Load dataset using read_dataframe utility (auto-detects format from extension)
+        from rem.utils.files import read_dataframe
+
+        click.echo(f"Loading dataset: {list(config.datasets.keys())[0]}")
+        dataset_ref = list(config.datasets.values())[0]
+
+        try:
+            if dataset_ref.location.value == "git":
+                # Load from Git (local filesystem)
+                dataset_path = Path(base_path) / name / dataset_ref.path
+                if not dataset_path.exists():
+                    click.echo(f"Error: Dataset not found: {dataset_path}")
+                    raise click.Abort()
+
+                dataset_df = read_dataframe(dataset_path)
+
+            elif dataset_ref.location.value in ["s3", "hybrid"]:
+                # Load from S3 using FS provider
+                from rem.services.fs import FS
+
+                fs = FS()
+                content = fs.read(dataset_ref.path)
+                # Ensure we have bytes
+                if isinstance(content, str):
+                    content = content.encode()
+                dataset_df = read_dataframe(content, filename=dataset_ref.path)
+                click.echo(f"✓ Loaded dataset from S3")
+
+            else:
+                click.echo(f"Error: Unknown dataset location: {dataset_ref.location.value}")
+                raise click.Abort()
+
+        except ValueError as e:
+            # Unsupported format error from read_dataframe
+            click.echo(f"Error: {e}")
+            raise click.Abort()
+        except Exception as e:
+            logger.error(f"Failed to load dataset: {e}")
+            click.echo(f"Error: Could not load dataset")
+            click.echo(f" Path: {dataset_ref.path}")
+            raise click.Abort()
+
+        click.echo(f"✓ Loaded dataset: {len(dataset_df)} examples")
+
+        # Update prompts in Phoenix if requested
+        if update_prompts:
+            # TODO: Implement prompt updating
+            click.echo("⚠ --update-prompts not yet implemented")
+
+        # Vibes mode: run agent and export for AI evaluation
+        if only_vibes:
+            _run_vibes_mode(
+                config=config,
+                dataset_df=dataset_df,
+                task_fn=task_fn,
+                base_path=base_path,
+                limit=limit,
+                evaluator_schema_path=evaluator_schema_path,
+            )
+            return
+
+        # Run experiment via Phoenix
+        if not dry_run:
+            # Create Phoenix client with optional overrides
+            from rem.services.phoenix.config import PhoenixConfig
+            import os
+
+            phoenix_config = PhoenixConfig(
+                base_url=phoenix_url or os.getenv("PHOENIX_BASE_URL"),
+                api_key=phoenix_api_key or os.getenv("PHOENIX_API_KEY")
+            )
+
+            # Display Phoenix connection info
+            phoenix_display_url = phoenix_config.base_url
+            phoenix_has_key = "Yes" if phoenix_config.api_key else "No"
+            click.echo(f"\nPhoenix Connection:")
+            click.echo(f" URL: {phoenix_display_url}")
+            click.echo(f" API Key: {phoenix_has_key}")
+            click.echo()
+
+            client = PhoenixClient(config=phoenix_config)
+
+            experiment_name = f"{config.name}-{format_timestamp_for_experiment()}"
+
+            click.echo(f"\n⏳ Running experiment: {experiment_name}")
+            click.echo(f" This may take several minutes...")
+
+            experiment = client.run_experiment(
+                dataset=dataset_df,
+                task=task_fn,
+                evaluators=[evaluator_fn],
+                experiment_name=experiment_name,
+                experiment_description=config.description,
+                experiment_metadata={
+                    "agent": config.agent_schema_ref.name,
+                    "evaluator": config.evaluator_schema_ref.name,
+                    "experiment_config": config.name,
+                    **config.metadata
+                },
+                # Smart column detection for DataFrame -> Phoenix Dataset conversion
+                input_keys=["input"] if "input" in dataset_df.columns else None,
+                output_keys=["expected_output"] if "expected_output" in dataset_df.columns else None,
+            )
+
+            # Update experiment status
+            config.status = ExperimentStatus.COMPLETED
+            config.last_run_at = utc_now()
+            if not version:  # Only save if not loading from Git
+                config.save(base_path)
+
+            click.echo(f"\n✓ Experiment complete!")
+            if hasattr(experiment, "url"):
+                click.echo(f" View results: {experiment.url}")  # type: ignore[attr-defined]
+
+            # Save results according to config.results settings
+            if config.results.save_metrics_summary:
+                # Get experiment data
+                try:
+                    exp_data = client.get_experiment(experiment.id)  # type: ignore[attr-defined]
+
+                    # Build metrics summary
+                    metrics = {
+                        "experiment_id": experiment.id,  # type: ignore[attr-defined]
+                        "experiment_name": experiment_name,
+                        "agent": config.agent_schema_ref.name,
+                        "evaluator": config.evaluator_schema_ref.name,
+                        "dataset_size": len(dataset_df),
+                        "completed_at": to_iso(utc_now()),
+                        "phoenix_url": getattr(experiment, "url", None),
+                        "task_runs": len(exp_data.get("task_runs", [])),
+                    }
+
+                    # Save metrics
+                    if config.results.location.value == "git" or config.results.location.value == "hybrid":
+                        # Save to Git
+                        metrics_path = Path(base_path) / name / "results" / (config.results.metrics_file or "metrics.json")
+                        metrics_path.parent.mkdir(parents=True, exist_ok=True)
+
+                        import json
+                        with open(metrics_path, "w") as f:
+                            json.dump(metrics, f, indent=2)
+
+                        click.echo(f"\n✓ Saved metrics summary: {metrics_path}")
+
+                    if config.results.location.value == "s3" or config.results.location.value == "hybrid":
+                        # Save to S3
+                        from rem.services.fs import FS
+                        fs = FS()
+
+                        s3_metrics_path = config.results.base_path.rstrip("/") + "/" + (config.results.metrics_file or "metrics.json")
+
+                        import json
+                        fs.write(s3_metrics_path, json.dumps(metrics, indent=2))
+
+                        click.echo(f"✓ Saved metrics summary to S3: {s3_metrics_path}")
+
+                except Exception as e:
+                    logger.warning(f"Failed to save metrics: {e}")
+                    click.echo(f"⚠ Could not save metrics summary: {e}")
+        else:
+            click.echo("\n✓ Dry run complete (no data saved)")
+
+    except Exception as e:
+        logger.error(f"Failed to run experiment: {e}")
+        click.echo(f"Error: {e}", err=True)
+        raise click.Abort()
+
+
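The nested `task_fn` above also documents the contract the Phoenix runner expects from a task: one example dict in, one JSON-serializable dict out (never a bare string). A stand-in that satisfies the same contract, useful for exercising the plumbing without an LLM (`echo_task` is hypothetical, not part of the package):

```python
from typing import Any

def echo_task(example: dict[str, Any]) -> dict[str, Any]:
    # Same shape task_fn guarantees: dict input, dict output.
    query = example.get("input", {}).get("query", "")
    return {"output": f"echo: {query}"}
```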
1099
+ # =============================================================================
1100
+ # DATASET COMMANDS
1101
+ # =============================================================================
1102
+
1103
+
1104
+ @experiments.group()
1105
+ def dataset():
1106
+ """Dataset management commands."""
1107
+ pass
1108
+
1109
+
1110
+ @dataset.command("list")
1111
+ def dataset_list():
1112
+ """List all datasets.
1113
+
1114
+ Example:
1115
+ rem experiments dataset list
1116
+ """
1117
+ from rem.services.phoenix import PhoenixClient
1118
+
1119
+ try:
1120
+ client = PhoenixClient()
1121
+ datasets = client.list_datasets()
1122
+
1123
+ if not datasets:
1124
+ click.echo("No datasets found")
1125
+ return
1126
+
1127
+ click.echo(f"\nDatasets ({len(datasets)} total):\n")
1128
+ click.echo(f"{'Name':<40} {'Examples':>10} {'Created':<12}")
1129
+ click.echo("-" * 65)
1130
+
1131
+ for ds in datasets:
1132
+ name = ds.get("name", "")[:40]
1133
+ count = ds.get("example_count", 0)
1134
+ created = ds.get("created_at", "")[:10]
1135
+ click.echo(f"{name:<40} {count:>10} {created:<12}")
1136
+
1137
+ except Exception as e:
1138
+ logger.error(f"Failed to list datasets: {e}")
1139
+ click.echo(f"Error: {e}", err=True)
1140
+ raise click.Abort()
1141
+
1142
+
1143
+ @dataset.command("create")
1144
+ @click.argument("name")
1145
+ @click.option("--from-csv", type=click.Path(exists=True, path_type=Path), help="Create from CSV file")
1146
+ @click.option("--input-keys", help="Comma-separated input column names")
1147
+ @click.option("--output-keys", help="Comma-separated output column names (reference/ground truth)")
1148
+ @click.option("--metadata-keys", help="Comma-separated metadata column names (difficulty, type, etc.)")
1149
+ @click.option("--description", help="Dataset description")
1150
+ def dataset_create(
1151
+ name: str,
1152
+ from_csv: Optional[Path],
1153
+ input_keys: Optional[str],
1154
+ output_keys: Optional[str],
1155
+ metadata_keys: Optional[str],
1156
+ description: Optional[str],
1157
+ ):
1158
+ """Create a dataset (golden set).
1159
+
1160
+ Two modes:
1161
+ 1. From CSV: --from-csv golden.csv --input-keys query --output-keys expected
1162
+ 2. Manual (empty): Will create empty dataset to populate later
1163
+
1164
+ Examples:
1165
+ # From CSV (SME golden set)
1166
+ rem experiments dataset create rem-lookup-golden \\
1167
+ --from-csv golden-lookup.csv \\
1168
+ --input-keys query \\
1169
+ --output-keys expected_label,expected_type \\
1170
+ --metadata-keys difficulty,query_type
1171
+
1172
+ # Empty dataset (populate later)
1173
+ rem experiments dataset create rem-test --description "Test dataset"
1174
+ """
1175
+ from rem.services.phoenix import PhoenixClient
1176
+
1177
+ try:
1178
+ client = PhoenixClient()
1179
+
1180
+ if from_csv:
1181
+ # Create from CSV
1182
+ if not input_keys or not output_keys:
1183
+ click.echo("Error: --input-keys and --output-keys required for CSV", err=True)
1184
+ raise click.Abort()
1185
+
1186
+ dataset = client.create_dataset_from_csv(
1187
+ name=name,
1188
+ csv_file_path=from_csv,
1189
+ input_keys=input_keys.split(","),
1190
+ output_keys=output_keys.split(","),
1191
+ metadata_keys=metadata_keys.split(",") if metadata_keys else None,
1192
+ description=description,
1193
+ )
1194
+
1195
+ click.echo(f"✓ Created dataset '{dataset.name}' from CSV with {len(dataset)} examples")
1196
+
1197
+ else:
1198
+ # Create empty dataset
1199
+ dataset = client.create_dataset_from_data(
1200
+ name=name,
1201
+ inputs=[],
1202
+ outputs=[],
1203
+ description=description,
1204
+ )
1205
+
1206
+ click.echo(f"✓ Created empty dataset '{dataset.name}'")
1207
+ click.echo(" Use 'rem experiments dataset add' to add examples")
1208
+
1209
+ except Exception as e:
1210
+ logger.error(f"Failed to create dataset: {e}")
1211
+ click.echo(f"Error: {e}", err=True)
1212
+ raise click.Abort()
1213
+
1214
+
1215
+ @dataset.command("add")
1216
+ @click.argument("dataset_name")
1217
+ @click.option("--from-csv", type=click.Path(exists=True, path_type=Path), required=True,
1218
+ help="CSV file with examples")
1219
+ @click.option("--input-keys", required=True, help="Comma-separated input column names")
1220
+ @click.option("--output-keys", required=True, help="Comma-separated output column names")
1221
+ @click.option("--metadata-keys", help="Comma-separated metadata column names")
1222
+ def dataset_add(
1223
+ dataset_name: str,
1224
+ from_csv: Path,
1225
+ input_keys: str,
1226
+ output_keys: str,
1227
+ metadata_keys: Optional[str],
1228
+ ):
1229
+ """Add examples to an existing dataset.
1230
+
1231
+ Example:
1232
+ rem experiments dataset add rem-lookup-golden \\
1233
+ --from-csv new-examples.csv \\
1234
+ --input-keys query \\
1235
+ --output-keys expected_label,expected_type
1236
+ """
1237
+ from rem.services.phoenix import PhoenixClient
1238
+ import polars as pl
1239
+
1240
+ try:
1241
+ client = PhoenixClient()
1242
+
1243
+ # Load CSV with Polars
1244
+ df = pl.read_csv(from_csv)
1245
+ records = df.to_dicts()
1246
+
1247
+ # Extract data
1248
+ input_cols = input_keys.split(",")
1249
+ output_cols = output_keys.split(",")
1250
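+         # Shape of the per-row dicts built below, using the docstring's example
+         # columns (values hypothetical):
+         #   inputs   -> [{"query": ...}, ...]
+         #   outputs  -> [{"expected_label": ..., "expected_type": ...}, ...]
+         #   metadata -> [{"difficulty": ..., "query_type": ...}, ...] or None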
+         inputs = [{k: row.get(k) for k in input_cols} for row in records]
+         outputs = [{k: row.get(k) for k in output_cols} for row in records]
+         metadata = None
+         if metadata_keys:
+             meta_cols = metadata_keys.split(",")
+             metadata = [{k: row.get(k) for k in meta_cols} for row in records]
+ 
+         # Add to dataset
+         dataset = client.add_examples_to_dataset(
+             dataset=dataset_name,
+             inputs=inputs,
+             outputs=outputs,
+             metadata=metadata,
+         )
+ 
+         click.echo(f"✓ Added {len(inputs)} examples to dataset '{dataset.name}'")
+         click.echo(f"  Total examples: {len(dataset)}")
+ 
+     except Exception as e:
+         logger.error(f"Failed to add examples: {e}")
+         click.echo(f"Error: {e}", err=True)
+         raise click.Abort()
+ 
+ 
+ # =============================================================================
+ # PROMPT COMMANDS
+ # =============================================================================
+ 
+ 
+ @experiments.group()
+ def prompt():
+     """Prompt management commands."""
+     pass
+ 
+ 
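+ # Prompts created here are tagged with a "REM" label plus a type label
+ # ("Agent" or "Evaluator"), presumably so they can be filtered in the
+ # Phoenix UI; see the label assignment in prompt_create below.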
+ @prompt.command("create")
1286
+ @click.argument("name")
1287
+ @click.option("--system-prompt", "-s", required=True, help="System prompt text")
1288
+ @click.option("--description", "-d", help="Prompt description")
1289
+ @click.option("--model-provider", default="OPENAI", help="Model provider (OPENAI, ANTHROPIC)")
1290
+ @click.option("--model-name", "-m", help="Model name (e.g., gpt-4.1, claude-sonnet-4-5)")
1291
+ @click.option("--type", "-t", "prompt_type", default="Agent", help="Prompt type (Agent or Evaluator)")
1292
+ def prompt_create(
1293
+ name: str,
1294
+ system_prompt: str,
1295
+ description: Optional[str],
1296
+ model_provider: str,
1297
+ model_name: Optional[str],
1298
+ prompt_type: str,
1299
+ ):
1300
+ """Create a prompt.
1301
+
1302
+ Examples:
1303
+ # Create agent prompt
1304
+ rem experiments prompt create hello-world \\
1305
+ --system-prompt "You are a helpful assistant." \\
1306
+ --model-name gpt-4.1
1307
+
1308
+ # Create evaluator prompt
1309
+ rem experiments prompt create correctness-evaluator \\
1310
+ --system-prompt "Evaluate the correctness of responses." \\
1311
+ --type Evaluator \\
1312
+ --model-provider ANTHROPIC \\
1313
+ --model-name claude-sonnet-4-5
1314
+ """
1315
+ from rem.services.phoenix import PhoenixClient
1316
+ from rem.services.phoenix.prompt_labels import PhoenixPromptLabels
1317
+ from phoenix.client import Client
1318
+ from phoenix.client.types.prompts import PromptVersion
1319
+ from phoenix.client.__generated__ import v1
1320
+
1321
+ try:
1322
+ # Set default model if not specified
1323
+ if not model_name:
1324
+ model_name = "gpt-4.1" if model_provider == "OPENAI" else "claude-sonnet-4-5-20250929"
1325
+
1326
+ # Get config
1327
+ phoenix_client = PhoenixClient()
1328
+ config = phoenix_client.config
1329
+
1330
+ # Create client
1331
+ client = Client(
1332
+ base_url=config.base_url,
1333
+ api_key=config.api_key
1334
+ )
1335
+
1336
+ # Create prompt messages
1337
+ messages = [
1338
+ v1.PromptMessage(
1339
+ role="system",
1340
+ content=system_prompt
1341
+ )
1342
+ ]
1343
+
1344
+ # Create PromptVersion
1345
+ version = PromptVersion(
1346
+ messages,
1347
+ model_name=model_name,
1348
+ description="v1.0",
1349
+ model_provider=model_provider # type: ignore[arg-type]
1350
+ )
1351
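+         # model_provider arrives as a plain CLI string while the client's type
+         # hints expect a Literal, hence the ignore above; note the version
+         # description is hardcoded to "v1.0".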
+ 
+         # Create the prompt
+         result = client.prompts.create(
+             name=name,
+             version=version,
+             prompt_description=description or f"{prompt_type} prompt: {name}",
+         )
+ 
+         click.echo(f"✓ Created prompt '{name}' (ID: {result.id})")
+ 
+         # Try to get the prompt ID for label assignment
+         try:
+             import httpx
+ 
+             query = """
+             query {
+                 prompts(first: 1, filterBy: {name: {equals: "%s"}}) {
+                     edges {
+                         node {
+                             id
+                             name
+                         }
+                     }
+                 }
+             }
+             """ % name
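+             # NOTE: plain %-interpolation assumes the prompt name contains no
+             # double quotes or backslashes; building the string literal with
+             # json.dumps(name), or using GraphQL variables, would be safer.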
+ 
+             response = httpx.post(
+                 f"{config.base_url}/graphql",
+                 json={"query": query},
+                 headers={"authorization": f"Bearer {config.api_key}"},
+                 timeout=10,
+             )
+             graphql_result = response.json()
+             prompts = graphql_result.get("data", {}).get("prompts", {}).get("edges", [])
+ 
+             if prompts:
+                 prompt_id = prompts[0]["node"]["id"]
+ 
+                 # Assign labels
+                 if not config.base_url:
+                     raise ValueError("Phoenix base_url is required")
+                 labels_helper = PhoenixPromptLabels(
+                     base_url=config.base_url, api_key=config.api_key
+                 )
+ 
+                 # Assign REM + type label
+                 label_names = ["REM", prompt_type]
+                 labels_helper.assign_prompt_labels(prompt_id, label_names)
+                 click.echo(f"✓ Assigned labels: {', '.join(label_names)}")
+         except Exception as e:
+             click.echo(f"⚠ Warning: Could not assign labels: {e}")
+ 
+         click.echo(f"\nView in UI: {config.base_url}")
+ 
+     except Exception as e:
+         logger.error(f"Failed to create prompt: {e}")
+         click.echo(f"Error: {e}", err=True)
+         raise click.Abort()
+ 
+ 
+ @prompt.command("list")
1412
+ def prompt_list():
1413
+ """List all prompts.
1414
+
1415
+ Example:
1416
+ rem experiments prompt list
1417
+ """
1418
+ import httpx
1419
+ from rem.services.phoenix import PhoenixClient
1420
+
1421
+ try:
1422
+ phoenix_client = PhoenixClient()
1423
+ config = phoenix_client.config
1424
+
1425
+ query = """
1426
+ query {
1427
+ prompts(first: 100) {
1428
+ edges {
1429
+ node {
1430
+ id
1431
+ name
1432
+ description
1433
+ createdAt
1434
+ }
1435
+ }
1436
+ }
1437
+ }
1438
+ """
1439
+
1440
+ response = httpx.post(
1441
+ f"{config.base_url}/graphql",
1442
+ json={"query": query},
1443
+ headers={"authorization": f"Bearer {config.api_key}"},
1444
+ timeout=10,
1445
+ )
1446
+
1447
+ result = response.json()
1448
+ prompts = result.get("data", {}).get("prompts", {}).get("edges", [])
1449
+
1450
+ if not prompts:
1451
+ click.echo("No prompts found")
1452
+ return
1453
+
1454
+ click.echo(f"\nPrompts ({len(prompts)} total):\n")
1455
+ click.echo(f"{'Name':<40} {'Created':<20}")
1456
+ click.echo("-" * 65)
1457
+
1458
+ for edge in prompts:
1459
+ node = edge["node"]
1460
+ name = node.get("name", "")[:40]
1461
+ created = node.get("createdAt", "")[:19]
1462
+ click.echo(f"{name:<40} {created:<20}")
1463
+
1464
+ except Exception as e:
1465
+ logger.error(f"Failed to list prompts: {e}")
1466
+ click.echo(f"Error: {e}", err=True)
1467
+ raise click.Abort()
1468
+
1469
+
1470
+ # =============================================================================
+ # TRACE COMMANDS
+ # =============================================================================
+ 
+ 
+ @experiments.group()
+ def trace():
+     """Trace retrieval commands."""
+     pass
+ 
+ 
+ @trace.command("list")
1482
+ @click.option("--project", "-p", help="Filter by project name")
1483
+ @click.option("--days", "-d", default=7, help="Number of days to look back")
1484
+ @click.option("--limit", "-l", default=20, help="Maximum traces to return")
1485
+ def trace_list(
1486
+ project: Optional[str],
1487
+ days: int,
1488
+ limit: int,
1489
+ ):
1490
+ """List recent traces.
1491
+
1492
+ Example:
1493
+ rem experiments trace list --project rem-agents --days 7 --limit 50
1494
+ """
1495
+ from rem.services.phoenix import PhoenixClient
1496
+ from rem.utils.date_utils import days_ago
1497
+
1498
+ try:
1499
+ client = PhoenixClient()
1500
+
1501
+ start_time = days_ago(days)
1502
+
1503
+ traces_df = client.get_traces(
1504
+ project_name=project,
1505
+ start_time=start_time,
1506
+ limit=limit,
1507
+ )
1508
+
1509
+ if len(traces_df) == 0:
1510
+ click.echo("No traces found")
1511
+ return
1512
+
1513
+ click.echo(f"\nRecent Traces ({len(traces_df)} results):\n")
1514
+ click.echo(f"{'Span ID':<15} {'Name':<30} {'Start Time':<20}")
1515
+ click.echo("-" * 70)
1516
+
1517
+ for _, row in traces_df.head(limit).iterrows():
1518
+ span_id = str(row.get("context.span_id", ""))[:12]
1519
+ name = str(row.get("name", ""))[:30]
1520
+ start = str(row.get("start_time", ""))[:19]
1521
+ click.echo(f"{span_id:<15} {name:<30} {start:<20}")
1522
+
1523
+ except Exception as e:
1524
+ logger.error(f"Failed to list traces: {e}")
1525
+ click.echo(f"Error: {e}", err=True)
1526
+ raise click.Abort()
1527
+
1528
+
1529
+ # =============================================================================
+ # EXPORT COMMAND
+ # =============================================================================
+ 
+ 
+ @experiments.command("export")
1535
+ @click.argument("name")
1536
+ @click.option("--base-path", help="Base directory for experiments (default: EXPERIMENTS_HOME or 'experiments')")
1537
+ @click.option("--bucket", "-b", help="S3 bucket name (default: DATA_LAKE__BUCKET_NAME)")
1538
+ @click.option("--version", "-v", default="v0", help="Data lake version prefix (default: v0)")
1539
+ @click.option("--plan", is_flag=True, help="Show what would be exported without uploading")
1540
+ @click.option("--include-results", is_flag=True, help="Include results directory in export")
1541
+ def export(
1542
+ name: str,
1543
+ base_path: Optional[str],
1544
+ bucket: Optional[str],
1545
+ version: str,
1546
+ plan: bool,
1547
+ include_results: bool,
1548
+ ):
1549
+ """Export experiment to S3 data lake.
1550
+
1551
+ Exports experiment configuration, ground truth, and optionally results
1552
+ to the S3 data lake following the convention:
1553
+
1554
+ s3://{bucket}/{version}/datasets/calibration/experiments/{agent}/{task}/
1555
+
1556
+ The export includes:
1557
+ - experiment.yaml (configuration)
1558
+ - README.md (documentation)
1559
+ - ground-truth/ (evaluation datasets)
1560
+ - seed-data/ (optional seed data)
1561
+ - results/ (optional, with --include-results)
1562
+
1563
+ Examples:
1564
+ # Preview what would be exported
1565
+ rem experiments export my-experiment --plan
1566
+
1567
+ # Export to configured data lake bucket
1568
+ rem experiments export my-experiment
1569
+
1570
+ # Export to specific bucket
1571
+ rem experiments export my-experiment --bucket siggy-data
1572
+
1573
+ # Include results in export
1574
+ rem experiments export my-experiment --include-results
1575
+
1576
+ # Export with custom version prefix
1577
+ rem experiments export my-experiment --version v1
1578
+ """
1579
+ from rem.models.core.experiment import ExperimentConfig
1580
+ from rem.settings import settings
1581
+ from rem.services.fs.s3_provider import S3Provider
1582
+ import os
1583
+ import json
1584
+
1585
+ try:
1586
+ # Resolve base path
1587
+ if base_path is None:
1588
+ base_path = os.getenv("EXPERIMENTS_HOME", "experiments")
1589
+
1590
+ # Load experiment configuration
1591
+ config_path = Path(base_path) / name / "experiment.yaml"
1592
+ if not config_path.exists():
1593
+ click.echo(f"Experiment not found: {name}")
1594
+ click.echo(f" Looked in: {config_path}")
1595
+ raise click.Abort()
1596
+
1597
+ config = ExperimentConfig.from_yaml(config_path)
1598
+ click.echo(f"✓ Loaded experiment: {name}")
1599
+
1600
+ # Resolve bucket
1601
+ if bucket is None:
1602
+ bucket = settings.data_lake.bucket_name
1603
+ if bucket is None:
1604
+ click.echo("Error: No S3 bucket configured.")
1605
+ click.echo(" Set DATA_LAKE__BUCKET_NAME environment variable or use --bucket option")
1606
+ raise click.Abort()
1607
+
1608
+ # Build S3 paths
1609
+ s3_base = config.get_s3_export_path(bucket, version)
1610
+ exp_dir = config.get_experiment_dir(base_path)
1611
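+         # For example (names hypothetical, following the docstring convention):
+         #   s3_base -> s3://siggy-data/v0/datasets/calibration/experiments/my-agent/my-task
+         #   exp_dir -> experiments/my-experiment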
+ 
+         # Collect files to export as (s3_key, local_path) pairs;
+         # s3_key is relative to s3_base
+         files_to_export = []
+ 
+         # Always include these files
+         required_files = [
+             ("experiment.yaml", exp_dir / "experiment.yaml"),
+             ("README.md", exp_dir / "README.md"),
+         ]
+ 
+         for s3_name, local_path in required_files:
+             if local_path.exists():
+                 files_to_export.append((s3_name, local_path))
+ 
+         # Include ground-truth directory
+         ground_truth_dir = exp_dir / "ground-truth"
+         if ground_truth_dir.exists():
+             for f in ground_truth_dir.rglob("*"):
+                 if f.is_file():
+                     relative = f.relative_to(exp_dir)
+                     files_to_export.append((str(relative), f))
+ 
+         # Include seed-data directory
+         seed_data_dir = exp_dir / "seed-data"
+         if seed_data_dir.exists():
+             for f in seed_data_dir.rglob("*"):
+                 if f.is_file():
+                     relative = f.relative_to(exp_dir)
+                     files_to_export.append((str(relative), f))
+ 
+         # Optionally include results
+         if include_results:
+             results_dir = exp_dir / "results"
+             if results_dir.exists():
+                 for f in results_dir.rglob("*"):
+                     if f.is_file():
+                         relative = f.relative_to(exp_dir)
+                         files_to_export.append((str(relative), f))
+ 
+         # Display export plan
+         click.echo(f"\n{'=' * 60}")
+         click.echo(f"EXPORT {'PLAN' if plan else 'TO S3'}")
+         click.echo(f"{'=' * 60}")
+         click.echo(f"\nExperiment: {config.name}")
+         click.echo(f"Agent: {config.agent_schema_ref.name}")
+         click.echo(f"Task: {config.task}")
+         click.echo(f"Evaluator file: {config.get_evaluator_filename()}")
+         click.echo(f"\nDestination: {s3_base}/")
+         click.echo(f"\nFiles to export ({len(files_to_export)}):")
+ 
+         for s3_name, local_path in files_to_export:
+             s3_uri = f"{s3_base}/{s3_name}"
+             if plan:
+                 click.echo(f"  {local_path}")
+                 click.echo(f"    → {s3_uri}")
+             else:
+                 click.echo(f"  {s3_name}")
+ 
+         if plan:
+             click.echo("\n[PLAN MODE] No files were uploaded.")
+             click.echo("Run without --plan to execute the export.")
+             return
+ 
+         # Execute export
+         click.echo("\n⏳ Uploading to S3...")
+         s3 = S3Provider()
+ 
+         uploaded = 0
+         for s3_name, local_path in files_to_export:
+             s3_uri = f"{s3_base}/{s3_name}"
+             try:
+                 s3.copy(str(local_path), s3_uri)
+                 uploaded += 1
+                 click.echo(f"  ✓ {s3_name}")
+             except Exception as e:
+                 click.echo(f"  ✗ {s3_name}: {e}")
+ 
+         click.echo(f"\n✓ Exported {uploaded}/{len(files_to_export)} files to {s3_base}/")
+ 
+         # Show next steps
+         click.echo("\nNext steps:")
+         click.echo(f"  - View in S3: aws s3 ls {s3_base}/ --recursive")
+         click.echo(f"  - Download: aws s3 sync {s3_base}/ ./{config.agent_schema_ref.name}/{config.task}/")
+ 
+     except Exception as e:
+         logger.error(f"Failed to export experiment: {e}")
+         click.echo(f"Error: {e}", err=True)
+         raise click.Abort()