remdb 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. rem/__init__.py +2 -0
  2. rem/agentic/README.md +650 -0
  3. rem/agentic/__init__.py +39 -0
  4. rem/agentic/agents/README.md +155 -0
  5. rem/agentic/agents/__init__.py +8 -0
  6. rem/agentic/context.py +148 -0
  7. rem/agentic/context_builder.py +329 -0
  8. rem/agentic/mcp/__init__.py +0 -0
  9. rem/agentic/mcp/tool_wrapper.py +107 -0
  10. rem/agentic/otel/__init__.py +5 -0
  11. rem/agentic/otel/setup.py +151 -0
  12. rem/agentic/providers/phoenix.py +674 -0
  13. rem/agentic/providers/pydantic_ai.py +572 -0
  14. rem/agentic/query.py +117 -0
  15. rem/agentic/query_helper.py +89 -0
  16. rem/agentic/schema.py +396 -0
  17. rem/agentic/serialization.py +245 -0
  18. rem/agentic/tools/__init__.py +5 -0
  19. rem/agentic/tools/rem_tools.py +231 -0
  20. rem/api/README.md +420 -0
  21. rem/api/main.py +324 -0
  22. rem/api/mcp_router/prompts.py +182 -0
  23. rem/api/mcp_router/resources.py +536 -0
  24. rem/api/mcp_router/server.py +213 -0
  25. rem/api/mcp_router/tools.py +584 -0
  26. rem/api/routers/auth.py +229 -0
  27. rem/api/routers/chat/__init__.py +5 -0
  28. rem/api/routers/chat/completions.py +281 -0
  29. rem/api/routers/chat/json_utils.py +76 -0
  30. rem/api/routers/chat/models.py +124 -0
  31. rem/api/routers/chat/streaming.py +185 -0
  32. rem/auth/README.md +258 -0
  33. rem/auth/__init__.py +26 -0
  34. rem/auth/middleware.py +100 -0
  35. rem/auth/providers/__init__.py +13 -0
  36. rem/auth/providers/base.py +376 -0
  37. rem/auth/providers/google.py +163 -0
  38. rem/auth/providers/microsoft.py +237 -0
  39. rem/cli/README.md +455 -0
  40. rem/cli/__init__.py +8 -0
  41. rem/cli/commands/README.md +126 -0
  42. rem/cli/commands/__init__.py +3 -0
  43. rem/cli/commands/ask.py +566 -0
  44. rem/cli/commands/configure.py +497 -0
  45. rem/cli/commands/db.py +493 -0
  46. rem/cli/commands/dreaming.py +324 -0
  47. rem/cli/commands/experiments.py +1302 -0
  48. rem/cli/commands/mcp.py +66 -0
  49. rem/cli/commands/process.py +245 -0
  50. rem/cli/commands/schema.py +183 -0
  51. rem/cli/commands/serve.py +106 -0
  52. rem/cli/dreaming.py +363 -0
  53. rem/cli/main.py +96 -0
  54. rem/config.py +237 -0
  55. rem/mcp_server.py +41 -0
  56. rem/models/core/__init__.py +49 -0
  57. rem/models/core/core_model.py +64 -0
  58. rem/models/core/engram.py +333 -0
  59. rem/models/core/experiment.py +628 -0
  60. rem/models/core/inline_edge.py +132 -0
  61. rem/models/core/rem_query.py +243 -0
  62. rem/models/entities/__init__.py +43 -0
  63. rem/models/entities/file.py +57 -0
  64. rem/models/entities/image_resource.py +88 -0
  65. rem/models/entities/message.py +35 -0
  66. rem/models/entities/moment.py +123 -0
  67. rem/models/entities/ontology.py +191 -0
  68. rem/models/entities/ontology_config.py +131 -0
  69. rem/models/entities/resource.py +95 -0
  70. rem/models/entities/schema.py +87 -0
  71. rem/models/entities/user.py +85 -0
  72. rem/py.typed +0 -0
  73. rem/schemas/README.md +507 -0
  74. rem/schemas/__init__.py +6 -0
  75. rem/schemas/agents/README.md +92 -0
  76. rem/schemas/agents/core/moment-builder.yaml +178 -0
  77. rem/schemas/agents/core/rem-query-agent.yaml +226 -0
  78. rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
  79. rem/schemas/agents/core/simple-assistant.yaml +19 -0
  80. rem/schemas/agents/core/user-profile-builder.yaml +163 -0
  81. rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
  82. rem/schemas/agents/examples/contract-extractor.yaml +134 -0
  83. rem/schemas/agents/examples/cv-parser.yaml +263 -0
  84. rem/schemas/agents/examples/hello-world.yaml +37 -0
  85. rem/schemas/agents/examples/query.yaml +54 -0
  86. rem/schemas/agents/examples/simple.yaml +21 -0
  87. rem/schemas/agents/examples/test.yaml +29 -0
  88. rem/schemas/agents/rem.yaml +128 -0
  89. rem/schemas/evaluators/hello-world/default.yaml +77 -0
  90. rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
  91. rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
  92. rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
  93. rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
  94. rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
  95. rem/services/__init__.py +16 -0
  96. rem/services/audio/INTEGRATION.md +308 -0
  97. rem/services/audio/README.md +376 -0
  98. rem/services/audio/__init__.py +15 -0
  99. rem/services/audio/chunker.py +354 -0
  100. rem/services/audio/transcriber.py +259 -0
  101. rem/services/content/README.md +1269 -0
  102. rem/services/content/__init__.py +5 -0
  103. rem/services/content/providers.py +801 -0
  104. rem/services/content/service.py +676 -0
  105. rem/services/dreaming/README.md +230 -0
  106. rem/services/dreaming/__init__.py +53 -0
  107. rem/services/dreaming/affinity_service.py +336 -0
  108. rem/services/dreaming/moment_service.py +264 -0
  109. rem/services/dreaming/ontology_service.py +54 -0
  110. rem/services/dreaming/user_model_service.py +297 -0
  111. rem/services/dreaming/utils.py +39 -0
  112. rem/services/embeddings/__init__.py +11 -0
  113. rem/services/embeddings/api.py +120 -0
  114. rem/services/embeddings/worker.py +421 -0
  115. rem/services/fs/README.md +662 -0
  116. rem/services/fs/__init__.py +62 -0
  117. rem/services/fs/examples.py +206 -0
  118. rem/services/fs/examples_paths.py +204 -0
  119. rem/services/fs/git_provider.py +935 -0
  120. rem/services/fs/local_provider.py +760 -0
  121. rem/services/fs/parsing-hooks-examples.md +172 -0
  122. rem/services/fs/paths.py +276 -0
  123. rem/services/fs/provider.py +460 -0
  124. rem/services/fs/s3_provider.py +1042 -0
  125. rem/services/fs/service.py +186 -0
  126. rem/services/git/README.md +1075 -0
  127. rem/services/git/__init__.py +17 -0
  128. rem/services/git/service.py +469 -0
  129. rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
  130. rem/services/phoenix/README.md +453 -0
  131. rem/services/phoenix/__init__.py +46 -0
  132. rem/services/phoenix/client.py +686 -0
  133. rem/services/phoenix/config.py +88 -0
  134. rem/services/phoenix/prompt_labels.py +477 -0
  135. rem/services/postgres/README.md +575 -0
  136. rem/services/postgres/__init__.py +23 -0
  137. rem/services/postgres/migration_service.py +427 -0
  138. rem/services/postgres/pydantic_to_sqlalchemy.py +232 -0
  139. rem/services/postgres/register_type.py +352 -0
  140. rem/services/postgres/repository.py +337 -0
  141. rem/services/postgres/schema_generator.py +379 -0
  142. rem/services/postgres/service.py +802 -0
  143. rem/services/postgres/sql_builder.py +354 -0
  144. rem/services/rem/README.md +304 -0
  145. rem/services/rem/__init__.py +23 -0
  146. rem/services/rem/exceptions.py +71 -0
  147. rem/services/rem/executor.py +293 -0
  148. rem/services/rem/parser.py +145 -0
  149. rem/services/rem/queries.py +196 -0
  150. rem/services/rem/query.py +371 -0
  151. rem/services/rem/service.py +527 -0
  152. rem/services/session/README.md +374 -0
  153. rem/services/session/__init__.py +6 -0
  154. rem/services/session/compression.py +360 -0
  155. rem/services/session/reload.py +77 -0
  156. rem/settings.py +1235 -0
  157. rem/sql/002_install_models.sql +1068 -0
  158. rem/sql/background_indexes.sql +42 -0
  159. rem/sql/install_models.sql +1038 -0
  160. rem/sql/migrations/001_install.sql +503 -0
  161. rem/sql/migrations/002_install_models.sql +1202 -0
  162. rem/utils/AGENTIC_CHUNKING.md +597 -0
  163. rem/utils/README.md +583 -0
  164. rem/utils/__init__.py +43 -0
  165. rem/utils/agentic_chunking.py +622 -0
  166. rem/utils/batch_ops.py +343 -0
  167. rem/utils/chunking.py +108 -0
  168. rem/utils/clip_embeddings.py +276 -0
  169. rem/utils/dict_utils.py +98 -0
  170. rem/utils/embeddings.py +423 -0
  171. rem/utils/examples/embeddings_example.py +305 -0
  172. rem/utils/examples/sql_types_example.py +202 -0
  173. rem/utils/markdown.py +16 -0
  174. rem/utils/model_helpers.py +236 -0
  175. rem/utils/schema_loader.py +336 -0
  176. rem/utils/sql_types.py +348 -0
  177. rem/utils/user_id.py +81 -0
  178. rem/utils/vision.py +330 -0
  179. rem/workers/README.md +506 -0
  180. rem/workers/__init__.py +5 -0
  181. rem/workers/dreaming.py +502 -0
  182. rem/workers/engram_processor.py +312 -0
  183. rem/workers/sqs_file_processor.py +193 -0
  184. remdb-0.3.7.dist-info/METADATA +1473 -0
  185. remdb-0.3.7.dist-info/RECORD +187 -0
  186. remdb-0.3.7.dist-info/WHEEL +4 -0
  187. remdb-0.3.7.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,1302 @@
1
+ """
2
+ Experiment management CLI commands.
3
+
4
+ Experiments use ExperimentConfig (rem/models/core/experiment.py) for configuration
5
+ and support Git+S3 hybrid storage. Includes dataset, prompt, and trace management.
6
+
7
+ Directory Structure:
8
+ experiments/{experiment-name}/
9
+ ├── experiment.yaml # ExperimentConfig (metadata, agent ref, evaluator ref)
10
+ ├── README.md # Auto-generated documentation
11
+ ├── ground-truth/ # Evaluation datasets (Q&A pairs)
12
+ │ ├── dataset.csv # Input/output pairs for evaluation
13
+ │ └── dataset.yaml # Alternative YAML format
14
+ ├── seed-data/ # Data to seed REM before running experiments
15
+ │ └── data.yaml # Users, resources, moments in REM format
16
+ └── results/ # Experiment results and metrics
17
+ └── {run-timestamp}/ # Each run gets its own timestamped folder
18
+ ├── metrics.json # Summary metrics
19
+ └── run_info.json # Run metadata (eval framework URLs, etc)
20
+
21
+ Environment Variables:
22
+ EXPERIMENTS_HOME: Override default experiment directory (default: "experiments")
23
+
24
+ Commands:
25
+ # Experiment lifecycle
26
+ rem experiments create <name> --agent <agent> --evaluator <evaluator>
27
+ rem experiments list
28
+ rem experiments show <name>
29
+ rem experiments run <name> [--version <tag>]
30
+
31
+ # Dataset management
32
+ rem experiments dataset list
33
+ rem experiments dataset create <name> --from-csv data.csv
34
+ rem experiments dataset add <name> --from-csv data.csv
35
+
36
+ # Prompt management
37
+ rem experiments prompt list
38
+ rem experiments prompt create <name> --system-prompt "..."
39
+
40
+ # Trace retrieval
41
+ rem experiments trace list --project <name>
42
+ """
43
+
44
+ import asyncio
45
+ from pathlib import Path
46
+ from typing import Any, Optional, cast
47
+
48
+ import click
49
+ from loguru import logger
50
+
51
+
52
@click.group()
def experiments():
    """Experiment configuration and execution commands.

    Root command group for the experiment workflow; subcommands
    (create/list/show/run, etc.) register themselves on this group.
    """
56
+
57
+
58
+ # =============================================================================
59
+ # CREATE COMMAND
60
+ # =============================================================================
61
+
62
+
63
@experiments.command("create")
@click.argument("name")
@click.option("--agent", "-a", required=True, help="Agent schema name (e.g., 'cv-parser')")
@click.option("--evaluator", "-e", default="default", help="Evaluator schema name (default: 'default')")
@click.option("--description", "-d", help="Experiment description")
@click.option("--dataset-location", type=click.Choice(["git", "s3", "hybrid"]), default="git",
              help="Where to store datasets")
@click.option("--results-location", type=click.Choice(["git", "s3", "hybrid"]), default="git",
              help="Where to store results")
@click.option("--tags", help="Comma-separated tags (e.g., 'production,cv-parser')")
@click.option("--base-path", help="Base directory for experiments (default: EXPERIMENTS_HOME or 'experiments')")
def create(
    name: str,
    agent: str,
    evaluator: str,
    description: Optional[str],
    dataset_location: str,
    results_location: str,
    tags: Optional[str],
    base_path: Optional[str],
):
    """Create a new experiment configuration.

    Creates directory structure and generates experiment.yaml and README.md.

    The experiment directory will contain:
    - ground-truth/: Q&A pairs for evaluation
    - seed-data/: REM data (users, resources, moments) to load before running
    - results/: Timestamped run results

    Examples:
        # Small experiment (Git-only)
        rem experiments create hello-world-validation \\
            --agent hello-world \\
            --evaluator default \\
            --description "Smoke test for hello-world agent"

        # Large experiment (Hybrid storage)
        rem experiments create cv-parser-production \\
            --agent cv-parser \\
            --evaluator default \\
            --description "Production CV parser evaluation" \\
            --dataset-location s3 \\
            --results-location hybrid \\
            --tags "production,cv-parser,weekly"

        # Custom location
        EXPERIMENTS_HOME=/path/to/experiments rem experiments create my-test --agent my-agent
    """
    # Deferred import keeps CLI startup light; these models come from
    # rem/models/core/experiment.py (see module docstring).
    from rem.models.core.experiment import (
        ExperimentConfig,
        DatasetLocation,
        DatasetReference,
        SchemaReference,
        ResultsConfig,
        ExperimentStatus,
    )
    import os

    try:
        # Resolve base path: CLI arg > EXPERIMENTS_HOME env var > default "experiments"
        if base_path is None:
            base_path = os.getenv("EXPERIMENTS_HOME", "experiments")
        # Build dataset reference.
        # "git": small CSV tracked in the repo next to the config.
        # "s3"/"hybrid": parquet in the experiments bucket; hybrid also keeps
        # a schema file in Git so the dataset shape stays reviewable.
        if dataset_location == "git":
            dataset_ref = DatasetReference(
                location=DatasetLocation.GIT,
                path="ground-truth/dataset.csv",
                format="csv",
                description="Ground truth Q&A dataset for evaluation"
            )
        else:  # s3 or hybrid
            dataset_ref = DatasetReference(
                location=DatasetLocation(dataset_location),
                path=f"s3://rem-experiments/{name}/datasets/ground_truth.parquet",
                format="parquet",
                schema_path="datasets/schema.yaml" if dataset_location == "hybrid" else None,
                description="Ground truth dataset for evaluation"
            )

        # Build results config. Traces are only saved for S3-backed results;
        # Git-only experiments keep just the metrics summary to stay small.
        if results_location == "git":
            results_config = ResultsConfig(
                location=DatasetLocation.GIT,
                base_path="results/",
                save_traces=False,
                save_metrics_summary=True
            )
        elif results_location == "s3":
            results_config = ResultsConfig(
                location=DatasetLocation.S3,
                base_path=f"s3://rem-experiments/{name}/results/",
                save_traces=True,
                save_metrics_summary=False
            )
        else:  # hybrid
            results_config = ResultsConfig(
                location=DatasetLocation.HYBRID,
                base_path=f"s3://rem-experiments/{name}/results/",
                save_traces=True,
                save_metrics_summary=True,
                metrics_file="metrics.json"
            )

        # Parse tags (case is preserved as typed; list filtering elsewhere
        # should account for that).
        tag_list = [t.strip() for t in tags.split(",")] if tags else []

        # Create experiment config (starts in DRAFT until datasets are added)
        config = ExperimentConfig(
            name=name,
            description=description or f"Evaluation experiment for {agent} agent",
            agent_schema_ref=SchemaReference(
                name=agent,
                version=None,  # Use latest by default
                type="agent"
            ),
            evaluator_schema_ref=SchemaReference(
                name=evaluator,
                type="evaluator"
            ),
            datasets={"ground_truth": dataset_ref},
            results=results_config,
            status=ExperimentStatus.DRAFT,
            tags=tag_list
        )

        # Save configuration
        # NOTE(review): save()/save_readme() presumably return the written
        # file paths (they are echoed below) — confirm against ExperimentConfig.
        config_path = config.save(base_path)
        readme_path = config.save_readme(base_path)

        # Create new directory structure
        exp_dir = config.get_experiment_dir(base_path)

        # Create ground-truth directory
        ground_truth_dir = exp_dir / "ground-truth"
        ground_truth_dir.mkdir(parents=True, exist_ok=True)

        # Create seed-data directory
        seed_data_dir = exp_dir / "seed-data"
        seed_data_dir.mkdir(parents=True, exist_ok=True)

        # Create results directory if Git-based (S3/hybrid results live in the bucket)
        if results_location == "git":
            results_dir = exp_dir / "results"
            results_dir.mkdir(parents=True, exist_ok=True)

        # Create placeholder files with documentation
        ground_truth_readme = ground_truth_dir / "README.md"
        ground_truth_readme.write_text("""# Ground Truth Dataset

This directory contains Q&A pairs for evaluating the agent.

## Format

**CSV format** (`dataset.csv`):
```csv
input,expected_output,metadata
"What is the capital of France?","Paris","{\"difficulty\": \"easy\"}"
```

**YAML format** (`dataset.yaml`):
```yaml
- input: "What is the capital of France?"
  expected_output: "Paris"
  metadata:
    difficulty: easy
```

## Generating Ground Truth

### Using AI Assistants

AI coding assistants (like Claude, GPT-4, etc.) can help generate comprehensive ground-truth datasets:

1. **Generate from existing examples**: Show the assistant examples from your domain and ask it to create similar Q&A pairs
2. **Create challenging questions**: Ask the assistant to act as a judge and generate HARD questions that test edge cases
3. **Vary difficulty levels**: Request a mix of easy, medium, and hard questions with appropriate metadata tags

Example prompt:
```
Based on these example documents about [your domain], generate 20 Q&A pairs
for evaluating an agent. Include:
- 5 easy factual questions
- 10 medium questions requiring reasoning
- 5 hard questions with edge cases
Format as CSV with difficulty and category metadata.
```

### Ground Truth as Judge

**Important**: Keep ground-truth data **separate** from the agent being tested:
- Ground truth should be hidden from the agent during evaluation
- The agent should only see the `input` field
- The evaluator compares agent output against `expected_output`
- This ensures unbiased evaluation

### Quality Guidelines

1. **Diverse Coverage**: Include various question types and difficulty levels
2. **Domain-Specific**: Use terminology and scenarios from your actual use case
3. **Metadata Tags**: Add difficulty, category, priority for analysis
4. **SME Review**: Have domain experts validate expected outputs

## Usage

These datasets can be:
- Loaded into evaluation frameworks (Arize Phoenix, etc.)
- Used for regression testing
- Converted to different formats as needed

The experiment runner will automatically use this data for evaluation.
""")

        seed_data_readme = seed_data_dir / "README.md"
        seed_data_readme.write_text("""# Seed Data

This directory contains REM data to load before running the experiment.

## Format

Use standard REM YAML format:

```yaml
users:
  - id: test-user-001
    user_id: experiment-test
    email: test@example.com

resources:
  - id: resource-001
    user_id: experiment-test
    label: example-document
    content: "Document content here..."

moments:
  - id: moment-001
    user_id: experiment-test
    label: example-meeting
    starts_timestamp: "2024-01-15T14:00:00"
```

## Generating Seed Data

### Using AI Assistants

AI coding assistants can help generate realistic seed data for your experiments:

1. **From existing datasets**: Reference examples from the `datasets/` directory
2. **Domain-specific scenarios**: Describe your use case and ask for appropriate test data
3. **Anonymized versions**: Ask to create fictional data based on real patterns

Example prompt:
```
Based on the recruitment dataset examples in datasets/domains/recruitment/,
generate seed data for testing a CV parser agent. Include:
- 3 test users
- 5 CV documents (resources) with varied experience levels
- 2 interview moment entries
Use fictional names and anonymize all content.
```

### Best Practices

1. **Minimal**: Only include data necessary for the ground-truth questions to be answerable
2. **Anonymized**: Always use fictional names, companies, and content
3. **Relevant**: Seed data should provide context for evaluation questions
4. **Versioned**: Track changes to seed data in Git for reproducibility

## Usage

Load this data before running experiments:
```bash
rem db load --file seed-data/data.yaml --user-id experiment-test
```

This ensures your agent has the necessary context for evaluation.
""")

        # Summary + guided next steps for the user
        click.echo(f"\n✓ Created experiment: {name}")
        click.echo(f" Configuration: {config_path}")
        click.echo(f" Documentation: {readme_path}")
        click.echo(f" Ground Truth: {ground_truth_dir}")
        click.echo(f" Seed Data: {seed_data_dir}")
        if results_location == "git":
            click.echo(f" Results: {results_dir}")
        click.echo(f"\nNext steps:")
        click.echo(f" 1. Add ground truth Q&A to {ground_truth_dir}/dataset.csv")
        click.echo(f" 2. Add seed data to {seed_data_dir}/data.yaml (optional)")
        click.echo(f" 3. Review configuration: {config_path}")
        click.echo(f" 4. Run experiment: rem experiments run {name}")
        click.echo(f" 5. Commit to Git: git add {base_path}/{name}/ && git commit")

    except Exception as e:
        # Broad catch is intentional at this CLI boundary: log, surface to
        # stderr, and exit non-zero via Abort.
        logger.error(f"Failed to create experiment: {e}")
        click.echo(f"Error: {e}", err=True)
        raise click.Abort()
359
+
360
+
361
+ # =============================================================================
362
+ # LIST COMMAND
363
+ # =============================================================================
364
+
365
+
366
@experiments.command("list")
@click.option("--base-path", help="Base directory for experiments (default: EXPERIMENTS_HOME or 'experiments')")
@click.option("--status", help="Filter by status (draft, ready, completed, etc.)")
@click.option("--tags", help="Filter by tags (comma-separated)")
def list_experiments(
    base_path: Optional[str],
    status: Optional[str],
    tags: Optional[str],
):
    """List all experiments.

    Scans `<base-path>/*/experiment.yaml`, applies optional status and tag
    filters, and prints a table sorted by most recently updated.

    Examples:
        rem experiments list
        rem experiments list --status ready
        rem experiments list --tags production,cv-parser
    """
    from rem.models.core.experiment import ExperimentConfig, ExperimentStatus
    import os

    try:
        # Resolve base path: CLI arg > EXPERIMENTS_HOME env var > default "experiments"
        if base_path is None:
            base_path = os.getenv("EXPERIMENTS_HOME", "experiments")

        experiments_dir = Path(base_path)
        if not experiments_dir.exists():
            click.echo(f"No experiments directory found at {base_path}")
            return

        # Find all experiment.yaml files; skip hidden dirs, and warn (rather
        # than abort the whole listing) on configs that fail to parse.
        configs = []
        for exp_dir in experiments_dir.iterdir():
            if not exp_dir.is_dir() or exp_dir.name.startswith("."):
                continue

            config_file = exp_dir / "experiment.yaml"
            if config_file.exists():
                try:
                    config = ExperimentConfig.from_yaml(config_file)
                    configs.append(config)
                except Exception as e:
                    logger.warning(f"Failed to load {config_file}: {e}")

        # Apply filters
        if status:
            status_enum = ExperimentStatus(status)
            configs = [c for c in configs if c.status == status_enum]

        if tags:
            # BUG FIX: compare case-insensitively on BOTH sides. Previously
            # only the --tags side was lowercased while stored tags keep the
            # case they were created with (see `create`), so an experiment
            # tagged "Production" could never match --tags production.
            filter_tags = {t.strip().lower() for t in tags.split(",")}
            configs = [c for c in configs if filter_tags & {t.lower() for t in c.tags}]

        if not configs:
            click.echo("No experiments found")
            return

        # Sort by updated_at descending (most recently touched first)
        configs.sort(key=lambda c: c.updated_at, reverse=True)

        # Display fixed-width table (columns truncated to keep rows aligned)
        click.echo(f"\nExperiments ({len(configs)} total):\n")
        click.echo(f"{'Name':<30} {'Status':<12} {'Agent':<20} {'Updated':<12}")
        click.echo("-" * 75)

        for config in configs:
            name = config.name[:30]
            status_str = config.status.value[:12]
            agent = config.agent_schema_ref.name[:20]
            updated = config.updated_at.strftime("%Y-%m-%d")
            click.echo(f"{name:<30} {status_str:<12} {agent:<20} {updated:<12}")

    except Exception as e:
        # CLI boundary: log, surface to stderr, exit non-zero
        logger.error(f"Failed to list experiments: {e}")
        click.echo(f"Error: {e}", err=True)
        raise click.Abort()
441
+
442
+
443
+ # =============================================================================
444
+ # SHOW COMMAND
445
+ # =============================================================================
446
+
447
+
448
@experiments.command("show")
@click.argument("name")
@click.option("--base-path", help="Base directory for experiments (default: EXPERIMENTS_HOME or 'experiments')")
def show(name: str, base_path: Optional[str]):
    """Show experiment details.

    Loads `<base-path>/<name>/experiment.yaml` and prints a human-readable
    summary: description, status, agent/evaluator refs, datasets, results
    configuration, timestamps, and free-form metadata.

    Examples:
        rem experiments show hello-world-validation
    """
    from rem.models.core.experiment import ExperimentConfig
    import os

    try:
        # Resolve base path: CLI arg > EXPERIMENTS_HOME env var > default "experiments"
        if base_path is None:
            base_path = os.getenv("EXPERIMENTS_HOME", "experiments")

        config_path = Path(base_path) / name / "experiment.yaml"
        if not config_path.exists():
            click.echo(f"Experiment not found: {name}")
            click.echo(f" Looked in: {config_path}")
            raise click.Abort()

        config = ExperimentConfig.from_yaml(config_path)

        click.echo(f"\nExperiment: {config.name}")
        click.echo(f"{'=' * 60}\n")
        click.echo(f"Description: {config.description}")
        click.echo(f"Status: {config.status.value}")
        if config.tags:
            click.echo(f"Tags: {', '.join(config.tags)}")

        click.echo("\nAgent Schema:")
        click.echo(f" Name: {config.agent_schema_ref.name}")
        click.echo(f" Version: {config.agent_schema_ref.version or 'latest'}")

        click.echo("\nEvaluator Schema:")
        click.echo(f" Name: {config.evaluator_schema_ref.name}")

        click.echo("\nDatasets:")
        for ds_name, ds_ref in config.datasets.items():
            click.echo(f" {ds_name}:")
            click.echo(f" Location: {ds_ref.location.value}")
            click.echo(f" Path: {ds_ref.path}")
            click.echo(f" Format: {ds_ref.format}")

        click.echo("\nResults:")
        click.echo(f" Location: {config.results.location.value}")
        click.echo(f" Base Path: {config.results.base_path}")
        click.echo(f" Save Traces: {config.results.save_traces}")
        click.echo(f" Metrics File: {config.results.metrics_file}")

        click.echo("\nTimestamps:")
        click.echo(f" Created: {config.created_at.isoformat()}")
        click.echo(f" Updated: {config.updated_at.isoformat()}")
        if config.last_run_at:
            click.echo(f" Last Run: {config.last_run_at.isoformat()}")

        if config.metadata:
            click.echo("\nMetadata:")
            for key, value in config.metadata.items():
                click.echo(f" {key}: {value}")

    except click.Abort:
        # BUG FIX: click.Abort subclasses RuntimeError, so the broad handler
        # below used to swallow our own "not found" abort, log a confusing
        # "Failed to show experiment:" with an empty message, print a bare
        # "Error:" line, and only then re-abort. Let it propagate untouched.
        raise
    except Exception as e:
        # CLI boundary for real failures (bad YAML, IO errors): log, surface
        # to stderr, exit non-zero.
        logger.error(f"Failed to show experiment: {e}")
        click.echo(f"Error: {e}", err=True)
        raise click.Abort()
515
+
516
+
517
+ # =============================================================================
518
+ # RUN COMMAND
519
+ # =============================================================================
520
+
521
+
522
+ @experiments.command("run")
523
+ @click.argument("name")
524
+ @click.option("--base-path", help="Base directory for experiments (default: EXPERIMENTS_HOME or 'experiments')")
525
+ @click.option("--version", help="Git tag version to load (e.g., 'experiments/my-exp/v1.0.0')")
526
+ @click.option("--dry-run", is_flag=True, help="Test on small subset without saving")
527
+ @click.option("--update-prompts", is_flag=True, help="Update prompts in Phoenix before running")
528
+ @click.option("--phoenix-url", help="Phoenix server URL (overrides PHOENIX_BASE_URL env var)")
529
+ @click.option("--phoenix-api-key", help="Phoenix API key (overrides PHOENIX_API_KEY env var)")
530
+ def run(
531
+ name: str,
532
+ base_path: Optional[str],
533
+ version: Optional[str],
534
+ dry_run: bool,
535
+ update_prompts: bool,
536
+ phoenix_url: Optional[str],
537
+ phoenix_api_key: Optional[str],
538
+ ):
539
+ """Run an experiment using Phoenix provider.
540
+
541
+ Loads configuration, executes agent and evaluator, saves results.
542
+
543
+ Phoenix Connection:
544
+ Commands respect PHOENIX_BASE_URL and PHOENIX_API_KEY environment variables.
545
+ Defaults to localhost:6006 for local development.
546
+
547
+ Production (on cluster):
548
+ export PHOENIX_BASE_URL=http://phoenix-svc.observability.svc.cluster.local:6006
549
+ export PHOENIX_API_KEY=<your-key>
550
+ kubectl exec -it deployment/rem-api -- rem experiments run my-experiment
551
+
552
+ Development (port-forward):
553
+ kubectl port-forward -n observability svc/phoenix-svc 6006:6006
554
+ export PHOENIX_API_KEY=<your-key>
555
+ rem experiments run my-experiment
556
+
557
+ Local (local Phoenix):
558
+ python -m phoenix.server.main serve
559
+ rem experiments run my-experiment
560
+
561
+ Examples:
562
+ # Run experiment with latest schemas
563
+ rem experiments run hello-world-validation
564
+
565
+ # Run specific version
566
+ rem experiments run hello-world-validation \\
567
+ --version experiments/hello-world-validation/v1.0.0
568
+
569
+ # Dry run (test without saving)
570
+ rem experiments run cv-parser-production --dry-run
571
+
572
+ # Override Phoenix connection
573
+ rem experiments run my-experiment \\
574
+ --phoenix-url http://phoenix.example.com:6006 \\
575
+ --phoenix-api-key <key>
576
+ """
577
+ from rem.models.core.experiment import ExperimentConfig, ExperimentStatus
578
+ from rem.services.git import GitService
579
+ from rem.services.phoenix import PhoenixClient
580
+ from rem.agentic.providers.phoenix import create_evaluator_from_schema
581
+ from datetime import datetime
582
+ import pandas as pd
583
+ import os
584
+
585
+ try:
586
+ # Resolve base path
587
+ if base_path is None:
588
+ base_path = os.getenv("EXPERIMENTS_HOME", "experiments")
589
+
590
+ # Load experiment configuration
591
+ if version:
592
+ # Load from Git at specific version
593
+ git_svc = GitService()
594
+ config_yaml = git_svc.fs.read(
595
+ f"git://rem/.experiments/{name}/experiment.yaml?ref={version}"
596
+ )
597
+ config = ExperimentConfig(**config_yaml)
598
+ click.echo(f"✓ Loaded experiment from Git: {version}")
599
+ else:
600
+ # Load from local filesystem
601
+ config_path = Path(base_path) / name / "experiment.yaml"
602
+ if not config_path.exists():
603
+ click.echo(f"Experiment not found: {name}")
604
+ click.echo(f" Looked in: {config_path}")
605
+ raise click.Abort()
606
+ config = ExperimentConfig.from_yaml(config_path)
607
+ click.echo(f"✓ Loaded experiment: {name}")
608
+
609
+ # Display experiment info
610
+ click.echo(f"\nExperiment: {config.name}")
611
+ click.echo(f" Agent: {config.agent_schema_ref.name} (version: {config.agent_schema_ref.version or 'latest'})")
612
+ click.echo(f" Evaluator: {config.evaluator_schema_ref.name}")
613
+ click.echo(f" Status: {config.status.value}")
614
+ if dry_run:
615
+ click.echo(f" Mode: DRY RUN (no data will be saved)")
616
+ click.echo()
617
+
618
+ # Load agent schema from Git or filesystem
619
+ agent_name = config.agent_schema_ref.name
620
+ agent_version = config.agent_schema_ref.version
621
+
622
+ click.echo(f"Loading agent schema: {agent_name} (version: {agent_version or 'latest'})")
623
+
624
+ # Try Git first, fallback to filesystem
625
+ agent_schema = None
626
+ try:
627
+ git_svc = GitService()
628
+ agent_schema = git_svc.load_schema(agent_name, version=agent_version)
629
+ click.echo(f"✓ Loaded agent schema from Git")
630
+ except Exception as e:
631
+ logger.debug(f"Git not available, trying filesystem: {e}")
632
+
633
+ # Fallback to local filesystem
634
+ from rem.services.fs import FS
635
+ fs = FS()
636
+
637
+ schema_path = f"schemas/agents/{agent_name}.yaml"
638
+ try:
639
+ agent_schema = fs.read(schema_path)
640
+ click.echo(f"✓ Loaded agent schema from filesystem")
641
+ except Exception as fs_error:
642
+ logger.error(f"Failed to load agent schema: Git: {e}, FS: {fs_error}")
643
+ click.echo(f"Error: Could not load agent schema '{agent_name}'")
644
+ click.echo(f" Tried Git: {e}")
645
+ click.echo(f" Tried filesystem: {schema_path}")
646
+ click.echo(f" Make sure the schema exists")
647
+ raise click.Abort()
648
+
649
+ # Create agent function from schema
650
+ from rem.agentic.providers.pydantic_ai import create_agent
651
+ from rem.agentic.context import AgentContext
652
+
653
+ # Create agent context
654
+ context = AgentContext(
655
+ user_id="experiment-runner",
656
+ tenant_id="experiments",
657
+ session_id=f"experiment-{config.name}",
658
+ )
659
+
660
+ agent_runtime = asyncio.run(create_agent(
661
+ context=context,
662
+ agent_schema_override=agent_schema
663
+ ))
664
+
665
+ def task_fn(example: dict[str, Any]) -> dict[str, Any]:
666
+ """Run agent on example."""
667
+ input_data = example.get("input", {})
668
+
669
+ # Extract query from input
670
+ query = input_data.get("query", "")
671
+ if not query:
672
+ # Try other common input keys
673
+ query = input_data.get("text", input_data.get("prompt", str(input_data)))
674
+
675
+ # Run agent
676
+ result = asyncio.run(agent_runtime.run(query))
677
+
678
+ # Serialize result (critical for Pydantic models!)
679
+ from rem.agentic.serialization import serialize_agent_result
680
+ serialized = serialize_agent_result(result)
681
+ # Ensure we return a dict (Phoenix expects dict output)
682
+ if isinstance(serialized, str):
683
+ return {"output": serialized}
684
+ return serialized if isinstance(serialized, dict) else {"output": str(serialized)}
685
+
686
+ # Load evaluator schema
687
+ evaluator_name = config.evaluator_schema_ref.name
688
+ evaluator_version = config.evaluator_schema_ref.version
689
+
690
+ # Resolve evaluator path (evaluators are organized by agent name)
691
+ evaluator_schema_path = f"rem/schemas/evaluators/{agent_name}/{evaluator_name}.yaml"
692
+
693
+ click.echo(f"Loading evaluator: {evaluator_name} for agent {agent_name}")
694
+
695
+ try:
696
+ evaluator_fn = create_evaluator_from_schema(
697
+ evaluator_schema_path=evaluator_schema_path,
698
+ model_name=None, # Use default from schema
699
+ )
700
+ click.echo(f"✓ Loaded evaluator schema")
701
+ except Exception as e:
702
+ logger.warning(f"Failed to load evaluator: {e}")
703
+ click.echo(f"Error: Could not load evaluator schema")
704
+ click.echo(f" Path: {evaluator_schema_path}")
705
+ click.echo(f" Make sure the schema exists")
706
+ raise click.Abort()
707
+
708
+ # Load dataset
709
+ click.echo(f"Loading dataset: {list(config.datasets.keys())[0]}")
710
+ dataset_ref = list(config.datasets.values())[0]
711
+
712
+ if dataset_ref.location.value == "git":
713
+ # Load from Git
714
+ dataset_path = Path(base_path) / name / dataset_ref.path
715
+ if not dataset_path.exists():
716
+ click.echo(f"Error: Dataset not found: {dataset_path}")
717
+ raise click.Abort()
718
+
719
+ if dataset_ref.format == "csv":
720
+ dataset_df = pd.read_csv(dataset_path)
721
+ elif dataset_ref.format == "parquet":
722
+ dataset_df = pd.read_parquet(dataset_path)
723
+ elif dataset_ref.format == "jsonl":
724
+ dataset_df = pd.read_json(dataset_path, lines=True)
725
+ else:
726
+ click.echo(f"Error: Format '{dataset_ref.format}' not yet supported")
727
+ raise click.Abort()
728
+ elif dataset_ref.location.value in ["s3", "hybrid"]:
729
+ # Load from S3 using FS provider
730
+ from rem.services.fs import FS
731
+
732
+ fs = FS()
733
+
734
+ try:
735
+ if dataset_ref.format == "csv":
736
+ content = fs.read(dataset_ref.path)
737
+ from io import StringIO
738
+ dataset_df = pd.read_csv(StringIO(content))
739
+ elif dataset_ref.format == "parquet":
740
+ # For parquet, we need binary read
741
+ import tempfile
742
+ with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp:
743
+ tmp_path = tmp.name
744
+ # Download via FS
745
+ content_bytes = fs.read(dataset_ref.path)
746
+ tmp.write(content_bytes)
747
+ dataset_df = pd.read_parquet(tmp_path)
748
+ Path(tmp_path).unlink() # Clean up temp file
749
+ elif dataset_ref.format == "jsonl":
750
+ content = fs.read(dataset_ref.path)
751
+ from io import StringIO
752
+ dataset_df = pd.read_json(StringIO(content), lines=True)
753
+ else:
754
+ click.echo(f"Error: Format '{dataset_ref.format}' not yet supported")
755
+ raise click.Abort()
756
+
757
+ click.echo(f"✓ Loaded dataset from S3")
758
+ except Exception as e:
759
+ logger.error(f"Failed to load dataset from S3: {e}")
760
+ click.echo(f"Error: Could not load dataset from S3")
761
+ click.echo(f" Path: {dataset_ref.path}")
762
+ click.echo(f" Format: {dataset_ref.format}")
763
+ raise click.Abort()
764
+ else:
765
+ click.echo(f"Error: Unknown dataset location: {dataset_ref.location.value}")
766
+ raise click.Abort()
767
+
768
+ click.echo(f"✓ Loaded dataset: {len(dataset_df)} examples")
769
+
770
+ # Update prompts in Phoenix if requested
771
+ if update_prompts:
772
+ # TODO: Implement prompt updating
773
+ click.echo("⚠ --update-prompts not yet implemented")
774
+
775
+ # Run experiment via Phoenix
776
+ if not dry_run:
777
+ # Create Phoenix client with optional overrides
778
+ from rem.services.phoenix.config import PhoenixConfig
779
+ import os
780
+
781
+ phoenix_config = PhoenixConfig(
782
+ base_url=phoenix_url or os.getenv("PHOENIX_BASE_URL"),
783
+ api_key=phoenix_api_key or os.getenv("PHOENIX_API_KEY")
784
+ )
785
+
786
+ # Display Phoenix connection info
787
+ phoenix_display_url = phoenix_config.base_url
788
+ phoenix_has_key = "Yes" if phoenix_config.api_key else "No"
789
+ click.echo(f"\nPhoenix Connection:")
790
+ click.echo(f" URL: {phoenix_display_url}")
791
+ click.echo(f" API Key: {phoenix_has_key}")
792
+ click.echo()
793
+
794
+ client = PhoenixClient(config=phoenix_config)
795
+
796
+ experiment_name = f"{config.name}-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
797
+
798
+ click.echo(f"\n⏳ Running experiment: {experiment_name}")
799
+ click.echo(f" This may take several minutes...")
800
+
801
+ experiment = client.run_experiment(
802
+ dataset=dataset_df, # type: ignore[arg-type]
803
+ task=task_fn,
804
+ evaluators=[evaluator_fn],
805
+ experiment_name=experiment_name,
806
+ experiment_description=config.description,
807
+ experiment_metadata={
808
+ "agent": config.agent_schema_ref.name,
809
+ "evaluator": config.evaluator_schema_ref.name,
810
+ "experiment_config": config.name,
811
+ **config.metadata
812
+ }
813
+ )
814
+
815
+ # Update experiment status
816
+ config.status = ExperimentStatus.COMPLETED
817
+ config.last_run_at = datetime.now()
818
+ if not version: # Only save if not loading from Git
819
+ config.save(base_path)
820
+
821
+ click.echo(f"\n✓ Experiment complete!")
822
+ if hasattr(experiment, "url"):
823
+ click.echo(f" View results: {experiment.url}") # type: ignore[attr-defined]
824
+
825
+ # Save results according to config.results settings
826
+ if config.results.save_metrics_summary:
827
+ # Get experiment data
828
+ try:
829
+ exp_data = client.get_experiment(experiment.id) # type: ignore[attr-defined]
830
+
831
+ # Build metrics summary
832
+ metrics = {
833
+ "experiment_id": experiment.id, # type: ignore[attr-defined]
834
+ "experiment_name": experiment_name,
835
+ "agent": config.agent_schema_ref.name,
836
+ "evaluator": config.evaluator_schema_ref.name,
837
+ "dataset_size": len(dataset_df),
838
+ "completed_at": datetime.now().isoformat(),
839
+ "phoenix_url": getattr(experiment, "url", None),
840
+ "task_runs": len(exp_data.get("task_runs", [])),
841
+ }
842
+
843
+ # Save metrics
844
+ if config.results.location.value == "git" or config.results.location.value == "hybrid":
845
+ # Save to Git
846
+ metrics_path = Path(base_path) / name / "results" / (config.results.metrics_file or "metrics.json")
847
+ metrics_path.parent.mkdir(parents=True, exist_ok=True)
848
+
849
+ import json
850
+ with open(metrics_path, "w") as f:
851
+ json.dump(metrics, f, indent=2)
852
+
853
+ click.echo(f"\n✓ Saved metrics summary: {metrics_path}")
854
+
855
+ if config.results.location.value == "s3" or config.results.location.value == "hybrid":
856
+ # Save to S3
857
+ from rem.services.fs import FS
858
+ fs = FS()
859
+
860
+ s3_metrics_path = config.results.base_path.rstrip("/") + "/" + (config.results.metrics_file or "metrics.json")
861
+
862
+ import json
863
+ fs.write(s3_metrics_path, json.dumps(metrics, indent=2))
864
+
865
+ click.echo(f"✓ Saved metrics summary to S3: {s3_metrics_path}")
866
+
867
+ except Exception as e:
868
+ logger.warning(f"Failed to save metrics: {e}")
869
+ click.echo(f"⚠ Could not save metrics summary: {e}")
870
+ else:
871
+ click.echo("\n✓ Dry run complete (no data saved)")
872
+
873
+ except Exception as e:
874
+ logger.error(f"Failed to run experiment: {e}")
875
+ click.echo(f"Error: {e}", err=True)
876
+ raise click.Abort()
877
+
878
+
879
+ # =============================================================================
880
+ # DATASET COMMANDS
881
+ # =============================================================================
882
+
883
+
884
@experiments.group()
def dataset():
    """Dataset management commands."""
    # Click group node only: subcommands (list/create/add) are registered
    # against this group via @dataset.command(...) decorators below.
    pass
888
+
889
+
890
@dataset.command("list")
def dataset_list():
    """List all datasets.

    Example:
        rem experiments dataset list
    """
    from rem.services.phoenix import PhoenixClient

    try:
        phoenix = PhoenixClient()
        datasets = phoenix.list_datasets()

        # Nothing to show — bail out early.
        if not datasets:
            click.echo("No datasets found")
            return

        # Fixed-width table: name (40), example count (10), created date (12).
        click.echo(f"\nDatasets ({len(datasets)} total):\n")
        click.echo(f"{'Name':<40} {'Examples':>10} {'Created':<12}")
        click.echo("-" * 65)

        for entry in datasets:
            ds_name = entry.get("name", "")[:40]
            ds_count = entry.get("example_count", 0)
            ds_created = entry.get("created_at", "")[:10]  # date portion only
            click.echo(f"{ds_name:<40} {ds_count:>10} {ds_created:<12}")

    except Exception as e:
        # Log for diagnostics, surface a short error to the user, abort CLI.
        logger.error(f"Failed to list datasets: {e}")
        click.echo(f"Error: {e}", err=True)
        raise click.Abort()
921
+
922
+
923
@dataset.command("create")
@click.argument("name")
@click.option("--from-csv", type=click.Path(exists=True, path_type=Path), help="Create from CSV file")
@click.option("--input-keys", help="Comma-separated input column names")
@click.option("--output-keys", help="Comma-separated output column names (reference/ground truth)")
@click.option("--metadata-keys", help="Comma-separated metadata column names (difficulty, type, etc.)")
@click.option("--description", help="Dataset description")
def dataset_create(
    name: str,
    from_csv: Optional[Path],
    input_keys: Optional[str],
    output_keys: Optional[str],
    metadata_keys: Optional[str],
    description: Optional[str],
):
    """Create a dataset (golden set).

    Two modes:
    1. From CSV: --from-csv golden.csv --input-keys query --output-keys expected
    2. Manual (empty): Will create empty dataset to populate later

    Examples:
        # From CSV (SME golden set)
        rem experiments dataset create rem-lookup-golden \\
            --from-csv golden-lookup.csv \\
            --input-keys query \\
            --output-keys expected_label,expected_type \\
            --metadata-keys difficulty,query_type

        # Empty dataset (populate later)
        rem experiments dataset create rem-test --description "Test dataset"
    """
    from rem.services.phoenix import PhoenixClient

    try:
        phoenix = PhoenixClient()

        if not from_csv:
            # Manual mode: register an empty dataset to be filled in later
            # via `rem experiments dataset add`.
            created = phoenix.create_dataset_from_data(
                name=name,
                inputs=[],
                outputs=[],
                description=description,
            )

            click.echo(f"✓ Created empty dataset '{created.name}'")
            click.echo(" Use 'rem experiments dataset add' to add examples")
            return

        # CSV mode requires both key lists.
        # NOTE: validation intentionally happens inside the try so that
        # the Abort is logged by the handler below, matching sibling commands.
        if not input_keys or not output_keys:
            click.echo("Error: --input-keys and --output-keys required for CSV", err=True)
            raise click.Abort()

        input_columns = input_keys.split(",")
        output_columns = output_keys.split(",")
        metadata_columns = metadata_keys.split(",") if metadata_keys else None

        created = phoenix.create_dataset_from_csv(
            name=name,
            csv_file_path=from_csv,
            input_keys=input_columns,
            output_keys=output_columns,
            metadata_keys=metadata_columns,
            description=description,
        )

        click.echo(f"✓ Created dataset '{created.name}' from CSV with {len(created)} examples")

    except Exception as e:
        # Log for diagnostics, surface a short error, abort the CLI command.
        logger.error(f"Failed to create dataset: {e}")
        click.echo(f"Error: {e}", err=True)
        raise click.Abort()
993
+
994
+
995
@dataset.command("add")
@click.argument("dataset_name")
@click.option("--from-csv", type=click.Path(exists=True, path_type=Path), required=True,
              help="CSV file with examples")
@click.option("--input-keys", required=True, help="Comma-separated input column names")
@click.option("--output-keys", required=True, help="Comma-separated output column names")
@click.option("--metadata-keys", help="Comma-separated metadata column names")
def dataset_add(
    dataset_name: str,
    from_csv: Path,
    input_keys: str,
    output_keys: str,
    metadata_keys: Optional[str],
):
    """Add examples to an existing dataset.

    Example:
        rem experiments dataset add rem-lookup-golden \\
            --from-csv new-examples.csv \\
            --input-keys query \\
            --output-keys expected_label,expected_type
    """
    from rem.services.phoenix import PhoenixClient
    import pandas as pd

    try:
        phoenix = PhoenixClient()

        # Read the CSV once; column subsets are projected below.
        frame = pd.read_csv(from_csv)

        input_columns = input_keys.split(",")
        output_columns = output_keys.split(",")

        # Project each key group into a list of per-row dicts
        # (the shape the Phoenix client expects).
        example_inputs = cast(list[dict[str, Any]], frame[input_columns].to_dict("records"))
        example_outputs = cast(list[dict[str, Any]], frame[output_columns].to_dict("records"))

        example_metadata = None
        if metadata_keys:
            metadata_columns = metadata_keys.split(",")
            example_metadata = cast(list[dict[str, Any]], frame[metadata_columns].to_dict("records"))

        updated = phoenix.add_examples_to_dataset(
            dataset=dataset_name,
            inputs=example_inputs,
            outputs=example_outputs,
            metadata=example_metadata,
        )

        click.echo(f"✓ Added {len(example_inputs)} examples to dataset '{updated.name}'")
        click.echo(f" Total examples: {len(updated)}")

    except Exception as e:
        # Log for diagnostics, surface a short error, abort the CLI command.
        logger.error(f"Failed to add examples: {e}")
        click.echo(f"Error: {e}", err=True)
        raise click.Abort()
1048
+
1049
+
1050
+ # =============================================================================
1051
+ # PROMPT COMMANDS
1052
+ # =============================================================================
1053
+
1054
+
1055
@experiments.group()
def prompt():
    """Prompt management commands."""
    # Click group node only: subcommands (create/list) are registered
    # against this group via @prompt.command(...) decorators below.
    pass
1059
+
1060
+
1061
@prompt.command("create")
@click.argument("name")
@click.option("--system-prompt", "-s", required=True, help="System prompt text")
@click.option("--description", "-d", help="Prompt description")
@click.option("--model-provider", default="OPENAI", help="Model provider (OPENAI, ANTHROPIC)")
@click.option("--model-name", "-m", help="Model name (e.g., gpt-4o, claude-sonnet-4-5)")
@click.option("--type", "-t", "prompt_type", default="Agent", help="Prompt type (Agent or Evaluator)")
def prompt_create(
    name: str,
    system_prompt: str,
    description: Optional[str],
    model_provider: str,
    model_name: Optional[str],
    prompt_type: str,
):
    """Create a prompt.

    Examples:
        # Create agent prompt
        rem experiments prompt create hello-world \\
            --system-prompt "You are a helpful assistant." \\
            --model-name gpt-4o

        # Create evaluator prompt
        rem experiments prompt create correctness-evaluator \\
            --system-prompt "Evaluate the correctness of responses." \\
            --type Evaluator \\
            --model-provider ANTHROPIC \\
            --model-name claude-sonnet-4-5
    """
    from rem.services.phoenix import PhoenixClient
    from rem.services.phoenix.prompt_labels import PhoenixPromptLabels
    from phoenix.client import Client
    from phoenix.client.types.prompts import PromptVersion
    from phoenix.client.__generated__ import v1

    try:
        # Set default model if not specified
        if not model_name:
            model_name = "gpt-4o" if model_provider == "OPENAI" else "claude-sonnet-4-5-20250929"

        # Reuse the REM Phoenix client's resolved config (base URL + API key).
        phoenix_client = PhoenixClient()
        config = phoenix_client.config

        # Low-level Phoenix client used for the prompts API.
        client = Client(
            base_url=config.base_url,
            api_key=config.api_key
        )

        # Single system message carrying the prompt text.
        messages = [
            v1.PromptMessage(
                role="system",
                content=system_prompt
            )
        ]

        # Create PromptVersion (description is the version label, not the prompt's)
        version = PromptVersion(
            messages,
            model_name=model_name,
            description="v1.0",
            model_provider=model_provider  # type: ignore[arg-type]
        )

        # Create the prompt
        result = client.prompts.create(
            name=name,
            version=version,
            prompt_description=description or f"{prompt_type} prompt: {name}"
        )

        click.echo(f"✓ Created prompt '{name}' (ID: {result.id})")

        # Best-effort label assignment: look up the prompt's GraphQL node ID
        # by name, then attach labels. Failure here only warns — the prompt
        # itself was already created above.
        try:
            import httpx
            import json

            # BUGFIX: serialize the name with json.dumps so quotes/backslashes
            # in a prompt name cannot break out of the GraphQL string literal
            # (previously interpolated raw inside "%s" with surrounding quotes).
            query = """
            query {
              prompts(first: 1, filterBy: {name: {equals: %s}}) {
                edges {
                  node {
                    id
                    name
                  }
                }
              }
            }
            """ % json.dumps(name)

            response = httpx.post(
                f"{config.base_url}/graphql",
                json={"query": query},
                headers={"authorization": f"Bearer {config.api_key}"},
                timeout=10,
            )
            graphql_result = response.json()
            prompts = graphql_result.get("data", {}).get("prompts", {}).get("edges", [])

            if prompts:
                prompt_id = prompts[0]["node"]["id"]

                # Assign labels
                if not config.base_url:
                    raise ValueError("Phoenix base_url is required")
                labels_helper = PhoenixPromptLabels(
                    base_url=config.base_url, api_key=config.api_key
                )

                # Assign REM + type label
                label_names = ["REM", prompt_type]
                labels_helper.assign_prompt_labels(prompt_id, label_names)
                click.echo(f"✓ Assigned labels: {', '.join(label_names)}")
        except Exception as e:
            click.echo(f"⚠ Warning: Could not assign labels: {e}")

        click.echo(f"\nView in UI: {config.base_url}")

    except Exception as e:
        # Log for diagnostics, surface a short error, abort the CLI command.
        logger.error(f"Failed to create prompt: {e}")
        click.echo(f"Error: {e}", err=True)
        raise click.Abort()
1185
+
1186
+
1187
@prompt.command("list")
def prompt_list():
    """List all prompts.

    Example:
        rem experiments prompt list
    """
    import httpx
    from rem.services.phoenix import PhoenixClient

    try:
        # Resolve Phoenix endpoint + credentials from the REM client config.
        config = PhoenixClient().config

        # Static GraphQL query: first 100 prompts with basic fields.
        query = """
        query {
          prompts(first: 100) {
            edges {
              node {
                id
                name
                description
                createdAt
              }
            }
          }
        }
        """

        response = httpx.post(
            f"{config.base_url}/graphql",
            json={"query": query},
            headers={"authorization": f"Bearer {config.api_key}"},
            timeout=10,
        )

        payload = response.json()
        prompts = payload.get("data", {}).get("prompts", {}).get("edges", [])

        # Nothing to show — bail out early.
        if not prompts:
            click.echo("No prompts found")
            return

        # Fixed-width table: name (40), created timestamp (20).
        click.echo(f"\nPrompts ({len(prompts)} total):\n")
        click.echo(f"{'Name':<40} {'Created':<20}")
        click.echo("-" * 65)

        for edge in prompts:
            node = edge["node"]
            prompt_name = node.get("name", "")[:40]
            created_at = node.get("createdAt", "")[:19]  # drop sub-second/zone
            click.echo(f"{prompt_name:<40} {created_at:<20}")

    except Exception as e:
        # Log for diagnostics, surface a short error, abort the CLI command.
        logger.error(f"Failed to list prompts: {e}")
        click.echo(f"Error: {e}", err=True)
        raise click.Abort()
1244
+
1245
+
1246
+ # =============================================================================
1247
+ # TRACE COMMANDS
1248
+ # =============================================================================
1249
+
1250
+
1251
@experiments.group()
def trace():
    """Trace retrieval commands."""
    # Click group node only: subcommands (list) are registered
    # against this group via @trace.command(...) decorators below.
    pass
1255
+
1256
+
1257
@trace.command("list")
@click.option("--project", "-p", help="Filter by project name")
@click.option("--days", "-d", default=7, help="Number of days to look back")
@click.option("--limit", "-l", default=20, help="Maximum traces to return")
def trace_list(
    project: Optional[str],
    days: int,
    limit: int,
):
    """List recent traces.

    Example:
        rem experiments trace list --project rem-agents --days 7 --limit 50
    """
    from rem.services.phoenix import PhoenixClient
    from datetime import datetime, timedelta

    try:
        phoenix = PhoenixClient()

        # Look-back window starts `days` ago, relative to now.
        window_start = datetime.now() - timedelta(days=days)

        traces_df = phoenix.get_traces(
            project_name=project,
            start_time=window_start,
            limit=limit,
        )

        # Empty result — bail out early.
        if len(traces_df) == 0:
            click.echo("No traces found")
            return

        # Fixed-width table: span id (15), span name (30), start time (20).
        click.echo(f"\nRecent Traces ({len(traces_df)} results):\n")
        click.echo(f"{'Span ID':<15} {'Name':<30} {'Start Time':<20}")
        click.echo("-" * 70)

        for _, record in traces_df.head(limit).iterrows():
            span_id = str(record.get("context.span_id", ""))[:12]
            span_name = str(record.get("name", ""))[:30]
            started = str(record.get("start_time", ""))[:19]
            click.echo(f"{span_id:<15} {span_name:<30} {started:<20}")

    except Exception as e:
        # Log for diagnostics, surface a short error, abort the CLI command.
        logger.error(f"Failed to list traces: {e}")
        click.echo(f"Error: {e}", err=True)
        raise click.Abort()