remdb-0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of remdb might be problematic.

Files changed (187)
  1. rem/__init__.py +2 -0
  2. rem/agentic/README.md +650 -0
  3. rem/agentic/__init__.py +39 -0
  4. rem/agentic/agents/README.md +155 -0
  5. rem/agentic/agents/__init__.py +8 -0
  6. rem/agentic/context.py +148 -0
  7. rem/agentic/context_builder.py +329 -0
  8. rem/agentic/mcp/__init__.py +0 -0
  9. rem/agentic/mcp/tool_wrapper.py +107 -0
  10. rem/agentic/otel/__init__.py +5 -0
  11. rem/agentic/otel/setup.py +151 -0
  12. rem/agentic/providers/phoenix.py +674 -0
  13. rem/agentic/providers/pydantic_ai.py +572 -0
  14. rem/agentic/query.py +117 -0
  15. rem/agentic/query_helper.py +89 -0
  16. rem/agentic/schema.py +396 -0
  17. rem/agentic/serialization.py +245 -0
  18. rem/agentic/tools/__init__.py +5 -0
  19. rem/agentic/tools/rem_tools.py +231 -0
  20. rem/api/README.md +420 -0
  21. rem/api/main.py +324 -0
  22. rem/api/mcp_router/prompts.py +182 -0
  23. rem/api/mcp_router/resources.py +536 -0
  24. rem/api/mcp_router/server.py +213 -0
  25. rem/api/mcp_router/tools.py +584 -0
  26. rem/api/routers/auth.py +229 -0
  27. rem/api/routers/chat/__init__.py +5 -0
  28. rem/api/routers/chat/completions.py +281 -0
  29. rem/api/routers/chat/json_utils.py +76 -0
  30. rem/api/routers/chat/models.py +124 -0
  31. rem/api/routers/chat/streaming.py +185 -0
  32. rem/auth/README.md +258 -0
  33. rem/auth/__init__.py +26 -0
  34. rem/auth/middleware.py +100 -0
  35. rem/auth/providers/__init__.py +13 -0
  36. rem/auth/providers/base.py +376 -0
  37. rem/auth/providers/google.py +163 -0
  38. rem/auth/providers/microsoft.py +237 -0
  39. rem/cli/README.md +455 -0
  40. rem/cli/__init__.py +8 -0
  41. rem/cli/commands/README.md +126 -0
  42. rem/cli/commands/__init__.py +3 -0
  43. rem/cli/commands/ask.py +566 -0
  44. rem/cli/commands/configure.py +497 -0
  45. rem/cli/commands/db.py +493 -0
  46. rem/cli/commands/dreaming.py +324 -0
  47. rem/cli/commands/experiments.py +1302 -0
  48. rem/cli/commands/mcp.py +66 -0
  49. rem/cli/commands/process.py +245 -0
  50. rem/cli/commands/schema.py +183 -0
  51. rem/cli/commands/serve.py +106 -0
  52. rem/cli/dreaming.py +363 -0
  53. rem/cli/main.py +96 -0
  54. rem/config.py +237 -0
  55. rem/mcp_server.py +41 -0
  56. rem/models/core/__init__.py +49 -0
  57. rem/models/core/core_model.py +64 -0
  58. rem/models/core/engram.py +333 -0
  59. rem/models/core/experiment.py +628 -0
  60. rem/models/core/inline_edge.py +132 -0
  61. rem/models/core/rem_query.py +243 -0
  62. rem/models/entities/__init__.py +43 -0
  63. rem/models/entities/file.py +57 -0
  64. rem/models/entities/image_resource.py +88 -0
  65. rem/models/entities/message.py +35 -0
  66. rem/models/entities/moment.py +123 -0
  67. rem/models/entities/ontology.py +191 -0
  68. rem/models/entities/ontology_config.py +131 -0
  69. rem/models/entities/resource.py +95 -0
  70. rem/models/entities/schema.py +87 -0
  71. rem/models/entities/user.py +85 -0
  72. rem/py.typed +0 -0
  73. rem/schemas/README.md +507 -0
  74. rem/schemas/__init__.py +6 -0
  75. rem/schemas/agents/README.md +92 -0
  76. rem/schemas/agents/core/moment-builder.yaml +178 -0
  77. rem/schemas/agents/core/rem-query-agent.yaml +226 -0
  78. rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
  79. rem/schemas/agents/core/simple-assistant.yaml +19 -0
  80. rem/schemas/agents/core/user-profile-builder.yaml +163 -0
  81. rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
  82. rem/schemas/agents/examples/contract-extractor.yaml +134 -0
  83. rem/schemas/agents/examples/cv-parser.yaml +263 -0
  84. rem/schemas/agents/examples/hello-world.yaml +37 -0
  85. rem/schemas/agents/examples/query.yaml +54 -0
  86. rem/schemas/agents/examples/simple.yaml +21 -0
  87. rem/schemas/agents/examples/test.yaml +29 -0
  88. rem/schemas/agents/rem.yaml +128 -0
  89. rem/schemas/evaluators/hello-world/default.yaml +77 -0
  90. rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
  91. rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
  92. rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
  93. rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
  94. rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
  95. rem/services/__init__.py +16 -0
  96. rem/services/audio/INTEGRATION.md +308 -0
  97. rem/services/audio/README.md +376 -0
  98. rem/services/audio/__init__.py +15 -0
  99. rem/services/audio/chunker.py +354 -0
  100. rem/services/audio/transcriber.py +259 -0
  101. rem/services/content/README.md +1269 -0
  102. rem/services/content/__init__.py +5 -0
  103. rem/services/content/providers.py +806 -0
  104. rem/services/content/service.py +676 -0
  105. rem/services/dreaming/README.md +230 -0
  106. rem/services/dreaming/__init__.py +53 -0
  107. rem/services/dreaming/affinity_service.py +336 -0
  108. rem/services/dreaming/moment_service.py +264 -0
  109. rem/services/dreaming/ontology_service.py +54 -0
  110. rem/services/dreaming/user_model_service.py +297 -0
  111. rem/services/dreaming/utils.py +39 -0
  112. rem/services/embeddings/__init__.py +11 -0
  113. rem/services/embeddings/api.py +120 -0
  114. rem/services/embeddings/worker.py +421 -0
  115. rem/services/fs/README.md +662 -0
  116. rem/services/fs/__init__.py +62 -0
  117. rem/services/fs/examples.py +206 -0
  118. rem/services/fs/examples_paths.py +204 -0
  119. rem/services/fs/git_provider.py +935 -0
  120. rem/services/fs/local_provider.py +760 -0
  121. rem/services/fs/parsing-hooks-examples.md +172 -0
  122. rem/services/fs/paths.py +276 -0
  123. rem/services/fs/provider.py +460 -0
  124. rem/services/fs/s3_provider.py +1042 -0
  125. rem/services/fs/service.py +186 -0
  126. rem/services/git/README.md +1075 -0
  127. rem/services/git/__init__.py +17 -0
  128. rem/services/git/service.py +469 -0
  129. rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
  130. rem/services/phoenix/README.md +453 -0
  131. rem/services/phoenix/__init__.py +46 -0
  132. rem/services/phoenix/client.py +686 -0
  133. rem/services/phoenix/config.py +88 -0
  134. rem/services/phoenix/prompt_labels.py +477 -0
  135. rem/services/postgres/README.md +575 -0
  136. rem/services/postgres/__init__.py +23 -0
  137. rem/services/postgres/migration_service.py +427 -0
  138. rem/services/postgres/pydantic_to_sqlalchemy.py +232 -0
  139. rem/services/postgres/register_type.py +352 -0
  140. rem/services/postgres/repository.py +337 -0
  141. rem/services/postgres/schema_generator.py +379 -0
  142. rem/services/postgres/service.py +802 -0
  143. rem/services/postgres/sql_builder.py +354 -0
  144. rem/services/rem/README.md +304 -0
  145. rem/services/rem/__init__.py +23 -0
  146. rem/services/rem/exceptions.py +71 -0
  147. rem/services/rem/executor.py +293 -0
  148. rem/services/rem/parser.py +145 -0
  149. rem/services/rem/queries.py +196 -0
  150. rem/services/rem/query.py +371 -0
  151. rem/services/rem/service.py +527 -0
  152. rem/services/session/README.md +374 -0
  153. rem/services/session/__init__.py +6 -0
  154. rem/services/session/compression.py +360 -0
  155. rem/services/session/reload.py +77 -0
  156. rem/settings.py +1235 -0
  157. rem/sql/002_install_models.sql +1068 -0
  158. rem/sql/background_indexes.sql +42 -0
  159. rem/sql/install_models.sql +1038 -0
  160. rem/sql/migrations/001_install.sql +503 -0
  161. rem/sql/migrations/002_install_models.sql +1202 -0
  162. rem/utils/AGENTIC_CHUNKING.md +597 -0
  163. rem/utils/README.md +583 -0
  164. rem/utils/__init__.py +43 -0
  165. rem/utils/agentic_chunking.py +622 -0
  166. rem/utils/batch_ops.py +343 -0
  167. rem/utils/chunking.py +108 -0
  168. rem/utils/clip_embeddings.py +276 -0
  169. rem/utils/dict_utils.py +98 -0
  170. rem/utils/embeddings.py +423 -0
  171. rem/utils/examples/embeddings_example.py +305 -0
  172. rem/utils/examples/sql_types_example.py +202 -0
  173. rem/utils/markdown.py +16 -0
  174. rem/utils/model_helpers.py +236 -0
  175. rem/utils/schema_loader.py +336 -0
  176. rem/utils/sql_types.py +348 -0
  177. rem/utils/user_id.py +81 -0
  178. rem/utils/vision.py +330 -0
  179. rem/workers/README.md +506 -0
  180. rem/workers/__init__.py +5 -0
  181. rem/workers/dreaming.py +502 -0
  182. rem/workers/engram_processor.py +312 -0
  183. rem/workers/sqs_file_processor.py +193 -0
  184. remdb-0.3.0.dist-info/METADATA +1455 -0
  185. remdb-0.3.0.dist-info/RECORD +187 -0
  186. remdb-0.3.0.dist-info/WHEEL +4 -0
  187. remdb-0.3.0.dist-info/entry_points.txt +2 -0
rem/services/phoenix/EXPERIMENT_DESIGN.md (new file, +1146 lines):
# REM Experiment Design Guide

**Version**: 1.0
**Date**: 2025-11-21
**Status**: Production-Ready

A comprehensive guide to designing, executing, and iterating on LLM evaluation experiments for REM agents using Phoenix.

---

## Table of Contents

1. [Overview](#overview)
2. [Design Principles](#design-principles)
3. [Experiment Lifecycle](#experiment-lifecycle)
4. [Data Sources](#data-sources)
5. [Naming Conventions](#naming-conventions)
6. [Vibe-Eval Methodology](#vibe-eval-methodology)
7. [Phoenix Integration](#phoenix-integration)
8. [Re-Evaluation Patterns](#re-evaluation-patterns)
9. [Best Practices](#best-practices)
10. [Example Workflows](#example-workflows)

---

## Overview

REM's experiment framework combines **interactive testing** (Vibe-Eval) with **systematic tracking** (Phoenix) to build reliable agent evaluation pipelines.

### Key Concepts

**Engrams**: Generated datasets from REM's memory system (resources, entities, moments). These are synthetic but realistic test cases created by the dreaming worker.

**Ground Truth**: Reference answers from subject matter experts (SMEs), production data, or validated engrams. The agent NEVER sees ground truth during testing; it is the answer key for evaluation.

**Vibe-Eval**: An interactive test/fix cycle using CLI tools, for rapid iteration before committing to formal Phoenix experiments.

**Phoenix Experiments**: Automated evaluation runs tracked in Phoenix for systematic comparison over time.

### Three-Folder Structure

Every experiment follows a strict separation of concerns:

```
{EXPERIMENT-ID}/
├── inputs/              # What the agent CAN see
│   ├── specs/           # API specs, documentation
│   ├── engrams/         # Generated test data
│   └── context/         # Additional context files
│
├── outputs/             # Questions to test the agent
│   ├── questions.csv    # Test questions (created FROM ground truth)
│   └── questions.yaml   # Alternative format
│
└── validation/          # Ground truth (agent CANNOT see!)
    ├── golden-set/      # Reference answers
    ├── sme-examples/    # Expert-provided examples
    └── production/      # Real-world validated data
```

**Critical Rule**: The agent NEVER accesses the `validation/` folder. It must answer questions using only `inputs/`.

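This separation is easy to violate by accident, for example by globbing the whole experiment directory when assembling agent context. A minimal pre-flight check can enforce the contract before any agent run; the sketch below is hypothetical (not part of the REM CLI) and only assumes the folder names shown above.

```python
from pathlib import Path

REQUIRED = ("inputs", "outputs", "validation")

def check_experiment_layout(experiment_dir: str) -> list[str]:
    """Return a list of layout problems for an experiment folder (empty if OK).

    Hypothetical pre-flight check, not part of the REM CLI: it enforces the
    inputs/outputs/validation split described above before an agent run.
    """
    root = Path(experiment_dir)
    problems = [f"missing folder: {name}/" for name in REQUIRED if not (root / name).is_dir()]

    # Ground truth must never be reachable from the agent-visible inputs/.
    # A cheap heuristic: the same filename appearing in both trees is suspicious.
    if not problems:
        input_names = {p.name for p in (root / "inputs").rglob("*") if p.is_file()}
        leaked = {p.name for p in (root / "validation").rglob("*") if p.is_file()} & input_names
        problems += [f"possible leakage: {name} exists in both inputs/ and validation/"
                     for name in sorted(leaked)]
    return problems

if __name__ == "__main__":
    for issue in check_experiment_layout("experiments/rem-001"):
        print("WARNING:", issue)
```
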
---

## Design Principles

### 1. Ground Truth First

**Start with the answer key**, not the questions.

```
Bad:  "Let's test if the agent can map APIs" → Write random questions
Good: "Here's how APIs should be mapped"     → Test if the agent matches SME examples
```

**Sources of Ground Truth**:
- SME examples (e.g., Postman collections, expert mappings)
- Production data (validated historical queries)
- Curated engrams (generated data that passed manual review)

### 2. Separation of Concerns

**What the agent sees** and **what we judge against** must be distinct.

```
inputs/     → Agent reads these to answer questions
validation/ → We read these to judge agent answers
outputs/    → Questions derived FROM validation (not shown to the agent)
```

### 3. Iterative Refinement

**Vibe-Eval before Phoenix.**

1. Test the agent interactively (CLI tools)
2. Fix broken prompts, tools, or schemas
3. Iterate until stable
4. THEN track with Phoenix experiments

**Why**: This prevents wasting Phoenix runs on obviously broken agents.

### 4. Deterministic Naming

**Predictable artifact names** prevent Phoenix dataset proliferation.

```
Dataset:    {task}-{agent}-golden   (e.g., rem-lookup-ask_rem-golden)
Experiment: {task}-{agent}-v{index} (e.g., rem-lookup-ask_rem-v1)
Evaluator:  {agent}-{dimension}     (e.g., ask_rem-correctness)
```

### 5. Data-Driven Design

**Use data, not guesswork, to build better LLM-based agents.**

- Generate engrams from real REM usage patterns
- Extract failure modes from production traces
- Create test cases targeting specific weaknesses
- Track improvements with controlled experiments

---

## Experiment Lifecycle

### Stage 0: Problem Definition

**What are you trying to improve?**

```
Example:
- Problem: LOOKUP queries return wrong entity types
- Hypothesis: The agent confuses person vs project entities
- Goal: Improve type classification accuracy from 75% to 95%
```

**Define Success Metrics:**
```
Metric                 | Baseline | Target
-----------------------|----------|-------
Type correctness       | 75%      | 95%
Label completeness     | 60%      | 90%
Hallucination rate     | 15%      | < 5%
```

### Stage 1: Ground Truth Collection

**Gather reference answers.**

**Option A: SME Examples**
```bash
# Expert provides examples
mkdir -p experiments/rem-001/validation/sme-examples/
cp postman-collection.json experiments/rem-001/validation/sme-examples/
```

**Option B: Production Data**
```bash
# Export validated production queries
rem experiments trace list --project rem-production --days 30 --output prod-queries.csv
# Manual review and curation
cp curated-queries.csv experiments/rem-001/validation/production/
```

**Option C: Curated Engrams**
```bash
# Generate engrams from REM data
rem dreaming full --user-id test-user --tenant-id acme --generate-test-cases

# Review and select high-quality engrams
rem engram list --quality high --limit 100 --output engrams.csv
cp engrams.csv experiments/rem-001/validation/engrams/
```

### Stage 2: Test Question Design

**Create questions FROM ground truth.**

Read the `validation/` folder and extract test questions WITHOUT revealing the answers.

```csv
# outputs/questions.csv
input,reference
"LOOKUP person:sarah-chen","{""label"": ""sarah-chen"", ""type"": ""person"", ""properties"": {...}}"
"SEARCH for API design projects","{""entities"": [""api-design-v2"", ""rest-api-spec""], ""query_type"": ""semantic""}"
"TRAVERSE from sarah-chen to projects","{""paths"": [[""sarah-chen"", ""leads"", ""api-design-v2""]], ""depth"": 2}"
```

**Question Design Checklist:**
- [ ] Covers diverse difficulty levels (easy, medium, hard)
- [ ] Includes edge cases (ambiguous queries, missing data)
- [ ] Tests specific failure modes (identified from baseline)
- [ ] Reference answers are explicit and complete
- [ ] No leakage (questions don't reveal answers)

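For LOOKUP-style questions, `outputs/questions.csv` can be derived mechanically from the golden set so that the reference column is never hand-copied (and never ends up in `inputs/`). The sketch below is illustrative and assumes each file in `validation/golden-set/` is a JSON entity with `label` and `type` fields, as in the examples above.

```python
import csv
import json
from pathlib import Path

def build_questions(golden_dir: str, out_csv: str) -> None:
    """Hypothetical helper: derive LOOKUP test questions from golden-set JSON files.

    The agent only ever sees the 'input' column; the 'reference' column is the
    answer key used by evaluators and is never copied into inputs/.
    """
    rows = []
    for path in sorted(Path(golden_dir).glob("*.json")):
        entity = json.loads(path.read_text())
        question = f"LOOKUP {entity['type']}:{entity['label']}"
        rows.append({"input": question, "reference": json.dumps(entity)})

    with open(out_csv, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["input", "reference"])
        writer.writeheader()
        writer.writerows(rows)

if __name__ == "__main__":
    build_questions(
        "experiments/rem-001/validation/golden-set",
        "experiments/rem-001/outputs/questions.csv",
    )
```
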
### Stage 3: Vibe-Eval (Interactive Testing)

**Test the agent WITHOUT showing it ground truth.**

```bash
# Set up case context (the agent CAN see this)
export CASE_REF="rem-001"
rem process files experiments/$CASE_REF/inputs/specs/*.yaml --case-ref $CASE_REF

# Test the agent interactively
rem ask ask_rem "LOOKUP person:sarah-chen" --case-ref $CASE_REF

# Compare output to ground truth (YOU are the judge)
# - Does output match validation/golden-set/sarah-chen.json?
# - Are all fields present?
# - Any hallucinations?
```

**Fix and Iterate:**
```bash
# If the agent fails:
# 1. Check tool usage
cat .fs/cases/$CASE_REF/scratchpad/deltas/*.yaml

# 2. Fix the agent schema (prompt, tools, output format)
vim schemas/agents/ask-rem.yaml

# 3. Re-test
rem ask ask_rem "LOOKUP person:sarah-chen" --case-ref $CASE_REF

# 4. Repeat until stable
```

**Exit Criteria for Vibe-Eval:**
- Agent correctly answers 8/10 diverse test questions
- No obvious hallucinations
- Tool usage is appropriate
- Output format is consistent

### Stage 4: Phoenix Formalization

**Create Phoenix artifacts AFTER Vibe-Eval passes.**

```bash
# 1. Create golden dataset
rem experiments dataset create rem-lookup-ask_rem-golden \
  --from-csv experiments/rem-001/outputs/questions.csv \
  --input-keys input \
  --output-keys reference \
  --description "Golden set for LOOKUP query evaluation"

# 2. Create evaluator schema
# Edit schemas/evaluators/ask_rem-correctness.yaml

# 3. Run baseline experiment
rem experiments experiment run rem-lookup-ask_rem-golden \
  --experiment rem-lookup-ask_rem-v1 \
  --agent ask_rem \
  --evaluator ask_rem-correctness \
  --description "Baseline evaluation after Vibe-Eval"
```

### Stage 5: Iteration and Tracking

**Track improvements over time.**

```bash
# V1 baseline
rem experiments experiment run ... --experiment rem-lookup-ask_rem-v1

# V2 after prompt improvements
rem experiments experiment run ... --experiment rem-lookup-ask_rem-v2

# V3 after tool fixes
rem experiments experiment run ... --experiment rem-lookup-ask_rem-v3

# Compare in Phoenix UI
open http://localhost:6006
```

---

## Data Sources

### 1. SME Examples (Expert Knowledge)

**What**: Reference answers created by domain experts.

**Use Cases**:
- API mapper evaluation (Postman collections)
- CDA mapper evaluation (expert-provided mappings)
- Complex reasoning tasks (SME-validated outputs)

**Workflow**:
```bash
# SME provides examples
validation/sme-examples/postman-collection.json
validation/sme-examples/expert-mappings.yaml

# Extract test questions
# Question: "Show complete API request for POST /orders/create"
# Reference: <exact request from Postman>
```

**Pros**: High quality, domain-accurate
**Cons**: Manual effort, doesn't scale

### 2. Production Data (Real-World Validated)

**What**: Queries and responses from production that have been manually validated.

**Use Cases**:
- Regression testing (ensure new versions don't break existing functionality)
- Coverage testing (test against real user query patterns)
- Edge case discovery (find unusual queries users actually make)

**Workflow**:
```bash
# Export production traces
rem experiments trace list --project rem-production --days 30 --limit 1000 \
  --output prod-traces.csv

# Manual curation (validate correctness)
# Keep only queries with verified correct outputs

# Create test dataset
rem experiments dataset create rem-production-regression \
  --from-csv curated-prod-queries.csv \
  --input-keys query \
  --output-keys expected_output
```

**Pros**: Real-world coverage, actual user patterns
**Cons**: Requires validation, may contain errors

### 3. Engrams (Generated Test Data)

**What**: Synthetic datasets generated by REM's dreaming worker from the memory system.

**Unique to REM**: Engrams are created through multi-stage "dreaming":
- **Stage 1**: Entity extraction from resources
- **Stage 2**: Moment generation (temporal narratives)
- **Stage 3**: Affinity matching (semantic clustering)
- **Stage 4**: Multiple dreaming cycles (rich interconnections)

**Use Cases**:
- Scale testing (generate thousands of test cases)
- Diverse scenario coverage (different entity types, query patterns)
- Controlled difficulty (easy vs hard examples)
- Stress testing (edge cases, missing data)

**Engram Quality Levels**:
```
Level 0 (Raw):        Resources only, minimal structure
Level 1 (Entities):   Entities extracted, basic LOOKUP works
Level 2 (Moments):    Temporal narratives, time-based queries work
Level 3 (Affinities): Semantic clustering, SEARCH works well
Level 4 (Mature):     Multiple cycles, full query capabilities
```

**Workflow**:
```bash
# Generate engrams from REM data
rem dreaming full \
  --user-id test-user \
  --tenant-id acme \
  --generate-test-cases \
  --quality-level 3

# List available engrams
rem engram list \
  --quality high \
  --entity-type person,project \
  --limit 100

# Export to golden set
rem engram export rem-engrams-high-quality \
  --output engrams.csv \
  --format phoenix

# Create dataset
rem experiments dataset create rem-search-ask_rem-golden \
  --from-engrams engrams.csv \
  --input-keys query,context \
  --output-keys entities,relationships \
  --description "High-quality engrams for SEARCH evaluation"
```

**Pros**: Scalable, diverse, controllable difficulty
**Cons**: Synthetic (may not reflect real usage), requires curation

### 4. Hybrid Approach (Recommended)

**Combine all three sources** for comprehensive coverage:

```
Golden Set Composition:
├── 20% SME Examples    (high-quality, domain-accurate)
├── 30% Production Data (real-world patterns)
└── 50% Curated Engrams (scale, diversity, edge cases)
```

**Workflow**:
```bash
# 1. Collect SME examples
cp sme-examples/*.json validation/sme-examples/

# 2. Export production data
rem experiments trace list --project rem-prod --output prod.csv

# 3. Generate engrams
rem engram export rem-high-quality --output engrams.csv

# 4. Merge into a single golden set
python scripts/merge_golden_sets.py \
  --sme validation/sme-examples/ \
  --production prod.csv \
  --engrams engrams.csv \
  --output golden-set.csv

# 5. Create Phoenix dataset
rem experiments dataset create rem-comprehensive-golden \
  --from-csv golden-set.csv \
  --input-keys query,context \
  --output-keys reference
```

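The merge script referenced above (`scripts/merge_golden_sets.py`) is not included in this package, so its exact interface is unknown. The following is a hypothetical stand-in that samples the three sources at roughly the 20/30/50 composition shown earlier, assuming each source has already been normalized to CSV rows with `input` and `reference` columns.

```python
import csv
import random

def load_rows(path: str, source: str) -> list[dict]:
    """Read a normalized CSV with 'input' and 'reference' columns and tag its origin."""
    with open(path, newline="") as f:
        return [
            {"input": row["input"], "reference": row["reference"], "source": source}
            for row in csv.DictReader(f)
        ]

def merge_golden_sets(sme_csv: str, prod_csv: str, engram_csv: str,
                      out_csv: str, total: int = 100, seed: int = 7) -> None:
    """Hypothetical merge step: sample the three sources at a 20/30/50 ratio."""
    random.seed(seed)
    sources = [
        (load_rows(sme_csv, "sme"), 0.2),
        (load_rows(prod_csv, "production"), 0.3),
        (load_rows(engram_csv, "engram"), 0.5),
    ]
    merged = []
    for rows, weight in sources:
        k = min(len(rows), round(total * weight))
        merged.extend(random.sample(rows, k))

    with open(out_csv, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["input", "reference", "source"])
        writer.writeheader()
        writer.writerows(merged)

if __name__ == "__main__":
    # Paths are illustrative; each source must already be normalized to input/reference rows.
    merge_golden_sets("sme.csv", "prod.csv", "engrams.csv", "golden-set.csv")
```
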
---

## Naming Conventions

### Deterministic Naming Pattern

**Goal**: Prevent Phoenix dataset proliferation and enable traceability.

### Datasets

**Golden Sets (Ground Truth)**:
```
Pattern: {task}-{agent}-golden
Examples:
  - rem-lookup-ask_rem-golden
  - rem-search-ask_rem-golden
  - rem-traverse-ask_rem-golden
```

**Agent Results (Experiment Outputs)**:
```
Pattern: {task}-{agent}-results
Examples:
  - rem-lookup-ask_rem-results (auto-created by Phoenix)
```

**Engram Datasets**:
```
Pattern: rem-engrams-{quality}-{entity-type}
Examples:
  - rem-engrams-high-person
  - rem-engrams-medium-project
  - rem-engrams-mature-mixed
```

### Experiments

**Pattern**: `{task}-{agent}-v{index}`

```
Examples:
  - rem-lookup-ask_rem-v1 (baseline)
  - rem-lookup-ask_rem-v2 (after prompt improvements)
  - rem-lookup-ask_rem-v3 (after tool fixes)
```

**Metadata** (auto-stored):
```json
{
  "task": "rem-lookup",
  "agent": "ask_rem",
  "index": "v1",
  "model": "claude-sonnet-4-5",
  "dataset_id": "RGF0YXNldDo...",
  "timestamp": "2025-11-21T10:30:00Z",
  "hypothesis": "Baseline evaluation after Vibe-Eval"
}
```

### Evaluators

**Pattern**: `{agent}-{dimension}`

```
Examples:
  - ask_rem-correctness
  - ask_rem-completeness
  - ask_rem-faithfulness
  - ask_rem-retrieval-precision
```

### Labels

**Standard Labels**:
```
- rem (always added)
- golden-set (for curated datasets)
- experiment (for experiment runs)
- engram (for generated datasets)
- production (for production data)
- {task-name} (e.g., rem-lookup, rem-search)
```

**Usage**:
```bash
# Automatic labeling
rem experiments dataset create rem-lookup-ask_rem-golden ...
# Labels: rem, golden-set, rem-lookup (auto-applied)

# Custom labels
rem experiments dataset create ... --labels production,high-priority
```

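Because these names are purely mechanical, they can be generated rather than typed. A small helper like the one below (a hypothetical convenience, not part of the REM CLI) keeps the patterns above consistent across scripts:

```python
from dataclasses import dataclass

@dataclass
class ExperimentNames:
    """Builds the deterministic artifact names described above.

    Hypothetical helper; the naming patterns are the contract, not this class.
    """
    task: str   # e.g. "rem-lookup"
    agent: str  # e.g. "ask_rem"

    def golden_dataset(self) -> str:
        return f"{self.task}-{self.agent}-golden"

    def results_dataset(self) -> str:
        return f"{self.task}-{self.agent}-results"

    def experiment(self, index: int) -> str:
        return f"{self.task}-{self.agent}-v{index}"

    def evaluator(self, dimension: str) -> str:
        return f"{self.agent}-{dimension}"

names = ExperimentNames(task="rem-lookup", agent="ask_rem")
assert names.golden_dataset() == "rem-lookup-ask_rem-golden"
assert names.experiment(1) == "rem-lookup-ask_rem-v1"
assert names.evaluator("correctness") == "ask_rem-correctness"
```
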
---

## Vibe-Eval Methodology

**Interactive test/fix cycle** using CLI tools before formal Phoenix tracking.

### Phase 0: Set Up Case Context

**Parse documents to create the case structure.**

```bash
export CASE_REF="rem-001"

# Parse specs/docs (the agent CAN read these)
rem process files experiments/$CASE_REF/inputs/specs/*.yaml \
  --case-ref $CASE_REF \
  --wait

# Creates:
# .fs/cases/rem-001/
# ├── spec.yaml        # Original file
# ├── spec.yaml.md     # Parsed markdown (agent reads this)
# ├── spec.yaml.json   # Parse metadata
# └── scratchpad/      # Agent memory (created on first call)
```

### Phase 1: Interactive Testing

**Test the agent with questions WITHOUT showing it ground truth.**

```bash
# Question 1: LOOKUP query
rem ask ask_rem "LOOKUP person:sarah-chen" --case-ref $CASE_REF \
  --output experiments/$CASE_REF/agent-responses/q1.json

# Judge manually (compare to validation/golden-set/)
# - Does output match validation/golden-set/sarah-chen.json?
# - Are all fields present (label, type, properties)?
# - Any hallucinated information?
# - Tool usage appropriate?

# Question 2: SEARCH query
rem ask ask_rem "SEARCH for API design projects" --case-ref $CASE_REF \
  --output experiments/$CASE_REF/agent-responses/q2.json

# Judge manually
# - Are returned entities relevant?
# - Is ranking quality good?
# - Any missing expected entities?
```

### Phase 2: Failure Analysis

**When the agent fails, diagnose the root cause.**

```bash
# Check tool usage
cat .fs/cases/$CASE_REF/scratchpad/deltas/*.yaml

# Did the agent call tools?
# - If NO: Prompt unclear? Tool descriptions inadequate?
# - If YES: Are tool outputs correct? Is the agent interpreting results correctly?

# Check agent reasoning
cat experiments/$CASE_REF/agent-responses/q1.json

# Look for:
# - Hallucinations (making up entities that don't exist)
# - Missing fields (incomplete output)
# - Type confusion (wrong entity type)
# - Tool misuse (calling wrong tools or with wrong parameters)
```

### Phase 3: Fix and Iterate

**Fix the root cause and re-test.**

```bash
# Example: Fix prompt clarity
vim schemas/agents/ask-rem.yaml

# Add explicit instructions:
# "When answering LOOKUP queries:
#  1. ALWAYS call the ask_rem tool with the exact query
#  2. Return entity label, type, and ALL properties
#  3. NEVER invent entities not returned by the tool"

# Re-test
rem ask ask_rem "LOOKUP person:sarah-chen" --case-ref $CASE_REF

# Compare to ground truth again
# - Fixed? Continue to the next test
# - Still broken? Iterate again
```

### Phase 4: Coverage Testing

**Test diverse scenarios.**

```bash
# Test matrix
# Tests:
# ├── Easy queries   (exact matches, common patterns)
# ├── Medium queries (fuzzy matches, disambiguation)
# ├── Hard queries   (complex traversals, missing data)
# └── Edge cases     (empty results, malformed queries)

# Run through all test questions (one question per line)
while IFS= read -r question; do
  rem ask ask_rem "$question" --case-ref $CASE_REF
  # Judge each manually
done < experiments/$CASE_REF/outputs/questions.txt
```

### Phase 5: Exit Criteria

**When to move to Phoenix:**

- [ ] Agent answers 80%+ of test questions correctly
- [ ] No systematic hallucinations
- [ ] Tool usage is appropriate and consistent
- [ ] Output format is stable
- [ ] No obvious prompt issues
- [ ] Ready for automated tracking

**Document Findings**:
```markdown
# Vibe-Eval Summary (rem-001)

## Test Results
- Total questions: 25
- Correct: 21 (84%)
- Partial: 3 (12%)
- Wrong: 1 (4%)

## Key Findings
- ✅ LOOKUP queries work reliably
- ✅ Tool usage is appropriate
- ⚠️ TRAVERSE queries sometimes miss indirect paths
- ❌ Ambiguous entity names cause confusion

## Fixes Applied
1. Updated prompt to emphasize exact tool usage
2. Added examples for TRAVERSE queries
3. Improved entity disambiguation instructions

## Ready for Phoenix
Agent is stable enough for formal Phoenix tracking.
```

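The tallies in the summary above (correct/partial/wrong counts and the 84% rate) are easy to get wrong when counted by hand. A tiny script can compute them from a judgments file; this is an illustrative sketch that assumes you record one row per test with a `question` column and a `verdict` column of `correct`, `partial`, or `wrong`.

```python
import csv
from collections import Counter

def summarize_judgments(path: str) -> None:
    """Print the Vibe-Eval tally from a CSV of manual verdicts.

    Assumed (hypothetical) format: columns 'question' and 'verdict',
    with verdict in {correct, partial, wrong}.
    """
    with open(path, newline="") as f:
        verdicts = [row["verdict"].strip().lower() for row in csv.DictReader(f)]
    if not verdicts:
        print("No judgments recorded yet.")
        return

    counts = Counter(verdicts)
    total = len(verdicts)
    print(f"Total questions: {total}")
    for verdict in ("correct", "partial", "wrong"):
        n = counts.get(verdict, 0)
        print(f"- {verdict.capitalize()}: {n} ({n / total:.0%})")

if __name__ == "__main__":
    summarize_judgments("experiments/rem-001/vibe-eval-judgments.csv")
```
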
---

## Phoenix Integration

**After Vibe-Eval passes**, create Phoenix artifacts for systematic tracking.

### Step 1: Create Golden Dataset

```bash
rem experiments dataset create rem-lookup-ask_rem-golden \
  --from-csv experiments/rem-001/outputs/questions.csv \
  --input-keys input \
  --output-keys reference \
  --metadata-keys difficulty,query_type \
  --description "Golden set for LOOKUP queries (curated from SME + engrams)"
```

### Step 2: Create Evaluator Schema

Create `schemas/evaluators/ask_rem-correctness.yaml`:

```yaml
---
type: object
description: |
  Evaluate REM ask_rem agent responses for LOOKUP queries.

  You are an expert evaluator judging agent responses against ground truth.

  Scoring Rubric:
  - Correctness (0-1): Does output match expected entity?
  - Completeness (0-1): Are all required fields present?
  - Hallucination (0-1): Any invented information? (1 = none, 0 = severe)

  Pass threshold: Average score >= 0.75

properties:
  correctness:
    type: number
    minimum: 0.0
    maximum: 1.0
    description: |
      1.0: Entity label and type match exactly
      0.8: Minor label variation (e.g., "sarah-chen" vs "Sarah Chen")
      0.5: Correct type, wrong label
      0.2: Wrong type, partial info
      0.0: Completely wrong or missing

  completeness:
    type: number
    minimum: 0.0
    maximum: 1.0
    description: |
      1.0: All expected fields present (label, type, properties)
      0.7: Missing optional fields only
      0.5: Missing required fields
      0.0: Minimal information returned

  hallucination_score:
    type: number
    minimum: 0.0
    maximum: 1.0
    description: |
      1.0: No invented information
      0.8: Minor embellishments
      0.5: Some invented fields
      0.2: Significant hallucination
      0.0: Entirely made up

  pass:
    type: boolean
    description: True if average score >= 0.75

  explanation:
    type: string
    description: Detailed explanation of scoring

required:
  - correctness
  - completeness
  - hallucination_score
  - pass
  - explanation

json_schema_extra:
  evaluator_type: llm-as-judge
  provider_configs:
    - provider_name: openai
      model_name: gpt-4.1
  input_schema:
    query: string (the LOOKUP query)
  output_schema:
    label: string (entity label returned)
    type: string (entity type)
    properties: dict (entity properties)
  expected_schema:
    label: string (expected entity label)
    type: string (expected entity type)
    properties: dict (expected properties)
```

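The `pass` field is defined in terms of the three numeric scores. Written out as plain Python, the threshold rule in the rubric is just this (an illustration of the arithmetic, not the evaluator's actual implementation):

```python
def passes(correctness: float, completeness: float, hallucination_score: float,
           threshold: float = 0.75) -> bool:
    """Apply the rubric's pass rule: mean of the three scores >= 0.75."""
    average = (correctness + completeness + hallucination_score) / 3
    return average >= threshold

# Example: a near-perfect lookup with a minor label variation
print(passes(0.8, 1.0, 1.0))  # True  (average ~0.93)
print(passes(0.5, 0.5, 0.8))  # False (average 0.60)
```
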
### Step 3: Run Baseline Experiment

```bash
rem experiments experiment run rem-lookup-ask_rem-golden \
  --experiment rem-lookup-ask_rem-v1 \
  --agent ask_rem \
  --evaluator ask_rem-correctness \
  --model claude-sonnet-4-5 \
  --description "Baseline evaluation after Vibe-Eval (v1.0)"
```

### Step 4: View Results

```bash
# Open Phoenix UI
open http://localhost:6006

# Navigate to experiments
# Compare metrics:
# - Correctness:   0.87 (target: >= 0.85)
# - Completeness:  0.79 (target: >= 0.80)
# - Hallucination: 0.92 (target: >= 0.90)
# - Pass rate:     84% (21/25)
```

### Step 5: Iterate

```bash
# After improvements (v2)
rem experiments experiment run rem-lookup-ask_rem-golden \
  --experiment rem-lookup-ask_rem-v2 \
  --agent ask_rem \
  --evaluator ask_rem-correctness \
  --description "After prompt improvements"

# Compare v1 vs v2 in Phoenix UI
# - Correctness:  0.87 → 0.94 (+7%)
# - Completeness: 0.79 → 0.88 (+9%)
# - Pass rate:    84% → 92% (+8%)
```

---

## Re-Evaluation Patterns

**Run evaluators on existing agent outputs** without re-executing the agents.

### Use Case: Test a New Evaluator

**Scenario**: You created a new evaluator and want to test it on a previous experiment's outputs.

```bash
# Step 1: Export previous experiment results
rem experiments experiment export rem-lookup-ask_rem-v1 \
  --output /tmp/v1-results.csv

# Step 2: Run the new evaluator on the exported results
rem experiments experiment run \
  --from-results /tmp/v1-results.csv \
  --experiment rem-lookup-ask_rem-v1-reeval \
  --evaluator ask_rem-completeness-v2 \
  --description "Re-evaluate v1 with improved completeness evaluator"
```

### Use Case: Compare Evaluator Versions

**Scenario**: You improved an evaluator and want to compare scores on the same agent outputs.

```bash
# Baseline (old evaluator)
rem experiments experiment run rem-lookup-ask_rem-golden \
  --experiment rem-eval-comparison-v1 \
  --agent ask_rem \
  --evaluator ask_rem-correctness-v1

# Export results
rem experiments experiment export rem-eval-comparison-v1 \
  --output /tmp/agent-outputs.csv

# Re-evaluate with the new evaluator
rem experiments experiment run \
  --from-results /tmp/agent-outputs.csv \
  --experiment rem-eval-comparison-v2 \
  --evaluator ask_rem-correctness-v2

# Compare in Phoenix UI
# - v1 evaluator: 87% pass rate
# - v2 evaluator: 92% pass rate
# - Conclusion: the v2 evaluator is more lenient (or v1 was too strict)
```

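When two evaluator versions disagree, it helps to see which examples flipped rather than just the aggregate pass rates. A small comparison script can do this from the exported CSVs; the sketch below is hypothetical and assumes both exports share an `example_id` column and a boolean-ish `pass` column (the actual export columns may differ).

```python
import csv

def load_passes(path: str) -> dict[str, bool]:
    """Map example_id -> pass flag from an exported results CSV (assumed columns)."""
    with open(path, newline="") as f:
        return {
            row["example_id"]: row["pass"].strip().lower() in ("true", "1", "yes")
            for row in csv.DictReader(f)
        }

def compare_evaluators(old_csv: str, new_csv: str) -> None:
    """Report pass-rate deltas and per-example disagreements between two evaluator runs."""
    old, new = load_passes(old_csv), load_passes(new_csv)
    shared = sorted(old.keys() & new.keys())
    if not shared:
        print("No overlapping examples between the two exports.")
        return

    flipped = [ex for ex in shared if old[ex] != new[ex]]
    print(f"Shared examples: {len(shared)}")
    print(f"Old evaluator pass rate: {sum(old[ex] for ex in shared) / len(shared):.0%}")
    print(f"New evaluator pass rate: {sum(new[ex] for ex in shared) / len(shared):.0%}")
    print(f"Disagreements: {len(flipped)}")
    for ex in flipped:
        print(f"  {ex}: old={old[ex]} new={new[ex]}")

if __name__ == "__main__":
    # Paths are illustrative exports from the two evaluator runs above.
    compare_evaluators("/tmp/agent-outputs.csv", "/tmp/v2-reeval.csv")
```
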
### Use Case: Multi-Evaluator Analysis

**Scenario**: Run multiple evaluators on the same agent outputs to analyze different dimensions.

```bash
# Run the agent once
rem experiments experiment run rem-lookup-ask_rem-golden \
  --experiment rem-multi-eval-baseline \
  --agent ask_rem

# Export results
rem experiments experiment export rem-multi-eval-baseline \
  --output /tmp/baseline.csv

# Re-evaluate with different evaluators
rem experiments experiment run --from-results /tmp/baseline.csv \
  --experiment rem-correctness-eval \
  --evaluator ask_rem-correctness

rem experiments experiment run --from-results /tmp/baseline.csv \
  --experiment rem-completeness-eval \
  --evaluator ask_rem-completeness

rem experiments experiment run --from-results /tmp/baseline.csv \
  --experiment rem-faithfulness-eval \
  --evaluator ask_rem-faithfulness

# Compare dimension scores in Phoenix UI
```

---

## Best Practices

### Golden Set Quality

**Diversity**:
```
✅ Mix of easy, medium, and hard examples (30/50/20 split)
✅ Diverse entity types (person, project, document, etc.)
✅ Different query patterns (exact, fuzzy, semantic)
✅ Edge cases (empty results, ambiguous, malformed)

❌ All easy examples
❌ Single entity type
❌ Repetitive queries
❌ No edge cases
```

**Metadata**:
```csv
input,reference,difficulty,query_type,entity_type
"LOOKUP person:sarah-chen","...",easy,exact,person
"SEARCH API projects","...",medium,semantic,project
"TRAVERSE sarah-chen depth=3","...",hard,graph,mixed
```

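The `difficulty` column makes the 30/50/20 diversity target checkable rather than aspirational. A quick check might look like this (illustrative only; the column name matches the CSV above):

```python
import csv
from collections import Counter

TARGET = {"easy": 0.30, "medium": 0.50, "hard": 0.20}

def difficulty_mix(golden_csv: str) -> None:
    """Report the difficulty distribution of a golden set against the 30/50/20 target."""
    with open(golden_csv, newline="") as f:
        counts = Counter(row["difficulty"] for row in csv.DictReader(f))
    total = sum(counts.values())
    for level, target in TARGET.items():
        actual = counts.get(level, 0) / total if total else 0.0
        flag = "OK" if abs(actual - target) <= 0.10 else "off-target"
        print(f"{level:<7} {actual:.0%} (target {target:.0%}) {flag}")

if __name__ == "__main__":
    difficulty_mix("golden-set.csv")
```
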
**Versioning**:
```bash
# Version golden sets when making significant changes
rem experiments dataset create rem-lookup-golden-v1 ...  # Initial
rem experiments dataset create rem-lookup-golden-v2 ...  # Added edge cases
rem experiments dataset create rem-lookup-golden-v3 ...  # Production failures
```

### Evaluator Design

**Multi-Dimensional Scoring**:
```yaml
# Good: Multiple dimensions
properties:
  correctness: {type: number}
  completeness: {type: number}
  relevance: {type: number}
  hallucination_score: {type: number}

# Bad: Single score
properties:
  score: {type: number}
```

**Clear Rubrics**:
```yaml
# Good: Explicit scoring criteria
description: |
  1.0: Perfect match
  0.8: Minor variations
  0.5: Partially correct
  0.2: Mostly wrong
  0.0: Completely wrong

# Bad: Vague
description: "Score the output"
```

**Strict Grading**:
```yaml
# Good: Catches subtle issues
hallucination_score: 1.0 only if NO invented information

# Bad: Too lenient
hallucination_score: 0.8 if "mostly accurate"
```

### Experiment Metadata

**Track Important Context**:
```python
metadata = {
    "task": "rem-lookup",
    "agent": "ask_rem",
    "index": "v3",
    "model": "claude-sonnet-4-5",
    "prompt_version": "2025-11-21",
    "hypothesis": "Fixed entity type confusion",
    "baseline_score": 0.87,
    "target_score": 0.92,
    "changed_files": ["schemas/agents/ask-rem.yaml"],
}
```

### Progressive Testing

**Start Small, Scale Up**:
```
Phase 1: Vibe-Eval        (5-10 examples, interactive)
Phase 2: Phoenix Baseline (25 examples, full evaluators)
Phase 3: Comprehensive    (100+ examples, all dimensions)
Phase 4: Production       (1000+ examples, continuous)
```

---

## Example Workflows

### Workflow 1: Testing the LOOKUP Query Agent

**Goal**: Ensure LOOKUP queries return correct entities with complete information.

```bash
# 1. Collect ground truth
mkdir -p experiments/rem-lookup-001/validation/golden-set/
cp sme-examples/entities/*.json experiments/rem-lookup-001/validation/golden-set/

# 2. Create test questions
cat > experiments/rem-lookup-001/outputs/questions.csv <<EOF
input,reference
"LOOKUP person:sarah-chen","{""label"": ""sarah-chen"", ""type"": ""person""}"
"LOOKUP project:api-design-v2","{""label"": ""api-design-v2"", ""type"": ""project""}"
EOF

# 3. Vibe-Eval
export CASE_REF="rem-lookup-001"
rem ask ask_rem "LOOKUP person:sarah-chen" --case-ref $CASE_REF
# Judge: Does it match validation/golden-set/sarah-chen.json?

# 4. Phoenix
rem experiments dataset create rem-lookup-ask_rem-golden \
  --from-csv experiments/rem-lookup-001/outputs/questions.csv \
  --input-keys input --output-keys reference

rem experiments experiment run rem-lookup-ask_rem-golden \
  --experiment rem-lookup-ask_rem-v1 \
  --agent ask_rem \
  --evaluator ask_rem-correctness
```

### Workflow 2: Testing with Engrams

**Goal**: Scale up testing using generated engrams.

```bash
# 1. Generate high-quality engrams
rem dreaming full --tenant-id acme --generate-test-cases --quality-level 4

# 2. Export engrams
rem engram export rem-engrams-mature-mixed --output engrams.csv --format phoenix

# 3. Create dataset
rem experiments dataset create rem-search-ask_rem-golden \
  --from-engrams engrams.csv \
  --input-keys query,context \
  --output-keys entities,relationships

# 4. Run experiment
rem experiments experiment run rem-search-ask_rem-golden \
  --experiment rem-search-ask_rem-v1 \
  --agent ask_rem \
  --evaluator ask_rem-retrieval-precision,ask_rem-retrieval-recall
```

### Workflow 3: Re-Running After a Prompt Change

**Goal**: Test whether prompt improvements increased accuracy by re-running the agent on the same golden set and comparing against the baseline.

```bash
# 1. Baseline experiment (already run)
# rem experiments experiment run ... --experiment rem-v1

# 2. Export baseline results (for side-by-side comparison)
rem experiments experiment export rem-lookup-ask_rem-v1 --output /tmp/v1.csv

# 3. Update the prompt
vim schemas/agents/ask-rem.yaml

# 4. Test the new prompt via Vibe-Eval (spot check)
rem ask ask_rem "LOOKUP person:sarah-chen" --case-ref rem-test

# 5. Run the full experiment with the new prompt
rem experiments experiment run rem-lookup-ask_rem-golden \
  --experiment rem-lookup-ask_rem-v2 \
  --agent ask_rem \
  --evaluator ask_rem-correctness

# 6. Compare v1 vs v2 in the Phoenix UI
```

### Workflow 4: Hybrid Golden Set (SME + Engrams + Production)

**Goal**: Comprehensive evaluation combining all data sources.

```bash
# 1. Collect SME examples
cp sme-postman-collection.json validation/sme-examples/

# 2. Export production data
rem experiments trace list --project rem-prod --days 30 --output prod.csv

# 3. Generate engrams
rem engram export rem-high-quality --output engrams.csv

# 4. Merge sources
python scripts/merge_golden_sets.py \
  --sme validation/sme-examples/ \
  --production prod.csv \
  --engrams engrams.csv \
  --weights 0.2,0.3,0.5 \
  --output golden-hybrid.csv

# 5. Create Phoenix dataset
rem experiments dataset create rem-comprehensive-golden \
  --from-csv golden-hybrid.csv \
  --input-keys query,context \
  --output-keys reference \
  --metadata-keys source,difficulty

# 6. Run experiment
rem experiments experiment run rem-comprehensive-golden \
  --experiment rem-comprehensive-v1 \
  --agent ask_rem \
  --evaluator ask_rem-correctness,ask_rem-completeness,ask_rem-faithfulness
```

---

## Summary

REM's experiment design framework provides:

✅ **Clear methodology**: Vibe-Eval → Phoenix → Iteration
✅ **Multiple data sources**: SME + Production + Engrams
✅ **Deterministic naming**: Prevent Phoenix proliferation
✅ **Re-evaluation support**: Test new evaluators on old results
✅ **Data-driven design**: Use real patterns to build better agents
✅ **Systematic tracking**: Phoenix integration for long-term analysis

**Key Takeaways**:

1. **Ground truth first**: Start with the answer key, not questions
2. **Separation of concerns**: Agent NEVER sees validation folder
3. **Vibe-Eval before Phoenix**: Interactive testing catches issues early
4. **Use engrams for scale**: Generated data covers diverse scenarios
5. **Track everything**: Metadata enables comparison over time

**Next Steps**:

1. Define your first experiment (problem, metrics, hypothesis)
2. Collect ground truth (SME + production + engrams)
3. Run Vibe-Eval until stable
4. Formalize with Phoenix experiments
5. Iterate and track improvements

---

## Related Documentation

- [Phoenix README](./README.md) - Phoenix service overview
- [CLAUDE.md](../../../CLAUDE.md) - REM architecture
- [Evaluator Schemas](../../../schemas/evaluators/) - Pre-built evaluators
- [Dreaming Worker](../../workers/dreaming.py) - Engram generation