remdb 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rem/__init__.py +2 -0
- rem/agentic/README.md +650 -0
- rem/agentic/__init__.py +39 -0
- rem/agentic/agents/README.md +155 -0
- rem/agentic/agents/__init__.py +8 -0
- rem/agentic/context.py +148 -0
- rem/agentic/context_builder.py +329 -0
- rem/agentic/mcp/__init__.py +0 -0
- rem/agentic/mcp/tool_wrapper.py +107 -0
- rem/agentic/otel/__init__.py +5 -0
- rem/agentic/otel/setup.py +151 -0
- rem/agentic/providers/phoenix.py +674 -0
- rem/agentic/providers/pydantic_ai.py +572 -0
- rem/agentic/query.py +117 -0
- rem/agentic/query_helper.py +89 -0
- rem/agentic/schema.py +396 -0
- rem/agentic/serialization.py +245 -0
- rem/agentic/tools/__init__.py +5 -0
- rem/agentic/tools/rem_tools.py +231 -0
- rem/api/README.md +420 -0
- rem/api/main.py +324 -0
- rem/api/mcp_router/prompts.py +182 -0
- rem/api/mcp_router/resources.py +536 -0
- rem/api/mcp_router/server.py +213 -0
- rem/api/mcp_router/tools.py +584 -0
- rem/api/routers/auth.py +229 -0
- rem/api/routers/chat/__init__.py +5 -0
- rem/api/routers/chat/completions.py +281 -0
- rem/api/routers/chat/json_utils.py +76 -0
- rem/api/routers/chat/models.py +124 -0
- rem/api/routers/chat/streaming.py +185 -0
- rem/auth/README.md +258 -0
- rem/auth/__init__.py +26 -0
- rem/auth/middleware.py +100 -0
- rem/auth/providers/__init__.py +13 -0
- rem/auth/providers/base.py +376 -0
- rem/auth/providers/google.py +163 -0
- rem/auth/providers/microsoft.py +237 -0
- rem/cli/README.md +455 -0
- rem/cli/__init__.py +8 -0
- rem/cli/commands/README.md +126 -0
- rem/cli/commands/__init__.py +3 -0
- rem/cli/commands/ask.py +566 -0
- rem/cli/commands/configure.py +497 -0
- rem/cli/commands/db.py +493 -0
- rem/cli/commands/dreaming.py +324 -0
- rem/cli/commands/experiments.py +1302 -0
- rem/cli/commands/mcp.py +66 -0
- rem/cli/commands/process.py +245 -0
- rem/cli/commands/schema.py +183 -0
- rem/cli/commands/serve.py +106 -0
- rem/cli/dreaming.py +363 -0
- rem/cli/main.py +96 -0
- rem/config.py +237 -0
- rem/mcp_server.py +41 -0
- rem/models/core/__init__.py +49 -0
- rem/models/core/core_model.py +64 -0
- rem/models/core/engram.py +333 -0
- rem/models/core/experiment.py +628 -0
- rem/models/core/inline_edge.py +132 -0
- rem/models/core/rem_query.py +243 -0
- rem/models/entities/__init__.py +43 -0
- rem/models/entities/file.py +57 -0
- rem/models/entities/image_resource.py +88 -0
- rem/models/entities/message.py +35 -0
- rem/models/entities/moment.py +123 -0
- rem/models/entities/ontology.py +191 -0
- rem/models/entities/ontology_config.py +131 -0
- rem/models/entities/resource.py +95 -0
- rem/models/entities/schema.py +87 -0
- rem/models/entities/user.py +85 -0
- rem/py.typed +0 -0
- rem/schemas/README.md +507 -0
- rem/schemas/__init__.py +6 -0
- rem/schemas/agents/README.md +92 -0
- rem/schemas/agents/core/moment-builder.yaml +178 -0
- rem/schemas/agents/core/rem-query-agent.yaml +226 -0
- rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
- rem/schemas/agents/core/simple-assistant.yaml +19 -0
- rem/schemas/agents/core/user-profile-builder.yaml +163 -0
- rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
- rem/schemas/agents/examples/contract-extractor.yaml +134 -0
- rem/schemas/agents/examples/cv-parser.yaml +263 -0
- rem/schemas/agents/examples/hello-world.yaml +37 -0
- rem/schemas/agents/examples/query.yaml +54 -0
- rem/schemas/agents/examples/simple.yaml +21 -0
- rem/schemas/agents/examples/test.yaml +29 -0
- rem/schemas/agents/rem.yaml +128 -0
- rem/schemas/evaluators/hello-world/default.yaml +77 -0
- rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
- rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
- rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
- rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
- rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
- rem/services/__init__.py +16 -0
- rem/services/audio/INTEGRATION.md +308 -0
- rem/services/audio/README.md +376 -0
- rem/services/audio/__init__.py +15 -0
- rem/services/audio/chunker.py +354 -0
- rem/services/audio/transcriber.py +259 -0
- rem/services/content/README.md +1269 -0
- rem/services/content/__init__.py +5 -0
- rem/services/content/providers.py +801 -0
- rem/services/content/service.py +676 -0
- rem/services/dreaming/README.md +230 -0
- rem/services/dreaming/__init__.py +53 -0
- rem/services/dreaming/affinity_service.py +336 -0
- rem/services/dreaming/moment_service.py +264 -0
- rem/services/dreaming/ontology_service.py +54 -0
- rem/services/dreaming/user_model_service.py +297 -0
- rem/services/dreaming/utils.py +39 -0
- rem/services/embeddings/__init__.py +11 -0
- rem/services/embeddings/api.py +120 -0
- rem/services/embeddings/worker.py +421 -0
- rem/services/fs/README.md +662 -0
- rem/services/fs/__init__.py +62 -0
- rem/services/fs/examples.py +206 -0
- rem/services/fs/examples_paths.py +204 -0
- rem/services/fs/git_provider.py +935 -0
- rem/services/fs/local_provider.py +760 -0
- rem/services/fs/parsing-hooks-examples.md +172 -0
- rem/services/fs/paths.py +276 -0
- rem/services/fs/provider.py +460 -0
- rem/services/fs/s3_provider.py +1042 -0
- rem/services/fs/service.py +186 -0
- rem/services/git/README.md +1075 -0
- rem/services/git/__init__.py +17 -0
- rem/services/git/service.py +469 -0
- rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
- rem/services/phoenix/README.md +453 -0
- rem/services/phoenix/__init__.py +46 -0
- rem/services/phoenix/client.py +686 -0
- rem/services/phoenix/config.py +88 -0
- rem/services/phoenix/prompt_labels.py +477 -0
- rem/services/postgres/README.md +575 -0
- rem/services/postgres/__init__.py +23 -0
- rem/services/postgres/migration_service.py +427 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +232 -0
- rem/services/postgres/register_type.py +352 -0
- rem/services/postgres/repository.py +337 -0
- rem/services/postgres/schema_generator.py +379 -0
- rem/services/postgres/service.py +802 -0
- rem/services/postgres/sql_builder.py +354 -0
- rem/services/rem/README.md +304 -0
- rem/services/rem/__init__.py +23 -0
- rem/services/rem/exceptions.py +71 -0
- rem/services/rem/executor.py +293 -0
- rem/services/rem/parser.py +145 -0
- rem/services/rem/queries.py +196 -0
- rem/services/rem/query.py +371 -0
- rem/services/rem/service.py +527 -0
- rem/services/session/README.md +374 -0
- rem/services/session/__init__.py +6 -0
- rem/services/session/compression.py +360 -0
- rem/services/session/reload.py +77 -0
- rem/settings.py +1235 -0
- rem/sql/002_install_models.sql +1068 -0
- rem/sql/background_indexes.sql +42 -0
- rem/sql/install_models.sql +1038 -0
- rem/sql/migrations/001_install.sql +503 -0
- rem/sql/migrations/002_install_models.sql +1202 -0
- rem/utils/AGENTIC_CHUNKING.md +597 -0
- rem/utils/README.md +583 -0
- rem/utils/__init__.py +43 -0
- rem/utils/agentic_chunking.py +622 -0
- rem/utils/batch_ops.py +343 -0
- rem/utils/chunking.py +108 -0
- rem/utils/clip_embeddings.py +276 -0
- rem/utils/dict_utils.py +98 -0
- rem/utils/embeddings.py +423 -0
- rem/utils/examples/embeddings_example.py +305 -0
- rem/utils/examples/sql_types_example.py +202 -0
- rem/utils/markdown.py +16 -0
- rem/utils/model_helpers.py +236 -0
- rem/utils/schema_loader.py +336 -0
- rem/utils/sql_types.py +348 -0
- rem/utils/user_id.py +81 -0
- rem/utils/vision.py +330 -0
- rem/workers/README.md +506 -0
- rem/workers/__init__.py +5 -0
- rem/workers/dreaming.py +502 -0
- rem/workers/engram_processor.py +312 -0
- rem/workers/sqs_file_processor.py +193 -0
- remdb-0.3.7.dist-info/METADATA +1473 -0
- remdb-0.3.7.dist-info/RECORD +187 -0
- remdb-0.3.7.dist-info/WHEEL +4 -0
- remdb-0.3.7.dist-info/entry_points.txt +2 -0

rem/services/phoenix/EXPERIMENT_DESIGN.md

@@ -0,0 +1,1146 @@

# REM Experiment Design Guide

**Version**: 1.0
**Date**: 2025-11-21
**Status**: Production-Ready

A comprehensive guide to designing, executing, and iterating on LLM evaluation experiments for REM agents using Phoenix.

---

## Table of Contents

1. [Overview](#overview)
2. [Design Principles](#design-principles)
3. [Experiment Lifecycle](#experiment-lifecycle)
4. [Data Sources](#data-sources)
5. [Naming Conventions](#naming-conventions)
6. [Vibe-Eval Methodology](#vibe-eval-methodology)
7. [Phoenix Integration](#phoenix-integration)
8. [Re-Evaluation Patterns](#re-evaluation-patterns)
9. [Best Practices](#best-practices)
10. [Example Workflows](#example-workflows)

---

## Overview

REM's experiment framework combines **interactive testing** (Vibe-Eval) with **systematic tracking** (Phoenix) to build reliable agent evaluation pipelines.

### Key Concepts

**Engrams**: Generated datasets from REM's memory system (resources, entities, moments). These are synthetic but realistic test cases created by the dreaming worker.

**Ground Truth**: Reference answers from subject matter experts (SMEs), production data, or validated engrams. The agent NEVER sees ground truth during testing—it's the answer key for evaluation.

**Vibe-Eval**: Interactive test/fix cycle using CLI tools. Rapid iteration before committing to formal Phoenix experiments.

**Phoenix Experiments**: Automated evaluation runs tracked in Phoenix for systematic comparison over time.

### Three-Folder Structure

Every experiment follows a strict separation of concerns:

```
{EXPERIMENT-ID}/
├── inputs/                  # What agent CAN see
│   ├── specs/               # API specs, documentation
│   ├── engrams/             # Generated test data
│   └── context/             # Additional context files
│
├── outputs/                 # Questions to test agent
│   ├── questions.csv        # Test questions (created FROM ground truth)
│   └── questions.yaml       # Alternative format
│
└── validation/              # Ground truth (agent CANNOT see!)
    ├── golden-set/          # Reference answers
    ├── sme-examples/        # Expert-provided examples
    └── production/          # Real-world validated data
```

**Critical Rule**: The agent NEVER accesses the `validation/` folder. It must answer questions using only `inputs/`.
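
The convention above can also be enforced mechanically. The following is a minimal sketch (not part of remdb; paths and helper names are assumptions) that checks the three-folder layout and rejects any agent input path that points into `validation/`:

```python
# check_experiment_layout.py - illustrative sketch, not shipped with remdb.
# Verifies the three-folder convention and that no agent input leaks ground truth.
from pathlib import Path

REQUIRED = ("inputs", "outputs", "validation")

def check_layout(experiment_dir: str) -> list[str]:
    """Return a list of problems found in the experiment folder layout."""
    root = Path(experiment_dir)
    problems = [f"missing folder: {name}" for name in REQUIRED if not (root / name).is_dir()]
    if not (root / "outputs" / "questions.csv").exists():
        problems.append("outputs/questions.csv not found")
    return problems

def assert_agent_inputs_are_safe(input_paths: list[str]) -> None:
    """Reject any path that would expose the validation/ folder to the agent."""
    for p in input_paths:
        if "validation" in Path(p).parts:
            raise ValueError(f"agent input leaks ground truth: {p}")

if __name__ == "__main__":
    print(check_layout("experiments/rem-001"))
    assert_agent_inputs_are_safe(["experiments/rem-001/inputs/specs/spec.yaml"])
```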

---

## Design Principles

### 1. Ground Truth First

**Start with the answer key**, not the questions.

```
Bad:  "Let's test if the agent can map APIs" → Write random questions
Good: "Here's how APIs should be mapped"     → Test if agent matches SME examples
```

**Sources of Ground Truth**:
- SME examples (e.g., Postman collections, expert mappings)
- Production data (validated historical queries)
- Curated engrams (generated data that passed manual review)

### 2. Separation of Concerns

**What the agent sees** and **what we judge against** must be distinct.

```
inputs/     → Agent reads these to answer questions
validation/ → We read these to judge agent answers
outputs/    → Questions derived FROM validation (not shown to agent)
```

### 3. Iterative Refinement

**Vibe-Eval before Phoenix.**

1. Test agent interactively (CLI tools)
2. Fix broken prompts, tools, or schemas
3. Iterate until stable
4. THEN track with Phoenix experiments

**Why**: Prevents wasting Phoenix runs on obviously broken agents.

### 4. Deterministic Naming

**Predictable artifact names** prevent Phoenix dataset proliferation.

```
Dataset:    {task}-{agent}-golden   (e.g., rem-lookup-ask_rem-golden)
Experiment: {task}-{agent}-v{index} (e.g., rem-lookup-ask_rem-v1)
Evaluator:  {agent}-{dimension}     (e.g., ask_rem-correctness)
```
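
If you script experiment setup, the pattern is easy to centralize. A tiny illustrative helper (not a remdb API, just a sketch of the convention above):

```python
# naming.py - minimal sketch of the deterministic naming pattern above
# (illustrative only; remdb does not expose these helpers).

def dataset_name(task: str, agent: str) -> str:
    return f"{task}-{agent}-golden"          # e.g. rem-lookup-ask_rem-golden

def experiment_name(task: str, agent: str, index: int) -> str:
    return f"{task}-{agent}-v{index}"        # e.g. rem-lookup-ask_rem-v1

def evaluator_name(agent: str, dimension: str) -> str:
    return f"{agent}-{dimension}"            # e.g. ask_rem-correctness

assert experiment_name("rem-lookup", "ask_rem", 1) == "rem-lookup-ask_rem-v1"
```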

### 5. Data-Driven Design

**Use data, not guesses, to build better agents.**

- Generate engrams from real REM usage patterns
- Extract failure modes from production traces
- Create test cases targeting specific weaknesses
- Track improvements with controlled experiments

---

## Experiment Lifecycle

### Stage 0: Problem Definition

**What are you trying to improve?**

```
Example:
- Problem: LOOKUP queries return wrong entity types
- Hypothesis: Agent confuses person vs project entities
- Goal: Improve type classification accuracy from 75% to 95%
```

**Define Success Metrics:**
```
Metric               | Baseline | Target
---------------------|----------|-------
Type correctness     | 75%      | 95%
Label completeness   | 60%      | 90%
Hallucination rate   | 15%      | < 5%
```
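
When evaluator results come back as per-example scores, the table above can be filled in with a few lines of aggregation. A sketch only; the field names (`correctness`, `completeness`, `hallucination_score`, `pass`) follow the evaluator schema shown later and are assumptions here:

```python
# metrics_summary.py - illustrative sketch (not part of remdb): aggregate
# per-example evaluator scores into the baseline/target metrics above.
from statistics import mean

def summarize(rows: list[dict]) -> dict:
    """rows: one dict per example, e.g. {"correctness": 1.0, "completeness": 0.7, ...}."""
    return {
        "type_correctness": mean(r["correctness"] for r in rows),
        "label_completeness": mean(r["completeness"] for r in rows),
        "hallucination_rate": mean(1.0 - r["hallucination_score"] for r in rows),
        "pass_rate": mean(1.0 if r["pass"] else 0.0 for r in rows),
    }

if __name__ == "__main__":
    demo = [
        {"correctness": 1.0, "completeness": 0.7, "hallucination_score": 1.0, "pass": True},
        {"correctness": 0.5, "completeness": 0.5, "hallucination_score": 0.8, "pass": False},
    ]
    print(summarize(demo))
```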

### Stage 1: Ground Truth Collection

**Gather reference answers.**

**Option A: SME Examples**
```bash
# Expert provides examples
mkdir -p experiments/rem-001/validation/sme-examples/
cp postman-collection.json experiments/rem-001/validation/sme-examples/
```

**Option B: Production Data**
```bash
# Export validated production queries
rem experiments trace list --project rem-production --days 30 --output prod-queries.csv

# Manual review and curation
cp curated-queries.csv experiments/rem-001/validation/production/
```

**Option C: Curated Engrams**
```bash
# Generate engrams from REM data
rem dreaming full --user-id test-user --tenant-id acme --generate-test-cases

# Review and select high-quality engrams
rem engram list --quality high --limit 100 --output engrams.csv
cp engrams.csv experiments/rem-001/validation/engrams/
```

### Stage 2: Test Question Design

**Create questions FROM ground truth.**

Read the `validation/` folder and extract test questions WITHOUT revealing answers.

```csv
# outputs/questions.csv
input,reference
"LOOKUP person:sarah-chen","{""label"": ""sarah-chen"", ""type"": ""person"", ""properties"": {...}}"
"SEARCH for API design projects","{""entities"": [""api-design-v2"", ""rest-api-spec""], ""query_type"": ""semantic""}"
"TRAVERSE from sarah-chen to projects","{""paths"": [[""sarah-chen"", ""leads"", ""api-design-v2""]], ""depth"": 2}"
```
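
Deriving the questions file mechanically from the golden set keeps references in sync with validation data and avoids hand-editing mistakes. A minimal sketch, assuming one JSON file per entity under `validation/golden-set/` with `label` and `type` fields (the file layout and helper name are assumptions, not a remdb tool):

```python
# build_questions.py - illustrative sketch (not shipped with remdb): derive
# outputs/questions.csv from validation/golden-set/*.json without exposing
# the golden set to the agent.
import csv
import json
from pathlib import Path

def build_questions(experiment_dir: str) -> None:
    root = Path(experiment_dir)
    golden = sorted((root / "validation" / "golden-set").glob("*.json"))
    out_path = root / "outputs" / "questions.csv"
    out_path.parent.mkdir(parents=True, exist_ok=True)

    with out_path.open("w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["input", "reference"])
        for path in golden:
            entity = json.loads(path.read_text())
            # The question mentions only the key; the full record stays in `reference`.
            question = f'LOOKUP {entity["type"]}:{entity["label"]}'
            writer.writerow([question, json.dumps(entity)])

if __name__ == "__main__":
    build_questions("experiments/rem-001")
```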

**Question Design Checklist:**
- [ ] Covers diverse difficulty levels (easy, medium, hard)
- [ ] Includes edge cases (ambiguous queries, missing data)
- [ ] Tests specific failure modes (identified from baseline)
- [ ] Reference answers are explicit and complete
- [ ] No leakage (questions don't reveal answers)

### Stage 3: Vibe-Eval (Interactive Testing)

**Test agent WITHOUT showing ground truth.**

```bash
# Setup case context (agent CAN see this)
export CASE_REF="rem-001"
rem process files experiments/$CASE_REF/inputs/specs/*.yaml --case-ref $CASE_REF

# Test agent interactively
rem ask ask_rem "LOOKUP person:sarah-chen" --case-ref $CASE_REF

# Compare output to ground truth (YOU are the judge)
# - Does output match validation/golden-set/sarah-chen.json?
# - Are all fields present?
# - Any hallucinations?
```

**Fix and Iterate:**
```bash
# If agent fails:
# 1. Check tool usage
cat .fs/cases/$CASE_REF/scratchpad/deltas/*.yaml

# 2. Fix agent schema (prompt, tools, output format)
vim schemas/agents/ask-rem.yaml

# 3. Re-test
rem ask ask_rem "LOOKUP person:sarah-chen" --case-ref $CASE_REF

# 4. Repeat until stable
```

**Exit Criteria for Vibe-Eval:**
- Agent correctly answers 8/10 diverse test questions
- No obvious hallucinations
- Tool usage is appropriate
- Output format is consistent

### Stage 4: Phoenix Formalization

**Create Phoenix artifacts AFTER Vibe-Eval passes.**

```bash
# 1. Create golden dataset
rem experiments dataset create rem-lookup-ask_rem-golden \
  --from-csv experiments/rem-001/outputs/questions.csv \
  --input-keys input \
  --output-keys reference \
  --description "Golden set for LOOKUP query evaluation"

# 2. Create evaluator schema
# Edit schemas/evaluators/ask_rem-correctness.yaml

# 3. Run baseline experiment
rem experiments experiment run rem-lookup-ask_rem-golden \
  --experiment rem-lookup-ask_rem-v1 \
  --agent ask_rem \
  --evaluator ask_rem-correctness \
  --description "Baseline evaluation after Vibe-Eval"
```

### Stage 5: Iteration and Tracking

**Track improvements over time.**

```bash
# V1 baseline
rem experiments experiment run ... --experiment rem-lookup-ask_rem-v1

# V2 after prompt improvements
rem experiments experiment run ... --experiment rem-lookup-ask_rem-v2

# V3 after tool fixes
rem experiments experiment run ... --experiment rem-lookup-ask_rem-v3

# Compare in Phoenix UI
open http://localhost:6006
```

---

## Data Sources

### 1. SME Examples (Expert Knowledge)

**What**: Reference answers created by domain experts.

**Use Cases**:
- API mapper evaluation (Postman collections)
- CDA mapper evaluation (expert-provided mappings)
- Complex reasoning tasks (SME-validated outputs)

**Workflow**:
```bash
# SME provides examples
validation/sme-examples/postman-collection.json
validation/sme-examples/expert-mappings.yaml

# Extract test questions
# Question: "Show complete API request for POST /orders/create"
# Reference: <exact request from Postman>
```

**Pros**: High quality, domain-accurate
**Cons**: Manual effort, doesn't scale

### 2. Production Data (Real-World Validated)

**What**: Queries and responses from production that have been manually validated.

**Use Cases**:
- Regression testing (ensure new versions don't break existing functionality)
- Coverage testing (test against real user query patterns)
- Edge case discovery (find unusual queries users actually make)

**Workflow**:
```bash
# Export production traces
rem experiments trace list --project rem-production --days 30 --limit 1000 \
  --output prod-traces.csv

# Manual curation (validate correctness)
# Keep only queries with verified correct outputs

# Create test dataset
rem experiments dataset create rem-production-regression \
  --from-csv curated-prod-queries.csv \
  --input-keys query \
  --output-keys expected_output
```

**Pros**: Real-world coverage, user patterns
**Cons**: Requires validation, may contain errors

### 3. Engrams (Generated Test Data)

**What**: Synthetic datasets generated by REM's dreaming worker from the memory system.

**Unique to REM**: Engrams are created through multi-stage "dreaming":
- **Stage 1**: Entity extraction from resources
- **Stage 2**: Moment generation (temporal narratives)
- **Stage 3**: Affinity matching (semantic clustering)
- **Stage 4**: Multiple dreaming cycles (rich interconnections)

**Use Cases**:
- Scale testing (generate thousands of test cases)
- Diverse scenario coverage (different entity types, query patterns)
- Controlled difficulty (easy vs hard examples)
- Stress testing (edge cases, missing data)

**Engram Quality Levels**:
```
Level 0 (Raw):        Resources only, minimal structure
Level 1 (Entities):   Entities extracted, basic LOOKUP works
Level 2 (Moments):    Temporal narratives, time-based queries work
Level 3 (Affinities): Semantic clustering, SEARCH works well
Level 4 (Mature):     Multiple cycles, full query capabilities
```

**Workflow**:
```bash
# Generate engrams from REM data
rem dreaming full \
  --user-id test-user \
  --tenant-id acme \
  --generate-test-cases \
  --quality-level 3

# List available engrams
rem engram list \
  --quality high \
  --entity-type person,project \
  --limit 100

# Export to golden set
rem engram export rem-engrams-high-quality \
  --output engrams.csv \
  --format phoenix

# Create dataset
rem experiments dataset create rem-search-ask_rem-golden \
  --from-engrams engrams.csv \
  --input-keys query,context \
  --output-keys entities,relationships \
  --description "High-quality engrams for SEARCH evaluation"
```

**Pros**: Scalable, diverse, controllable difficulty
**Cons**: Synthetic (may not reflect real usage), requires curation

### 4. Hybrid Approach (Recommended)

**Combine all three sources** for comprehensive coverage:

```
Golden Set Composition:
├── 20% SME Examples     (high-quality, domain-accurate)
├── 30% Production Data  (real-world patterns)
└── 50% Curated Engrams  (scale, diversity, edge cases)
```

**Workflow**:
```bash
# 1. Collect SME examples
cp sme-examples/*.json validation/sme-examples/

# 2. Export production data
rem experiments trace list --project rem-prod --output prod.csv

# 3. Generate engrams
rem engram export rem-high-quality --output engrams.csv

# 4. Merge into single golden set (see the sketch after this block)
python scripts/merge_golden_sets.py \
  --sme validation/sme-examples/ \
  --production prod.csv \
  --engrams engrams.csv \
  --output golden-set.csv

# 5. Create Phoenix dataset
rem experiments dataset create rem-comprehensive-golden \
  --from-csv golden-set.csv \
  --input-keys query,context \
  --output-keys reference
```
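
`scripts/merge_golden_sets.py` is referenced above but not included in this package. A minimal sketch of what such a merge step might look like, assuming each source reduces to CSV rows with `input` and `reference` columns plus a `source` tag (the `--weights` subsampling used in Workflow 4 is omitted here):

```python
# merge_golden_sets.py - illustrative sketch only; the real script is referenced
# by the workflow above but is not part of this package.
import argparse
import csv

def read_csv_rows(path: str, source: str) -> list[dict]:
    """Load a CSV with input/reference columns and tag each row with its source."""
    with open(path, newline="") as f:
        return [{"input": r["input"], "reference": r["reference"], "source": source}
                for r in csv.DictReader(f)]

def main() -> None:
    parser = argparse.ArgumentParser(description="Merge golden-set sources into one CSV")
    parser.add_argument("--sme")          # directory or CSV of SME examples
    parser.add_argument("--production")   # curated production CSV
    parser.add_argument("--engrams")      # exported engrams CSV
    parser.add_argument("--output", required=True)
    args = parser.parse_args()

    rows: list[dict] = []
    if args.production:
        rows += read_csv_rows(args.production, "production")
    if args.engrams:
        rows += read_csv_rows(args.engrams, "engram")
    # SME examples often arrive as JSON files; that conversion is left out of this sketch.

    with open(args.output, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["input", "reference", "source"])
        writer.writeheader()
        writer.writerows(rows)
    print(f"wrote {len(rows)} rows to {args.output}")

if __name__ == "__main__":
    main()
```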

---

## Naming Conventions

### Deterministic Naming Pattern

**Goal**: Prevent Phoenix dataset proliferation, enable traceability.

### Datasets

**Golden Sets (Ground Truth)**:
```
Pattern: {task}-{agent}-golden
Examples:
- rem-lookup-ask_rem-golden
- rem-search-ask_rem-golden
- rem-traverse-ask_rem-golden
```

**Agent Results (Experiment Outputs)**:
```
Pattern: {task}-{agent}-results
Examples:
- rem-lookup-ask_rem-results (auto-created by Phoenix)
```

**Engram Datasets**:
```
Pattern: rem-engrams-{quality}-{entity-type}
Examples:
- rem-engrams-high-person
- rem-engrams-medium-project
- rem-engrams-mature-mixed
```

### Experiments

**Pattern**: `{task}-{agent}-v{index}`

```
Examples:
- rem-lookup-ask_rem-v1 (baseline)
- rem-lookup-ask_rem-v2 (after prompt improvements)
- rem-lookup-ask_rem-v3 (after tool fixes)
```

**Metadata** (auto-stored):
```json
{
  "task": "rem-lookup",
  "agent": "ask_rem",
  "index": "v1",
  "model": "claude-sonnet-4-5",
  "dataset_id": "RGF0YXNldDo...",
  "timestamp": "2025-11-21T10:30:00Z",
  "hypothesis": "Baseline evaluation after Vibe-Eval"
}
```

### Evaluators

**Pattern**: `{agent}-{dimension}`

```
Examples:
- ask_rem-correctness
- ask_rem-completeness
- ask_rem-faithfulness
- ask_rem-retrieval-precision
```

### Labels

**Standard Labels**:
```
- rem (always added)
- golden-set (for curated datasets)
- experiment (for experiment runs)
- engram (for generated datasets)
- production (for production data)
- {task-name} (e.g., rem-lookup, rem-search)
```

**Usage**:
```bash
# Automatic labeling
rem experiments dataset create rem-lookup-ask_rem-golden ...
# Labels: rem, golden-set, rem-lookup (auto-applied)

# Custom labels
rem experiments dataset create ... --labels production,high-priority
```

---

## Vibe-Eval Methodology

**Interactive test/fix cycle** using CLI tools before formal Phoenix tracking.

### Phase 0: Setup Case Context

**Parse documents to create case structure.**

```bash
export CASE_REF="rem-001"

# Parse specs/docs (agent CAN read these)
rem process files experiments/$CASE_REF/inputs/specs/*.yaml \
  --case-ref $CASE_REF \
  --wait

# Creates:
# .fs/cases/rem-001/
# ├── spec.yaml        # Original file
# ├── spec.yaml.md     # Parsed markdown (agent reads this)
# ├── spec.yaml.json   # Parse metadata
# └── scratchpad/      # Agent memory (created on first call)
```

### Phase 1: Interactive Testing

**Test agent with questions WITHOUT showing ground truth.**

```bash
# Question 1: LOOKUP query
rem ask ask_rem "LOOKUP person:sarah-chen" --case-ref $CASE_REF \
  --output experiments/$CASE_REF/agent-responses/q1.json

# Judge manually (compare to validation/golden-set/)
# - Does output match validation/sarah-chen.json?
# - Are all fields present (label, type, properties)?
# - Any hallucinated information?
# - Tool usage appropriate?

# Question 2: SEARCH query
rem ask ask_rem "SEARCH for API design projects" --case-ref $CASE_REF \
  --output experiments/$CASE_REF/agent-responses/q2.json

# Judge manually
# - Are returned entities relevant?
# - Is ranking quality good?
# - Any missing expected entities?
```

### Phase 2: Failure Analysis

**When the agent fails, diagnose the root cause.**

```bash
# Check tool usage
cat .fs/cases/$CASE_REF/scratchpad/deltas/*.yaml

# Did agent call tools?
# - If NO: Prompt unclear? Tool descriptions inadequate?
# - If YES: Are tool outputs correct? Is agent interpreting results correctly?

# Check agent reasoning
cat experiments/$CASE_REF/agent-responses/q1.json

# Look for:
# - Hallucinations (making up entities that don't exist)
# - Missing fields (incomplete output)
# - Type confusion (wrong entity type)
# - Tool misuse (calling wrong tools or with wrong parameters)
```

### Phase 3: Fix and Iterate

**Fix the root cause and re-test.**

```bash
# Example: Fix prompt clarity
vim schemas/agents/ask-rem.yaml

# Add explicit instructions:
# "When answering LOOKUP queries:
#  1. ALWAYS call ask_rem tool with exact query
#  2. Return entity label, type, and ALL properties
#  3. NEVER invent entities not returned by tool"

# Re-test
rem ask ask_rem "LOOKUP person:sarah-chen" --case-ref $CASE_REF

# Compare to ground truth again
# - Fixed? Continue to next test
# - Still broken? Iterate again
```

### Phase 4: Coverage Testing

**Test diverse scenarios.**

```bash
# Test matrix
# Tests:
# ├── Easy queries   (exact matches, common patterns)
# ├── Medium queries (fuzzy matches, disambiguation)
# ├── Hard queries   (complex traversals, missing data)
# └── Edge cases     (empty results, malformed queries)

# Run through all test questions (one per line)
while IFS= read -r question; do
  rem ask ask_rem "$question" --case-ref $CASE_REF
  # Judge each manually
done < experiments/$CASE_REF/outputs/questions.txt
```

### Phase 5: Exit Criteria

**When to move to Phoenix:**

- [ ] Agent answers 80%+ of test questions correctly
- [ ] No systematic hallucinations
- [ ] Tool usage is appropriate and consistent
- [ ] Output format is stable
- [ ] No obvious prompt issues
- [ ] Ready for automated tracking

**Document Findings**:
```markdown
# Vibe-Eval Summary (rem-001)

## Test Results
- Total questions: 25
- Correct: 21 (84%)
- Partial: 3 (12%)
- Wrong: 1 (4%)

## Key Findings
- ✅ LOOKUP queries work reliably
- ✅ Tool usage is appropriate
- ⚠️ TRAVERSE queries sometimes miss indirect paths
- ❌ Ambiguous entity names cause confusion

## Fixes Applied
1. Updated prompt to emphasize exact tool usage
2. Added examples for TRAVERSE queries
3. Improved entity disambiguation instructions

## Ready for Phoenix
Agent is stable enough for formal Phoenix tracking.
```

---

## Phoenix Integration

**After Vibe-Eval passes**, create Phoenix artifacts for systematic tracking.

### Step 1: Create Golden Dataset

```bash
rem experiments dataset create rem-lookup-ask_rem-golden \
  --from-csv experiments/rem-001/outputs/questions.csv \
  --input-keys input \
  --output-keys reference \
  --metadata-keys difficulty,query_type \
  --description "Golden set for LOOKUP queries (curated from SME + engrams)"
```

### Step 2: Create Evaluator Schema

Create `schemas/evaluators/ask_rem-correctness.yaml`:

```yaml
---
type: object
description: |
  Evaluate REM ask_rem agent responses for LOOKUP queries.

  You are an expert evaluator judging agent responses against ground truth.

  Scoring Rubric:
  - Correctness (0-1): Does output match expected entity?
  - Completeness (0-1): Are all required fields present?
  - Hallucination (0-1): Any invented information? (1 = none, 0 = severe)

  Pass threshold: Average score >= 0.75

properties:
  correctness:
    type: number
    minimum: 0.0
    maximum: 1.0
    description: |
      1.0: Entity label and type match exactly
      0.8: Minor label variation (e.g., "sarah-chen" vs "Sarah Chen")
      0.5: Correct type, wrong label
      0.2: Wrong type, partial info
      0.0: Completely wrong or missing

  completeness:
    type: number
    minimum: 0.0
    maximum: 1.0
    description: |
      1.0: All expected fields present (label, type, properties)
      0.7: Missing optional fields only
      0.5: Missing required fields
      0.0: Minimal information returned

  hallucination_score:
    type: number
    minimum: 0.0
    maximum: 1.0
    description: |
      1.0: No invented information
      0.8: Minor embellishments
      0.5: Some invented fields
      0.2: Significant hallucination
      0.0: Entirely made up

  pass:
    type: boolean
    description: True if average score >= 0.75

  explanation:
    type: string
    description: Detailed explanation of scoring

required:
  - correctness
  - completeness
  - hallucination_score
  - pass
  - explanation

json_schema_extra:
  evaluator_type: llm-as-judge
  provider_configs:
    - provider_name: openai
      model_name: gpt-4.1
  input_schema:
    query: string (the LOOKUP query)
  output_schema:
    label: string (entity label returned)
    type: string (entity type)
    properties: dict (entity properties)
  expected_schema:
    label: string (expected entity label)
    type: string (expected entity type)
    properties: dict (expected properties)
```
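
The `pass` field follows the rule stated in the rubric: the mean of the three dimension scores must reach 0.75. A short sketch of that aggregation, handy when post-processing evaluator output outside Phoenix (field names follow the schema above; the helper itself is illustrative):

```python
# pass_rule.py - sketch of the pass rule stated in the evaluator schema above:
# pass is true when the mean of the three dimension scores is at least 0.75.

def passes(correctness: float, completeness: float, hallucination_score: float,
           threshold: float = 0.75) -> bool:
    average = (correctness + completeness + hallucination_score) / 3
    return average >= threshold

# Example: 0.8, 0.7, 1.0 -> average 0.833 -> pass
assert passes(0.8, 0.7, 1.0) is True
assert passes(0.5, 0.5, 0.8) is False
```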

### Step 3: Run Baseline Experiment

```bash
rem experiments experiment run rem-lookup-ask_rem-golden \
  --experiment rem-lookup-ask_rem-v1 \
  --agent ask_rem \
  --evaluator ask_rem-correctness \
  --model claude-sonnet-4-5 \
  --description "Baseline evaluation after Vibe-Eval (v1.0)"
```

### Step 4: View Results

```bash
# Open Phoenix UI
open http://localhost:6006

# Navigate to experiments
# Compare metrics:
# - Correctness: 0.87 (target: >= 0.85)
# - Completeness: 0.79 (target: >= 0.80)
# - Hallucination: 0.92 (target: >= 0.90)
# - Pass rate: 84% (21/25)
```

### Step 5: Iterate

```bash
# After improvements (v2)
rem experiments experiment run rem-lookup-ask_rem-golden \
  --experiment rem-lookup-ask_rem-v2 \
  --agent ask_rem \
  --evaluator ask_rem-correctness \
  --description "After prompt improvements"

# Compare v1 vs v2 in Phoenix UI
# - Correctness: 0.87 → 0.94 (+7%)
# - Completeness: 0.79 → 0.88 (+9%)
# - Pass rate: 84% → 92% (+8%)
```

---

## Re-Evaluation Patterns

**Run evaluators on existing agent outputs** without re-executing agents.

### Use Case: Test New Evaluator

**Scenario**: You created a new evaluator and want to test it on previous experiment outputs.

```bash
# Step 1: Export previous experiment results
rem experiments experiment export rem-lookup-ask_rem-v1 \
  --output /tmp/v1-results.csv

# Step 2: Run new evaluator on exported results
rem experiments experiment run \
  --from-results /tmp/v1-results.csv \
  --experiment rem-lookup-ask_rem-v1-reeval \
  --evaluator ask_rem-completeness-v2 \
  --description "Re-evaluate v1 with improved completeness evaluator"
```

### Use Case: Compare Evaluator Versions

**Scenario**: You improved an evaluator and want to compare scores on the same agent outputs.

```bash
# Baseline (old evaluator)
rem experiments experiment run rem-lookup-ask_rem-golden \
  --experiment rem-eval-comparison-v1 \
  --agent ask_rem \
  --evaluator ask_rem-correctness-v1

# Export results
rem experiments experiment export rem-eval-comparison-v1 \
  --output /tmp/agent-outputs.csv

# Re-evaluate with new evaluator
rem experiments experiment run \
  --from-results /tmp/agent-outputs.csv \
  --experiment rem-eval-comparison-v2 \
  --evaluator ask_rem-correctness-v2

# Compare in Phoenix UI
# - v1 evaluator: 87% pass rate
# - v2 evaluator: 92% pass rate
# - Conclusion: v2 evaluator is more lenient (or v1 was too strict)
```

### Use Case: Multi-Evaluator Analysis

**Scenario**: Run multiple evaluators on the same agent outputs to analyze different dimensions.

```bash
# Run agent once
rem experiments experiment run rem-lookup-ask_rem-golden \
  --experiment rem-multi-eval-baseline \
  --agent ask_rem

# Export results
rem experiments experiment export rem-multi-eval-baseline \
  --output /tmp/baseline.csv

# Re-evaluate with different evaluators
rem experiments experiment run --from-results /tmp/baseline.csv \
  --experiment rem-correctness-eval \
  --evaluator ask_rem-correctness

rem experiments experiment run --from-results /tmp/baseline.csv \
  --experiment rem-completeness-eval \
  --evaluator ask_rem-completeness

rem experiments experiment run --from-results /tmp/baseline.csv \
  --experiment rem-faithfulness-eval \
  --evaluator ask_rem-faithfulness

# Compare dimension scores in Phoenix UI
```

---

## Best Practices

### Golden Set Quality

**Diversity**:
```
✅ Mix of easy, medium, hard examples (30/50/20 split)
✅ Diverse entity types (person, project, document, etc.)
✅ Different query patterns (exact, fuzzy, semantic)
✅ Edge cases (empty results, ambiguous, malformed)

❌ All easy examples
❌ Single entity type
❌ Repetitive queries
❌ No edge cases
```

**Metadata**:
```csv
input,reference,difficulty,query_type,entity_type
"LOOKUP person:sarah-chen","...",easy,exact,person
"SEARCH API projects","...",medium,semantic,project
"TRAVERSE sarah-chen depth=3","...",hard,graph,mixed
```
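
With a `difficulty` metadata column in place, the mix can be checked against the 30/50/20 easy/medium/hard guideline before a dataset is uploaded. A minimal sketch (not a remdb command; the column name is the one shown above):

```python
# golden_set_check.py - illustrative sketch (not part of remdb): report the
# difficulty mix of a golden-set CSV for comparison with the 30/50/20 guideline.
import csv
from collections import Counter

def difficulty_mix(path: str) -> dict[str, float]:
    with open(path, newline="") as f:
        counts = Counter(row["difficulty"] for row in csv.DictReader(f))
    total = sum(counts.values()) or 1
    return {level: round(count / total, 2) for level, count in counts.items()}

if __name__ == "__main__":
    # e.g. {'easy': 0.3, 'medium': 0.5, 'hard': 0.2}
    print(difficulty_mix("golden-set.csv"))
```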

**Versioning**:
```bash
# Version golden sets when making significant changes
rem experiments dataset create rem-lookup-golden-v1 ...  # Initial
rem experiments dataset create rem-lookup-golden-v2 ...  # Added edge cases
rem experiments dataset create rem-lookup-golden-v3 ...  # Production failures
```

### Evaluator Design

**Multi-Dimensional Scoring**:
```yaml
# Good: Multiple dimensions
properties:
  correctness: {type: number}
  completeness: {type: number}
  relevance: {type: number}
  hallucination_score: {type: number}

# Bad: Single score
properties:
  score: {type: number}
```

**Clear Rubrics**:
```yaml
# Good: Explicit scoring criteria
description: |
  1.0: Perfect match
  0.8: Minor variations
  0.5: Partially correct
  0.2: Mostly wrong
  0.0: Completely wrong

# Bad: Vague
description: "Score the output"
```

**Strict Grading**:
```yaml
# Good: Catches subtle issues
hallucination_score: 1.0 only if NO invented information

# Bad: Too lenient
hallucination_score: 0.8 if "mostly accurate"
```

### Experiment Metadata

**Track Important Context**:
```python
metadata = {
    "task": "rem-lookup",
    "agent": "ask_rem",
    "index": "v3",
    "model": "claude-sonnet-4-5",
    "prompt_version": "2025-11-21",
    "hypothesis": "Fixed entity type confusion",
    "baseline_score": 0.87,
    "target_score": 0.92,
    "changed_files": ["schemas/agents/ask-rem.yaml"],
}
```

### Progressive Testing

**Start Small, Scale Up**:
```
Phase 1: Vibe-Eval         (5-10 examples, interactive)
Phase 2: Phoenix Baseline  (25 examples, full evaluators)
Phase 3: Comprehensive     (100+ examples, all dimensions)
Phase 4: Production        (1000+ examples, continuous)
```

---

## Example Workflows

### Workflow 1: Testing LOOKUP Query Agent

**Goal**: Ensure LOOKUP queries return correct entities with complete information.

```bash
# 1. Collect ground truth
mkdir -p experiments/rem-lookup-001/validation/golden-set/
cp sme-examples/entities/*.json experiments/rem-lookup-001/validation/golden-set/

# 2. Create test questions
cat > experiments/rem-lookup-001/outputs/questions.csv <<EOF
input,reference
"LOOKUP person:sarah-chen","{""label"": ""sarah-chen"", ""type"": ""person""}"
"LOOKUP project:api-design-v2","{""label"": ""api-design-v2"", ""type"": ""project""}"
EOF

# 3. Vibe-Eval
export CASE_REF="rem-lookup-001"
rem ask ask_rem "LOOKUP person:sarah-chen" --case-ref $CASE_REF
# Judge: Does it match validation/golden-set/sarah-chen.json?

# 4. Phoenix
rem experiments dataset create rem-lookup-ask_rem-golden \
  --from-csv experiments/rem-lookup-001/outputs/questions.csv \
  --input-keys input --output-keys reference

rem experiments experiment run rem-lookup-ask_rem-golden \
  --experiment rem-lookup-ask_rem-v1 \
  --agent ask_rem \
  --evaluator ask_rem-correctness
```

### Workflow 2: Testing with Engrams

**Goal**: Scale testing using generated engrams.

```bash
# 1. Generate high-quality engrams
rem dreaming full --tenant-id acme --generate-test-cases --quality-level 4

# 2. Export engrams
rem engram export rem-engrams-mature-mixed --output engrams.csv --format phoenix

# 3. Create dataset
rem experiments dataset create rem-search-ask_rem-golden \
  --from-engrams engrams.csv \
  --input-keys query,context \
  --output-keys entities,relationships

# 4. Run experiment
rem experiments experiment run rem-search-ask_rem-golden \
  --experiment rem-search-ask_rem-v1 \
  --agent ask_rem \
  --evaluator ask_rem-retrieval-precision,ask_rem-retrieval-recall
```

### Workflow 3: Re-Evaluation After Prompt Change

**Goal**: Test whether prompt improvements increased accuracy without re-running the baseline agent.

```bash
# 1. Baseline experiment (already run)
# rem experiments experiment run ... --experiment rem-v1

# 2. Export baseline results
rem experiments experiment export rem-lookup-ask_rem-v1 --output /tmp/v1.csv

# 3. Update prompt
vim schemas/agents/ask-rem.yaml

# 4. Test new prompt via Vibe-Eval (spot check)
rem ask ask_rem "LOOKUP person:sarah-chen" --case-ref rem-test

# 5. Run full experiment with new prompt
rem experiments experiment run rem-lookup-ask_rem-golden \
  --experiment rem-lookup-ask_rem-v2 \
  --agent ask_rem \
  --evaluator ask_rem-correctness

# 6. Compare v1 vs v2 in Phoenix UI
```

### Workflow 4: Hybrid Golden Set (SME + Engrams + Production)

**Goal**: Comprehensive evaluation combining all data sources.

```bash
# 1. Collect SME examples
cp sme-postman-collection.json validation/sme-examples/

# 2. Export production data
rem experiments trace list --project rem-prod --days 30 --output prod.csv

# 3. Generate engrams
rem engram export rem-high-quality --output engrams.csv

# 4. Merge sources
python scripts/merge_golden_sets.py \
  --sme validation/sme-examples/ \
  --production prod.csv \
  --engrams engrams.csv \
  --weights 0.2,0.3,0.5 \
  --output golden-hybrid.csv

# 5. Create Phoenix dataset
rem experiments dataset create rem-comprehensive-golden \
  --from-csv golden-hybrid.csv \
  --input-keys query,context \
  --output-keys reference \
  --metadata-keys source,difficulty

# 6. Run experiment
rem experiments experiment run rem-comprehensive-golden \
  --experiment rem-comprehensive-v1 \
  --agent ask_rem \
  --evaluator ask_rem-correctness,ask_rem-completeness,ask_rem-faithfulness
```

---

## Summary

REM's experiment design framework provides:

✅ **Clear methodology**: Vibe-Eval → Phoenix → Iteration
✅ **Multiple data sources**: SME + Production + Engrams
✅ **Deterministic naming**: Prevent Phoenix proliferation
✅ **Re-evaluation support**: Test new evaluators on old results
✅ **Data-driven design**: Use real patterns to build better agents
✅ **Systematic tracking**: Phoenix integration for long-term analysis

**Key Takeaways**:

1. **Ground truth first**: Start with the answer key, not questions
2. **Separation of concerns**: Agent NEVER sees validation folder
3. **Vibe-Eval before Phoenix**: Interactive testing catches issues early
4. **Use engrams for scale**: Generated data covers diverse scenarios
5. **Track everything**: Metadata enables comparison over time

**Next Steps**:

1. Define your first experiment (problem, metrics, hypothesis)
2. Collect ground truth (SME + production + engrams)
3. Run Vibe-Eval until stable
4. Formalize with Phoenix experiments
5. Iterate and track improvements

---

## Related Documentation

- [Phoenix README](./README.md) - Phoenix service overview
- [CLAUDE.md](../../../CLAUDE.md) - REM architecture
- [Evaluator Schemas](../../../schemas/evaluators/) - Pre-built evaluators
- [Dreaming Worker](../../workers/dreaming.py) - Engram generation