remdb 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of remdb might be problematic. Click here for more details.
- rem/__init__.py +2 -0
- rem/agentic/README.md +650 -0
- rem/agentic/__init__.py +39 -0
- rem/agentic/agents/README.md +155 -0
- rem/agentic/agents/__init__.py +8 -0
- rem/agentic/context.py +148 -0
- rem/agentic/context_builder.py +329 -0
- rem/agentic/mcp/__init__.py +0 -0
- rem/agentic/mcp/tool_wrapper.py +107 -0
- rem/agentic/otel/__init__.py +5 -0
- rem/agentic/otel/setup.py +151 -0
- rem/agentic/providers/phoenix.py +674 -0
- rem/agentic/providers/pydantic_ai.py +572 -0
- rem/agentic/query.py +117 -0
- rem/agentic/query_helper.py +89 -0
- rem/agentic/schema.py +396 -0
- rem/agentic/serialization.py +245 -0
- rem/agentic/tools/__init__.py +5 -0
- rem/agentic/tools/rem_tools.py +231 -0
- rem/api/README.md +420 -0
- rem/api/main.py +324 -0
- rem/api/mcp_router/prompts.py +182 -0
- rem/api/mcp_router/resources.py +536 -0
- rem/api/mcp_router/server.py +213 -0
- rem/api/mcp_router/tools.py +584 -0
- rem/api/routers/auth.py +229 -0
- rem/api/routers/chat/__init__.py +5 -0
- rem/api/routers/chat/completions.py +281 -0
- rem/api/routers/chat/json_utils.py +76 -0
- rem/api/routers/chat/models.py +124 -0
- rem/api/routers/chat/streaming.py +185 -0
- rem/auth/README.md +258 -0
- rem/auth/__init__.py +26 -0
- rem/auth/middleware.py +100 -0
- rem/auth/providers/__init__.py +13 -0
- rem/auth/providers/base.py +376 -0
- rem/auth/providers/google.py +163 -0
- rem/auth/providers/microsoft.py +237 -0
- rem/cli/README.md +455 -0
- rem/cli/__init__.py +8 -0
- rem/cli/commands/README.md +126 -0
- rem/cli/commands/__init__.py +3 -0
- rem/cli/commands/ask.py +566 -0
- rem/cli/commands/configure.py +497 -0
- rem/cli/commands/db.py +493 -0
- rem/cli/commands/dreaming.py +324 -0
- rem/cli/commands/experiments.py +1302 -0
- rem/cli/commands/mcp.py +66 -0
- rem/cli/commands/process.py +245 -0
- rem/cli/commands/schema.py +183 -0
- rem/cli/commands/serve.py +106 -0
- rem/cli/dreaming.py +363 -0
- rem/cli/main.py +96 -0
- rem/config.py +237 -0
- rem/mcp_server.py +41 -0
- rem/models/core/__init__.py +49 -0
- rem/models/core/core_model.py +64 -0
- rem/models/core/engram.py +333 -0
- rem/models/core/experiment.py +628 -0
- rem/models/core/inline_edge.py +132 -0
- rem/models/core/rem_query.py +243 -0
- rem/models/entities/__init__.py +43 -0
- rem/models/entities/file.py +57 -0
- rem/models/entities/image_resource.py +88 -0
- rem/models/entities/message.py +35 -0
- rem/models/entities/moment.py +123 -0
- rem/models/entities/ontology.py +191 -0
- rem/models/entities/ontology_config.py +131 -0
- rem/models/entities/resource.py +95 -0
- rem/models/entities/schema.py +87 -0
- rem/models/entities/user.py +85 -0
- rem/py.typed +0 -0
- rem/schemas/README.md +507 -0
- rem/schemas/__init__.py +6 -0
- rem/schemas/agents/README.md +92 -0
- rem/schemas/agents/core/moment-builder.yaml +178 -0
- rem/schemas/agents/core/rem-query-agent.yaml +226 -0
- rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
- rem/schemas/agents/core/simple-assistant.yaml +19 -0
- rem/schemas/agents/core/user-profile-builder.yaml +163 -0
- rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
- rem/schemas/agents/examples/contract-extractor.yaml +134 -0
- rem/schemas/agents/examples/cv-parser.yaml +263 -0
- rem/schemas/agents/examples/hello-world.yaml +37 -0
- rem/schemas/agents/examples/query.yaml +54 -0
- rem/schemas/agents/examples/simple.yaml +21 -0
- rem/schemas/agents/examples/test.yaml +29 -0
- rem/schemas/agents/rem.yaml +128 -0
- rem/schemas/evaluators/hello-world/default.yaml +77 -0
- rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
- rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
- rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
- rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
- rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
- rem/services/__init__.py +16 -0
- rem/services/audio/INTEGRATION.md +308 -0
- rem/services/audio/README.md +376 -0
- rem/services/audio/__init__.py +15 -0
- rem/services/audio/chunker.py +354 -0
- rem/services/audio/transcriber.py +259 -0
- rem/services/content/README.md +1269 -0
- rem/services/content/__init__.py +5 -0
- rem/services/content/providers.py +806 -0
- rem/services/content/service.py +676 -0
- rem/services/dreaming/README.md +230 -0
- rem/services/dreaming/__init__.py +53 -0
- rem/services/dreaming/affinity_service.py +336 -0
- rem/services/dreaming/moment_service.py +264 -0
- rem/services/dreaming/ontology_service.py +54 -0
- rem/services/dreaming/user_model_service.py +297 -0
- rem/services/dreaming/utils.py +39 -0
- rem/services/embeddings/__init__.py +11 -0
- rem/services/embeddings/api.py +120 -0
- rem/services/embeddings/worker.py +421 -0
- rem/services/fs/README.md +662 -0
- rem/services/fs/__init__.py +62 -0
- rem/services/fs/examples.py +206 -0
- rem/services/fs/examples_paths.py +204 -0
- rem/services/fs/git_provider.py +935 -0
- rem/services/fs/local_provider.py +760 -0
- rem/services/fs/parsing-hooks-examples.md +172 -0
- rem/services/fs/paths.py +276 -0
- rem/services/fs/provider.py +460 -0
- rem/services/fs/s3_provider.py +1042 -0
- rem/services/fs/service.py +186 -0
- rem/services/git/README.md +1075 -0
- rem/services/git/__init__.py +17 -0
- rem/services/git/service.py +469 -0
- rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
- rem/services/phoenix/README.md +453 -0
- rem/services/phoenix/__init__.py +46 -0
- rem/services/phoenix/client.py +686 -0
- rem/services/phoenix/config.py +88 -0
- rem/services/phoenix/prompt_labels.py +477 -0
- rem/services/postgres/README.md +575 -0
- rem/services/postgres/__init__.py +23 -0
- rem/services/postgres/migration_service.py +427 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +232 -0
- rem/services/postgres/register_type.py +352 -0
- rem/services/postgres/repository.py +337 -0
- rem/services/postgres/schema_generator.py +379 -0
- rem/services/postgres/service.py +802 -0
- rem/services/postgres/sql_builder.py +354 -0
- rem/services/rem/README.md +304 -0
- rem/services/rem/__init__.py +23 -0
- rem/services/rem/exceptions.py +71 -0
- rem/services/rem/executor.py +293 -0
- rem/services/rem/parser.py +145 -0
- rem/services/rem/queries.py +196 -0
- rem/services/rem/query.py +371 -0
- rem/services/rem/service.py +527 -0
- rem/services/session/README.md +374 -0
- rem/services/session/__init__.py +6 -0
- rem/services/session/compression.py +360 -0
- rem/services/session/reload.py +77 -0
- rem/settings.py +1235 -0
- rem/sql/002_install_models.sql +1068 -0
- rem/sql/background_indexes.sql +42 -0
- rem/sql/install_models.sql +1038 -0
- rem/sql/migrations/001_install.sql +503 -0
- rem/sql/migrations/002_install_models.sql +1202 -0
- rem/utils/AGENTIC_CHUNKING.md +597 -0
- rem/utils/README.md +583 -0
- rem/utils/__init__.py +43 -0
- rem/utils/agentic_chunking.py +622 -0
- rem/utils/batch_ops.py +343 -0
- rem/utils/chunking.py +108 -0
- rem/utils/clip_embeddings.py +276 -0
- rem/utils/dict_utils.py +98 -0
- rem/utils/embeddings.py +423 -0
- rem/utils/examples/embeddings_example.py +305 -0
- rem/utils/examples/sql_types_example.py +202 -0
- rem/utils/markdown.py +16 -0
- rem/utils/model_helpers.py +236 -0
- rem/utils/schema_loader.py +336 -0
- rem/utils/sql_types.py +348 -0
- rem/utils/user_id.py +81 -0
- rem/utils/vision.py +330 -0
- rem/workers/README.md +506 -0
- rem/workers/__init__.py +5 -0
- rem/workers/dreaming.py +502 -0
- rem/workers/engram_processor.py +312 -0
- rem/workers/sqs_file_processor.py +193 -0
- remdb-0.3.0.dist-info/METADATA +1455 -0
- remdb-0.3.0.dist-info/RECORD +187 -0
- remdb-0.3.0.dist-info/WHEEL +4 -0
- remdb-0.3.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,453 @@
|
|
|
1
|
+
# Phoenix Evaluation Framework for REM
|
|
2
|
+
|
|
3
|
+
Lean, two-phase evaluation system for REM agents using Arize Phoenix.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Quick Start
|
|
8
|
+
|
|
9
|
+
### Prerequisites
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
# Port-forward Phoenix (if on Kubernetes)
|
|
13
|
+
kubectl port-forward -n observability svc/phoenix-svc 6006:6006
|
|
14
|
+
|
|
15
|
+
# Set API key
|
|
16
|
+
export PHOENIX_API_KEY=<your-api-key>
|
|
17
|
+
|
|
18
|
+
# Verify connection
|
|
19
|
+
rem experiments dataset list
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
### Two-Phase Workflow
|
|
23
|
+
|
|
24
|
+
**Phase 1: SME Creates Golden Set**
|
|
25
|
+
```bash
|
|
26
|
+
rem experiments dataset create rem-lookup-golden \
|
|
27
|
+
--from-csv golden.csv \
|
|
28
|
+
--input-keys query \
|
|
29
|
+
--output-keys expected_label,expected_type \
|
|
30
|
+
--metadata-keys difficulty,query_type
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
**Phase 2: Run Evaluation**
|
|
34
|
+
```bash
|
|
35
|
+
rem experiments run rem-lookup-golden \
|
|
36
|
+
--agent ask_rem \
|
|
37
|
+
--evaluator rem-lookup-correctness
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
**View Results**
|
|
41
|
+
```bash
|
|
42
|
+
open http://localhost:6006
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
## Architecture
|
|
48
|
+
|
|
49
|
+
### Two-Phase Evaluation Pattern
|
|
50
|
+
|
|
51
|
+
```
|
|
52
|
+
Phase 1: SME Golden Set Creation
|
|
53
|
+
├─ SMEs create (input, reference) pairs
|
|
54
|
+
├─ No agent execution required
|
|
55
|
+
└─ Stored in Phoenix for reuse
|
|
56
|
+
|
|
57
|
+
Phase 2: Automated Evaluation
|
|
58
|
+
├─ Run agents on golden sets → outputs
|
|
59
|
+
├─ Run evaluators → scores
|
|
60
|
+
└─ Track results in Phoenix
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
**Why Two Phases?**
|
|
64
|
+
- SMEs focus on domain knowledge (what's correct)
|
|
65
|
+
- Automation handles systematic testing (how well agents perform)
|
|
66
|
+
- Enables regression testing as agents evolve
|
|
67
|
+
|
|
68
|
+
### Components
|
|
69
|
+
|
|
70
|
+
```
|
|
71
|
+
Services (rem/src/rem/services/phoenix/)
|
|
72
|
+
├─ client.py - PhoenixClient for datasets/experiments
|
|
73
|
+
└─ config.py - Connection configuration
|
|
74
|
+
|
|
75
|
+
Providers (rem/src/rem/agentic/providers/phoenix.py)
|
|
76
|
+
├─ Evaluator factory (mirrors Pydantic AI pattern)
|
|
77
|
+
└─ Schema-based LLM-as-a-Judge evaluators
|
|
78
|
+
|
|
79
|
+
Evaluator Schemas (rem/schemas/evaluators/)
|
|
80
|
+
├─ Agent Evaluators (end-to-end)
|
|
81
|
+
│ ├─ rem-lookup-correctness.yaml
|
|
82
|
+
│ └─ rem-search-correctness.yaml
|
|
83
|
+
└─ RAG Evaluators (component-level)
|
|
84
|
+
├─ rem-retrieval-precision.yaml (RAGAS-inspired)
|
|
85
|
+
├─ rem-retrieval-recall.yaml (RAGAS-inspired)
|
|
86
|
+
└─ rem-faithfulness.yaml (RAGAS-inspired)
|
|
87
|
+
|
|
88
|
+
CLI Commands (rem/src/rem/cli/commands/experiments.py)
|
|
89
|
+
├─ rem experiments dataset list/create/add
|
|
90
|
+
├─ rem experiments run
|
|
91
|
+
├─ rem experiments prompt list/create
|
|
92
|
+
└─ rem experiments trace list
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
---
|
|
96
|
+
|
|
97
|
+
## Evaluator Types
|
|
98
|
+
|
|
99
|
+
### Agent Evaluators (End-to-End)
|
|
100
|
+
|
|
101
|
+
Evaluate complete agent output quality.
|
|
102
|
+
|
|
103
|
+
**rem-lookup-correctness.yaml**
|
|
104
|
+
- Dimensions: Correctness, Completeness, Performance Contract
|
|
105
|
+
- Pass threshold: >= 0.75
|
|
106
|
+
- Use for: LOOKUP query evaluation
|
|
107
|
+
|
|
108
|
+
**rem-search-correctness.yaml**
|
|
109
|
+
- Dimensions: Relevance, Completeness, Ranking Quality
|
|
110
|
+
- Pass threshold: >= 0.70
|
|
111
|
+
- Use for: SEARCH query evaluation
|
|
112
|
+
|
|
113
|
+
### RAG Evaluators (Component-Level)
|
|
114
|
+
|
|
115
|
+
Evaluate the retrieval layer independently (uses RAGAS concepts without requiring the RAGAS library as a dependency).
|
|
116
|
+
|
|
117
|
+
**rem-retrieval-precision.yaml**
|
|
118
|
+
- Measures: Relevant entities / Total retrieved entities
|
|
119
|
+
- Evaluates ranking quality (are relevant items ranked high?)
|
|
120
|
+
- Inspired by RAGAS context_precision
|
|
121
|
+
|
|
122
|
+
**rem-retrieval-recall.yaml**
|
|
123
|
+
- Measures: Retrieved expected / Total expected entities
|
|
124
|
+
- Evaluates coverage (did we get all expected entities?)
|
|
125
|
+
- Inspired by RAGAS context_recall
|
|
126
|
+
|
|
127
|
+
**rem-faithfulness.yaml**
|
|
128
|
+
- Measures: Supported claims / Total claims in answer
|
|
129
|
+
- Detects hallucinations (agent making up info not in context)
|
|
130
|
+
- Inspired by RAGAS faithfulness
|
|
131
|
+
|
|
132
|
+
**Usage:**
|
|
133
|
+
```bash
|
|
134
|
+
# Evaluate retrieval quality
|
|
135
|
+
rem experiments run rem-search-golden \
|
|
136
|
+
--agent ask_rem \
|
|
137
|
+
--evaluator rem-retrieval-precision,rem-retrieval-recall
|
|
138
|
+
|
|
139
|
+
# Evaluate faithfulness
|
|
140
|
+
rem experiments run rem-lookup-golden \
|
|
141
|
+
--agent ask_rem \
|
|
142
|
+
--evaluator rem-faithfulness
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
---
|
|
146
|
+
|
|
147
|
+
## How Agents with Tools Work
|
|
148
|
+
|
|
149
|
+
**Phoenix doesn't "run" agents** - you provide task functions.
|
|
150
|
+
|
|
151
|
+
### Task Function Pattern
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
# You write this
|
|
155
|
+
async def ask_rem_task(example: dict) -> dict:
|
|
156
|
+
"""Task function that Phoenix calls for each example."""
|
|
157
|
+
query = example["input"]["query"]
|
|
158
|
+
|
|
159
|
+
# Create agent with MCP tools
|
|
160
|
+
agent = Agent(
|
|
161
|
+
model="claude-sonnet-4-5",
|
|
162
|
+
tools=[ask_rem, search_entities, lookup_entity] # Your MCP tools
|
|
163
|
+
)
|
|
164
|
+
|
|
165
|
+
# Run agent (tools get called)
|
|
166
|
+
result = await agent.run(query)
|
|
167
|
+
|
|
168
|
+
# Return output (Phoenix stores this)
|
|
169
|
+
return result.data.model_dump()
|
|
170
|
+
|
|
171
|
+
# Phoenix orchestrates
|
|
172
|
+
experiment = client.run_experiment(
|
|
173
|
+
dataset="rem-lookup-golden",
|
|
174
|
+
task=ask_rem_task, # Phoenix calls this for each example
|
|
175
|
+
evaluators=[correctness_evaluator]
|
|
176
|
+
)
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
### What Phoenix Does
|
|
180
|
+
|
|
181
|
+
1. **Orchestrates**: Calls your task function for each dataset example
|
|
182
|
+
2. **Observes**: Captures OTEL traces (agent execution + tool calls)
|
|
183
|
+
3. **Evaluates**: Runs evaluators on (input, output, expected)
|
|
184
|
+
4. **Tracks**: Stores results and scores in UI
|
|
185
|
+
|
|
186
|
+
### MCP Tools Configuration
|
|
187
|
+
|
|
188
|
+
Tools are specified in agent schemas:
|
|
189
|
+
|
|
190
|
+
```yaml
|
|
191
|
+
# rem/schemas/agents/ask-rem.yaml
|
|
192
|
+
json_schema_extra:
|
|
193
|
+
tools:
|
|
194
|
+
- name: ask_rem
|
|
195
|
+
mcp_server: rem
|
|
196
|
+
usage: "Execute REM queries (LOOKUP, SEARCH, TRAVERSE, SQL)"
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
When the agent is created, `create_pydantic_ai_agent()`:
|
|
200
|
+
1. Reads agent schema
|
|
201
|
+
2. Loads MCP tools from `json_schema_extra.tools`
|
|
202
|
+
3. Connects to MCP server (FastMCP at `/api/v1/mcp`)
|
|
203
|
+
4. Registers tools with agent
|
|
204
|
+
|
|
205
|
+
### OTEL Traces
|
|
206
|
+
|
|
207
|
+
If instrumentation enabled (`settings.otel.enabled`):
|
|
208
|
+
|
|
209
|
+
```
|
|
210
|
+
Trace: experiment-run
|
|
211
|
+
├─ Span: agent_run (parent)
|
|
212
|
+
│ ├─ input: "LOOKUP person:sarah-chen"
|
|
213
|
+
│ └─ output: {"answer": "...", "entities": [...]}
|
|
214
|
+
├─ Span: tool_call.ask_rem (child)
|
|
215
|
+
│ ├─ input: {"query": "LOOKUP person:sarah-chen"}
|
|
216
|
+
│ └─ output: {"entities": [...]}
|
|
217
|
+
└─ Span: evaluation.correctness (sibling)
|
|
218
|
+
├─ scores: {"correctness": 0.95, "completeness": 0.88}
|
|
219
|
+
└─ pass: true
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
Phoenix receives these spans and displays them in the UI.
|
|
223
|
+
|
|
224
|
+
---
|
|
225
|
+
|
|
226
|
+
## CLI Reference
|
|
227
|
+
|
|
228
|
+
### Dataset Commands
|
|
229
|
+
|
|
230
|
+
```bash
|
|
231
|
+
# List all datasets
|
|
232
|
+
rem experiments dataset list
|
|
233
|
+
|
|
234
|
+
# Create from CSV
|
|
235
|
+
rem experiments dataset create <name> \
|
|
236
|
+
--from-csv golden.csv \
|
|
237
|
+
--input-keys query \
|
|
238
|
+
--output-keys expected_label,expected_type \
|
|
239
|
+
--metadata-keys difficulty,query_type
|
|
240
|
+
|
|
241
|
+
# Add examples to existing dataset
|
|
242
|
+
rem experiments dataset add <name> \
|
|
243
|
+
--from-csv new-examples.csv \
|
|
244
|
+
--input-keys query \
|
|
245
|
+
--output-keys expected_label,expected_type
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
### Experiment Commands
|
|
249
|
+
|
|
250
|
+
```bash
|
|
251
|
+
# Run agent only
|
|
252
|
+
rem experiments run <dataset> \
|
|
253
|
+
--experiment <name> \
|
|
254
|
+
--agent ask_rem
|
|
255
|
+
|
|
256
|
+
# Run evaluator only
|
|
257
|
+
rem experiments run <dataset> \
|
|
258
|
+
--experiment <name> \
|
|
259
|
+
--evaluator rem-lookup-correctness
|
|
260
|
+
|
|
261
|
+
# Run agent + evaluators
|
|
262
|
+
rem experiments run <dataset> \
|
|
263
|
+
--experiment <name> \
|
|
264
|
+
--agent ask_rem \
|
|
265
|
+
--evaluator rem-lookup-correctness,rem-faithfulness
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
### Trace Commands
|
|
269
|
+
|
|
270
|
+
```bash
|
|
271
|
+
# List recent traces
|
|
272
|
+
rem experiments trace list --project rem-agents --days 7 --limit 50
|
|
273
|
+
```
|
|
274
|
+
|
|
275
|
+
---
|
|
276
|
+
|
|
277
|
+
## API Reference
|
|
278
|
+
|
|
279
|
+
### PhoenixClient
|
|
280
|
+
|
|
281
|
+
```python
|
|
282
|
+
from rem.services.phoenix import PhoenixClient
|
|
283
|
+
|
|
284
|
+
client = PhoenixClient()
|
|
285
|
+
|
|
286
|
+
# Dataset management
|
|
287
|
+
datasets = client.list_datasets()
|
|
288
|
+
dataset = client.get_dataset("rem-lookup-golden")
|
|
289
|
+
dataset = client.create_dataset_from_data(
|
|
290
|
+
name="rem-test",
|
|
291
|
+
inputs=[{"query": "LOOKUP person:sarah-chen"}],
|
|
292
|
+
outputs=[{"label": "sarah-chen", "type": "person"}],
|
|
293
|
+
metadata=[{"difficulty": "easy"}]
|
|
294
|
+
)
|
|
295
|
+
|
|
296
|
+
# Experiment execution
|
|
297
|
+
experiment = client.run_experiment(
|
|
298
|
+
dataset="rem-lookup-golden",
|
|
299
|
+
task=ask_rem_task,
|
|
300
|
+
evaluators=[correctness_eval, faithfulness_eval],
|
|
301
|
+
experiment_name="rem-v1"
|
|
302
|
+
)
|
|
303
|
+
|
|
304
|
+
# Trace retrieval
|
|
305
|
+
traces = client.get_traces(
|
|
306
|
+
project_name="rem-agents",
|
|
307
|
+
limit=50
|
|
308
|
+
)
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
### Evaluator Provider
|
|
312
|
+
|
|
313
|
+
```python
|
|
314
|
+
from rem.agentic.providers.phoenix import (
|
|
315
|
+
create_evaluator_from_schema,
|
|
316
|
+
load_evaluator_schema
|
|
317
|
+
)
|
|
318
|
+
|
|
319
|
+
# Load schema
|
|
320
|
+
schema = load_evaluator_schema("rem-lookup-correctness")
|
|
321
|
+
|
|
322
|
+
# Create evaluator
|
|
323
|
+
evaluator = create_evaluator_from_schema("rem-lookup-correctness")
|
|
324
|
+
|
|
325
|
+
# Use in experiment
|
|
326
|
+
result = evaluator({
|
|
327
|
+
"input": {"query": "LOOKUP person:sarah-chen"},
|
|
328
|
+
"output": {"label": "sarah-chen", ...},
|
|
329
|
+
"expected": {"label": "sarah-chen", ...}
|
|
330
|
+
})
|
|
331
|
+
# Returns: {"score": 0.95, "label": "correct", "explanation": "..."}
|
|
332
|
+
```
|
|
333
|
+
|
|
334
|
+
---
|
|
335
|
+
|
|
336
|
+
## Best Practices
|
|
337
|
+
|
|
338
|
+
### Golden Set Quality
|
|
339
|
+
|
|
340
|
+
**Good:**
|
|
341
|
+
- Diverse examples (easy, medium, hard)
|
|
342
|
+
- Edge cases included
|
|
343
|
+
- Clear expected outputs
|
|
344
|
+
- Metadata for filtering
|
|
345
|
+
|
|
346
|
+
**Poor:**
|
|
347
|
+
- Only easy examples
|
|
348
|
+
- Ambiguous expected outputs
|
|
349
|
+
- No metadata
|
|
350
|
+
- Too small (< 10 examples)
|
|
351
|
+
|
|
352
|
+
### Evaluator Design
|
|
353
|
+
|
|
354
|
+
**Good:**
|
|
355
|
+
- Multiple dimensions (correctness, completeness, etc.)
|
|
356
|
+
- Clear scoring rubric
|
|
357
|
+
- Strict grading (catches hallucinations)
|
|
358
|
+
- Detailed feedback
|
|
359
|
+
|
|
360
|
+
**Poor:**
|
|
361
|
+
- Single dimension (just "score")
|
|
362
|
+
- Vague rubric
|
|
363
|
+
- Lenient grading
|
|
364
|
+
- No explanations
|
|
365
|
+
|
|
366
|
+
### Iterative Improvement
|
|
367
|
+
|
|
368
|
+
1. Create initial golden set (10-20 examples)
|
|
369
|
+
2. Run baseline evaluation
|
|
370
|
+
3. Identify failure modes
|
|
371
|
+
4. Add edge cases to golden set
|
|
372
|
+
5. Improve agent or prompts
|
|
373
|
+
6. Re-run evaluation
|
|
374
|
+
7. Compare results over time
|
|
375
|
+
|
|
376
|
+
**Track Progress:**
|
|
377
|
+
- Use versioned experiment names: `rem-v1-baseline`, `rem-v2-improved`
|
|
378
|
+
- Add metadata: `{"agent_version": "v2", "prompt_version": "2024-11-20"}`
|
|
379
|
+
- Compare scores in Phoenix UI
|
|
380
|
+
|
|
381
|
+
---
|
|
382
|
+
|
|
383
|
+
## Troubleshooting
|
|
384
|
+
|
|
385
|
+
### Connection Issues
|
|
386
|
+
|
|
387
|
+
**Problem:** "Connection refused"
|
|
388
|
+
|
|
389
|
+
```bash
|
|
390
|
+
# Check port-forward
|
|
391
|
+
lsof -i :6006
|
|
392
|
+
|
|
393
|
+
# Restart port-forward
|
|
394
|
+
kubectl port-forward -n observability svc/phoenix-svc 6006:6006
|
|
395
|
+
```
|
|
396
|
+
|
|
397
|
+
### Authentication Issues
|
|
398
|
+
|
|
399
|
+
**Problem:** "401 Unauthorized"
|
|
400
|
+
|
|
401
|
+
```bash
|
|
402
|
+
# Check API key
|
|
403
|
+
echo $PHOENIX_API_KEY
|
|
404
|
+
|
|
405
|
+
# Set if empty
|
|
406
|
+
export PHOENIX_API_KEY=<your-key>
|
|
407
|
+
```
|
|
408
|
+
|
|
409
|
+
### Dataset Not Found
|
|
410
|
+
|
|
411
|
+
```bash
|
|
412
|
+
# List all datasets (check spelling, case-sensitive)
|
|
413
|
+
rem experiments dataset list
|
|
414
|
+
```
|
|
415
|
+
|
|
416
|
+
### Evaluator Schema Not Found
|
|
417
|
+
|
|
418
|
+
```bash
|
|
419
|
+
# Check schema exists
|
|
420
|
+
ls rem/schemas/evaluators/
|
|
421
|
+
|
|
422
|
+
# Load without file extension
|
|
423
|
+
# ✓ "rem-lookup-correctness"
|
|
424
|
+
# ✗ "rem-lookup-correctness.yaml"
|
|
425
|
+
```
|
|
426
|
+
|
|
427
|
+
---
|
|
428
|
+
|
|
429
|
+
## Related Documentation
|
|
430
|
+
|
|
431
|
+
- [REM CLAUDE.md](../../../CLAUDE.md) - Overall REM architecture
|
|
432
|
+
- [Phoenix Official Docs](https://docs.arize.com/phoenix) - Phoenix platform
|
|
433
|
+
- [Carrier Evaluation](https://github.com/anthropics/carrier/blob/main/docs/03-evaluation.md) - Inspiration for two-phase approach
|
|
434
|
+
|
|
435
|
+
---
|
|
436
|
+
|
|
437
|
+
## Summary
|
|
438
|
+
|
|
439
|
+
REM's Phoenix evaluation framework provides:
|
|
440
|
+
|
|
441
|
+
✅ **Two-phase workflow** - SMEs create golden sets, automation runs evaluations
|
|
442
|
+
✅ **Lean service layer** - Clean API for datasets/experiments
|
|
443
|
+
✅ **Evaluator provider** - Schema-based LLM-as-a-Judge pattern
|
|
444
|
+
✅ **CLI commands** - Simple workflow for creating datasets and running experiments
|
|
445
|
+
✅ **Comprehensive schemas** - Agent evaluators + RAG evaluators (RAGAS-inspired, no dependency)
|
|
446
|
+
✅ **Agent + Tools support** - OTEL tracing of MCP tool calls
|
|
447
|
+
✅ **Systematic tracking** - Phoenix integration for analysis over time
|
|
448
|
+
|
|
449
|
+
**Next Steps:**
|
|
450
|
+
1. Create your first golden set (`rem experiments dataset create`)
|
|
451
|
+
2. Run baseline evaluation (`rem experiments run`)
|
|
452
|
+
3. Iterate and improve agents
|
|
453
|
+
4. Track progress in Phoenix UI (`open http://localhost:6006`)
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Phoenix observability and evaluation services for REM.
|
|
2
|
+
|
|
3
|
+
This package provides Phoenix integration for:
|
|
4
|
+
1. Dataset management (golden sets, evaluation datasets)
|
|
5
|
+
2. Experiment execution (agent runs, evaluator runs)
|
|
6
|
+
3. Trace retrieval and analysis
|
|
7
|
+
4. Label management for organizing evaluations
|
|
8
|
+
|
|
9
|
+
Two-Phase Evaluation Workflow:
|
|
10
|
+
==============================
|
|
11
|
+
|
|
12
|
+
Phase 1: SME Golden Set Creation
|
|
13
|
+
---------------------------------
|
|
14
|
+
Subject Matter Experts create golden datasets containing:
|
|
15
|
+
- input: What the agent receives (e.g., {"query": "LOOKUP person:sarah-chen"})
|
|
16
|
+
- reference: Expected correct output (ground truth)
|
|
17
|
+
- metadata: Optional context (difficulty, category, etc.)
|
|
18
|
+
|
|
19
|
+
Phase 2: Automated Evaluation
|
|
20
|
+
------------------------------
|
|
21
|
+
1. Run agents against golden set → produces agent outputs
|
|
22
|
+
2. Run evaluators against (input, agent_output, reference) → produces scores
|
|
23
|
+
3. Track results in Phoenix for analysis and iteration
|
|
24
|
+
|
|
25
|
+
This two-phase approach allows:
|
|
26
|
+
- SMEs to contribute domain knowledge without running agents
|
|
27
|
+
- Automated regression testing as agents evolve
|
|
28
|
+
- Systematic comparison across agent versions
|
|
29
|
+
- Label-based organization (by query type, difficulty, etc.)
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
from .client import PhoenixClient
|
|
33
|
+
from .config import PhoenixConfig
|
|
34
|
+
from .prompt_labels import (
|
|
35
|
+
PhoenixPromptLabels,
|
|
36
|
+
setup_rem_labels,
|
|
37
|
+
REM_LABELS,
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
__all__ = [
|
|
41
|
+
"PhoenixClient",
|
|
42
|
+
"PhoenixConfig",
|
|
43
|
+
"PhoenixPromptLabels",
|
|
44
|
+
"setup_rem_labels",
|
|
45
|
+
"REM_LABELS",
|
|
46
|
+
]
|