remdb 0.2.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of remdb might be problematic. Click here for more details.

Files changed (187) hide show
  1. rem/__init__.py +2 -0
  2. rem/agentic/README.md +650 -0
  3. rem/agentic/__init__.py +39 -0
  4. rem/agentic/agents/README.md +155 -0
  5. rem/agentic/agents/__init__.py +8 -0
  6. rem/agentic/context.py +148 -0
  7. rem/agentic/context_builder.py +329 -0
  8. rem/agentic/mcp/__init__.py +0 -0
  9. rem/agentic/mcp/tool_wrapper.py +107 -0
  10. rem/agentic/otel/__init__.py +5 -0
  11. rem/agentic/otel/setup.py +151 -0
  12. rem/agentic/providers/phoenix.py +674 -0
  13. rem/agentic/providers/pydantic_ai.py +572 -0
  14. rem/agentic/query.py +117 -0
  15. rem/agentic/query_helper.py +89 -0
  16. rem/agentic/schema.py +396 -0
  17. rem/agentic/serialization.py +245 -0
  18. rem/agentic/tools/__init__.py +5 -0
  19. rem/agentic/tools/rem_tools.py +231 -0
  20. rem/api/README.md +420 -0
  21. rem/api/main.py +324 -0
  22. rem/api/mcp_router/prompts.py +182 -0
  23. rem/api/mcp_router/resources.py +536 -0
  24. rem/api/mcp_router/server.py +213 -0
  25. rem/api/mcp_router/tools.py +584 -0
  26. rem/api/routers/auth.py +229 -0
  27. rem/api/routers/chat/__init__.py +5 -0
  28. rem/api/routers/chat/completions.py +281 -0
  29. rem/api/routers/chat/json_utils.py +76 -0
  30. rem/api/routers/chat/models.py +124 -0
  31. rem/api/routers/chat/streaming.py +185 -0
  32. rem/auth/README.md +258 -0
  33. rem/auth/__init__.py +26 -0
  34. rem/auth/middleware.py +100 -0
  35. rem/auth/providers/__init__.py +13 -0
  36. rem/auth/providers/base.py +376 -0
  37. rem/auth/providers/google.py +163 -0
  38. rem/auth/providers/microsoft.py +237 -0
  39. rem/cli/README.md +455 -0
  40. rem/cli/__init__.py +8 -0
  41. rem/cli/commands/README.md +126 -0
  42. rem/cli/commands/__init__.py +3 -0
  43. rem/cli/commands/ask.py +565 -0
  44. rem/cli/commands/configure.py +423 -0
  45. rem/cli/commands/db.py +493 -0
  46. rem/cli/commands/dreaming.py +324 -0
  47. rem/cli/commands/experiments.py +1124 -0
  48. rem/cli/commands/mcp.py +66 -0
  49. rem/cli/commands/process.py +245 -0
  50. rem/cli/commands/schema.py +183 -0
  51. rem/cli/commands/serve.py +106 -0
  52. rem/cli/dreaming.py +363 -0
  53. rem/cli/main.py +88 -0
  54. rem/config.py +237 -0
  55. rem/mcp_server.py +41 -0
  56. rem/models/core/__init__.py +49 -0
  57. rem/models/core/core_model.py +64 -0
  58. rem/models/core/engram.py +333 -0
  59. rem/models/core/experiment.py +628 -0
  60. rem/models/core/inline_edge.py +132 -0
  61. rem/models/core/rem_query.py +243 -0
  62. rem/models/entities/__init__.py +43 -0
  63. rem/models/entities/file.py +57 -0
  64. rem/models/entities/image_resource.py +88 -0
  65. rem/models/entities/message.py +35 -0
  66. rem/models/entities/moment.py +123 -0
  67. rem/models/entities/ontology.py +191 -0
  68. rem/models/entities/ontology_config.py +131 -0
  69. rem/models/entities/resource.py +95 -0
  70. rem/models/entities/schema.py +87 -0
  71. rem/models/entities/user.py +85 -0
  72. rem/py.typed +0 -0
  73. rem/schemas/README.md +507 -0
  74. rem/schemas/__init__.py +6 -0
  75. rem/schemas/agents/README.md +92 -0
  76. rem/schemas/agents/core/moment-builder.yaml +178 -0
  77. rem/schemas/agents/core/rem-query-agent.yaml +226 -0
  78. rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
  79. rem/schemas/agents/core/simple-assistant.yaml +19 -0
  80. rem/schemas/agents/core/user-profile-builder.yaml +163 -0
  81. rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
  82. rem/schemas/agents/examples/contract-extractor.yaml +134 -0
  83. rem/schemas/agents/examples/cv-parser.yaml +263 -0
  84. rem/schemas/agents/examples/hello-world.yaml +37 -0
  85. rem/schemas/agents/examples/query.yaml +54 -0
  86. rem/schemas/agents/examples/simple.yaml +21 -0
  87. rem/schemas/agents/examples/test.yaml +29 -0
  88. rem/schemas/agents/rem.yaml +128 -0
  89. rem/schemas/evaluators/hello-world/default.yaml +77 -0
  90. rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
  91. rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
  92. rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
  93. rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
  94. rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
  95. rem/services/__init__.py +16 -0
  96. rem/services/audio/INTEGRATION.md +308 -0
  97. rem/services/audio/README.md +376 -0
  98. rem/services/audio/__init__.py +15 -0
  99. rem/services/audio/chunker.py +354 -0
  100. rem/services/audio/transcriber.py +259 -0
  101. rem/services/content/README.md +1269 -0
  102. rem/services/content/__init__.py +5 -0
  103. rem/services/content/providers.py +806 -0
  104. rem/services/content/service.py +657 -0
  105. rem/services/dreaming/README.md +230 -0
  106. rem/services/dreaming/__init__.py +53 -0
  107. rem/services/dreaming/affinity_service.py +336 -0
  108. rem/services/dreaming/moment_service.py +264 -0
  109. rem/services/dreaming/ontology_service.py +54 -0
  110. rem/services/dreaming/user_model_service.py +297 -0
  111. rem/services/dreaming/utils.py +39 -0
  112. rem/services/embeddings/__init__.py +11 -0
  113. rem/services/embeddings/api.py +120 -0
  114. rem/services/embeddings/worker.py +421 -0
  115. rem/services/fs/README.md +662 -0
  116. rem/services/fs/__init__.py +62 -0
  117. rem/services/fs/examples.py +206 -0
  118. rem/services/fs/examples_paths.py +204 -0
  119. rem/services/fs/git_provider.py +935 -0
  120. rem/services/fs/local_provider.py +760 -0
  121. rem/services/fs/parsing-hooks-examples.md +172 -0
  122. rem/services/fs/paths.py +276 -0
  123. rem/services/fs/provider.py +460 -0
  124. rem/services/fs/s3_provider.py +1042 -0
  125. rem/services/fs/service.py +186 -0
  126. rem/services/git/README.md +1075 -0
  127. rem/services/git/__init__.py +17 -0
  128. rem/services/git/service.py +469 -0
  129. rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
  130. rem/services/phoenix/README.md +453 -0
  131. rem/services/phoenix/__init__.py +46 -0
  132. rem/services/phoenix/client.py +686 -0
  133. rem/services/phoenix/config.py +88 -0
  134. rem/services/phoenix/prompt_labels.py +477 -0
  135. rem/services/postgres/README.md +575 -0
  136. rem/services/postgres/__init__.py +23 -0
  137. rem/services/postgres/migration_service.py +427 -0
  138. rem/services/postgres/pydantic_to_sqlalchemy.py +232 -0
  139. rem/services/postgres/register_type.py +352 -0
  140. rem/services/postgres/repository.py +337 -0
  141. rem/services/postgres/schema_generator.py +379 -0
  142. rem/services/postgres/service.py +802 -0
  143. rem/services/postgres/sql_builder.py +354 -0
  144. rem/services/rem/README.md +304 -0
  145. rem/services/rem/__init__.py +23 -0
  146. rem/services/rem/exceptions.py +71 -0
  147. rem/services/rem/executor.py +293 -0
  148. rem/services/rem/parser.py +145 -0
  149. rem/services/rem/queries.py +196 -0
  150. rem/services/rem/query.py +371 -0
  151. rem/services/rem/service.py +527 -0
  152. rem/services/session/README.md +374 -0
  153. rem/services/session/__init__.py +6 -0
  154. rem/services/session/compression.py +360 -0
  155. rem/services/session/reload.py +77 -0
  156. rem/settings.py +1235 -0
  157. rem/sql/002_install_models.sql +1068 -0
  158. rem/sql/background_indexes.sql +42 -0
  159. rem/sql/install_models.sql +1038 -0
  160. rem/sql/migrations/001_install.sql +503 -0
  161. rem/sql/migrations/002_install_models.sql +1202 -0
  162. rem/utils/AGENTIC_CHUNKING.md +597 -0
  163. rem/utils/README.md +583 -0
  164. rem/utils/__init__.py +43 -0
  165. rem/utils/agentic_chunking.py +622 -0
  166. rem/utils/batch_ops.py +343 -0
  167. rem/utils/chunking.py +108 -0
  168. rem/utils/clip_embeddings.py +276 -0
  169. rem/utils/dict_utils.py +98 -0
  170. rem/utils/embeddings.py +423 -0
  171. rem/utils/examples/embeddings_example.py +305 -0
  172. rem/utils/examples/sql_types_example.py +202 -0
  173. rem/utils/markdown.py +16 -0
  174. rem/utils/model_helpers.py +236 -0
  175. rem/utils/schema_loader.py +229 -0
  176. rem/utils/sql_types.py +348 -0
  177. rem/utils/user_id.py +81 -0
  178. rem/utils/vision.py +330 -0
  179. rem/workers/README.md +506 -0
  180. rem/workers/__init__.py +5 -0
  181. rem/workers/dreaming.py +502 -0
  182. rem/workers/engram_processor.py +312 -0
  183. rem/workers/sqs_file_processor.py +193 -0
  184. remdb-0.2.6.dist-info/METADATA +1191 -0
  185. remdb-0.2.6.dist-info/RECORD +187 -0
  186. remdb-0.2.6.dist-info/WHEEL +4 -0
  187. remdb-0.2.6.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,453 @@
1
+ # Phoenix Evaluation Framework for REM
2
+
3
+ Lean, two-phase evaluation system for REM agents using Arize Phoenix.
4
+
5
+ ---
6
+
7
+ ## Quick Start
8
+
9
+ ### Prerequisites
10
+
11
+ ```bash
12
+ # Port-forward Phoenix (if on Kubernetes)
13
+ kubectl port-forward -n observability svc/phoenix-svc 6006:6006
14
+
15
+ # Set API key
16
+ export PHOENIX_API_KEY=<your-api-key>
17
+
18
+ # Verify connection
19
+ rem experiments dataset list
20
+ ```
21
+
22
+ ### Two-Phase Workflow
23
+
24
+ **Phase 1: SME Creates Golden Set**
25
+ ```bash
26
+ rem experiments dataset create rem-lookup-golden \
27
+ --from-csv golden.csv \
28
+ --input-keys query \
29
+ --output-keys expected_label,expected_type \
30
+ --metadata-keys difficulty,query_type
31
+ ```
32
+
33
+ **Phase 2: Run Evaluation**
34
+ ```bash
35
+ rem experiments run rem-lookup-golden \
36
+ --agent ask_rem \
37
+ --evaluator rem-lookup-correctness
38
+ ```
39
+
40
+ **View Results**
41
+ ```bash
42
+ open http://localhost:6006
43
+ ```
44
+
45
+ ---
46
+
47
+ ## Architecture
48
+
49
+ ### Two-Phase Evaluation Pattern
50
+
51
+ ```
52
+ Phase 1: SME Golden Set Creation
53
+ ├─ SMEs create (input, reference) pairs
54
+ ├─ No agent execution required
55
+ └─ Stored in Phoenix for reuse
56
+
57
+ Phase 2: Automated Evaluation
58
+ ├─ Run agents on golden sets → outputs
59
+ ├─ Run evaluators → scores
60
+ └─ Track results in Phoenix
61
+ ```
62
+
63
+ **Why Two Phases?**
64
+ - SMEs focus on domain knowledge (what's correct)
65
+ - Automation handles systematic testing (how well agents perform)
66
+ - Enables regression testing as agents evolve
67
+
68
+ ### Components
69
+
70
+ ```
71
+ Services (rem/src/rem/services/phoenix/)
72
+ ├─ client.py - PhoenixClient for datasets/experiments
73
+ └─ config.py - Connection configuration
74
+
75
+ Providers (rem/src/rem/agentic/providers/phoenix.py)
76
+ ├─ Evaluator factory (mirrors Pydantic AI pattern)
77
+ └─ Schema-based LLM-as-a-Judge evaluators
78
+
79
+ Evaluator Schemas (rem/schemas/evaluators/)
80
+ ├─ Agent Evaluators (end-to-end)
81
+ │ ├─ rem-lookup-correctness.yaml
82
+ │ └─ rem-search-correctness.yaml
83
+ └─ RAG Evaluators (component-level)
84
+ ├─ rem-retrieval-precision.yaml (RAGAS-inspired)
85
+ ├─ rem-retrieval-recall.yaml (RAGAS-inspired)
86
+ └─ rem-faithfulness.yaml (RAGAS-inspired)
87
+
88
+ CLI Commands (rem/src/rem/cli/commands/experiments.py)
89
+ ├─ rem experiments dataset list/create/add
90
+ ├─ rem experiments run
91
+ ├─ rem experiments prompt list/create
92
+ └─ rem experiments trace list
93
+ ```
94
+
95
+ ---
96
+
97
+ ## Evaluator Types
98
+
99
+ ### Agent Evaluators (End-to-End)
100
+
101
+ Evaluate complete agent output quality.
102
+
103
+ **rem-lookup-correctness.yaml**
104
+ - Dimensions: Correctness, Completeness, Performance Contract
105
+ - Pass threshold: >= 0.75
106
+ - Use for: LOOKUP query evaluation
107
+
108
+ **rem-search-correctness.yaml**
109
+ - Dimensions: Relevance, Completeness, Ranking Quality
110
+ - Pass threshold: >= 0.70
111
+ - Use for: SEARCH query evaluation
112
+
113
+ ### RAG Evaluators (Component-Level)
114
+
115
+ Evaluate the retrieval layer independently (uses RAGAS concepts without adding a RAGAS dependency).
116
+
117
+ **rem-retrieval-precision.yaml**
118
+ - Measures: Relevant entities / Total retrieved entities
119
+ - Evaluates ranking quality (are relevant items ranked high?)
120
+ - Inspired by RAGAS context_precision
121
+
122
+ **rem-retrieval-recall.yaml**
123
+ - Measures: Retrieved expected / Total expected entities
124
+ - Evaluates coverage (did we get all expected entities?)
125
+ - Inspired by RAGAS context_recall
126
+
127
+ **rem-faithfulness.yaml**
128
+ - Measures: Supported claims / Total claims in answer
129
+ - Detects hallucinations (agent making up info not in context)
130
+ - Inspired by RAGAS faithfulness
131
+
132
+ **Usage:**
133
+ ```bash
134
+ # Evaluate retrieval quality
135
+ rem experiments run rem-search-golden \
136
+ --agent ask_rem \
137
+ --evaluator rem-retrieval-precision,rem-retrieval-recall
138
+
139
+ # Evaluate faithfulness
140
+ rem experiments run rem-lookup-golden \
141
+ --agent ask_rem \
142
+ --evaluator rem-faithfulness
143
+ ```
144
+
145
+ ---
146
+
147
+ ## How Agents with Tools Work
148
+
149
+ **Phoenix doesn't "run" agents** - you provide task functions.
150
+
151
+ ### Task Function Pattern
152
+
153
+ ```python
154
+ # You write this
155
+ async def ask_rem_task(example: dict) -> dict:
156
+ """Task function that Phoenix calls for each example."""
157
+ query = example["input"]["query"]
158
+
159
+ # Create agent with MCP tools
160
+ agent = Agent(
161
+ model="claude-sonnet-4-5",
162
+ tools=[ask_rem, search_entities, lookup_entity] # Your MCP tools
163
+ )
164
+
165
+ # Run agent (tools get called)
166
+ result = await agent.run(query)
167
+
168
+ # Return output (Phoenix stores this)
169
+ return result.data.model_dump()
170
+
171
+ # Phoenix orchestrates
172
+ experiment = client.run_experiment(
173
+ dataset="rem-lookup-golden",
174
+ task=ask_rem_task, # Phoenix calls this for each example
175
+ evaluators=[correctness_evaluator]
176
+ )
177
+ ```
178
+
179
+ ### What Phoenix Does
180
+
181
+ 1. **Orchestrates**: Calls your task function for each dataset example
182
+ 2. **Observes**: Captures OTEL traces (agent execution + tool calls)
183
+ 3. **Evaluates**: Runs evaluators on (input, output, expected)
184
+ 4. **Tracks**: Stores results and scores in UI
185
+
186
+ ### MCP Tools Configuration
187
+
188
+ Tools are specified in agent schemas:
189
+
190
+ ```yaml
191
+ # rem/schemas/agents/ask-rem.yaml
192
+ json_schema_extra:
193
+ tools:
194
+ - name: ask_rem
195
+ mcp_server: rem
196
+ usage: "Execute REM queries (LOOKUP, SEARCH, TRAVERSE, SQL)"
197
+ ```
198
+
199
+ When an agent is created, `create_pydantic_ai_agent()`:
200
+ 1. Reads agent schema
201
+ 2. Loads MCP tools from `json_schema_extra.tools`
202
+ 3. Connects to MCP server (FastMCP at `/api/v1/mcp`)
203
+ 4. Registers tools with agent
204
+
205
+ ### OTEL Traces
206
+
207
+ If instrumentation enabled (`settings.otel.enabled`):
208
+
209
+ ```
210
+ Trace: experiment-run
211
+ ├─ Span: agent_run (parent)
212
+ │ ├─ input: "LOOKUP person:sarah-chen"
213
+ │ └─ output: {"answer": "...", "entities": [...]}
214
+ ├─ Span: tool_call.ask_rem (child)
215
+ │ ├─ input: {"query": "LOOKUP person:sarah-chen"}
216
+ │ └─ output: {"entities": [...]}
217
+ └─ Span: evaluation.correctness (sibling)
218
+ ├─ scores: {"correctness": 0.95, "completeness": 0.88}
219
+ └─ pass: true
220
+ ```
221
+
222
+ Phoenix receives these spans and displays in UI.
223
+
224
+ ---
225
+
226
+ ## CLI Reference
227
+
228
+ ### Dataset Commands
229
+
230
+ ```bash
231
+ # List all datasets
232
+ rem experiments dataset list
233
+
234
+ # Create from CSV
235
+ rem experiments dataset create <name> \
236
+ --from-csv golden.csv \
237
+ --input-keys query \
238
+ --output-keys expected_label,expected_type \
239
+ --metadata-keys difficulty,query_type
240
+
241
+ # Add examples to existing dataset
242
+ rem experiments dataset add <name> \
243
+ --from-csv new-examples.csv \
244
+ --input-keys query \
245
+ --output-keys expected_label,expected_type
246
+ ```
247
+
248
+ ### Experiment Commands
249
+
250
+ ```bash
251
+ # Run agent only
252
+ rem experiments run <dataset> \
253
+ --experiment <name> \
254
+ --agent ask_rem
255
+
256
+ # Run evaluator only
257
+ rem experiments run <dataset> \
258
+ --experiment <name> \
259
+ --evaluator rem-lookup-correctness
260
+
261
+ # Run agent + evaluators
262
+ rem experiments run <dataset> \
263
+ --experiment <name> \
264
+ --agent ask_rem \
265
+ --evaluator rem-lookup-correctness,rem-faithfulness
266
+ ```
267
+
268
+ ### Trace Commands
269
+
270
+ ```bash
271
+ # List recent traces
272
+ rem experiments trace list --project rem-agents --days 7 --limit 50
273
+ ```
274
+
275
+ ---
276
+
277
+ ## API Reference
278
+
279
+ ### PhoenixClient
280
+
281
+ ```python
282
+ from rem.services.phoenix import PhoenixClient
283
+
284
+ client = PhoenixClient()
285
+
286
+ # Dataset management
287
+ datasets = client.list_datasets()
288
+ dataset = client.get_dataset("rem-lookup-golden")
289
+ dataset = client.create_dataset_from_data(
290
+ name="rem-test",
291
+ inputs=[{"query": "LOOKUP person:sarah-chen"}],
292
+ outputs=[{"label": "sarah-chen", "type": "person"}],
293
+ metadata=[{"difficulty": "easy"}]
294
+ )
295
+
296
+ # Experiment execution
297
+ experiment = client.run_experiment(
298
+ dataset="rem-lookup-golden",
299
+ task=ask_rem_task,
300
+ evaluators=[correctness_eval, faithfulness_eval],
301
+ experiment_name="rem-v1"
302
+ )
303
+
304
+ # Trace retrieval
305
+ traces = client.get_traces(
306
+ project_name="rem-agents",
307
+ limit=50
308
+ )
309
+ ```
310
+
311
+ ### Evaluator Provider
312
+
313
+ ```python
314
+ from rem.agentic.providers.phoenix import (
315
+ create_evaluator_from_schema,
316
+ load_evaluator_schema
317
+ )
318
+
319
+ # Load schema
320
+ schema = load_evaluator_schema("rem-lookup-correctness")
321
+
322
+ # Create evaluator
323
+ evaluator = create_evaluator_from_schema("rem-lookup-correctness")
324
+
325
+ # Use in experiment
326
+ result = evaluator({
327
+ "input": {"query": "LOOKUP person:sarah-chen"},
328
+ "output": {"label": "sarah-chen", ...},
329
+ "expected": {"label": "sarah-chen", ...}
330
+ })
331
+ # Returns: {"score": 0.95, "label": "correct", "explanation": "..."}
332
+ ```
333
+
334
+ ---
335
+
336
+ ## Best Practices
337
+
338
+ ### Golden Set Quality
339
+
340
+ **Good:**
341
+ - Diverse examples (easy, medium, hard)
342
+ - Edge cases included
343
+ - Clear expected outputs
344
+ - Metadata for filtering
345
+
346
+ **Poor:**
347
+ - Only easy examples
348
+ - Ambiguous expected outputs
349
+ - No metadata
350
+ - Too small (< 10 examples)
351
+
352
+ ### Evaluator Design
353
+
354
+ **Good:**
355
+ - Multiple dimensions (correctness, completeness, etc.)
356
+ - Clear scoring rubric
357
+ - Strict grading (catches hallucinations)
358
+ - Detailed feedback
359
+
360
+ **Poor:**
361
+ - Single dimension (just "score")
362
+ - Vague rubric
363
+ - Lenient grading
364
+ - No explanations
365
+
366
+ ### Iterative Improvement
367
+
368
+ 1. Create initial golden set (10-20 examples)
369
+ 2. Run baseline evaluation
370
+ 3. Identify failure modes
371
+ 4. Add edge cases to golden set
372
+ 5. Improve agent or prompts
373
+ 6. Re-run evaluation
374
+ 7. Compare results over time
375
+
376
+ **Track Progress:**
377
+ - Use versioned experiment names: `rem-v1-baseline`, `rem-v2-improved`
378
+ - Add metadata: `{"agent_version": "v2", "prompt_version": "2024-11-20"}`
379
+ - Compare scores in Phoenix UI
380
+
381
+ ---
382
+
383
+ ## Troubleshooting
384
+
385
+ ### Connection Issues
386
+
387
+ **Problem:** "Connection refused"
388
+
389
+ ```bash
390
+ # Check port-forward
391
+ lsof -i :6006
392
+
393
+ # Restart port-forward
394
+ kubectl port-forward -n observability svc/phoenix-svc 6006:6006
395
+ ```
396
+
397
+ ### Authentication Issues
398
+
399
+ **Problem:** "401 Unauthorized"
400
+
401
+ ```bash
402
+ # Check API key
403
+ echo $PHOENIX_API_KEY
404
+
405
+ # Set if empty
406
+ export PHOENIX_API_KEY=<your-key>
407
+ ```
408
+
409
+ ### Dataset Not Found
410
+
411
+ ```bash
412
+ # List all datasets (check spelling, case-sensitive)
413
+ rem experiments dataset list
414
+ ```
415
+
416
+ ### Evaluator Schema Not Found
417
+
418
+ ```bash
419
+ # Check schema exists
420
+ ls rem/schemas/evaluators/
421
+
422
+ # Load without file extension
423
+ # ✓ "rem-lookup-correctness"
424
+ # ✗ "rem-lookup-correctness.yaml"
425
+ ```
426
+
427
+ ---
428
+
429
+ ## Related Documentation
430
+
431
+ - [REM CLAUDE.md](../../../CLAUDE.md) - Overall REM architecture
432
+ - [Phoenix Official Docs](https://docs.arize.com/phoenix) - Phoenix platform
433
+ - [Carrier Evaluation](https://github.com/anthropics/carrier/blob/main/docs/03-evaluation.md) - Inspiration for two-phase approach
434
+
435
+ ---
436
+
437
+ ## Summary
438
+
439
+ REM's Phoenix evaluation framework provides:
440
+
441
+ ✅ **Two-phase workflow** - SMEs create golden sets, automation runs evaluations
442
+ ✅ **Lean service layer** - Clean API for datasets/experiments
443
+ ✅ **Evaluator provider** - Schema-based LLM-as-a-Judge pattern
444
+ ✅ **CLI commands** - Simple workflow for creating datasets and running experiments
445
+ ✅ **Comprehensive schemas** - Agent evaluators + RAG evaluators (RAGAS-inspired, no dependency)
446
+ ✅ **Agent + Tools support** - OTEL tracing of MCP tool calls
447
+ ✅ **Systematic tracking** - Phoenix integration for analysis over time
448
+
449
+ **Next Steps:**
450
+ 1. Create your first golden set (`rem experiments dataset create`)
451
+ 2. Run baseline evaluation (`rem experiments run`)
452
+ 3. Iterate and improve agents
453
+ 4. Track progress in Phoenix UI (`open http://localhost:6006`)
@@ -0,0 +1,46 @@
1
+ """Phoenix observability and evaluation services for REM.
2
+
3
+ This package provides Phoenix integration for:
4
+ 1. Dataset management (golden sets, evaluation datasets)
5
+ 2. Experiment execution (agent runs, evaluator runs)
6
+ 3. Trace retrieval and analysis
7
+ 4. Label management for organizing evaluations
8
+
9
+ Two-Phase Evaluation Workflow:
10
+ ==============================
11
+
12
+ Phase 1: SME Golden Set Creation
13
+ ---------------------------------
14
+ Subject Matter Experts create golden datasets containing:
15
+ - input: What the agent receives (e.g., {"query": "LOOKUP person:sarah-chen"})
16
+ - reference: Expected correct output (ground truth)
17
+ - metadata: Optional context (difficulty, category, etc.)
18
+
19
+ Phase 2: Automated Evaluation
20
+ ------------------------------
21
+ 1. Run agents against golden set → produces agent outputs
22
+ 2. Run evaluators against (input, agent_output, reference) → produces scores
23
+ 3. Track results in Phoenix for analysis and iteration
24
+
25
+ This two-phase approach allows:
26
+ - SMEs to contribute domain knowledge without running agents
27
+ - Automated regression testing as agents evolve
28
+ - Systematic comparison across agent versions
29
+ - Label-based organization (by query type, difficulty, etc.)
30
+ """
31
+
32
+ from .client import PhoenixClient
33
+ from .config import PhoenixConfig
34
+ from .prompt_labels import (
35
+ PhoenixPromptLabels,
36
+ setup_rem_labels,
37
+ REM_LABELS,
38
+ )
39
+
40
+ __all__ = [
41
+ "PhoenixClient",
42
+ "PhoenixConfig",
43
+ "PhoenixPromptLabels",
44
+ "setup_rem_labels",
45
+ "REM_LABELS",
46
+ ]