remdb-0.3.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. rem/__init__.py +2 -0
  2. rem/agentic/README.md +650 -0
  3. rem/agentic/__init__.py +39 -0
  4. rem/agentic/agents/README.md +155 -0
  5. rem/agentic/agents/__init__.py +8 -0
  6. rem/agentic/context.py +148 -0
  7. rem/agentic/context_builder.py +329 -0
  8. rem/agentic/mcp/__init__.py +0 -0
  9. rem/agentic/mcp/tool_wrapper.py +107 -0
  10. rem/agentic/otel/__init__.py +5 -0
  11. rem/agentic/otel/setup.py +151 -0
  12. rem/agentic/providers/phoenix.py +674 -0
  13. rem/agentic/providers/pydantic_ai.py +572 -0
  14. rem/agentic/query.py +117 -0
  15. rem/agentic/query_helper.py +89 -0
  16. rem/agentic/schema.py +396 -0
  17. rem/agentic/serialization.py +245 -0
  18. rem/agentic/tools/__init__.py +5 -0
  19. rem/agentic/tools/rem_tools.py +231 -0
  20. rem/api/README.md +420 -0
  21. rem/api/main.py +324 -0
  22. rem/api/mcp_router/prompts.py +182 -0
  23. rem/api/mcp_router/resources.py +536 -0
  24. rem/api/mcp_router/server.py +213 -0
  25. rem/api/mcp_router/tools.py +584 -0
  26. rem/api/routers/auth.py +229 -0
  27. rem/api/routers/chat/__init__.py +5 -0
  28. rem/api/routers/chat/completions.py +281 -0
  29. rem/api/routers/chat/json_utils.py +76 -0
  30. rem/api/routers/chat/models.py +124 -0
  31. rem/api/routers/chat/streaming.py +185 -0
  32. rem/auth/README.md +258 -0
  33. rem/auth/__init__.py +26 -0
  34. rem/auth/middleware.py +100 -0
  35. rem/auth/providers/__init__.py +13 -0
  36. rem/auth/providers/base.py +376 -0
  37. rem/auth/providers/google.py +163 -0
  38. rem/auth/providers/microsoft.py +237 -0
  39. rem/cli/README.md +455 -0
  40. rem/cli/__init__.py +8 -0
  41. rem/cli/commands/README.md +126 -0
  42. rem/cli/commands/__init__.py +3 -0
  43. rem/cli/commands/ask.py +566 -0
  44. rem/cli/commands/configure.py +497 -0
  45. rem/cli/commands/db.py +493 -0
  46. rem/cli/commands/dreaming.py +324 -0
  47. rem/cli/commands/experiments.py +1302 -0
  48. rem/cli/commands/mcp.py +66 -0
  49. rem/cli/commands/process.py +245 -0
  50. rem/cli/commands/schema.py +183 -0
  51. rem/cli/commands/serve.py +106 -0
  52. rem/cli/dreaming.py +363 -0
  53. rem/cli/main.py +96 -0
  54. rem/config.py +237 -0
  55. rem/mcp_server.py +41 -0
  56. rem/models/core/__init__.py +49 -0
  57. rem/models/core/core_model.py +64 -0
  58. rem/models/core/engram.py +333 -0
  59. rem/models/core/experiment.py +628 -0
  60. rem/models/core/inline_edge.py +132 -0
  61. rem/models/core/rem_query.py +243 -0
  62. rem/models/entities/__init__.py +43 -0
  63. rem/models/entities/file.py +57 -0
  64. rem/models/entities/image_resource.py +88 -0
  65. rem/models/entities/message.py +35 -0
  66. rem/models/entities/moment.py +123 -0
  67. rem/models/entities/ontology.py +191 -0
  68. rem/models/entities/ontology_config.py +131 -0
  69. rem/models/entities/resource.py +95 -0
  70. rem/models/entities/schema.py +87 -0
  71. rem/models/entities/user.py +85 -0
  72. rem/py.typed +0 -0
  73. rem/schemas/README.md +507 -0
  74. rem/schemas/__init__.py +6 -0
  75. rem/schemas/agents/README.md +92 -0
  76. rem/schemas/agents/core/moment-builder.yaml +178 -0
  77. rem/schemas/agents/core/rem-query-agent.yaml +226 -0
  78. rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
  79. rem/schemas/agents/core/simple-assistant.yaml +19 -0
  80. rem/schemas/agents/core/user-profile-builder.yaml +163 -0
  81. rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
  82. rem/schemas/agents/examples/contract-extractor.yaml +134 -0
  83. rem/schemas/agents/examples/cv-parser.yaml +263 -0
  84. rem/schemas/agents/examples/hello-world.yaml +37 -0
  85. rem/schemas/agents/examples/query.yaml +54 -0
  86. rem/schemas/agents/examples/simple.yaml +21 -0
  87. rem/schemas/agents/examples/test.yaml +29 -0
  88. rem/schemas/agents/rem.yaml +128 -0
  89. rem/schemas/evaluators/hello-world/default.yaml +77 -0
  90. rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
  91. rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
  92. rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
  93. rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
  94. rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
  95. rem/services/__init__.py +16 -0
  96. rem/services/audio/INTEGRATION.md +308 -0
  97. rem/services/audio/README.md +376 -0
  98. rem/services/audio/__init__.py +15 -0
  99. rem/services/audio/chunker.py +354 -0
  100. rem/services/audio/transcriber.py +259 -0
  101. rem/services/content/README.md +1269 -0
  102. rem/services/content/__init__.py +5 -0
  103. rem/services/content/providers.py +801 -0
  104. rem/services/content/service.py +676 -0
  105. rem/services/dreaming/README.md +230 -0
  106. rem/services/dreaming/__init__.py +53 -0
  107. rem/services/dreaming/affinity_service.py +336 -0
  108. rem/services/dreaming/moment_service.py +264 -0
  109. rem/services/dreaming/ontology_service.py +54 -0
  110. rem/services/dreaming/user_model_service.py +297 -0
  111. rem/services/dreaming/utils.py +39 -0
  112. rem/services/embeddings/__init__.py +11 -0
  113. rem/services/embeddings/api.py +120 -0
  114. rem/services/embeddings/worker.py +421 -0
  115. rem/services/fs/README.md +662 -0
  116. rem/services/fs/__init__.py +62 -0
  117. rem/services/fs/examples.py +206 -0
  118. rem/services/fs/examples_paths.py +204 -0
  119. rem/services/fs/git_provider.py +935 -0
  120. rem/services/fs/local_provider.py +760 -0
  121. rem/services/fs/parsing-hooks-examples.md +172 -0
  122. rem/services/fs/paths.py +276 -0
  123. rem/services/fs/provider.py +460 -0
  124. rem/services/fs/s3_provider.py +1042 -0
  125. rem/services/fs/service.py +186 -0
  126. rem/services/git/README.md +1075 -0
  127. rem/services/git/__init__.py +17 -0
  128. rem/services/git/service.py +469 -0
  129. rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
  130. rem/services/phoenix/README.md +453 -0
  131. rem/services/phoenix/__init__.py +46 -0
  132. rem/services/phoenix/client.py +686 -0
  133. rem/services/phoenix/config.py +88 -0
  134. rem/services/phoenix/prompt_labels.py +477 -0
  135. rem/services/postgres/README.md +575 -0
  136. rem/services/postgres/__init__.py +23 -0
  137. rem/services/postgres/migration_service.py +427 -0
  138. rem/services/postgres/pydantic_to_sqlalchemy.py +232 -0
  139. rem/services/postgres/register_type.py +352 -0
  140. rem/services/postgres/repository.py +337 -0
  141. rem/services/postgres/schema_generator.py +379 -0
  142. rem/services/postgres/service.py +802 -0
  143. rem/services/postgres/sql_builder.py +354 -0
  144. rem/services/rem/README.md +304 -0
  145. rem/services/rem/__init__.py +23 -0
  146. rem/services/rem/exceptions.py +71 -0
  147. rem/services/rem/executor.py +293 -0
  148. rem/services/rem/parser.py +145 -0
  149. rem/services/rem/queries.py +196 -0
  150. rem/services/rem/query.py +371 -0
  151. rem/services/rem/service.py +527 -0
  152. rem/services/session/README.md +374 -0
  153. rem/services/session/__init__.py +6 -0
  154. rem/services/session/compression.py +360 -0
  155. rem/services/session/reload.py +77 -0
  156. rem/settings.py +1235 -0
  157. rem/sql/002_install_models.sql +1068 -0
  158. rem/sql/background_indexes.sql +42 -0
  159. rem/sql/install_models.sql +1038 -0
  160. rem/sql/migrations/001_install.sql +503 -0
  161. rem/sql/migrations/002_install_models.sql +1202 -0
  162. rem/utils/AGENTIC_CHUNKING.md +597 -0
  163. rem/utils/README.md +583 -0
  164. rem/utils/__init__.py +43 -0
  165. rem/utils/agentic_chunking.py +622 -0
  166. rem/utils/batch_ops.py +343 -0
  167. rem/utils/chunking.py +108 -0
  168. rem/utils/clip_embeddings.py +276 -0
  169. rem/utils/dict_utils.py +98 -0
  170. rem/utils/embeddings.py +423 -0
  171. rem/utils/examples/embeddings_example.py +305 -0
  172. rem/utils/examples/sql_types_example.py +202 -0
  173. rem/utils/markdown.py +16 -0
  174. rem/utils/model_helpers.py +236 -0
  175. rem/utils/schema_loader.py +336 -0
  176. rem/utils/sql_types.py +348 -0
  177. rem/utils/user_id.py +81 -0
  178. rem/utils/vision.py +330 -0
  179. rem/workers/README.md +506 -0
  180. rem/workers/__init__.py +5 -0
  181. rem/workers/dreaming.py +502 -0
  182. rem/workers/engram_processor.py +312 -0
  183. rem/workers/sqs_file_processor.py +193 -0
  184. remdb-0.3.7.dist-info/METADATA +1473 -0
  185. remdb-0.3.7.dist-info/RECORD +187 -0
  186. remdb-0.3.7.dist-info/WHEEL +4 -0
  187. remdb-0.3.7.dist-info/entry_points.txt +2 -0
rem/agentic/providers/phoenix.py
@@ -0,0 +1,674 @@
+ """Phoenix evaluator provider for REM agents.
+
+ This module provides factory functions for creating Phoenix-compatible evaluators
+ from schema definitions, following the same pattern as Pydantic AI agent creation.
+
+ Exported Functions:
+ ===================
+ - load_evaluator_schema: Load evaluator schemas from schemas/evaluators/
+ - create_phoenix_evaluator: Create Phoenix evaluator config from schema
+ - create_evaluator_from_schema: Create callable evaluator function
+ - schema_to_prompt: Convert schema to Phoenix openai_params format
+ - sanitize_tool_name: Sanitize tool names for Phoenix/OpenAI compatibility
+ - run_evaluation_experiment: Run complete evaluation workflow
+
+ Design Pattern (mirrors Pydantic AI provider):
+ ==============================================
+ 1. Load evaluator schemas from schemas/evaluators/ directory
+ 2. Extract system prompt, output schema, and metadata
+ 3. Create Phoenix-compatible evaluator functions
+ 4. Support both LLM-as-a-Judge and code-based evaluators
+
+ Two-Phase Evaluation Architecture:
+ ===================================
+
+ Phase 1 - Golden Set Creation:
+     SMEs create datasets with (input, reference) pairs in Phoenix
+
+ Phase 2 - Automated Evaluation:
+     Step 1: Run agents → (input, agent_output)
+     Step 2: Run evaluators → (input, agent_output, reference) → scores
+
+ Evaluator Types:
+ ================
+
+ 1. LLM-as-a-Judge (uses Claude/GPT to evaluate):
+    - Compares agent output to reference
+    - Scores on multiple dimensions (correctness, completeness, etc.)
+    - Provides explanations and suggestions
+
+ 2. Code-based (deterministic evaluation):
+    - Exact match checking
+    - Field presence validation
+    - Format compliance
+
+ Usage:
+ ======
+
+ Create evaluator from schema:
+     >>> evaluator = create_evaluator_from_schema("rem-lookup-correctness")
+     >>> result = evaluator(example)
+     >>> # Returns: {"score": 0.95, "label": "correct", "explanation": "..."}
+
+ Run evaluation experiment:
+     >>> from rem.services.phoenix import PhoenixClient
+     >>> client = PhoenixClient()
+     >>> experiment = run_evaluation_experiment(
+     ...     dataset_name="rem-lookup-golden",
+     ...     task=run_agent_task,
+     ...     evaluator_schema_path="rem-lookup-correctness",
+     ...     phoenix_client=client
+     ... )
+ """
+
+ from typing import Any, Callable, TYPE_CHECKING
+ from pathlib import Path
+ import json
+ import yaml
+
+ from loguru import logger
+
+ # Lazy import to avoid Phoenix initialization at module load time
+ if TYPE_CHECKING:
+     from phoenix.evals import LLMEvaluator
+     from phoenix.client.resources.datasets import Dataset
+     from phoenix.client.resources.experiments.types import RanExperiment
+     from rem.services.phoenix import PhoenixClient
+
+ PHOENIX_AVAILABLE = None  # Lazy check on first use
+
+
+ def _check_phoenix_available() -> bool:
+     """Lazy check if Phoenix is available (only imports when needed)."""
+     global PHOENIX_AVAILABLE
+     if PHOENIX_AVAILABLE is not None:
+         return PHOENIX_AVAILABLE
+
+     try:
+         import phoenix.evals  # noqa: F401
+         PHOENIX_AVAILABLE = True
+     except ImportError:
+         PHOENIX_AVAILABLE = False
+         logger.warning("arize-phoenix package not installed - evaluator factory unavailable")
+
+     return PHOENIX_AVAILABLE
+
+
+ # =============================================================================
+ # NAME SANITIZATION
+ # =============================================================================
+
+
+ def sanitize_tool_name(tool_name: str) -> str:
+     """Sanitize tool name for Phoenix/OpenAI compatibility.
+
+     Replaces all non-alphanumeric characters with underscores to prevent
+     prompt breaking and ensure compatibility with OpenAI function calling.
+
+     Args:
+         tool_name: Original tool name (e.g., "ask_rem", "traverse-graph")
+
+     Returns:
+         Sanitized name with only alphanumeric characters and underscores
+
+     Example:
+         >>> sanitize_tool_name("ask_rem")
+         'ask_rem'
+         >>> sanitize_tool_name("traverse-graph")
+         'traverse_graph'
+         >>> sanitize_tool_name("mcp://server/tool-name")
+         'mcp___server_tool_name'
+     """
+     return "".join(c if c.isalnum() else "_" for c in tool_name)
+
+
+ # =============================================================================
+ # SCHEMA LOADING
+ # =============================================================================
+
+
+ def load_evaluator_schema(evaluator_name: str) -> dict[str, Any]:
+     """Load evaluator schema from schemas/evaluators/ directory.
+
+     Searches for evaluator schema in rem/schemas/evaluators/
+     Supports .json, .yaml, and .yml files.
+
+     Args:
+         evaluator_name: Evaluator name (with or without extension)
+             e.g., "rem-lookup-correctness" or
+             "rem-lookup-correctness.yaml"
+
+     Returns:
+         Evaluator schema dictionary with keys:
+         - description: System prompt for LLM evaluator
+         - properties: Output schema fields
+         - required: Required output fields
+         - labels: Optional labels for categorization
+         - version: Schema version
+
+     Raises:
+         FileNotFoundError: If evaluator schema not found
+
+     Example:
+         >>> schema = load_evaluator_schema("rem-lookup-correctness")
+         >>> print(schema["description"])
+     """
+     # Get schemas directory (rem/schemas/evaluators/)
+     # rem.__file__ = rem/src/rem/__init__.py
+     # We need rem/schemas/evaluators/
+     import rem
+     rem_module_dir = Path(rem.__file__).parent  # rem/src/rem
+     rem_package_root = rem_module_dir.parent.parent  # rem/src/rem -> rem/src -> rem
+     schema_dir = rem_package_root / "schemas" / "evaluators"
+
+     # Try .yaml first (preferred format)
+     yaml_path = schema_dir / f"{evaluator_name}.yaml"
+     if yaml_path.exists():
+         logger.debug(f"Loading evaluator schema from {yaml_path}")
+         with open(yaml_path) as f:
+             return yaml.safe_load(f)
+
+     # Try .yml
+     yml_path = schema_dir / f"{evaluator_name}.yml"
+     if yml_path.exists():
+         logger.debug(f"Loading evaluator schema from {yml_path}")
+         with open(yml_path) as f:
+             return yaml.safe_load(f)
+
+     # Try .json
+     json_path = schema_dir / f"{evaluator_name}.json"
+     if json_path.exists():
+         logger.debug(f"Loading evaluator schema from {json_path}")
+         with open(json_path) as f:
+             return json.load(f)
+
+     raise FileNotFoundError(
+         f"Evaluator schema not found: {evaluator_name}\n"
+         f"Searched in: {schema_dir}\n"
+         f"Supported formats: .yaml, .yml, .json"
+     )
+
+
+ # =============================================================================
+ # EVALUATOR CREATION
+ # =============================================================================
+
+
+ def create_phoenix_evaluator(
+     evaluator_schema: dict[str, Any],
+     model_name: str | None = None,
+ ) -> dict[str, Any]:
+     """Create Phoenix evaluator configuration from schema.
+
+     Args:
+         evaluator_schema: Evaluator schema dictionary
+         model_name: Optional LLM model to use (defaults to claude-sonnet-4-5)
+
+     Returns:
+         Evaluator config dict with:
+         - name: Evaluator name
+         - llm: Phoenix LLM wrapper
+         - prompt_template: System prompt
+         - schema: Output schema
+
+     Raises:
+         ImportError: If arize-phoenix not installed
+         KeyError: If required schema fields missing
+     """
+     if not _check_phoenix_available():
+         raise ImportError(
+             "arize-phoenix package required for evaluators. "
+             "Install with: pip install arize-phoenix"
+         )
+
+     # Import Phoenix after availability check
+     from phoenix.evals import OpenAIModel, AnthropicModel
+
+     logger.debug("Creating Phoenix evaluator from schema")
+
+     # Extract schema fields
+     evaluator_name = evaluator_schema.get("title", "UnnamedEvaluator")
+     system_prompt = evaluator_schema.get("description", "")
+     output_schema = evaluator_schema.get("properties", {})
+
+     if not system_prompt:
+         raise KeyError("evaluator_schema must contain 'description' field with system prompt")
+
+     # Default model (use Claude Sonnet 4.5 for evaluators)
+     if model_name is None:
+         model_name = "claude-sonnet-4-5-20250929"
+         logger.debug(f"Using default evaluator model: {model_name}")
+
+     logger.info(f"Creating Phoenix evaluator: {evaluator_name} with model={model_name}")
+
+     # Parse provider and model name
+     if ":" in model_name:
+         provider, phoenix_model_name = model_name.split(":", 1)
+     else:
+         # Detect provider from model name
+         if model_name.startswith("claude"):
+             provider = "anthropic"
+         else:
+             provider = "openai"
+         phoenix_model_name = model_name
+
+     # Create appropriate Phoenix LLM wrapper based on provider
+     llm: OpenAIModel | AnthropicModel
+     if provider.lower() == "anthropic":
+         # Anthropic models don't support top_p parameter
+         llm = AnthropicModel(
+             model=phoenix_model_name,
+             temperature=0.0,
+         )
+     else:
+         # Default to OpenAI for other providers (gpt-4, etc.)
+         llm = OpenAIModel(model=phoenix_model_name, temperature=0.0)
+
+     # Return evaluator config (not an instance - we'll use llm_classify directly)
+     evaluator_config = {
+         "name": evaluator_name,
+         "llm": llm,
+         "prompt_template": system_prompt,
+         "schema": output_schema,
+         "labels": evaluator_schema.get("labels", []),
+         "version": evaluator_schema.get("version", "1.0.0"),
+     }
+
+     logger.info(f"Phoenix evaluator '{evaluator_name}' created successfully")
+     return evaluator_config
+
+
+ def create_evaluator_from_schema(
+     evaluator_schema_path: str | Path | dict[str, Any],
+     model_name: str | None = None,
+ ) -> Callable[[Any], Any]:
+     """Create an evaluator function from a schema file or dict.
+
+     The returned evaluator is a callable that Phoenix experiments can use.
+
+     Args:
+         evaluator_schema_path: Path to schema file, evaluator name, or schema dict
+         model_name: Optional LLM model to use for evaluation
+
+     Returns:
+         Evaluator function compatible with Phoenix experiments
+
+     Raises:
+         FileNotFoundError: If schema file not found
+         ImportError: If arize-phoenix not installed
+
+     Example:
+         >>> # From evaluator name (searches in schemas/evaluators/)
+         >>> evaluator = create_evaluator_from_schema("rem-lookup-correctness")
+         >>>
+         >>> # From schema dict
+         >>> schema = {"description": "...", "properties": {...}}
+         >>> evaluator = create_evaluator_from_schema(schema)
+         >>>
+         >>> # Use in experiment
+         >>> result = evaluator({
+         ...     "input": {"query": "LOOKUP person:sarah-chen"},
+         ...     "output": {"label": "sarah-chen", "type": "person", ...},
+         ...     "expected": {"label": "sarah-chen", "type": "person", ...}
+         ... })
+     """
+     if not _check_phoenix_available():
+         raise ImportError(
+             "arize-phoenix package required for evaluators. "
+             "Install with: pip install arize-phoenix"
+         )
+
+     # Load schema if path/name provided
+     if isinstance(evaluator_schema_path, (str, Path)):
+         schema_path = Path(evaluator_schema_path)
+
+         # If it's a file path, load directly
+         if schema_path.exists():
+             logger.debug(f"Loading evaluator schema from {schema_path}")
+             if schema_path.suffix in [".yaml", ".yml"]:
+                 with open(schema_path) as f:
+                     schema = yaml.safe_load(f)
+             else:
+                 with open(schema_path) as f:
+                     schema = json.load(f)
+         else:
+             # Treat as evaluator name, search in schemas/evaluators/
+             schema = load_evaluator_schema(str(evaluator_schema_path))
+     else:
+         # Already a dict
+         schema = evaluator_schema_path
+
+     # Create evaluator config
+     evaluator_config = create_phoenix_evaluator(
+         evaluator_schema=schema,
+         model_name=model_name,
+     )
+
+     # Import llm_classify for evaluation
+     from phoenix.evals import llm_classify
+     import pandas as pd
+
+     # Wrap for Phoenix experiment compatibility
+     def evaluator_fn(example: dict[str, Any]) -> dict[str, Any]:
+         """Evaluate a single example using Phoenix llm_classify.
+
+         Args:
+             example: Dict with 'input', 'output', 'expected' keys
+                 - input: Agent input dict (e.g., {"query": "LOOKUP person:sarah-chen"})
+                 - output: Agent output dict (what the agent returned)
+                 - expected: Expected output dict (ground truth from dataset)
+
+         Returns:
+             Evaluation result with score, label, explanation
+         """
+         logger.debug(f"Evaluating example: {str(example.get('input', ''))[:100]}...")
+
+         # Phoenix llm_classify() expects a flat dict with string values
+         # Build evaluation input by flattening nested dicts
+         eval_input = {}
+
+         # Extract and flatten input fields
+         input_data = example.get("input", {})
+         if isinstance(input_data, dict):
+             for key, value in input_data.items():
+                 eval_input[f"input_{key}"] = str(value) if value is not None else ""
+         else:
+             eval_input["input"] = str(input_data) if input_data is not None else ""
+
+         # Extract and flatten agent output fields
+         output_data = example.get("output", {})
+         if isinstance(output_data, dict):
+             for key, value in output_data.items():
+                 eval_input[f"output_{key}"] = str(value) if value is not None else ""
+         else:
+             eval_input["output"] = str(output_data) if output_data is not None else ""
+
+         # Extract and flatten expected fields (reference/ground truth)
+         expected_data = example.get("expected", {})
+         if isinstance(expected_data, dict):
+             for key, value in expected_data.items():
+                 eval_input[f"expected_{key}"] = str(value) if value is not None else ""
+         elif expected_data:
+             eval_input["expected"] = str(expected_data)
+
+         try:
+             # Create single-row DataFrame for llm_classify
+             df = pd.DataFrame([eval_input])
+
+             # Call Phoenix llm_classify
+             results_df = llm_classify(
+                 dataframe=df,
+                 model=evaluator_config["llm"],
+                 template=evaluator_config["prompt_template"],
+                 rails=["correct", "partial", "incorrect"],  # Common labels
+                 provide_explanation=True,
+             )
+
+             # Extract result
+             if not results_df.empty:
+                 row = results_df.iloc[0]
+                 label = row.get("label", "error")
+                 explanation = row.get("explanation", "")
+
+                 # Map labels to scores
+                 score_map = {"correct": 1.0, "partial": 0.5, "incorrect": 0.0}
+                 score = score_map.get(label, 0.0)
+
+                 return {
+                     "label": label,
+                     "score": score,
+                     "explanation": explanation or "",
+                 }
+             else:
+                 logger.warning("llm_classify returned empty DataFrame")
+                 return {
+                     "label": "error",
+                     "score": 0.0,
+                     "explanation": "Evaluator returned empty result",
+                 }
+
+         except Exception as e:
+             logger.error(f"Evaluator error: {e}")
+             return {
+                 "label": "error",
+                 "score": 0.0,
+                 "explanation": f"Evaluator failed: {str(e)}",
+             }
+
+     return evaluator_fn
+
+
+ def schema_to_prompt(
+     schema: dict[str, Any],
+     schema_type: str = "evaluator",
+     model_name: str = "gpt-4.1",
+ ) -> dict[str, Any]:
+     """Convert agent or evaluator schema to complete Phoenix openai_params.
+
+     Converts REM schema format to Phoenix PromptVersion.from_openai() format,
+     including messages, response_format, and tools (for agents).
+
+     Args:
+         schema: Schema dictionary (from load_evaluator_schema or agent schema)
+         schema_type: Type of schema - "agent" or "evaluator"
+         model_name: Model name for the prompt
+
+     Returns:
+         Complete openai_params dict ready for PromptVersion.from_openai()
+         Contains: model, messages, response_format, tools (for agents)
+
+     Example:
+         >>> schema = load_evaluator_schema("rem-lookup-correctness")
+         >>> openai_params = schema_to_prompt(schema, schema_type="evaluator")
+         >>> # Use with Phoenix: PromptVersion.from_openai(openai_params)
+     """
+     system_prompt = schema.get("description", "")
+     properties = schema.get("properties", {})
+     required = schema.get("required", [])
+
+     # Extract tool definitions and convert to OpenAI format (for agents)
+     tool_definitions = []  # For metadata YAML
+     openai_tools = []  # For Phoenix tools parameter
+
+     if schema_type == "agent":
+         json_schema_extra = schema.get("json_schema_extra", {})
+         tools = json_schema_extra.get("tools", [])
+
+         for tool in tools:
+             # Keep metadata format for YAML section
+             tool_def = {
+                 "mcp_server": tool.get("mcp_server"),
+                 "tool_name": tool.get("tool_name"),
+                 "usage": tool.get("usage", ""),
+             }
+             tool_definitions.append(tool_def)
+
+             # Convert to OpenAI function calling format
+             # Sanitize tool name to prevent prompt breaking
+             tool_name = tool.get("tool_name", "")
+             sanitized_name = sanitize_tool_name(tool_name)
+
+             openai_tool = {
+                 "type": "function",
+                 "function": {
+                     "name": sanitized_name,
+                     "description": tool.get("usage", "MCP tool"),
+                     "parameters": {
+                         "type": "object",
+                         "properties": {},
+                         "required": []
+                     }
+                 }
+             }
+             openai_tools.append(openai_tool)
+
+     # Build schema metadata section
+     info_key = "agent_info" if schema_type == "agent" else "evaluator_info"
+     schema_metadata = {
+         info_key: {
+             "version": schema.get("version", "1.0.0"),
+             "title": schema.get("title", ""),
+         },
+         "output_schema": {
+             "description": f"Structured output returned by this {schema_type}",
+             "properties": {
+                 k: {
+                     "type": v.get("type", "unknown"),
+                     "description": v.get("description", ""),
+                 }
+                 for k, v in properties.items()
+             },
+             "required": required,
+         },
+     }
+
+     # Add tool definitions for agents
+     if tool_definitions:
+         schema_metadata["tools"] = {
+             "description": "MCP tools available to this agent",
+             "tool_definitions": tool_definitions,
+         }
+
+     # Add input format for evaluators
+     if schema_type == "evaluator":
+         schema_metadata["input_format"] = {
+             "description": "Evaluators receive dataset examples with 'input' and 'output' fields",
+             "structure": {
+                 "input": "dict[str, Any] - What the agent receives (e.g., {'query': '...'})",
+                 "output": "dict[str, Any] - Expected/ground truth (e.g., {'label': '...'})",
+                 "metadata": "dict[str, Any] - Optional metadata (e.g., {'difficulty': 'medium'})",
+             },
+         }
+
+     # Append schema metadata to system prompt
+     schema_yaml = yaml.dump(schema_metadata, default_flow_style=False, sort_keys=False)
+     schema_section = f"\n\n---\n\n## Schema Metadata\n\n```yaml\n{schema_yaml}```"
+     system_prompt = system_prompt + schema_section
+
+     # Create structured template
+     user_content = "{{input}}" if schema_type == "agent" else "Question: {{input}}\nAgent's Answer: {{output}}"
+
+     template_messages = [
+         {"role": "system", "content": system_prompt},
+         {"role": "user", "content": user_content}
+     ]
+
+     # Build response format
+     response_format = {
+         "type": "json_schema",
+         "json_schema": {
+             "name": schema.get("title", ""),
+             "schema": {
+                 "type": "object",
+                 "properties": properties,
+                 "required": required,
+                 "additionalProperties": False
+             },
+             "strict": True
+         }
+     }
+
+     # Build complete openai_params dict ready for PromptVersion.from_openai()
+     openai_params: dict[str, Any] = {
+         "model": model_name,
+         "messages": template_messages,
+         "response_format": response_format,
+     }
+
+     # Add tools for agents (OpenAI function calling format)
+     if openai_tools:
+         openai_params["tools"] = openai_tools
+
+     return openai_params
+
+
+ # =============================================================================
+ # EXPERIMENT WORKFLOWS
+ # =============================================================================
+
+
+ def run_evaluation_experiment(
+     dataset_name: str,
+     task: Callable[[Any], Any] | None = None,
+     evaluator_schema_path: str | Path | dict[str, Any] | None = None,
+     experiment_name: str | None = None,
+     experiment_description: str | None = None,
+     phoenix_client: "PhoenixClient | None" = None,
+     model_name: str | None = None,
+ ) -> "RanExperiment":
+     """Run a complete evaluation experiment using Phoenix.
+
+     High-level workflow that:
+     1. Loads dataset from Phoenix
+     2. Optionally runs task (agent) on dataset
+     3. Optionally runs evaluators on results
+     4. Tracks results in Phoenix UI
+
+     Args:
+         dataset_name: Name of dataset in Phoenix
+         task: Optional task function (agent) to run on dataset
+         evaluator_schema_path: Optional evaluator schema path/name/dict
+         experiment_name: Name for this experiment
+         experiment_description: Description of experiment
+         phoenix_client: Optional PhoenixClient (auto-creates if not provided)
+         model_name: LLM model for evaluation
+
+     Returns:
+         RanExperiment with results and metrics
+
+     Example - Agent Run Only:
+         >>> experiment = run_evaluation_experiment(
+         ...     dataset_name="rem-lookup-golden",
+         ...     task=run_agent_task,
+         ...     experiment_name="rem-v1-baseline"
+         ... )
+
+     Example - Agent + Evaluator:
+         >>> experiment = run_evaluation_experiment(
+         ...     dataset_name="rem-lookup-golden",
+         ...     task=run_agent_task,
+         ...     evaluator_schema_path="rem-lookup-correctness",
+         ...     experiment_name="rem-v1-full-eval"
+         ... )
+
+     Example - Evaluator Only (on existing results):
+         >>> experiment = run_evaluation_experiment(
+         ...     dataset_name="rem-v1-results",
+         ...     evaluator_schema_path="rem-lookup-correctness",
+         ...     experiment_name="rem-v1-scoring"
+         ... )
+     """
+     # Create Phoenix client if not provided
+     if phoenix_client is None:
+         from rem.services.phoenix import PhoenixClient
+         phoenix_client = PhoenixClient()
+
+     # Load dataset
+     logger.info(f"Loading dataset: {dataset_name}")
+     dataset = phoenix_client.get_dataset(dataset_name)
+
+     # Create evaluator if schema provided
+     evaluators = []
+     if evaluator_schema_path:
+         logger.info(f"Creating evaluator from schema: {evaluator_schema_path}")
+         evaluator = create_evaluator_from_schema(
+             evaluator_schema_path=evaluator_schema_path,
+             model_name=model_name,
+         )
+         evaluators.append(evaluator)
+
+     # Run experiment
+     logger.info(f"Running experiment: {experiment_name or 'unnamed'}")
+     experiment = phoenix_client.run_experiment(
+         dataset=dataset,
+         task=task,
+         evaluators=evaluators if evaluators else None,
+         experiment_name=experiment_name,
+         experiment_description=experiment_description,
+     )
+
+     logger.success(
+         f"Experiment complete. View results: {experiment.url if hasattr(experiment, 'url') else 'N/A'}"  # type: ignore[attr-defined]
+     )
+
+     return experiment
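
For orientation, the workflow described by the docstrings above fits together roughly as follows. This is an illustrative sketch, not part of the packaged file: run_agent_task is a hypothetical stand-in for an agent task function, and the dataset and evaluator names are taken from the docstring examples.

    # Sketch only; assumes a reachable Phoenix instance and a dataset named
    # "rem-lookup-golden". run_agent_task is a hypothetical placeholder.
    from rem.agentic.providers.phoenix import (
        create_evaluator_from_schema,
        run_evaluation_experiment,
    )
    from rem.services.phoenix import PhoenixClient

    def run_agent_task(example: dict) -> dict:
        # Placeholder: call the agent here and return its structured output.
        return {"label": "sarah-chen", "type": "person"}

    # Score a single example with an LLM-as-a-Judge evaluator
    # (example shape taken from the module docstring).
    evaluator = create_evaluator_from_schema("rem-lookup-correctness")
    result = evaluator({
        "input": {"query": "LOOKUP person:sarah-chen"},
        "output": {"label": "sarah-chen", "type": "person"},
        "expected": {"label": "sarah-chen", "type": "person"},
    })
    # result is e.g. {"label": "correct", "score": 1.0, "explanation": "..."}

    # End-to-end: run the agent over a Phoenix dataset, then score it with the evaluator.
    experiment = run_evaluation_experiment(
        dataset_name="rem-lookup-golden",
        task=run_agent_task,
        evaluator_schema_path="rem-lookup-correctness",
        experiment_name="rem-v1-full-eval",
        phoenix_client=PhoenixClient(),
    )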