remdb-0.3.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rem/__init__.py +2 -0
- rem/agentic/README.md +650 -0
- rem/agentic/__init__.py +39 -0
- rem/agentic/agents/README.md +155 -0
- rem/agentic/agents/__init__.py +8 -0
- rem/agentic/context.py +148 -0
- rem/agentic/context_builder.py +329 -0
- rem/agentic/mcp/__init__.py +0 -0
- rem/agentic/mcp/tool_wrapper.py +107 -0
- rem/agentic/otel/__init__.py +5 -0
- rem/agentic/otel/setup.py +151 -0
- rem/agentic/providers/phoenix.py +674 -0
- rem/agentic/providers/pydantic_ai.py +572 -0
- rem/agentic/query.py +117 -0
- rem/agentic/query_helper.py +89 -0
- rem/agentic/schema.py +396 -0
- rem/agentic/serialization.py +245 -0
- rem/agentic/tools/__init__.py +5 -0
- rem/agentic/tools/rem_tools.py +231 -0
- rem/api/README.md +420 -0
- rem/api/main.py +324 -0
- rem/api/mcp_router/prompts.py +182 -0
- rem/api/mcp_router/resources.py +536 -0
- rem/api/mcp_router/server.py +213 -0
- rem/api/mcp_router/tools.py +584 -0
- rem/api/routers/auth.py +229 -0
- rem/api/routers/chat/__init__.py +5 -0
- rem/api/routers/chat/completions.py +281 -0
- rem/api/routers/chat/json_utils.py +76 -0
- rem/api/routers/chat/models.py +124 -0
- rem/api/routers/chat/streaming.py +185 -0
- rem/auth/README.md +258 -0
- rem/auth/__init__.py +26 -0
- rem/auth/middleware.py +100 -0
- rem/auth/providers/__init__.py +13 -0
- rem/auth/providers/base.py +376 -0
- rem/auth/providers/google.py +163 -0
- rem/auth/providers/microsoft.py +237 -0
- rem/cli/README.md +455 -0
- rem/cli/__init__.py +8 -0
- rem/cli/commands/README.md +126 -0
- rem/cli/commands/__init__.py +3 -0
- rem/cli/commands/ask.py +566 -0
- rem/cli/commands/configure.py +497 -0
- rem/cli/commands/db.py +493 -0
- rem/cli/commands/dreaming.py +324 -0
- rem/cli/commands/experiments.py +1302 -0
- rem/cli/commands/mcp.py +66 -0
- rem/cli/commands/process.py +245 -0
- rem/cli/commands/schema.py +183 -0
- rem/cli/commands/serve.py +106 -0
- rem/cli/dreaming.py +363 -0
- rem/cli/main.py +96 -0
- rem/config.py +237 -0
- rem/mcp_server.py +41 -0
- rem/models/core/__init__.py +49 -0
- rem/models/core/core_model.py +64 -0
- rem/models/core/engram.py +333 -0
- rem/models/core/experiment.py +628 -0
- rem/models/core/inline_edge.py +132 -0
- rem/models/core/rem_query.py +243 -0
- rem/models/entities/__init__.py +43 -0
- rem/models/entities/file.py +57 -0
- rem/models/entities/image_resource.py +88 -0
- rem/models/entities/message.py +35 -0
- rem/models/entities/moment.py +123 -0
- rem/models/entities/ontology.py +191 -0
- rem/models/entities/ontology_config.py +131 -0
- rem/models/entities/resource.py +95 -0
- rem/models/entities/schema.py +87 -0
- rem/models/entities/user.py +85 -0
- rem/py.typed +0 -0
- rem/schemas/README.md +507 -0
- rem/schemas/__init__.py +6 -0
- rem/schemas/agents/README.md +92 -0
- rem/schemas/agents/core/moment-builder.yaml +178 -0
- rem/schemas/agents/core/rem-query-agent.yaml +226 -0
- rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
- rem/schemas/agents/core/simple-assistant.yaml +19 -0
- rem/schemas/agents/core/user-profile-builder.yaml +163 -0
- rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
- rem/schemas/agents/examples/contract-extractor.yaml +134 -0
- rem/schemas/agents/examples/cv-parser.yaml +263 -0
- rem/schemas/agents/examples/hello-world.yaml +37 -0
- rem/schemas/agents/examples/query.yaml +54 -0
- rem/schemas/agents/examples/simple.yaml +21 -0
- rem/schemas/agents/examples/test.yaml +29 -0
- rem/schemas/agents/rem.yaml +128 -0
- rem/schemas/evaluators/hello-world/default.yaml +77 -0
- rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
- rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
- rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
- rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
- rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
- rem/services/__init__.py +16 -0
- rem/services/audio/INTEGRATION.md +308 -0
- rem/services/audio/README.md +376 -0
- rem/services/audio/__init__.py +15 -0
- rem/services/audio/chunker.py +354 -0
- rem/services/audio/transcriber.py +259 -0
- rem/services/content/README.md +1269 -0
- rem/services/content/__init__.py +5 -0
- rem/services/content/providers.py +801 -0
- rem/services/content/service.py +676 -0
- rem/services/dreaming/README.md +230 -0
- rem/services/dreaming/__init__.py +53 -0
- rem/services/dreaming/affinity_service.py +336 -0
- rem/services/dreaming/moment_service.py +264 -0
- rem/services/dreaming/ontology_service.py +54 -0
- rem/services/dreaming/user_model_service.py +297 -0
- rem/services/dreaming/utils.py +39 -0
- rem/services/embeddings/__init__.py +11 -0
- rem/services/embeddings/api.py +120 -0
- rem/services/embeddings/worker.py +421 -0
- rem/services/fs/README.md +662 -0
- rem/services/fs/__init__.py +62 -0
- rem/services/fs/examples.py +206 -0
- rem/services/fs/examples_paths.py +204 -0
- rem/services/fs/git_provider.py +935 -0
- rem/services/fs/local_provider.py +760 -0
- rem/services/fs/parsing-hooks-examples.md +172 -0
- rem/services/fs/paths.py +276 -0
- rem/services/fs/provider.py +460 -0
- rem/services/fs/s3_provider.py +1042 -0
- rem/services/fs/service.py +186 -0
- rem/services/git/README.md +1075 -0
- rem/services/git/__init__.py +17 -0
- rem/services/git/service.py +469 -0
- rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
- rem/services/phoenix/README.md +453 -0
- rem/services/phoenix/__init__.py +46 -0
- rem/services/phoenix/client.py +686 -0
- rem/services/phoenix/config.py +88 -0
- rem/services/phoenix/prompt_labels.py +477 -0
- rem/services/postgres/README.md +575 -0
- rem/services/postgres/__init__.py +23 -0
- rem/services/postgres/migration_service.py +427 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +232 -0
- rem/services/postgres/register_type.py +352 -0
- rem/services/postgres/repository.py +337 -0
- rem/services/postgres/schema_generator.py +379 -0
- rem/services/postgres/service.py +802 -0
- rem/services/postgres/sql_builder.py +354 -0
- rem/services/rem/README.md +304 -0
- rem/services/rem/__init__.py +23 -0
- rem/services/rem/exceptions.py +71 -0
- rem/services/rem/executor.py +293 -0
- rem/services/rem/parser.py +145 -0
- rem/services/rem/queries.py +196 -0
- rem/services/rem/query.py +371 -0
- rem/services/rem/service.py +527 -0
- rem/services/session/README.md +374 -0
- rem/services/session/__init__.py +6 -0
- rem/services/session/compression.py +360 -0
- rem/services/session/reload.py +77 -0
- rem/settings.py +1235 -0
- rem/sql/002_install_models.sql +1068 -0
- rem/sql/background_indexes.sql +42 -0
- rem/sql/install_models.sql +1038 -0
- rem/sql/migrations/001_install.sql +503 -0
- rem/sql/migrations/002_install_models.sql +1202 -0
- rem/utils/AGENTIC_CHUNKING.md +597 -0
- rem/utils/README.md +583 -0
- rem/utils/__init__.py +43 -0
- rem/utils/agentic_chunking.py +622 -0
- rem/utils/batch_ops.py +343 -0
- rem/utils/chunking.py +108 -0
- rem/utils/clip_embeddings.py +276 -0
- rem/utils/dict_utils.py +98 -0
- rem/utils/embeddings.py +423 -0
- rem/utils/examples/embeddings_example.py +305 -0
- rem/utils/examples/sql_types_example.py +202 -0
- rem/utils/markdown.py +16 -0
- rem/utils/model_helpers.py +236 -0
- rem/utils/schema_loader.py +336 -0
- rem/utils/sql_types.py +348 -0
- rem/utils/user_id.py +81 -0
- rem/utils/vision.py +330 -0
- rem/workers/README.md +506 -0
- rem/workers/__init__.py +5 -0
- rem/workers/dreaming.py +502 -0
- rem/workers/engram_processor.py +312 -0
- rem/workers/sqs_file_processor.py +193 -0
- remdb-0.3.7.dist-info/METADATA +1473 -0
- remdb-0.3.7.dist-info/RECORD +187 -0
- remdb-0.3.7.dist-info/WHEEL +4 -0
- remdb-0.3.7.dist-info/entry_points.txt +2 -0
rem/agentic/providers/phoenix.py
@@ -0,0 +1,674 @@
"""Phoenix evaluator provider for REM agents.

This module provides factory functions for creating Phoenix-compatible evaluators
from schema definitions, following the same pattern as Pydantic AI agent creation.

Exported Functions:
===================
- load_evaluator_schema: Load evaluator schemas from schemas/evaluators/
- create_phoenix_evaluator: Create Phoenix evaluator config from schema
- create_evaluator_from_schema: Create callable evaluator function
- schema_to_prompt: Convert schema to Phoenix openai_params format
- sanitize_tool_name: Sanitize tool names for Phoenix/OpenAI compatibility
- run_evaluation_experiment: Run complete evaluation workflow

Design Pattern (mirrors Pydantic AI provider):
==============================================
1. Load evaluator schemas from schemas/evaluators/ directory
2. Extract system prompt, output schema, and metadata
3. Create Phoenix-compatible evaluator functions
4. Support both LLM-as-a-Judge and code-based evaluators

Two-Phase Evaluation Architecture:
===================================

Phase 1 - Golden Set Creation:
    SMEs create datasets with (input, reference) pairs in Phoenix

Phase 2 - Automated Evaluation:
    Step 1: Run agents → (input, agent_output)
    Step 2: Run evaluators → (input, agent_output, reference) → scores

Evaluator Types:
================

1. LLM-as-a-Judge (uses Claude/GPT to evaluate):
   - Compares agent output to reference
   - Scores on multiple dimensions (correctness, completeness, etc.)
   - Provides explanations and suggestions

2. Code-based (deterministic evaluation):
   - Exact match checking
   - Field presence validation
   - Format compliance

Usage:
======

Create evaluator from schema:
    >>> evaluator = create_evaluator_from_schema("rem-lookup-correctness")
    >>> result = evaluator(example)
    >>> # Returns: {"score": 0.95, "label": "correct", "explanation": "..."}

Run evaluation experiment:
    >>> from rem.services.phoenix import PhoenixClient
    >>> client = PhoenixClient()
    >>> experiment = run_evaluation_experiment(
    ...     dataset_name="rem-lookup-golden",
    ...     task=run_agent_task,
    ...     evaluator_schema_path="rem-lookup-correctness",
    ...     phoenix_client=client
    ... )
"""

from typing import Any, Callable, TYPE_CHECKING
from pathlib import Path
import json
import yaml

from loguru import logger

# Lazy import to avoid Phoenix initialization at module load time
if TYPE_CHECKING:
    from phoenix.evals import LLMEvaluator
    from phoenix.client.resources.datasets import Dataset
    from phoenix.client.resources.experiments.types import RanExperiment
    from rem.services.phoenix import PhoenixClient

PHOENIX_AVAILABLE = None  # Lazy check on first use


def _check_phoenix_available() -> bool:
    """Lazy check if Phoenix is available (only imports when needed)."""
    global PHOENIX_AVAILABLE
    if PHOENIX_AVAILABLE is not None:
        return PHOENIX_AVAILABLE

    try:
        import phoenix.evals  # noqa: F401
        PHOENIX_AVAILABLE = True
    except ImportError:
        PHOENIX_AVAILABLE = False
        logger.warning("arize-phoenix package not installed - evaluator factory unavailable")

    return PHOENIX_AVAILABLE


# =============================================================================
# NAME SANITIZATION
# =============================================================================


def sanitize_tool_name(tool_name: str) -> str:
    """Sanitize tool name for Phoenix/OpenAI compatibility.

    Replaces all non-alphanumeric characters with underscores to prevent
    prompt breaking and ensure compatibility with OpenAI function calling.

    Args:
        tool_name: Original tool name (e.g., "ask_rem", "traverse-graph")

    Returns:
        Sanitized name with only alphanumeric characters and underscores

    Example:
        >>> sanitize_tool_name("ask_rem")
        'ask_rem'
        >>> sanitize_tool_name("traverse-graph")
        'traverse_graph'
        >>> sanitize_tool_name("mcp://server/tool-name")
        'mcp___server_tool_name'
    """
    return "".join(c if c.isalnum() else "_" for c in tool_name)


# =============================================================================
# SCHEMA LOADING
# =============================================================================


def load_evaluator_schema(evaluator_name: str) -> dict[str, Any]:
    """Load evaluator schema from schemas/evaluators/ directory.

    Searches for evaluator schema in rem/schemas/evaluators/
    Supports .json, .yaml, and .yml files.

    Args:
        evaluator_name: Evaluator name (with or without extension)
            e.g., "rem-lookup-correctness" or
            "rem-lookup-correctness.yaml"

    Returns:
        Evaluator schema dictionary with keys:
        - description: System prompt for LLM evaluator
        - properties: Output schema fields
        - required: Required output fields
        - labels: Optional labels for categorization
        - version: Schema version

    Raises:
        FileNotFoundError: If evaluator schema not found

    Example:
        >>> schema = load_evaluator_schema("rem-lookup-correctness")
        >>> print(schema["description"])
    """
    # Get schemas directory (rem/schemas/evaluators/)
    # rem.__file__ = rem/src/rem/__init__.py
    # We need rem/schemas/evaluators/
    import rem
    rem_module_dir = Path(rem.__file__).parent  # rem/src/rem
    rem_package_root = rem_module_dir.parent.parent  # rem/src/rem -> rem/src -> rem
    schema_dir = rem_package_root / "schemas" / "evaluators"

    # Try .yaml first (preferred format)
    yaml_path = schema_dir / f"{evaluator_name}.yaml"
    if yaml_path.exists():
        logger.debug(f"Loading evaluator schema from {yaml_path}")
        with open(yaml_path) as f:
            return yaml.safe_load(f)

    # Try .yml
    yml_path = schema_dir / f"{evaluator_name}.yml"
    if yml_path.exists():
        logger.debug(f"Loading evaluator schema from {yml_path}")
        with open(yml_path) as f:
            return yaml.safe_load(f)

    # Try .json
    json_path = schema_dir / f"{evaluator_name}.json"
    if json_path.exists():
        logger.debug(f"Loading evaluator schema from {json_path}")
        with open(json_path) as f:
            return json.load(f)

    raise FileNotFoundError(
        f"Evaluator schema not found: {evaluator_name}\n"
        f"Searched in: {schema_dir}\n"
        f"Supported formats: .yaml, .yml, .json"
    )


# =============================================================================
# EVALUATOR CREATION
# =============================================================================


def create_phoenix_evaluator(
    evaluator_schema: dict[str, Any],
    model_name: str | None = None,
) -> dict[str, Any]:
    """Create Phoenix evaluator configuration from schema.

    Args:
        evaluator_schema: Evaluator schema dictionary
        model_name: Optional LLM model to use (defaults to claude-sonnet-4-5)

    Returns:
        Evaluator config dict with:
        - name: Evaluator name
        - llm: Phoenix LLM wrapper
        - prompt_template: System prompt
        - schema: Output schema

    Raises:
        ImportError: If arize-phoenix not installed
        KeyError: If required schema fields missing
    """
    if not _check_phoenix_available():
        raise ImportError(
            "arize-phoenix package required for evaluators. "
            "Install with: pip install arize-phoenix"
        )

    # Import Phoenix after availability check
    from phoenix.evals import OpenAIModel, AnthropicModel

    logger.debug("Creating Phoenix evaluator from schema")

    # Extract schema fields
    evaluator_name = evaluator_schema.get("title", "UnnamedEvaluator")
    system_prompt = evaluator_schema.get("description", "")
    output_schema = evaluator_schema.get("properties", {})

    if not system_prompt:
        raise KeyError("evaluator_schema must contain 'description' field with system prompt")

    # Default model (use Claude Sonnet 4.5 for evaluators)
    if model_name is None:
        model_name = "claude-sonnet-4-5-20250929"
        logger.debug(f"Using default evaluator model: {model_name}")

    logger.info(f"Creating Phoenix evaluator: {evaluator_name} with model={model_name}")

    # Parse provider and model name
    if ":" in model_name:
        provider, phoenix_model_name = model_name.split(":", 1)
    else:
        # Detect provider from model name
        if model_name.startswith("claude"):
            provider = "anthropic"
        else:
            provider = "openai"
        phoenix_model_name = model_name

    # Create appropriate Phoenix LLM wrapper based on provider
    llm: OpenAIModel | AnthropicModel
    if provider.lower() == "anthropic":
        # Anthropic models don't support top_p parameter
        llm = AnthropicModel(
            model=phoenix_model_name,
            temperature=0.0,
        )
    else:
        # Default to OpenAI for other providers (gpt-4, etc.)
        llm = OpenAIModel(model=phoenix_model_name, temperature=0.0)

    # Return evaluator config (not an instance - we'll use llm_classify directly)
    evaluator_config = {
        "name": evaluator_name,
        "llm": llm,
        "prompt_template": system_prompt,
        "schema": output_schema,
        "labels": evaluator_schema.get("labels", []),
        "version": evaluator_schema.get("version", "1.0.0"),
    }

    logger.info(f"Phoenix evaluator '{evaluator_name}' created successfully")
    return evaluator_config


def create_evaluator_from_schema(
    evaluator_schema_path: str | Path | dict[str, Any],
    model_name: str | None = None,
) -> Callable[[Any], Any]:
    """Create an evaluator function from a schema file or dict.

    The returned evaluator is a callable that Phoenix experiments can use.

    Args:
        evaluator_schema_path: Path to schema file, evaluator name, or schema dict
        model_name: Optional LLM model to use for evaluation

    Returns:
        Evaluator function compatible with Phoenix experiments

    Raises:
        FileNotFoundError: If schema file not found
        ImportError: If arize-phoenix not installed

    Example:
        >>> # From evaluator name (searches in schemas/evaluators/)
        >>> evaluator = create_evaluator_from_schema("rem-lookup-correctness")
        >>>
        >>> # From schema dict
        >>> schema = {"description": "...", "properties": {...}}
        >>> evaluator = create_evaluator_from_schema(schema)
        >>>
        >>> # Use in experiment
        >>> result = evaluator({
        ...     "input": {"query": "LOOKUP person:sarah-chen"},
        ...     "output": {"label": "sarah-chen", "type": "person", ...},
        ...     "expected": {"label": "sarah-chen", "type": "person", ...}
        ... })
    """
    if not _check_phoenix_available():
        raise ImportError(
            "arize-phoenix package required for evaluators. "
            "Install with: pip install arize-phoenix"
        )

    # Load schema if path/name provided
    if isinstance(evaluator_schema_path, (str, Path)):
        schema_path = Path(evaluator_schema_path)

        # If it's a file path, load directly
        if schema_path.exists():
            logger.debug(f"Loading evaluator schema from {schema_path}")
            if schema_path.suffix in [".yaml", ".yml"]:
                with open(schema_path) as f:
                    schema = yaml.safe_load(f)
            else:
                with open(schema_path) as f:
                    schema = json.load(f)
        else:
            # Treat as evaluator name, search in schemas/evaluators/
            schema = load_evaluator_schema(str(evaluator_schema_path))
    else:
        # Already a dict
        schema = evaluator_schema_path

    # Create evaluator config
    evaluator_config = create_phoenix_evaluator(
        evaluator_schema=schema,
        model_name=model_name,
    )

    # Import llm_classify for evaluation
    from phoenix.evals import llm_classify
    import pandas as pd

    # Wrap for Phoenix experiment compatibility
    def evaluator_fn(example: dict[str, Any]) -> dict[str, Any]:
        """Evaluate a single example using Phoenix llm_classify.

        Args:
            example: Dict with 'input', 'output', 'expected' keys
                - input: Agent input dict (e.g., {"query": "LOOKUP person:sarah-chen"})
                - output: Agent output dict (what the agent returned)
                - expected: Expected output dict (ground truth from dataset)

        Returns:
            Evaluation result with score, label, explanation
        """
        logger.debug(f"Evaluating example: {example.get('input', '')[:100]}...")

        # Phoenix llm_classify() expects a flat dict with string values
        # Build evaluation input by flattening nested dicts
        eval_input = {}

        # Extract and flatten input fields
        input_data = example.get("input", {})
        if isinstance(input_data, dict):
            for key, value in input_data.items():
                eval_input[f"input_{key}"] = str(value) if value is not None else ""
        else:
            eval_input["input"] = str(input_data) if input_data is not None else ""

        # Extract and flatten agent output fields
        output_data = example.get("output", {})
        if isinstance(output_data, dict):
            for key, value in output_data.items():
                eval_input[f"output_{key}"] = str(value) if value is not None else ""
        else:
            eval_input["output"] = str(output_data) if output_data is not None else ""

        # Extract and flatten expected fields (reference/ground truth)
        expected_data = example.get("expected", {})
        if isinstance(expected_data, dict):
            for key, value in expected_data.items():
                eval_input[f"expected_{key}"] = str(value) if value is not None else ""
        elif expected_data:
            eval_input["expected"] = str(expected_data)

        try:
            # Create single-row DataFrame for llm_classify
            df = pd.DataFrame([eval_input])

            # Call Phoenix llm_classify
            results_df = llm_classify(
                dataframe=df,
                model=evaluator_config["llm"],
                template=evaluator_config["prompt_template"],
                rails=["correct", "partial", "incorrect"],  # Common labels
                provide_explanation=True,
            )

            # Extract result
            if not results_df.empty:
                row = results_df.iloc[0]
                label = row.get("label", "error")
                explanation = row.get("explanation", "")

                # Map labels to scores
                score_map = {"correct": 1.0, "partial": 0.5, "incorrect": 0.0}
                score = score_map.get(label, 0.0)

                return {
                    "label": label,
                    "score": score,
                    "explanation": explanation or "",
                }
            else:
                logger.warning("llm_classify returned empty DataFrame")
                return {
                    "label": "error",
                    "score": 0.0,
                    "explanation": "Evaluator returned empty result",
                }

        except Exception as e:
            logger.error(f"Evaluator error: {e}")
            return {
                "label": "error",
                "score": 0.0,
                "explanation": f"Evaluator failed: {str(e)}",
            }

    return evaluator_fn


def schema_to_prompt(
    schema: dict[str, Any],
    schema_type: str = "evaluator",
    model_name: str = "gpt-4.1",
) -> dict[str, Any]:
    """Convert agent or evaluator schema to complete Phoenix openai_params.

    Converts REM schema format to Phoenix PromptVersion.from_openai() format,
    including messages, response_format, and tools (for agents).

    Args:
        schema: Schema dictionary (from load_evaluator_schema or agent schema)
        schema_type: Type of schema - "agent" or "evaluator"
        model_name: Model name for the prompt

    Returns:
        Complete openai_params dict ready for PromptVersion.from_openai()
        Contains: model, messages, response_format, tools (for agents)

    Example:
        >>> schema = load_evaluator_schema("rem-lookup-correctness")
        >>> openai_params = schema_to_prompt(schema, schema_type="evaluator")
        >>> # Use with Phoenix: PromptVersion.from_openai(openai_params)
    """
    system_prompt = schema.get("description", "")
    properties = schema.get("properties", {})
    required = schema.get("required", [])

    # Extract tool definitions and convert to OpenAI format (for agents)
    tool_definitions = []  # For metadata YAML
    openai_tools = []  # For Phoenix tools parameter

    if schema_type == "agent":
        json_schema_extra = schema.get("json_schema_extra", {})
        tools = json_schema_extra.get("tools", [])

        for tool in tools:
            # Keep metadata format for YAML section
            tool_def = {
                "mcp_server": tool.get("mcp_server"),
                "tool_name": tool.get("tool_name"),
                "usage": tool.get("usage", ""),
            }
            tool_definitions.append(tool_def)

            # Convert to OpenAI function calling format
            # Sanitize tool name to prevent prompt breaking
            tool_name = tool.get("tool_name", "")
            sanitized_name = sanitize_tool_name(tool_name)

            openai_tool = {
                "type": "function",
                "function": {
                    "name": sanitized_name,
                    "description": tool.get("usage", "MCP tool"),
                    "parameters": {
                        "type": "object",
                        "properties": {},
                        "required": []
                    }
                }
            }
            openai_tools.append(openai_tool)

    # Build schema metadata section
    info_key = "agent_info" if schema_type == "agent" else "evaluator_info"
    schema_metadata = {
        info_key: {
            "version": schema.get("version", "1.0.0"),
            "title": schema.get("title", ""),
        },
        "output_schema": {
            "description": f"Structured output returned by this {schema_type}",
            "properties": {
                k: {
                    "type": v.get("type", "unknown"),
                    "description": v.get("description", ""),
                }
                for k, v in properties.items()
            },
            "required": required,
        },
    }

    # Add tool definitions for agents
    if tool_definitions:
        schema_metadata["tools"] = {
            "description": "MCP tools available to this agent",
            "tool_definitions": tool_definitions,
        }

    # Add input format for evaluators
    if schema_type == "evaluator":
        schema_metadata["input_format"] = {
            "description": "Evaluators receive dataset examples with 'input' and 'output' fields",
            "structure": {
                "input": "dict[str, Any] - What the agent receives (e.g., {'query': '...'})",
                "output": "dict[str, Any] - Expected/ground truth (e.g., {'label': '...'})",
                "metadata": "dict[str, Any] - Optional metadata (e.g., {'difficulty': 'medium'})",
            },
        }

    # Append schema metadata to system prompt
    schema_yaml = yaml.dump(schema_metadata, default_flow_style=False, sort_keys=False)
    schema_section = f"\n\n---\n\n## Schema Metadata\n\n```yaml\n{schema_yaml}```"
    system_prompt = system_prompt + schema_section

    # Create structured template
    user_content = "{{input}}" if schema_type == "agent" else "Question: {{input}}\nAgent's Answer: {{output}}"

    template_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_content}
    ]

    # Build response format
    response_format = {
        "type": "json_schema",
        "json_schema": {
            "name": schema.get("title", ""),
            "schema": {
                "type": "object",
                "properties": properties,
                "required": required,
                "additionalProperties": False
            },
            "strict": True
        }
    }

    # Build complete openai_params dict ready for PromptVersion.from_openai()
    openai_params: dict[str, Any] = {
        "model": model_name,
        "messages": template_messages,
        "response_format": response_format,
    }

    # Add tools for agents (OpenAI function calling format)
    if openai_tools:
        openai_params["tools"] = openai_tools

    return openai_params


# =============================================================================
# EXPERIMENT WORKFLOWS
# =============================================================================


def run_evaluation_experiment(
    dataset_name: str,
    task: Callable[[Any], Any] | None = None,
    evaluator_schema_path: str | Path | dict[str, Any] | None = None,
    experiment_name: str | None = None,
    experiment_description: str | None = None,
    phoenix_client: "PhoenixClient | None" = None,
    model_name: str | None = None,
) -> "RanExperiment":
    """Run a complete evaluation experiment using Phoenix.

    High-level workflow that:
    1. Loads dataset from Phoenix
    2. Optionally runs task (agent) on dataset
    3. Optionally runs evaluators on results
    4. Tracks results in Phoenix UI

    Args:
        dataset_name: Name of dataset in Phoenix
        task: Optional task function (agent) to run on dataset
        evaluator_schema_path: Optional evaluator schema path/name/dict
        experiment_name: Name for this experiment
        experiment_description: Description of experiment
        phoenix_client: Optional PhoenixClient (auto-creates if not provided)
        model_name: LLM model for evaluation

    Returns:
        RanExperiment with results and metrics

    Example - Agent Run Only:
        >>> experiment = run_evaluation_experiment(
        ...     dataset_name="rem-lookup-golden",
        ...     task=run_agent_task,
        ...     experiment_name="rem-v1-baseline"
        ... )

    Example - Agent + Evaluator:
        >>> experiment = run_evaluation_experiment(
        ...     dataset_name="rem-lookup-golden",
        ...     task=run_agent_task,
        ...     evaluator_schema_path="rem-lookup-correctness",
        ...     experiment_name="rem-v1-full-eval"
        ... )

    Example - Evaluator Only (on existing results):
        >>> experiment = run_evaluation_experiment(
        ...     dataset_name="rem-v1-results",
        ...     evaluator_schema_path="rem-lookup-correctness",
        ...     experiment_name="rem-v1-scoring"
        ... )
    """
    # Create Phoenix client if not provided
    if phoenix_client is None:
        from rem.services.phoenix import PhoenixClient
        phoenix_client = PhoenixClient()

    # Load dataset
    logger.info(f"Loading dataset: {dataset_name}")
    dataset = phoenix_client.get_dataset(dataset_name)

    # Create evaluator if schema provided
    evaluators = []
    if evaluator_schema_path:
        logger.info(f"Creating evaluator from schema: {evaluator_schema_path}")
        evaluator = create_evaluator_from_schema(
            evaluator_schema_path=evaluator_schema_path,
            model_name=model_name,
        )
        evaluators.append(evaluator)

    # Run experiment
    logger.info(f"Running experiment: {experiment_name or 'unnamed'}")
    experiment = phoenix_client.run_experiment(
        dataset=dataset,
        task=task,
        evaluators=evaluators if evaluators else None,
        experiment_name=experiment_name,
        experiment_description=experiment_description,
    )

    logger.success(
        f"Experiment complete. View results: {experiment.url if hasattr(experiment, 'url') else 'N/A'}"  # type: ignore[attr-defined]
    )

    return experiment