remdb 0.3.242 (remdb-0.3.242-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- rem/__init__.py +129 -0
- rem/agentic/README.md +760 -0
- rem/agentic/__init__.py +54 -0
- rem/agentic/agents/README.md +155 -0
- rem/agentic/agents/__init__.py +38 -0
- rem/agentic/agents/agent_manager.py +311 -0
- rem/agentic/agents/sse_simulator.py +502 -0
- rem/agentic/context.py +425 -0
- rem/agentic/context_builder.py +360 -0
- rem/agentic/llm_provider_models.py +301 -0
- rem/agentic/mcp/__init__.py +0 -0
- rem/agentic/mcp/tool_wrapper.py +273 -0
- rem/agentic/otel/__init__.py +5 -0
- rem/agentic/otel/setup.py +240 -0
- rem/agentic/providers/phoenix.py +926 -0
- rem/agentic/providers/pydantic_ai.py +854 -0
- rem/agentic/query.py +117 -0
- rem/agentic/query_helper.py +89 -0
- rem/agentic/schema.py +737 -0
- rem/agentic/serialization.py +245 -0
- rem/agentic/tools/__init__.py +5 -0
- rem/agentic/tools/rem_tools.py +242 -0
- rem/api/README.md +657 -0
- rem/api/deps.py +253 -0
- rem/api/main.py +460 -0
- rem/api/mcp_router/prompts.py +182 -0
- rem/api/mcp_router/resources.py +820 -0
- rem/api/mcp_router/server.py +243 -0
- rem/api/mcp_router/tools.py +1605 -0
- rem/api/middleware/tracking.py +172 -0
- rem/api/routers/admin.py +520 -0
- rem/api/routers/auth.py +898 -0
- rem/api/routers/chat/__init__.py +5 -0
- rem/api/routers/chat/child_streaming.py +394 -0
- rem/api/routers/chat/completions.py +702 -0
- rem/api/routers/chat/json_utils.py +76 -0
- rem/api/routers/chat/models.py +202 -0
- rem/api/routers/chat/otel_utils.py +33 -0
- rem/api/routers/chat/sse_events.py +546 -0
- rem/api/routers/chat/streaming.py +950 -0
- rem/api/routers/chat/streaming_utils.py +327 -0
- rem/api/routers/common.py +18 -0
- rem/api/routers/dev.py +87 -0
- rem/api/routers/feedback.py +276 -0
- rem/api/routers/messages.py +620 -0
- rem/api/routers/models.py +86 -0
- rem/api/routers/query.py +362 -0
- rem/api/routers/shared_sessions.py +422 -0
- rem/auth/README.md +258 -0
- rem/auth/__init__.py +36 -0
- rem/auth/jwt.py +367 -0
- rem/auth/middleware.py +318 -0
- rem/auth/providers/__init__.py +16 -0
- rem/auth/providers/base.py +376 -0
- rem/auth/providers/email.py +215 -0
- rem/auth/providers/google.py +163 -0
- rem/auth/providers/microsoft.py +237 -0
- rem/cli/README.md +517 -0
- rem/cli/__init__.py +8 -0
- rem/cli/commands/README.md +299 -0
- rem/cli/commands/__init__.py +3 -0
- rem/cli/commands/ask.py +549 -0
- rem/cli/commands/cluster.py +1808 -0
- rem/cli/commands/configure.py +495 -0
- rem/cli/commands/db.py +828 -0
- rem/cli/commands/dreaming.py +324 -0
- rem/cli/commands/experiments.py +1698 -0
- rem/cli/commands/mcp.py +66 -0
- rem/cli/commands/process.py +388 -0
- rem/cli/commands/query.py +109 -0
- rem/cli/commands/scaffold.py +47 -0
- rem/cli/commands/schema.py +230 -0
- rem/cli/commands/serve.py +106 -0
- rem/cli/commands/session.py +453 -0
- rem/cli/dreaming.py +363 -0
- rem/cli/main.py +123 -0
- rem/config.py +244 -0
- rem/mcp_server.py +41 -0
- rem/models/core/__init__.py +49 -0
- rem/models/core/core_model.py +70 -0
- rem/models/core/engram.py +333 -0
- rem/models/core/experiment.py +672 -0
- rem/models/core/inline_edge.py +132 -0
- rem/models/core/rem_query.py +246 -0
- rem/models/entities/__init__.py +68 -0
- rem/models/entities/domain_resource.py +38 -0
- rem/models/entities/feedback.py +123 -0
- rem/models/entities/file.py +57 -0
- rem/models/entities/image_resource.py +88 -0
- rem/models/entities/message.py +64 -0
- rem/models/entities/moment.py +123 -0
- rem/models/entities/ontology.py +181 -0
- rem/models/entities/ontology_config.py +131 -0
- rem/models/entities/resource.py +95 -0
- rem/models/entities/schema.py +87 -0
- rem/models/entities/session.py +84 -0
- rem/models/entities/shared_session.py +180 -0
- rem/models/entities/subscriber.py +175 -0
- rem/models/entities/user.py +93 -0
- rem/py.typed +0 -0
- rem/registry.py +373 -0
- rem/schemas/README.md +507 -0
- rem/schemas/__init__.py +6 -0
- rem/schemas/agents/README.md +92 -0
- rem/schemas/agents/core/agent-builder.yaml +235 -0
- rem/schemas/agents/core/moment-builder.yaml +178 -0
- rem/schemas/agents/core/rem-query-agent.yaml +226 -0
- rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
- rem/schemas/agents/core/simple-assistant.yaml +19 -0
- rem/schemas/agents/core/user-profile-builder.yaml +163 -0
- rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
- rem/schemas/agents/examples/contract-extractor.yaml +134 -0
- rem/schemas/agents/examples/cv-parser.yaml +263 -0
- rem/schemas/agents/examples/hello-world.yaml +37 -0
- rem/schemas/agents/examples/query.yaml +54 -0
- rem/schemas/agents/examples/simple.yaml +21 -0
- rem/schemas/agents/examples/test.yaml +29 -0
- rem/schemas/agents/rem.yaml +132 -0
- rem/schemas/evaluators/hello-world/default.yaml +77 -0
- rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
- rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
- rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
- rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
- rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
- rem/services/__init__.py +18 -0
- rem/services/audio/INTEGRATION.md +308 -0
- rem/services/audio/README.md +376 -0
- rem/services/audio/__init__.py +15 -0
- rem/services/audio/chunker.py +354 -0
- rem/services/audio/transcriber.py +259 -0
- rem/services/content/README.md +1269 -0
- rem/services/content/__init__.py +5 -0
- rem/services/content/providers.py +760 -0
- rem/services/content/service.py +762 -0
- rem/services/dreaming/README.md +230 -0
- rem/services/dreaming/__init__.py +53 -0
- rem/services/dreaming/affinity_service.py +322 -0
- rem/services/dreaming/moment_service.py +251 -0
- rem/services/dreaming/ontology_service.py +54 -0
- rem/services/dreaming/user_model_service.py +297 -0
- rem/services/dreaming/utils.py +39 -0
- rem/services/email/__init__.py +10 -0
- rem/services/email/service.py +522 -0
- rem/services/email/templates.py +360 -0
- rem/services/embeddings/__init__.py +11 -0
- rem/services/embeddings/api.py +127 -0
- rem/services/embeddings/worker.py +435 -0
- rem/services/fs/README.md +662 -0
- rem/services/fs/__init__.py +62 -0
- rem/services/fs/examples.py +206 -0
- rem/services/fs/examples_paths.py +204 -0
- rem/services/fs/git_provider.py +935 -0
- rem/services/fs/local_provider.py +760 -0
- rem/services/fs/parsing-hooks-examples.md +172 -0
- rem/services/fs/paths.py +276 -0
- rem/services/fs/provider.py +460 -0
- rem/services/fs/s3_provider.py +1042 -0
- rem/services/fs/service.py +186 -0
- rem/services/git/README.md +1075 -0
- rem/services/git/__init__.py +17 -0
- rem/services/git/service.py +469 -0
- rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
- rem/services/phoenix/README.md +453 -0
- rem/services/phoenix/__init__.py +46 -0
- rem/services/phoenix/client.py +960 -0
- rem/services/phoenix/config.py +88 -0
- rem/services/phoenix/prompt_labels.py +477 -0
- rem/services/postgres/README.md +757 -0
- rem/services/postgres/__init__.py +49 -0
- rem/services/postgres/diff_service.py +599 -0
- rem/services/postgres/migration_service.py +427 -0
- rem/services/postgres/programmable_diff_service.py +635 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +562 -0
- rem/services/postgres/register_type.py +353 -0
- rem/services/postgres/repository.py +481 -0
- rem/services/postgres/schema_generator.py +661 -0
- rem/services/postgres/service.py +802 -0
- rem/services/postgres/sql_builder.py +355 -0
- rem/services/rate_limit.py +113 -0
- rem/services/rem/README.md +318 -0
- rem/services/rem/__init__.py +23 -0
- rem/services/rem/exceptions.py +71 -0
- rem/services/rem/executor.py +293 -0
- rem/services/rem/parser.py +180 -0
- rem/services/rem/queries.py +196 -0
- rem/services/rem/query.py +371 -0
- rem/services/rem/service.py +608 -0
- rem/services/session/README.md +374 -0
- rem/services/session/__init__.py +13 -0
- rem/services/session/compression.py +488 -0
- rem/services/session/pydantic_messages.py +310 -0
- rem/services/session/reload.py +85 -0
- rem/services/user_service.py +130 -0
- rem/settings.py +1877 -0
- rem/sql/background_indexes.sql +52 -0
- rem/sql/migrations/001_install.sql +983 -0
- rem/sql/migrations/002_install_models.sql +3157 -0
- rem/sql/migrations/003_optional_extensions.sql +326 -0
- rem/sql/migrations/004_cache_system.sql +282 -0
- rem/sql/migrations/005_schema_update.sql +145 -0
- rem/sql/migrations/migrate_session_id_to_uuid.sql +45 -0
- rem/utils/AGENTIC_CHUNKING.md +597 -0
- rem/utils/README.md +628 -0
- rem/utils/__init__.py +61 -0
- rem/utils/agentic_chunking.py +622 -0
- rem/utils/batch_ops.py +343 -0
- rem/utils/chunking.py +108 -0
- rem/utils/clip_embeddings.py +276 -0
- rem/utils/constants.py +97 -0
- rem/utils/date_utils.py +228 -0
- rem/utils/dict_utils.py +98 -0
- rem/utils/embeddings.py +436 -0
- rem/utils/examples/embeddings_example.py +305 -0
- rem/utils/examples/sql_types_example.py +202 -0
- rem/utils/files.py +323 -0
- rem/utils/markdown.py +16 -0
- rem/utils/mime_types.py +158 -0
- rem/utils/model_helpers.py +492 -0
- rem/utils/schema_loader.py +649 -0
- rem/utils/sql_paths.py +146 -0
- rem/utils/sql_types.py +350 -0
- rem/utils/user_id.py +81 -0
- rem/utils/vision.py +325 -0
- rem/workers/README.md +506 -0
- rem/workers/__init__.py +7 -0
- rem/workers/db_listener.py +579 -0
- rem/workers/db_maintainer.py +74 -0
- rem/workers/dreaming.py +502 -0
- rem/workers/engram_processor.py +312 -0
- rem/workers/sqs_file_processor.py +193 -0
- rem/workers/unlogged_maintainer.py +463 -0
- remdb-0.3.242.dist-info/METADATA +1632 -0
- remdb-0.3.242.dist-info/RECORD +235 -0
- remdb-0.3.242.dist-info/WHEEL +4 -0
- remdb-0.3.242.dist-info/entry_points.txt +2 -0
rem/agentic/providers/phoenix.py

@@ -0,0 +1,926 @@
"""Phoenix evaluator provider for REM agents.

This module provides factory functions for creating Phoenix-compatible evaluators
from schema definitions, following the same pattern as Pydantic AI agent creation.

Exported Functions:
===================
- load_evaluator_schema: Load evaluator schemas from schemas/evaluators/
- create_phoenix_evaluator: Create Phoenix evaluator config from schema
- create_evaluator_from_schema: Create callable evaluator function
- schema_to_prompt: Convert schema to Phoenix openai_params format
- sanitize_tool_name: Sanitize tool names for Phoenix/OpenAI compatibility
- run_evaluation_experiment: Run complete evaluation workflow

Design Pattern (mirrors Pydantic AI provider):
==============================================
1. Load evaluator schemas from schemas/evaluators/ directory
2. Extract system prompt, output schema, and metadata
3. Create Phoenix-compatible evaluator functions
4. Support both LLM-as-a-Judge and code-based evaluators

Two-Phase Evaluation Architecture:
===================================

Phase 1 - Golden Set Creation:
    SMEs create datasets with (input, reference) pairs in Phoenix

Phase 2 - Automated Evaluation:
    Step 1: Run agents → (input, agent_output)
    Step 2: Run evaluators → (input, agent_output, reference) → scores

Evaluator Types:
================

1. LLM-as-a-Judge (uses Claude/GPT to evaluate):
   - Compares agent output to reference
   - Scores on multiple dimensions (correctness, completeness, etc.)
   - Provides explanations and suggestions

2. Code-based (deterministic evaluation):
   - Exact match checking
   - Field presence validation
   - Format compliance

Usage:
======

Create evaluator from schema:
    >>> evaluator = create_evaluator_from_schema("rem-lookup-correctness")
    >>> result = evaluator(example)
    >>> # Returns: {"score": 0.95, "label": "correct", "explanation": "..."}

Run evaluation experiment:
    >>> from rem.services.phoenix import PhoenixClient
    >>> client = PhoenixClient()
    >>> experiment = run_evaluation_experiment(
    ...     dataset_name="rem-lookup-golden",
    ...     task=run_agent_task,
    ...     evaluator_schema_path="rem-lookup-correctness",
    ...     phoenix_client=client
    ... )
"""

from typing import Any, Callable, TYPE_CHECKING
from pathlib import Path
import json
import yaml

from loguru import logger

# Lazy import to avoid Phoenix initialization at module load time
if TYPE_CHECKING:
    from phoenix.evals import LLMEvaluator
    from phoenix.client.resources.datasets import Dataset
    from phoenix.client.resources.experiments.types import RanExperiment
    from rem.services.phoenix import PhoenixClient

PHOENIX_AVAILABLE = None  # Lazy check on first use


def _check_phoenix_available() -> bool:
    """Lazy check if Phoenix is available (only imports when needed)."""
    global PHOENIX_AVAILABLE
    if PHOENIX_AVAILABLE is not None:
        return PHOENIX_AVAILABLE

    try:
        import phoenix.evals  # noqa: F401
        PHOENIX_AVAILABLE = True
    except ImportError:
        PHOENIX_AVAILABLE = False
        logger.warning("arize-phoenix package not installed - evaluator factory unavailable")

    return PHOENIX_AVAILABLE


def validate_evaluator_credentials(
    model_name: str | None = None,
) -> tuple[bool, str | None]:
    """Validate that the evaluator's LLM provider has working credentials.

    Performs a minimal API call to verify credentials before running experiments.
    This prevents running expensive agent tasks only to have evaluations fail.

    Args:
        model_name: Model to validate (defaults to claude-sonnet-4-5-20250929)

    Returns:
        Tuple of (success: bool, error_message: str | None)
        - (True, None) if credentials are valid
        - (False, "error description") if validation fails

    Example:
        >>> success, error = validate_evaluator_credentials()
        >>> if not success:
        ...     print(f"Evaluator validation failed: {error}")
        ...     return
    """
    if not _check_phoenix_available():
        return False, "arize-phoenix package not installed"

    from phoenix.evals import OpenAIModel, AnthropicModel

    # Default model (check env var first)
    if model_name is None:
        import os
        model_name = os.environ.get("EVALUATOR_MODEL", "claude-sonnet-4-5-20250929")

    # Parse provider
    if ":" in model_name:
        provider, phoenix_model_name = model_name.split(":", 1)
    else:
        if model_name.startswith("claude"):
            provider = "anthropic"
        else:
            provider = "openai"
        phoenix_model_name = model_name

    try:
        # Create LLM wrapper
        if provider.lower() == "anthropic":
            llm = AnthropicModel(
                model=phoenix_model_name,
                temperature=0.0,
                top_p=None,
            )
        else:
            llm = OpenAIModel(model=phoenix_model_name, temperature=0.0)

        # Test with minimal prompt
        logger.info(f"Validating evaluator credentials for {provider}:{phoenix_model_name}")
        response = llm("Say 'ok' if you can read this.")

        if response and len(response) > 0:
            logger.info(f"Evaluator credentials validated successfully for {provider}")
            return True, None
        else:
            return False, f"Empty response from {provider} model"

    except Exception as e:
        error_msg = str(e)
        # Extract meaningful error from common API errors
        if "credit balance is too low" in error_msg.lower():
            return False, f"Anthropic API credits exhausted. Add credits at https://console.anthropic.com/settings/billing"
        elif "api key" in error_msg.lower() or "authentication" in error_msg.lower():
            return False, f"{provider.capitalize()} API key missing or invalid. Set ANTHROPIC_API_KEY or OPENAI_API_KEY environment variable."
        elif "rate limit" in error_msg.lower():
            return False, f"{provider.capitalize()} rate limit exceeded. Wait and retry."
        else:
            return False, f"{provider.capitalize()} API error: {error_msg[:200]}"


# =============================================================================
# NAME SANITIZATION
# =============================================================================


def sanitize_tool_name(tool_name: str) -> str:
    """Sanitize tool name for Phoenix/OpenAI compatibility.

    Replaces all non-alphanumeric characters with underscores to prevent
    prompt breaking and ensure compatibility with OpenAI function calling.

    Args:
        tool_name: Original tool name (e.g., "ask_rem", "traverse-graph")

    Returns:
        Sanitized name with only alphanumeric characters and underscores

    Example:
        >>> sanitize_tool_name("ask_rem")
        'ask_rem'
        >>> sanitize_tool_name("traverse-graph")
        'traverse_graph'
        >>> sanitize_tool_name("mcp://server/tool-name")
        'mcp___server_tool_name'
    """
    return "".join(c if c.isalnum() else "_" for c in tool_name)


# =============================================================================
# SCHEMA LOADING
# =============================================================================


def load_evaluator_schema(evaluator_name: str) -> dict[str, Any]:
    """Load evaluator schema using centralized schema loader.

    Uses the same unified search logic as agent schemas:
    - "hello-world/default" → schemas/evaluators/hello-world/default.yaml
    - "lookup-correctness" → schemas/evaluators/rem/lookup-correctness.yaml
    - "rem-lookup-correctness" → schemas/evaluators/rem/lookup-correctness.yaml

    Args:
        evaluator_name: Evaluator name or path
            e.g., "hello-world/default", "lookup-correctness"

    Returns:
        Evaluator schema dictionary with keys:
        - description: System prompt for LLM evaluator
        - properties: Output schema fields
        - required: Required output fields
        - labels: Optional labels for categorization
        - version: Schema version

    Raises:
        FileNotFoundError: If evaluator schema not found

    Example:
        >>> schema = load_evaluator_schema("hello-world/default")
        >>> print(schema["description"])
    """
    from ...utils.schema_loader import load_agent_schema

    # Use centralized schema loader (searches evaluator paths too)
    return load_agent_schema(evaluator_name)


# =============================================================================
# EVALUATOR CREATION
# =============================================================================


def create_phoenix_evaluator(
    evaluator_schema: dict[str, Any],
    model_name: str | None = None,
) -> dict[str, Any]:
    """Create Phoenix evaluator configuration from schema.

    Args:
        evaluator_schema: Evaluator schema dictionary
        model_name: Optional LLM model to use (defaults to claude-sonnet-4-5)

    Returns:
        Evaluator config dict with:
        - name: Evaluator name
        - llm: Phoenix LLM wrapper
        - prompt_template: System prompt
        - schema: Output schema

    Raises:
        ImportError: If arize-phoenix not installed
        KeyError: If required schema fields missing
    """
    if not _check_phoenix_available():
        raise ImportError(
            "arize-phoenix package required for evaluators. "
            "Install with: pip install arize-phoenix"
        )

    # Import Phoenix after availability check
    from phoenix.evals import OpenAIModel, AnthropicModel

    logger.debug("Creating Phoenix evaluator from schema")

    # Extract schema fields
    evaluator_name = evaluator_schema.get("title", "UnnamedEvaluator")
    system_prompt = evaluator_schema.get("description", "")
    output_schema = evaluator_schema.get("properties", {})

    if not system_prompt:
        raise KeyError("evaluator_schema must contain 'description' field with system prompt")

    # Default model (use Claude Sonnet 4.5 for evaluators)
    if model_name is None:
        import os
        model_name = os.environ.get("EVALUATOR_MODEL", "claude-sonnet-4-5-20250929")
        logger.debug(f"Using evaluator model: {model_name}")

    logger.info(f"Creating Phoenix evaluator: {evaluator_name} with model={model_name}")

    # Parse provider and model name
    if ":" in model_name:
        provider, phoenix_model_name = model_name.split(":", 1)
    else:
        # Detect provider from model name
        if model_name.startswith("claude"):
            provider = "anthropic"
        else:
            provider = "openai"
        phoenix_model_name = model_name

    # Create appropriate Phoenix LLM wrapper based on provider
    llm: OpenAIModel | AnthropicModel
    if provider.lower() == "anthropic":
        # Anthropic's newer Claude models (claude-sonnet-4, claude-opus-4, etc.)
        # don't allow both temperature and top_p to be specified together.
        # Phoenix's AnthropicModel defaults top_p=1, so we explicitly set it
        # to None to prevent it from being sent in the API request.
        # The invocation_parameters() method only includes params that are not None.
        llm = AnthropicModel(
            model=phoenix_model_name,
            temperature=0.0,
            top_p=None,  # type: ignore[arg-type] - None prevents param from being sent
        )
    else:
        # Default to OpenAI for other providers (gpt-4, etc.)
        llm = OpenAIModel(model=phoenix_model_name, temperature=0.0)

    # Return evaluator config (not an instance - we'll use llm_classify directly)
    evaluator_config = {
        "name": evaluator_name,
        "llm": llm,
        "prompt_template": system_prompt,
        "schema": output_schema,
        "labels": evaluator_schema.get("labels", []),
        "version": evaluator_schema.get("version", "1.0.0"),
    }

    logger.info(f"Phoenix evaluator '{evaluator_name}' created successfully")
    return evaluator_config


def _evaluate_expression(expression: str, context: dict[str, Any]) -> Any:
    """Safely evaluate a simple expression with context variables.

    Supports: arithmetic, comparisons, boolean logic, len()
    """
    try:
        allowed_names = {
            "len": len,
            "True": True,
            "False": False,
            "true": True,
            "false": False,
        }
        allowed_names.update(context)
        return eval(expression, {"__builtins__": {}}, allowed_names)
    except Exception as e:
        logger.warning(f"Expression evaluation failed: {expression} - {e}")
        return 0.0


def _calculate_derived_scores(
    response_json: dict[str, Any],
    derived_scores_config: dict[str, Any],
) -> dict[str, Any]:
    """Calculate derived scores from evaluator output using config formulas.

    Supports:
    - weighted_sum: Weighted average of fields
    - conditional_weighted: Different formulas based on conditions
    - boolean_logic: Boolean expression evaluation
    """
    for score_name, score_config in derived_scores_config.items():
        score_type = score_config.get("type")

        if score_type == "weighted_sum":
            weights = score_config.get("weights", {})
            total = 0.0
            for field, weight in weights.items():
                field_value = response_json.get(field, 0.0)
                if isinstance(field_value, (int, float)):
                    total += field_value * weight
            response_json[score_name] = total

        elif score_type == "conditional_weighted":
            conditions = score_config.get("conditions", [])
            formula_to_use = None
            for cond_config in conditions:
                condition = cond_config.get("condition")
                if condition is None:
                    formula_to_use = cond_config.get("formula")
                    break
                field = condition.get("field")
                operator = condition.get("operator")
                value = condition.get("value")
                field_value = response_json.get(field, 0.0)
                condition_met = False
                if operator == ">=":
                    condition_met = field_value >= value
                elif operator == ">":
                    condition_met = field_value > value
                elif operator == "<=":
                    condition_met = field_value <= value
                elif operator == "<":
                    condition_met = field_value < value
                elif operator == "==":
                    condition_met = field_value == value
                elif operator == "!=":
                    condition_met = field_value != value
                if condition_met:
                    formula_to_use = cond_config.get("formula")
                    break
            if formula_to_use and formula_to_use.get("type") == "weighted_sum":
                weights = formula_to_use.get("weights", {})
                total = 0.0
                for field, weight in weights.items():
                    field_value = response_json.get(field, 0.0)
                    if isinstance(field_value, (int, float)):
                        total += field_value * weight
                response_json[score_name] = total

        elif score_type == "boolean_logic":
            expression = score_config.get("expression", "")
            result = _evaluate_expression(expression, response_json)
            response_json[score_name] = result

    return response_json


def _create_phoenix_evaluations(
    response_json: dict[str, Any],
    evaluations_config: list[dict[str, Any]],
) -> list[dict[str, Any]]:
    """Create Phoenix evaluation dicts from evaluator output using config.

    Each evaluation becomes a column in Phoenix UI with name, label, score, explanation.
    """
    evaluations = []
    for eval_config in evaluations_config:
        eval_name = eval_config.get("name", "unnamed")
        score_field = eval_config.get("score_field")
        score_expression = eval_config.get("score_expression")
        label_field = eval_config.get("label_field")
        label_expression = eval_config.get("label_expression")
        label_logic = eval_config.get("label_logic", [])
        label_transform = eval_config.get("label_transform", {})
        score_logic = eval_config.get("score_logic", {})
        explanation_field = eval_config.get("explanation_field")

        evaluation = {"name": eval_name}

        # Get score
        if score_expression:
            evaluation["score"] = _evaluate_expression(score_expression, response_json)
        elif score_field:
            evaluation["score"] = response_json.get(score_field, 0.0)
        elif score_logic and label_field:
            label_value = response_json.get(label_field)
            if isinstance(label_value, bool):
                label_value = "true" if label_value else "false"
            evaluation["score"] = score_logic.get(str(label_value), 0.0)
        else:
            evaluation["score"] = None

        # Get label
        if label_expression:
            evaluation["label"] = str(_evaluate_expression(label_expression, response_json))
        elif label_field:
            label_value = response_json.get(label_field)
            if isinstance(label_value, bool):
                label_value = "true" if label_value else "false"
            if label_transform:
                evaluation["label"] = label_transform.get(str(label_value), str(label_value))
            else:
                evaluation["label"] = str(label_value)
        elif label_logic and (score_field or score_expression):
            score_value = evaluation.get("score", 0.0)
            label = "unknown"
            for logic in label_logic:
                threshold = logic.get("threshold", 0.0)
                operator = logic.get("operator", ">=")
                if operator == ">=" and score_value >= threshold:
                    label = logic.get("label", "unknown")
                    break
                elif operator == ">" and score_value > threshold:
                    label = logic.get("label", "unknown")
                    break
            evaluation["label"] = label
        else:
            evaluation["label"] = None

        # Get explanation
        if explanation_field:
            explanation_value = response_json.get(explanation_field, "")
            if isinstance(explanation_value, list):
                evaluation["explanation"] = ", ".join(str(x) for x in explanation_value) if explanation_value else "None"
            else:
                evaluation["explanation"] = str(explanation_value)
        else:
            evaluation["explanation"] = None

        evaluations.append(evaluation)
    return evaluations


def create_evaluator_from_schema(
    evaluator_schema_path: str | Path | dict[str, Any],
    model_name: str | None = None,
) -> Callable[[Any], Any]:
    """Create an evaluator function from a schema file or dict.

    Uses direct LLM call with JSON schema for structured output evaluation.
    Supports phoenix_config for derived scores and evaluation column mappings.

    Args:
        evaluator_schema_path: Path to schema file, evaluator name, or schema dict
        model_name: Optional LLM model to use for evaluation

    Returns:
        Evaluator function compatible with Phoenix experiments

    Raises:
        FileNotFoundError: If schema file not found
        ImportError: If arize-phoenix not installed

    Example:
        >>> evaluator = create_evaluator_from_schema("rem-lookup-correctness")
        >>> result = evaluator(input={...}, output={...}, expected={...})
        >>> # Returns: list of {"name": "...", "score": 0.95, "label": "...", "explanation": "..."}
    """
    if not _check_phoenix_available():
        raise ImportError(
            "arize-phoenix package required for evaluators. "
            "Install with: pip install arize-phoenix"
        )

    # Load schema if path/name provided
    if isinstance(evaluator_schema_path, (str, Path)):
        schema_path = Path(evaluator_schema_path)
        if schema_path.exists():
            logger.debug(f"Loading evaluator schema from {schema_path}")
            if schema_path.suffix in [".yaml", ".yml"]:
                with open(schema_path) as f:
                    schema = yaml.safe_load(f)
            else:
                with open(schema_path) as f:
                    schema = json.load(f)
        else:
            schema = load_evaluator_schema(str(evaluator_schema_path))
    else:
        schema = evaluator_schema_path

    # Extract schema components
    output_schema = schema.get("properties", {})

    # Extract phoenix_config for derived scores and evaluations
    phoenix_config = schema.get("phoenix_config", {})
    derived_scores_config = phoenix_config.get("derived_scores", {})
    evaluations_config = phoenix_config.get("evaluations", [])

    # Create evaluator config (LLM wrapper, prompt, etc.)
    evaluator_config = create_phoenix_evaluator(
        evaluator_schema=schema,
        model_name=model_name,
    )

    import re

    def evaluator_fn(input: dict[str, Any], output: dict[str, Any], expected: dict[str, Any]) -> list[dict[str, Any]]:
        """Evaluate using Phoenix's named parameter binding with structured LLM output.

        Phoenix automatically binds these parameters:
        - input: Dataset input dict
        - output: Task's return value (agent output)
        - expected: Expected output dict (reference/ground truth)

        Returns:
            List of Phoenix evaluation dicts with name, score, label, explanation
        """
        logger.debug("Evaluating with structured output pattern")

        # Extract question from input
        if isinstance(input, dict):
            question = input.get("input", input.get("text", str(input)))
        else:
            question = str(input)

        # Serialize agent output
        if isinstance(output, dict):
            output_str = json.dumps(output, indent=2)
        else:
            output_str = str(output)

        # Get reference from expected
        if isinstance(expected, dict):
            reference = expected.get("reference", expected.get("expected_output",
                expected.get("ground_truth", str(expected))))
        else:
            reference = str(expected)

        try:
            # Build user message
            user_message = f"""Question/Input: {question}

Agent's Answer:
{output_str}

Expected Answer (Reference):
{reference}

Please evaluate the agent's answer according to the evaluation criteria."""

            # Add JSON schema requirement to system prompt
            system_prompt = evaluator_config["prompt_template"]
            schema_instruction = f"\n\nYou MUST respond with valid JSON matching this schema:\n{json.dumps(output_schema, indent=2)}\n\nProvide ONLY the JSON response, no markdown code blocks or extra text."
            system_with_schema = system_prompt + schema_instruction

            # Phoenix LLM models expect a single prompt string
            llm = evaluator_config["llm"]
            full_prompt = f"{system_with_schema}\n\n{user_message}"
            response_text = llm(full_prompt)

            # Parse JSON response
            try:
                response_json = json.loads(response_text)
            except json.JSONDecodeError:
                json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', response_text, re.DOTALL)
                if json_match:
                    response_json = json.loads(json_match.group(1))
                else:
                    raise ValueError(f"Could not parse JSON from LLM response: {response_text[:200]}")

            logger.debug(f"LLM response parsed: {list(response_json.keys())}")

            # Calculate derived scores using config
            if derived_scores_config:
                logger.debug(f"Calculating {len(derived_scores_config)} derived scores")
                response_json = _calculate_derived_scores(response_json, derived_scores_config)

            # Create Phoenix evaluations using config
            if evaluations_config:
                logger.debug(f"Creating {len(evaluations_config)} Phoenix evaluations")
                evaluations = _create_phoenix_evaluations(response_json, evaluations_config)
            else:
                # Fallback: create evaluations from all numeric/boolean fields
                logger.warning("No evaluations_config - creating default evaluations from schema")
                evaluations = []
                for field_name, field_value in response_json.items():
                    if isinstance(field_value, (int, float)):
                        evaluations.append({
                            "name": field_name,
                            "score": float(field_value),
                            "label": "good" if field_value >= 0.5 else "poor",
                            "explanation": None
                        })
                    elif isinstance(field_value, bool):
                        evaluations.append({
                            "name": field_name,
                            "score": 1.0 if field_value else 0.0,
                            "label": "pass" if field_value else "fail",
                            "explanation": None
                        })

                # Always add overall if not present
                if not any(e["name"] == "overall" for e in evaluations):
                    overall_score = response_json.get("overall_score", 0.0)
                    overall_pass = response_json.get("pass", False)
                    evaluations.append({
                        "name": "overall",
                        "score": overall_score if isinstance(overall_score, (int, float)) else 0.0,
                        "label": "pass" if overall_pass else "fail",
                        "explanation": response_json.get("evaluation_notes", None)
                    })

            logger.debug(f"Created {len(evaluations)} evaluations")

            # Phoenix client expects a dict with score, label, explanation
            # (not the old EvaluationResult class)
            overall_eval = next(
                (e for e in evaluations if e["name"] == "overall"),
                {"score": 0.0, "label": "unknown", "explanation": None}
            )

            return {
                "score": overall_eval.get("score", 0.0),
                "label": overall_eval.get("label", "unknown"),
                "explanation": overall_eval.get("explanation"),
            }

        except Exception as e:
            logger.error(f"Evaluator error: {e}")
            return {
                "score": 0.0,
                "label": "error",
                "explanation": f"Evaluator failed: {str(e)}",
            }

    return evaluator_fn


def schema_to_prompt(
    schema: dict[str, Any],
    schema_type: str = "evaluator",
    model_name: str = "gpt-4.1",
) -> dict[str, Any]:
    """Convert agent or evaluator schema to complete Phoenix openai_params.

    Converts REM schema format to Phoenix PromptVersion.from_openai() format,
    including messages, response_format, and tools (for agents).

    Args:
        schema: Schema dictionary (from load_evaluator_schema or agent schema)
        schema_type: Type of schema - "agent" or "evaluator"
        model_name: Model name for the prompt

    Returns:
        Complete openai_params dict ready for PromptVersion.from_openai()
        Contains: model, messages, response_format, tools (for agents)

    Example:
        >>> schema = load_evaluator_schema("rem-lookup-correctness")
        >>> openai_params = schema_to_prompt(schema, schema_type="evaluator")
        >>> # Use with Phoenix: PromptVersion.from_openai(openai_params)
    """
    system_prompt = schema.get("description", "")
    properties = schema.get("properties", {})
    required = schema.get("required", [])

    # Extract tool definitions and convert to OpenAI format (for agents)
    tool_definitions = []  # For metadata YAML
    openai_tools = []  # For Phoenix tools parameter

    if schema_type == "agent":
        json_schema_extra = schema.get("json_schema_extra", {})
        tools = json_schema_extra.get("tools", [])

        for tool in tools:
            # Keep metadata format for YAML section
            tool_def = {
                "mcp_server": tool.get("mcp_server"),
                "tool_name": tool.get("tool_name"),
                "usage": tool.get("usage", ""),
            }
            tool_definitions.append(tool_def)

            # Convert to OpenAI function calling format
            # Sanitize tool name to prevent prompt breaking
            tool_name = tool.get("tool_name", "")
            sanitized_name = sanitize_tool_name(tool_name)

            openai_tool = {
                "type": "function",
                "function": {
                    "name": sanitized_name,
                    "description": tool.get("usage", "MCP tool"),
                    "parameters": {
                        "type": "object",
                        "properties": {},
                        "required": []
                    }
                }
            }
            openai_tools.append(openai_tool)

    # Build schema metadata section
    info_key = "agent_info" if schema_type == "agent" else "evaluator_info"
    schema_metadata = {
        info_key: {
            "version": schema.get("version", "1.0.0"),
            "title": schema.get("title", ""),
        },
        "output_schema": {
            "description": f"Structured output returned by this {schema_type}",
            "properties": {
                k: {
                    "type": v.get("type", "unknown"),
                    "description": v.get("description", ""),
                }
                for k, v in properties.items()
            },
            "required": required,
        },
    }

    # Add tool definitions for agents
    if tool_definitions:
        schema_metadata["tools"] = {
            "description": "MCP tools available to this agent",
            "tool_definitions": tool_definitions,
        }

    # Add input format for evaluators
    if schema_type == "evaluator":
        schema_metadata["input_format"] = {
            "description": "Evaluators receive dataset examples with 'input' and 'output' fields",
            "structure": {
                "input": "dict[str, Any] - What the agent receives (e.g., {'query': '...'})",
                "output": "dict[str, Any] - Expected/ground truth (e.g., {'label': '...'})",
                "metadata": "dict[str, Any] - Optional metadata (e.g., {'difficulty': 'medium'})",
            },
        }

    # Append schema metadata to system prompt
    schema_yaml = yaml.dump(schema_metadata, default_flow_style=False, sort_keys=False)
    schema_section = f"\n\n---\n\n## Schema Metadata\n\n```yaml\n{schema_yaml}```"
    system_prompt = system_prompt + schema_section

    # Create structured template
    user_content = "{{input}}" if schema_type == "agent" else "Question: {{input}}\nAgent's Answer: {{output}}"

    template_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_content}
    ]

    # Build response format
    response_format = {
        "type": "json_schema",
        "json_schema": {
            "name": schema.get("title", ""),
            "schema": {
                "type": "object",
                "properties": properties,
                "required": required,
                "additionalProperties": False
            },
            "strict": True
        }
    }

    # Build complete openai_params dict ready for PromptVersion.from_openai()
    openai_params: dict[str, Any] = {
        "model": model_name,
        "messages": template_messages,
        "response_format": response_format,
    }

    # Add tools for agents (OpenAI function calling format)
    if openai_tools:
        openai_params["tools"] = openai_tools

    return openai_params


# =============================================================================
# EXPERIMENT WORKFLOWS
# =============================================================================


def run_evaluation_experiment(
    dataset_name: str,
    task: Callable[[Any], Any] | None = None,
    evaluator_schema_path: str | Path | dict[str, Any] | None = None,
    experiment_name: str | None = None,
    experiment_description: str | None = None,
    phoenix_client: "PhoenixClient | None" = None,
    model_name: str | None = None,
) -> "RanExperiment":
    """Run a complete evaluation experiment using Phoenix.

    High-level workflow that:
    1. Loads dataset from Phoenix
    2. Optionally runs task (agent) on dataset
    3. Optionally runs evaluators on results
    4. Tracks results in Phoenix UI

    Args:
        dataset_name: Name of dataset in Phoenix
        task: Optional task function (agent) to run on dataset
        evaluator_schema_path: Optional evaluator schema path/name/dict
        experiment_name: Name for this experiment
        experiment_description: Description of experiment
        phoenix_client: Optional PhoenixClient (auto-creates if not provided)
        model_name: LLM model for evaluation

    Returns:
        RanExperiment with results and metrics

    Example - Agent Run Only:
        >>> experiment = run_evaluation_experiment(
        ...     dataset_name="rem-lookup-golden",
        ...     task=run_agent_task,
        ...     experiment_name="rem-v1-baseline"
        ... )

    Example - Agent + Evaluator:
        >>> experiment = run_evaluation_experiment(
        ...     dataset_name="rem-lookup-golden",
        ...     task=run_agent_task,
        ...     evaluator_schema_path="rem-lookup-correctness",
        ...     experiment_name="rem-v1-full-eval"
        ... )

    Example - Evaluator Only (on existing results):
        >>> experiment = run_evaluation_experiment(
        ...     dataset_name="rem-v1-results",
        ...     evaluator_schema_path="rem-lookup-correctness",
        ...     experiment_name="rem-v1-scoring"
        ... )
    """
    # Create Phoenix client if not provided
    if phoenix_client is None:
        from rem.services.phoenix import PhoenixClient
        phoenix_client = PhoenixClient()

    # Load dataset
    logger.info(f"Loading dataset: {dataset_name}")
    dataset = phoenix_client.get_dataset(dataset_name)

    # Create evaluator if schema provided
    evaluators = []
    if evaluator_schema_path:
        logger.info(f"Creating evaluator from schema: {evaluator_schema_path}")
        evaluator = create_evaluator_from_schema(
            evaluator_schema_path=evaluator_schema_path,
            model_name=model_name,
        )
        evaluators.append(evaluator)

    # Run experiment
    logger.info(f"Running experiment: {experiment_name or 'unnamed'}")
    experiment = phoenix_client.run_experiment(
        dataset=dataset,
        task=task,
        evaluators=evaluators if evaluators else None,
        experiment_name=experiment_name,
        experiment_description=experiment_description,
    )

    logger.success(
        f"Experiment complete. View results: {experiment.url if hasattr(experiment, 'url') else 'N/A'}"  # type: ignore[attr-defined]
    )

    return experiment
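The docstrings above sketch each entry point in isolation. The snippet below is a minimal end-to-end sketch that chains the documented functions together; it is not part of the package, and it assumes arize-phoenix is installed, provider API keys are configured, a Phoenix server is reachable, and the `rem-lookup-golden` dataset from the docstring examples already exists. The `run_agent_task` function is a hypothetical placeholder for whatever agent you want to score.

```python
# Minimal usage sketch of rem/agentic/providers/phoenix.py (assumptions noted above).
from rem.agentic.providers.phoenix import (
    run_evaluation_experiment,
    validate_evaluator_credentials,
)
from rem.services.phoenix import PhoenixClient


def run_agent_task(example):
    # Hypothetical stand-in: a real task would call the REM agent on the
    # example's input and return its structured answer.
    return {"answer": f"stub answer for {example}"}


# Fail fast if the evaluator's LLM credentials are unusable.
ok, error = validate_evaluator_credentials()
if not ok:
    raise SystemExit(f"Evaluator validation failed: {error}")

# Full workflow: run the agent task over the dataset, then score the results
# with the LLM-as-a-Judge evaluator defined by the named schema.
experiment = run_evaluation_experiment(
    dataset_name="rem-lookup-golden",
    task=run_agent_task,
    evaluator_schema_path="rem-lookup-correctness",
    experiment_name="rem-v1-full-eval",
    phoenix_client=PhoenixClient(),
)
```

Note that `run_evaluation_experiment` builds the evaluator internally via `create_evaluator_from_schema`, so the only wiring required here is the dataset name, the task, and the evaluator schema path.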