remdb-0.3.242-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of remdb might be problematic.

Files changed (235)
  1. rem/__init__.py +129 -0
  2. rem/agentic/README.md +760 -0
  3. rem/agentic/__init__.py +54 -0
  4. rem/agentic/agents/README.md +155 -0
  5. rem/agentic/agents/__init__.py +38 -0
  6. rem/agentic/agents/agent_manager.py +311 -0
  7. rem/agentic/agents/sse_simulator.py +502 -0
  8. rem/agentic/context.py +425 -0
  9. rem/agentic/context_builder.py +360 -0
  10. rem/agentic/llm_provider_models.py +301 -0
  11. rem/agentic/mcp/__init__.py +0 -0
  12. rem/agentic/mcp/tool_wrapper.py +273 -0
  13. rem/agentic/otel/__init__.py +5 -0
  14. rem/agentic/otel/setup.py +240 -0
  15. rem/agentic/providers/phoenix.py +926 -0
  16. rem/agentic/providers/pydantic_ai.py +854 -0
  17. rem/agentic/query.py +117 -0
  18. rem/agentic/query_helper.py +89 -0
  19. rem/agentic/schema.py +737 -0
  20. rem/agentic/serialization.py +245 -0
  21. rem/agentic/tools/__init__.py +5 -0
  22. rem/agentic/tools/rem_tools.py +242 -0
  23. rem/api/README.md +657 -0
  24. rem/api/deps.py +253 -0
  25. rem/api/main.py +460 -0
  26. rem/api/mcp_router/prompts.py +182 -0
  27. rem/api/mcp_router/resources.py +820 -0
  28. rem/api/mcp_router/server.py +243 -0
  29. rem/api/mcp_router/tools.py +1605 -0
  30. rem/api/middleware/tracking.py +172 -0
  31. rem/api/routers/admin.py +520 -0
  32. rem/api/routers/auth.py +898 -0
  33. rem/api/routers/chat/__init__.py +5 -0
  34. rem/api/routers/chat/child_streaming.py +394 -0
  35. rem/api/routers/chat/completions.py +702 -0
  36. rem/api/routers/chat/json_utils.py +76 -0
  37. rem/api/routers/chat/models.py +202 -0
  38. rem/api/routers/chat/otel_utils.py +33 -0
  39. rem/api/routers/chat/sse_events.py +546 -0
  40. rem/api/routers/chat/streaming.py +950 -0
  41. rem/api/routers/chat/streaming_utils.py +327 -0
  42. rem/api/routers/common.py +18 -0
  43. rem/api/routers/dev.py +87 -0
  44. rem/api/routers/feedback.py +276 -0
  45. rem/api/routers/messages.py +620 -0
  46. rem/api/routers/models.py +86 -0
  47. rem/api/routers/query.py +362 -0
  48. rem/api/routers/shared_sessions.py +422 -0
  49. rem/auth/README.md +258 -0
  50. rem/auth/__init__.py +36 -0
  51. rem/auth/jwt.py +367 -0
  52. rem/auth/middleware.py +318 -0
  53. rem/auth/providers/__init__.py +16 -0
  54. rem/auth/providers/base.py +376 -0
  55. rem/auth/providers/email.py +215 -0
  56. rem/auth/providers/google.py +163 -0
  57. rem/auth/providers/microsoft.py +237 -0
  58. rem/cli/README.md +517 -0
  59. rem/cli/__init__.py +8 -0
  60. rem/cli/commands/README.md +299 -0
  61. rem/cli/commands/__init__.py +3 -0
  62. rem/cli/commands/ask.py +549 -0
  63. rem/cli/commands/cluster.py +1808 -0
  64. rem/cli/commands/configure.py +495 -0
  65. rem/cli/commands/db.py +828 -0
  66. rem/cli/commands/dreaming.py +324 -0
  67. rem/cli/commands/experiments.py +1698 -0
  68. rem/cli/commands/mcp.py +66 -0
  69. rem/cli/commands/process.py +388 -0
  70. rem/cli/commands/query.py +109 -0
  71. rem/cli/commands/scaffold.py +47 -0
  72. rem/cli/commands/schema.py +230 -0
  73. rem/cli/commands/serve.py +106 -0
  74. rem/cli/commands/session.py +453 -0
  75. rem/cli/dreaming.py +363 -0
  76. rem/cli/main.py +123 -0
  77. rem/config.py +244 -0
  78. rem/mcp_server.py +41 -0
  79. rem/models/core/__init__.py +49 -0
  80. rem/models/core/core_model.py +70 -0
  81. rem/models/core/engram.py +333 -0
  82. rem/models/core/experiment.py +672 -0
  83. rem/models/core/inline_edge.py +132 -0
  84. rem/models/core/rem_query.py +246 -0
  85. rem/models/entities/__init__.py +68 -0
  86. rem/models/entities/domain_resource.py +38 -0
  87. rem/models/entities/feedback.py +123 -0
  88. rem/models/entities/file.py +57 -0
  89. rem/models/entities/image_resource.py +88 -0
  90. rem/models/entities/message.py +64 -0
  91. rem/models/entities/moment.py +123 -0
  92. rem/models/entities/ontology.py +181 -0
  93. rem/models/entities/ontology_config.py +131 -0
  94. rem/models/entities/resource.py +95 -0
  95. rem/models/entities/schema.py +87 -0
  96. rem/models/entities/session.py +84 -0
  97. rem/models/entities/shared_session.py +180 -0
  98. rem/models/entities/subscriber.py +175 -0
  99. rem/models/entities/user.py +93 -0
  100. rem/py.typed +0 -0
  101. rem/registry.py +373 -0
  102. rem/schemas/README.md +507 -0
  103. rem/schemas/__init__.py +6 -0
  104. rem/schemas/agents/README.md +92 -0
  105. rem/schemas/agents/core/agent-builder.yaml +235 -0
  106. rem/schemas/agents/core/moment-builder.yaml +178 -0
  107. rem/schemas/agents/core/rem-query-agent.yaml +226 -0
  108. rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
  109. rem/schemas/agents/core/simple-assistant.yaml +19 -0
  110. rem/schemas/agents/core/user-profile-builder.yaml +163 -0
  111. rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
  112. rem/schemas/agents/examples/contract-extractor.yaml +134 -0
  113. rem/schemas/agents/examples/cv-parser.yaml +263 -0
  114. rem/schemas/agents/examples/hello-world.yaml +37 -0
  115. rem/schemas/agents/examples/query.yaml +54 -0
  116. rem/schemas/agents/examples/simple.yaml +21 -0
  117. rem/schemas/agents/examples/test.yaml +29 -0
  118. rem/schemas/agents/rem.yaml +132 -0
  119. rem/schemas/evaluators/hello-world/default.yaml +77 -0
  120. rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
  121. rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
  122. rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
  123. rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
  124. rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
  125. rem/services/__init__.py +18 -0
  126. rem/services/audio/INTEGRATION.md +308 -0
  127. rem/services/audio/README.md +376 -0
  128. rem/services/audio/__init__.py +15 -0
  129. rem/services/audio/chunker.py +354 -0
  130. rem/services/audio/transcriber.py +259 -0
  131. rem/services/content/README.md +1269 -0
  132. rem/services/content/__init__.py +5 -0
  133. rem/services/content/providers.py +760 -0
  134. rem/services/content/service.py +762 -0
  135. rem/services/dreaming/README.md +230 -0
  136. rem/services/dreaming/__init__.py +53 -0
  137. rem/services/dreaming/affinity_service.py +322 -0
  138. rem/services/dreaming/moment_service.py +251 -0
  139. rem/services/dreaming/ontology_service.py +54 -0
  140. rem/services/dreaming/user_model_service.py +297 -0
  141. rem/services/dreaming/utils.py +39 -0
  142. rem/services/email/__init__.py +10 -0
  143. rem/services/email/service.py +522 -0
  144. rem/services/email/templates.py +360 -0
  145. rem/services/embeddings/__init__.py +11 -0
  146. rem/services/embeddings/api.py +127 -0
  147. rem/services/embeddings/worker.py +435 -0
  148. rem/services/fs/README.md +662 -0
  149. rem/services/fs/__init__.py +62 -0
  150. rem/services/fs/examples.py +206 -0
  151. rem/services/fs/examples_paths.py +204 -0
  152. rem/services/fs/git_provider.py +935 -0
  153. rem/services/fs/local_provider.py +760 -0
  154. rem/services/fs/parsing-hooks-examples.md +172 -0
  155. rem/services/fs/paths.py +276 -0
  156. rem/services/fs/provider.py +460 -0
  157. rem/services/fs/s3_provider.py +1042 -0
  158. rem/services/fs/service.py +186 -0
  159. rem/services/git/README.md +1075 -0
  160. rem/services/git/__init__.py +17 -0
  161. rem/services/git/service.py +469 -0
  162. rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
  163. rem/services/phoenix/README.md +453 -0
  164. rem/services/phoenix/__init__.py +46 -0
  165. rem/services/phoenix/client.py +960 -0
  166. rem/services/phoenix/config.py +88 -0
  167. rem/services/phoenix/prompt_labels.py +477 -0
  168. rem/services/postgres/README.md +757 -0
  169. rem/services/postgres/__init__.py +49 -0
  170. rem/services/postgres/diff_service.py +599 -0
  171. rem/services/postgres/migration_service.py +427 -0
  172. rem/services/postgres/programmable_diff_service.py +635 -0
  173. rem/services/postgres/pydantic_to_sqlalchemy.py +562 -0
  174. rem/services/postgres/register_type.py +353 -0
  175. rem/services/postgres/repository.py +481 -0
  176. rem/services/postgres/schema_generator.py +661 -0
  177. rem/services/postgres/service.py +802 -0
  178. rem/services/postgres/sql_builder.py +355 -0
  179. rem/services/rate_limit.py +113 -0
  180. rem/services/rem/README.md +318 -0
  181. rem/services/rem/__init__.py +23 -0
  182. rem/services/rem/exceptions.py +71 -0
  183. rem/services/rem/executor.py +293 -0
  184. rem/services/rem/parser.py +180 -0
  185. rem/services/rem/queries.py +196 -0
  186. rem/services/rem/query.py +371 -0
  187. rem/services/rem/service.py +608 -0
  188. rem/services/session/README.md +374 -0
  189. rem/services/session/__init__.py +13 -0
  190. rem/services/session/compression.py +488 -0
  191. rem/services/session/pydantic_messages.py +310 -0
  192. rem/services/session/reload.py +85 -0
  193. rem/services/user_service.py +130 -0
  194. rem/settings.py +1877 -0
  195. rem/sql/background_indexes.sql +52 -0
  196. rem/sql/migrations/001_install.sql +983 -0
  197. rem/sql/migrations/002_install_models.sql +3157 -0
  198. rem/sql/migrations/003_optional_extensions.sql +326 -0
  199. rem/sql/migrations/004_cache_system.sql +282 -0
  200. rem/sql/migrations/005_schema_update.sql +145 -0
  201. rem/sql/migrations/migrate_session_id_to_uuid.sql +45 -0
  202. rem/utils/AGENTIC_CHUNKING.md +597 -0
  203. rem/utils/README.md +628 -0
  204. rem/utils/__init__.py +61 -0
  205. rem/utils/agentic_chunking.py +622 -0
  206. rem/utils/batch_ops.py +343 -0
  207. rem/utils/chunking.py +108 -0
  208. rem/utils/clip_embeddings.py +276 -0
  209. rem/utils/constants.py +97 -0
  210. rem/utils/date_utils.py +228 -0
  211. rem/utils/dict_utils.py +98 -0
  212. rem/utils/embeddings.py +436 -0
  213. rem/utils/examples/embeddings_example.py +305 -0
  214. rem/utils/examples/sql_types_example.py +202 -0
  215. rem/utils/files.py +323 -0
  216. rem/utils/markdown.py +16 -0
  217. rem/utils/mime_types.py +158 -0
  218. rem/utils/model_helpers.py +492 -0
  219. rem/utils/schema_loader.py +649 -0
  220. rem/utils/sql_paths.py +146 -0
  221. rem/utils/sql_types.py +350 -0
  222. rem/utils/user_id.py +81 -0
  223. rem/utils/vision.py +325 -0
  224. rem/workers/README.md +506 -0
  225. rem/workers/__init__.py +7 -0
  226. rem/workers/db_listener.py +579 -0
  227. rem/workers/db_maintainer.py +74 -0
  228. rem/workers/dreaming.py +502 -0
  229. rem/workers/engram_processor.py +312 -0
  230. rem/workers/sqs_file_processor.py +193 -0
  231. rem/workers/unlogged_maintainer.py +463 -0
  232. remdb-0.3.242.dist-info/METADATA +1632 -0
  233. remdb-0.3.242.dist-info/RECORD +235 -0
  234. remdb-0.3.242.dist-info/WHEEL +4 -0
  235. remdb-0.3.242.dist-info/entry_points.txt +2 -0
rem/agentic/providers/phoenix.py
@@ -0,0 +1,926 @@
+ """Phoenix evaluator provider for REM agents.
+
+ This module provides factory functions for creating Phoenix-compatible evaluators
+ from schema definitions, following the same pattern as Pydantic AI agent creation.
+
+ Exported Functions:
+ ===================
+ - load_evaluator_schema: Load evaluator schemas from schemas/evaluators/
+ - create_phoenix_evaluator: Create Phoenix evaluator config from schema
+ - create_evaluator_from_schema: Create callable evaluator function
+ - schema_to_prompt: Convert schema to Phoenix openai_params format
+ - sanitize_tool_name: Sanitize tool names for Phoenix/OpenAI compatibility
+ - run_evaluation_experiment: Run complete evaluation workflow
+
+ Design Pattern (mirrors Pydantic AI provider):
+ ==============================================
+ 1. Load evaluator schemas from schemas/evaluators/ directory
+ 2. Extract system prompt, output schema, and metadata
+ 3. Create Phoenix-compatible evaluator functions
+ 4. Support both LLM-as-a-Judge and code-based evaluators
+
+ Two-Phase Evaluation Architecture:
+ ===================================
+
+ Phase 1 - Golden Set Creation:
+     SMEs create datasets with (input, reference) pairs in Phoenix
+
+ Phase 2 - Automated Evaluation:
+     Step 1: Run agents → (input, agent_output)
+     Step 2: Run evaluators → (input, agent_output, reference) → scores
+
+ Evaluator Types:
+ ================
+
+ 1. LLM-as-a-Judge (uses Claude/GPT to evaluate):
+    - Compares agent output to reference
+    - Scores on multiple dimensions (correctness, completeness, etc.)
+    - Provides explanations and suggestions
+
+ 2. Code-based (deterministic evaluation):
+    - Exact match checking
+    - Field presence validation
+    - Format compliance
+
+ Usage:
+ ======
+
+ Create evaluator from schema:
+     >>> evaluator = create_evaluator_from_schema("rem-lookup-correctness")
+     >>> result = evaluator(example)
+     >>> # Returns: {"score": 0.95, "label": "correct", "explanation": "..."}
+
+ Run evaluation experiment:
+     >>> from rem.services.phoenix import PhoenixClient
+     >>> client = PhoenixClient()
+     >>> experiment = run_evaluation_experiment(
+     ...     dataset_name="rem-lookup-golden",
+     ...     task=run_agent_task,
+     ...     evaluator_schema_path="rem-lookup-correctness",
+     ...     phoenix_client=client
+     ... )
+ """
+
+ from typing import Any, Callable, TYPE_CHECKING
+ from pathlib import Path
+ import json
+ import yaml
+
+ from loguru import logger
+
+ # Lazy import to avoid Phoenix initialization at module load time
+ if TYPE_CHECKING:
+     from phoenix.evals import LLMEvaluator
+     from phoenix.client.resources.datasets import Dataset
+     from phoenix.client.resources.experiments.types import RanExperiment
+     from rem.services.phoenix import PhoenixClient
+
+ PHOENIX_AVAILABLE = None  # Lazy check on first use
+
+
+ def _check_phoenix_available() -> bool:
+     """Lazy check if Phoenix is available (only imports when needed)."""
+     global PHOENIX_AVAILABLE
+     if PHOENIX_AVAILABLE is not None:
+         return PHOENIX_AVAILABLE
+
+     try:
+         import phoenix.evals  # noqa: F401
+         PHOENIX_AVAILABLE = True
+     except ImportError:
+         PHOENIX_AVAILABLE = False
+         logger.warning("arize-phoenix package not installed - evaluator factory unavailable")
+
+     return PHOENIX_AVAILABLE
+
+
+ def validate_evaluator_credentials(
+     model_name: str | None = None,
+ ) -> tuple[bool, str | None]:
+     """Validate that the evaluator's LLM provider has working credentials.
+
+     Performs a minimal API call to verify credentials before running experiments.
+     This prevents running expensive agent tasks only to have evaluations fail.
+
+     Args:
+         model_name: Model to validate (defaults to claude-sonnet-4-5-20250929)
+
+     Returns:
+         Tuple of (success: bool, error_message: str | None)
+         - (True, None) if credentials are valid
+         - (False, "error description") if validation fails
+
+     Example:
+         >>> success, error = validate_evaluator_credentials()
+         >>> if not success:
+         ...     print(f"Evaluator validation failed: {error}")
+         ...     return
+     """
+     if not _check_phoenix_available():
+         return False, "arize-phoenix package not installed"
+
+     from phoenix.evals import OpenAIModel, AnthropicModel
+
+     # Default model (check env var first)
+     if model_name is None:
+         import os
+         model_name = os.environ.get("EVALUATOR_MODEL", "claude-sonnet-4-5-20250929")
+
+     # Parse provider
+     if ":" in model_name:
+         provider, phoenix_model_name = model_name.split(":", 1)
+     else:
+         if model_name.startswith("claude"):
+             provider = "anthropic"
+         else:
+             provider = "openai"
+         phoenix_model_name = model_name
+
+     try:
+         # Create LLM wrapper
+         if provider.lower() == "anthropic":
+             llm = AnthropicModel(
+                 model=phoenix_model_name,
+                 temperature=0.0,
+                 top_p=None,
+             )
+         else:
+             llm = OpenAIModel(model=phoenix_model_name, temperature=0.0)
+
+         # Test with minimal prompt
+         logger.info(f"Validating evaluator credentials for {provider}:{phoenix_model_name}")
+         response = llm("Say 'ok' if you can read this.")
+
+         if response and len(response) > 0:
+             logger.info(f"Evaluator credentials validated successfully for {provider}")
+             return True, None
+         else:
+             return False, f"Empty response from {provider} model"
+
+     except Exception as e:
+         error_msg = str(e)
+         # Extract meaningful error from common API errors
+         if "credit balance is too low" in error_msg.lower():
+             return False, f"Anthropic API credits exhausted. Add credits at https://console.anthropic.com/settings/billing"
+         elif "api key" in error_msg.lower() or "authentication" in error_msg.lower():
+             return False, f"{provider.capitalize()} API key missing or invalid. Set ANTHROPIC_API_KEY or OPENAI_API_KEY environment variable."
+         elif "rate limit" in error_msg.lower():
+             return False, f"{provider.capitalize()} rate limit exceeded. Wait and retry."
+         else:
+             return False, f"{provider.capitalize()} API error: {error_msg[:200]}"
+
+
+ # =============================================================================
+ # NAME SANITIZATION
+ # =============================================================================
+
+
+ def sanitize_tool_name(tool_name: str) -> str:
+     """Sanitize tool name for Phoenix/OpenAI compatibility.
+
+     Replaces all non-alphanumeric characters with underscores to prevent
+     prompt breaking and ensure compatibility with OpenAI function calling.
+
+     Args:
+         tool_name: Original tool name (e.g., "ask_rem", "traverse-graph")
+
+     Returns:
+         Sanitized name with only alphanumeric characters and underscores
+
+     Example:
+         >>> sanitize_tool_name("ask_rem")
+         'ask_rem'
+         >>> sanitize_tool_name("traverse-graph")
+         'traverse_graph'
+         >>> sanitize_tool_name("mcp://server/tool-name")
+         'mcp___server_tool_name'
+     """
+     return "".join(c if c.isalnum() else "_" for c in tool_name)
+
+
+ # =============================================================================
+ # SCHEMA LOADING
+ # =============================================================================
+
+
+ def load_evaluator_schema(evaluator_name: str) -> dict[str, Any]:
+     """Load evaluator schema using centralized schema loader.
+
+     Uses the same unified search logic as agent schemas:
+     - "hello-world/default" → schemas/evaluators/hello-world/default.yaml
+     - "lookup-correctness" → schemas/evaluators/rem/lookup-correctness.yaml
+     - "rem-lookup-correctness" → schemas/evaluators/rem/lookup-correctness.yaml
+
+     Args:
+         evaluator_name: Evaluator name or path
+             e.g., "hello-world/default", "lookup-correctness"
+
+     Returns:
+         Evaluator schema dictionary with keys:
+         - description: System prompt for LLM evaluator
+         - properties: Output schema fields
+         - required: Required output fields
+         - labels: Optional labels for categorization
+         - version: Schema version
+
+     Raises:
+         FileNotFoundError: If evaluator schema not found
+
+     Example:
+         >>> schema = load_evaluator_schema("hello-world/default")
+         >>> print(schema["description"])
+     """
+     from ...utils.schema_loader import load_agent_schema
+
+     # Use centralized schema loader (searches evaluator paths too)
+     return load_agent_schema(evaluator_name)
+
+
+ # =============================================================================
+ # EVALUATOR CREATION
+ # =============================================================================
+
+
+ def create_phoenix_evaluator(
+     evaluator_schema: dict[str, Any],
+     model_name: str | None = None,
+ ) -> dict[str, Any]:
+     """Create Phoenix evaluator configuration from schema.
+
+     Args:
+         evaluator_schema: Evaluator schema dictionary
+         model_name: Optional LLM model to use (defaults to claude-sonnet-4-5)
+
+     Returns:
+         Evaluator config dict with:
+         - name: Evaluator name
+         - llm: Phoenix LLM wrapper
+         - prompt_template: System prompt
+         - schema: Output schema
+
+     Raises:
+         ImportError: If arize-phoenix not installed
+         KeyError: If required schema fields missing
+     """
+     if not _check_phoenix_available():
+         raise ImportError(
+             "arize-phoenix package required for evaluators. "
+             "Install with: pip install arize-phoenix"
+         )
+
+     # Import Phoenix after availability check
+     from phoenix.evals import OpenAIModel, AnthropicModel
+
+     logger.debug("Creating Phoenix evaluator from schema")
+
+     # Extract schema fields
+     evaluator_name = evaluator_schema.get("title", "UnnamedEvaluator")
+     system_prompt = evaluator_schema.get("description", "")
+     output_schema = evaluator_schema.get("properties", {})
+
+     if not system_prompt:
+         raise KeyError("evaluator_schema must contain 'description' field with system prompt")
+
+     # Default model (use Claude Sonnet 4.5 for evaluators)
+     if model_name is None:
+         import os
+         model_name = os.environ.get("EVALUATOR_MODEL", "claude-sonnet-4-5-20250929")
+         logger.debug(f"Using evaluator model: {model_name}")
+
+     logger.info(f"Creating Phoenix evaluator: {evaluator_name} with model={model_name}")
+
+     # Parse provider and model name
+     if ":" in model_name:
+         provider, phoenix_model_name = model_name.split(":", 1)
+     else:
+         # Detect provider from model name
+         if model_name.startswith("claude"):
+             provider = "anthropic"
+         else:
+             provider = "openai"
+         phoenix_model_name = model_name
+
+     # Create appropriate Phoenix LLM wrapper based on provider
+     llm: OpenAIModel | AnthropicModel
+     if provider.lower() == "anthropic":
+         # Anthropic's newer Claude models (claude-sonnet-4, claude-opus-4, etc.)
+         # don't allow both temperature and top_p to be specified together.
+         # Phoenix's AnthropicModel defaults top_p=1, so we explicitly set it
+         # to None to prevent it from being sent in the API request.
+         # The invocation_parameters() method only includes params that are not None.
+         llm = AnthropicModel(
+             model=phoenix_model_name,
+             temperature=0.0,
+             top_p=None,  # type: ignore[arg-type] - None prevents param from being sent
+         )
+     else:
+         # Default to OpenAI for other providers (gpt-4, etc.)
+         llm = OpenAIModel(model=phoenix_model_name, temperature=0.0)
+
+     # Return evaluator config (not an instance - we'll use llm_classify directly)
+     evaluator_config = {
+         "name": evaluator_name,
+         "llm": llm,
+         "prompt_template": system_prompt,
+         "schema": output_schema,
+         "labels": evaluator_schema.get("labels", []),
+         "version": evaluator_schema.get("version", "1.0.0"),
+     }
+
+     logger.info(f"Phoenix evaluator '{evaluator_name}' created successfully")
+     return evaluator_config
+
+
+ def _evaluate_expression(expression: str, context: dict[str, Any]) -> Any:
+     """Safely evaluate a simple expression with context variables.
+
+     Supports: arithmetic, comparisons, boolean logic, len()
+     """
+     try:
+         allowed_names = {
+             "len": len,
+             "True": True,
+             "False": False,
+             "true": True,
+             "false": False,
+         }
+         allowed_names.update(context)
+         return eval(expression, {"__builtins__": {}}, allowed_names)
+     except Exception as e:
+         logger.warning(f"Expression evaluation failed: {expression} - {e}")
+         return 0.0
+
+
+ def _calculate_derived_scores(
+     response_json: dict[str, Any],
+     derived_scores_config: dict[str, Any],
+ ) -> dict[str, Any]:
+     """Calculate derived scores from evaluator output using config formulas.
+
+     Supports:
+     - weighted_sum: Weighted average of fields
+     - conditional_weighted: Different formulas based on conditions
+     - boolean_logic: Boolean expression evaluation
+     """
+     for score_name, score_config in derived_scores_config.items():
+         score_type = score_config.get("type")
+
+         if score_type == "weighted_sum":
+             weights = score_config.get("weights", {})
+             total = 0.0
+             for field, weight in weights.items():
+                 field_value = response_json.get(field, 0.0)
+                 if isinstance(field_value, (int, float)):
+                     total += field_value * weight
+             response_json[score_name] = total
+
+         elif score_type == "conditional_weighted":
+             conditions = score_config.get("conditions", [])
+             formula_to_use = None
+             for cond_config in conditions:
+                 condition = cond_config.get("condition")
+                 if condition is None:
+                     formula_to_use = cond_config.get("formula")
+                     break
+                 field = condition.get("field")
+                 operator = condition.get("operator")
+                 value = condition.get("value")
+                 field_value = response_json.get(field, 0.0)
+                 condition_met = False
+                 if operator == ">=":
+                     condition_met = field_value >= value
+                 elif operator == ">":
+                     condition_met = field_value > value
+                 elif operator == "<=":
+                     condition_met = field_value <= value
+                 elif operator == "<":
+                     condition_met = field_value < value
+                 elif operator == "==":
+                     condition_met = field_value == value
+                 elif operator == "!=":
+                     condition_met = field_value != value
+                 if condition_met:
+                     formula_to_use = cond_config.get("formula")
+                     break
+             if formula_to_use and formula_to_use.get("type") == "weighted_sum":
+                 weights = formula_to_use.get("weights", {})
+                 total = 0.0
+                 for field, weight in weights.items():
+                     field_value = response_json.get(field, 0.0)
+                     if isinstance(field_value, (int, float)):
+                         total += field_value * weight
+                 response_json[score_name] = total
+
+         elif score_type == "boolean_logic":
+             expression = score_config.get("expression", "")
+             result = _evaluate_expression(expression, response_json)
+             response_json[score_name] = result
+
+     return response_json
+
+
+ def _create_phoenix_evaluations(
+     response_json: dict[str, Any],
+     evaluations_config: list[dict[str, Any]],
+ ) -> list[dict[str, Any]]:
+     """Create Phoenix evaluation dicts from evaluator output using config.
+
+     Each evaluation becomes a column in Phoenix UI with name, label, score, explanation.
+     """
+     evaluations = []
+     for eval_config in evaluations_config:
+         eval_name = eval_config.get("name", "unnamed")
+         score_field = eval_config.get("score_field")
+         score_expression = eval_config.get("score_expression")
+         label_field = eval_config.get("label_field")
+         label_expression = eval_config.get("label_expression")
+         label_logic = eval_config.get("label_logic", [])
+         label_transform = eval_config.get("label_transform", {})
+         score_logic = eval_config.get("score_logic", {})
+         explanation_field = eval_config.get("explanation_field")
+
+         evaluation = {"name": eval_name}
+
+         # Get score
+         if score_expression:
+             evaluation["score"] = _evaluate_expression(score_expression, response_json)
+         elif score_field:
+             evaluation["score"] = response_json.get(score_field, 0.0)
+         elif score_logic and label_field:
+             label_value = response_json.get(label_field)
+             if isinstance(label_value, bool):
+                 label_value = "true" if label_value else "false"
+             evaluation["score"] = score_logic.get(str(label_value), 0.0)
+         else:
+             evaluation["score"] = None
+
+         # Get label
+         if label_expression:
+             evaluation["label"] = str(_evaluate_expression(label_expression, response_json))
+         elif label_field:
+             label_value = response_json.get(label_field)
+             if isinstance(label_value, bool):
+                 label_value = "true" if label_value else "false"
+             if label_transform:
+                 evaluation["label"] = label_transform.get(str(label_value), str(label_value))
+             else:
+                 evaluation["label"] = str(label_value)
+         elif label_logic and (score_field or score_expression):
+             score_value = evaluation.get("score", 0.0)
+             label = "unknown"
+             for logic in label_logic:
+                 threshold = logic.get("threshold", 0.0)
+                 operator = logic.get("operator", ">=")
+                 if operator == ">=" and score_value >= threshold:
+                     label = logic.get("label", "unknown")
+                     break
+                 elif operator == ">" and score_value > threshold:
+                     label = logic.get("label", "unknown")
+                     break
+             evaluation["label"] = label
+         else:
+             evaluation["label"] = None
+
+         # Get explanation
+         if explanation_field:
+             explanation_value = response_json.get(explanation_field, "")
+             if isinstance(explanation_value, list):
+                 evaluation["explanation"] = ", ".join(str(x) for x in explanation_value) if explanation_value else "None"
+             else:
+                 evaluation["explanation"] = str(explanation_value)
+         else:
+             evaluation["explanation"] = None
+
+         evaluations.append(evaluation)
+     return evaluations
+
+
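For reference, a minimal sketch of the `phoenix_config` shape these two helpers consume. This is not part of the package; the field names and weights below are illustrative, not taken from the shipped evaluator schemas:

```python
# Illustrative only: made-up fields showing the derived_scores / evaluations config shape.
from rem.agentic.providers.phoenix import (
    _calculate_derived_scores,
    _create_phoenix_evaluations,
)

response_json = {"correctness": 0.9, "completeness": 0.7, "evaluation_notes": "Missing one field."}

derived_scores_config = {
    "overall_score": {
        "type": "weighted_sum",
        "weights": {"correctness": 0.6, "completeness": 0.4},
    }
}

evaluations_config = [
    {
        "name": "overall",
        "score_field": "overall_score",
        "label_logic": [
            {"operator": ">=", "threshold": 0.8, "label": "pass"},
            {"operator": ">=", "threshold": 0.0, "label": "fail"},
        ],
        "explanation_field": "evaluation_notes",
    }
]

response_json = _calculate_derived_scores(response_json, derived_scores_config)
print(_create_phoenix_evaluations(response_json, evaluations_config))
# ≈ [{'name': 'overall', 'score': 0.82, 'label': 'pass', 'explanation': 'Missing one field.'}]
```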
+ def create_evaluator_from_schema(
+     evaluator_schema_path: str | Path | dict[str, Any],
+     model_name: str | None = None,
+ ) -> Callable[[Any], Any]:
+     """Create an evaluator function from a schema file or dict.
+
+     Uses direct LLM call with JSON schema for structured output evaluation.
+     Supports phoenix_config for derived scores and evaluation column mappings.
+
+     Args:
+         evaluator_schema_path: Path to schema file, evaluator name, or schema dict
+         model_name: Optional LLM model to use for evaluation
+
+     Returns:
+         Evaluator function compatible with Phoenix experiments
+
+     Raises:
+         FileNotFoundError: If schema file not found
+         ImportError: If arize-phoenix not installed
+
+     Example:
+         >>> evaluator = create_evaluator_from_schema("rem-lookup-correctness")
+         >>> result = evaluator(input={...}, output={...}, expected={...})
+         >>> # Returns: {"score": 0.95, "label": "...", "explanation": "..."}
+     """
+     if not _check_phoenix_available():
+         raise ImportError(
+             "arize-phoenix package required for evaluators. "
+             "Install with: pip install arize-phoenix"
+         )
+
+     # Load schema if path/name provided
+     if isinstance(evaluator_schema_path, (str, Path)):
+         schema_path = Path(evaluator_schema_path)
+         if schema_path.exists():
+             logger.debug(f"Loading evaluator schema from {schema_path}")
+             if schema_path.suffix in [".yaml", ".yml"]:
+                 with open(schema_path) as f:
+                     schema = yaml.safe_load(f)
+             else:
+                 with open(schema_path) as f:
+                     schema = json.load(f)
+         else:
+             schema = load_evaluator_schema(str(evaluator_schema_path))
+     else:
+         schema = evaluator_schema_path
+
+     # Extract schema components
+     output_schema = schema.get("properties", {})
+
+     # Extract phoenix_config for derived scores and evaluations
+     phoenix_config = schema.get("phoenix_config", {})
+     derived_scores_config = phoenix_config.get("derived_scores", {})
+     evaluations_config = phoenix_config.get("evaluations", [])
+
+     # Create evaluator config (LLM wrapper, prompt, etc.)
+     evaluator_config = create_phoenix_evaluator(
+         evaluator_schema=schema,
+         model_name=model_name,
+     )
+
+     import re
+
+     def evaluator_fn(input: dict[str, Any], output: dict[str, Any], expected: dict[str, Any]) -> dict[str, Any]:
+         """Evaluate using Phoenix's named parameter binding with structured LLM output.
+
+         Phoenix automatically binds these parameters:
+         - input: Dataset input dict
+         - output: Task's return value (agent output)
+         - expected: Expected output dict (reference/ground truth)
+
+         Returns:
+             Phoenix evaluation dict with score, label, and explanation (the "overall" result)
+         """
+         logger.debug("Evaluating with structured output pattern")
+
+         # Extract question from input
+         if isinstance(input, dict):
+             question = input.get("input", input.get("text", str(input)))
+         else:
+             question = str(input)
+
+         # Serialize agent output
+         if isinstance(output, dict):
+             output_str = json.dumps(output, indent=2)
+         else:
+             output_str = str(output)
+
+         # Get reference from expected
+         if isinstance(expected, dict):
+             reference = expected.get("reference", expected.get("expected_output",
+                                      expected.get("ground_truth", str(expected))))
+         else:
+             reference = str(expected)
+
+         try:
+             # Build user message
+             user_message = f"""Question/Input: {question}
+
+ Agent's Answer:
+ {output_str}
+
+ Expected Answer (Reference):
+ {reference}
+
+ Please evaluate the agent's answer according to the evaluation criteria."""
+
+             # Add JSON schema requirement to system prompt
+             system_prompt = evaluator_config["prompt_template"]
+             schema_instruction = f"\n\nYou MUST respond with valid JSON matching this schema:\n{json.dumps(output_schema, indent=2)}\n\nProvide ONLY the JSON response, no markdown code blocks or extra text."
+             system_with_schema = system_prompt + schema_instruction
+
+             # Phoenix LLM models expect a single prompt string
+             llm = evaluator_config["llm"]
+             full_prompt = f"{system_with_schema}\n\n{user_message}"
+             response_text = llm(full_prompt)
+
+             # Parse JSON response
+             try:
+                 response_json = json.loads(response_text)
+             except json.JSONDecodeError:
+                 json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', response_text, re.DOTALL)
+                 if json_match:
+                     response_json = json.loads(json_match.group(1))
+                 else:
+                     raise ValueError(f"Could not parse JSON from LLM response: {response_text[:200]}")
+
+             logger.debug(f"LLM response parsed: {list(response_json.keys())}")
+
+             # Calculate derived scores using config
+             if derived_scores_config:
+                 logger.debug(f"Calculating {len(derived_scores_config)} derived scores")
+                 response_json = _calculate_derived_scores(response_json, derived_scores_config)
+
+             # Create Phoenix evaluations using config
+             if evaluations_config:
+                 logger.debug(f"Creating {len(evaluations_config)} Phoenix evaluations")
+                 evaluations = _create_phoenix_evaluations(response_json, evaluations_config)
+             else:
+                 # Fallback: create evaluations from all numeric/boolean fields
+                 logger.warning("No evaluations_config - creating default evaluations from schema")
+                 evaluations = []
+                 for field_name, field_value in response_json.items():
+                     # Check bool before int/float: bool is a subclass of int, so the
+                     # numeric branch would otherwise swallow boolean fields.
+                     if isinstance(field_value, bool):
+                         evaluations.append({
+                             "name": field_name,
+                             "score": 1.0 if field_value else 0.0,
+                             "label": "pass" if field_value else "fail",
+                             "explanation": None
+                         })
+                     elif isinstance(field_value, (int, float)):
+                         evaluations.append({
+                             "name": field_name,
+                             "score": float(field_value),
+                             "label": "good" if field_value >= 0.5 else "poor",
+                             "explanation": None
+                         })
+
+             # Always add overall if not present
+             if not any(e["name"] == "overall" for e in evaluations):
+                 overall_score = response_json.get("overall_score", 0.0)
+                 overall_pass = response_json.get("pass", False)
+                 evaluations.append({
+                     "name": "overall",
+                     "score": overall_score if isinstance(overall_score, (int, float)) else 0.0,
+                     "label": "pass" if overall_pass else "fail",
+                     "explanation": response_json.get("evaluation_notes", None)
+                 })
+
+             logger.debug(f"Created {len(evaluations)} evaluations")
+
+             # Phoenix client expects a dict with score, label, explanation
+             # (not the old EvaluationResult class)
+             overall_eval = next(
+                 (e for e in evaluations if e["name"] == "overall"),
+                 {"score": 0.0, "label": "unknown", "explanation": None}
+             )
+
+             return {
+                 "score": overall_eval.get("score", 0.0),
+                 "label": overall_eval.get("label", "unknown"),
+                 "explanation": overall_eval.get("explanation"),
+             }
+
+         except Exception as e:
+             logger.error(f"Evaluator error: {e}")
+             return {
+                 "score": 0.0,
+                 "label": "error",
+                 "explanation": f"Evaluator failed: {str(e)}",
+             }
+
+     return evaluator_fn
+
+
+ def schema_to_prompt(
+     schema: dict[str, Any],
+     schema_type: str = "evaluator",
+     model_name: str = "gpt-4.1",
+ ) -> dict[str, Any]:
+     """Convert agent or evaluator schema to complete Phoenix openai_params.
+
+     Converts REM schema format to Phoenix PromptVersion.from_openai() format,
+     including messages, response_format, and tools (for agents).
+
+     Args:
+         schema: Schema dictionary (from load_evaluator_schema or agent schema)
+         schema_type: Type of schema - "agent" or "evaluator"
+         model_name: Model name for the prompt
+
+     Returns:
+         Complete openai_params dict ready for PromptVersion.from_openai()
+         Contains: model, messages, response_format, tools (for agents)
+
+     Example:
+         >>> schema = load_evaluator_schema("rem-lookup-correctness")
+         >>> openai_params = schema_to_prompt(schema, schema_type="evaluator")
+         >>> # Use with Phoenix: PromptVersion.from_openai(openai_params)
+     """
+     system_prompt = schema.get("description", "")
+     properties = schema.get("properties", {})
+     required = schema.get("required", [])
+
+     # Extract tool definitions and convert to OpenAI format (for agents)
+     tool_definitions = []  # For metadata YAML
+     openai_tools = []  # For Phoenix tools parameter
+
+     if schema_type == "agent":
+         json_schema_extra = schema.get("json_schema_extra", {})
+         tools = json_schema_extra.get("tools", [])
+
+         for tool in tools:
+             # Keep metadata format for YAML section
+             tool_def = {
+                 "mcp_server": tool.get("mcp_server"),
+                 "tool_name": tool.get("tool_name"),
+                 "usage": tool.get("usage", ""),
+             }
+             tool_definitions.append(tool_def)
+
+             # Convert to OpenAI function calling format
+             # Sanitize tool name to prevent prompt breaking
+             tool_name = tool.get("tool_name", "")
+             sanitized_name = sanitize_tool_name(tool_name)
+
+             openai_tool = {
+                 "type": "function",
+                 "function": {
+                     "name": sanitized_name,
+                     "description": tool.get("usage", "MCP tool"),
+                     "parameters": {
+                         "type": "object",
+                         "properties": {},
+                         "required": []
+                     }
+                 }
+             }
+             openai_tools.append(openai_tool)
+
+     # Build schema metadata section
+     info_key = "agent_info" if schema_type == "agent" else "evaluator_info"
+     schema_metadata = {
+         info_key: {
+             "version": schema.get("version", "1.0.0"),
+             "title": schema.get("title", ""),
+         },
+         "output_schema": {
+             "description": f"Structured output returned by this {schema_type}",
+             "properties": {
+                 k: {
+                     "type": v.get("type", "unknown"),
+                     "description": v.get("description", ""),
+                 }
+                 for k, v in properties.items()
+             },
+             "required": required,
+         },
+     }
+
+     # Add tool definitions for agents
+     if tool_definitions:
+         schema_metadata["tools"] = {
+             "description": "MCP tools available to this agent",
+             "tool_definitions": tool_definitions,
+         }
+
+     # Add input format for evaluators
+     if schema_type == "evaluator":
+         schema_metadata["input_format"] = {
+             "description": "Evaluators receive dataset examples with 'input' and 'output' fields",
+             "structure": {
+                 "input": "dict[str, Any] - What the agent receives (e.g., {'query': '...'})",
+                 "output": "dict[str, Any] - Expected/ground truth (e.g., {'label': '...'})",
+                 "metadata": "dict[str, Any] - Optional metadata (e.g., {'difficulty': 'medium'})",
+             },
+         }
+
+     # Append schema metadata to system prompt
+     schema_yaml = yaml.dump(schema_metadata, default_flow_style=False, sort_keys=False)
+     schema_section = f"\n\n---\n\n## Schema Metadata\n\n```yaml\n{schema_yaml}```"
+     system_prompt = system_prompt + schema_section
+
+     # Create structured template
+     user_content = "{{input}}" if schema_type == "agent" else "Question: {{input}}\nAgent's Answer: {{output}}"
+
+     template_messages = [
+         {"role": "system", "content": system_prompt},
+         {"role": "user", "content": user_content}
+     ]
+
+     # Build response format
+     response_format = {
+         "type": "json_schema",
+         "json_schema": {
+             "name": schema.get("title", ""),
+             "schema": {
+                 "type": "object",
+                 "properties": properties,
+                 "required": required,
+                 "additionalProperties": False
+             },
+             "strict": True
+         }
+     }
+
+     # Build complete openai_params dict ready for PromptVersion.from_openai()
+     openai_params: dict[str, Any] = {
+         "model": model_name,
+         "messages": template_messages,
+         "response_format": response_format,
+     }
+
+     # Add tools for agents (OpenAI function calling format)
+     if openai_tools:
+         openai_params["tools"] = openai_tools
+
+     return openai_params
+
+
+ # =============================================================================
+ # EXPERIMENT WORKFLOWS
+ # =============================================================================
+
+
+ def run_evaluation_experiment(
+     dataset_name: str,
+     task: Callable[[Any], Any] | None = None,
+     evaluator_schema_path: str | Path | dict[str, Any] | None = None,
+     experiment_name: str | None = None,
+     experiment_description: str | None = None,
+     phoenix_client: "PhoenixClient | None" = None,
+     model_name: str | None = None,
+ ) -> "RanExperiment":
+     """Run a complete evaluation experiment using Phoenix.
+
+     High-level workflow that:
+     1. Loads dataset from Phoenix
+     2. Optionally runs task (agent) on dataset
+     3. Optionally runs evaluators on results
+     4. Tracks results in Phoenix UI
+
+     Args:
+         dataset_name: Name of dataset in Phoenix
+         task: Optional task function (agent) to run on dataset
+         evaluator_schema_path: Optional evaluator schema path/name/dict
+         experiment_name: Name for this experiment
+         experiment_description: Description of experiment
+         phoenix_client: Optional PhoenixClient (auto-creates if not provided)
+         model_name: LLM model for evaluation
+
+     Returns:
+         RanExperiment with results and metrics
+
+     Example - Agent Run Only:
+         >>> experiment = run_evaluation_experiment(
+         ...     dataset_name="rem-lookup-golden",
+         ...     task=run_agent_task,
+         ...     experiment_name="rem-v1-baseline"
+         ... )
+
+     Example - Agent + Evaluator:
+         >>> experiment = run_evaluation_experiment(
+         ...     dataset_name="rem-lookup-golden",
+         ...     task=run_agent_task,
+         ...     evaluator_schema_path="rem-lookup-correctness",
+         ...     experiment_name="rem-v1-full-eval"
+         ... )
+
+     Example - Evaluator Only (on existing results):
+         >>> experiment = run_evaluation_experiment(
+         ...     dataset_name="rem-v1-results",
+         ...     evaluator_schema_path="rem-lookup-correctness",
+         ...     experiment_name="rem-v1-scoring"
+         ... )
+     """
+     # Create Phoenix client if not provided
+     if phoenix_client is None:
+         from rem.services.phoenix import PhoenixClient
+         phoenix_client = PhoenixClient()
+
+     # Load dataset
+     logger.info(f"Loading dataset: {dataset_name}")
+     dataset = phoenix_client.get_dataset(dataset_name)
+
+     # Create evaluator if schema provided
+     evaluators = []
+     if evaluator_schema_path:
+         logger.info(f"Creating evaluator from schema: {evaluator_schema_path}")
+         evaluator = create_evaluator_from_schema(
+             evaluator_schema_path=evaluator_schema_path,
+             model_name=model_name,
+         )
+         evaluators.append(evaluator)
+
+     # Run experiment
+     logger.info(f"Running experiment: {experiment_name or 'unnamed'}")
+     experiment = phoenix_client.run_experiment(
+         dataset=dataset,
+         task=task,
+         evaluators=evaluators if evaluators else None,
+         experiment_name=experiment_name,
+         experiment_description=experiment_description,
+     )
+
+     logger.success(
+         f"Experiment complete. View results: {experiment.url if hasattr(experiment, 'url') else 'N/A'}"  # type: ignore[attr-defined]
+     )
+
+     return experiment
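Taken together, the docstrings above suggest a workflow along these lines. A minimal sketch, not shipped with the package: `run_agent_task` is a placeholder for your own task function, and the dataset and evaluator names are the ones used in the docstring examples:

```python
# Sketch based on the docstrings above; run_agent_task and the dataset name
# "rem-lookup-golden" are placeholders you would replace with your own.
from rem.agentic.providers.phoenix import (
    run_evaluation_experiment,
    validate_evaluator_credentials,
)
from rem.services.phoenix import PhoenixClient


def run_agent_task(example):
    # Placeholder task: call your agent here and return its output dict.
    return {"answer": "..."}


# Fail fast if the evaluator's LLM credentials are missing or exhausted.
ok, error = validate_evaluator_credentials()
if not ok:
    raise SystemExit(f"Evaluator validation failed: {error}")

experiment = run_evaluation_experiment(
    dataset_name="rem-lookup-golden",
    task=run_agent_task,
    evaluator_schema_path="rem-lookup-correctness",
    experiment_name="rem-v1-full-eval",
    phoenix_client=PhoenixClient(),
)
```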