remdb 0.3.7__py3-none-any.whl → 0.3.133__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rem/__init__.py +129 -2
- rem/agentic/README.md +76 -0
- rem/agentic/__init__.py +15 -0
- rem/agentic/agents/__init__.py +16 -2
- rem/agentic/agents/sse_simulator.py +502 -0
- rem/agentic/context.py +51 -25
- rem/agentic/llm_provider_models.py +301 -0
- rem/agentic/mcp/tool_wrapper.py +112 -17
- rem/agentic/otel/setup.py +93 -4
- rem/agentic/providers/phoenix.py +314 -132
- rem/agentic/providers/pydantic_ai.py +215 -26
- rem/agentic/schema.py +361 -21
- rem/agentic/tools/rem_tools.py +3 -3
- rem/api/README.md +238 -1
- rem/api/deps.py +255 -0
- rem/api/main.py +154 -37
- rem/api/mcp_router/resources.py +1 -1
- rem/api/mcp_router/server.py +26 -5
- rem/api/mcp_router/tools.py +465 -7
- rem/api/middleware/tracking.py +172 -0
- rem/api/routers/admin.py +494 -0
- rem/api/routers/auth.py +124 -0
- rem/api/routers/chat/completions.py +402 -20
- rem/api/routers/chat/models.py +88 -10
- rem/api/routers/chat/otel_utils.py +33 -0
- rem/api/routers/chat/sse_events.py +542 -0
- rem/api/routers/chat/streaming.py +642 -45
- rem/api/routers/dev.py +81 -0
- rem/api/routers/feedback.py +268 -0
- rem/api/routers/messages.py +473 -0
- rem/api/routers/models.py +78 -0
- rem/api/routers/query.py +360 -0
- rem/api/routers/shared_sessions.py +406 -0
- rem/auth/middleware.py +126 -27
- rem/cli/commands/README.md +237 -64
- rem/cli/commands/ask.py +13 -10
- rem/cli/commands/cluster.py +1808 -0
- rem/cli/commands/configure.py +5 -6
- rem/cli/commands/db.py +396 -139
- rem/cli/commands/experiments.py +469 -74
- rem/cli/commands/process.py +22 -15
- rem/cli/commands/scaffold.py +47 -0
- rem/cli/commands/schema.py +97 -50
- rem/cli/main.py +29 -6
- rem/config.py +10 -3
- rem/models/core/core_model.py +7 -1
- rem/models/core/experiment.py +54 -0
- rem/models/core/rem_query.py +5 -2
- rem/models/entities/__init__.py +21 -0
- rem/models/entities/domain_resource.py +38 -0
- rem/models/entities/feedback.py +123 -0
- rem/models/entities/message.py +30 -1
- rem/models/entities/session.py +83 -0
- rem/models/entities/shared_session.py +180 -0
- rem/models/entities/user.py +10 -3
- rem/registry.py +373 -0
- rem/schemas/agents/rem.yaml +7 -3
- rem/services/content/providers.py +92 -133
- rem/services/content/service.py +92 -20
- rem/services/dreaming/affinity_service.py +2 -16
- rem/services/dreaming/moment_service.py +2 -15
- rem/services/embeddings/api.py +24 -17
- rem/services/embeddings/worker.py +16 -16
- rem/services/phoenix/EXPERIMENT_DESIGN.md +3 -3
- rem/services/phoenix/client.py +302 -28
- rem/services/postgres/README.md +159 -15
- rem/services/postgres/__init__.py +2 -1
- rem/services/postgres/diff_service.py +531 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +427 -129
- rem/services/postgres/repository.py +132 -0
- rem/services/postgres/schema_generator.py +291 -9
- rem/services/postgres/service.py +6 -6
- rem/services/rate_limit.py +113 -0
- rem/services/rem/README.md +14 -0
- rem/services/rem/parser.py +44 -9
- rem/services/rem/service.py +36 -2
- rem/services/session/compression.py +24 -1
- rem/services/session/reload.py +1 -1
- rem/services/user_service.py +98 -0
- rem/settings.py +399 -29
- rem/sql/background_indexes.sql +21 -16
- rem/sql/migrations/001_install.sql +387 -54
- rem/sql/migrations/002_install_models.sql +2320 -393
- rem/sql/migrations/003_optional_extensions.sql +326 -0
- rem/sql/migrations/004_cache_system.sql +548 -0
- rem/utils/__init__.py +18 -0
- rem/utils/constants.py +97 -0
- rem/utils/date_utils.py +228 -0
- rem/utils/embeddings.py +17 -4
- rem/utils/files.py +167 -0
- rem/utils/mime_types.py +158 -0
- rem/utils/model_helpers.py +156 -1
- rem/utils/schema_loader.py +282 -35
- rem/utils/sql_paths.py +146 -0
- rem/utils/sql_types.py +3 -1
- rem/utils/vision.py +9 -14
- rem/workers/README.md +14 -14
- rem/workers/__init__.py +3 -1
- rem/workers/db_listener.py +579 -0
- rem/workers/db_maintainer.py +74 -0
- rem/workers/unlogged_maintainer.py +463 -0
- {remdb-0.3.7.dist-info → remdb-0.3.133.dist-info}/METADATA +460 -303
- {remdb-0.3.7.dist-info → remdb-0.3.133.dist-info}/RECORD +105 -74
- {remdb-0.3.7.dist-info → remdb-0.3.133.dist-info}/WHEEL +1 -1
- rem/sql/002_install_models.sql +0 -1068
- rem/sql/install_models.sql +0 -1038
- {remdb-0.3.7.dist-info → remdb-0.3.133.dist-info}/entry_points.txt +0 -0
rem/agentic/providers/phoenix.py
CHANGED
```diff
@@ -128,15 +128,16 @@ def sanitize_tool_name(tool_name: str) -> str:
 
 
 def load_evaluator_schema(evaluator_name: str) -> dict[str, Any]:
-    """Load evaluator schema
+    """Load evaluator schema using centralized schema loader.
 
-
-
+    Uses the same unified search logic as agent schemas:
+    - "hello-world/default" → schemas/evaluators/hello-world/default.yaml
+    - "lookup-correctness" → schemas/evaluators/rem/lookup-correctness.yaml
+    - "rem-lookup-correctness" → schemas/evaluators/rem/lookup-correctness.yaml
 
     Args:
-        evaluator_name: Evaluator name
-            e.g., "
-            "rem-lookup-correctness.yaml"
+        evaluator_name: Evaluator name or path
+            e.g., "hello-world/default", "lookup-correctness"
 
     Returns:
         Evaluator schema dictionary with keys:
@@ -150,43 +151,13 @@ def load_evaluator_schema(evaluator_name: str) -> dict[str, Any]:
         FileNotFoundError: If evaluator schema not found
 
     Example:
-        >>> schema = load_evaluator_schema("
+        >>> schema = load_evaluator_schema("hello-world/default")
         >>> print(schema["description"])
     """
-
-
-    #
-
-    rem_module_dir = Path(rem.__file__).parent  # rem/src/rem
-    rem_package_root = rem_module_dir.parent.parent  # rem/src/rem -> rem/src -> rem
-    schema_dir = rem_package_root / "schemas" / "evaluators"
-
-    # Try .yaml first (preferred format)
-    yaml_path = schema_dir / f"{evaluator_name}.yaml"
-    if yaml_path.exists():
-        logger.debug(f"Loading evaluator schema from {yaml_path}")
-        with open(yaml_path) as f:
-            return yaml.safe_load(f)
-
-    # Try .yml
-    yml_path = schema_dir / f"{evaluator_name}.yml"
-    if yml_path.exists():
-        logger.debug(f"Loading evaluator schema from {yml_path}")
-        with open(yml_path) as f:
-            return yaml.safe_load(f)
-
-    # Try .json
-    json_path = schema_dir / f"{evaluator_name}.json"
-    if json_path.exists():
-        logger.debug(f"Loading evaluator schema from {json_path}")
-        with open(json_path) as f:
-            return json.load(f)
-
-    raise FileNotFoundError(
-        f"Evaluator schema not found: {evaluator_name}\n"
-        f"Searched in: {schema_dir}\n"
-        f"Supported formats: .yaml, .yml, .json"
-    )
+    from ...utils.schema_loader import load_agent_schema
+
+    # Use centralized schema loader (searches evaluator paths too)
+    return load_agent_schema(evaluator_name)
 
 
 # =============================================================================
@@ -255,10 +226,15 @@ def create_phoenix_evaluator(
     # Create appropriate Phoenix LLM wrapper based on provider
     llm: OpenAIModel | AnthropicModel
     if provider.lower() == "anthropic":
-        # Anthropic models
+        # Anthropic's newer Claude models (claude-sonnet-4, claude-opus-4, etc.)
+        # don't allow both temperature and top_p to be specified together.
+        # Phoenix's AnthropicModel defaults top_p=1, so we explicitly set it
+        # to None to prevent it from being sent in the API request.
+        # The invocation_parameters() method only includes params that are not None.
         llm = AnthropicModel(
            model=phoenix_model_name,
            temperature=0.0,
+           top_p=None,  # type: ignore[arg-type] - None prevents param from being sent
        )
     else:
        # Default to OpenAI for other providers (gpt-4, etc.)
@@ -278,13 +254,178 @@ def create_evaluator_from_schema(
     return evaluator_config
 
 
+def _evaluate_expression(expression: str, context: dict[str, Any]) -> Any:
+    """Safely evaluate a simple expression with context variables.
+
+    Supports: arithmetic, comparisons, boolean logic, len()
+    """
+    try:
+        allowed_names = {
+            "len": len,
+            "True": True,
+            "False": False,
+            "true": True,
+            "false": False,
+        }
+        allowed_names.update(context)
+        return eval(expression, {"__builtins__": {}}, allowed_names)
+    except Exception as e:
+        logger.warning(f"Expression evaluation failed: {expression} - {e}")
+        return 0.0
+
+
+def _calculate_derived_scores(
+    response_json: dict[str, Any],
+    derived_scores_config: dict[str, Any],
+) -> dict[str, Any]:
+    """Calculate derived scores from evaluator output using config formulas.
+
+    Supports:
+    - weighted_sum: Weighted average of fields
+    - conditional_weighted: Different formulas based on conditions
+    - boolean_logic: Boolean expression evaluation
+    """
+    for score_name, score_config in derived_scores_config.items():
+        score_type = score_config.get("type")
+
+        if score_type == "weighted_sum":
+            weights = score_config.get("weights", {})
+            total = 0.0
+            for field, weight in weights.items():
+                field_value = response_json.get(field, 0.0)
+                if isinstance(field_value, (int, float)):
+                    total += field_value * weight
+            response_json[score_name] = total
+
+        elif score_type == "conditional_weighted":
+            conditions = score_config.get("conditions", [])
+            formula_to_use = None
+            for cond_config in conditions:
+                condition = cond_config.get("condition")
+                if condition is None:
+                    formula_to_use = cond_config.get("formula")
+                    break
+                field = condition.get("field")
+                operator = condition.get("operator")
+                value = condition.get("value")
+                field_value = response_json.get(field, 0.0)
+                condition_met = False
+                if operator == ">=":
+                    condition_met = field_value >= value
+                elif operator == ">":
+                    condition_met = field_value > value
+                elif operator == "<=":
+                    condition_met = field_value <= value
+                elif operator == "<":
+                    condition_met = field_value < value
+                elif operator == "==":
+                    condition_met = field_value == value
+                elif operator == "!=":
+                    condition_met = field_value != value
+                if condition_met:
+                    formula_to_use = cond_config.get("formula")
+                    break
+            if formula_to_use and formula_to_use.get("type") == "weighted_sum":
+                weights = formula_to_use.get("weights", {})
+                total = 0.0
+                for field, weight in weights.items():
+                    field_value = response_json.get(field, 0.0)
+                    if isinstance(field_value, (int, float)):
+                        total += field_value * weight
+                response_json[score_name] = total
+
+        elif score_type == "boolean_logic":
+            expression = score_config.get("expression", "")
+            result = _evaluate_expression(expression, response_json)
+            response_json[score_name] = result
+
+    return response_json
+
+
+def _create_phoenix_evaluations(
+    response_json: dict[str, Any],
+    evaluations_config: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+    """Create Phoenix evaluation dicts from evaluator output using config.
+
+    Each evaluation becomes a column in Phoenix UI with name, label, score, explanation.
+    """
+    evaluations = []
+    for eval_config in evaluations_config:
+        eval_name = eval_config.get("name", "unnamed")
+        score_field = eval_config.get("score_field")
+        score_expression = eval_config.get("score_expression")
+        label_field = eval_config.get("label_field")
+        label_expression = eval_config.get("label_expression")
+        label_logic = eval_config.get("label_logic", [])
+        label_transform = eval_config.get("label_transform", {})
+        score_logic = eval_config.get("score_logic", {})
+        explanation_field = eval_config.get("explanation_field")
+
+        evaluation = {"name": eval_name}
+
+        # Get score
+        if score_expression:
+            evaluation["score"] = _evaluate_expression(score_expression, response_json)
+        elif score_field:
+            evaluation["score"] = response_json.get(score_field, 0.0)
+        elif score_logic and label_field:
+            label_value = response_json.get(label_field)
+            if isinstance(label_value, bool):
+                label_value = "true" if label_value else "false"
+            evaluation["score"] = score_logic.get(str(label_value), 0.0)
+        else:
+            evaluation["score"] = None
+
+        # Get label
+        if label_expression:
+            evaluation["label"] = str(_evaluate_expression(label_expression, response_json))
+        elif label_field:
+            label_value = response_json.get(label_field)
+            if isinstance(label_value, bool):
+                label_value = "true" if label_value else "false"
+            if label_transform:
+                evaluation["label"] = label_transform.get(str(label_value), str(label_value))
+            else:
+                evaluation["label"] = str(label_value)
+        elif label_logic and (score_field or score_expression):
+            score_value = evaluation.get("score", 0.0)
+            label = "unknown"
+            for logic in label_logic:
+                threshold = logic.get("threshold", 0.0)
+                operator = logic.get("operator", ">=")
+                if operator == ">=" and score_value >= threshold:
+                    label = logic.get("label", "unknown")
+                    break
+                elif operator == ">" and score_value > threshold:
+                    label = logic.get("label", "unknown")
+                    break
+            evaluation["label"] = label
+        else:
+            evaluation["label"] = None
+
+        # Get explanation
+        if explanation_field:
+            explanation_value = response_json.get(explanation_field, "")
+            if isinstance(explanation_value, list):
+                evaluation["explanation"] = ", ".join(str(x) for x in explanation_value) if explanation_value else "None"
+            else:
+                evaluation["explanation"] = str(explanation_value)
+        else:
+            evaluation["explanation"] = None
+
+        evaluations.append(evaluation)
+    return evaluations
+
+
 def create_evaluator_from_schema(
     evaluator_schema_path: str | Path | dict[str, Any],
     model_name: str | None = None,
 ) -> Callable[[Any], Any]:
     """Create an evaluator function from a schema file or dict.
 
-
+    Uses direct LLM call with JSON schema for structured output evaluation.
+    Supports phoenix_config for derived scores and evaluation column mappings.
 
     Args:
         evaluator_schema_path: Path to schema file, evaluator name, or schema dict
```
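The hunks above swap the hand-rolled evaluator file search for the centralized schema loader, adjust Anthropic `top_p` handling, and introduce three config-driven helpers (`_evaluate_expression`, `_calculate_derived_scores`, `_create_phoenix_evaluations`). As a minimal sketch of the data those helpers expect, the snippet below hand-builds a `phoenix_config`-shaped dict: the `accuracy`/`completeness` fields, weights, and thresholds are invented for illustration, and the helpers are module-private, so calling them directly is only meant to show the data flow (assumes remdb 0.3.133 with its Phoenix extra installed).

```python
# Illustrative only: the score fields, weights, and thresholds are hypothetical;
# the dict structure mirrors what the new helpers read from an evaluator
# schema's phoenix_config block.
from rem.agentic.providers.phoenix import (
    _calculate_derived_scores,
    _create_phoenix_evaluations,
)

# Raw JSON as the judge LLM might return it (per the schema's "properties").
response_json = {"accuracy": 1.0, "completeness": 0.5}

phoenix_config = {
    "derived_scores": {
        # weighted_sum: overall_score = 0.5*accuracy + 0.5*completeness
        "overall_score": {
            "type": "weighted_sum",
            "weights": {"accuracy": 0.5, "completeness": 0.5},
        },
    },
    "evaluations": [
        # Each entry becomes one evaluation column (name, score, label, explanation).
        {
            "name": "overall",
            "score_field": "overall_score",
            "label_logic": [
                {"operator": ">=", "threshold": 0.7, "label": "pass"},
                {"operator": ">=", "threshold": 0.0, "label": "fail"},
            ],
        },
    ],
}

response_json = _calculate_derived_scores(response_json, phoenix_config["derived_scores"])
print(_create_phoenix_evaluations(response_json, phoenix_config["evaluations"]))
# [{'name': 'overall', 'score': 0.75, 'label': 'pass', 'explanation': None}]
```

In the shipped evaluator YAMLs this structure presumably lives under a `phoenix_config:` key and is consumed automatically by `create_evaluator_from_schema`, as the remaining hunks below show.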
```diff
@@ -298,19 +439,9 @@ def create_evaluator_from_schema(
         ImportError: If arize-phoenix not installed
 
     Example:
-        >>> # From evaluator name (searches in schemas/evaluators/)
         >>> evaluator = create_evaluator_from_schema("rem-lookup-correctness")
-        >>>
-        >>> #
-        >>> schema = {"description": "...", "properties": {...}}
-        >>> evaluator = create_evaluator_from_schema(schema)
-        >>>
-        >>> # Use in experiment
-        >>> result = evaluator({
-        ...     "input": {"query": "LOOKUP person:sarah-chen"},
-        ...     "output": {"label": "sarah-chen", "type": "person", ...},
-        ...     "expected": {"label": "sarah-chen", "type": "person", ...}
-        ... })
+        >>> result = evaluator(input={...}, output={...}, expected={...})
+        >>> # Returns: list of {"name": "...", "score": 0.95, "label": "...", "explanation": "..."}
     """
     if not _check_phoenix_available():
         raise ImportError(
@@ -321,8 +452,6 @@ def create_evaluator_from_schema(
     # Load schema if path/name provided
     if isinstance(evaluator_schema_path, (str, Path)):
         schema_path = Path(evaluator_schema_path)
-
-        # If it's a file path, load directly
         if schema_path.exists():
             logger.debug(f"Loading evaluator schema from {schema_path}")
             if schema_path.suffix in [".yaml", ".yml"]:
@@ -332,108 +461,161 @@ def create_evaluator_from_schema(
                 with open(schema_path) as f:
                     schema = json.load(f)
         else:
-            # Treat as evaluator name, search in schemas/evaluators/
             schema = load_evaluator_schema(str(evaluator_schema_path))
     else:
-        # Already a dict
         schema = evaluator_schema_path
 
-    #
+    # Extract schema components
+    output_schema = schema.get("properties", {})
+
+    # Extract phoenix_config for derived scores and evaluations
+    phoenix_config = schema.get("phoenix_config", {})
+    derived_scores_config = phoenix_config.get("derived_scores", {})
+    evaluations_config = phoenix_config.get("evaluations", [])
+
+    # Create evaluator config (LLM wrapper, prompt, etc.)
     evaluator_config = create_phoenix_evaluator(
         evaluator_schema=schema,
         model_name=model_name,
     )
 
-
-    from phoenix.evals import llm_classify
-    import pandas as pd
+    import re
 
-
-
-        """Evaluate a single example using Phoenix llm_classify.
+    def evaluator_fn(input: dict[str, Any], output: dict[str, Any], expected: dict[str, Any]) -> list[dict[str, Any]]:
+        """Evaluate using Phoenix's named parameter binding with structured LLM output.
 
-
-
-
-
-            - expected: Expected output dict (ground truth from dataset)
+        Phoenix automatically binds these parameters:
+        - input: Dataset input dict
+        - output: Task's return value (agent output)
+        - expected: Expected output dict (reference/ground truth)
 
         Returns:
-
+            List of Phoenix evaluation dicts with name, score, label, explanation
         """
-        logger.debug(
+        logger.debug("Evaluating with structured output pattern")
 
-        #
-
-
-
-        # Extract and flatten input fields
-        input_data = example.get("input", {})
-        if isinstance(input_data, dict):
-            for key, value in input_data.items():
-                eval_input[f"input_{key}"] = str(value) if value is not None else ""
+        # Extract question from input
+        if isinstance(input, dict):
+            question = input.get("input", input.get("text", str(input)))
         else:
-
+            question = str(input)
 
-        #
-
-
-            for key, value in output_data.items():
-                eval_input[f"output_{key}"] = str(value) if value is not None else ""
+        # Serialize agent output
+        if isinstance(output, dict):
+            output_str = json.dumps(output, indent=2)
         else:
-
+            output_str = str(output)
 
-        #
-
-
-
-
-
-        eval_input["expected"] = str(expected_data)
+        # Get reference from expected
+        if isinstance(expected, dict):
+            reference = expected.get("reference", expected.get("expected_output",
+                                     expected.get("ground_truth", str(expected))))
+        else:
+            reference = str(expected)
 
         try:
-            #
-
-
-
-
-
-
-
-
+            # Build user message
+            user_message = f"""Question/Input: {question}
+
+Agent's Answer:
+{output_str}
+
+Expected Answer (Reference):
+{reference}
+
+Please evaluate the agent's answer according to the evaluation criteria."""
+
+            # Add JSON schema requirement to system prompt
+            system_prompt = evaluator_config["prompt_template"]
+            schema_instruction = f"\n\nYou MUST respond with valid JSON matching this schema:\n{json.dumps(output_schema, indent=2)}\n\nProvide ONLY the JSON response, no markdown code blocks or extra text."
+            system_with_schema = system_prompt + schema_instruction
+
+            # Phoenix LLM models expect a single prompt string
+            llm = evaluator_config["llm"]
+            full_prompt = f"{system_with_schema}\n\n{user_message}"
+            response_text = llm(full_prompt)
+
+            # Parse JSON response
+            try:
+                response_json = json.loads(response_text)
+            except json.JSONDecodeError:
+                json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', response_text, re.DOTALL)
+                if json_match:
+                    response_json = json.loads(json_match.group(1))
+                else:
+                    raise ValueError(f"Could not parse JSON from LLM response: {response_text[:200]}")
+
+            logger.debug(f"LLM response parsed: {list(response_json.keys())}")
+
+            # Calculate derived scores using config
+            if derived_scores_config:
+                logger.debug(f"Calculating {len(derived_scores_config)} derived scores")
+                response_json = _calculate_derived_scores(response_json, derived_scores_config)
+
+            # Create Phoenix evaluations using config
+            if evaluations_config:
+                logger.debug(f"Creating {len(evaluations_config)} Phoenix evaluations")
+                evaluations = _create_phoenix_evaluations(response_json, evaluations_config)
+            else:
+                # Fallback: create evaluations from all numeric/boolean fields
+                logger.warning("No evaluations_config - creating default evaluations from schema")
+                evaluations = []
+                for field_name, field_value in response_json.items():
+                    if isinstance(field_value, (int, float)):
+                        evaluations.append({
+                            "name": field_name,
+                            "score": float(field_value),
+                            "label": "good" if field_value >= 0.5 else "poor",
+                            "explanation": None
+                        })
+                    elif isinstance(field_value, bool):
+                        evaluations.append({
+                            "name": field_name,
+                            "score": 1.0 if field_value else 0.0,
+                            "label": "pass" if field_value else "fail",
+                            "explanation": None
+                        })
+
+            # Always add overall if not present
+            if not any(e["name"] == "overall" for e in evaluations):
+                overall_score = response_json.get("overall_score", 0.0)
+                overall_pass = response_json.get("pass", False)
+                evaluations.append({
+                    "name": "overall",
+                    "score": overall_score if isinstance(overall_score, (int, float)) else 0.0,
+                    "label": "pass" if overall_pass else "fail",
+                    "explanation": response_json.get("evaluation_notes", None)
+                })
+
+            logger.debug(f"Created {len(evaluations)} evaluations")
+
+            # Phoenix run_experiment expects a single EvaluationResult, not a list.
+            # Return the overall score with detailed evaluations in metadata.
+            from phoenix.experiments.evaluators.base import EvaluationResult
+
+            overall_eval = next(
+                (e for e in evaluations if e["name"] == "overall"),
+                {"score": 0.0, "label": "unknown", "explanation": None}
             )
 
-
-
-
-
-
-
-
-            score_map = {"correct": 1.0, "partial": 0.5, "incorrect": 0.0}
-            score = score_map.get(label, 0.0)
-
-            return {
-                "label": label,
-                "score": score,
-                "explanation": explanation or "",
-            }
-        else:
-            logger.warning("llm_classify returned empty DataFrame")
-            return {
-                "label": "error",
-                "score": 0.0,
-                "explanation": "Evaluator returned empty result",
+            return EvaluationResult(
+                score=overall_eval.get("score"),
+                label=overall_eval.get("label"),
+                explanation=overall_eval.get("explanation"),
+                metadata={
+                    "evaluations": evaluations,
+                    "raw_response": response_json,
            }
+            )
 
         except Exception as e:
             logger.error(f"Evaluator error: {e}")
-
-
-
-            "
-
+            from phoenix.experiments.evaluators.base import EvaluationResult
+            return EvaluationResult(
+                score=0.0,
+                label="error",
+                explanation=f"Evaluator failed: {str(e)}",
+            )
 
     return evaluator_fn
 
```
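Putting the pieces together, here is a rough usage sketch of the new evaluator contract, based on the updated docstring above. The payload values are placeholders, the attribute access on `EvaluationResult` follows the fields populated in the diff (score, label, explanation, metadata), and running it performs a real call to the configured judge model.

```python
from rem.agentic.providers.phoenix import create_evaluator_from_schema

# Resolved via the centralized schema loader,
# e.g. schemas/evaluators/rem/lookup-correctness.yaml.
evaluator = create_evaluator_from_schema("rem-lookup-correctness")

# Phoenix binds input/output/expected by name when this runs inside an
# experiment; calling it directly works the same way. Placeholder payloads:
result = evaluator(
    input={"input": "LOOKUP person:sarah-chen"},
    output={"label": "sarah-chen", "type": "person"},
    expected={"reference": "sarah-chen resolved as a person entity"},
)

print(result.score, result.label)        # overall score and pass/fail label
print(result.metadata["evaluations"])    # per-criterion evaluations list
print(result.metadata["raw_response"])   # raw JSON from the judge LLM
```

Note that although the inner `evaluator_fn` is still annotated as returning `list[dict[str, Any]]`, its body now returns a single `EvaluationResult`, with the per-criterion list tucked into `metadata["evaluations"]`; the detailed columns come from `phoenix_config.evaluations`, not from the return annotation.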