remdb 0.3.118-py3-none-any.whl → 0.3.146-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of remdb might be problematic.
- rem/agentic/agents/sse_simulator.py +2 -0
- rem/agentic/context.py +23 -3
- rem/agentic/mcp/tool_wrapper.py +126 -15
- rem/agentic/otel/setup.py +1 -0
- rem/agentic/providers/phoenix.py +371 -108
- rem/agentic/providers/pydantic_ai.py +122 -43
- rem/agentic/schema.py +4 -1
- rem/api/mcp_router/tools.py +13 -2
- rem/api/routers/chat/completions.py +250 -4
- rem/api/routers/chat/models.py +81 -7
- rem/api/routers/chat/otel_utils.py +33 -0
- rem/api/routers/chat/sse_events.py +17 -1
- rem/api/routers/chat/streaming.py +35 -1
- rem/api/routers/feedback.py +134 -14
- rem/auth/middleware.py +66 -1
- rem/cli/commands/cluster.py +590 -82
- rem/cli/commands/configure.py +3 -4
- rem/cli/commands/experiments.py +468 -76
- rem/cli/commands/session.py +336 -0
- rem/cli/dreaming.py +2 -2
- rem/cli/main.py +2 -0
- rem/config.py +8 -1
- rem/models/core/experiment.py +58 -14
- rem/models/entities/ontology.py +1 -1
- rem/models/entities/ontology_config.py +1 -1
- rem/schemas/agents/examples/contract-analyzer.yaml +1 -1
- rem/schemas/agents/examples/contract-extractor.yaml +1 -1
- rem/schemas/agents/examples/cv-parser.yaml +1 -1
- rem/services/phoenix/client.py +59 -18
- rem/services/postgres/pydantic_to_sqlalchemy.py +9 -12
- rem/services/session/compression.py +7 -0
- rem/settings.py +260 -17
- rem/sql/migrations/002_install_models.sql +91 -91
- rem/sql/migrations/004_cache_system.sql +1 -1
- rem/utils/README.md +45 -0
- rem/utils/files.py +157 -1
- rem/utils/schema_loader.py +94 -3
- rem/utils/vision.py +1 -1
- rem/workers/__init__.py +2 -1
- rem/workers/db_listener.py +579 -0
- {remdb-0.3.118.dist-info → remdb-0.3.146.dist-info}/METADATA +161 -147
- {remdb-0.3.118.dist-info → remdb-0.3.146.dist-info}/RECORD +44 -41
- {remdb-0.3.118.dist-info → remdb-0.3.146.dist-info}/WHEEL +0 -0
- {remdb-0.3.118.dist-info → remdb-0.3.146.dist-info}/entry_points.txt +0 -0
rem/agentic/providers/phoenix.py
CHANGED
@@ -94,6 +94,82 @@ def _check_phoenix_available() -> bool:
     return PHOENIX_AVAILABLE
 
 
+def validate_evaluator_credentials(
+    model_name: str | None = None,
+) -> tuple[bool, str | None]:
+    """Validate that the evaluator's LLM provider has working credentials.
+
+    Performs a minimal API call to verify credentials before running experiments.
+    This prevents running expensive agent tasks only to have evaluations fail.
+
+    Args:
+        model_name: Model to validate (defaults to claude-sonnet-4-5-20250929)
+
+    Returns:
+        Tuple of (success: bool, error_message: str | None)
+        - (True, None) if credentials are valid
+        - (False, "error description") if validation fails
+
+    Example:
+        >>> success, error = validate_evaluator_credentials()
+        >>> if not success:
+        ...     print(f"Evaluator validation failed: {error}")
+        ...     return
+    """
+    if not _check_phoenix_available():
+        return False, "arize-phoenix package not installed"
+
+    from phoenix.evals import OpenAIModel, AnthropicModel
+
+    # Default model (check env var first)
+    if model_name is None:
+        import os
+        model_name = os.environ.get("EVALUATOR_MODEL", "claude-sonnet-4-5-20250929")
+
+    # Parse provider
+    if ":" in model_name:
+        provider, phoenix_model_name = model_name.split(":", 1)
+    else:
+        if model_name.startswith("claude"):
+            provider = "anthropic"
+        else:
+            provider = "openai"
+        phoenix_model_name = model_name
+
+    try:
+        # Create LLM wrapper
+        if provider.lower() == "anthropic":
+            llm = AnthropicModel(
+                model=phoenix_model_name,
+                temperature=0.0,
+                top_p=None,
+            )
+        else:
+            llm = OpenAIModel(model=phoenix_model_name, temperature=0.0)
+
+        # Test with minimal prompt
+        logger.info(f"Validating evaluator credentials for {provider}:{phoenix_model_name}")
+        response = llm("Say 'ok' if you can read this.")
+
+        if response and len(response) > 0:
+            logger.info(f"Evaluator credentials validated successfully for {provider}")
+            return True, None
+        else:
+            return False, f"Empty response from {provider} model"
+
+    except Exception as e:
+        error_msg = str(e)
+        # Extract meaningful error from common API errors
+        if "credit balance is too low" in error_msg.lower():
+            return False, f"Anthropic API credits exhausted. Add credits at https://console.anthropic.com/settings/billing"
+        elif "api key" in error_msg.lower() or "authentication" in error_msg.lower():
+            return False, f"{provider.capitalize()} API key missing or invalid. Set ANTHROPIC_API_KEY or OPENAI_API_KEY environment variable."
+        elif "rate limit" in error_msg.lower():
+            return False, f"{provider.capitalize()} rate limit exceeded. Wait and retry."
+        else:
+            return False, f"{provider.capitalize()} API error: {error_msg[:200]}"
+
+
 # =============================================================================
 # NAME SANITIZATION
 # =============================================================================
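For context, a minimal sketch of how this new pre-check might be called before an experiment run. It is not part of the diff: the import path is inferred from the file location above, and run_experiment() is a hypothetical caller.

    from rem.agentic.providers.phoenix import validate_evaluator_credentials

    ok, error = validate_evaluator_credentials()  # or validate_evaluator_credentials("anthropic:claude-sonnet-4-5-20250929")
    if not ok:
        print(f"Evaluator validation failed: {error}")
    else:
        run_experiment()  # hypothetical: only now start the expensive agent task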
@@ -207,8 +283,9 @@ def create_phoenix_evaluator(
 
     # Default model (use Claude Sonnet 4.5 for evaluators)
     if model_name is None:
-
-
+        import os
+        model_name = os.environ.get("EVALUATOR_MODEL", "claude-sonnet-4-5-20250929")
+        logger.debug(f"Using evaluator model: {model_name}")
 
     logger.info(f"Creating Phoenix evaluator: {evaluator_name} with model={model_name}")
 
@@ -226,10 +303,15 @@ def create_phoenix_evaluator(
     # Create appropriate Phoenix LLM wrapper based on provider
     llm: OpenAIModel | AnthropicModel
     if provider.lower() == "anthropic":
-        # Anthropic models
+        # Anthropic's newer Claude models (claude-sonnet-4, claude-opus-4, etc.)
+        # don't allow both temperature and top_p to be specified together.
+        # Phoenix's AnthropicModel defaults top_p=1, so we explicitly set it
+        # to None to prevent it from being sent in the API request.
+        # The invocation_parameters() method only includes params that are not None.
         llm = AnthropicModel(
             model=phoenix_model_name,
             temperature=0.0,
+            top_p=None,  # type: ignore[arg-type] - None prevents param from being sent
         )
     else:
         # Default to OpenAI for other providers (gpt-4, etc.)
@@ -249,13 +331,178 @@ def create_evaluator_from_schema(
     return evaluator_config
 
 
+def _evaluate_expression(expression: str, context: dict[str, Any]) -> Any:
+    """Safely evaluate a simple expression with context variables.
+
+    Supports: arithmetic, comparisons, boolean logic, len()
+    """
+    try:
+        allowed_names = {
+            "len": len,
+            "True": True,
+            "False": False,
+            "true": True,
+            "false": False,
+        }
+        allowed_names.update(context)
+        return eval(expression, {"__builtins__": {}}, allowed_names)
+    except Exception as e:
+        logger.warning(f"Expression evaluation failed: {expression} - {e}")
+        return 0.0
+
+
+def _calculate_derived_scores(
+    response_json: dict[str, Any],
+    derived_scores_config: dict[str, Any],
+) -> dict[str, Any]:
+    """Calculate derived scores from evaluator output using config formulas.
+
+    Supports:
+    - weighted_sum: Weighted average of fields
+    - conditional_weighted: Different formulas based on conditions
+    - boolean_logic: Boolean expression evaluation
+    """
+    for score_name, score_config in derived_scores_config.items():
+        score_type = score_config.get("type")
+
+        if score_type == "weighted_sum":
+            weights = score_config.get("weights", {})
+            total = 0.0
+            for field, weight in weights.items():
+                field_value = response_json.get(field, 0.0)
+                if isinstance(field_value, (int, float)):
+                    total += field_value * weight
+            response_json[score_name] = total
+
+        elif score_type == "conditional_weighted":
+            conditions = score_config.get("conditions", [])
+            formula_to_use = None
+            for cond_config in conditions:
+                condition = cond_config.get("condition")
+                if condition is None:
+                    formula_to_use = cond_config.get("formula")
+                    break
+                field = condition.get("field")
+                operator = condition.get("operator")
+                value = condition.get("value")
+                field_value = response_json.get(field, 0.0)
+                condition_met = False
+                if operator == ">=":
+                    condition_met = field_value >= value
+                elif operator == ">":
+                    condition_met = field_value > value
+                elif operator == "<=":
+                    condition_met = field_value <= value
+                elif operator == "<":
+                    condition_met = field_value < value
+                elif operator == "==":
+                    condition_met = field_value == value
+                elif operator == "!=":
+                    condition_met = field_value != value
+                if condition_met:
+                    formula_to_use = cond_config.get("formula")
+                    break
+            if formula_to_use and formula_to_use.get("type") == "weighted_sum":
+                weights = formula_to_use.get("weights", {})
+                total = 0.0
+                for field, weight in weights.items():
+                    field_value = response_json.get(field, 0.0)
+                    if isinstance(field_value, (int, float)):
+                        total += field_value * weight
+                response_json[score_name] = total
+
+        elif score_type == "boolean_logic":
+            expression = score_config.get("expression", "")
+            result = _evaluate_expression(expression, response_json)
+            response_json[score_name] = result
+
+    return response_json
+
+
+def _create_phoenix_evaluations(
+    response_json: dict[str, Any],
+    evaluations_config: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+    """Create Phoenix evaluation dicts from evaluator output using config.
+
+    Each evaluation becomes a column in Phoenix UI with name, label, score, explanation.
+    """
+    evaluations = []
+    for eval_config in evaluations_config:
+        eval_name = eval_config.get("name", "unnamed")
+        score_field = eval_config.get("score_field")
+        score_expression = eval_config.get("score_expression")
+        label_field = eval_config.get("label_field")
+        label_expression = eval_config.get("label_expression")
+        label_logic = eval_config.get("label_logic", [])
+        label_transform = eval_config.get("label_transform", {})
+        score_logic = eval_config.get("score_logic", {})
+        explanation_field = eval_config.get("explanation_field")
+
+        evaluation = {"name": eval_name}
+
+        # Get score
+        if score_expression:
+            evaluation["score"] = _evaluate_expression(score_expression, response_json)
+        elif score_field:
+            evaluation["score"] = response_json.get(score_field, 0.0)
+        elif score_logic and label_field:
+            label_value = response_json.get(label_field)
+            if isinstance(label_value, bool):
+                label_value = "true" if label_value else "false"
+            evaluation["score"] = score_logic.get(str(label_value), 0.0)
+        else:
+            evaluation["score"] = None
+
+        # Get label
+        if label_expression:
+            evaluation["label"] = str(_evaluate_expression(label_expression, response_json))
+        elif label_field:
+            label_value = response_json.get(label_field)
+            if isinstance(label_value, bool):
+                label_value = "true" if label_value else "false"
+            if label_transform:
+                evaluation["label"] = label_transform.get(str(label_value), str(label_value))
+            else:
+                evaluation["label"] = str(label_value)
+        elif label_logic and (score_field or score_expression):
+            score_value = evaluation.get("score", 0.0)
+            label = "unknown"
+            for logic in label_logic:
+                threshold = logic.get("threshold", 0.0)
+                operator = logic.get("operator", ">=")
+                if operator == ">=" and score_value >= threshold:
+                    label = logic.get("label", "unknown")
+                    break
+                elif operator == ">" and score_value > threshold:
+                    label = logic.get("label", "unknown")
+                    break
+            evaluation["label"] = label
+        else:
+            evaluation["label"] = None
+
+        # Get explanation
+        if explanation_field:
+            explanation_value = response_json.get(explanation_field, "")
+            if isinstance(explanation_value, list):
+                evaluation["explanation"] = ", ".join(str(x) for x in explanation_value) if explanation_value else "None"
+            else:
+                evaluation["explanation"] = str(explanation_value)
+        else:
+            evaluation["explanation"] = None
+
+        evaluations.append(evaluation)
+    return evaluations
+
+
 def create_evaluator_from_schema(
     evaluator_schema_path: str | Path | dict[str, Any],
     model_name: str | None = None,
 ) -> Callable[[Any], Any]:
     """Create an evaluator function from a schema file or dict.
 
-
+    Uses direct LLM call with JSON schema for structured output evaluation.
+    Supports phoenix_config for derived scores and evaluation column mappings.
 
     Args:
         evaluator_schema_path: Path to schema file, evaluator name, or schema dict
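To illustrate how these helpers consume a schema's phoenix_config, here is a small sketch built only from the functions above; the field names (completeness, accuracy, evaluation_notes), weights, and thresholds are invented for the example.

    response_json = {"completeness": 0.8, "accuracy": 1.0, "evaluation_notes": "minor omissions"}
    derived_scores = {
        "overall_score": {"type": "weighted_sum", "weights": {"completeness": 0.4, "accuracy": 0.6}},
    }
    evaluations_cfg = [{
        "name": "overall",
        "score_field": "overall_score",
        "label_logic": [
            {"operator": ">=", "threshold": 0.7, "label": "pass"},
            {"operator": ">=", "threshold": 0.0, "label": "fail"},
        ],
        "explanation_field": "evaluation_notes",
    }]
    response_json = _calculate_derived_scores(response_json, derived_scores)  # adds overall_score = 0.92
    evals = _create_phoenix_evaluations(response_json, evaluations_cfg)
    # -> [{"name": "overall", "score": 0.92, "label": "pass", "explanation": "minor omissions"}]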
@@ -269,19 +516,9 @@ def create_evaluator_from_schema(
         ImportError: If arize-phoenix not installed
 
     Example:
-        >>> # From evaluator name (searches in schemas/evaluators/)
         >>> evaluator = create_evaluator_from_schema("rem-lookup-correctness")
-        >>>
-        >>> #
-        >>> schema = {"description": "...", "properties": {...}}
-        >>> evaluator = create_evaluator_from_schema(schema)
-        >>>
-        >>> # Use in experiment
-        >>> result = evaluator({
-        ...     "input": {"query": "LOOKUP person:sarah-chen"},
-        ...     "output": {"label": "sarah-chen", "type": "person", ...},
-        ...     "expected": {"label": "sarah-chen", "type": "person", ...}
-        ... })
+        >>> result = evaluator(input={...}, output={...}, expected={...})
+        >>> # Returns: list of {"name": "...", "score": 0.95, "label": "...", "explanation": "..."}
     """
     if not _check_phoenix_available():
         raise ImportError(
@@ -292,8 +529,6 @@ def create_evaluator_from_schema(
     # Load schema if path/name provided
     if isinstance(evaluator_schema_path, (str, Path)):
         schema_path = Path(evaluator_schema_path)
-
-        # If it's a file path, load directly
         if schema_path.exists():
             logger.debug(f"Loading evaluator schema from {schema_path}")
             if schema_path.suffix in [".yaml", ".yml"]:
@@ -303,124 +538,152 @@
             with open(schema_path) as f:
                 schema = json.load(f)
         else:
-            # Treat as evaluator name, search in schemas/evaluators/
             schema = load_evaluator_schema(str(evaluator_schema_path))
     else:
-        # Already a dict
         schema = evaluator_schema_path
 
-    # Extract
-
-
-
-
-
-
-    provider_name = first_provider.get("provider_name", "openai")
-    schema_model_name = first_provider.get("model_name", "gpt-4o-mini")
-    # Format as "provider:model" if not OpenAI (OpenAI is default)
-    if provider_name == "openai":
-        model_name = schema_model_name
-    else:
-        model_name = f"{provider_name}:{schema_model_name}"
-    logger.debug(f"Using model from schema provider_configs: {model_name}")
+    # Extract schema components
+    output_schema = schema.get("properties", {})
+
+    # Extract phoenix_config for derived scores and evaluations
+    phoenix_config = schema.get("phoenix_config", {})
+    derived_scores_config = phoenix_config.get("derived_scores", {})
+    evaluations_config = phoenix_config.get("evaluations", [])
 
-    # Create evaluator config
+    # Create evaluator config (LLM wrapper, prompt, etc.)
     evaluator_config = create_phoenix_evaluator(
         evaluator_schema=schema,
        model_name=model_name,
     )
 
-
-    from phoenix.evals import llm_classify
-    import pandas as pd
+    import re
 
-
-
-        """Evaluate a single example using Phoenix llm_classify.
+    def evaluator_fn(input: dict[str, Any], output: dict[str, Any], expected: dict[str, Any]) -> list[dict[str, Any]]:
+        """Evaluate using Phoenix's named parameter binding with structured LLM output.
 
-
-
-
-
-        - expected: Expected output dict (ground truth from dataset)
+        Phoenix automatically binds these parameters:
+        - input: Dataset input dict
+        - output: Task's return value (agent output)
+        - expected: Expected output dict (reference/ground truth)
 
         Returns:
-
+            List of Phoenix evaluation dicts with name, score, label, explanation
         """
-
-
-
-
-
-        eval_input = {}
-
-        # Extract and flatten input fields
-        input_data = example.get("input", {})
-        if isinstance(input_data, dict):
-            for key, value in input_data.items():
-                eval_input[f"input_{key}"] = str(value) if value is not None else ""
+        logger.debug("Evaluating with structured output pattern")
+
+        # Extract question from input
+        if isinstance(input, dict):
+            question = input.get("input", input.get("text", str(input)))
         else:
-
+            question = str(input)
 
-        #
-
-
-        for key, value in output_data.items():
-            eval_input[f"output_{key}"] = str(value) if value is not None else ""
+        # Serialize agent output
+        if isinstance(output, dict):
+            output_str = json.dumps(output, indent=2)
         else:
-
+            output_str = str(output)
 
-        #
-
-
-
-
-
-        eval_input["expected"] = str(expected_data)
+        # Get reference from expected
+        if isinstance(expected, dict):
+            reference = expected.get("reference", expected.get("expected_output",
+                expected.get("ground_truth", str(expected))))
+        else:
+            reference = str(expected)
 
         try:
-            #
-
-
-
-
-
-
-
-
-
+            # Build user message
+            user_message = f"""Question/Input: {question}
+
+Agent's Answer:
+{output_str}
+
+Expected Answer (Reference):
+{reference}
+
+Please evaluate the agent's answer according to the evaluation criteria."""
+
+            # Add JSON schema requirement to system prompt
+            system_prompt = evaluator_config["prompt_template"]
+            schema_instruction = f"\n\nYou MUST respond with valid JSON matching this schema:\n{json.dumps(output_schema, indent=2)}\n\nProvide ONLY the JSON response, no markdown code blocks or extra text."
+            system_with_schema = system_prompt + schema_instruction
+
+            # Phoenix LLM models expect a single prompt string
+            llm = evaluator_config["llm"]
+            full_prompt = f"{system_with_schema}\n\n{user_message}"
+            response_text = llm(full_prompt)
+
+            # Parse JSON response
+            try:
+                response_json = json.loads(response_text)
+            except json.JSONDecodeError:
+                json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', response_text, re.DOTALL)
+                if json_match:
+                    response_json = json.loads(json_match.group(1))
+                else:
+                    raise ValueError(f"Could not parse JSON from LLM response: {response_text[:200]}")
+
+            logger.debug(f"LLM response parsed: {list(response_json.keys())}")
+
+            # Calculate derived scores using config
+            if derived_scores_config:
+                logger.debug(f"Calculating {len(derived_scores_config)} derived scores")
+                response_json = _calculate_derived_scores(response_json, derived_scores_config)
+
+            # Create Phoenix evaluations using config
+            if evaluations_config:
+                logger.debug(f"Creating {len(evaluations_config)} Phoenix evaluations")
+                evaluations = _create_phoenix_evaluations(response_json, evaluations_config)
+            else:
+                # Fallback: create evaluations from all numeric/boolean fields
+                logger.warning("No evaluations_config - creating default evaluations from schema")
+                evaluations = []
+                for field_name, field_value in response_json.items():
+                    if isinstance(field_value, (int, float)):
+                        evaluations.append({
+                            "name": field_name,
+                            "score": float(field_value),
+                            "label": "good" if field_value >= 0.5 else "poor",
+                            "explanation": None
+                        })
+                    elif isinstance(field_value, bool):
+                        evaluations.append({
+                            "name": field_name,
+                            "score": 1.0 if field_value else 0.0,
+                            "label": "pass" if field_value else "fail",
+                            "explanation": None
+                        })
+
+                # Always add overall if not present
+                if not any(e["name"] == "overall" for e in evaluations):
+                    overall_score = response_json.get("overall_score", 0.0)
+                    overall_pass = response_json.get("pass", False)
+                    evaluations.append({
+                        "name": "overall",
+                        "score": overall_score if isinstance(overall_score, (int, float)) else 0.0,
+                        "label": "pass" if overall_pass else "fail",
+                        "explanation": response_json.get("evaluation_notes", None)
+                    })
+
+            logger.debug(f"Created {len(evaluations)} evaluations")
+
+            # Phoenix client expects a dict with score, label, explanation
+            # (not the old EvaluationResult class)
+            overall_eval = next(
+                (e for e in evaluations if e["name"] == "overall"),
+                {"score": 0.0, "label": "unknown", "explanation": None}
             )
 
-
-
-
-
-
-            # Map labels to scores
-            score_map = {"correct": 1.0, "partial": 0.5, "incorrect": 0.0}
-            score = score_map.get(label, 0.0)
-
-            return {
-                "label": label,
-                "score": score,
-                "explanation": explanation or "",
-            }
-        else:
-            logger.warning("llm_classify returned empty DataFrame")
-            return {
-                "label": "error",
-                "score": 0.0,
-                "explanation": "Evaluator returned empty result",
-            }
+            return {
+                "score": overall_eval.get("score", 0.0),
+                "label": overall_eval.get("label", "unknown"),
+                "explanation": overall_eval.get("explanation"),
+            }
 
         except Exception as e:
             logger.error(f"Evaluator error: {e}")
             return {
-                "label": "error",
                 "score": 0.0,
+                "label": "error",
                 "explanation": f"Evaluator failed: {str(e)}",
             }
 
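Taken together, the refactored factory can be exercised roughly like this. The sketch is based only on the docstring and the return block in the hunk above; the dataset field names shown are assumptions, and the actual score depends on the evaluator LLM's response.

    evaluator = create_evaluator_from_schema("rem-lookup-correctness")
    result = evaluator(
        input={"input": "LOOKUP person:sarah-chen"},       # assumed dataset shape
        output={"label": "sarah-chen", "type": "person"},  # agent output
        expected={"reference": "sarah-chen"},              # ground truth
    )
    # Per the return block in this hunk, result is a dict such as
    # {"score": ..., "label": "pass" or "fail", "explanation": "..."}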