remdb 0.3.127__py3-none-any.whl → 0.3.172__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of remdb might be problematic.

Files changed (62)
  1. rem/agentic/agents/__init__.py +16 -0
  2. rem/agentic/agents/agent_manager.py +311 -0
  3. rem/agentic/context.py +81 -3
  4. rem/agentic/context_builder.py +36 -9
  5. rem/agentic/mcp/tool_wrapper.py +132 -15
  6. rem/agentic/providers/phoenix.py +371 -108
  7. rem/agentic/providers/pydantic_ai.py +163 -45
  8. rem/agentic/schema.py +8 -4
  9. rem/api/deps.py +3 -5
  10. rem/api/main.py +22 -3
  11. rem/api/mcp_router/resources.py +15 -10
  12. rem/api/mcp_router/server.py +2 -0
  13. rem/api/mcp_router/tools.py +94 -2
  14. rem/api/middleware/tracking.py +5 -5
  15. rem/api/routers/auth.py +349 -6
  16. rem/api/routers/chat/completions.py +5 -3
  17. rem/api/routers/chat/streaming.py +95 -22
  18. rem/api/routers/messages.py +24 -15
  19. rem/auth/__init__.py +13 -3
  20. rem/auth/jwt.py +352 -0
  21. rem/auth/middleware.py +115 -10
  22. rem/auth/providers/__init__.py +4 -1
  23. rem/auth/providers/email.py +215 -0
  24. rem/cli/commands/configure.py +3 -4
  25. rem/cli/commands/experiments.py +226 -50
  26. rem/cli/commands/session.py +336 -0
  27. rem/cli/dreaming.py +2 -2
  28. rem/cli/main.py +2 -0
  29. rem/models/core/experiment.py +58 -14
  30. rem/models/entities/__init__.py +4 -0
  31. rem/models/entities/ontology.py +1 -1
  32. rem/models/entities/ontology_config.py +1 -1
  33. rem/models/entities/subscriber.py +175 -0
  34. rem/models/entities/user.py +1 -0
  35. rem/schemas/agents/core/agent-builder.yaml +235 -0
  36. rem/schemas/agents/examples/contract-analyzer.yaml +1 -1
  37. rem/schemas/agents/examples/contract-extractor.yaml +1 -1
  38. rem/schemas/agents/examples/cv-parser.yaml +1 -1
  39. rem/services/__init__.py +3 -1
  40. rem/services/content/service.py +4 -3
  41. rem/services/email/__init__.py +10 -0
  42. rem/services/email/service.py +513 -0
  43. rem/services/email/templates.py +360 -0
  44. rem/services/postgres/README.md +38 -0
  45. rem/services/postgres/diff_service.py +19 -3
  46. rem/services/postgres/pydantic_to_sqlalchemy.py +45 -13
  47. rem/services/postgres/repository.py +5 -4
  48. rem/services/session/compression.py +113 -50
  49. rem/services/session/reload.py +14 -7
  50. rem/services/user_service.py +41 -9
  51. rem/settings.py +292 -5
  52. rem/sql/migrations/001_install.sql +1 -1
  53. rem/sql/migrations/002_install_models.sql +91 -91
  54. rem/sql/migrations/005_schema_update.sql +145 -0
  55. rem/utils/README.md +45 -0
  56. rem/utils/files.py +157 -1
  57. rem/utils/schema_loader.py +45 -7
  58. rem/utils/vision.py +1 -1
  59. {remdb-0.3.127.dist-info → remdb-0.3.172.dist-info}/METADATA +7 -5
  60. {remdb-0.3.127.dist-info → remdb-0.3.172.dist-info}/RECORD +62 -52
  61. {remdb-0.3.127.dist-info → remdb-0.3.172.dist-info}/WHEEL +0 -0
  62. {remdb-0.3.127.dist-info → remdb-0.3.172.dist-info}/entry_points.txt +0 -0
@@ -94,6 +94,82 @@ def _check_phoenix_available() -> bool:
     return PHOENIX_AVAILABLE
 
 
+def validate_evaluator_credentials(
+    model_name: str | None = None,
+) -> tuple[bool, str | None]:
+    """Validate that the evaluator's LLM provider has working credentials.
+
+    Performs a minimal API call to verify credentials before running experiments.
+    This prevents running expensive agent tasks only to have evaluations fail.
+
+    Args:
+        model_name: Model to validate (defaults to claude-sonnet-4-5-20250929)
+
+    Returns:
+        Tuple of (success: bool, error_message: str | None)
+        - (True, None) if credentials are valid
+        - (False, "error description") if validation fails
+
+    Example:
+        >>> success, error = validate_evaluator_credentials()
+        >>> if not success:
+        ...     print(f"Evaluator validation failed: {error}")
+        ...     return
+    """
+    if not _check_phoenix_available():
+        return False, "arize-phoenix package not installed"
+
+    from phoenix.evals import OpenAIModel, AnthropicModel
+
+    # Default model (check env var first)
+    if model_name is None:
+        import os
+        model_name = os.environ.get("EVALUATOR_MODEL", "claude-sonnet-4-5-20250929")
+
+    # Parse provider
+    if ":" in model_name:
+        provider, phoenix_model_name = model_name.split(":", 1)
+    else:
+        if model_name.startswith("claude"):
+            provider = "anthropic"
+        else:
+            provider = "openai"
+        phoenix_model_name = model_name
+
+    try:
+        # Create LLM wrapper
+        if provider.lower() == "anthropic":
+            llm = AnthropicModel(
+                model=phoenix_model_name,
+                temperature=0.0,
+                top_p=None,
+            )
+        else:
+            llm = OpenAIModel(model=phoenix_model_name, temperature=0.0)
+
+        # Test with minimal prompt
+        logger.info(f"Validating evaluator credentials for {provider}:{phoenix_model_name}")
+        response = llm("Say 'ok' if you can read this.")
+
+        if response and len(response) > 0:
+            logger.info(f"Evaluator credentials validated successfully for {provider}")
+            return True, None
+        else:
+            return False, f"Empty response from {provider} model"
+
+    except Exception as e:
+        error_msg = str(e)
+        # Extract meaningful error from common API errors
+        if "credit balance is too low" in error_msg.lower():
+            return False, f"Anthropic API credits exhausted. Add credits at https://console.anthropic.com/settings/billing"
+        elif "api key" in error_msg.lower() or "authentication" in error_msg.lower():
+            return False, f"{provider.capitalize()} API key missing or invalid. Set ANTHROPIC_API_KEY or OPENAI_API_KEY environment variable."
+        elif "rate limit" in error_msg.lower():
+            return False, f"{provider.capitalize()} rate limit exceeded. Wait and retry."
+        else:
+            return False, f"{provider.capitalize()} API error: {error_msg[:200]}"
+
+
 # =============================================================================
 # NAME SANITIZATION
 # =============================================================================
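Illustrative usage (not part of the diff): a pre-flight check along the lines of the function's own docstring, run before kicking off an experiment. The import path follows the file touched in this diff; the "provider:model" string in the comment is just the convention the parser above accepts.

from rem.agentic.providers.phoenix import validate_evaluator_credentials

# Fail fast before spending money on agent runs; mirrors the docstring example.
# An explicit model can also be passed, e.g. "openai:gpt-4o-mini".
success, error = validate_evaluator_credentials()
if not success:
    raise SystemExit(f"Evaluator validation failed: {error}")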
@@ -207,8 +283,9 @@ def create_phoenix_evaluator(
 
     # Default model (use Claude Sonnet 4.5 for evaluators)
     if model_name is None:
-        model_name = "claude-sonnet-4-5-20250929"
-        logger.debug(f"Using default evaluator model: {model_name}")
+        import os
+        model_name = os.environ.get("EVALUATOR_MODEL", "claude-sonnet-4-5-20250929")
+        logger.debug(f"Using evaluator model: {model_name}")
 
     logger.info(f"Creating Phoenix evaluator: {evaluator_name} with model={model_name}")
 
@@ -226,10 +303,15 @@ def create_phoenix_evaluator(
     # Create appropriate Phoenix LLM wrapper based on provider
     llm: OpenAIModel | AnthropicModel
     if provider.lower() == "anthropic":
-        # Anthropic models don't support top_p parameter
+        # Anthropic's newer Claude models (claude-sonnet-4, claude-opus-4, etc.)
+        # don't allow both temperature and top_p to be specified together.
+        # Phoenix's AnthropicModel defaults top_p=1, so we explicitly set it
+        # to None to prevent it from being sent in the API request.
+        # The invocation_parameters() method only includes params that are not None.
         llm = AnthropicModel(
             model=phoenix_model_name,
             temperature=0.0,
+            top_p=None,  # type: ignore[arg-type] - None prevents param from being sent
         )
     else:
         # Default to OpenAI for other providers (gpt-4, etc.)
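The comment in the hunk above describes behavior of Phoenix's AnthropicModel. The sketch below only illustrates that general idea (parameters set to None are dropped before the request is built); it is not Phoenix source code.

# Illustrative only - not Phoenix's implementation.
def invocation_parameters(temperature: float | None = 0.0, top_p: float | None = None) -> dict:
    # Only non-None parameters are included, so passing top_p=None keeps it
    # out of the API payload entirely.
    params = {"temperature": temperature, "top_p": top_p}
    return {k: v for k, v in params.items() if v is not None}

assert invocation_parameters(0.0, None) == {"temperature": 0.0}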
@@ -249,13 +331,178 @@ def create_phoenix_evaluator(
     return evaluator_config
 
 
+def _evaluate_expression(expression: str, context: dict[str, Any]) -> Any:
+    """Safely evaluate a simple expression with context variables.
+
+    Supports: arithmetic, comparisons, boolean logic, len()
+    """
+    try:
+        allowed_names = {
+            "len": len,
+            "True": True,
+            "False": False,
+            "true": True,
+            "false": False,
+        }
+        allowed_names.update(context)
+        return eval(expression, {"__builtins__": {}}, allowed_names)
+    except Exception as e:
+        logger.warning(f"Expression evaluation failed: {expression} - {e}")
+        return 0.0
+
+
+def _calculate_derived_scores(
+    response_json: dict[str, Any],
+    derived_scores_config: dict[str, Any],
+) -> dict[str, Any]:
+    """Calculate derived scores from evaluator output using config formulas.
+
+    Supports:
+    - weighted_sum: Weighted average of fields
+    - conditional_weighted: Different formulas based on conditions
+    - boolean_logic: Boolean expression evaluation
+    """
+    for score_name, score_config in derived_scores_config.items():
+        score_type = score_config.get("type")
+
+        if score_type == "weighted_sum":
+            weights = score_config.get("weights", {})
+            total = 0.0
+            for field, weight in weights.items():
+                field_value = response_json.get(field, 0.0)
+                if isinstance(field_value, (int, float)):
+                    total += field_value * weight
+            response_json[score_name] = total
+
+        elif score_type == "conditional_weighted":
+            conditions = score_config.get("conditions", [])
+            formula_to_use = None
+            for cond_config in conditions:
+                condition = cond_config.get("condition")
+                if condition is None:
+                    formula_to_use = cond_config.get("formula")
+                    break
+                field = condition.get("field")
+                operator = condition.get("operator")
+                value = condition.get("value")
+                field_value = response_json.get(field, 0.0)
+                condition_met = False
+                if operator == ">=":
+                    condition_met = field_value >= value
+                elif operator == ">":
+                    condition_met = field_value > value
+                elif operator == "<=":
+                    condition_met = field_value <= value
+                elif operator == "<":
+                    condition_met = field_value < value
+                elif operator == "==":
+                    condition_met = field_value == value
+                elif operator == "!=":
+                    condition_met = field_value != value
+                if condition_met:
+                    formula_to_use = cond_config.get("formula")
+                    break
+            if formula_to_use and formula_to_use.get("type") == "weighted_sum":
+                weights = formula_to_use.get("weights", {})
+                total = 0.0
+                for field, weight in weights.items():
+                    field_value = response_json.get(field, 0.0)
+                    if isinstance(field_value, (int, float)):
+                        total += field_value * weight
+                response_json[score_name] = total
+
+        elif score_type == "boolean_logic":
+            expression = score_config.get("expression", "")
+            result = _evaluate_expression(expression, response_json)
+            response_json[score_name] = result
+
+    return response_json
+
+
+def _create_phoenix_evaluations(
+    response_json: dict[str, Any],
+    evaluations_config: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+    """Create Phoenix evaluation dicts from evaluator output using config.
+
+    Each evaluation becomes a column in Phoenix UI with name, label, score, explanation.
+    """
+    evaluations = []
+    for eval_config in evaluations_config:
+        eval_name = eval_config.get("name", "unnamed")
+        score_field = eval_config.get("score_field")
+        score_expression = eval_config.get("score_expression")
+        label_field = eval_config.get("label_field")
+        label_expression = eval_config.get("label_expression")
+        label_logic = eval_config.get("label_logic", [])
+        label_transform = eval_config.get("label_transform", {})
+        score_logic = eval_config.get("score_logic", {})
+        explanation_field = eval_config.get("explanation_field")
+
+        evaluation = {"name": eval_name}
+
+        # Get score
+        if score_expression:
+            evaluation["score"] = _evaluate_expression(score_expression, response_json)
+        elif score_field:
+            evaluation["score"] = response_json.get(score_field, 0.0)
+        elif score_logic and label_field:
+            label_value = response_json.get(label_field)
+            if isinstance(label_value, bool):
+                label_value = "true" if label_value else "false"
+            evaluation["score"] = score_logic.get(str(label_value), 0.0)
+        else:
+            evaluation["score"] = None
+
+        # Get label
+        if label_expression:
+            evaluation["label"] = str(_evaluate_expression(label_expression, response_json))
+        elif label_field:
+            label_value = response_json.get(label_field)
+            if isinstance(label_value, bool):
+                label_value = "true" if label_value else "false"
+            if label_transform:
+                evaluation["label"] = label_transform.get(str(label_value), str(label_value))
+            else:
+                evaluation["label"] = str(label_value)
+        elif label_logic and (score_field or score_expression):
+            score_value = evaluation.get("score", 0.0)
+            label = "unknown"
+            for logic in label_logic:
+                threshold = logic.get("threshold", 0.0)
+                operator = logic.get("operator", ">=")
+                if operator == ">=" and score_value >= threshold:
+                    label = logic.get("label", "unknown")
+                    break
+                elif operator == ">" and score_value > threshold:
+                    label = logic.get("label", "unknown")
+                    break
+            evaluation["label"] = label
+        else:
+            evaluation["label"] = None
+
+        # Get explanation
+        if explanation_field:
+            explanation_value = response_json.get(explanation_field, "")
+            if isinstance(explanation_value, list):
+                evaluation["explanation"] = ", ".join(str(x) for x in explanation_value) if explanation_value else "None"
+            else:
+                evaluation["explanation"] = str(explanation_value)
+        else:
+            evaluation["explanation"] = None
+
+        evaluations.append(evaluation)
+    return evaluations
+
+
 def create_evaluator_from_schema(
     evaluator_schema_path: str | Path | dict[str, Any],
     model_name: str | None = None,
 ) -> Callable[[Any], Any]:
     """Create an evaluator function from a schema file or dict.
 
-    The returned evaluator is a callable that Phoenix experiments can use.
+    Uses direct LLM call with JSON schema for structured output evaluation.
+    Supports phoenix_config for derived scores and evaluation column mappings.
 
     Args:
         evaluator_schema_path: Path to schema file, evaluator name, or schema dict
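For orientation, here is a hypothetical phoenix_config block shaped the way _calculate_derived_scores() and _create_phoenix_evaluations() read it above. The field names (accuracy, completeness, evaluation_notes) are invented for the example, and the snippet assumes the two helpers from this diff are in scope.

phoenix_config = {
    "derived_scores": {
        "overall_score": {
            "type": "weighted_sum",
            "weights": {"accuracy": 0.7, "completeness": 0.3},
        },
        "pass": {
            "type": "boolean_logic",
            "expression": "overall_score >= 0.8",
        },
    },
    "evaluations": [
        {
            "name": "overall",
            "score_field": "overall_score",
            "label_logic": [
                {"operator": ">=", "threshold": 0.8, "label": "pass"},
                {"operator": ">=", "threshold": 0.0, "label": "fail"},
            ],
            "explanation_field": "evaluation_notes",
        },
    ],
}

# A hypothetical structured LLM response:
response_json = {"accuracy": 0.9, "completeness": 0.6, "evaluation_notes": "minor omissions"}
response_json = _calculate_derived_scores(response_json, phoenix_config["derived_scores"])
evaluations = _create_phoenix_evaluations(response_json, phoenix_config["evaluations"])
# response_json["overall_score"] is about 0.81, response_json["pass"] is True,
# and evaluations[0] carries name="overall", label="pass", explanation="minor omissions".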
@@ -269,19 +516,9 @@ def create_evaluator_from_schema(
         ImportError: If arize-phoenix not installed
 
     Example:
-        >>> # From evaluator name (searches in schemas/evaluators/)
         >>> evaluator = create_evaluator_from_schema("rem-lookup-correctness")
-        >>>
-        >>> # From schema dict
-        >>> schema = {"description": "...", "properties": {...}}
-        >>> evaluator = create_evaluator_from_schema(schema)
-        >>>
-        >>> # Use in experiment
-        >>> result = evaluator({
-        ...     "input": {"query": "LOOKUP person:sarah-chen"},
-        ...     "output": {"label": "sarah-chen", "type": "person", ...},
-        ...     "expected": {"label": "sarah-chen", "type": "person", ...}
-        ... })
+        >>> result = evaluator(input={...}, output={...}, expected={...})
+        >>> # Returns: list of {"name": "...", "score": 0.95, "label": "...", "explanation": "..."}
     """
     if not _check_phoenix_available():
         raise ImportError(
@@ -292,8 +529,6 @@
     # Load schema if path/name provided
     if isinstance(evaluator_schema_path, (str, Path)):
         schema_path = Path(evaluator_schema_path)
-
-        # If it's a file path, load directly
         if schema_path.exists():
            logger.debug(f"Loading evaluator schema from {schema_path}")
            if schema_path.suffix in [".yaml", ".yml"]:
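Illustrative only: the three input forms the loading branch above (continued in the next hunk) accepts. The file paths are made up; the evaluator name comes from the docstring example.

# 1. A YAML or JSON schema file on disk (hypothetical paths)
evaluator = create_evaluator_from_schema("schemas/evaluators/my-evaluator.yaml")
evaluator = create_evaluator_from_schema("/tmp/my-evaluator.json")

# 2. A bare evaluator name, resolved via load_evaluator_schema()
evaluator = create_evaluator_from_schema("rem-lookup-correctness")

# 3. An already-loaded schema dict
evaluator = create_evaluator_from_schema({"description": "...", "properties": {}, "phoenix_config": {}})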
@@ -303,124 +538,152 @@
                 with open(schema_path) as f:
                     schema = json.load(f)
         else:
-            # Treat as evaluator name, search in schemas/evaluators/
             schema = load_evaluator_schema(str(evaluator_schema_path))
     else:
-        # Already a dict
         schema = evaluator_schema_path
 
-    # Extract model from schema's provider_configs if not explicitly provided
-    if model_name is None:
-        json_schema_extra = schema.get("json_schema_extra", {})
-        provider_configs = json_schema_extra.get("provider_configs", [])
-        if provider_configs:
-            # Use first provider config
-            first_provider = provider_configs[0]
-            provider_name = first_provider.get("provider_name", "openai")
-            schema_model_name = first_provider.get("model_name", "gpt-4o-mini")
-            # Format as "provider:model" if not OpenAI (OpenAI is default)
-            if provider_name == "openai":
-                model_name = schema_model_name
-            else:
-                model_name = f"{provider_name}:{schema_model_name}"
-            logger.debug(f"Using model from schema provider_configs: {model_name}")
+    # Extract schema components
+    output_schema = schema.get("properties", {})
+
+    # Extract phoenix_config for derived scores and evaluations
+    phoenix_config = schema.get("phoenix_config", {})
+    derived_scores_config = phoenix_config.get("derived_scores", {})
+    evaluations_config = phoenix_config.get("evaluations", [])
 
-    # Create evaluator config
+    # Create evaluator config (LLM wrapper, prompt, etc.)
     evaluator_config = create_phoenix_evaluator(
         evaluator_schema=schema,
         model_name=model_name,
     )
 
-    # Import llm_classify for evaluation
-    from phoenix.evals import llm_classify
-    import pandas as pd
+    import re
 
-    # Wrap for Phoenix experiment compatibility
-    def evaluator_fn(example: dict[str, Any]) -> dict[str, Any]:
-        """Evaluate a single example using Phoenix llm_classify.
+    def evaluator_fn(input: dict[str, Any], output: dict[str, Any], expected: dict[str, Any]) -> list[dict[str, Any]]:
+        """Evaluate using Phoenix's named parameter binding with structured LLM output.
 
-        Args:
-            example: Dict with 'input', 'output', 'expected' keys
-                - input: Agent input dict (e.g., {"query": "LOOKUP person:sarah-chen"})
-                - output: Agent output dict (what the agent returned)
-                - expected: Expected output dict (ground truth from dataset)
+        Phoenix automatically binds these parameters:
+        - input: Dataset input dict
+        - output: Task's return value (agent output)
+        - expected: Expected output dict (reference/ground truth)
 
         Returns:
-            Evaluation result with score, label, explanation
+            List of Phoenix evaluation dicts with name, score, label, explanation
         """
-        input_preview = str(example.get('input', ''))[:100]
-        logger.debug(f"Evaluating example: {input_preview}...")
-
-        # Phoenix llm_classify() expects a flat dict with string values
-        # Build evaluation input by flattening nested dicts
-        eval_input = {}
-
-        # Extract and flatten input fields
-        input_data = example.get("input", {})
-        if isinstance(input_data, dict):
-            for key, value in input_data.items():
-                eval_input[f"input_{key}"] = str(value) if value is not None else ""
+        logger.debug("Evaluating with structured output pattern")
+
+        # Extract question from input
+        if isinstance(input, dict):
+            question = input.get("input", input.get("text", str(input)))
         else:
-            eval_input["input"] = str(input_data) if input_data is not None else ""
+            question = str(input)
 
-        # Extract and flatten agent output fields
-        output_data = example.get("output", {})
-        if isinstance(output_data, dict):
-            for key, value in output_data.items():
-                eval_input[f"output_{key}"] = str(value) if value is not None else ""
+        # Serialize agent output
+        if isinstance(output, dict):
+            output_str = json.dumps(output, indent=2)
         else:
-            eval_input["output"] = str(output_data) if output_data is not None else ""
+            output_str = str(output)
 
-        # Extract and flatten expected fields (reference/ground truth)
-        expected_data = example.get("expected", {})
-        if isinstance(expected_data, dict):
-            for key, value in expected_data.items():
-                eval_input[f"expected_{key}"] = str(value) if value is not None else ""
-        elif expected_data:
-            eval_input["expected"] = str(expected_data)
+        # Get reference from expected
+        if isinstance(expected, dict):
+            reference = expected.get("reference", expected.get("expected_output",
+                expected.get("ground_truth", str(expected))))
+        else:
+            reference = str(expected)
 
         try:
-            # Create single-row DataFrame for llm_classify
-            # Note: Phoenix's llm_classify requires pandas DataFrame (imported above)
-            df = pd.DataFrame([eval_input])
-
-            # Call Phoenix llm_classify
-            results_df = llm_classify(
-                dataframe=df,
-                model=evaluator_config["llm"],
-                template=evaluator_config["prompt_template"],
-                rails=["correct", "partial", "incorrect"],  # Common labels
-                provide_explanation=True,
+            # Build user message
+            user_message = f"""Question/Input: {question}
+
+Agent's Answer:
+{output_str}
+
+Expected Answer (Reference):
+{reference}
+
+Please evaluate the agent's answer according to the evaluation criteria."""
+
+            # Add JSON schema requirement to system prompt
+            system_prompt = evaluator_config["prompt_template"]
+            schema_instruction = f"\n\nYou MUST respond with valid JSON matching this schema:\n{json.dumps(output_schema, indent=2)}\n\nProvide ONLY the JSON response, no markdown code blocks or extra text."
+            system_with_schema = system_prompt + schema_instruction
+
+            # Phoenix LLM models expect a single prompt string
+            llm = evaluator_config["llm"]
+            full_prompt = f"{system_with_schema}\n\n{user_message}"
+            response_text = llm(full_prompt)
+
+            # Parse JSON response
+            try:
+                response_json = json.loads(response_text)
+            except json.JSONDecodeError:
+                json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', response_text, re.DOTALL)
+                if json_match:
+                    response_json = json.loads(json_match.group(1))
+                else:
+                    raise ValueError(f"Could not parse JSON from LLM response: {response_text[:200]}")
+
+            logger.debug(f"LLM response parsed: {list(response_json.keys())}")
+
+            # Calculate derived scores using config
+            if derived_scores_config:
+                logger.debug(f"Calculating {len(derived_scores_config)} derived scores")
+                response_json = _calculate_derived_scores(response_json, derived_scores_config)
+
+            # Create Phoenix evaluations using config
+            if evaluations_config:
+                logger.debug(f"Creating {len(evaluations_config)} Phoenix evaluations")
+                evaluations = _create_phoenix_evaluations(response_json, evaluations_config)
+            else:
+                # Fallback: create evaluations from all numeric/boolean fields
+                logger.warning("No evaluations_config - creating default evaluations from schema")
+                evaluations = []
+                for field_name, field_value in response_json.items():
+                    if isinstance(field_value, (int, float)):
+                        evaluations.append({
+                            "name": field_name,
+                            "score": float(field_value),
+                            "label": "good" if field_value >= 0.5 else "poor",
+                            "explanation": None
+                        })
+                    elif isinstance(field_value, bool):
+                        evaluations.append({
+                            "name": field_name,
+                            "score": 1.0 if field_value else 0.0,
+                            "label": "pass" if field_value else "fail",
+                            "explanation": None
+                        })
+
+            # Always add overall if not present
+            if not any(e["name"] == "overall" for e in evaluations):
+                overall_score = response_json.get("overall_score", 0.0)
+                overall_pass = response_json.get("pass", False)
+                evaluations.append({
+                    "name": "overall",
+                    "score": overall_score if isinstance(overall_score, (int, float)) else 0.0,
+                    "label": "pass" if overall_pass else "fail",
+                    "explanation": response_json.get("evaluation_notes", None)
+                })
+
+            logger.debug(f"Created {len(evaluations)} evaluations")
+
+            # Phoenix client expects a dict with score, label, explanation
+            # (not the old EvaluationResult class)
+            overall_eval = next(
+                (e for e in evaluations if e["name"] == "overall"),
+                {"score": 0.0, "label": "unknown", "explanation": None}
             )
 
-            # Extract result (results_df is pandas DataFrame from Phoenix)
-            if not results_df.empty:
-                row = results_df.iloc[0]
-                label = row.get("label", "error")
-                explanation = row.get("explanation", "")
-
-                # Map labels to scores
-                score_map = {"correct": 1.0, "partial": 0.5, "incorrect": 0.0}
-                score = score_map.get(label, 0.0)
-
-                return {
-                    "label": label,
-                    "score": score,
-                    "explanation": explanation or "",
-                }
-            else:
-                logger.warning("llm_classify returned empty DataFrame")
-                return {
-                    "label": "error",
-                    "score": 0.0,
-                    "explanation": "Evaluator returned empty result",
-                }
+            return {
+                "score": overall_eval.get("score", 0.0),
+                "label": overall_eval.get("label", "unknown"),
+                "explanation": overall_eval.get("explanation"),
+            }
 
         except Exception as e:
             logger.error(f"Evaluator error: {e}")
             return {
-                "label": "error",
                 "score": 0.0,
+                "label": "error",
                 "explanation": f"Evaluator failed: {str(e)}",
             }
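Taken together, a hedged end-to-end sketch of the new flow. The evaluator name comes from the docstring example; the input/output/expected payloads are invented, and running it makes a real LLM call with whatever credentials are configured.

from rem.agentic.providers.phoenix import (
    create_evaluator_from_schema,
    validate_evaluator_credentials,
)

# 1. Pre-flight credential check (new in this release).
ok, error = validate_evaluator_credentials()
if not ok:
    raise RuntimeError(f"Evaluator validation failed: {error}")

# 2. Build the evaluator; with no model_name it falls back to EVALUATOR_MODEL,
#    then to the hard-coded Claude Sonnet default.
evaluator = create_evaluator_from_schema("rem-lookup-correctness")

# 3. Phoenix binds input/output/expected by name; calling it directly works too.
result = evaluator(
    input={"input": "LOOKUP person:sarah-chen"},
    output={"label": "sarah-chen", "type": "person"},
    expected={"reference": {"label": "sarah-chen", "type": "person"}},
)
print(result)  # {"score": ..., "label": "pass" / "fail" / ..., "explanation": ...}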