remdb 0.3.7__py3-none-any.whl → 0.3.133__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. rem/__init__.py +129 -2
  2. rem/agentic/README.md +76 -0
  3. rem/agentic/__init__.py +15 -0
  4. rem/agentic/agents/__init__.py +16 -2
  5. rem/agentic/agents/sse_simulator.py +502 -0
  6. rem/agentic/context.py +51 -25
  7. rem/agentic/llm_provider_models.py +301 -0
  8. rem/agentic/mcp/tool_wrapper.py +112 -17
  9. rem/agentic/otel/setup.py +93 -4
  10. rem/agentic/providers/phoenix.py +314 -132
  11. rem/agentic/providers/pydantic_ai.py +215 -26
  12. rem/agentic/schema.py +361 -21
  13. rem/agentic/tools/rem_tools.py +3 -3
  14. rem/api/README.md +238 -1
  15. rem/api/deps.py +255 -0
  16. rem/api/main.py +154 -37
  17. rem/api/mcp_router/resources.py +1 -1
  18. rem/api/mcp_router/server.py +26 -5
  19. rem/api/mcp_router/tools.py +465 -7
  20. rem/api/middleware/tracking.py +172 -0
  21. rem/api/routers/admin.py +494 -0
  22. rem/api/routers/auth.py +124 -0
  23. rem/api/routers/chat/completions.py +402 -20
  24. rem/api/routers/chat/models.py +88 -10
  25. rem/api/routers/chat/otel_utils.py +33 -0
  26. rem/api/routers/chat/sse_events.py +542 -0
  27. rem/api/routers/chat/streaming.py +642 -45
  28. rem/api/routers/dev.py +81 -0
  29. rem/api/routers/feedback.py +268 -0
  30. rem/api/routers/messages.py +473 -0
  31. rem/api/routers/models.py +78 -0
  32. rem/api/routers/query.py +360 -0
  33. rem/api/routers/shared_sessions.py +406 -0
  34. rem/auth/middleware.py +126 -27
  35. rem/cli/commands/README.md +237 -64
  36. rem/cli/commands/ask.py +13 -10
  37. rem/cli/commands/cluster.py +1808 -0
  38. rem/cli/commands/configure.py +5 -6
  39. rem/cli/commands/db.py +396 -139
  40. rem/cli/commands/experiments.py +469 -74
  41. rem/cli/commands/process.py +22 -15
  42. rem/cli/commands/scaffold.py +47 -0
  43. rem/cli/commands/schema.py +97 -50
  44. rem/cli/main.py +29 -6
  45. rem/config.py +10 -3
  46. rem/models/core/core_model.py +7 -1
  47. rem/models/core/experiment.py +54 -0
  48. rem/models/core/rem_query.py +5 -2
  49. rem/models/entities/__init__.py +21 -0
  50. rem/models/entities/domain_resource.py +38 -0
  51. rem/models/entities/feedback.py +123 -0
  52. rem/models/entities/message.py +30 -1
  53. rem/models/entities/session.py +83 -0
  54. rem/models/entities/shared_session.py +180 -0
  55. rem/models/entities/user.py +10 -3
  56. rem/registry.py +373 -0
  57. rem/schemas/agents/rem.yaml +7 -3
  58. rem/services/content/providers.py +92 -133
  59. rem/services/content/service.py +92 -20
  60. rem/services/dreaming/affinity_service.py +2 -16
  61. rem/services/dreaming/moment_service.py +2 -15
  62. rem/services/embeddings/api.py +24 -17
  63. rem/services/embeddings/worker.py +16 -16
  64. rem/services/phoenix/EXPERIMENT_DESIGN.md +3 -3
  65. rem/services/phoenix/client.py +302 -28
  66. rem/services/postgres/README.md +159 -15
  67. rem/services/postgres/__init__.py +2 -1
  68. rem/services/postgres/diff_service.py +531 -0
  69. rem/services/postgres/pydantic_to_sqlalchemy.py +427 -129
  70. rem/services/postgres/repository.py +132 -0
  71. rem/services/postgres/schema_generator.py +291 -9
  72. rem/services/postgres/service.py +6 -6
  73. rem/services/rate_limit.py +113 -0
  74. rem/services/rem/README.md +14 -0
  75. rem/services/rem/parser.py +44 -9
  76. rem/services/rem/service.py +36 -2
  77. rem/services/session/compression.py +24 -1
  78. rem/services/session/reload.py +1 -1
  79. rem/services/user_service.py +98 -0
  80. rem/settings.py +399 -29
  81. rem/sql/background_indexes.sql +21 -16
  82. rem/sql/migrations/001_install.sql +387 -54
  83. rem/sql/migrations/002_install_models.sql +2320 -393
  84. rem/sql/migrations/003_optional_extensions.sql +326 -0
  85. rem/sql/migrations/004_cache_system.sql +548 -0
  86. rem/utils/__init__.py +18 -0
  87. rem/utils/constants.py +97 -0
  88. rem/utils/date_utils.py +228 -0
  89. rem/utils/embeddings.py +17 -4
  90. rem/utils/files.py +167 -0
  91. rem/utils/mime_types.py +158 -0
  92. rem/utils/model_helpers.py +156 -1
  93. rem/utils/schema_loader.py +282 -35
  94. rem/utils/sql_paths.py +146 -0
  95. rem/utils/sql_types.py +3 -1
  96. rem/utils/vision.py +9 -14
  97. rem/workers/README.md +14 -14
  98. rem/workers/__init__.py +3 -1
  99. rem/workers/db_listener.py +579 -0
  100. rem/workers/db_maintainer.py +74 -0
  101. rem/workers/unlogged_maintainer.py +463 -0
  102. {remdb-0.3.7.dist-info → remdb-0.3.133.dist-info}/METADATA +460 -303
  103. {remdb-0.3.7.dist-info → remdb-0.3.133.dist-info}/RECORD +105 -74
  104. {remdb-0.3.7.dist-info → remdb-0.3.133.dist-info}/WHEEL +1 -1
  105. rem/sql/002_install_models.sql +0 -1068
  106. rem/sql/install_models.sql +0 -1038
  107. {remdb-0.3.7.dist-info → remdb-0.3.133.dist-info}/entry_points.txt +0 -0
@@ -128,15 +128,16 @@ def sanitize_tool_name(tool_name: str) -> str:
 
 
 def load_evaluator_schema(evaluator_name: str) -> dict[str, Any]:
-    """Load evaluator schema from schemas/evaluators/ directory.
+    """Load evaluator schema using centralized schema loader.
 
-    Searches for evaluator schema in rem/schemas/evaluators/
-    Supports .json, .yaml, and .yml files.
+    Uses the same unified search logic as agent schemas:
+    - "hello-world/default" → schemas/evaluators/hello-world/default.yaml
+    - "lookup-correctness" → schemas/evaluators/rem/lookup-correctness.yaml
+    - "rem-lookup-correctness" → schemas/evaluators/rem/lookup-correctness.yaml
 
     Args:
-        evaluator_name: Evaluator name (with or without extension)
-            e.g., "rem-lookup-correctness" or
-            "rem-lookup-correctness.yaml"
+        evaluator_name: Evaluator name or path
+            e.g., "hello-world/default", "lookup-correctness"
 
     Returns:
         Evaluator schema dictionary with keys:
@@ -150,43 +151,13 @@ def load_evaluator_schema(evaluator_name: str) -> dict[str, Any]:
         FileNotFoundError: If evaluator schema not found
 
     Example:
-        >>> schema = load_evaluator_schema("rem-lookup-correctness")
+        >>> schema = load_evaluator_schema("hello-world/default")
         >>> print(schema["description"])
     """
-    # Get schemas directory (rem/schemas/evaluators/)
-    # rem.__file__ = rem/src/rem/__init__.py
-    # We need rem/schemas/evaluators/
-    import rem
-    rem_module_dir = Path(rem.__file__).parent  # rem/src/rem
-    rem_package_root = rem_module_dir.parent.parent  # rem/src/rem -> rem/src -> rem
-    schema_dir = rem_package_root / "schemas" / "evaluators"
-
-    # Try .yaml first (preferred format)
-    yaml_path = schema_dir / f"{evaluator_name}.yaml"
-    if yaml_path.exists():
-        logger.debug(f"Loading evaluator schema from {yaml_path}")
-        with open(yaml_path) as f:
-            return yaml.safe_load(f)
-
-    # Try .yml
-    yml_path = schema_dir / f"{evaluator_name}.yml"
-    if yml_path.exists():
-        logger.debug(f"Loading evaluator schema from {yml_path}")
-        with open(yml_path) as f:
-            return yaml.safe_load(f)
-
-    # Try .json
-    json_path = schema_dir / f"{evaluator_name}.json"
-    if json_path.exists():
-        logger.debug(f"Loading evaluator schema from {json_path}")
-        with open(json_path) as f:
-            return json.load(f)
-
-    raise FileNotFoundError(
-        f"Evaluator schema not found: {evaluator_name}\n"
-        f"Searched in: {schema_dir}\n"
-        f"Supported formats: .yaml, .yml, .json"
-    )
+    from ...utils.schema_loader import load_agent_schema
+
+    # Use centralized schema loader (searches evaluator paths too)
+    return load_agent_schema(evaluator_name)
 
 
 # =============================================================================
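
A brief usage sketch of the refactored loader: resolution is now delegated to load_agent_schema, so evaluator names and agent names follow the same search rules. The snippet assumes the corresponding YAML file ships under schemas/evaluators/ in the installed package.

# Usage sketch; "hello-world/default" is one of the resolution examples from the
# docstring above and must exist as a packaged schema for this call to succeed.
schema = load_evaluator_schema("hello-world/default")
# e.g. resolved to schemas/evaluators/hello-world/default.yaml
print(schema.get("description"))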
@@ -255,10 +226,15 @@ def create_phoenix_evaluator(
     # Create appropriate Phoenix LLM wrapper based on provider
     llm: OpenAIModel | AnthropicModel
     if provider.lower() == "anthropic":
-        # Anthropic models don't support top_p parameter
+        # Anthropic's newer Claude models (claude-sonnet-4, claude-opus-4, etc.)
+        # don't allow both temperature and top_p to be specified together.
+        # Phoenix's AnthropicModel defaults top_p=1, so we explicitly set it
+        # to None to prevent it from being sent in the API request.
+        # The invocation_parameters() method only includes params that are not None.
         llm = AnthropicModel(
             model=phoenix_model_name,
             temperature=0.0,
+            top_p=None,  # type: ignore[arg-type] - None prevents param from being sent
         )
     else:
         # Default to OpenAI for other providers (gpt-4, etc.)
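
The comment added in this hunk hinges on one behavior: parameters left as None never reach the request payload. A minimal sketch of that filtering pattern, purely illustrative and not Phoenix's actual AnthropicModel implementation:

# Illustrative only -- not Phoenix source. Shows the "drop None params" pattern the
# comment above relies on: top_p=None never reaches the Anthropic API request.
def build_invocation_parameters(temperature: float | None, top_p: float | None) -> dict:
    candidates = {"temperature": temperature, "top_p": top_p}
    return {k: v for k, v in candidates.items() if v is not None}

print(build_invocation_parameters(0.0, None))  # {'temperature': 0.0}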
@@ -278,13 +254,178 @@ def create_phoenix_evaluator(
     return evaluator_config
 
 
+def _evaluate_expression(expression: str, context: dict[str, Any]) -> Any:
+    """Safely evaluate a simple expression with context variables.
+
+    Supports: arithmetic, comparisons, boolean logic, len()
+    """
+    try:
+        allowed_names = {
+            "len": len,
+            "True": True,
+            "False": False,
+            "true": True,
+            "false": False,
+        }
+        allowed_names.update(context)
+        return eval(expression, {"__builtins__": {}}, allowed_names)
+    except Exception as e:
+        logger.warning(f"Expression evaluation failed: {expression} - {e}")
+        return 0.0
+
+
+def _calculate_derived_scores(
+    response_json: dict[str, Any],
+    derived_scores_config: dict[str, Any],
+) -> dict[str, Any]:
+    """Calculate derived scores from evaluator output using config formulas.
+
+    Supports:
+    - weighted_sum: Weighted average of fields
+    - conditional_weighted: Different formulas based on conditions
+    - boolean_logic: Boolean expression evaluation
+    """
+    for score_name, score_config in derived_scores_config.items():
+        score_type = score_config.get("type")
+
+        if score_type == "weighted_sum":
+            weights = score_config.get("weights", {})
+            total = 0.0
+            for field, weight in weights.items():
+                field_value = response_json.get(field, 0.0)
+                if isinstance(field_value, (int, float)):
+                    total += field_value * weight
+            response_json[score_name] = total
+
+        elif score_type == "conditional_weighted":
+            conditions = score_config.get("conditions", [])
+            formula_to_use = None
+            for cond_config in conditions:
+                condition = cond_config.get("condition")
+                if condition is None:
+                    formula_to_use = cond_config.get("formula")
+                    break
+                field = condition.get("field")
+                operator = condition.get("operator")
+                value = condition.get("value")
+                field_value = response_json.get(field, 0.0)
+                condition_met = False
+                if operator == ">=":
+                    condition_met = field_value >= value
+                elif operator == ">":
+                    condition_met = field_value > value
+                elif operator == "<=":
+                    condition_met = field_value <= value
+                elif operator == "<":
+                    condition_met = field_value < value
+                elif operator == "==":
+                    condition_met = field_value == value
+                elif operator == "!=":
+                    condition_met = field_value != value
+                if condition_met:
+                    formula_to_use = cond_config.get("formula")
+                    break
+            if formula_to_use and formula_to_use.get("type") == "weighted_sum":
+                weights = formula_to_use.get("weights", {})
+                total = 0.0
+                for field, weight in weights.items():
+                    field_value = response_json.get(field, 0.0)
+                    if isinstance(field_value, (int, float)):
+                        total += field_value * weight
+                response_json[score_name] = total
+
+        elif score_type == "boolean_logic":
+            expression = score_config.get("expression", "")
+            result = _evaluate_expression(expression, response_json)
+            response_json[score_name] = result
+
+    return response_json
+
+
+def _create_phoenix_evaluations(
+    response_json: dict[str, Any],
+    evaluations_config: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+    """Create Phoenix evaluation dicts from evaluator output using config.
+
+    Each evaluation becomes a column in Phoenix UI with name, label, score, explanation.
+    """
+    evaluations = []
+    for eval_config in evaluations_config:
+        eval_name = eval_config.get("name", "unnamed")
+        score_field = eval_config.get("score_field")
+        score_expression = eval_config.get("score_expression")
+        label_field = eval_config.get("label_field")
+        label_expression = eval_config.get("label_expression")
+        label_logic = eval_config.get("label_logic", [])
+        label_transform = eval_config.get("label_transform", {})
+        score_logic = eval_config.get("score_logic", {})
+        explanation_field = eval_config.get("explanation_field")
+
+        evaluation = {"name": eval_name}
+
+        # Get score
+        if score_expression:
+            evaluation["score"] = _evaluate_expression(score_expression, response_json)
+        elif score_field:
+            evaluation["score"] = response_json.get(score_field, 0.0)
+        elif score_logic and label_field:
+            label_value = response_json.get(label_field)
+            if isinstance(label_value, bool):
+                label_value = "true" if label_value else "false"
+            evaluation["score"] = score_logic.get(str(label_value), 0.0)
+        else:
+            evaluation["score"] = None
+
+        # Get label
+        if label_expression:
+            evaluation["label"] = str(_evaluate_expression(label_expression, response_json))
+        elif label_field:
+            label_value = response_json.get(label_field)
+            if isinstance(label_value, bool):
+                label_value = "true" if label_value else "false"
+            if label_transform:
+                evaluation["label"] = label_transform.get(str(label_value), str(label_value))
+            else:
+                evaluation["label"] = str(label_value)
+        elif label_logic and (score_field or score_expression):
+            score_value = evaluation.get("score", 0.0)
+            label = "unknown"
+            for logic in label_logic:
+                threshold = logic.get("threshold", 0.0)
+                operator = logic.get("operator", ">=")
+                if operator == ">=" and score_value >= threshold:
+                    label = logic.get("label", "unknown")
+                    break
+                elif operator == ">" and score_value > threshold:
+                    label = logic.get("label", "unknown")
+                    break
+            evaluation["label"] = label
+        else:
+            evaluation["label"] = None
+
+        # Get explanation
+        if explanation_field:
+            explanation_value = response_json.get(explanation_field, "")
+            if isinstance(explanation_value, list):
+                evaluation["explanation"] = ", ".join(str(x) for x in explanation_value) if explanation_value else "None"
+            else:
+                evaluation["explanation"] = str(explanation_value)
+        else:
+            evaluation["explanation"] = None
+
+        evaluations.append(evaluation)
+    return evaluations
+
+
 def create_evaluator_from_schema(
     evaluator_schema_path: str | Path | dict[str, Any],
     model_name: str | None = None,
 ) -> Callable[[Any], Any]:
     """Create an evaluator function from a schema file or dict.
 
-    The returned evaluator is a callable that Phoenix experiments can use.
+    Uses direct LLM call with JSON schema for structured output evaluation.
+    Supports phoenix_config for derived scores and evaluation column mappings.
 
     Args:
         evaluator_schema_path: Path to schema file, evaluator name, or schema dict
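
For orientation, here is a minimal sketch of the phoenix_config shape consumed by the helpers added in the hunk above (_calculate_derived_scores and _create_phoenix_evaluations). The config keys (derived_scores, weighted_sum, weights, evaluations, score_field, label_logic, explanation_field) come from that code; the evaluator fields "accuracy", "completeness", "notes" and the weights and thresholds are hypothetical examples, not values shipped with remdb.

# Hypothetical evaluator output and phoenix_config, run through the helpers above.
response_json = {"accuracy": 1.0, "completeness": 0.5, "notes": "missing one supporting fact"}

phoenix_config = {
    "derived_scores": {
        "overall_score": {
            "type": "weighted_sum",
            "weights": {"accuracy": 0.7, "completeness": 0.3},
        },
    },
    "evaluations": [
        {
            "name": "overall",
            "score_field": "overall_score",
            "label_logic": [
                {"operator": ">=", "threshold": 0.8, "label": "pass"},
                {"operator": ">=", "threshold": 0.0, "label": "fail"},
            ],
            "explanation_field": "notes",
        },
    ],
}

scored = _calculate_derived_scores(response_json, phoenix_config["derived_scores"])
# scored["overall_score"] ≈ 0.85 (1.0 * 0.7 + 0.5 * 0.3)
evaluations = _create_phoenix_evaluations(scored, phoenix_config["evaluations"])
# [{"name": "overall", "score": ≈0.85, "label": "pass", "explanation": "missing one supporting fact"}]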
@@ -298,19 +439,9 @@ def create_evaluator_from_schema(
         ImportError: If arize-phoenix not installed
 
     Example:
-        >>> # From evaluator name (searches in schemas/evaluators/)
         >>> evaluator = create_evaluator_from_schema("rem-lookup-correctness")
-        >>>
-        >>> # From schema dict
-        >>> schema = {"description": "...", "properties": {...}}
-        >>> evaluator = create_evaluator_from_schema(schema)
-        >>>
-        >>> # Use in experiment
-        >>> result = evaluator({
-        ...     "input": {"query": "LOOKUP person:sarah-chen"},
-        ...     "output": {"label": "sarah-chen", "type": "person", ...},
-        ...     "expected": {"label": "sarah-chen", "type": "person", ...}
-        ... })
+        >>> result = evaluator(input={...}, output={...}, expected={...})
+        >>> # Returns: list of {"name": "...", "score": 0.95, "label": "...", "explanation": "..."}
     """
     if not _check_phoenix_available():
         raise ImportError(
@@ -321,8 +452,6 @@ def create_evaluator_from_schema(
     # Load schema if path/name provided
     if isinstance(evaluator_schema_path, (str, Path)):
        schema_path = Path(evaluator_schema_path)
-
-        # If it's a file path, load directly
        if schema_path.exists():
            logger.debug(f"Loading evaluator schema from {schema_path}")
            if schema_path.suffix in [".yaml", ".yml"]:
@@ -332,108 +461,161 @@ def create_evaluator_from_schema(
                 with open(schema_path) as f:
                     schema = json.load(f)
         else:
-            # Treat as evaluator name, search in schemas/evaluators/
             schema = load_evaluator_schema(str(evaluator_schema_path))
     else:
-        # Already a dict
         schema = evaluator_schema_path
 
-    # Create evaluator config
+    # Extract schema components
+    output_schema = schema.get("properties", {})
+
+    # Extract phoenix_config for derived scores and evaluations
+    phoenix_config = schema.get("phoenix_config", {})
+    derived_scores_config = phoenix_config.get("derived_scores", {})
+    evaluations_config = phoenix_config.get("evaluations", [])
+
+    # Create evaluator config (LLM wrapper, prompt, etc.)
     evaluator_config = create_phoenix_evaluator(
         evaluator_schema=schema,
         model_name=model_name,
     )
 
-    # Import llm_classify for evaluation
-    from phoenix.evals import llm_classify
-    import pandas as pd
+    import re
 
-    # Wrap for Phoenix experiment compatibility
-    def evaluator_fn(example: dict[str, Any]) -> dict[str, Any]:
-        """Evaluate a single example using Phoenix llm_classify.
+    def evaluator_fn(input: dict[str, Any], output: dict[str, Any], expected: dict[str, Any]) -> list[dict[str, Any]]:
+        """Evaluate using Phoenix's named parameter binding with structured LLM output.
 
-        Args:
-            example: Dict with 'input', 'output', 'expected' keys
-                - input: Agent input dict (e.g., {"query": "LOOKUP person:sarah-chen"})
-                - output: Agent output dict (what the agent returned)
-                - expected: Expected output dict (ground truth from dataset)
+        Phoenix automatically binds these parameters:
+        - input: Dataset input dict
+        - output: Task's return value (agent output)
+        - expected: Expected output dict (reference/ground truth)
 
         Returns:
-            Evaluation result with score, label, explanation
+            List of Phoenix evaluation dicts with name, score, label, explanation
         """
-        logger.debug(f"Evaluating example: {example.get('input', '')[:100]}...")
+        logger.debug("Evaluating with structured output pattern")
 
-        # Phoenix llm_classify() expects a flat dict with string values
-        # Build evaluation input by flattening nested dicts
-        eval_input = {}
-
-        # Extract and flatten input fields
-        input_data = example.get("input", {})
-        if isinstance(input_data, dict):
-            for key, value in input_data.items():
-                eval_input[f"input_{key}"] = str(value) if value is not None else ""
+        # Extract question from input
+        if isinstance(input, dict):
+            question = input.get("input", input.get("text", str(input)))
         else:
-            eval_input["input"] = str(input_data) if input_data is not None else ""
+            question = str(input)
 
-        # Extract and flatten agent output fields
-        output_data = example.get("output", {})
-        if isinstance(output_data, dict):
-            for key, value in output_data.items():
-                eval_input[f"output_{key}"] = str(value) if value is not None else ""
+        # Serialize agent output
+        if isinstance(output, dict):
+            output_str = json.dumps(output, indent=2)
         else:
-            eval_input["output"] = str(output_data) if output_data is not None else ""
+            output_str = str(output)
 
-        # Extract and flatten expected fields (reference/ground truth)
-        expected_data = example.get("expected", {})
-        if isinstance(expected_data, dict):
-            for key, value in expected_data.items():
-                eval_input[f"expected_{key}"] = str(value) if value is not None else ""
-        elif expected_data:
-            eval_input["expected"] = str(expected_data)
+        # Get reference from expected
+        if isinstance(expected, dict):
+            reference = expected.get("reference", expected.get("expected_output",
+                                     expected.get("ground_truth", str(expected))))
+        else:
+            reference = str(expected)
 
         try:
-            # Create single-row DataFrame for llm_classify
-            df = pd.DataFrame([eval_input])
-
-            # Call Phoenix llm_classify
-            results_df = llm_classify(
-                dataframe=df,
-                model=evaluator_config["llm"],
-                template=evaluator_config["prompt_template"],
-                rails=["correct", "partial", "incorrect"],  # Common labels
-                provide_explanation=True,
+            # Build user message
+            user_message = f"""Question/Input: {question}
+
+Agent's Answer:
+{output_str}
+
+Expected Answer (Reference):
+{reference}
+
+Please evaluate the agent's answer according to the evaluation criteria."""
+
+            # Add JSON schema requirement to system prompt
+            system_prompt = evaluator_config["prompt_template"]
+            schema_instruction = f"\n\nYou MUST respond with valid JSON matching this schema:\n{json.dumps(output_schema, indent=2)}\n\nProvide ONLY the JSON response, no markdown code blocks or extra text."
+            system_with_schema = system_prompt + schema_instruction
+
+            # Phoenix LLM models expect a single prompt string
+            llm = evaluator_config["llm"]
+            full_prompt = f"{system_with_schema}\n\n{user_message}"
+            response_text = llm(full_prompt)
+
+            # Parse JSON response
+            try:
+                response_json = json.loads(response_text)
+            except json.JSONDecodeError:
+                json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', response_text, re.DOTALL)
+                if json_match:
+                    response_json = json.loads(json_match.group(1))
+                else:
+                    raise ValueError(f"Could not parse JSON from LLM response: {response_text[:200]}")
+
+            logger.debug(f"LLM response parsed: {list(response_json.keys())}")
+
+            # Calculate derived scores using config
+            if derived_scores_config:
+                logger.debug(f"Calculating {len(derived_scores_config)} derived scores")
+                response_json = _calculate_derived_scores(response_json, derived_scores_config)
+
+            # Create Phoenix evaluations using config
+            if evaluations_config:
+                logger.debug(f"Creating {len(evaluations_config)} Phoenix evaluations")
+                evaluations = _create_phoenix_evaluations(response_json, evaluations_config)
+            else:
+                # Fallback: create evaluations from all numeric/boolean fields
+                logger.warning("No evaluations_config - creating default evaluations from schema")
+                evaluations = []
+                for field_name, field_value in response_json.items():
+                    if isinstance(field_value, (int, float)):
+                        evaluations.append({
+                            "name": field_name,
+                            "score": float(field_value),
+                            "label": "good" if field_value >= 0.5 else "poor",
+                            "explanation": None
+                        })
+                    elif isinstance(field_value, bool):
+                        evaluations.append({
+                            "name": field_name,
+                            "score": 1.0 if field_value else 0.0,
+                            "label": "pass" if field_value else "fail",
+                            "explanation": None
+                        })
+
+            # Always add overall if not present
+            if not any(e["name"] == "overall" for e in evaluations):
+                overall_score = response_json.get("overall_score", 0.0)
+                overall_pass = response_json.get("pass", False)
+                evaluations.append({
+                    "name": "overall",
+                    "score": overall_score if isinstance(overall_score, (int, float)) else 0.0,
+                    "label": "pass" if overall_pass else "fail",
+                    "explanation": response_json.get("evaluation_notes", None)
+                })
+
+            logger.debug(f"Created {len(evaluations)} evaluations")
+
+            # Phoenix run_experiment expects a single EvaluationResult, not a list.
+            # Return the overall score with detailed evaluations in metadata.
+            from phoenix.experiments.evaluators.base import EvaluationResult
+
+            overall_eval = next(
+                (e for e in evaluations if e["name"] == "overall"),
+                {"score": 0.0, "label": "unknown", "explanation": None}
             )
 
-            # Extract result
-            if not results_df.empty:
-                row = results_df.iloc[0]
-                label = row.get("label", "error")
-                explanation = row.get("explanation", "")
-
-                # Map labels to scores
-                score_map = {"correct": 1.0, "partial": 0.5, "incorrect": 0.0}
-                score = score_map.get(label, 0.0)
-
-                return {
-                    "label": label,
-                    "score": score,
-                    "explanation": explanation or "",
-                }
-            else:
-                logger.warning("llm_classify returned empty DataFrame")
-                return {
-                    "label": "error",
-                    "score": 0.0,
-                    "explanation": "Evaluator returned empty result",
+            return EvaluationResult(
+                score=overall_eval.get("score"),
+                label=overall_eval.get("label"),
+                explanation=overall_eval.get("explanation"),
+                metadata={
+                    "evaluations": evaluations,
+                    "raw_response": response_json,
                 }
+            )
 
         except Exception as e:
             logger.error(f"Evaluator error: {e}")
-            return {
-                "label": "error",
-                "score": 0.0,
-                "explanation": f"Evaluator failed: {str(e)}",
-            }
+            from phoenix.experiments.evaluators.base import EvaluationResult
+            return EvaluationResult(
+                score=0.0,
+                label="error",
+                explanation=f"Evaluator failed: {str(e)}",
+            )
 
         return evaluator_fn
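
A hedged sketch of calling the evaluator returned by create_evaluator_from_schema directly, mirroring the docstring example in the hunks above. A real call invokes the configured LLM, so provider credentials are required; the schema name and payloads are the ones shown in the original docstring.

# Build the evaluator from a packaged schema and call it with the named parameters
# Phoenix binds during experiments (input / output / expected).
evaluator = create_evaluator_from_schema("rem-lookup-correctness")

result = evaluator(
    input={"query": "LOOKUP person:sarah-chen"},
    output={"label": "sarah-chen", "type": "person"},
    expected={"label": "sarah-chen", "type": "person"},
)

# result is a Phoenix EvaluationResult; the per-column evaluations assembled by
# evaluator_fn are carried in result.metadata["evaluations"].
print(result.score, result.label)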