remdb 0.3.14__py3-none-any.whl → 0.3.133__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. rem/agentic/README.md +76 -0
  2. rem/agentic/__init__.py +15 -0
  3. rem/agentic/agents/__init__.py +16 -2
  4. rem/agentic/agents/sse_simulator.py +502 -0
  5. rem/agentic/context.py +51 -27
  6. rem/agentic/llm_provider_models.py +301 -0
  7. rem/agentic/mcp/tool_wrapper.py +112 -17
  8. rem/agentic/otel/setup.py +93 -4
  9. rem/agentic/providers/phoenix.py +302 -109
  10. rem/agentic/providers/pydantic_ai.py +215 -26
  11. rem/agentic/schema.py +361 -21
  12. rem/agentic/tools/rem_tools.py +3 -3
  13. rem/api/README.md +215 -1
  14. rem/api/deps.py +255 -0
  15. rem/api/main.py +132 -40
  16. rem/api/mcp_router/resources.py +1 -1
  17. rem/api/mcp_router/server.py +26 -5
  18. rem/api/mcp_router/tools.py +465 -7
  19. rem/api/routers/admin.py +494 -0
  20. rem/api/routers/auth.py +70 -0
  21. rem/api/routers/chat/completions.py +402 -20
  22. rem/api/routers/chat/models.py +88 -10
  23. rem/api/routers/chat/otel_utils.py +33 -0
  24. rem/api/routers/chat/sse_events.py +542 -0
  25. rem/api/routers/chat/streaming.py +642 -45
  26. rem/api/routers/dev.py +81 -0
  27. rem/api/routers/feedback.py +268 -0
  28. rem/api/routers/messages.py +473 -0
  29. rem/api/routers/models.py +78 -0
  30. rem/api/routers/query.py +360 -0
  31. rem/api/routers/shared_sessions.py +406 -0
  32. rem/auth/middleware.py +126 -27
  33. rem/cli/commands/README.md +237 -64
  34. rem/cli/commands/cluster.py +1808 -0
  35. rem/cli/commands/configure.py +1 -3
  36. rem/cli/commands/db.py +386 -143
  37. rem/cli/commands/experiments.py +418 -27
  38. rem/cli/commands/process.py +14 -8
  39. rem/cli/commands/schema.py +97 -50
  40. rem/cli/main.py +27 -6
  41. rem/config.py +10 -3
  42. rem/models/core/core_model.py +7 -1
  43. rem/models/core/experiment.py +54 -0
  44. rem/models/core/rem_query.py +5 -2
  45. rem/models/entities/__init__.py +21 -0
  46. rem/models/entities/domain_resource.py +38 -0
  47. rem/models/entities/feedback.py +123 -0
  48. rem/models/entities/message.py +30 -1
  49. rem/models/entities/session.py +83 -0
  50. rem/models/entities/shared_session.py +180 -0
  51. rem/registry.py +10 -4
  52. rem/schemas/agents/rem.yaml +7 -3
  53. rem/services/content/service.py +92 -20
  54. rem/services/embeddings/api.py +4 -4
  55. rem/services/embeddings/worker.py +16 -16
  56. rem/services/phoenix/client.py +154 -14
  57. rem/services/postgres/README.md +159 -15
  58. rem/services/postgres/__init__.py +2 -1
  59. rem/services/postgres/diff_service.py +531 -0
  60. rem/services/postgres/pydantic_to_sqlalchemy.py +427 -129
  61. rem/services/postgres/repository.py +132 -0
  62. rem/services/postgres/schema_generator.py +205 -4
  63. rem/services/postgres/service.py +6 -6
  64. rem/services/rem/parser.py +44 -9
  65. rem/services/rem/service.py +36 -2
  66. rem/services/session/compression.py +24 -1
  67. rem/services/session/reload.py +1 -1
  68. rem/settings.py +324 -23
  69. rem/sql/background_indexes.sql +21 -16
  70. rem/sql/migrations/001_install.sql +387 -54
  71. rem/sql/migrations/002_install_models.sql +2320 -393
  72. rem/sql/migrations/003_optional_extensions.sql +326 -0
  73. rem/sql/migrations/004_cache_system.sql +548 -0
  74. rem/utils/__init__.py +18 -0
  75. rem/utils/date_utils.py +2 -2
  76. rem/utils/model_helpers.py +156 -1
  77. rem/utils/schema_loader.py +220 -22
  78. rem/utils/sql_paths.py +146 -0
  79. rem/utils/sql_types.py +3 -1
  80. rem/workers/__init__.py +3 -1
  81. rem/workers/db_listener.py +579 -0
  82. rem/workers/unlogged_maintainer.py +463 -0
  83. {remdb-0.3.14.dist-info → remdb-0.3.133.dist-info}/METADATA +335 -226
  84. {remdb-0.3.14.dist-info → remdb-0.3.133.dist-info}/RECORD +86 -66
  85. {remdb-0.3.14.dist-info → remdb-0.3.133.dist-info}/WHEEL +1 -1
  86. rem/sql/002_install_models.sql +0 -1068
  87. rem/sql/install_models.sql +0 -1051
  88. rem/sql/migrations/003_seed_default_user.sql +0 -48
  89. {remdb-0.3.14.dist-info → remdb-0.3.133.dist-info}/entry_points.txt +0 -0
rem/agentic/otel/setup.py CHANGED
@@ -14,6 +14,7 @@ from loguru import logger
 
 from ...settings import settings
 
+
 # Global flag to track if instrumentation is initialized
 _instrumentation_initialized = False
 
@@ -52,12 +53,94 @@ def setup_instrumentation() -> None:
 
     try:
         from opentelemetry import trace
-        from opentelemetry.sdk.trace import TracerProvider
-        from opentelemetry.sdk.trace.export import BatchSpanProcessor
+        from opentelemetry.sdk.trace import TracerProvider, ReadableSpan
+        from opentelemetry.sdk.trace.export import BatchSpanProcessor, SpanExporter, SpanExportResult
         from opentelemetry.sdk.resources import Resource, SERVICE_NAME, DEPLOYMENT_ENVIRONMENT
         from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter as HTTPExporter
         from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter as GRPCExporter
 
+        class SanitizingSpanExporter(SpanExporter):
+            """
+            Wrapper exporter that sanitizes span attributes before export.
+
+            Removes None values that cause OTLP encoding failures like:
+            - llm.input_messages.3.message.content: None
+            """
+
+            def __init__(self, wrapped_exporter: SpanExporter):
+                self._wrapped = wrapped_exporter
+
+            def _sanitize_value(self, value):
+                """Recursively sanitize a value, replacing None with empty string."""
+                if value is None:
+                    return ""  # Replace None with empty string
+                if isinstance(value, dict):
+                    return {k: self._sanitize_value(v) for k, v in value.items()}
+                if isinstance(value, (list, tuple)):
+                    return [self._sanitize_value(v) for v in value]
+                return value
+
+            def export(self, spans: tuple[ReadableSpan, ...]) -> SpanExportResult:
+                # Create sanitized copies of spans
+                sanitized_spans = []
+                for span in spans:
+                    if span.attributes:
+                        # Sanitize all attribute values - replace None with empty string
+                        sanitized_attrs = {}
+                        for k, v in span.attributes.items():
+                            sanitized_attrs[k] = self._sanitize_value(v)
+                        sanitized_spans.append(_SanitizedSpan(span, sanitized_attrs))
+                    else:
+                        sanitized_spans.append(span)
+
+                return self._wrapped.export(tuple(sanitized_spans))
+
+            def shutdown(self) -> None:
+                self._wrapped.shutdown()
+
+            def force_flush(self, timeout_millis: int = 30000) -> bool:
+                return self._wrapped.force_flush(timeout_millis)
+
+        class _SanitizedSpan(ReadableSpan):
+            """ReadableSpan wrapper with sanitized attributes."""
+
+            def __init__(self, original: ReadableSpan, sanitized_attributes: dict):
+                self._original = original
+                self._sanitized_attributes = sanitized_attributes
+
+            @property
+            def name(self): return self._original.name
+            @property
+            def context(self): return self._original.context
+            @property
+            def parent(self): return self._original.parent
+            @property
+            def resource(self): return self._original.resource
+            @property
+            def instrumentation_scope(self): return self._original.instrumentation_scope
+            @property
+            def status(self): return self._original.status
+            @property
+            def start_time(self): return self._original.start_time
+            @property
+            def end_time(self): return self._original.end_time
+            @property
+            def links(self): return self._original.links
+            @property
+            def events(self): return self._original.events
+            @property
+            def kind(self): return self._original.kind
+            @property
+            def attributes(self): return self._sanitized_attributes
+            @property
+            def dropped_attributes(self): return self._original.dropped_attributes
+            @property
+            def dropped_events(self): return self._original.dropped_events
+            @property
+            def dropped_links(self): return self._original.dropped_links
+
+            def get_span_context(self): return self._original.get_span_context()
+
         # Create resource with service metadata
         resource = Resource(
             attributes={
@@ -72,16 +155,20 @@ def setup_instrumentation() -> None:
 
         # Configure OTLP exporter based on protocol
         if settings.otel.protocol == "grpc":
-            exporter = GRPCExporter(
+            base_exporter = GRPCExporter(
                 endpoint=settings.otel.collector_endpoint,
                 timeout=settings.otel.export_timeout,
+                insecure=settings.otel.insecure,
             )
         else:  # http
-            exporter = HTTPExporter(
+            base_exporter = HTTPExporter(
                 endpoint=f"{settings.otel.collector_endpoint}/v1/traces",
                 timeout=settings.otel.export_timeout,
            )
 
+        # Wrap with sanitizing exporter to handle None values
+        exporter = SanitizingSpanExporter(base_exporter)
+
         # Add span processor
         tracer_provider.add_span_processor(BatchSpanProcessor(exporter))
 
@@ -95,6 +182,8 @@ def setup_instrumentation() -> None:
         # Add OpenInference span processor for Pydantic AI
         # This adds rich attributes (openinference.span.kind, input/output, etc.) to ALL traces
         # Phoenix receives these traces via the OTLP collector - no separate "Phoenix integration" needed
+        # Note: The OTEL exporter may log warnings about None values in tool call messages,
+        # but this is a known limitation in openinference-instrumentation-pydantic-ai
         try:
             from openinference.instrumentation.pydantic_ai import OpenInferenceSpanProcessor as PydanticAISpanProcessor
 
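The sanitization rule above is simple enough to show in isolation. A minimal standalone sketch (not part of the package) of what _sanitize_value does to a span's attribute dict before the OTLP encoder sees it:

def sanitize_value(value):
    """Replace None with "" recursively, mirroring _sanitize_value in the diff above."""
    if value is None:
        return ""
    if isinstance(value, dict):
        return {k: sanitize_value(v) for k, v in value.items()}
    if isinstance(value, (list, tuple)):
        return [sanitize_value(v) for v in value]
    return value

attrs = {
    "llm.input_messages.3.message.content": None,  # the failing case cited in the docstring
    "llm.input_messages.3.message.role": "tool",
}
assert sanitize_value(attrs) == {
    "llm.input_messages.3.message.content": "",
    "llm.input_messages.3.message.role": "tool",
}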
rem/agentic/providers/phoenix.py CHANGED
@@ -226,10 +226,15 @@ def create_phoenix_evaluator(
     # Create appropriate Phoenix LLM wrapper based on provider
     llm: OpenAIModel | AnthropicModel
     if provider.lower() == "anthropic":
-        # Anthropic models don't support top_p parameter
+        # Anthropic's newer Claude models (claude-sonnet-4, claude-opus-4, etc.)
+        # don't allow both temperature and top_p to be specified together.
+        # Phoenix's AnthropicModel defaults top_p=1, so we explicitly set it
+        # to None to prevent it from being sent in the API request.
+        # The invocation_parameters() method only includes params that are not None.
         llm = AnthropicModel(
             model=phoenix_model_name,
             temperature=0.0,
+            top_p=None,  # type: ignore[arg-type] - None prevents param from being sent
         )
     else:
         # Default to OpenAI for other providers (gpt-4, etc.)
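The comment block added above relies on the evaluator's parameters being filtered so that only non-None values are sent. A hypothetical sketch of that filtering pattern (illustration only, not Phoenix's actual invocation_parameters() implementation):

def invocation_parameters_like(temperature: float = 0.0, top_p: float | None = None) -> dict:
    """Drop None-valued params so top_p=None never reaches the Anthropic API."""
    params = {"temperature": temperature, "top_p": top_p}
    return {k: v for k, v in params.items() if v is not None}

assert invocation_parameters_like(temperature=0.0, top_p=None) == {"temperature": 0.0}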
@@ -249,13 +254,178 @@ def create_phoenix_evaluator(
 
     return evaluator_config
 
+
+def _evaluate_expression(expression: str, context: dict[str, Any]) -> Any:
+    """Safely evaluate a simple expression with context variables.
+
+    Supports: arithmetic, comparisons, boolean logic, len()
+    """
+    try:
+        allowed_names = {
+            "len": len,
+            "True": True,
+            "False": False,
+            "true": True,
+            "false": False,
+        }
+        allowed_names.update(context)
+        return eval(expression, {"__builtins__": {}}, allowed_names)
+    except Exception as e:
+        logger.warning(f"Expression evaluation failed: {expression} - {e}")
+        return 0.0
+
+
+def _calculate_derived_scores(
+    response_json: dict[str, Any],
+    derived_scores_config: dict[str, Any],
+) -> dict[str, Any]:
+    """Calculate derived scores from evaluator output using config formulas.
+
+    Supports:
+    - weighted_sum: Weighted average of fields
+    - conditional_weighted: Different formulas based on conditions
+    - boolean_logic: Boolean expression evaluation
+    """
+    for score_name, score_config in derived_scores_config.items():
+        score_type = score_config.get("type")
+
+        if score_type == "weighted_sum":
+            weights = score_config.get("weights", {})
+            total = 0.0
+            for field, weight in weights.items():
+                field_value = response_json.get(field, 0.0)
+                if isinstance(field_value, (int, float)):
+                    total += field_value * weight
+            response_json[score_name] = total
+
+        elif score_type == "conditional_weighted":
+            conditions = score_config.get("conditions", [])
+            formula_to_use = None
+            for cond_config in conditions:
+                condition = cond_config.get("condition")
+                if condition is None:
+                    formula_to_use = cond_config.get("formula")
+                    break
+                field = condition.get("field")
+                operator = condition.get("operator")
+                value = condition.get("value")
+                field_value = response_json.get(field, 0.0)
+                condition_met = False
+                if operator == ">=":
+                    condition_met = field_value >= value
+                elif operator == ">":
+                    condition_met = field_value > value
+                elif operator == "<=":
+                    condition_met = field_value <= value
+                elif operator == "<":
+                    condition_met = field_value < value
+                elif operator == "==":
+                    condition_met = field_value == value
+                elif operator == "!=":
+                    condition_met = field_value != value
+                if condition_met:
+                    formula_to_use = cond_config.get("formula")
+                    break
+            if formula_to_use and formula_to_use.get("type") == "weighted_sum":
+                weights = formula_to_use.get("weights", {})
+                total = 0.0
+                for field, weight in weights.items():
+                    field_value = response_json.get(field, 0.0)
+                    if isinstance(field_value, (int, float)):
+                        total += field_value * weight
+                response_json[score_name] = total
+
+        elif score_type == "boolean_logic":
+            expression = score_config.get("expression", "")
+            result = _evaluate_expression(expression, response_json)
+            response_json[score_name] = result
+
+    return response_json
+
+
+def _create_phoenix_evaluations(
+    response_json: dict[str, Any],
+    evaluations_config: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+    """Create Phoenix evaluation dicts from evaluator output using config.
+
+    Each evaluation becomes a column in Phoenix UI with name, label, score, explanation.
+    """
+    evaluations = []
+    for eval_config in evaluations_config:
+        eval_name = eval_config.get("name", "unnamed")
+        score_field = eval_config.get("score_field")
+        score_expression = eval_config.get("score_expression")
+        label_field = eval_config.get("label_field")
+        label_expression = eval_config.get("label_expression")
+        label_logic = eval_config.get("label_logic", [])
+        label_transform = eval_config.get("label_transform", {})
+        score_logic = eval_config.get("score_logic", {})
+        explanation_field = eval_config.get("explanation_field")
+
+        evaluation = {"name": eval_name}
+
+        # Get score
+        if score_expression:
+            evaluation["score"] = _evaluate_expression(score_expression, response_json)
+        elif score_field:
+            evaluation["score"] = response_json.get(score_field, 0.0)
+        elif score_logic and label_field:
+            label_value = response_json.get(label_field)
+            if isinstance(label_value, bool):
+                label_value = "true" if label_value else "false"
+            evaluation["score"] = score_logic.get(str(label_value), 0.0)
+        else:
+            evaluation["score"] = None
+
+        # Get label
+        if label_expression:
+            evaluation["label"] = str(_evaluate_expression(label_expression, response_json))
+        elif label_field:
+            label_value = response_json.get(label_field)
+            if isinstance(label_value, bool):
+                label_value = "true" if label_value else "false"
+            if label_transform:
+                evaluation["label"] = label_transform.get(str(label_value), str(label_value))
+            else:
+                evaluation["label"] = str(label_value)
+        elif label_logic and (score_field or score_expression):
+            score_value = evaluation.get("score", 0.0)
+            label = "unknown"
+            for logic in label_logic:
+                threshold = logic.get("threshold", 0.0)
+                operator = logic.get("operator", ">=")
+                if operator == ">=" and score_value >= threshold:
+                    label = logic.get("label", "unknown")
+                    break
+                elif operator == ">" and score_value > threshold:
+                    label = logic.get("label", "unknown")
+                    break
+            evaluation["label"] = label
+        else:
+            evaluation["label"] = None
+
+        # Get explanation
+        if explanation_field:
+            explanation_value = response_json.get(explanation_field, "")
+            if isinstance(explanation_value, list):
+                evaluation["explanation"] = ", ".join(str(x) for x in explanation_value) if explanation_value else "None"
+            else:
+                evaluation["explanation"] = str(explanation_value)
+        else:
+            evaluation["explanation"] = None
+
+        evaluations.append(evaluation)
+    return evaluations
+
+
 def create_evaluator_from_schema(
     evaluator_schema_path: str | Path | dict[str, Any],
     model_name: str | None = None,
 ) -> Callable[[Any], Any]:
     """Create an evaluator function from a schema file or dict.
 
-    The returned evaluator is a callable that Phoenix experiments can use.
+    Uses direct LLM call with JSON schema for structured output evaluation.
+    Supports phoenix_config for derived scores and evaluation column mappings.
 
     Args:
         evaluator_schema_path: Path to schema file, evaluator name, or schema dict
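To make the config shapes these new helpers read concrete, here is a hypothetical phoenix_config block (shown as a Python dict; the field values are invented for illustration) with one weighted_sum derived score and one evaluation column, followed by the arithmetic the weighted_sum formula produces:

phoenix_config = {
    "derived_scores": {
        "overall_score": {
            "type": "weighted_sum",
            "weights": {"correctness": 0.7, "completeness": 0.3},
        },
    },
    "evaluations": [
        {
            "name": "overall",
            "score_field": "overall_score",
            "label_logic": [
                {"operator": ">=", "threshold": 0.8, "label": "pass"},
                {"operator": ">=", "threshold": 0.0, "label": "fail"},
            ],
            "explanation_field": "evaluation_notes",
        },
    ],
}

# Given an LLM response of {"correctness": 1.0, "completeness": 0.5}, weighted_sum gives
# 1.0 * 0.7 + 0.5 * 0.3 = 0.85 for overall_score, and label_logic maps 0.85 >= 0.8 to "pass".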
@@ -269,19 +439,9 @@ def create_evaluator_from_schema(
         ImportError: If arize-phoenix not installed
 
     Example:
-        >>> # From evaluator name (searches in schemas/evaluators/)
         >>> evaluator = create_evaluator_from_schema("rem-lookup-correctness")
-        >>>
-        >>> # From schema dict
-        >>> schema = {"description": "...", "properties": {...}}
-        >>> evaluator = create_evaluator_from_schema(schema)
-        >>>
-        >>> # Use in experiment
-        >>> result = evaluator({
-        ...     "input": {"query": "LOOKUP person:sarah-chen"},
-        ...     "output": {"label": "sarah-chen", "type": "person", ...},
-        ...     "expected": {"label": "sarah-chen", "type": "person", ...}
-        ... })
+        >>> result = evaluator(input={...}, output={...}, expected={...})
+        >>> # Returns: list of {"name": "...", "score": 0.95, "label": "...", "explanation": "..."}
     """
     if not _check_phoenix_available():
         raise ImportError(
@@ -292,8 +452,6 @@
     # Load schema if path/name provided
     if isinstance(evaluator_schema_path, (str, Path)):
         schema_path = Path(evaluator_schema_path)
-
-        # If it's a file path, load directly
         if schema_path.exists():
             logger.debug(f"Loading evaluator schema from {schema_path}")
             if schema_path.suffix in [".yaml", ".yml"]:
@@ -303,126 +461,161 @@ def create_evaluator_from_schema(
                 with open(schema_path) as f:
                     schema = json.load(f)
         else:
-            # Treat as evaluator name, search in schemas/evaluators/
             schema = load_evaluator_schema(str(evaluator_schema_path))
     else:
-        # Already a dict
         schema = evaluator_schema_path
 
-    # Extract model from schema's provider_configs if not explicitly provided
-    if model_name is None:
-        json_schema_extra = schema.get("json_schema_extra", {})
-        provider_configs = json_schema_extra.get("provider_configs", [])
-        if provider_configs:
-            # Use first provider config
-            first_provider = provider_configs[0]
-            provider_name = first_provider.get("provider_name", "openai")
-            schema_model_name = first_provider.get("model_name", "gpt-4o-mini")
-            # Format as "provider:model" if not OpenAI (OpenAI is default)
-            if provider_name == "openai":
-                model_name = schema_model_name
-            else:
-                model_name = f"{provider_name}:{schema_model_name}"
-            logger.debug(f"Using model from schema provider_configs: {model_name}")
+    # Extract schema components
+    output_schema = schema.get("properties", {})
+
+    # Extract phoenix_config for derived scores and evaluations
+    phoenix_config = schema.get("phoenix_config", {})
+    derived_scores_config = phoenix_config.get("derived_scores", {})
+    evaluations_config = phoenix_config.get("evaluations", [])
 
-    # Create evaluator config
+    # Create evaluator config (LLM wrapper, prompt, etc.)
    evaluator_config = create_phoenix_evaluator(
         evaluator_schema=schema,
         model_name=model_name,
     )
 
-    # Import llm_classify for evaluation
-    from phoenix.evals import llm_classify
-    import pandas as pd
+    import re
 
-    # Wrap for Phoenix experiment compatibility
-    def evaluator_fn(example: dict[str, Any]) -> dict[str, Any]:
-        """Evaluate a single example using Phoenix llm_classify.
+    def evaluator_fn(input: dict[str, Any], output: dict[str, Any], expected: dict[str, Any]) -> list[dict[str, Any]]:
+        """Evaluate using Phoenix's named parameter binding with structured LLM output.
 
-        Args:
-            example: Dict with 'input', 'output', 'expected' keys
-                - input: Agent input dict (e.g., {"query": "LOOKUP person:sarah-chen"})
-                - output: Agent output dict (what the agent returned)
-                - expected: Expected output dict (ground truth from dataset)
+        Phoenix automatically binds these parameters:
+        - input: Dataset input dict
+        - output: Task's return value (agent output)
+        - expected: Expected output dict (reference/ground truth)
 
         Returns:
-            Evaluation result with score, label, explanation
+            List of Phoenix evaluation dicts with name, score, label, explanation
        """
-        input_preview = str(example.get('input', ''))[:100]
-        logger.debug(f"Evaluating example: {input_preview}...")
-
-        # Phoenix llm_classify() expects a flat dict with string values
-        # Build evaluation input by flattening nested dicts
-        eval_input = {}
-
-        # Extract and flatten input fields
-        input_data = example.get("input", {})
-        if isinstance(input_data, dict):
-            for key, value in input_data.items():
-                eval_input[f"input_{key}"] = str(value) if value is not None else ""
+        logger.debug("Evaluating with structured output pattern")
+
+        # Extract question from input
+        if isinstance(input, dict):
+            question = input.get("input", input.get("text", str(input)))
         else:
-            eval_input["input"] = str(input_data) if input_data is not None else ""
+            question = str(input)
 
-        # Extract and flatten agent output fields
-        output_data = example.get("output", {})
-        if isinstance(output_data, dict):
-            for key, value in output_data.items():
-                eval_input[f"output_{key}"] = str(value) if value is not None else ""
+        # Serialize agent output
+        if isinstance(output, dict):
+            output_str = json.dumps(output, indent=2)
         else:
-            eval_input["output"] = str(output_data) if output_data is not None else ""
+            output_str = str(output)
 
-        # Extract and flatten expected fields (reference/ground truth)
-        expected_data = example.get("expected", {})
-        if isinstance(expected_data, dict):
-            for key, value in expected_data.items():
-                eval_input[f"expected_{key}"] = str(value) if value is not None else ""
-        elif expected_data:
-            eval_input["expected"] = str(expected_data)
+        # Get reference from expected
+        if isinstance(expected, dict):
+            reference = expected.get("reference", expected.get("expected_output",
+                                     expected.get("ground_truth", str(expected))))
+        else:
+            reference = str(expected)
 
         try:
-            # Create single-row DataFrame for llm_classify
-            # Note: Phoenix's llm_classify requires pandas DataFrame (imported above)
-            df = pd.DataFrame([eval_input])
-
-            # Call Phoenix llm_classify
-            results_df = llm_classify(
-                dataframe=df,
-                model=evaluator_config["llm"],
-                template=evaluator_config["prompt_template"],
-                rails=["correct", "partial", "incorrect"],  # Common labels
-                provide_explanation=True,
+            # Build user message
+            user_message = f"""Question/Input: {question}
+
+Agent's Answer:
+{output_str}
+
+Expected Answer (Reference):
+{reference}
+
+Please evaluate the agent's answer according to the evaluation criteria."""
+
+            # Add JSON schema requirement to system prompt
+            system_prompt = evaluator_config["prompt_template"]
+            schema_instruction = f"\n\nYou MUST respond with valid JSON matching this schema:\n{json.dumps(output_schema, indent=2)}\n\nProvide ONLY the JSON response, no markdown code blocks or extra text."
+            system_with_schema = system_prompt + schema_instruction
+
+            # Phoenix LLM models expect a single prompt string
+            llm = evaluator_config["llm"]
+            full_prompt = f"{system_with_schema}\n\n{user_message}"
+            response_text = llm(full_prompt)
+
+            # Parse JSON response
+            try:
+                response_json = json.loads(response_text)
+            except json.JSONDecodeError:
+                json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', response_text, re.DOTALL)
+                if json_match:
+                    response_json = json.loads(json_match.group(1))
+                else:
+                    raise ValueError(f"Could not parse JSON from LLM response: {response_text[:200]}")
+
+            logger.debug(f"LLM response parsed: {list(response_json.keys())}")
+
+            # Calculate derived scores using config
+            if derived_scores_config:
+                logger.debug(f"Calculating {len(derived_scores_config)} derived scores")
+                response_json = _calculate_derived_scores(response_json, derived_scores_config)
+
+            # Create Phoenix evaluations using config
+            if evaluations_config:
+                logger.debug(f"Creating {len(evaluations_config)} Phoenix evaluations")
+                evaluations = _create_phoenix_evaluations(response_json, evaluations_config)
+            else:
+                # Fallback: create evaluations from all numeric/boolean fields
+                logger.warning("No evaluations_config - creating default evaluations from schema")
+                evaluations = []
+                for field_name, field_value in response_json.items():
+                    if isinstance(field_value, (int, float)):
+                        evaluations.append({
+                            "name": field_name,
+                            "score": float(field_value),
+                            "label": "good" if field_value >= 0.5 else "poor",
+                            "explanation": None
+                        })
+                    elif isinstance(field_value, bool):
+                        evaluations.append({
+                            "name": field_name,
+                            "score": 1.0 if field_value else 0.0,
+                            "label": "pass" if field_value else "fail",
+                            "explanation": None
+                        })
+
+            # Always add overall if not present
+            if not any(e["name"] == "overall" for e in evaluations):
+                overall_score = response_json.get("overall_score", 0.0)
+                overall_pass = response_json.get("pass", False)
+                evaluations.append({
+                    "name": "overall",
+                    "score": overall_score if isinstance(overall_score, (int, float)) else 0.0,
+                    "label": "pass" if overall_pass else "fail",
+                    "explanation": response_json.get("evaluation_notes", None)
+                })
+
+            logger.debug(f"Created {len(evaluations)} evaluations")
+
+            # Phoenix run_experiment expects a single EvaluationResult, not a list.
+            # Return the overall score with detailed evaluations in metadata.
+            from phoenix.experiments.evaluators.base import EvaluationResult
+
+            overall_eval = next(
+                (e for e in evaluations if e["name"] == "overall"),
+                {"score": 0.0, "label": "unknown", "explanation": None}
             )
 
-            # Extract result (results_df is pandas DataFrame from Phoenix)
-            if not results_df.empty:
-                row = results_df.iloc[0]
-                label = row.get("label", "error")
-                explanation = row.get("explanation", "")
-
-                # Map labels to scores
-                score_map = {"correct": 1.0, "partial": 0.5, "incorrect": 0.0}
-                score = score_map.get(label, 0.0)
-
-                return {
-                    "label": label,
-                    "score": score,
-                    "explanation": explanation or "",
-                }
-            else:
-                logger.warning("llm_classify returned empty DataFrame")
-                return {
-                    "label": "error",
-                    "score": 0.0,
-                    "explanation": "Evaluator returned empty result",
+            return EvaluationResult(
+                score=overall_eval.get("score"),
+                label=overall_eval.get("label"),
+                explanation=overall_eval.get("explanation"),
+                metadata={
+                    "evaluations": evaluations,
+                    "raw_response": response_json,
                 }
+            )
 
         except Exception as e:
             logger.error(f"Evaluator error: {e}")
-            return {
-                "label": "error",
-                "score": 0.0,
-                "explanation": f"Evaluator failed: {str(e)}",
-            }
+            from phoenix.experiments.evaluators.base import EvaluationResult
+            return EvaluationResult(
+                score=0.0,
+                label="error",
+                explanation=f"Evaluator failed: {str(e)}",
+            )
 
     return evaluator_fn
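
The fenced-JSON fallback used inside evaluator_fn can be exercised on its own. A standalone sketch (the sample response string is invented for illustration) of the json.loads / regex recovery path:

import json
import re

response_text = '```json\n{"correctness": 1.0, "pass": true}\n```'
try:
    parsed = json.loads(response_text)
except json.JSONDecodeError:
    # Same pattern as the diff: pull the first {...} out of a fenced block
    match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', response_text, re.DOTALL)
    parsed = json.loads(match.group(1)) if match else None

assert parsed == {"correctness": 1.0, "pass": True}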