remdb 0.3.14__py3-none-any.whl → 0.3.133__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rem/agentic/README.md +76 -0
- rem/agentic/__init__.py +15 -0
- rem/agentic/agents/__init__.py +16 -2
- rem/agentic/agents/sse_simulator.py +502 -0
- rem/agentic/context.py +51 -27
- rem/agentic/llm_provider_models.py +301 -0
- rem/agentic/mcp/tool_wrapper.py +112 -17
- rem/agentic/otel/setup.py +93 -4
- rem/agentic/providers/phoenix.py +302 -109
- rem/agentic/providers/pydantic_ai.py +215 -26
- rem/agentic/schema.py +361 -21
- rem/agentic/tools/rem_tools.py +3 -3
- rem/api/README.md +215 -1
- rem/api/deps.py +255 -0
- rem/api/main.py +132 -40
- rem/api/mcp_router/resources.py +1 -1
- rem/api/mcp_router/server.py +26 -5
- rem/api/mcp_router/tools.py +465 -7
- rem/api/routers/admin.py +494 -0
- rem/api/routers/auth.py +70 -0
- rem/api/routers/chat/completions.py +402 -20
- rem/api/routers/chat/models.py +88 -10
- rem/api/routers/chat/otel_utils.py +33 -0
- rem/api/routers/chat/sse_events.py +542 -0
- rem/api/routers/chat/streaming.py +642 -45
- rem/api/routers/dev.py +81 -0
- rem/api/routers/feedback.py +268 -0
- rem/api/routers/messages.py +473 -0
- rem/api/routers/models.py +78 -0
- rem/api/routers/query.py +360 -0
- rem/api/routers/shared_sessions.py +406 -0
- rem/auth/middleware.py +126 -27
- rem/cli/commands/README.md +237 -64
- rem/cli/commands/cluster.py +1808 -0
- rem/cli/commands/configure.py +1 -3
- rem/cli/commands/db.py +386 -143
- rem/cli/commands/experiments.py +418 -27
- rem/cli/commands/process.py +14 -8
- rem/cli/commands/schema.py +97 -50
- rem/cli/main.py +27 -6
- rem/config.py +10 -3
- rem/models/core/core_model.py +7 -1
- rem/models/core/experiment.py +54 -0
- rem/models/core/rem_query.py +5 -2
- rem/models/entities/__init__.py +21 -0
- rem/models/entities/domain_resource.py +38 -0
- rem/models/entities/feedback.py +123 -0
- rem/models/entities/message.py +30 -1
- rem/models/entities/session.py +83 -0
- rem/models/entities/shared_session.py +180 -0
- rem/registry.py +10 -4
- rem/schemas/agents/rem.yaml +7 -3
- rem/services/content/service.py +92 -20
- rem/services/embeddings/api.py +4 -4
- rem/services/embeddings/worker.py +16 -16
- rem/services/phoenix/client.py +154 -14
- rem/services/postgres/README.md +159 -15
- rem/services/postgres/__init__.py +2 -1
- rem/services/postgres/diff_service.py +531 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +427 -129
- rem/services/postgres/repository.py +132 -0
- rem/services/postgres/schema_generator.py +205 -4
- rem/services/postgres/service.py +6 -6
- rem/services/rem/parser.py +44 -9
- rem/services/rem/service.py +36 -2
- rem/services/session/compression.py +24 -1
- rem/services/session/reload.py +1 -1
- rem/settings.py +324 -23
- rem/sql/background_indexes.sql +21 -16
- rem/sql/migrations/001_install.sql +387 -54
- rem/sql/migrations/002_install_models.sql +2320 -393
- rem/sql/migrations/003_optional_extensions.sql +326 -0
- rem/sql/migrations/004_cache_system.sql +548 -0
- rem/utils/__init__.py +18 -0
- rem/utils/date_utils.py +2 -2
- rem/utils/model_helpers.py +156 -1
- rem/utils/schema_loader.py +220 -22
- rem/utils/sql_paths.py +146 -0
- rem/utils/sql_types.py +3 -1
- rem/workers/__init__.py +3 -1
- rem/workers/db_listener.py +579 -0
- rem/workers/unlogged_maintainer.py +463 -0
- {remdb-0.3.14.dist-info → remdb-0.3.133.dist-info}/METADATA +335 -226
- {remdb-0.3.14.dist-info → remdb-0.3.133.dist-info}/RECORD +86 -66
- {remdb-0.3.14.dist-info → remdb-0.3.133.dist-info}/WHEEL +1 -1
- rem/sql/002_install_models.sql +0 -1068
- rem/sql/install_models.sql +0 -1051
- rem/sql/migrations/003_seed_default_user.sql +0 -48
- {remdb-0.3.14.dist-info → remdb-0.3.133.dist-info}/entry_points.txt +0 -0
rem/agentic/otel/setup.py
CHANGED
@@ -14,6 +14,7 @@ from loguru import logger
 
 from ...settings import settings
 
+
 # Global flag to track if instrumentation is initialized
 _instrumentation_initialized = False
 
@@ -52,12 +53,94 @@ def setup_instrumentation() -> None:
 
     try:
         from opentelemetry import trace
-        from opentelemetry.sdk.trace import TracerProvider
-        from opentelemetry.sdk.trace.export import BatchSpanProcessor
+        from opentelemetry.sdk.trace import TracerProvider, ReadableSpan
+        from opentelemetry.sdk.trace.export import BatchSpanProcessor, SpanExporter, SpanExportResult
         from opentelemetry.sdk.resources import Resource, SERVICE_NAME, DEPLOYMENT_ENVIRONMENT
         from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter as HTTPExporter
         from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter as GRPCExporter
 
+        class SanitizingSpanExporter(SpanExporter):
+            """
+            Wrapper exporter that sanitizes span attributes before export.
+
+            Removes None values that cause OTLP encoding failures like:
+            - llm.input_messages.3.message.content: None
+            """
+
+            def __init__(self, wrapped_exporter: SpanExporter):
+                self._wrapped = wrapped_exporter
+
+            def _sanitize_value(self, value):
+                """Recursively sanitize a value, replacing None with empty string."""
+                if value is None:
+                    return ""  # Replace None with empty string
+                if isinstance(value, dict):
+                    return {k: self._sanitize_value(v) for k, v in value.items()}
+                if isinstance(value, (list, tuple)):
+                    return [self._sanitize_value(v) for v in value]
+                return value
+
+            def export(self, spans: tuple[ReadableSpan, ...]) -> SpanExportResult:
+                # Create sanitized copies of spans
+                sanitized_spans = []
+                for span in spans:
+                    if span.attributes:
+                        # Sanitize all attribute values - replace None with empty string
+                        sanitized_attrs = {}
+                        for k, v in span.attributes.items():
+                            sanitized_attrs[k] = self._sanitize_value(v)
+                        sanitized_spans.append(_SanitizedSpan(span, sanitized_attrs))
+                    else:
+                        sanitized_spans.append(span)
+
+                return self._wrapped.export(tuple(sanitized_spans))
+
+            def shutdown(self) -> None:
+                self._wrapped.shutdown()
+
+            def force_flush(self, timeout_millis: int = 30000) -> bool:
+                return self._wrapped.force_flush(timeout_millis)
+
+        class _SanitizedSpan(ReadableSpan):
+            """ReadableSpan wrapper with sanitized attributes."""
+
+            def __init__(self, original: ReadableSpan, sanitized_attributes: dict):
+                self._original = original
+                self._sanitized_attributes = sanitized_attributes
+
+            @property
+            def name(self): return self._original.name
+            @property
+            def context(self): return self._original.context
+            @property
+            def parent(self): return self._original.parent
+            @property
+            def resource(self): return self._original.resource
+            @property
+            def instrumentation_scope(self): return self._original.instrumentation_scope
+            @property
+            def status(self): return self._original.status
+            @property
+            def start_time(self): return self._original.start_time
+            @property
+            def end_time(self): return self._original.end_time
+            @property
+            def links(self): return self._original.links
+            @property
+            def events(self): return self._original.events
+            @property
+            def kind(self): return self._original.kind
+            @property
+            def attributes(self): return self._sanitized_attributes
+            @property
+            def dropped_attributes(self): return self._original.dropped_attributes
+            @property
+            def dropped_events(self): return self._original.dropped_events
+            @property
+            def dropped_links(self): return self._original.dropped_links
+
+            def get_span_context(self): return self._original.get_span_context()
+
         # Create resource with service metadata
         resource = Resource(
             attributes={
@@ -72,16 +155,20 @@ def setup_instrumentation() -> None:
 
         # Configure OTLP exporter based on protocol
         if settings.otel.protocol == "grpc":
-            exporter = GRPCExporter(
+            base_exporter = GRPCExporter(
                 endpoint=settings.otel.collector_endpoint,
                 timeout=settings.otel.export_timeout,
+                insecure=settings.otel.insecure,
             )
         else:  # http
-            exporter = HTTPExporter(
+            base_exporter = HTTPExporter(
                 endpoint=f"{settings.otel.collector_endpoint}/v1/traces",
                 timeout=settings.otel.export_timeout,
             )
 
+        # Wrap with sanitizing exporter to handle None values
+        exporter = SanitizingSpanExporter(base_exporter)
+
         # Add span processor
         tracer_provider.add_span_processor(BatchSpanProcessor(exporter))
 
@@ -95,6 +182,8 @@ def setup_instrumentation() -> None:
         # Add OpenInference span processor for Pydantic AI
         # This adds rich attributes (openinference.span.kind, input/output, etc.) to ALL traces
         # Phoenix receives these traces via the OTLP collector - no separate "Phoenix integration" needed
+        # Note: The OTEL exporter may log warnings about None values in tool call messages,
+        # but this is a known limitation in openinference-instrumentation-pydantic-ai
         try:
             from openinference.instrumentation.pydantic_ai import OpenInferenceSpanProcessor as PydanticAISpanProcessor
 
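
The sanitizer's core rule is simple: `None` becomes `""`, and dicts, lists, and tuples are walked recursively. The snippet below is an illustrative, standalone restatement of that rule (the exporter plumbing from the diff is omitted), showing a concrete before/after for the failure mode named in the class docstring.

```python
# Standalone sketch of the sanitization rule used by SanitizingSpanExporter above.
# None -> "", containers are walked recursively, everything else passes through.
def sanitize_value(value):
    if value is None:
        return ""
    if isinstance(value, dict):
        return {k: sanitize_value(v) for k, v in value.items()}
    if isinstance(value, (list, tuple)):
        return [sanitize_value(v) for v in value]
    return value


# The attribute key mirrors the OTLP encoding failure cited in the docstring,
# where instrumentation can attach a None message content.
attributes = {
    "llm.input_messages.3.message.content": None,
    "llm.input_messages.3.message.role": "tool",
}
print({k: sanitize_value(v) for k, v in attributes.items()})
# -> {'llm.input_messages.3.message.content': '', 'llm.input_messages.3.message.role': 'tool'}
```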
rem/agentic/providers/phoenix.py
CHANGED
@@ -226,10 +226,15 @@ def create_phoenix_evaluator(
     # Create appropriate Phoenix LLM wrapper based on provider
     llm: OpenAIModel | AnthropicModel
     if provider.lower() == "anthropic":
-        # Anthropic models
+        # Anthropic's newer Claude models (claude-sonnet-4, claude-opus-4, etc.)
+        # don't allow both temperature and top_p to be specified together.
+        # Phoenix's AnthropicModel defaults top_p=1, so we explicitly set it
+        # to None to prevent it from being sent in the API request.
+        # The invocation_parameters() method only includes params that are not None.
         llm = AnthropicModel(
             model=phoenix_model_name,
             temperature=0.0,
+            top_p=None,  # type: ignore[arg-type] - None prevents param from being sent
         )
     else:
         # Default to OpenAI for other providers (gpt-4, etc.)
@@ -249,13 +254,178 @@ def create_phoenix_evaluator(
     return evaluator_config
 
 
+def _evaluate_expression(expression: str, context: dict[str, Any]) -> Any:
+    """Safely evaluate a simple expression with context variables.
+
+    Supports: arithmetic, comparisons, boolean logic, len()
+    """
+    try:
+        allowed_names = {
+            "len": len,
+            "True": True,
+            "False": False,
+            "true": True,
+            "false": False,
+        }
+        allowed_names.update(context)
+        return eval(expression, {"__builtins__": {}}, allowed_names)
+    except Exception as e:
+        logger.warning(f"Expression evaluation failed: {expression} - {e}")
+        return 0.0
+
+
+def _calculate_derived_scores(
+    response_json: dict[str, Any],
+    derived_scores_config: dict[str, Any],
+) -> dict[str, Any]:
+    """Calculate derived scores from evaluator output using config formulas.
+
+    Supports:
+    - weighted_sum: Weighted average of fields
+    - conditional_weighted: Different formulas based on conditions
+    - boolean_logic: Boolean expression evaluation
+    """
+    for score_name, score_config in derived_scores_config.items():
+        score_type = score_config.get("type")
+
+        if score_type == "weighted_sum":
+            weights = score_config.get("weights", {})
+            total = 0.0
+            for field, weight in weights.items():
+                field_value = response_json.get(field, 0.0)
+                if isinstance(field_value, (int, float)):
+                    total += field_value * weight
+            response_json[score_name] = total
+
+        elif score_type == "conditional_weighted":
+            conditions = score_config.get("conditions", [])
+            formula_to_use = None
+            for cond_config in conditions:
+                condition = cond_config.get("condition")
+                if condition is None:
+                    formula_to_use = cond_config.get("formula")
+                    break
+                field = condition.get("field")
+                operator = condition.get("operator")
+                value = condition.get("value")
+                field_value = response_json.get(field, 0.0)
+                condition_met = False
+                if operator == ">=":
+                    condition_met = field_value >= value
+                elif operator == ">":
+                    condition_met = field_value > value
+                elif operator == "<=":
+                    condition_met = field_value <= value
+                elif operator == "<":
+                    condition_met = field_value < value
+                elif operator == "==":
+                    condition_met = field_value == value
+                elif operator == "!=":
+                    condition_met = field_value != value
+                if condition_met:
+                    formula_to_use = cond_config.get("formula")
+                    break
+            if formula_to_use and formula_to_use.get("type") == "weighted_sum":
+                weights = formula_to_use.get("weights", {})
+                total = 0.0
+                for field, weight in weights.items():
+                    field_value = response_json.get(field, 0.0)
+                    if isinstance(field_value, (int, float)):
+                        total += field_value * weight
+                response_json[score_name] = total
+
+        elif score_type == "boolean_logic":
+            expression = score_config.get("expression", "")
+            result = _evaluate_expression(expression, response_json)
+            response_json[score_name] = result
+
+    return response_json
+
+
+def _create_phoenix_evaluations(
+    response_json: dict[str, Any],
+    evaluations_config: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+    """Create Phoenix evaluation dicts from evaluator output using config.
+
+    Each evaluation becomes a column in Phoenix UI with name, label, score, explanation.
+    """
+    evaluations = []
+    for eval_config in evaluations_config:
+        eval_name = eval_config.get("name", "unnamed")
+        score_field = eval_config.get("score_field")
+        score_expression = eval_config.get("score_expression")
+        label_field = eval_config.get("label_field")
+        label_expression = eval_config.get("label_expression")
+        label_logic = eval_config.get("label_logic", [])
+        label_transform = eval_config.get("label_transform", {})
+        score_logic = eval_config.get("score_logic", {})
+        explanation_field = eval_config.get("explanation_field")
+
+        evaluation = {"name": eval_name}
+
+        # Get score
+        if score_expression:
+            evaluation["score"] = _evaluate_expression(score_expression, response_json)
+        elif score_field:
+            evaluation["score"] = response_json.get(score_field, 0.0)
+        elif score_logic and label_field:
+            label_value = response_json.get(label_field)
+            if isinstance(label_value, bool):
+                label_value = "true" if label_value else "false"
+            evaluation["score"] = score_logic.get(str(label_value), 0.0)
+        else:
+            evaluation["score"] = None
+
+        # Get label
+        if label_expression:
+            evaluation["label"] = str(_evaluate_expression(label_expression, response_json))
+        elif label_field:
+            label_value = response_json.get(label_field)
+            if isinstance(label_value, bool):
+                label_value = "true" if label_value else "false"
+            if label_transform:
+                evaluation["label"] = label_transform.get(str(label_value), str(label_value))
+            else:
+                evaluation["label"] = str(label_value)
+        elif label_logic and (score_field or score_expression):
+            score_value = evaluation.get("score", 0.0)
+            label = "unknown"
+            for logic in label_logic:
+                threshold = logic.get("threshold", 0.0)
+                operator = logic.get("operator", ">=")
+                if operator == ">=" and score_value >= threshold:
+                    label = logic.get("label", "unknown")
+                    break
+                elif operator == ">" and score_value > threshold:
+                    label = logic.get("label", "unknown")
+                    break
            evaluation["label"] = label
+        else:
+            evaluation["label"] = None
+
+        # Get explanation
+        if explanation_field:
+            explanation_value = response_json.get(explanation_field, "")
+            if isinstance(explanation_value, list):
+                evaluation["explanation"] = ", ".join(str(x) for x in explanation_value) if explanation_value else "None"
+            else:
+                evaluation["explanation"] = str(explanation_value)
+        else:
+            evaluation["explanation"] = None
+
+        evaluations.append(evaluation)
+    return evaluations
+
+
 def create_evaluator_from_schema(
     evaluator_schema_path: str | Path | dict[str, Any],
     model_name: str | None = None,
 ) -> Callable[[Any], Any]:
     """Create an evaluator function from a schema file or dict.
 
-
+    Uses direct LLM call with JSON schema for structured output evaluation.
+    Supports phoenix_config for derived scores and evaluation column mappings.
 
     Args:
         evaluator_schema_path: Path to schema file, evaluator name, or schema dict
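
As a worked illustration of the configuration shape these new helpers consume, the sketch below builds a tiny `derived_scores` / `evaluations` config by hand and runs it through the two functions added in the hunk above. The metric names (`faithfulness`, `completeness`) and thresholds are invented for this example; the keys (`type`, `weights`, `score_field`, `label_logic`, `explanation_field`) and the import path follow the code and module path shown in this diff.

```python
# Illustrative data only; the helpers are module-level functions added above.
from rem.agentic.providers.phoenix import (
    _calculate_derived_scores,
    _create_phoenix_evaluations,
)

# Hypothetical evaluator output (field names invented for this sketch).
response_json = {"faithfulness": 0.9, "completeness": 0.7, "notes": ["minor omission"]}

derived_scores_config = {
    "overall_score": {
        "type": "weighted_sum",
        "weights": {"faithfulness": 0.6, "completeness": 0.4},
    }
}
evaluations_config = [
    {
        "name": "overall",
        "score_field": "overall_score",
        "label_logic": [
            {"operator": ">=", "threshold": 0.8, "label": "pass"},
            {"operator": ">=", "threshold": 0.0, "label": "fail"},
        ],
        "explanation_field": "notes",
    }
]

response_json = _calculate_derived_scores(response_json, derived_scores_config)
# overall_score = 0.6 * 0.9 + 0.4 * 0.7, approximately 0.82

evaluations = _create_phoenix_evaluations(response_json, evaluations_config)
# [{"name": "overall", "score": ~0.82, "label": "pass", "explanation": "minor omission"}]
```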
@@ -269,19 +439,9 @@ def create_evaluator_from_schema(
         ImportError: If arize-phoenix not installed
 
     Example:
-        >>> # From evaluator name (searches in schemas/evaluators/)
         >>> evaluator = create_evaluator_from_schema("rem-lookup-correctness")
-        >>>
-        >>> #
-        >>> schema = {"description": "...", "properties": {...}}
-        >>> evaluator = create_evaluator_from_schema(schema)
-        >>>
-        >>> # Use in experiment
-        >>> result = evaluator({
-        ... "input": {"query": "LOOKUP person:sarah-chen"},
-        ... "output": {"label": "sarah-chen", "type": "person", ...},
-        ... "expected": {"label": "sarah-chen", "type": "person", ...}
-        ... })
+        >>> result = evaluator(input={...}, output={...}, expected={...})
+        >>> # Returns: list of {"name": "...", "score": 0.95, "label": "...", "explanation": "..."}
     """
     if not _check_phoenix_available():
         raise ImportError(
@@ -292,8 +452,6 @@ def create_evaluator_from_schema(
     # Load schema if path/name provided
     if isinstance(evaluator_schema_path, (str, Path)):
         schema_path = Path(evaluator_schema_path)
-
-        # If it's a file path, load directly
         if schema_path.exists():
             logger.debug(f"Loading evaluator schema from {schema_path}")
             if schema_path.suffix in [".yaml", ".yml"]:
@@ -303,126 +461,161 @@ def create_evaluator_from_schema(
                 with open(schema_path) as f:
                     schema = json.load(f)
         else:
-            # Treat as evaluator name, search in schemas/evaluators/
             schema = load_evaluator_schema(str(evaluator_schema_path))
     else:
-        # Already a dict
         schema = evaluator_schema_path
 
-    # Extract
-
-
-
-
-
-
-        provider_name = first_provider.get("provider_name", "openai")
-        schema_model_name = first_provider.get("model_name", "gpt-4o-mini")
-        # Format as "provider:model" if not OpenAI (OpenAI is default)
-        if provider_name == "openai":
-            model_name = schema_model_name
-        else:
-            model_name = f"{provider_name}:{schema_model_name}"
-        logger.debug(f"Using model from schema provider_configs: {model_name}")
+    # Extract schema components
+    output_schema = schema.get("properties", {})
+
+    # Extract phoenix_config for derived scores and evaluations
+    phoenix_config = schema.get("phoenix_config", {})
+    derived_scores_config = phoenix_config.get("derived_scores", {})
+    evaluations_config = phoenix_config.get("evaluations", [])
 
-    # Create evaluator config
+    # Create evaluator config (LLM wrapper, prompt, etc.)
    evaluator_config = create_phoenix_evaluator(
         evaluator_schema=schema,
         model_name=model_name,
     )
 
-
-    from phoenix.evals import llm_classify
-    import pandas as pd
+    import re
 
-
-
-        """Evaluate a single example using Phoenix llm_classify.
+    def evaluator_fn(input: dict[str, Any], output: dict[str, Any], expected: dict[str, Any]) -> list[dict[str, Any]]:
+        """Evaluate using Phoenix's named parameter binding with structured LLM output.
 
-
-
-
-
-        - expected: Expected output dict (ground truth from dataset)
+        Phoenix automatically binds these parameters:
+        - input: Dataset input dict
+        - output: Task's return value (agent output)
+        - expected: Expected output dict (reference/ground truth)
 
         Returns:
-
+            List of Phoenix evaluation dicts with name, score, label, explanation
         """
-
-
-
-
-
-        eval_input = {}
-
-        # Extract and flatten input fields
-        input_data = example.get("input", {})
-        if isinstance(input_data, dict):
-            for key, value in input_data.items():
-                eval_input[f"input_{key}"] = str(value) if value is not None else ""
+        logger.debug("Evaluating with structured output pattern")
+
+        # Extract question from input
+        if isinstance(input, dict):
+            question = input.get("input", input.get("text", str(input)))
         else:
-
+            question = str(input)
 
-        #
-
-
-            for key, value in output_data.items():
-                eval_input[f"output_{key}"] = str(value) if value is not None else ""
+        # Serialize agent output
+        if isinstance(output, dict):
+            output_str = json.dumps(output, indent=2)
         else:
-
+            output_str = str(output)
 
-        #
-
-
-
-
-
-        eval_input["expected"] = str(expected_data)
+        # Get reference from expected
+        if isinstance(expected, dict):
+            reference = expected.get("reference", expected.get("expected_output",
+                                     expected.get("ground_truth", str(expected))))
+        else:
+            reference = str(expected)
 
         try:
-            #
-
-
-
-
-
-
-
-
-
-
+            # Build user message
+            user_message = f"""Question/Input: {question}
+
+Agent's Answer:
+{output_str}
+
+Expected Answer (Reference):
+{reference}
+
+Please evaluate the agent's answer according to the evaluation criteria."""
+
+            # Add JSON schema requirement to system prompt
+            system_prompt = evaluator_config["prompt_template"]
+            schema_instruction = f"\n\nYou MUST respond with valid JSON matching this schema:\n{json.dumps(output_schema, indent=2)}\n\nProvide ONLY the JSON response, no markdown code blocks or extra text."
+            system_with_schema = system_prompt + schema_instruction
+
+            # Phoenix LLM models expect a single prompt string
+            llm = evaluator_config["llm"]
+            full_prompt = f"{system_with_schema}\n\n{user_message}"
+            response_text = llm(full_prompt)
+
+            # Parse JSON response
+            try:
+                response_json = json.loads(response_text)
+            except json.JSONDecodeError:
+                json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', response_text, re.DOTALL)
+                if json_match:
+                    response_json = json.loads(json_match.group(1))
+                else:
+                    raise ValueError(f"Could not parse JSON from LLM response: {response_text[:200]}")
+
+            logger.debug(f"LLM response parsed: {list(response_json.keys())}")
+
+            # Calculate derived scores using config
+            if derived_scores_config:
+                logger.debug(f"Calculating {len(derived_scores_config)} derived scores")
+                response_json = _calculate_derived_scores(response_json, derived_scores_config)
+
+            # Create Phoenix evaluations using config
+            if evaluations_config:
+                logger.debug(f"Creating {len(evaluations_config)} Phoenix evaluations")
+                evaluations = _create_phoenix_evaluations(response_json, evaluations_config)
+            else:
+                # Fallback: create evaluations from all numeric/boolean fields
+                logger.warning("No evaluations_config - creating default evaluations from schema")
+                evaluations = []
+                for field_name, field_value in response_json.items():
+                    if isinstance(field_value, (int, float)):
+                        evaluations.append({
+                            "name": field_name,
+                            "score": float(field_value),
+                            "label": "good" if field_value >= 0.5 else "poor",
+                            "explanation": None
+                        })
+                    elif isinstance(field_value, bool):
+                        evaluations.append({
+                            "name": field_name,
+                            "score": 1.0 if field_value else 0.0,
+                            "label": "pass" if field_value else "fail",
+                            "explanation": None
+                        })
+
+                # Always add overall if not present
+                if not any(e["name"] == "overall" for e in evaluations):
+                    overall_score = response_json.get("overall_score", 0.0)
+                    overall_pass = response_json.get("pass", False)
+                    evaluations.append({
+                        "name": "overall",
+                        "score": overall_score if isinstance(overall_score, (int, float)) else 0.0,
+                        "label": "pass" if overall_pass else "fail",
+                        "explanation": response_json.get("evaluation_notes", None)
+                    })
+
+            logger.debug(f"Created {len(evaluations)} evaluations")
+
+            # Phoenix run_experiment expects a single EvaluationResult, not a list.
+            # Return the overall score with detailed evaluations in metadata.
+            from phoenix.experiments.evaluators.base import EvaluationResult
+
+            overall_eval = next(
+                (e for e in evaluations if e["name"] == "overall"),
+                {"score": 0.0, "label": "unknown", "explanation": None}
             )
 
-
-
-
-
-
-
-
-            score_map = {"correct": 1.0, "partial": 0.5, "incorrect": 0.0}
-            score = score_map.get(label, 0.0)
-
-            return {
-                "label": label,
-                "score": score,
-                "explanation": explanation or "",
-            }
-        else:
-            logger.warning("llm_classify returned empty DataFrame")
-            return {
-                "label": "error",
-                "score": 0.0,
-                "explanation": "Evaluator returned empty result",
+            return EvaluationResult(
+                score=overall_eval.get("score"),
+                label=overall_eval.get("label"),
+                explanation=overall_eval.get("explanation"),
+                metadata={
+                    "evaluations": evaluations,
+                    "raw_response": response_json,
                 }
+            )
 
         except Exception as e:
             logger.error(f"Evaluator error: {e}")
-
-
-
-            "
-
+            from phoenix.experiments.evaluators.base import EvaluationResult
+            return EvaluationResult(
+                score=0.0,
+                label="error",
+                explanation=f"Evaluator failed: {str(e)}",
+            )
 
     return evaluator_fn
 
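
End to end, the rewritten factory returns a callable that Phoenix can bind by parameter name (`input`, `output`, `expected`) and that yields a single `EvaluationResult` with per-criterion details in its metadata. The sketch below is a hypothetical invocation: the payload dicts are invented for illustration, while the evaluator name, keyword names, and result fields come from the diff above. It requires a configured LLM provider, since the evaluator calls the model under the hood.

```python
# Hypothetical usage sketch; payload values are illustrative only.
from rem.agentic.providers.phoenix import create_evaluator_from_schema

# "rem-lookup-correctness" is the evaluator name used in the docstring example above.
evaluator = create_evaluator_from_schema("rem-lookup-correctness")

result = evaluator(
    input={"input": "LOOKUP person:sarah-chen"},        # read via input.get("input", ...)
    output={"label": "sarah-chen", "type": "person"},   # serialized with json.dumps(...)
    expected={"reference": {"label": "sarah-chen", "type": "person"}},  # read via expected.get("reference", ...)
)

# A single Phoenix EvaluationResult comes back; per-criterion rows are preserved
# under metadata["evaluations"], as constructed in the code above.
print(result.score, result.label, result.explanation)
print(result.metadata["evaluations"])
```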