spanforge 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spanforge/__init__.py +695 -0
- spanforge/_batch_exporter.py +322 -0
- spanforge/_cli.py +3081 -0
- spanforge/_hooks.py +340 -0
- spanforge/_server.py +953 -0
- spanforge/_span.py +1015 -0
- spanforge/_store.py +287 -0
- spanforge/_stream.py +654 -0
- spanforge/_trace.py +334 -0
- spanforge/_tracer.py +253 -0
- spanforge/actor.py +141 -0
- spanforge/alerts.py +464 -0
- spanforge/auto.py +181 -0
- spanforge/baseline.py +336 -0
- spanforge/config.py +460 -0
- spanforge/consent.py +227 -0
- spanforge/consumer.py +379 -0
- spanforge/core/__init__.py +5 -0
- spanforge/core/compliance_mapping.py +1060 -0
- spanforge/cost.py +597 -0
- spanforge/debug.py +514 -0
- spanforge/drift.py +488 -0
- spanforge/egress.py +63 -0
- spanforge/eval.py +575 -0
- spanforge/event.py +1052 -0
- spanforge/exceptions.py +246 -0
- spanforge/explain.py +181 -0
- spanforge/export/__init__.py +50 -0
- spanforge/export/append_only.py +342 -0
- spanforge/export/cloud.py +349 -0
- spanforge/export/datadog.py +495 -0
- spanforge/export/grafana.py +331 -0
- spanforge/export/jsonl.py +198 -0
- spanforge/export/otel_bridge.py +291 -0
- spanforge/export/otlp.py +817 -0
- spanforge/export/otlp_bridge.py +231 -0
- spanforge/export/redis_backend.py +282 -0
- spanforge/export/webhook.py +302 -0
- spanforge/exporters/__init__.py +29 -0
- spanforge/exporters/console.py +271 -0
- spanforge/exporters/jsonl.py +144 -0
- spanforge/hitl.py +297 -0
- spanforge/inspect.py +429 -0
- spanforge/integrations/__init__.py +39 -0
- spanforge/integrations/_pricing.py +277 -0
- spanforge/integrations/anthropic.py +388 -0
- spanforge/integrations/bedrock.py +306 -0
- spanforge/integrations/crewai.py +251 -0
- spanforge/integrations/gemini.py +349 -0
- spanforge/integrations/groq.py +444 -0
- spanforge/integrations/langchain.py +349 -0
- spanforge/integrations/llamaindex.py +370 -0
- spanforge/integrations/ollama.py +286 -0
- spanforge/integrations/openai.py +370 -0
- spanforge/integrations/together.py +485 -0
- spanforge/metrics.py +393 -0
- spanforge/metrics_export.py +342 -0
- spanforge/migrate.py +278 -0
- spanforge/model_registry.py +282 -0
- spanforge/models.py +407 -0
- spanforge/namespaces/__init__.py +215 -0
- spanforge/namespaces/audit.py +253 -0
- spanforge/namespaces/cache.py +209 -0
- spanforge/namespaces/chain.py +74 -0
- spanforge/namespaces/confidence.py +69 -0
- spanforge/namespaces/consent.py +85 -0
- spanforge/namespaces/cost.py +175 -0
- spanforge/namespaces/decision.py +135 -0
- spanforge/namespaces/diff.py +146 -0
- spanforge/namespaces/drift.py +79 -0
- spanforge/namespaces/eval_.py +232 -0
- spanforge/namespaces/fence.py +180 -0
- spanforge/namespaces/guard.py +104 -0
- spanforge/namespaces/hitl.py +92 -0
- spanforge/namespaces/latency.py +69 -0
- spanforge/namespaces/prompt.py +185 -0
- spanforge/namespaces/redact.py +172 -0
- spanforge/namespaces/template.py +197 -0
- spanforge/namespaces/tool_call.py +76 -0
- spanforge/namespaces/trace.py +1006 -0
- spanforge/normalizer.py +183 -0
- spanforge/presidio_backend.py +149 -0
- spanforge/processor.py +258 -0
- spanforge/prompt_registry.py +415 -0
- spanforge/py.typed +0 -0
- spanforge/redact.py +780 -0
- spanforge/sampling.py +500 -0
- spanforge/schemas/v1.0/schema.json +170 -0
- spanforge/schemas/v2.0/schema.json +536 -0
- spanforge/signing.py +1152 -0
- spanforge/stream.py +559 -0
- spanforge/testing.py +376 -0
- spanforge/trace.py +199 -0
- spanforge/types.py +696 -0
- spanforge/ulid.py +304 -0
- spanforge/validate.py +383 -0
- spanforge-2.0.0.dist-info/METADATA +1777 -0
- spanforge-2.0.0.dist-info/RECORD +101 -0
- spanforge-2.0.0.dist-info/WHEEL +4 -0
- spanforge-2.0.0.dist-info/entry_points.txt +5 -0
- spanforge-2.0.0.dist-info/licenses/LICENSE +21 -0
spanforge/eval.py
ADDED
|
@@ -0,0 +1,575 @@
|
|
|
1
|
+
"""spanforge.eval — Evaluation framework hooks for LLM / agent quality scoring.
|
|
2
|
+
|
|
3
|
+
This module provides lightweight instrumentation for attaching quality scores
|
|
4
|
+
to active spans and emitting them as RFC-0001 ``llm.eval.*`` events. It is
|
|
5
|
+
intentionally infrastructure-agnostic: scores can be produced by RAGAS,
|
|
6
|
+
DeepEval, custom rubric LLMs, or simple rule-based checks.
|
|
7
|
+
|
|
8
|
+
Quick start
|
|
9
|
+
-----------
|
|
10
|
+
::
|
|
11
|
+
|
|
12
|
+
from spanforge import start_span
|
|
13
|
+
from spanforge.eval import record_eval_score, EvalScore
|
|
14
|
+
|
|
15
|
+
with start_span("rag_pipeline") as span:
|
|
16
|
+
answer = run_rag(query)
|
|
17
|
+
# Attach an evaluation score to the active span.
|
|
18
|
+
record_eval_score(
|
|
19
|
+
metric="faithfulness",
|
|
20
|
+
value=0.87,
|
|
21
|
+
span_id=span.span_id,
|
|
22
|
+
trace_id=span.trace_id,
|
|
23
|
+
label="pass",
|
|
24
|
+
metadata={"evaluator": "ragas", "version": "0.1.12"},
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
Batch evaluation
|
|
28
|
+
----------------
|
|
29
|
+
Use :class:`EvalRunner` to run a set of :class:`EvalScorer` callables over a
|
|
30
|
+
list of trace outputs and compare them against a baseline::
|
|
31
|
+
|
|
32
|
+
runner = EvalRunner(scorers=[FaithfulnessScorer(), RelevanceScorer()])
|
|
33
|
+
report = runner.run(dataset)
|
|
34
|
+
report.print_summary()
|
|
35
|
+
|
|
36
|
+
Regression detection
|
|
37
|
+
--------------------
|
|
38
|
+
:class:`RegressionDetector` detects when mean scores drop below a configurable
|
|
39
|
+
threshold relative to a saved baseline and emits
|
|
40
|
+
``llm.eval.regression.detected`` events automatically.
|
|
41
|
+
"""
|
|
42
|
+
|
|
43
|
+
from __future__ import annotations
|
|
44
|
+
|
|
45
|
+
import logging
|
|
46
|
+
import re
|
|
47
|
+
import time
|
|
48
|
+
from dataclasses import dataclass, field
|
|
49
|
+
from typing import Any, Callable, Protocol, runtime_checkable
|
|
50
|
+
|
|
51
|
+
__all__ = [
|
|
52
|
+
"EvalReport",
|
|
53
|
+
"EvalRunner",
|
|
54
|
+
"EvalScore",
|
|
55
|
+
"EvalScorer",
|
|
56
|
+
"FaithfulnessScorer",
|
|
57
|
+
"PIILeakageScorer",
|
|
58
|
+
"RefusalDetectionScorer",
|
|
59
|
+
"RegressionDetector",
|
|
60
|
+
"record_eval_score",
|
|
61
|
+
]
|
|
62
|
+
|
|
63
|
+
_log = logging.getLogger("spanforge.eval")
|
|
64
|
+
|
|
65
|
+
# H13 — span_id / trace_id format patterns (RFC-0001 §8.2)
|
|
66
|
+
_SPAN_ID_PAT: re.Pattern[str] = re.compile(r"^[0-9a-f]{16}$")
|
|
67
|
+
_TRACE_ID_PAT: re.Pattern[str] = re.compile(r"^[0-9a-f]{32}$")
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
# ---------------------------------------------------------------------------
|
|
71
|
+
# EvalScore dataclass
|
|
72
|
+
# ---------------------------------------------------------------------------
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
@dataclass
class EvalScore:
    """One quality measurement, optionally correlated with a span / trace.

    Args:
        metric:    Metric name, e.g. ``"faithfulness"`` or ``"toxicity"``.
        value:     Numeric score.  Usually within ``[0.0, 1.0]``, but no range
                   is enforced here.
        span_id:   Optional span ID (16 hex chars) the score belongs to.
        trace_id:  Optional trace ID (32 hex chars).
        label:     Optional verdict string such as ``"pass"`` / ``"fail"``.
        metadata:  Optional free-form dict describing the evaluator.
        timestamp: Unix time in seconds; defaults to the creation time.
    """

    metric: str
    value: float
    span_id: str | None = None
    trace_id: str | None = None
    label: str | None = None
    metadata: dict[str, Any] | None = None
    timestamp: float = field(default_factory=time.time)

    def to_dict(self) -> dict[str, Any]:
        """Serialise to a plain dict, leaving out optional fields that are ``None``."""
        payload: dict[str, Any] = {
            "metric": self.metric,
            "value": self.value,
            "timestamp": self.timestamp,
        }
        # Optional fields are appended in a fixed order, only when set.
        for key in ("span_id", "trace_id", "label", "metadata"):
            attr = getattr(self, key)
            if attr is not None:
                payload[key] = attr
        return payload

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "EvalScore":
        """Build an instance from a dict such as the one produced by :meth:`to_dict`.

        ``"metric"`` and ``"value"`` are required keys; a missing
        ``"timestamp"`` falls back to the current time.
        """
        metric = data["metric"]
        value = float(data["value"])
        optional = {
            key: data.get(key)
            for key in ("span_id", "trace_id", "label", "metadata")
        }
        stamp = float(data.get("timestamp", time.time()))
        return cls(metric=metric, value=value, timestamp=stamp, **optional)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
# ---------------------------------------------------------------------------
|
|
128
|
+
# record_eval_score — primary public function
|
|
129
|
+
# ---------------------------------------------------------------------------
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def record_eval_score(
    metric: str,
    value: float,
    *,
    span_id: str | None = None,
    trace_id: str | None = None,
    label: str | None = None,
    metadata: dict[str, Any] | None = None,
) -> EvalScore:
    """Record an evaluation score and emit it as an RFC-0001 event.

    The score is emitted as a ``llm.eval.score.recorded`` event via
    ``spanforge._stream.emit_rfc_event``.  Emission is best-effort: any
    failure is logged as a warning and the score is still returned so callers
    can inspect or store it locally.

    Args:
        metric:   Name of the quality metric.
        value:    Numeric score value.
        span_id:  Optional parent span ID (exactly 16 lowercase hex chars).
        trace_id: Optional trace ID (exactly 32 lowercase hex chars).
        label:    Optional human-readable label (``"pass"``/``"fail"``/etc.).
        metadata: Optional free-form dict with evaluator details.

    Returns:
        The :class:`EvalScore` that was recorded and emitted.

    Raises:
        ValueError: If *span_id* or *trace_id* is given but malformed.

    Example::

        score = record_eval_score("faithfulness", 0.92, span_id=span.span_id)
    """
    # H13: validate span_id / trace_id format at the boundary.
    # ``fullmatch`` is required: the patterns end in ``$``, which with
    # ``match`` also succeeds just before a trailing newline, so an ID such
    # as "0123456789abcdef\n" would otherwise slip through.
    if span_id is not None and not _SPAN_ID_PAT.fullmatch(span_id):
        raise ValueError(f"span_id must be 16 lowercase hex chars, got {span_id!r}")
    if trace_id is not None and not _TRACE_ID_PAT.fullmatch(trace_id):
        raise ValueError(f"trace_id must be 32 lowercase hex chars, got {trace_id!r}")

    score = EvalScore(
        metric=metric,
        value=value,
        span_id=span_id,
        trace_id=trace_id,
        label=label,
        metadata=metadata,
    )
    try:
        # Imported lazily, inside the guarded region, so an import failure is
        # handled the same way as an emit failure.
        from spanforge._stream import emit_rfc_event  # noqa: PLC0415
        from spanforge.types import EventType  # noqa: PLC0415

        emit_rfc_event(
            EventType.EVAL_SCORE_RECORDED,
            payload=score.to_dict(),
            span_id=span_id,
            trace_id=trace_id,
        )
    except Exception as exc:  # NOSONAR
        # Deliberately broad: recording a score must never break the caller.
        _log.warning("spanforge.eval: failed to emit eval score event: %s", exc)

    return score
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
# ---------------------------------------------------------------------------
|
|
192
|
+
# EvalScorer protocol
|
|
193
|
+
# ---------------------------------------------------------------------------
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
@runtime_checkable
class EvalScorer(Protocol):
    """Structural interface for scorers that :class:`EvalRunner` can drive.

    An object conforms when it exposes a ``metric_name`` attribute and a
    :meth:`score` method taking one example dict and returning an
    :class:`EvalScore`.
    """

    @property
    def metric_name(self) -> str:
        """Name of the metric this scorer produces (e.g. ``"faithfulness"``)."""
        ...

    def score(self, example: dict[str, Any]) -> EvalScore:
        """Evaluate one example.

        Args:
            example: Dict with at least an ``"output"`` key.  It may also
                     carry ``"reference"``, ``"context"``, ``"span_id"`` and
                     ``"trace_id"`` for correlation.

        Returns:
            An :class:`EvalScore` holding the metric value.
        """
        ...
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
# ---------------------------------------------------------------------------
|
|
224
|
+
# EvalReport
|
|
225
|
+
# ---------------------------------------------------------------------------
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
@dataclass
class EvalReport:
    """Collected output of running scorers over a dataset.

    Args:
        scores:  Flat list of every :class:`EvalScore` produced.
        dataset: The examples the scores were computed from.
    """

    scores: list[EvalScore] = field(default_factory=list)
    dataset: list[dict[str, Any]] = field(default_factory=list)

    def summary(self) -> dict[str, float]:
        """Return the mean value per metric as ``{metric: mean_value}``."""
        grouped: dict[str, list[float]] = {}
        for entry in self.scores:
            grouped.setdefault(entry.metric, []).append(entry.value)
        return {name: sum(values) / len(values) for name, values in grouped.items()}

    def print_summary(self) -> None:  # pragma: no cover
        """Print the per-metric means as a small text table on stdout."""
        means = self.summary()
        rule = "-" * 53
        print(f"{'Metric':<40} {'Mean':>10}")
        print(rule)
        for name in sorted(means):
            mean = means[name]
            print(f"{name:<40} {mean:>10.4f}")
        print(rule)
        print(f"Total scores recorded: {len(self.scores)}")
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
# ---------------------------------------------------------------------------
|
|
260
|
+
# EvalRunner
|
|
261
|
+
# ---------------------------------------------------------------------------
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
class EvalRunner:
    """Apply a set of :class:`EvalScorer` objects to every example of a dataset.

    Args:
        scorers: Scorers to apply to each example.
        emit:    When ``True`` (default) each score is also emitted through
                 :func:`record_eval_score`.  Pass ``False`` to only collect
                 scores in-process.

    Example::

        class FaithfulnessScorer:
            metric_name = "faithfulness"

            def score(self, example):
                # run your faithfulness check here
                return EvalScore("faithfulness", value=..., span_id=example.get("span_id"))

        runner = EvalRunner(scorers=[FaithfulnessScorer()])
        report = runner.run([{"output": "Paris", "reference": "Paris is the capital."}])
        report.print_summary()
    """

    def __init__(
        self,
        scorers: list[EvalScorer] | None = None,
        *,
        emit: bool = True,
    ) -> None:
        # Always keep a private copy so the caller's list is never mutated.
        self._scorers: list[Any] = list(scorers) if scorers else []
        self._emit = emit

    def add_scorer(self, scorer: EvalScorer) -> None:
        """Register one more *scorer* at the end of the list."""
        self._scorers.append(scorer)

    def run(self, dataset: list[dict[str, Any]]) -> EvalReport:
        """Run every scorer against every example in *dataset*.

        A scorer that raises is logged and skipped for that example; the run
        continues with the remaining scorers and examples.

        Args:
            dataset: Example dicts handed to each scorer's
                     :meth:`~EvalScorer.score` method.

        Returns:
            An :class:`EvalReport` with all scores that were produced.
        """
        collected: list[EvalScore] = []
        for example in dataset:
            for scorer in self._scorers:
                try:
                    result = scorer.score(example)
                except Exception as exc:  # NOSONAR
                    scorer_label = getattr(scorer, "metric_name", type(scorer).__name__)
                    _log.warning(
                        "EvalRunner: scorer %r raised on example %r: %s",
                        scorer_label,
                        example,
                        exc,
                    )
                    continue
                if self._emit:
                    self._emit_score(result)
                collected.append(result)
        return EvalReport(scores=collected, dataset=dataset)

    @staticmethod
    def _emit_score(score: Any) -> None:
        """Forward *score* to :func:`record_eval_score`, logging any failure."""
        try:
            record_eval_score(
                metric=score.metric,
                value=score.value,
                span_id=score.span_id,
                trace_id=score.trace_id,
                label=score.label,
                metadata=score.metadata,
            )
        except Exception as exc:  # NOSONAR
            _log.warning("EvalRunner: emit failed: %s", exc)
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
# ---------------------------------------------------------------------------
|
|
340
|
+
# RegressionDetector
|
|
341
|
+
# ---------------------------------------------------------------------------
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
class RegressionDetector:
    """Flag metrics whose mean has fallen too far below a stored baseline.

    A regression is reported for a metric when its current mean is at or
    below ``baseline_mean * (1 - threshold_pct / 100)``.  Each regression is
    logged and, if enabled, emitted as a ``llm.eval.regression.detected``
    RFC-0001 event.

    Args:
        baseline:      ``{metric: baseline_mean}`` dict.  Use :meth:`set_baseline`
                       to change entries later.
        threshold_pct: Percentage drop that counts as a regression.
                       Default: ``5.0`` (5 % drop).
        emit:          If ``True`` (default), regression events are emitted.

    Example::

        detector = RegressionDetector(baseline={"faithfulness": 0.90}, threshold_pct=5.0)
        detector.check(report)
    """

    def __init__(
        self,
        baseline: dict[str, float] | None = None,
        *,
        threshold_pct: float = 5.0,
        emit: bool = True,
    ) -> None:
        # Copy so later set_baseline() calls never touch the caller's dict.
        self._baseline: dict[str, float] = dict(baseline) if baseline else {}
        self._threshold_pct = threshold_pct
        self._emit = emit

    def set_baseline(self, metric: str, value: float) -> None:
        """Store *value* as the baseline mean of *metric*."""
        self._baseline[metric] = value

    def check(self, report: EvalReport) -> list[dict[str, Any]]:
        """Compare the summary of *report* with the baseline.

        Metrics without a baseline, or with a baseline ``<= 0``, are ignored.

        Returns:
            A possibly empty list of regression dicts, each with the keys
            ``metric``, ``baseline``, ``current`` and ``drop_pct``.
        """
        found: list[dict[str, Any]] = []
        for metric, current in report.summary().items():
            reference = self._baseline.get(metric)
            if reference is None or reference <= 0:
                continue
            drop_pct = (reference - current) / reference * 100
            # Written as "not >=" on purpose: a NaN drop must be skipped.
            if not drop_pct >= self._threshold_pct:
                continue
            reg = {
                "metric": metric,
                "baseline": reference,
                "current": current,
                "drop_pct": round(drop_pct, 4),
            }
            found.append(reg)
            _log.warning(
                "spanforge.eval: regression detected for metric=%r "
                "(baseline=%.4f current=%.4f drop=%.2f%%)",
                metric,
                reference,
                current,
                drop_pct,
            )
            if self._emit:
                self._emit_regression(reg)
        return found

    @staticmethod
    def _emit_regression(reg: dict[str, Any]) -> None:
        """Emit one regression event; failures are logged, never raised."""
        try:
            from spanforge._stream import emit_rfc_event  # noqa: PLC0415
            from spanforge.types import EventType  # noqa: PLC0415

            emit_rfc_event(
                EventType.EVAL_REGRESSION_DETECTED,
                payload=reg,
            )
        except Exception as exc:  # NOSONAR
            _log.warning(
                "spanforge.eval: failed to emit regression event: %s", exc
            )
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
# ---------------------------------------------------------------------------
|
|
423
|
+
# Built-in scorers
|
|
424
|
+
# ---------------------------------------------------------------------------
|
|
425
|
+
|
|
426
|
+
# Refusal phrases (case-insensitive) — common patterns indicating model refusal
|
|
427
|
+
_REFUSAL_PHRASES: tuple[str, ...] = (
|
|
428
|
+
"i cannot",
|
|
429
|
+
"i can't",
|
|
430
|
+
"i'm not able to",
|
|
431
|
+
"i am not able to",
|
|
432
|
+
"i'm unable to",
|
|
433
|
+
"i am unable to",
|
|
434
|
+
"i must decline",
|
|
435
|
+
"i must refuse",
|
|
436
|
+
"as an ai",
|
|
437
|
+
"as a language model",
|
|
438
|
+
"i'm sorry, but i",
|
|
439
|
+
"i apologize, but i",
|
|
440
|
+
"i don't think i can",
|
|
441
|
+
"it would be inappropriate",
|
|
442
|
+
"i'm not allowed to",
|
|
443
|
+
"i cannot assist with",
|
|
444
|
+
"i can't help with",
|
|
445
|
+
"i won't be able to",
|
|
446
|
+
"sorry, i can't",
|
|
447
|
+
"i refuse to",
|
|
448
|
+
)
|
|
449
|
+
|
|
450
|
+
|
|
451
|
+
class FaithfulnessScorer:
    """Score whether the output is faithful to the provided context.

    Measures token overlap between *output* and *context* as a proxy for
    factual grounding.  Returns 1.0 when every non-trivial output word
    appears in the context, 0.0 when none do.  The label is ``"pass"`` when
    at least half of the output tokens are found in the context, otherwise
    ``"fail"``.

    If the ``"context"`` value is missing, ``None`` or empty the scorer
    returns 0.0 with label ``"skip"`` (faithfulness cannot be evaluated
    without a reference context).  The same applies when the output yields
    no tokens.

    Example::

        scorer = FaithfulnessScorer()
        score = scorer.score({
            "output": "Paris is the capital of France.",
            "context": "France is a country in Europe.  Its capital is Paris.",
        })
    """

    metric_name: str = "faithfulness"

    # Runs of lowercase alphanumerics; compiled once rather than per call.
    _TOKEN_PAT = re.compile(r"[a-z0-9]+")

    @staticmethod
    def _text(example: dict[str, Any], key: str) -> str:
        """Return ``example[key]`` as text, mapping a missing/``None`` value to ``""``.

        Without the explicit ``None`` check, ``str(None)`` would yield the
        literal text ``"None"`` and be scored as if it were real content.
        """
        raw = example.get(key)
        return "" if raw is None else str(raw)

    @classmethod
    def _tokens(cls, text: str) -> set[str]:
        """Lower-case *text* and return its alphanumeric tokens longer than two chars."""
        return {w for w in cls._TOKEN_PAT.findall(text.lower()) if len(w) > 2}

    def _result(
        self,
        example: dict[str, Any],
        value: float,
        label: str,
        metadata: dict[str, Any] | None = None,
    ) -> EvalScore:
        """Build the :class:`EvalScore`, copying correlation IDs from *example*."""
        return EvalScore(
            metric=self.metric_name,
            value=value,
            span_id=example.get("span_id"),
            trace_id=example.get("trace_id"),
            label=label,
            metadata=metadata,
        )

    def score(self, example: dict[str, Any]) -> EvalScore:
        """Compute the fraction of output tokens that also occur in the context.

        Args:
            example: Dict with ``"output"`` and ``"context"`` entries; optional
                     ``"span_id"`` / ``"trace_id"`` are copied onto the score.

        Returns:
            An :class:`EvalScore` whose value is the overlap ratio rounded to
            four decimals, or a ``"skip"`` score of 0.0 when there is nothing
            to compare.
        """
        output = self._text(example, "output")
        context = self._text(example, "context")

        if not context:
            return self._result(
                example, 0.0, "skip", {"reason": "no context provided"}
            )

        # Only length filtering is applied (tokens of 1-2 chars are dropped);
        # there is no stop-word list.
        out_tokens = self._tokens(output)
        if not out_tokens:
            return self._result(example, 0.0, "skip", {"reason": "empty output"})

        ctx_tokens = self._tokens(context)
        overlap = len(out_tokens & ctx_tokens) / len(out_tokens)
        label = "pass" if overlap >= 0.5 else "fail"
        return self._result(example, round(overlap, 4), label)
|
|
513
|
+
|
|
514
|
+
|
|
515
|
+
class RefusalDetectionScorer:
    """Detect whether the model output is a refusal / decline.

    Checks the lower-cased output for any of the module's common refusal
    phrases (substring match).  Returns 1.0 with label ``"refusal"`` if one
    is found, 0.0 with label ``"pass"`` otherwise.

    Typographic apostrophes are normalised to ``'`` before matching, so
    ``"I can’t help with that"`` is detected just like ``"I can't ..."``.

    Example::

        scorer = RefusalDetectionScorer()
        score = scorer.score({"output": "I'm sorry, but I can't help with that."})
        assert score.value == 1.0
    """

    metric_name: str = "refusal_detection"

    # The phrase list is written with ASCII apostrophes, while model output
    # often contains U+2019 (and occasionally U+2018 / U+02BC) instead.
    _APOSTROPHE_MAP = str.maketrans({"\u2019": "'", "\u2018": "'", "\u02bc": "'"})

    def score(self, example: dict[str, Any]) -> EvalScore:
        """Score one example.

        Args:
            example: Dict whose ``"output"`` entry is inspected; optional
                     ``"span_id"`` / ``"trace_id"`` are copied onto the score.

        Returns:
            An :class:`EvalScore` with value 1.0 (refusal) or 0.0 (no refusal).
        """
        output: str = str(example.get("output", "")).lower()
        output = output.translate(self._APOSTROPHE_MAP)

        detected = any(phrase in output for phrase in _REFUSAL_PHRASES)

        return EvalScore(
            metric=self.metric_name,
            value=1.0 if detected else 0.0,
            span_id=example.get("span_id"),
            trace_id=example.get("trace_id"),
            label="refusal" if detected else "pass",
        )
|
|
542
|
+
|
|
543
|
+
|
|
544
|
+
class PIILeakageScorer:
    """Flag model outputs that contain PII.

    The ``"output"`` value is passed to :func:`~spanforge.redact.scan_payload`.
    The score is 1.0 (label ``"leak"``) when the scan result is not clean and
    0.0 (label ``"pass"``) when it is.

    Example::

        scorer = PIILeakageScorer()
        score = scorer.score({"output": "Contact me at alice@example.com"})
        assert score.value == 1.0
    """

    metric_name: str = "pii_leakage"

    def score(self, example: dict[str, Any]) -> EvalScore:
        """Scan the example's output and report whether anything was found.

        On a leak, the number of hits is attached as ``metadata["hit_count"]``.
        """
        from spanforge.redact import scan_payload  # noqa: PLC0415

        text: str = str(example.get("output", ""))
        scan = scan_payload({"output": text})

        if scan.clean:
            value, label, details = 0.0, "pass", None
        else:
            value, label, details = 1.0, "leak", {"hit_count": len(scan.hits)}

        return EvalScore(
            metric=self.metric_name,
            value=value,
            span_id=example.get("span_id"),
            trace_id=example.get("trace_id"),
            label=label,
            metadata=details,
        )
|