spanforge 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. spanforge/__init__.py +695 -0
  2. spanforge/_batch_exporter.py +322 -0
  3. spanforge/_cli.py +3081 -0
  4. spanforge/_hooks.py +340 -0
  5. spanforge/_server.py +953 -0
  6. spanforge/_span.py +1015 -0
  7. spanforge/_store.py +287 -0
  8. spanforge/_stream.py +654 -0
  9. spanforge/_trace.py +334 -0
  10. spanforge/_tracer.py +253 -0
  11. spanforge/actor.py +141 -0
  12. spanforge/alerts.py +464 -0
  13. spanforge/auto.py +181 -0
  14. spanforge/baseline.py +336 -0
  15. spanforge/config.py +460 -0
  16. spanforge/consent.py +227 -0
  17. spanforge/consumer.py +379 -0
  18. spanforge/core/__init__.py +5 -0
  19. spanforge/core/compliance_mapping.py +1060 -0
  20. spanforge/cost.py +597 -0
  21. spanforge/debug.py +514 -0
  22. spanforge/drift.py +488 -0
  23. spanforge/egress.py +63 -0
  24. spanforge/eval.py +575 -0
  25. spanforge/event.py +1052 -0
  26. spanforge/exceptions.py +246 -0
  27. spanforge/explain.py +181 -0
  28. spanforge/export/__init__.py +50 -0
  29. spanforge/export/append_only.py +342 -0
  30. spanforge/export/cloud.py +349 -0
  31. spanforge/export/datadog.py +495 -0
  32. spanforge/export/grafana.py +331 -0
  33. spanforge/export/jsonl.py +198 -0
  34. spanforge/export/otel_bridge.py +291 -0
  35. spanforge/export/otlp.py +817 -0
  36. spanforge/export/otlp_bridge.py +231 -0
  37. spanforge/export/redis_backend.py +282 -0
  38. spanforge/export/webhook.py +302 -0
  39. spanforge/exporters/__init__.py +29 -0
  40. spanforge/exporters/console.py +271 -0
  41. spanforge/exporters/jsonl.py +144 -0
  42. spanforge/hitl.py +297 -0
  43. spanforge/inspect.py +429 -0
  44. spanforge/integrations/__init__.py +39 -0
  45. spanforge/integrations/_pricing.py +277 -0
  46. spanforge/integrations/anthropic.py +388 -0
  47. spanforge/integrations/bedrock.py +306 -0
  48. spanforge/integrations/crewai.py +251 -0
  49. spanforge/integrations/gemini.py +349 -0
  50. spanforge/integrations/groq.py +444 -0
  51. spanforge/integrations/langchain.py +349 -0
  52. spanforge/integrations/llamaindex.py +370 -0
  53. spanforge/integrations/ollama.py +286 -0
  54. spanforge/integrations/openai.py +370 -0
  55. spanforge/integrations/together.py +485 -0
  56. spanforge/metrics.py +393 -0
  57. spanforge/metrics_export.py +342 -0
  58. spanforge/migrate.py +278 -0
  59. spanforge/model_registry.py +282 -0
  60. spanforge/models.py +407 -0
  61. spanforge/namespaces/__init__.py +215 -0
  62. spanforge/namespaces/audit.py +253 -0
  63. spanforge/namespaces/cache.py +209 -0
  64. spanforge/namespaces/chain.py +74 -0
  65. spanforge/namespaces/confidence.py +69 -0
  66. spanforge/namespaces/consent.py +85 -0
  67. spanforge/namespaces/cost.py +175 -0
  68. spanforge/namespaces/decision.py +135 -0
  69. spanforge/namespaces/diff.py +146 -0
  70. spanforge/namespaces/drift.py +79 -0
  71. spanforge/namespaces/eval_.py +232 -0
  72. spanforge/namespaces/fence.py +180 -0
  73. spanforge/namespaces/guard.py +104 -0
  74. spanforge/namespaces/hitl.py +92 -0
  75. spanforge/namespaces/latency.py +69 -0
  76. spanforge/namespaces/prompt.py +185 -0
  77. spanforge/namespaces/redact.py +172 -0
  78. spanforge/namespaces/template.py +197 -0
  79. spanforge/namespaces/tool_call.py +76 -0
  80. spanforge/namespaces/trace.py +1006 -0
  81. spanforge/normalizer.py +183 -0
  82. spanforge/presidio_backend.py +149 -0
  83. spanforge/processor.py +258 -0
  84. spanforge/prompt_registry.py +415 -0
  85. spanforge/py.typed +0 -0
  86. spanforge/redact.py +780 -0
  87. spanforge/sampling.py +500 -0
  88. spanforge/schemas/v1.0/schema.json +170 -0
  89. spanforge/schemas/v2.0/schema.json +536 -0
  90. spanforge/signing.py +1152 -0
  91. spanforge/stream.py +559 -0
  92. spanforge/testing.py +376 -0
  93. spanforge/trace.py +199 -0
  94. spanforge/types.py +696 -0
  95. spanforge/ulid.py +304 -0
  96. spanforge/validate.py +383 -0
  97. spanforge-2.0.0.dist-info/METADATA +1777 -0
  98. spanforge-2.0.0.dist-info/RECORD +101 -0
  99. spanforge-2.0.0.dist-info/WHEEL +4 -0
  100. spanforge-2.0.0.dist-info/entry_points.txt +5 -0
  101. spanforge-2.0.0.dist-info/licenses/LICENSE +21 -0
spanforge/eval.py ADDED
@@ -0,0 +1,575 @@
1
+ """spanforge.eval — Evaluation framework hooks for LLM / agent quality scoring.
2
+
3
+ This module provides lightweight instrumentation for attaching quality scores
4
+ to active spans and emitting them as RFC-0001 ``llm.eval.*`` events. It is
5
+ intentionally infrastructure-agnostic: scores can be produced by RAGAS,
6
+ DeepEval, custom rubric LLMs, or simple rule-based checks.
7
+
8
+ Quick start
9
+ -----------
10
+ ::
11
+
12
+ from spanforge import start_span
13
+ from spanforge.eval import record_eval_score, EvalScore
14
+
15
+ with start_span("rag_pipeline") as span:
16
+ answer = run_rag(query)
17
+ # Attach an evaluation score to the active span.
18
+ record_eval_score(
19
+ metric="faithfulness",
20
+ value=0.87,
21
+ span_id=span.span_id,
22
+ trace_id=span.trace_id,
23
+ label="pass",
24
+ metadata={"evaluator": "ragas", "version": "0.1.12"},
25
+ )
26
+
27
+ Batch evaluation
28
+ ----------------
29
+ Use :class:`EvalRunner` to run a set of :class:`EvalScorer` callables over a
30
+ list of trace outputs and compare them against a baseline::
31
+
32
+ runner = EvalRunner(scorers=[FaithfulnessScorer(), RefusalDetectionScorer()])
33
+ report = runner.run(dataset)
34
+ report.print_summary()
35
+
36
+ Regression detection
37
+ --------------------
38
+ :class:`RegressionDetector` detects when mean scores drop below a configurable
39
+ threshold relative to a saved baseline and emits
40
+ ``llm.eval.regression.detected`` events automatically.
41
+ """
42
+
43
+ from __future__ import annotations
44
+
45
+ import logging
46
+ import re
47
+ import time
48
+ from dataclasses import dataclass, field
49
+ from typing import Any, Callable, Protocol, runtime_checkable
50
+
51
+ __all__ = [
52
+ "EvalReport",
53
+ "EvalRunner",
54
+ "EvalScore",
55
+ "EvalScorer",
56
+ "FaithfulnessScorer",
57
+ "PIILeakageScorer",
58
+ "RefusalDetectionScorer",
59
+ "RegressionDetector",
60
+ "record_eval_score",
61
+ ]
62
+
63
+ _log = logging.getLogger("spanforge.eval")
64
+
65
+ # H13 — span_id / trace_id format patterns (RFC-0001 §8.2)
66
+ _SPAN_ID_PAT: re.Pattern[str] = re.compile(r"^[0-9a-f]{16}$")
67
+ _TRACE_ID_PAT: re.Pattern[str] = re.compile(r"^[0-9a-f]{32}$")
68
+
69
+
70
+ # ---------------------------------------------------------------------------
71
+ # EvalScore dataclass
72
+ # ---------------------------------------------------------------------------
73
+
74
+
75
@dataclass
class EvalScore:
    """One quality measurement tied to a span or agent run.

    Args:
        metric:    Metric name, e.g. ``"faithfulness"`` or ``"toxicity"``.
        value:     Numeric score.  Usually within ``[0.0, 1.0]``, but the
                   value is stored as given and never clamped.
        span_id:   Optional ID of the parent span.
        trace_id:  Optional trace ID.
        label:     Optional string label (``"pass"`` / ``"fail"`` / ``"warn"``).
        metadata:  Optional free-form metadata dict.
        timestamp: Unix timestamp in seconds; defaults to ``time.time()`` at
                   construction.
    """

    metric: str
    value: float
    span_id: str | None = None
    trace_id: str | None = None
    label: str | None = None
    metadata: dict[str, Any] | None = None
    timestamp: float = field(default_factory=time.time)

    def to_dict(self) -> dict[str, Any]:
        """Serialise to a plain dict, leaving out optional fields that are ``None``."""
        payload: dict[str, Any] = {
            "metric": self.metric,
            "value": self.value,
            "timestamp": self.timestamp,
        }
        # Optional fields are appended in declaration order so the key order
        # of the resulting dict is stable.
        for name in ("span_id", "trace_id", "label", "metadata"):
            attr = getattr(self, name)
            if attr is not None:
                payload[name] = attr
        return payload

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "EvalScore":
        """Build an instance from a dict shaped like :meth:`to_dict` output.

        ``"metric"`` and ``"value"`` are required (``KeyError`` otherwise);
        ``"value"`` and ``"timestamp"`` are coerced with ``float()``.  A
        missing ``"timestamp"`` falls back to the current time.
        """
        metric = data["metric"]
        value = float(data["value"])
        optional = {
            key: data.get(key)
            for key in ("span_id", "trace_id", "label", "metadata")
        }
        stamp = float(data.get("timestamp", time.time()))
        return cls(metric=metric, value=value, timestamp=stamp, **optional)
125
+
126
+
127
+ # ---------------------------------------------------------------------------
128
+ # record_eval_score — primary public function
129
+ # ---------------------------------------------------------------------------
130
+
131
+
132
def record_eval_score(
    metric: str,
    value: float,
    *,
    span_id: str | None = None,
    trace_id: str | None = None,
    label: str | None = None,
    metadata: dict[str, Any] | None = None,
) -> EvalScore:
    """Record an evaluation score and emit it as an RFC-0001 event.

    The score is emitted as an ``EventType.EVAL_SCORE_RECORDED`` event through
    ``spanforge._stream.emit_rfc_event``.  Emission is best-effort: any
    exception raised while importing or emitting is logged as a warning and
    the score is still returned.

    Args:
        metric:   Name of the quality metric.
        value:    Numeric score value.
        span_id:  Optional parent span ID (exactly 16 lowercase hex chars).
        trace_id: Optional trace ID (exactly 32 lowercase hex chars).
        label:    Optional human-readable label (``"pass"``/``"fail"``/etc.).
        metadata: Optional free-form dict with evaluator details.

    Returns:
        The :class:`EvalScore` that was recorded.

    Raises:
        ValueError: If *span_id* or *trace_id* is given but malformed.

    Example::

        score = record_eval_score("faithfulness", 0.92, span_id=span.span_id)
    """
    # H13: validate span_id / trace_id format at the boundary.
    # ``fullmatch`` is required here: with ``match`` the ``$`` anchor also
    # succeeds just before a trailing newline, so "…\n" would be accepted.
    if span_id is not None and not _SPAN_ID_PAT.fullmatch(span_id):
        raise ValueError(f"span_id must be 16 lowercase hex chars, got {span_id!r}")
    if trace_id is not None and not _TRACE_ID_PAT.fullmatch(trace_id):
        raise ValueError(f"trace_id must be 32 lowercase hex chars, got {trace_id!r}")

    score = EvalScore(
        metric=metric,
        value=value,
        span_id=span_id,
        trace_id=trace_id,
        label=label,
        metadata=metadata,
    )
    try:
        # Imported lazily so that a failing import is handled exactly like a
        # failing emit.
        from spanforge._stream import emit_rfc_event  # noqa: PLC0415
        from spanforge.types import EventType  # noqa: PLC0415

        emit_rfc_event(
            EventType.EVAL_SCORE_RECORDED,
            payload=score.to_dict(),
            span_id=span_id,
            trace_id=trace_id,
        )
    except Exception as exc:  # NOSONAR
        # Deliberate best-effort: telemetry failures must not break callers.
        _log.warning("spanforge.eval: failed to emit eval score event: %s", exc)

    return score
189
+
190
+
191
+ # ---------------------------------------------------------------------------
192
+ # EvalScorer protocol
193
+ # ---------------------------------------------------------------------------
194
+
195
+
196
@runtime_checkable
class EvalScorer(Protocol):
    """Structural interface for scorers accepted by :class:`EvalRunner`.

    A conforming object exposes a ``metric_name`` attribute and a
    :meth:`score` method that turns one example dict into an
    :class:`EvalScore`.
    """

    @property
    def metric_name(self) -> str:
        """Name of the metric this scorer produces (e.g. ``"faithfulness"``)."""
        ...

    def score(self, example: dict[str, Any]) -> EvalScore:
        """Evaluate one example.

        Args:
            example: Example dict.  It should carry at least an ``"output"``
                key and may also carry ``"reference"``, ``"context"``,
                ``"span_id"`` and ``"trace_id"`` for correlation.

        Returns:
            An :class:`EvalScore` holding the metric value.
        """
        ...
221
+
222
+
223
+ # ---------------------------------------------------------------------------
224
+ # EvalReport
225
+ # ---------------------------------------------------------------------------
226
+
227
+
228
@dataclass
class EvalReport:
    """Collected output of running scorers over a dataset.

    Args:
        scores:  Flat list of every :class:`EvalScore` produced.
        dataset: The examples the scores were computed from.
    """

    scores: list[EvalScore] = field(default_factory=list)
    dataset: list[dict[str, Any]] = field(default_factory=list)

    def summary(self) -> dict[str, float]:
        """Return the arithmetic mean of the recorded values, keyed by metric."""
        grouped: dict[str, list[float]] = {}
        for entry in self.scores:
            grouped.setdefault(entry.metric, []).append(entry.value)

        means: dict[str, float] = {}
        for name, values in grouped.items():
            # Every group has at least one value, so the division is safe.
            means[name] = sum(values) / len(values)
        return means

    def print_summary(self) -> None:  # pragma: no cover
        """Print the per-metric means as a plain-text table, sorted by metric."""
        means = self.summary()
        rule = "-" * 53
        print(f"{'Metric':<40} {'Mean':>10}")
        print(rule)
        for name in sorted(means):
            print(f"{name:<40} {means[name]:>10.4f}")
        print(rule)
        print(f"Total scores recorded: {len(self.scores)}")
257
+
258
+
259
+ # ---------------------------------------------------------------------------
260
+ # EvalRunner
261
+ # ---------------------------------------------------------------------------
262
+
263
+
264
class EvalRunner:
    """Apply a set of :class:`EvalScorer` objects to every example of a dataset.

    Args:
        scorers: Scorers to apply to each example.  The list is copied.
        emit:    When ``True`` (default) each produced score is also passed to
                 :func:`record_eval_score`; when ``False`` scores are only
                 collected in-process.

    Example::

        class FaithfulnessScorer:
            metric_name = "faithfulness"

            def score(self, example):
                # run your faithfulness check here
                return EvalScore("faithfulness", value=..., span_id=example.get("span_id"))

        runner = EvalRunner(scorers=[FaithfulnessScorer()])
        report = runner.run([{"output": "Paris", "reference": "Paris is the capital."}])
        report.print_summary()
    """

    def __init__(
        self,
        scorers: list[EvalScorer] | None = None,
        *,
        emit: bool = True,
    ) -> None:
        self._scorers: list[Any] = list(scorers) if scorers else []
        self._emit = emit

    def add_scorer(self, scorer: EvalScorer) -> None:
        """Register one more scorer; it runs after those already present."""
        self._scorers.append(scorer)

    def _emit_score(self, result: EvalScore) -> None:
        """Forward *result* to :func:`record_eval_score`, logging any failure."""
        try:
            record_eval_score(
                metric=result.metric,
                value=result.value,
                span_id=result.span_id,
                trace_id=result.trace_id,
                label=result.label,
                metadata=result.metadata,
            )
        except Exception as exc:  # NOSONAR
            _log.warning("EvalRunner: emit failed: %s", exc)

    def run(self, dataset: list[dict[str, Any]]) -> EvalReport:
        """Run every scorer on every example of *dataset*.

        A scorer that raises is logged and skipped for that example; a failed
        emit is logged but the score is still kept.

        Args:
            dataset: Example dicts handed to each scorer's
                :meth:`~EvalScorer.score` method.

        Returns:
            An :class:`EvalReport` with all collected scores and *dataset*.
        """
        collected: list[EvalScore] = []
        for example in dataset:
            for scorer in self._scorers:
                try:
                    result = scorer.score(example)
                except Exception as exc:  # NOSONAR
                    _log.warning(
                        "EvalRunner: scorer %r raised on example %r: %s",
                        getattr(scorer, "metric_name", type(scorer).__name__),
                        example,
                        exc,
                    )
                else:
                    if self._emit:
                        self._emit_score(result)
                    collected.append(result)
        return EvalReport(scores=collected, dataset=dataset)
337
+
338
+
339
+ # ---------------------------------------------------------------------------
340
+ # RegressionDetector
341
+ # ---------------------------------------------------------------------------
342
+
343
+
344
class RegressionDetector:
    """Flag metrics whose mean score fell too far below a stored baseline.

    A regression is reported when the relative drop
    ``(baseline - current) / baseline * 100`` is at least ``threshold_pct``.
    Each regression is logged as a warning and, when *emit* is true, sent as
    an ``EventType.EVAL_REGRESSION_DETECTED`` event.

    Args:
        baseline:      ``{metric: baseline_mean}`` dict (copied).  Entries can
                       also be added later with :meth:`set_baseline`.
        threshold_pct: Percentage drop that triggers a regression.
                       Default: ``5.0`` (5 % drop).
        emit:          If ``True`` (default), regression events are emitted.

    Example::

        detector = RegressionDetector(baseline={"faithfulness": 0.90}, threshold_pct=5.0)
        detector.check(report)
    """

    def __init__(
        self,
        baseline: dict[str, float] | None = None,
        *,
        threshold_pct: float = 5.0,
        emit: bool = True,
    ) -> None:
        self._baseline: dict[str, float] = dict(baseline) if baseline else {}
        self._threshold_pct = threshold_pct
        self._emit = emit

    def set_baseline(self, metric: str, value: float) -> None:
        """Store *value* as the baseline mean for *metric*, replacing any old one."""
        self._baseline[metric] = value

    def _emit_regression(self, regression: dict[str, Any]) -> None:
        """Send *regression* as an RFC event; failures are logged, not raised."""
        try:
            from spanforge._stream import emit_rfc_event  # noqa: PLC0415
            from spanforge.types import EventType  # noqa: PLC0415

            emit_rfc_event(
                EventType.EVAL_REGRESSION_DETECTED,
                payload=regression,
            )
        except Exception as exc:  # NOSONAR
            _log.warning(
                "spanforge.eval: failed to emit regression event: %s", exc
            )

    def check(self, report: EvalReport) -> list[dict[str, Any]]:
        """Compare the per-metric means of *report* with the baseline.

        Metrics without a baseline, or with a baseline ``<= 0``, are ignored.

        Returns:
            A possibly empty list of dicts with the keys ``metric``,
            ``baseline``, ``current`` and ``drop_pct`` (rounded to 4 places).
        """
        found: list[dict[str, Any]] = []
        for metric, current in report.summary().items():
            reference = self._baseline.get(metric)
            if reference is None or reference <= 0:
                continue

            drop_pct = (reference - current) / reference * 100
            # Written as ``not (>=)`` rather than ``<`` so that a NaN drop is
            # skipped exactly as a failed ``>=`` comparison would skip it.
            if not (drop_pct >= self._threshold_pct):
                continue

            regression = {
                "metric": metric,
                "baseline": reference,
                "current": current,
                "drop_pct": round(drop_pct, 4),
            }
            found.append(regression)
            _log.warning(
                "spanforge.eval: regression detected for metric=%r "
                "(baseline=%.4f current=%.4f drop=%.2f%%)",
                metric,
                reference,
                current,
                drop_pct,
            )
            if self._emit:
                self._emit_regression(regression)
        return found
420
+
421
+
422
+ # ---------------------------------------------------------------------------
423
+ # Built-in scorers
424
+ # ---------------------------------------------------------------------------
425
+
426
+ # Refusal phrases — lowercase substrings that commonly indicate a model refusal; compared against the lowercased output
427
+ _REFUSAL_PHRASES: tuple[str, ...] = (
428
+ "i cannot",
429
+ "i can't",
430
+ "i'm not able to",
431
+ "i am not able to",
432
+ "i'm unable to",
433
+ "i am unable to",
434
+ "i must decline",
435
+ "i must refuse",
436
+ "as an ai",
437
+ "as a language model",
438
+ "i'm sorry, but i",
439
+ "i apologize, but i",
440
+ "i don't think i can",
441
+ "it would be inappropriate",
442
+ "i'm not allowed to",
443
+ "i cannot assist with",
444
+ "i can't help with",
445
+ "i won't be able to",
446
+ "sorry, i can't",
447
+ "i refuse to",
448
+ )
449
+
450
+
451
class FaithfulnessScorer:
    """Score whether the output is faithful to the provided context.

    Measures token overlap between *output* and *context* as a proxy for
    factual grounding: the score is the fraction of distinct output tokens
    that also occur in the context (1.0 when all do, 0.0 when none do).
    Tokens are lowercase ``[a-z0-9]+`` runs longer than two characters.
    An overlap of at least 0.5 is labelled ``"pass"``, otherwise ``"fail"``.

    If ``"context"`` is missing, ``None`` or empty, or the output yields no
    tokens, the scorer returns 0.0 with label ``"skip"`` and a ``"reason"``
    entry in the metadata.

    Example::

        scorer = FaithfulnessScorer()
        score = scorer.score({
            "output": "Paris is the capital of France.",
            "context": "France is a country in Europe. Its capital is Paris.",
        })
    """

    metric_name: str = "faithfulness"

    @staticmethod
    def _tokens(text: str) -> set[str]:
        """Return the distinct lowercase alphanumeric tokens of *text*.

        Tokens of one or two characters are dropped; no stopword list is used.
        """
        return {w for w in re.findall(r"[a-z0-9]+", text.lower()) if len(w) > 2}

    def _result(
        self,
        example: dict[str, Any],
        value: float,
        label: str,
        reason: str | None = None,
    ) -> EvalScore:
        """Build the :class:`EvalScore` for *example*, copying its span/trace IDs."""
        return EvalScore(
            metric=self.metric_name,
            value=value,
            span_id=example.get("span_id"),
            trace_id=example.get("trace_id"),
            label=label,
            metadata=None if reason is None else {"reason": reason},
        )

    def score(self, example: dict[str, Any]) -> EvalScore:
        """Score one example by output/context token overlap.

        Args:
            example: Dict read for the keys ``"output"``, ``"context"``,
                ``"span_id"`` and ``"trace_id"``.  Non-string output/context
                values are converted with ``str()``.

        Returns:
            An :class:`EvalScore` whose value is the overlap rounded to four
            decimal places, or a 0.0 ``"skip"`` score when it cannot be
            computed.
        """
        raw_output = example.get("output")
        raw_context = example.get("context")
        # An explicit None must count as "missing"; str(None) would otherwise
        # turn it into the literal text "None" and be scored as real content.
        output = "" if raw_output is None else str(raw_output)
        context = "" if raw_context is None else str(raw_context)

        if not context:
            return self._result(example, 0.0, "skip", "no context provided")

        out_tokens = self._tokens(output)
        if not out_tokens:
            return self._result(example, 0.0, "skip", "empty output")

        ctx_tokens = self._tokens(context)
        overlap = len(out_tokens & ctx_tokens) / len(out_tokens)
        label = "pass" if overlap >= 0.5 else "fail"
        return self._result(example, round(overlap, 4), label)
513
+
514
+
515
class RefusalDetectionScorer:
    """Detect whether the model output is a refusal / decline.

    The output is lowercased, typographic apostrophes are folded to ``'``,
    and the result is searched for any entry of ``_REFUSAL_PHRASES`` as a
    plain substring.  Returns 1.0 with label ``"refusal"`` if one is found,
    0.0 with label ``"pass"`` otherwise.

    Example::

        scorer = RefusalDetectionScorer()
        score = scorer.score({"output": "I'm sorry, but I can't help with that."})
        assert score.value == 1.0
    """

    metric_name: str = "refusal_detection"

    # The phrase list is written with ASCII apostrophes, while generated text
    # frequently contains typographic ones ("I can’t"); fold those to "'" so
    # such refusals are not missed.
    _APOSTROPHE_FOLD = str.maketrans({"\u2018": "'", "\u2019": "'", "\u02bc": "'"})

    def score(self, example: dict[str, Any]) -> EvalScore:
        """Score one example.

        Args:
            example: Dict read for the keys ``"output"``, ``"span_id"`` and
                ``"trace_id"``.  A missing output is treated as ``""``.

        Returns:
            An :class:`EvalScore` with value 1.0 (refusal) or 0.0.
        """
        output: str = (
            str(example.get("output", "")).lower().translate(self._APOSTROPHE_FOLD)
        )

        detected = any(phrase in output for phrase in _REFUSAL_PHRASES)

        return EvalScore(
            metric=self.metric_name,
            value=1.0 if detected else 0.0,
            span_id=example.get("span_id"),
            trace_id=example.get("trace_id"),
            label="refusal" if detected else "pass",
        )
542
+
543
+
544
class PIILeakageScorer:
    """Flag model outputs that contain PII.

    The ``"output"`` value is handed to :func:`~spanforge.redact.scan_payload`
    and the scan result decides the score: 1.0 with label ``"leak"`` when the
    result is not clean, 0.0 with label ``"pass"`` when it is.  For a leak the
    number of hits is recorded in the metadata under ``"hit_count"``.

    Example::

        scorer = PIILeakageScorer()
        score = scorer.score({"output": "Contact me at alice@example.com"})
        assert score.value == 1.0
    """

    metric_name: str = "pii_leakage"

    def score(self, example: dict[str, Any]) -> EvalScore:
        """Scan the example's output and return the leakage score."""
        from spanforge.redact import scan_payload  # noqa: PLC0415

        text: str = str(example.get("output", ""))
        scan = scan_payload({"output": text})

        has_pii = not scan.clean
        if has_pii:
            value, label = 1.0, "leak"
        else:
            value, label = 0.0, "pass"

        return EvalScore(
            metric=self.metric_name,
            value=value,
            span_id=example.get("span_id"),
            trace_id=example.get("trace_id"),
            label=label,
            metadata={"hit_count": len(scan.hits)} if has_pii else None,
        )