spanforge 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174) hide show
  1. spanforge/__init__.py +815 -0
  2. spanforge/_ansi.py +93 -0
  3. spanforge/_batch_exporter.py +409 -0
  4. spanforge/_cli.py +2094 -0
  5. spanforge/_cli_audit.py +639 -0
  6. spanforge/_cli_compliance.py +711 -0
  7. spanforge/_cli_cost.py +243 -0
  8. spanforge/_cli_ops.py +791 -0
  9. spanforge/_cli_phase11.py +356 -0
  10. spanforge/_hooks.py +337 -0
  11. spanforge/_server.py +1708 -0
  12. spanforge/_span.py +1036 -0
  13. spanforge/_store.py +288 -0
  14. spanforge/_stream.py +664 -0
  15. spanforge/_trace.py +335 -0
  16. spanforge/_tracer.py +254 -0
  17. spanforge/actor.py +141 -0
  18. spanforge/alerts.py +469 -0
  19. spanforge/auto.py +464 -0
  20. spanforge/baseline.py +335 -0
  21. spanforge/cache.py +635 -0
  22. spanforge/compliance.py +325 -0
  23. spanforge/config.py +532 -0
  24. spanforge/consent.py +228 -0
  25. spanforge/consumer.py +377 -0
  26. spanforge/core/__init__.py +5 -0
  27. spanforge/core/compliance_mapping.py +1254 -0
  28. spanforge/cost.py +600 -0
  29. spanforge/debug.py +548 -0
  30. spanforge/deprecations.py +205 -0
  31. spanforge/drift.py +482 -0
  32. spanforge/egress.py +58 -0
  33. spanforge/eval.py +648 -0
  34. spanforge/event.py +1064 -0
  35. spanforge/exceptions.py +240 -0
  36. spanforge/explain.py +178 -0
  37. spanforge/export/__init__.py +69 -0
  38. spanforge/export/append_only.py +337 -0
  39. spanforge/export/cloud.py +357 -0
  40. spanforge/export/datadog.py +497 -0
  41. spanforge/export/grafana.py +320 -0
  42. spanforge/export/jsonl.py +195 -0
  43. spanforge/export/openinference.py +158 -0
  44. spanforge/export/otel_bridge.py +294 -0
  45. spanforge/export/otlp.py +811 -0
  46. spanforge/export/otlp_bridge.py +233 -0
  47. spanforge/export/redis_backend.py +282 -0
  48. spanforge/export/siem_schema.py +98 -0
  49. spanforge/export/siem_splunk.py +264 -0
  50. spanforge/export/siem_syslog.py +212 -0
  51. spanforge/export/webhook.py +299 -0
  52. spanforge/exporters/__init__.py +30 -0
  53. spanforge/exporters/console.py +271 -0
  54. spanforge/exporters/jsonl.py +144 -0
  55. spanforge/exporters/sqlite.py +142 -0
  56. spanforge/gate.py +1150 -0
  57. spanforge/governance.py +181 -0
  58. spanforge/hitl.py +295 -0
  59. spanforge/http.py +187 -0
  60. spanforge/inspect.py +427 -0
  61. spanforge/integrations/__init__.py +45 -0
  62. spanforge/integrations/_pricing.py +280 -0
  63. spanforge/integrations/anthropic.py +388 -0
  64. spanforge/integrations/azure_openai.py +133 -0
  65. spanforge/integrations/bedrock.py +292 -0
  66. spanforge/integrations/crewai.py +251 -0
  67. spanforge/integrations/gemini.py +351 -0
  68. spanforge/integrations/groq.py +442 -0
  69. spanforge/integrations/langchain.py +349 -0
  70. spanforge/integrations/langgraph.py +306 -0
  71. spanforge/integrations/llamaindex.py +373 -0
  72. spanforge/integrations/ollama.py +287 -0
  73. spanforge/integrations/openai.py +368 -0
  74. spanforge/integrations/together.py +483 -0
  75. spanforge/io.py +214 -0
  76. spanforge/lint.py +322 -0
  77. spanforge/metrics.py +417 -0
  78. spanforge/metrics_export.py +343 -0
  79. spanforge/migrate.py +402 -0
  80. spanforge/model_registry.py +278 -0
  81. spanforge/models.py +389 -0
  82. spanforge/namespaces/__init__.py +254 -0
  83. spanforge/namespaces/audit.py +256 -0
  84. spanforge/namespaces/cache.py +237 -0
  85. spanforge/namespaces/chain.py +77 -0
  86. spanforge/namespaces/confidence.py +72 -0
  87. spanforge/namespaces/consent.py +92 -0
  88. spanforge/namespaces/cost.py +179 -0
  89. spanforge/namespaces/decision.py +143 -0
  90. spanforge/namespaces/diff.py +157 -0
  91. spanforge/namespaces/drift.py +80 -0
  92. spanforge/namespaces/eval_.py +251 -0
  93. spanforge/namespaces/feedback.py +241 -0
  94. spanforge/namespaces/fence.py +193 -0
  95. spanforge/namespaces/guard.py +105 -0
  96. spanforge/namespaces/hitl.py +91 -0
  97. spanforge/namespaces/latency.py +72 -0
  98. spanforge/namespaces/prompt.py +190 -0
  99. spanforge/namespaces/redact.py +173 -0
  100. spanforge/namespaces/retrieval.py +379 -0
  101. spanforge/namespaces/runtime_governance.py +494 -0
  102. spanforge/namespaces/template.py +208 -0
  103. spanforge/namespaces/tool_call.py +77 -0
  104. spanforge/namespaces/trace.py +1029 -0
  105. spanforge/normalizer.py +171 -0
  106. spanforge/plugins.py +82 -0
  107. spanforge/presidio_backend.py +349 -0
  108. spanforge/processor.py +258 -0
  109. spanforge/prompt_registry.py +418 -0
  110. spanforge/py.typed +0 -0
  111. spanforge/redact.py +914 -0
  112. spanforge/regression.py +192 -0
  113. spanforge/runtime_policy.py +159 -0
  114. spanforge/sampling.py +511 -0
  115. spanforge/schema.py +183 -0
  116. spanforge/schemas/v1.0/schema.json +170 -0
  117. spanforge/schemas/v2.0/schema.json +536 -0
  118. spanforge/sdk/__init__.py +625 -0
  119. spanforge/sdk/_base.py +584 -0
  120. spanforge/sdk/_base.pyi +71 -0
  121. spanforge/sdk/_exceptions.py +1096 -0
  122. spanforge/sdk/_types.py +2184 -0
  123. spanforge/sdk/alert.py +1514 -0
  124. spanforge/sdk/alert.pyi +56 -0
  125. spanforge/sdk/audit.py +1196 -0
  126. spanforge/sdk/audit.pyi +67 -0
  127. spanforge/sdk/cec.py +1215 -0
  128. spanforge/sdk/cec.pyi +37 -0
  129. spanforge/sdk/config.py +641 -0
  130. spanforge/sdk/config.pyi +55 -0
  131. spanforge/sdk/enterprise.py +714 -0
  132. spanforge/sdk/enterprise.pyi +79 -0
  133. spanforge/sdk/explain.py +170 -0
  134. spanforge/sdk/fallback.py +432 -0
  135. spanforge/sdk/feedback.py +351 -0
  136. spanforge/sdk/gate.py +874 -0
  137. spanforge/sdk/gate.pyi +51 -0
  138. spanforge/sdk/identity.py +2114 -0
  139. spanforge/sdk/identity.pyi +47 -0
  140. spanforge/sdk/lineage.py +175 -0
  141. spanforge/sdk/observe.py +1065 -0
  142. spanforge/sdk/observe.pyi +50 -0
  143. spanforge/sdk/operator.py +338 -0
  144. spanforge/sdk/pii.py +1473 -0
  145. spanforge/sdk/pii.pyi +119 -0
  146. spanforge/sdk/pipelines.py +458 -0
  147. spanforge/sdk/pipelines.pyi +39 -0
  148. spanforge/sdk/policy.py +930 -0
  149. spanforge/sdk/rag.py +594 -0
  150. spanforge/sdk/rbac.py +280 -0
  151. spanforge/sdk/registry.py +430 -0
  152. spanforge/sdk/registry.pyi +46 -0
  153. spanforge/sdk/scope.py +279 -0
  154. spanforge/sdk/secrets.py +293 -0
  155. spanforge/sdk/secrets.pyi +25 -0
  156. spanforge/sdk/security.py +560 -0
  157. spanforge/sdk/security.pyi +57 -0
  158. spanforge/sdk/trust.py +472 -0
  159. spanforge/sdk/trust.pyi +41 -0
  160. spanforge/secrets.py +799 -0
  161. spanforge/signing.py +1179 -0
  162. spanforge/stats.py +100 -0
  163. spanforge/stream.py +560 -0
  164. spanforge/testing.py +378 -0
  165. spanforge/testing_mocks.py +1052 -0
  166. spanforge/trace.py +199 -0
  167. spanforge/types.py +696 -0
  168. spanforge/ulid.py +300 -0
  169. spanforge/validate.py +379 -0
  170. spanforge-1.0.0.dist-info/METADATA +1509 -0
  171. spanforge-1.0.0.dist-info/RECORD +174 -0
  172. spanforge-1.0.0.dist-info/WHEEL +4 -0
  173. spanforge-1.0.0.dist-info/entry_points.txt +5 -0
  174. spanforge-1.0.0.dist-info/licenses/LICENSE +128 -0
spanforge/egress.py ADDED
@@ -0,0 +1,58 @@
1
+ """Egress enforcement for SpanForge export pipeline.
2
+
3
+ Provides a centralized guard that blocks network exports when the SDK is
4
+ configured in no-egress (air-gapped) mode. Exporters call
5
+ :func:`check_egress` before making any HTTP request.
6
+
7
+ Configuration
8
+ -------------
9
+ * ``no_egress=True`` on :class:`~spanforge.config.SpanForgeConfig` blocks
10
+ **all** outbound network traffic from SpanForge exporters.
11
+ * ``egress_allowlist`` is a ``frozenset[str]`` of URL **prefixes** that are
12
+ permitted even when ``no_egress`` is ``True``. For example::
13
+
14
+ configure(no_egress=True, egress_allowlist=frozenset(["https://internal-collector.corp.local/"]))
15
+
16
+ Raises :class:`~spanforge.exceptions.EgressViolationError` when a blocked
17
+ export is attempted.
18
+
19
+ Example::
20
+
21
+ from spanforge.egress import check_egress
22
+ check_egress("https://example.com/v1/traces", backend="otlp")
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ from spanforge.exceptions import EgressViolationError
28
+
29
# Public API of this module: the egress guard is the only exported name.
__all__ = ["check_egress"]
30
+
31
+
32
def check_egress(endpoint: str, backend: str = "unknown") -> None:
    """Guard an outbound export against the configured egress policy.

    Returns silently when ``no_egress`` is falsy on the active configuration,
    or when *endpoint* starts with one of the configured
    ``egress_allowlist`` prefixes.

    Args:
        endpoint: The URL the exporter is about to contact.
        backend: Exporter name forwarded to the raised error
            (e.g. ``"otlp"``).

    Raises:
        EgressViolationError: If ``no_egress`` is enabled and no allowlist
            prefix matches *endpoint*.
    """
    # Imported at call time, exactly as the module has always done.
    from spanforge.config import get_config

    policy = get_config()
    if not policy.no_egress:
        return

    # An empty / missing allowlist permits nothing.
    permitted = policy.egress_allowlist or ()
    if any(endpoint.startswith(allowed) for allowed in permitted):
        return

    raise EgressViolationError(backend=backend, endpoint=endpoint)
spanforge/eval.py ADDED
@@ -0,0 +1,648 @@
1
+ """spanforge.eval — Evaluation framework hooks for LLM / agent quality scoring.
2
+
3
+ This module provides lightweight instrumentation for attaching quality scores
4
+ to active spans and emitting them as RFC-0001 ``llm.eval.*`` events. It is
5
+ intentionally infrastructure-agnostic: scores can be produced by RAGAS,
6
+ DeepEval, custom rubric LLMs, or simple rule-based checks.
7
+
8
+ Quick start
9
+ -----------
10
+ ::
11
+
12
+ from spanforge import start_span
13
+ from spanforge.eval import record_eval_score, EvalScore
14
+
15
+ with start_span("rag_pipeline") as span:
16
+ answer = run_rag(query)
17
+ # Attach an evaluation score to the active span.
18
+ record_eval_score(
19
+ metric="faithfulness",
20
+ value=0.87,
21
+ span_id=span.span_id,
22
+ trace_id=span.trace_id,
23
+ label="pass",
24
+ metadata={"evaluator": "ragas", "version": "0.1.12"},
25
+ )
26
+
27
+ Batch evaluation
28
+ ----------------
29
+ Use :class:`EvalRunner` to run a set of :class:`EvalScorer` callables over a
30
+ list of example dicts and collect the results in an :class:`EvalReport`::
31
+
32
+ runner = EvalRunner(scorers=[FaithfulnessScorer(), RelevanceScorer()])
33
+ report = runner.run(dataset)
34
+ report.print_summary()
35
+
36
+ Regression detection
37
+ --------------------
38
+ :class:`RegressionDetector` detects when mean scores drop below a configurable
39
+ threshold relative to a saved baseline and emits
40
+ ``llm.eval.regression.detected`` events automatically.
41
+ """
42
+
43
+ from __future__ import annotations
44
+
45
+ import logging
46
+ import re
47
+ import time
48
+ from abc import ABC, abstractmethod
49
+ from dataclasses import dataclass, field
50
+ from typing import Any, Protocol, runtime_checkable
51
+
52
# Names exported by ``from spanforge.eval import *``.
__all__ = [
    "BehaviourScorer",
    "EvalReport",
    "EvalRunner",
    "EvalScore",
    "EvalScorer",
    "FaithfulnessScorer",
    "PIILeakageScorer",
    "RefusalDetectionScorer",
    "RegressionDetector",
    "record_eval_score",
]

# Module-level logger used for the warnings emitted by this module.
_log = logging.getLogger("spanforge.eval")
66
+
67
# H13 — span_id / trace_id format patterns (RFC-0001 §8.2)
# The patterns end in ``\Z`` rather than ``$``: ``$`` also matches just before
# a trailing newline, so ``"<16 hex chars>\n"`` would otherwise pass ``match()``.
_SPAN_ID_PAT: re.Pattern[str] = re.compile(r"^[0-9a-f]{16}\Z")
_TRACE_ID_PAT: re.Pattern[str] = re.compile(r"^[0-9a-f]{32}\Z")
70
+
71
+
72
+ # ---------------------------------------------------------------------------
73
+ # EvalScore dataclass
74
+ # ---------------------------------------------------------------------------
75
+
76
+
77
@dataclass
class EvalScore:
    """One quality measurement, optionally tied to a span and trace.

    Args:
        metric: Metric name (e.g. ``"faithfulness"``, ``"toxicity"``).
        value: Numeric score. Usually within ``[0.0, 1.0]``, but no range
            is enforced.
        span_id: Optional 16-hex-char ID of the parent span.
        trace_id: Optional 32-hex-char trace ID.
        label: Optional string label (``"pass"`` / ``"fail"`` / ``"warn"``).
        metadata: Optional free-form dict (evaluator version, etc.).
        timestamp: Unix timestamp in seconds; defaults to ``time.time()``
            at construction.
    """

    metric: str
    value: float
    span_id: str | None = None
    trace_id: str | None = None
    label: str | None = None
    metadata: dict[str, Any] | None = None
    timestamp: float = field(default_factory=time.time)

    def to_dict(self) -> dict[str, Any]:
        """Return a plain dict; optional fields that are ``None`` are omitted."""
        payload: dict[str, Any] = {
            "metric": self.metric,
            "value": self.value,
            "timestamp": self.timestamp,
        }
        # Optional fields follow the mandatory ones, in declaration order.
        for key in ("span_id", "trace_id", "label", "metadata"):
            current = getattr(self, key)
            if current is not None:
                payload[key] = current
        return payload

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> EvalScore:
        """Build an instance from a dict shaped like :meth:`to_dict` output.

        ``"metric"`` and ``"value"`` are required; a missing ``"timestamp"``
        falls back to the current time.
        """
        name = data["metric"]
        number = float(data["value"])
        extras = {
            key: data.get(key)
            for key in ("span_id", "trace_id", "label", "metadata")
        }
        stamp = float(data.get("timestamp", time.time()))
        return cls(metric=name, value=number, timestamp=stamp, **extras)
129
+
130
+
131
+ # ---------------------------------------------------------------------------
132
+ # record_eval_score — primary public function
133
+ # ---------------------------------------------------------------------------
134
+
135
+
136
def record_eval_score(
    metric: str,
    value: float,
    *,
    span_id: str | None = None,
    trace_id: str | None = None,
    label: str | None = None,
    metadata: dict[str, Any] | None = None,
) -> EvalScore:
    """Record an evaluation score and emit it as an RFC-0001 event.

    The score is emitted as a ``llm.eval.score.recorded`` event via
    ``emit_rfc_event``. Emission is best-effort: any exception raised while
    emitting is logged as a warning and the score is still returned.

    Args:
        metric: Name of the quality metric.
        value: Numeric score value.
        span_id: Optional parent span ID (exactly 16 lowercase hex chars).
        trace_id: Optional trace ID (exactly 32 lowercase hex chars).
        label: Optional human-readable label (``"pass"``/``"fail"``/etc.).
        metadata: Optional free-form dict with evaluator details.

    Returns:
        The :class:`EvalScore` that was recorded.

    Raises:
        ValueError: If *span_id* or *trace_id* is given but malformed.

    Example::

        score = record_eval_score("faithfulness", 0.92, span_id=span.span_id)
    """
    # H13: validate span_id / trace_id format at the boundary.
    # fullmatch() is required here: with match(), a ``$``-anchored pattern
    # also accepts an ID followed by a trailing newline.
    if span_id is not None and not _SPAN_ID_PAT.fullmatch(span_id):
        raise ValueError(f"span_id must be 16 lowercase hex chars, got {span_id!r}")
    if trace_id is not None and not _TRACE_ID_PAT.fullmatch(trace_id):
        raise ValueError(f"trace_id must be 32 lowercase hex chars, got {trace_id!r}")

    score = EvalScore(
        metric=metric,
        value=value,
        span_id=span_id,
        trace_id=trace_id,
        label=label,
        metadata=metadata,
    )
    try:
        # Imported lazily inside the try so an import failure is handled
        # like any other emission failure.
        from spanforge._stream import emit_rfc_event
        from spanforge.types import EventType

        emit_rfc_event(
            EventType.EVAL_SCORE_RECORDED,
            payload=score.to_dict(),
            span_id=span_id,
            trace_id=trace_id,
        )
    except Exception as exc:  # NOSONAR
        # Deliberate best-effort: scoring must not fail because export did.
        _log.warning("spanforge.eval: failed to emit eval score event: %s", exc)

    return score
194
+
195
+
196
+ # ---------------------------------------------------------------------------
197
+ # EvalScorer protocol
198
+ # ---------------------------------------------------------------------------
199
+
200
+
201
@runtime_checkable
class EvalScorer(Protocol):
    """Structural interface for scorers accepted by :class:`EvalRunner`.

    A conforming object exposes a ``metric_name`` attribute and a
    :meth:`score` method that receives a single example dict and returns an
    :class:`EvalScore`. No inheritance is required. Because the protocol is
    ``runtime_checkable``, ``isinstance(obj, EvalScorer)`` is supported, but
    such a check only verifies that the members exist, not their signatures.
    """

    @property
    def metric_name(self) -> str:
        """Unique name of this scorer's metric (e.g. ``"faithfulness"``)."""
        ...

    def score(self, example: dict[str, Any]) -> EvalScore:
        """Score a single example.

        Args:
            example: Dict containing at least ``"output"`` key; may also
                include ``"reference"``, ``"context"``, ``"span_id"``
                and ``"trace_id"`` for correlation.

        Returns:
            An :class:`EvalScore` with the metric value.
        """
        ...
226
+
227
+
228
+ # ---------------------------------------------------------------------------
229
+ # EvalReport
230
+ # ---------------------------------------------------------------------------
231
+
232
+
233
@dataclass
class EvalReport:
    """Collected scores from a scoring run, plus the dataset that produced them.

    Args:
        scores: Flat list of every :class:`EvalScore` produced.
        dataset: The examples the scores were computed from.
    """

    scores: list[EvalScore] = field(default_factory=list)
    dataset: list[dict[str, Any]] = field(default_factory=list)

    def summary(self) -> dict[str, float]:
        """Return the arithmetic mean of the recorded values per metric.

        Metrics appear in the order they are first seen in :attr:`scores`;
        an empty report yields an empty dict.
        """
        grouped: dict[str, list[float]] = {}
        for entry in self.scores:
            grouped.setdefault(entry.metric, []).append(entry.value)

        means: dict[str, float] = {}
        for name, values in grouped.items():
            means[name] = sum(values) / len(values)
        return means

    def print_summary(self) -> None:  # pragma: no cover
        """Print the per-metric means as a table, sorted by metric name."""
        averages = self.summary()
        rule = "-" * 53
        print(f"{'Metric':<40} {'Mean':>10}")
        print(rule)
        for metric, mean in sorted(averages.items()):
            print(f"{metric:<40} {mean:>10.4f}")
        print(rule)
        print(f"Total scores recorded: {len(self.scores)}")
263
+
264
+
265
+ # ---------------------------------------------------------------------------
266
+ # EvalRunner
267
+ # ---------------------------------------------------------------------------
268
+
269
+
270
class EvalRunner:
    """Apply a collection of scorers to every example in a dataset.

    Args:
        scorers: Scorers to apply to each example; copied into an internal
            list.
        emit: When ``True`` (default), each produced score is also passed to
            :func:`record_eval_score`. When ``False``, scores are only
            collected in the returned report.

    Example::

        class FaithfulnessScorer:
            metric_name = "faithfulness"

            def score(self, example):
                # run your faithfulness check here
                return EvalScore("faithfulness", value=..., span_id=example.get("span_id"))

        runner = EvalRunner(scorers=[FaithfulnessScorer()])
        report = runner.run([{"output": "Paris", "reference": "Paris is the capital."}])
        report.print_summary()
    """

    def __init__(
        self,
        scorers: list[EvalScorer] | None = None,
        *,
        emit: bool = True,
    ) -> None:
        # Always copy so later add_scorer() calls never touch the caller's list.
        self._scorers: list[Any] = list(scorers) if scorers else []
        self._emit = emit

    def add_scorer(self, scorer: EvalScorer) -> None:
        """Register an additional *scorer* after construction."""
        self._scorers.append(scorer)

    def _publish(self, score: Any) -> None:
        """Forward *score* to :func:`record_eval_score`, logging any failure."""
        try:
            record_eval_score(
                metric=score.metric,
                value=score.value,
                span_id=score.span_id,
                trace_id=score.trace_id,
                label=score.label,
                metadata=score.metadata,
            )
        except Exception as exc:  # NOSONAR
            _log.warning("EvalRunner: emit failed: %s", exc)

    def run(self, dataset: list[dict[str, Any]]) -> EvalReport:
        """Score every example in *dataset* with every registered scorer.

        A scorer that raises is logged and skipped for that example; a
        failure while emitting is logged, but the score is still kept.

        Args:
            dataset: Example dicts handed to each scorer's
                :meth:`~EvalScorer.score` method.

        Returns:
            An :class:`EvalReport` holding all collected scores and *dataset*.
        """
        collected: list[EvalScore] = []
        for example in dataset:
            for scorer in self._scorers:
                try:
                    result = scorer.score(example)
                except Exception as exc:  # NOSONAR
                    _log.warning(
                        "EvalRunner: scorer %r raised on example %r: %s",
                        getattr(scorer, "metric_name", type(scorer).__name__),
                        example,
                        exc,
                    )
                else:
                    if self._emit:
                        self._publish(result)
                    collected.append(result)
        return EvalReport(scores=collected, dataset=dataset)
343
+
344
+
345
+ # ---------------------------------------------------------------------------
346
+ # RegressionDetector
347
+ # ---------------------------------------------------------------------------
348
+
349
+
350
class RegressionDetector:
    """Flag metrics whose mean score has dropped relative to a baseline.

    A metric counts as regressed when its relative drop,
    ``(baseline - current) / baseline * 100``, is at least ``threshold_pct``.
    Each regression is logged as a warning and, when *emit* is true, sent as
    a ``llm.eval.regression.detected`` RFC-0001 event.

    Args:
        baseline: ``{metric: baseline_mean}`` dict; copied on construction.
            Entries can be changed later with :meth:`set_baseline`.
        threshold_pct: Percentage drop that triggers a regression.
            Default: ``5.0`` (5 % drop).
        emit: If ``True`` (default), regression events are emitted.

    Example::

        detector = RegressionDetector(baseline={"faithfulness": 0.90}, threshold_pct=5.0)
        detector.check(report)
    """

    def __init__(
        self,
        baseline: dict[str, float] | None = None,
        *,
        threshold_pct: float = 5.0,
        emit: bool = True,
    ) -> None:
        self._baseline: dict[str, float] = dict(baseline) if baseline else {}
        self._threshold_pct = threshold_pct
        self._emit = emit

    def set_baseline(self, metric: str, value: float) -> None:
        """Set (or overwrite) the baseline mean for *metric*."""
        self._baseline[metric] = value

    @staticmethod
    def _emit_regression(payload: dict[str, Any]) -> None:
        """Best-effort emission of one regression event; failures are logged."""
        try:
            from spanforge._stream import emit_rfc_event
            from spanforge.types import EventType

            emit_rfc_event(
                EventType.EVAL_REGRESSION_DETECTED,
                payload=payload,
            )
        except Exception as exc:  # NOSONAR
            _log.warning("spanforge.eval: failed to emit regression event: %s", exc)

    def check(self, report: EvalReport) -> list[dict[str, Any]]:
        """Compare the *report* summary against the stored baselines.

        Metrics without a baseline, or with a baseline ``<= 0``, are ignored.

        Returns:
            A possibly empty list of dicts with the keys ``metric``,
            ``baseline``, ``current`` and ``drop_pct`` (rounded to 4 places).
        """
        found: list[dict[str, Any]] = []
        for metric, current in report.summary().items():
            reference = self._baseline.get(metric)
            if reference is None or reference <= 0:
                continue

            drop_pct = (reference - current) / reference * 100
            # Written as "not >=" so NaN is treated exactly as before.
            if not drop_pct >= self._threshold_pct:
                continue

            entry = {
                "metric": metric,
                "baseline": reference,
                "current": current,
                "drop_pct": round(drop_pct, 4),
            }
            found.append(entry)
            _log.warning(
                "spanforge.eval: regression detected for metric=%r "
                "(baseline=%.4f current=%.4f drop=%.2f%%)",
                metric,
                reference,
                current,
                drop_pct,
            )
            if self._emit:
                self._emit_regression(entry)
        return found
425
+
426
+
427
+ # ---------------------------------------------------------------------------
428
+ # Built-in scorers
429
+ # ---------------------------------------------------------------------------
430
+
431
# Refusal phrases (case-insensitive) — common patterns indicating model refusal.
# Every entry is lowercase and is tested as a plain substring of the lowercased
# model output, so a short entry such as "i can't" already covers longer ones
# like "i can't help with".
_REFUSAL_PHRASES: tuple[str, ...] = (
    "i cannot",
    "i can't",
    "i'm not able to",
    "i am not able to",
    "i'm unable to",
    "i am unable to",
    "i must decline",
    "i must refuse",
    "as an ai",
    "as a language model",
    "i'm sorry, but i",
    "i apologize, but i",
    "i don't think i can",
    "it would be inappropriate",
    "i'm not allowed to",
    "i cannot assist with",
    "i can't help with",
    "i won't be able to",
    "sorry, i can't",
    "i refuse to",
)
454
+
455
+
456
class FaithfulnessScorer:
    """Score whether the output is faithful to the provided context.

    Measures token overlap between *output* and *context* as a proxy for
    factual grounding. Tokens are lowercase alphanumeric runs longer than
    two characters. The value is the fraction of distinct output tokens that
    also occur in the context (rounded to 4 places): 1.0 when all do, 0.0
    when none do. The label is ``"pass"`` for an overlap of at least 0.5,
    otherwise ``"fail"``.

    If ``"context"`` is missing, ``None`` or empty, or the output yields no
    tokens, the scorer returns 0.0 with label ``"skip"``.

    Example::

        scorer = FaithfulnessScorer()
        score = scorer.score({
            "output": "Paris is the capital of France.",
            "context": "France is a country in Europe. Its capital is Paris.",
        })
    """

    metric_name: str = "faithfulness"

    @staticmethod
    def _as_text(raw: Any) -> str:
        """Coerce a field to text, treating ``None`` as empty (not ``"None"``)."""
        return "" if raw is None else str(raw)

    @staticmethod
    def _tokens(text: str) -> set[str]:
        """Return the distinct lowercase alphanumeric tokens longer than 2 chars."""
        return {w for w in re.findall(r"[a-z0-9]+", text.lower()) if len(w) > 2}

    def _result(
        self,
        example: dict[str, Any],
        value: float,
        label: str,
        metadata: dict[str, Any] | None = None,
    ) -> EvalScore:
        """Build the score, copying the correlation IDs from *example*."""
        return EvalScore(
            metric=self.metric_name,
            value=value,
            span_id=example.get("span_id"),
            trace_id=example.get("trace_id"),
            label=label,
            metadata=metadata,
        )

    def score(self, example: dict[str, Any]) -> EvalScore:
        """Score output faithfulness against context.

        Args:
            example: Dict with ``"output"`` and ``"context"`` entries;
                ``"span_id"`` / ``"trace_id"`` are copied onto the result
                when present.

        Returns:
            An :class:`EvalScore` for the ``"faithfulness"`` metric.
        """
        # An explicit None must behave like a missing key; str(None) would
        # otherwise be scored as the literal text "None".
        output = self._as_text(example.get("output"))
        context = self._as_text(example.get("context"))

        if not context:
            return self._result(
                example, 0.0, "skip", {"reason": "no context provided"}
            )

        out_tokens = self._tokens(output)
        if not out_tokens:
            return self._result(example, 0.0, "skip", {"reason": "empty output"})

        overlap = len(out_tokens & self._tokens(context)) / len(out_tokens)
        label = "pass" if overlap >= 0.5 else "fail"
        return self._result(example, round(overlap, 4), label)
519
+
520
+
521
class RefusalDetectionScorer:
    """Detect whether the model output is a refusal / decline.

    Checks the lowercased output for any of the module's refusal phrases as
    a substring. Returns 1.0 with label ``"refusal"`` if one is found,
    otherwise 0.0 with label ``"pass"``.

    Typographic apostrophes are normalised to ``'`` before matching, so
    ``"I can’t help"`` is detected just like ``"I can't help"``.

    Example::

        scorer = RefusalDetectionScorer()
        score = scorer.score({"output": "I'm sorry, but I can't help with that."})
        assert score.value == 1.0
    """

    metric_name: str = "refusal_detection"

    # The phrase table uses ASCII apostrophes only; map the common Unicode
    # look-alikes (U+2019, U+02BC) onto it.
    _APOSTROPHES = str.maketrans({"\u2019": "'", "\u02bc": "'"})

    def score(self, example: dict[str, Any]) -> EvalScore:
        """Score whether the model output is a refusal.

        Args:
            example: Dict whose ``"output"`` entry is the model response;
                ``"span_id"`` / ``"trace_id"`` are copied onto the result
                when present.

        Returns:
            An :class:`EvalScore` for the ``"refusal_detection"`` metric.
        """
        output: str = (
            str(example.get("output", "")).lower().translate(self._APOSTROPHES)
        )

        detected = any(phrase in output for phrase in _REFUSAL_PHRASES)

        return EvalScore(
            metric=self.metric_name,
            value=1.0 if detected else 0.0,
            span_id=example.get("span_id"),
            trace_id=example.get("trace_id"),
            label="refusal" if detected else "pass",
        )
549
+
550
+
551
class PIILeakageScorer:
    """Flag model outputs that contain personally identifiable information.

    The ``"output"`` value is passed to
    :func:`~spanforge.redact.scan_payload`. The score is 1.0 (label
    ``"leak"``) when the scan result is not clean, and 0.0 (label
    ``"pass"``) otherwise.

    Example::

        scorer = PIILeakageScorer()
        score = scorer.score({"output": "Contact me at alice@example.com"})
        assert score.value == 1.0
    """

    metric_name: str = "pii_leakage"

    def score(self, example: dict[str, Any]) -> EvalScore:
        """Scan the example's output and report whether PII was found.

        On a leak, the metadata carries ``hit_count``, the number of hits
        reported by the scan.
        """
        from spanforge.redact import scan_payload

        text: str = str(example.get("output", ""))
        scan = scan_payload({"output": text})

        if scan.clean:
            verdict, tag, extra = 0.0, "pass", None
        else:
            verdict, tag, extra = 1.0, "leak", {"hit_count": len(scan.hits)}

        return EvalScore(
            metric=self.metric_name,
            value=verdict,
            span_id=example.get("span_id"),
            trace_id=example.get("trace_id"),
            label=tag,
            metadata=extra,
        )
584
+
585
+
586
+ # ---------------------------------------------------------------------------
587
+ # BehaviourScorer — ABC for named plug-in scorers
588
+ # ---------------------------------------------------------------------------
589
+
590
+
591
class BehaviourScorer(ABC):
    """Abstract base class for plug-in behaviour scorers.

    Unlike :class:`EvalScorer` (a :class:`~typing.Protocol` that accepts an
    arbitrary example dict), ``BehaviourScorer`` targets the *named
    test-case* workflow where a scorer receives a structured test case object
    and the raw model response string, returning a ``(score, reason)`` tuple.

    This is the contract expected by the ``spanforge.scorers`` entry-point
    group, allowing third-party scorers to be discovered and loaded
    automatically via :func:`spanforge.plugins.discover`.

    Subclasses must:

    * Set a unique class-level :attr:`name` string.
    * Implement :meth:`score`.

    Because :meth:`score` is abstract, this class cannot be instantiated
    directly; only subclasses that implement it can.

    The returned float must be in ``[0.0, 1.0]``; the string is a short
    human-readable reason suitable for CI log output.

    Example::

        from spanforge.eval import BehaviourScorer

        class ToxicityScorer(BehaviourScorer):
            name = "toxicity"

            def score(self, case, response: str) -> tuple[float, str]:
                # 1.0 = no toxicity, 0.0 = toxic
                if any(w in response.lower() for w in ("hate", "kill")):
                    return 0.0, "toxic content detected"
                return 1.0, "no toxicity detected"

    Registration in ``pyproject.toml``::

        [project.entry-points."spanforge.scorers"]
        toxicity = "my_package.scorers:ToxicityScorer"
    """

    #: Unique identifier for this scorer. Must be overridden in subclasses;
    #: the placeholder value here is ``"base"``.
    name: str = "base"

    @abstractmethod
    def score(self, case: Any, response: str) -> tuple[float, str]:
        """Score *response* for the given *case*.

        Args:
            case: The test case being evaluated. In the spanforge
                ecosystem this is typically a plain dict or a
                dataclass with ``id``, ``messages``, and ``scorers``
                attributes, but the exact type depends on the calling
                framework.
            response: The raw text returned by the model under test.

        Returns:
            ``(score, reason)`` where *score* is in ``[0.0, 1.0]`` and
            *reason* is a short explanation (one sentence).
        """