spanforge 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spanforge/__init__.py +815 -0
- spanforge/_ansi.py +93 -0
- spanforge/_batch_exporter.py +409 -0
- spanforge/_cli.py +2094 -0
- spanforge/_cli_audit.py +639 -0
- spanforge/_cli_compliance.py +711 -0
- spanforge/_cli_cost.py +243 -0
- spanforge/_cli_ops.py +791 -0
- spanforge/_cli_phase11.py +356 -0
- spanforge/_hooks.py +337 -0
- spanforge/_server.py +1708 -0
- spanforge/_span.py +1036 -0
- spanforge/_store.py +288 -0
- spanforge/_stream.py +664 -0
- spanforge/_trace.py +335 -0
- spanforge/_tracer.py +254 -0
- spanforge/actor.py +141 -0
- spanforge/alerts.py +469 -0
- spanforge/auto.py +464 -0
- spanforge/baseline.py +335 -0
- spanforge/cache.py +635 -0
- spanforge/compliance.py +325 -0
- spanforge/config.py +532 -0
- spanforge/consent.py +228 -0
- spanforge/consumer.py +377 -0
- spanforge/core/__init__.py +5 -0
- spanforge/core/compliance_mapping.py +1254 -0
- spanforge/cost.py +600 -0
- spanforge/debug.py +548 -0
- spanforge/deprecations.py +205 -0
- spanforge/drift.py +482 -0
- spanforge/egress.py +58 -0
- spanforge/eval.py +648 -0
- spanforge/event.py +1064 -0
- spanforge/exceptions.py +240 -0
- spanforge/explain.py +178 -0
- spanforge/export/__init__.py +69 -0
- spanforge/export/append_only.py +337 -0
- spanforge/export/cloud.py +357 -0
- spanforge/export/datadog.py +497 -0
- spanforge/export/grafana.py +320 -0
- spanforge/export/jsonl.py +195 -0
- spanforge/export/openinference.py +158 -0
- spanforge/export/otel_bridge.py +294 -0
- spanforge/export/otlp.py +811 -0
- spanforge/export/otlp_bridge.py +233 -0
- spanforge/export/redis_backend.py +282 -0
- spanforge/export/siem_schema.py +98 -0
- spanforge/export/siem_splunk.py +264 -0
- spanforge/export/siem_syslog.py +212 -0
- spanforge/export/webhook.py +299 -0
- spanforge/exporters/__init__.py +30 -0
- spanforge/exporters/console.py +271 -0
- spanforge/exporters/jsonl.py +144 -0
- spanforge/exporters/sqlite.py +142 -0
- spanforge/gate.py +1150 -0
- spanforge/governance.py +181 -0
- spanforge/hitl.py +295 -0
- spanforge/http.py +187 -0
- spanforge/inspect.py +427 -0
- spanforge/integrations/__init__.py +45 -0
- spanforge/integrations/_pricing.py +280 -0
- spanforge/integrations/anthropic.py +388 -0
- spanforge/integrations/azure_openai.py +133 -0
- spanforge/integrations/bedrock.py +292 -0
- spanforge/integrations/crewai.py +251 -0
- spanforge/integrations/gemini.py +351 -0
- spanforge/integrations/groq.py +442 -0
- spanforge/integrations/langchain.py +349 -0
- spanforge/integrations/langgraph.py +306 -0
- spanforge/integrations/llamaindex.py +373 -0
- spanforge/integrations/ollama.py +287 -0
- spanforge/integrations/openai.py +368 -0
- spanforge/integrations/together.py +483 -0
- spanforge/io.py +214 -0
- spanforge/lint.py +322 -0
- spanforge/metrics.py +417 -0
- spanforge/metrics_export.py +343 -0
- spanforge/migrate.py +402 -0
- spanforge/model_registry.py +278 -0
- spanforge/models.py +389 -0
- spanforge/namespaces/__init__.py +254 -0
- spanforge/namespaces/audit.py +256 -0
- spanforge/namespaces/cache.py +237 -0
- spanforge/namespaces/chain.py +77 -0
- spanforge/namespaces/confidence.py +72 -0
- spanforge/namespaces/consent.py +92 -0
- spanforge/namespaces/cost.py +179 -0
- spanforge/namespaces/decision.py +143 -0
- spanforge/namespaces/diff.py +157 -0
- spanforge/namespaces/drift.py +80 -0
- spanforge/namespaces/eval_.py +251 -0
- spanforge/namespaces/feedback.py +241 -0
- spanforge/namespaces/fence.py +193 -0
- spanforge/namespaces/guard.py +105 -0
- spanforge/namespaces/hitl.py +91 -0
- spanforge/namespaces/latency.py +72 -0
- spanforge/namespaces/prompt.py +190 -0
- spanforge/namespaces/redact.py +173 -0
- spanforge/namespaces/retrieval.py +379 -0
- spanforge/namespaces/runtime_governance.py +494 -0
- spanforge/namespaces/template.py +208 -0
- spanforge/namespaces/tool_call.py +77 -0
- spanforge/namespaces/trace.py +1029 -0
- spanforge/normalizer.py +171 -0
- spanforge/plugins.py +82 -0
- spanforge/presidio_backend.py +349 -0
- spanforge/processor.py +258 -0
- spanforge/prompt_registry.py +418 -0
- spanforge/py.typed +0 -0
- spanforge/redact.py +914 -0
- spanforge/regression.py +192 -0
- spanforge/runtime_policy.py +159 -0
- spanforge/sampling.py +511 -0
- spanforge/schema.py +183 -0
- spanforge/schemas/v1.0/schema.json +170 -0
- spanforge/schemas/v2.0/schema.json +536 -0
- spanforge/sdk/__init__.py +625 -0
- spanforge/sdk/_base.py +584 -0
- spanforge/sdk/_base.pyi +71 -0
- spanforge/sdk/_exceptions.py +1096 -0
- spanforge/sdk/_types.py +2184 -0
- spanforge/sdk/alert.py +1514 -0
- spanforge/sdk/alert.pyi +56 -0
- spanforge/sdk/audit.py +1196 -0
- spanforge/sdk/audit.pyi +67 -0
- spanforge/sdk/cec.py +1215 -0
- spanforge/sdk/cec.pyi +37 -0
- spanforge/sdk/config.py +641 -0
- spanforge/sdk/config.pyi +55 -0
- spanforge/sdk/enterprise.py +714 -0
- spanforge/sdk/enterprise.pyi +79 -0
- spanforge/sdk/explain.py +170 -0
- spanforge/sdk/fallback.py +432 -0
- spanforge/sdk/feedback.py +351 -0
- spanforge/sdk/gate.py +874 -0
- spanforge/sdk/gate.pyi +51 -0
- spanforge/sdk/identity.py +2114 -0
- spanforge/sdk/identity.pyi +47 -0
- spanforge/sdk/lineage.py +175 -0
- spanforge/sdk/observe.py +1065 -0
- spanforge/sdk/observe.pyi +50 -0
- spanforge/sdk/operator.py +338 -0
- spanforge/sdk/pii.py +1473 -0
- spanforge/sdk/pii.pyi +119 -0
- spanforge/sdk/pipelines.py +458 -0
- spanforge/sdk/pipelines.pyi +39 -0
- spanforge/sdk/policy.py +930 -0
- spanforge/sdk/rag.py +594 -0
- spanforge/sdk/rbac.py +280 -0
- spanforge/sdk/registry.py +430 -0
- spanforge/sdk/registry.pyi +46 -0
- spanforge/sdk/scope.py +279 -0
- spanforge/sdk/secrets.py +293 -0
- spanforge/sdk/secrets.pyi +25 -0
- spanforge/sdk/security.py +560 -0
- spanforge/sdk/security.pyi +57 -0
- spanforge/sdk/trust.py +472 -0
- spanforge/sdk/trust.pyi +41 -0
- spanforge/secrets.py +799 -0
- spanforge/signing.py +1179 -0
- spanforge/stats.py +100 -0
- spanforge/stream.py +560 -0
- spanforge/testing.py +378 -0
- spanforge/testing_mocks.py +1052 -0
- spanforge/trace.py +199 -0
- spanforge/types.py +696 -0
- spanforge/ulid.py +300 -0
- spanforge/validate.py +379 -0
- spanforge-1.0.0.dist-info/METADATA +1509 -0
- spanforge-1.0.0.dist-info/RECORD +174 -0
- spanforge-1.0.0.dist-info/WHEEL +4 -0
- spanforge-1.0.0.dist-info/entry_points.txt +5 -0
- spanforge-1.0.0.dist-info/licenses/LICENSE +128 -0
spanforge/egress.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Egress enforcement for SpanForge export pipeline.
|
|
2
|
+
|
|
3
|
+
Provides a centralized guard that blocks network exports when the SDK is
|
|
4
|
+
configured in no-egress (air-gapped) mode. Exporters call
|
|
5
|
+
:func:`check_egress` before making any HTTP request.
|
|
6
|
+
|
|
7
|
+
Configuration
|
|
8
|
+
-------------
|
|
9
|
+
* ``no_egress=True`` on :class:`~spanforge.config.SpanForgeConfig` blocks
|
|
10
|
+
**all** outbound network traffic from SpanForge exporters.
|
|
11
|
+
* ``egress_allowlist`` is a ``frozenset[str]`` of URL **prefixes** that are
|
|
12
|
+
permitted even when ``no_egress`` is ``True``. For example::
|
|
13
|
+
|
|
14
|
+
configure(no_egress=True, egress_allowlist=frozenset(["https://internal-collector.corp.local/"]))
|
|
15
|
+
|
|
16
|
+
Raises :class:`~spanforge.exceptions.EgressViolationError` when a blocked
|
|
17
|
+
export is attempted.
|
|
18
|
+
|
|
19
|
+
Example::
|
|
20
|
+
|
|
21
|
+
from spanforge.egress import check_egress
|
|
22
|
+
check_egress("https://example.com/v1/traces", backend="otlp")
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
from spanforge.exceptions import EgressViolationError
|
|
28
|
+
|
|
29
|
+
__all__ = ["check_egress"]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def check_egress(endpoint: str, backend: str = "unknown") -> None:
    """Enforce the no-egress policy for one outbound request.

    The active configuration is read through ``spanforge.config.get_config``.
    When its ``no_egress`` flag is falsy the call returns immediately.
    Otherwise *endpoint* is let through only if it starts with one of the
    entries of ``egress_allowlist``; any other endpoint raises.

    Args:
        endpoint: URL the exporter is about to contact.
        backend: Exporter name forwarded to the raised error
            (e.g. ``"otlp"``).

    Raises:
        EgressViolationError: If ``no_egress`` is enabled and *endpoint*
            starts with none of the allowlisted prefixes.
    """
    # Resolved at call time rather than at module import
    # (presumably to avoid an import cycle with spanforge.config — confirm).
    from spanforge.config import get_config

    settings = get_config()
    if not settings.no_egress:
        return

    # NOTE(review): this is a raw string-prefix comparison, no URL parsing.
    # An allowlist entry without a trailing "/" therefore also matches longer
    # host names that merely begin with the same characters.
    permitted = settings.egress_allowlist
    if permitted and any(endpoint.startswith(prefix) for prefix in permitted):
        return

    raise EgressViolationError(backend=backend, endpoint=endpoint)
|
spanforge/eval.py
ADDED
|
@@ -0,0 +1,648 @@
|
|
|
1
|
+
"""spanforge.eval — Evaluation framework hooks for LLM / agent quality scoring.
|
|
2
|
+
|
|
3
|
+
This module provides lightweight instrumentation for attaching quality scores
|
|
4
|
+
to active spans and emitting them as RFC-0001 ``llm.eval.*`` events. It is
|
|
5
|
+
intentionally infrastructure-agnostic: scores can be produced by RAGAS,
|
|
6
|
+
DeepEval, custom rubric LLMs, or simple rule-based checks.
|
|
7
|
+
|
|
8
|
+
Quick start
|
|
9
|
+
-----------
|
|
10
|
+
::
|
|
11
|
+
|
|
12
|
+
from spanforge import start_span
|
|
13
|
+
from spanforge.eval import record_eval_score, EvalScore
|
|
14
|
+
|
|
15
|
+
with start_span("rag_pipeline") as span:
|
|
16
|
+
answer = run_rag(query)
|
|
17
|
+
# Attach an evaluation score to the active span.
|
|
18
|
+
record_eval_score(
|
|
19
|
+
metric="faithfulness",
|
|
20
|
+
value=0.87,
|
|
21
|
+
span_id=span.span_id,
|
|
22
|
+
trace_id=span.trace_id,
|
|
23
|
+
label="pass",
|
|
24
|
+
metadata={"evaluator": "ragas", "version": "0.1.12"},
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
Batch evaluation
|
|
28
|
+
----------------
|
|
29
|
+
Use :class:`EvalRunner` to run a set of :class:`EvalScorer` callables over a
|
|
30
|
+
list of trace outputs and compare them against a baseline::
|
|
31
|
+
|
|
32
|
+
runner = EvalRunner(scorers=[FaithfulnessScorer(), RelevanceScorer()])
|
|
33
|
+
report = runner.run(dataset)
|
|
34
|
+
report.print_summary()
|
|
35
|
+
|
|
36
|
+
Regression detection
|
|
37
|
+
--------------------
|
|
38
|
+
:class:`RegressionDetector` detects when mean scores drop below a configurable
|
|
39
|
+
threshold relative to a saved baseline and emits
|
|
40
|
+
``llm.eval.regression.detected`` events automatically.
|
|
41
|
+
"""
|
|
42
|
+
|
|
43
|
+
from __future__ import annotations
|
|
44
|
+
|
|
45
|
+
import logging
|
|
46
|
+
import re
|
|
47
|
+
import time
|
|
48
|
+
from abc import ABC, abstractmethod
|
|
49
|
+
from dataclasses import dataclass, field
|
|
50
|
+
from typing import Any, Protocol, runtime_checkable
|
|
51
|
+
|
|
52
|
+
__all__ = [
|
|
53
|
+
"BehaviourScorer",
|
|
54
|
+
"EvalReport",
|
|
55
|
+
"EvalRunner",
|
|
56
|
+
"EvalScore",
|
|
57
|
+
"EvalScorer",
|
|
58
|
+
"FaithfulnessScorer",
|
|
59
|
+
"PIILeakageScorer",
|
|
60
|
+
"RefusalDetectionScorer",
|
|
61
|
+
"RegressionDetector",
|
|
62
|
+
"record_eval_score",
|
|
63
|
+
]
|
|
64
|
+
|
|
65
|
+
_log = logging.getLogger("spanforge.eval")

# H13 — span_id / trace_id format patterns (RFC-0001 §8.2)
# The end anchor is ``\Z`` rather than ``$``: ``$`` also matches just before a
# trailing newline, so ``"<16 hex chars>\n"`` would pass a ``.match()`` check.
_SPAN_ID_PAT: re.Pattern[str] = re.compile(r"^[0-9a-f]{16}\Z")
_TRACE_ID_PAT: re.Pattern[str] = re.compile(r"^[0-9a-f]{32}\Z")
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
# ---------------------------------------------------------------------------
|
|
73
|
+
# EvalScore dataclass
|
|
74
|
+
# ---------------------------------------------------------------------------
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
@dataclass
class EvalScore:
    """One quality measurement for a span or agent run.

    Args:
        metric: Metric name (e.g. ``"faithfulness"``, ``"toxicity"``).
        value: Numeric score.  Usually within ``[0.0, 1.0]``, but no range
            is enforced here.
        span_id: Optional ID of the span the score belongs to.
        trace_id: Optional ID of the enclosing trace.
        label: Optional verdict string (``"pass"`` / ``"fail"`` / ``"warn"``).
        metadata: Optional free-form details (evaluator version, etc.).
        timestamp: Unix time in seconds; defaults to ``time.time()`` at
            construction.
    """

    metric: str
    value: float
    span_id: str | None = None
    trace_id: str | None = None
    label: str | None = None
    metadata: dict[str, Any] | None = None
    timestamp: float = field(default_factory=time.time)

    def to_dict(self) -> dict[str, Any]:
        """Return the score as a plain dict, leaving out unset optionals.

        ``metric``, ``value`` and ``timestamp`` are always present; the
        optional fields appear only when they are not ``None``.
        """
        out: dict[str, Any] = {
            "metric": self.metric,
            "value": self.value,
            "timestamp": self.timestamp,
        }
        for key in ("span_id", "trace_id", "label", "metadata"):
            current = getattr(self, key)
            if current is not None:
                out[key] = current
        return out

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> EvalScore:
        """Build a score from a dict shaped like :meth:`to_dict` output.

        ``metric`` and ``value`` are required keys.  ``value`` and
        ``timestamp`` are coerced with ``float``; a missing ``timestamp``
        falls back to the current time.
        """
        metric = data["metric"]
        value = float(data["value"])
        optional = {
            key: data.get(key)
            for key in ("span_id", "trace_id", "label", "metadata")
        }
        stamp = float(data.get("timestamp", time.time()))
        return cls(metric=metric, value=value, timestamp=stamp, **optional)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
# ---------------------------------------------------------------------------
|
|
132
|
+
# record_eval_score — primary public function
|
|
133
|
+
# ---------------------------------------------------------------------------
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def record_eval_score(
    metric: str,
    value: float,
    *,
    span_id: str | None = None,
    trace_id: str | None = None,
    label: str | None = None,
    metadata: dict[str, Any] | None = None,
) -> EvalScore:
    """Build an :class:`EvalScore` and emit it as an RFC-0001 event.

    The score is sent as an ``EventType.EVAL_SCORE_RECORDED`` event through
    ``spanforge._stream.emit_rfc_event``.  Emission is best-effort: any
    exception raised while importing or emitting is logged as a warning and
    the score is still returned.

    Args:
        metric: Name of the quality metric.
        value: Numeric score value.
        span_id: Optional parent span ID (16 lowercase hex chars).
        trace_id: Optional trace ID (32 lowercase hex chars).
        label: Optional human-readable label (``"pass"``/``"fail"``/etc.).
        metadata: Optional free-form dict with evaluator details.

    Returns:
        The :class:`EvalScore` that was built (and, when possible, emitted).

    Raises:
        ValueError: If *span_id* or *trace_id* is given but does not match
            the expected format.

    Example::

        score = record_eval_score("faithfulness", 0.92, span_id=span.span_id)
    """
    # H13: reject malformed identifiers before anything is built or emitted.
    if span_id is not None and _SPAN_ID_PAT.match(span_id) is None:
        raise ValueError(f"span_id must be 16 lowercase hex chars, got {span_id!r}")
    if trace_id is not None and _TRACE_ID_PAT.match(trace_id) is None:
        raise ValueError(f"trace_id must be 32 lowercase hex chars, got {trace_id!r}")

    recorded = EvalScore(
        metric=metric,
        value=value,
        span_id=span_id,
        trace_id=trace_id,
        label=label,
        metadata=metadata,
    )

    # The imports live inside the try so a failure to import is handled the
    # same way as a failure to emit: logged, never raised to the caller.
    try:
        from spanforge._stream import emit_rfc_event
        from spanforge.types import EventType

        emit_rfc_event(
            EventType.EVAL_SCORE_RECORDED,
            payload=recorded.to_dict(),
            span_id=span_id,
            trace_id=trace_id,
        )
    except Exception as err:  # NOSONAR
        _log.warning("spanforge.eval: failed to emit eval score event: %s", err)

    return recorded
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
# ---------------------------------------------------------------------------
|
|
197
|
+
# EvalScorer protocol
|
|
198
|
+
# ---------------------------------------------------------------------------
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
@runtime_checkable
class EvalScorer(Protocol):
    """Structural interface for scorers accepted by :class:`EvalRunner`.

    Any object exposing a ``metric_name`` attribute and a ``score`` method
    satisfies the protocol; inheriting from this class is not required.
    Because the protocol is runtime-checkable, ``isinstance`` can be used to
    test for the presence of both members.
    """

    @property
    def metric_name(self) -> str:
        """Name of the metric this scorer produces (e.g. ``"faithfulness"``)."""
        ...

    def score(self, example: dict[str, Any]) -> EvalScore:
        """Evaluate one example.

        Args:
            example: Example dict.  It should carry at least an ``"output"``
                key and may also provide ``"reference"``, ``"context"``,
                ``"span_id"`` and ``"trace_id"`` for correlation.

        Returns:
            The :class:`EvalScore` computed for *example*.
        """
        ...
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
# ---------------------------------------------------------------------------
|
|
229
|
+
# EvalReport
|
|
230
|
+
# ---------------------------------------------------------------------------
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
@dataclass
class EvalReport:
    """Collected output of running scorers over a dataset.

    Args:
        scores: Every :class:`EvalScore` produced, in one flat list.
        dataset: The examples the scores were computed from.
    """

    scores: list[EvalScore] = field(default_factory=list)
    dataset: list[dict[str, Any]] = field(default_factory=list)

    def summary(self) -> dict[str, float]:
        """Return the arithmetic mean of the recorded values per metric.

        Metrics appear in the order in which they first occur in
        ``scores``; an empty report yields an empty dict.
        """
        grouped: dict[str, list[float]] = {}
        for entry in self.scores:
            grouped.setdefault(entry.metric, []).append(entry.value)
        return {name: sum(values) / len(values) for name, values in grouped.items()}

    def print_summary(self) -> None:  # pragma: no cover
        """Print the per-metric means as a table, sorted by metric name."""
        means = self.summary()
        rule = "-" * 53
        print(f"{'Metric':<40} {'Mean':>10}")
        print(rule)
        for metric, mean in sorted(means.items()):
            print(f"{metric:<40} {mean:>10.4f}")
        print(rule)
        print(f"Total scores recorded: {len(self.scores)}")
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
# ---------------------------------------------------------------------------
|
|
266
|
+
# EvalRunner
|
|
267
|
+
# ---------------------------------------------------------------------------
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
class EvalRunner:
    """Apply a set of scorers to every example of a dataset.

    Args:
        scorers: Scorers to apply to each example (see :class:`EvalScorer`).
        emit: When ``True`` (default) every produced score is also passed to
            :func:`record_eval_score`.  With ``False`` scores are only
            collected in-process.

    Example::

        class FaithfulnessScorer:
            metric_name = "faithfulness"

            def score(self, example):
                # run your faithfulness check here
                return EvalScore("faithfulness", value=..., span_id=example.get("span_id"))

        runner = EvalRunner(scorers=[FaithfulnessScorer()])
        report = runner.run([{"output": "Paris", "reference": "Paris is the capital."}])
        report.print_summary()
    """

    def __init__(
        self,
        scorers: list[EvalScorer] | None = None,
        *,
        emit: bool = True,
    ) -> None:
        # Copy the caller's list so later add_scorer() calls do not mutate it.
        self._scorers: list[Any] = list(scorers) if scorers else []
        self._emit = emit

    def add_scorer(self, scorer: EvalScorer) -> None:
        """Register one more scorer; it runs after those already present."""
        self._scorers.append(scorer)

    def run(self, dataset: list[dict[str, Any]]) -> EvalReport:
        """Run every scorer on every example.

        A scorer that raises is logged and skipped for that example; the
        remaining scorers and examples are still processed.

        Args:
            dataset: Example dicts handed to each scorer's
                :meth:`~EvalScorer.score` method.

        Returns:
            An :class:`EvalReport` holding the scores that were produced,
            ordered example by example and scorer by scorer.
        """
        collected: list[EvalScore] = []
        for example in dataset:
            for scorer in self._scorers:
                try:
                    result = scorer.score(example)
                except Exception as err:  # NOSONAR
                    _log.warning(
                        "EvalRunner: scorer %r raised on example %r: %s",
                        getattr(scorer, "metric_name", type(scorer).__name__),
                        example,
                        err,
                    )
                else:
                    if self._emit:
                        self._publish(result)
                    collected.append(result)
        return EvalReport(scores=collected, dataset=dataset)

    @staticmethod
    def _publish(result: EvalScore) -> None:
        """Forward *result* to :func:`record_eval_score`, logging any failure."""
        try:
            record_eval_score(
                metric=result.metric,
                value=result.value,
                span_id=result.span_id,
                trace_id=result.trace_id,
                label=result.label,
                metadata=result.metadata,
            )
        except Exception as err:  # NOSONAR
            _log.warning("EvalRunner: emit failed: %s", err)
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
# ---------------------------------------------------------------------------
|
|
346
|
+
# RegressionDetector
|
|
347
|
+
# ---------------------------------------------------------------------------
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
class RegressionDetector:
    """Flag metrics whose mean score has dropped relative to a baseline.

    For every metric in a report that has a positive baseline, the relative
    drop ``(baseline - current) / baseline * 100`` is computed.  A drop of at
    least ``threshold_pct`` is reported, logged as a warning and, unless
    disabled, emitted as an ``EventType.EVAL_REGRESSION_DETECTED`` event.

    Args:
        baseline: ``{metric: baseline_mean}`` dict.  Entries can also be set
            later through :meth:`set_baseline`.
        threshold_pct: Percentage drop that counts as a regression.
            Default: ``5.0`` (5 % drop).
        emit: If ``True`` (default), regression events are emitted.

    Example::

        detector = RegressionDetector(baseline={"faithfulness": 0.90}, threshold_pct=5.0)
        detector.check(report)
    """

    def __init__(
        self,
        baseline: dict[str, float] | None = None,
        *,
        threshold_pct: float = 5.0,
        emit: bool = True,
    ) -> None:
        # Private copy: the caller's dict is never mutated by set_baseline().
        self._baseline: dict[str, float] = dict(baseline) if baseline else {}
        self._threshold_pct = threshold_pct
        self._emit = emit

    def set_baseline(self, metric: str, value: float) -> None:
        """Set or replace the baseline mean used for *metric*."""
        self._baseline[metric] = value

    def check(self, report: EvalReport) -> list[dict[str, Any]]:
        """Compare the means in *report* with the stored baselines.

        Metrics without a baseline, or with a baseline ``<= 0``, are ignored.

        Returns:
            One dict per regressed metric (possibly none), each with the keys
            ``metric``, ``baseline``, ``current`` and ``drop_pct`` (rounded
            to four decimals).
        """
        found: list[dict[str, Any]] = []
        for metric, current in report.summary().items():
            reference = self._baseline.get(metric)
            if reference is None or reference <= 0:
                continue

            drop_pct = (reference - current) / reference * 100
            # Written as "not >=" on purpose: a NaN drop compares False and
            # must therefore not be reported.
            if not (drop_pct >= self._threshold_pct):
                continue

            entry = {
                "metric": metric,
                "baseline": reference,
                "current": current,
                "drop_pct": round(drop_pct, 4),
            }
            found.append(entry)
            _log.warning(
                "spanforge.eval: regression detected for metric=%r "
                "(baseline=%.4f current=%.4f drop=%.2f%%)",
                metric,
                reference,
                current,
                drop_pct,
            )
            if self._emit:
                self._emit_regression(entry)
        return found

    @staticmethod
    def _emit_regression(entry: dict[str, Any]) -> None:
        """Emit *entry* as a regression event; failures are only logged."""
        try:
            from spanforge._stream import emit_rfc_event
            from spanforge.types import EventType

            emit_rfc_event(
                EventType.EVAL_REGRESSION_DETECTED,
                payload=entry,
            )
        except Exception as err:  # NOSONAR
            _log.warning("spanforge.eval: failed to emit regression event: %s", err)
|
|
425
|
+
|
|
426
|
+
|
|
427
|
+
# ---------------------------------------------------------------------------
|
|
428
|
+
# Built-in scorers
|
|
429
|
+
# ---------------------------------------------------------------------------
|
|
430
|
+
|
|
431
|
+
# Lower-case substrings treated as evidence of a model refusal.  They are
# matched against the lower-cased output, so matching is case-insensitive.
_REFUSAL_PHRASES: tuple[str, ...] = (
    # direct statements of inability
    "i cannot", "i can't",
    "i'm not able to", "i am not able to",
    "i'm unable to", "i am unable to",
    # explicit declines
    "i must decline", "i must refuse",
    # self-identification preambles
    "as an ai", "as a language model",
    # apologies followed by a refusal
    "i'm sorry, but i", "i apologize, but i",
    # hedged or policy-based refusals
    "i don't think i can",
    "it would be inappropriate",
    "i'm not allowed to",
    "i cannot assist with", "i can't help with",
    "i won't be able to",
    "sorry, i can't",
    "i refuse to",
)
|
|
454
|
+
|
|
455
|
+
|
|
456
|
+
class FaithfulnessScorer:
    """Score how much of the output is grounded in the provided context.

    The score is the fraction of distinct output tokens that also occur in
    the context, where a token is a lower-cased alphanumeric run longer than
    two characters.  It is 1.0 when every such output token appears in the
    context and 0.0 when none do; the label is ``"pass"`` from 0.5 upwards
    and ``"fail"`` below.

    The scorer returns 0.0 with label ``"skip"`` when the example has no
    usable context (missing, ``None`` or empty) or when the output yields no
    tokens (missing, ``None``, empty or only very short words).

    Example::

        scorer = FaithfulnessScorer()
        score = scorer.score({
            "output": "Paris is the capital of France.",
            "context": "France is a country in Europe. Its capital is Paris.",
        })
    """

    metric_name: str = "faithfulness"

    @staticmethod
    def _as_text(raw: Any) -> str:
        """Coerce a field to text, mapping ``None`` to the empty string."""
        # str(None) would be the literal text "None", which would then be
        # tokenised and scored as if the model or retriever had produced it.
        return "" if raw is None else str(raw)

    @staticmethod
    def _tokens(text: str) -> set[str]:
        """Return the distinct lower-cased alphanumeric tokens longer than 2 chars."""
        # Only a length filter is applied; there is no stopword list.
        return {w for w in re.findall(r"[a-z0-9]+", text.lower()) if len(w) > 2}

    def _skip(self, example: dict[str, Any], reason: str) -> EvalScore:
        """Build the 0.0 / ``"skip"`` score used when nothing can be evaluated."""
        return EvalScore(
            metric=self.metric_name,
            value=0.0,
            span_id=example.get("span_id"),
            trace_id=example.get("trace_id"),
            label="skip",
            metadata={"reason": reason},
        )

    def score(self, example: dict[str, Any]) -> EvalScore:
        """Score output faithfulness against context.

        Args:
            example: Dict read for ``"output"``, ``"context"``, ``"span_id"``
                and ``"trace_id"``.  Non-string values are converted with
                ``str``; ``None`` counts as absent.

        Returns:
            An :class:`EvalScore` whose value is the token-overlap ratio
            rounded to four decimals, or a ``"skip"`` score (see class docs).
        """
        output = self._as_text(example.get("output"))
        context = self._as_text(example.get("context"))

        if not context:
            return self._skip(example, "no context provided")

        out_tokens = self._tokens(output)
        if not out_tokens:
            return self._skip(example, "empty output")

        ctx_tokens = self._tokens(context)
        overlap = len(out_tokens & ctx_tokens) / len(out_tokens)

        return EvalScore(
            metric=self.metric_name,
            value=round(overlap, 4),
            span_id=example.get("span_id"),
            trace_id=example.get("trace_id"),
            label="pass" if overlap >= 0.5 else "fail",
        )
|
|
519
|
+
|
|
520
|
+
|
|
521
|
+
class RefusalDetectionScorer:
    """Detect whether the model output is a refusal / decline.

    The lower-cased output is searched for any of the substrings in
    ``_REFUSAL_PHRASES``.  The value is 1.0 (label ``"refusal"``) when one is
    found and 0.0 (label ``"pass"``) otherwise.

    Typographic apostrophes are folded to the ASCII ``'`` before matching,
    so ``"I can’t help"`` is recognised just like ``"I can't help"``.

    Example::

        scorer = RefusalDetectionScorer()
        score = scorer.score({"output": "I'm sorry, but I can't help with that."})
        assert score.value == 1.0
    """

    metric_name: str = "refusal_detection"

    # Translation table for str.translate: the phrase list is written with
    # ASCII apostrophes, while generated text often uses U+2019 and friends.
    _APOSTROPHES: dict[int, str] = {0x2018: "'", 0x2019: "'", 0x02BC: "'"}

    def score(self, example: dict[str, Any]) -> EvalScore:
        """Score whether the model output is a refusal.

        Args:
            example: Dict read for ``"output"``, ``"span_id"`` and
                ``"trace_id"``.  A missing output is treated as empty text.

        Returns:
            An :class:`EvalScore` with value 1.0 / label ``"refusal"`` when a
            refusal phrase occurs in the output, else 0.0 / ``"pass"``.
        """
        output: str = str(example.get("output", "")).lower().translate(self._APOSTROPHES)

        detected = any(phrase in output for phrase in _REFUSAL_PHRASES)

        return EvalScore(
            metric=self.metric_name,
            value=1.0 if detected else 0.0,
            span_id=example.get("span_id"),
            trace_id=example.get("trace_id"),
            label="refusal" if detected else "pass",
        )
|
|
549
|
+
|
|
550
|
+
|
|
551
|
+
class PIILeakageScorer:
    """Flag model outputs that contain PII.

    The ``"output"`` text is wrapped in a one-key dict and handed to
    :func:`~spanforge.redact.scan_payload`.  A result that is not ``clean``
    counts as leakage: value 1.0 and label ``"leak"``, with the number of
    hits in the metadata.  A clean result gives 0.0 and label ``"pass"``.

    Example::

        scorer = PIILeakageScorer()
        score = scorer.score({"output": "Contact me at alice@example.com"})
        assert score.value == 1.0
    """

    metric_name: str = "pii_leakage"

    def score(self, example: dict[str, Any]) -> EvalScore:
        """Scan the output of *example* for PII and score the outcome.

        Args:
            example: Dict read for ``"output"``, ``"span_id"`` and
                ``"trace_id"``.  A missing output is scanned as empty text.

        Returns:
            An :class:`EvalScore` with 1.0 / ``"leak"`` when the scan is not
            clean, otherwise 0.0 / ``"pass"``.
        """
        # Resolved at call time, so importing this module does not import
        # the redaction module.
        from spanforge.redact import scan_payload

        text = str(example.get("output", ""))
        scan = scan_payload({"output": text})

        if scan.clean:
            verdict, tag, details = 0.0, "pass", None
        else:
            verdict, tag, details = 1.0, "leak", {"hit_count": len(scan.hits)}

        return EvalScore(
            metric=self.metric_name,
            value=verdict,
            span_id=example.get("span_id"),
            trace_id=example.get("trace_id"),
            label=tag,
            metadata=details,
        )
|
|
584
|
+
|
|
585
|
+
|
|
586
|
+
# ---------------------------------------------------------------------------
|
|
587
|
+
# BehaviourScorer — ABC for named plug-in scorers
|
|
588
|
+
# ---------------------------------------------------------------------------
|
|
589
|
+
|
|
590
|
+
|
|
591
|
+
class BehaviourScorer(ABC):
    """Base class for named, plug-in behaviour scorers.

    :class:`EvalScorer` is a :class:`~typing.Protocol` whose ``score`` takes
    an arbitrary example dict.  ``BehaviourScorer`` instead serves the
    *named test-case* workflow: ``score`` receives a structured test case
    together with the raw model response and returns a ``(score, reason)``
    tuple.

    This is the contract of the ``spanforge.scorers`` entry-point group,
    through which third-party scorers can be discovered and loaded via
    :func:`spanforge.plugins.discover`.

    A subclass has to:

    * define its own class-level :attr:`name`, unique among scorers;
    * implement :meth:`score`.

    The returned float is expected to lie in ``[0.0, 1.0]`` and the string
    to be a short, human-readable reason fit for CI log output.  Neither is
    checked by this base class.

    Example::

        from spanforge.eval import BehaviourScorer

        class ToxicityScorer(BehaviourScorer):
            name = "toxicity"

            def score(self, case, response: str) -> tuple[float, str]:
                # 1.0 = no toxicity, 0.0 = toxic
                if any(w in response.lower() for w in ("hate", "kill")):
                    return 0.0, "toxic content detected"
                return 1.0, "no toxicity detected"

    Registration in ``pyproject.toml``::

        [project.entry-points."spanforge.scorers"]
        toxicity = "my_package.scorers:ToxicityScorer"
    """

    #: Identifier of the scorer; subclasses are expected to override it.
    name: str = "base"

    @abstractmethod
    def score(self, case: Any, response: str) -> tuple[float, str]:
        """Evaluate *response* against *case*.

        Args:
            case: Test case under evaluation.  Within the spanforge
                ecosystem this is usually a plain dict or a dataclass
                exposing ``id``, ``messages`` and ``scorers``; the concrete
                type is decided by the calling framework.
            response: Raw text produced by the model under test.

        Returns:
            A ``(score, reason)`` pair: *score* within ``[0.0, 1.0]`` and
            *reason* a one-sentence explanation.
        """
|