spanforge 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spanforge/__init__.py +815 -0
- spanforge/_ansi.py +93 -0
- spanforge/_batch_exporter.py +409 -0
- spanforge/_cli.py +2094 -0
- spanforge/_cli_audit.py +639 -0
- spanforge/_cli_compliance.py +711 -0
- spanforge/_cli_cost.py +243 -0
- spanforge/_cli_ops.py +791 -0
- spanforge/_cli_phase11.py +356 -0
- spanforge/_hooks.py +337 -0
- spanforge/_server.py +1708 -0
- spanforge/_span.py +1036 -0
- spanforge/_store.py +288 -0
- spanforge/_stream.py +664 -0
- spanforge/_trace.py +335 -0
- spanforge/_tracer.py +254 -0
- spanforge/actor.py +141 -0
- spanforge/alerts.py +469 -0
- spanforge/auto.py +464 -0
- spanforge/baseline.py +335 -0
- spanforge/cache.py +635 -0
- spanforge/compliance.py +325 -0
- spanforge/config.py +532 -0
- spanforge/consent.py +228 -0
- spanforge/consumer.py +377 -0
- spanforge/core/__init__.py +5 -0
- spanforge/core/compliance_mapping.py +1254 -0
- spanforge/cost.py +600 -0
- spanforge/debug.py +548 -0
- spanforge/deprecations.py +205 -0
- spanforge/drift.py +482 -0
- spanforge/egress.py +58 -0
- spanforge/eval.py +648 -0
- spanforge/event.py +1064 -0
- spanforge/exceptions.py +240 -0
- spanforge/explain.py +178 -0
- spanforge/export/__init__.py +69 -0
- spanforge/export/append_only.py +337 -0
- spanforge/export/cloud.py +357 -0
- spanforge/export/datadog.py +497 -0
- spanforge/export/grafana.py +320 -0
- spanforge/export/jsonl.py +195 -0
- spanforge/export/openinference.py +158 -0
- spanforge/export/otel_bridge.py +294 -0
- spanforge/export/otlp.py +811 -0
- spanforge/export/otlp_bridge.py +233 -0
- spanforge/export/redis_backend.py +282 -0
- spanforge/export/siem_schema.py +98 -0
- spanforge/export/siem_splunk.py +264 -0
- spanforge/export/siem_syslog.py +212 -0
- spanforge/export/webhook.py +299 -0
- spanforge/exporters/__init__.py +30 -0
- spanforge/exporters/console.py +271 -0
- spanforge/exporters/jsonl.py +144 -0
- spanforge/exporters/sqlite.py +142 -0
- spanforge/gate.py +1150 -0
- spanforge/governance.py +181 -0
- spanforge/hitl.py +295 -0
- spanforge/http.py +187 -0
- spanforge/inspect.py +427 -0
- spanforge/integrations/__init__.py +45 -0
- spanforge/integrations/_pricing.py +280 -0
- spanforge/integrations/anthropic.py +388 -0
- spanforge/integrations/azure_openai.py +133 -0
- spanforge/integrations/bedrock.py +292 -0
- spanforge/integrations/crewai.py +251 -0
- spanforge/integrations/gemini.py +351 -0
- spanforge/integrations/groq.py +442 -0
- spanforge/integrations/langchain.py +349 -0
- spanforge/integrations/langgraph.py +306 -0
- spanforge/integrations/llamaindex.py +373 -0
- spanforge/integrations/ollama.py +287 -0
- spanforge/integrations/openai.py +368 -0
- spanforge/integrations/together.py +483 -0
- spanforge/io.py +214 -0
- spanforge/lint.py +322 -0
- spanforge/metrics.py +417 -0
- spanforge/metrics_export.py +343 -0
- spanforge/migrate.py +402 -0
- spanforge/model_registry.py +278 -0
- spanforge/models.py +389 -0
- spanforge/namespaces/__init__.py +254 -0
- spanforge/namespaces/audit.py +256 -0
- spanforge/namespaces/cache.py +237 -0
- spanforge/namespaces/chain.py +77 -0
- spanforge/namespaces/confidence.py +72 -0
- spanforge/namespaces/consent.py +92 -0
- spanforge/namespaces/cost.py +179 -0
- spanforge/namespaces/decision.py +143 -0
- spanforge/namespaces/diff.py +157 -0
- spanforge/namespaces/drift.py +80 -0
- spanforge/namespaces/eval_.py +251 -0
- spanforge/namespaces/feedback.py +241 -0
- spanforge/namespaces/fence.py +193 -0
- spanforge/namespaces/guard.py +105 -0
- spanforge/namespaces/hitl.py +91 -0
- spanforge/namespaces/latency.py +72 -0
- spanforge/namespaces/prompt.py +190 -0
- spanforge/namespaces/redact.py +173 -0
- spanforge/namespaces/retrieval.py +379 -0
- spanforge/namespaces/runtime_governance.py +494 -0
- spanforge/namespaces/template.py +208 -0
- spanforge/namespaces/tool_call.py +77 -0
- spanforge/namespaces/trace.py +1029 -0
- spanforge/normalizer.py +171 -0
- spanforge/plugins.py +82 -0
- spanforge/presidio_backend.py +349 -0
- spanforge/processor.py +258 -0
- spanforge/prompt_registry.py +418 -0
- spanforge/py.typed +0 -0
- spanforge/redact.py +914 -0
- spanforge/regression.py +192 -0
- spanforge/runtime_policy.py +159 -0
- spanforge/sampling.py +511 -0
- spanforge/schema.py +183 -0
- spanforge/schemas/v1.0/schema.json +170 -0
- spanforge/schemas/v2.0/schema.json +536 -0
- spanforge/sdk/__init__.py +625 -0
- spanforge/sdk/_base.py +584 -0
- spanforge/sdk/_base.pyi +71 -0
- spanforge/sdk/_exceptions.py +1096 -0
- spanforge/sdk/_types.py +2184 -0
- spanforge/sdk/alert.py +1514 -0
- spanforge/sdk/alert.pyi +56 -0
- spanforge/sdk/audit.py +1196 -0
- spanforge/sdk/audit.pyi +67 -0
- spanforge/sdk/cec.py +1215 -0
- spanforge/sdk/cec.pyi +37 -0
- spanforge/sdk/config.py +641 -0
- spanforge/sdk/config.pyi +55 -0
- spanforge/sdk/enterprise.py +714 -0
- spanforge/sdk/enterprise.pyi +79 -0
- spanforge/sdk/explain.py +170 -0
- spanforge/sdk/fallback.py +432 -0
- spanforge/sdk/feedback.py +351 -0
- spanforge/sdk/gate.py +874 -0
- spanforge/sdk/gate.pyi +51 -0
- spanforge/sdk/identity.py +2114 -0
- spanforge/sdk/identity.pyi +47 -0
- spanforge/sdk/lineage.py +175 -0
- spanforge/sdk/observe.py +1065 -0
- spanforge/sdk/observe.pyi +50 -0
- spanforge/sdk/operator.py +338 -0
- spanforge/sdk/pii.py +1473 -0
- spanforge/sdk/pii.pyi +119 -0
- spanforge/sdk/pipelines.py +458 -0
- spanforge/sdk/pipelines.pyi +39 -0
- spanforge/sdk/policy.py +930 -0
- spanforge/sdk/rag.py +594 -0
- spanforge/sdk/rbac.py +280 -0
- spanforge/sdk/registry.py +430 -0
- spanforge/sdk/registry.pyi +46 -0
- spanforge/sdk/scope.py +279 -0
- spanforge/sdk/secrets.py +293 -0
- spanforge/sdk/secrets.pyi +25 -0
- spanforge/sdk/security.py +560 -0
- spanforge/sdk/security.pyi +57 -0
- spanforge/sdk/trust.py +472 -0
- spanforge/sdk/trust.pyi +41 -0
- spanforge/secrets.py +799 -0
- spanforge/signing.py +1179 -0
- spanforge/stats.py +100 -0
- spanforge/stream.py +560 -0
- spanforge/testing.py +378 -0
- spanforge/testing_mocks.py +1052 -0
- spanforge/trace.py +199 -0
- spanforge/types.py +696 -0
- spanforge/ulid.py +300 -0
- spanforge/validate.py +379 -0
- spanforge-1.0.0.dist-info/METADATA +1509 -0
- spanforge-1.0.0.dist-info/RECORD +174 -0
- spanforge-1.0.0.dist-info/WHEEL +4 -0
- spanforge-1.0.0.dist-info/entry_points.txt +5 -0
- spanforge-1.0.0.dist-info/licenses/LICENSE +128 -0
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
"""spanforge.namespaces.eval_ — Evaluation payload types (RFC-0001).
|
|
2
|
+
|
|
3
|
+
Classes
|
|
4
|
+
-------
|
|
5
|
+
EvalScoreRecordedPayload llm.eval.score.recorded
|
|
6
|
+
EvalRegressionDetectedPayload llm.eval.regression.detected
|
|
7
|
+
EvalScenarioStartedPayload llm.eval.scenario.started
|
|
8
|
+
EvalScenarioCompletedPayload llm.eval.scenario.completed
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
from spanforge.namespaces.trace import ModelInfo
|
|
17
|
+
|
|
18
|
+
__all__ = [
|
|
19
|
+
"EvalRegressionDetectedPayload",
|
|
20
|
+
"EvalScenarioCompletedPayload",
|
|
21
|
+
"EvalScenarioStartedPayload",
|
|
22
|
+
"EvalScoreRecordedPayload",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
# Closed set of values accepted for EvalRegressionDetectedPayload.severity.
_VALID_SEVERITIES = frozenset({"low", "medium", "high", "critical"})
|
|
26
|
+
# Closed set of values accepted for EvalScenarioCompletedPayload.status.
_VALID_STATUSES = frozenset({"passed", "failed", "error", "cancelled"})
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class EvalScoreRecordedPayload:
    """RFC-0001 — A single evaluation score recorded for a subject event.

    ``evaluator``, ``metric_name`` and ``score`` are required.  Every other
    field is optional and is left out of :meth:`to_dict` output while it is
    ``None``.
    """

    evaluator: str
    metric_name: str
    score: float
    score_min: float | None = None
    score_max: float | None = None
    threshold: float | None = None
    passed: bool | None = None
    subject_event_id: str | None = None
    subject_type: str | None = None
    eval_run_id: str | None = None
    rationale: str | None = None
    model: ModelInfo | None = None  # judge model

    def __post_init__(self) -> None:
        """Validate the identifying string fields.

        Raises:
            ValueError: If *evaluator* or *metric_name* is not a non-empty
                ``str``.
        """
        if not isinstance(self.evaluator, str) or not self.evaluator:
            raise ValueError("EvalScoreRecordedPayload.evaluator must be non-empty")
        if not isinstance(self.metric_name, str) or not self.metric_name:
            raise ValueError("EvalScoreRecordedPayload.metric_name must be non-empty")

    def to_dict(self) -> dict[str, Any]:
        """Serialise the payload to a plain ``dict``.

        Required fields are always present; optional fields appear only when
        they are not ``None``.  ``model`` is serialised through its own
        ``to_dict()``.
        """
        d: dict[str, Any] = {
            "evaluator": self.evaluator,
            "metric_name": self.metric_name,
            "score": self.score,
        }
        for name in (
            "score_min",
            "score_max",
            "threshold",
            "passed",
            "subject_event_id",
            "subject_type",
            "eval_run_id",
            "rationale",
        ):
            value = getattr(self, name)
            if value is not None:
                d[name] = value
        if self.model is not None:
            d["model"] = self.model.to_dict()
        return d

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> EvalScoreRecordedPayload:
        """Deserialise from a plain ``dict``.

        A key that is missing and a key whose value is ``None`` (JSON
        ``null``) are treated the same way: the field becomes ``None``.

        Raises:
            KeyError: If ``evaluator``, ``metric_name`` or ``score`` is
                missing.
            TypeError, ValueError: If a numeric field cannot be converted
                with ``float()``, or if validation in ``__post_init__``
                fails.
        """

        def _opt_float(key: str) -> float | None:
            # Explicit null must not reach float(), which would raise TypeError.
            raw = data.get(key)
            return None if raw is None else float(raw)

        passed = data.get("passed")
        model = data.get("model")
        return cls(
            evaluator=data["evaluator"],
            metric_name=data["metric_name"],
            score=float(data["score"]),
            score_min=_opt_float("score_min"),
            score_max=_opt_float("score_max"),
            threshold=_opt_float("threshold"),
            # bool(None) would silently turn "unknown" into False.
            passed=None if passed is None else bool(passed),
            subject_event_id=data.get("subject_event_id"),
            subject_type=data.get("subject_type"),
            eval_run_id=data.get("eval_run_id"),
            rationale=data.get("rationale"),
            model=None if model is None else ModelInfo.from_dict(model),
        )
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
@dataclass
class EvalRegressionDetectedPayload:
    """RFC-0001 — A metric regression detected between baseline and current.

    The five leading fields are required.  ``severity``, ``affected_model``,
    ``eval_run_id`` and ``sample_count`` are optional and are omitted from
    :meth:`to_dict` output while they are ``None``.
    """

    metric_name: str
    baseline_score: float
    current_score: float
    delta: float
    regression_pct: float
    severity: str | None = None  # "low"|"medium"|"high"|"critical"
    affected_model: ModelInfo | None = None
    eval_run_id: str | None = None
    sample_count: int | None = None

    def __post_init__(self) -> None:
        """Validate ``metric_name`` and, when given, ``severity``.

        Raises:
            ValueError: If *metric_name* is not a non-empty ``str`` or
                *severity* is set to a value outside ``_VALID_SEVERITIES``.
        """
        if not isinstance(self.metric_name, str) or not self.metric_name:
            raise ValueError("EvalRegressionDetectedPayload.metric_name must be non-empty")
        if self.severity is not None and self.severity not in _VALID_SEVERITIES:
            raise ValueError(
                f"EvalRegressionDetectedPayload.severity must be one of {sorted(_VALID_SEVERITIES)}"
            )

    def to_dict(self) -> dict[str, Any]:
        """Serialise the payload to a plain ``dict``.

        Optional fields appear only when they are not ``None``;
        ``affected_model`` is serialised through its own ``to_dict()``.
        """
        d: dict[str, Any] = {
            "metric_name": self.metric_name,
            "baseline_score": self.baseline_score,
            "current_score": self.current_score,
            "delta": self.delta,
            "regression_pct": self.regression_pct,
        }
        if self.severity is not None:
            d["severity"] = self.severity
        if self.affected_model is not None:
            d["affected_model"] = self.affected_model.to_dict()
        if self.eval_run_id is not None:
            d["eval_run_id"] = self.eval_run_id
        if self.sample_count is not None:
            d["sample_count"] = self.sample_count
        return d

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> EvalRegressionDetectedPayload:
        """Deserialise from a plain ``dict``.

        A missing optional key and an explicit ``None`` (JSON ``null``) are
        treated the same way: the field becomes ``None``.

        Raises:
            KeyError: If any of the five required keys is missing.
            TypeError, ValueError: If a numeric field cannot be converted,
                or if validation in ``__post_init__`` fails.
        """
        affected_model = data.get("affected_model")
        sample_count = data.get("sample_count")
        return cls(
            metric_name=data["metric_name"],
            baseline_score=float(data["baseline_score"]),
            current_score=float(data["current_score"]),
            delta=float(data["delta"]),
            regression_pct=float(data["regression_pct"]),
            severity=data.get("severity"),
            # Guard on the value, not key presence, so nulls never reach
            # ModelInfo.from_dict() / int().
            affected_model=(
                None if affected_model is None else ModelInfo.from_dict(affected_model)
            ),
            eval_run_id=data.get("eval_run_id"),
            sample_count=None if sample_count is None else int(sample_count),
        )
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
@dataclass
class EvalScenarioStartedPayload:
    """RFC-0001 — An evaluation scenario has started.

    ``scenario_id``, ``scenario_name`` and ``evaluator`` are required and
    must be truthy.  ``metrics`` defaults to a fresh empty list per instance.
    """

    scenario_id: str
    scenario_name: str
    evaluator: str
    dataset_id: str | None = None
    expected_sample_count: int | None = None
    metrics: list[str] = field(default_factory=list)

    def __post_init__(self) -> None:
        """Reject empty required identifiers.

        Raises:
            ValueError: If *scenario_id*, *scenario_name* or *evaluator* is
                falsy.
        """
        if not self.scenario_id:
            raise ValueError("EvalScenarioStartedPayload.scenario_id must be non-empty")
        if not self.scenario_name:
            raise ValueError("EvalScenarioStartedPayload.scenario_name must be non-empty")
        if not self.evaluator:
            raise ValueError("EvalScenarioStartedPayload.evaluator must be non-empty")

    def to_dict(self) -> dict[str, Any]:
        """Serialise the payload to a plain ``dict``.

        ``dataset_id`` and ``expected_sample_count`` are emitted only when
        not ``None``; ``metrics`` is emitted (as a copy) only when non-empty.
        """
        d: dict[str, Any] = {
            "scenario_id": self.scenario_id,
            "scenario_name": self.scenario_name,
            "evaluator": self.evaluator,
        }
        if self.dataset_id is not None:
            d["dataset_id"] = self.dataset_id
        if self.expected_sample_count is not None:
            d["expected_sample_count"] = self.expected_sample_count
        if self.metrics:
            d["metrics"] = list(self.metrics)
        return d

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> EvalScenarioStartedPayload:
        """Deserialise from a plain ``dict``.

        A missing optional key and an explicit ``None`` (JSON ``null``) are
        treated the same way: ``expected_sample_count`` becomes ``None`` and
        ``metrics`` becomes an empty list.

        Raises:
            KeyError: If a required key is missing.
            TypeError, ValueError: If ``expected_sample_count`` cannot be
                converted with ``int()``, or if validation fails.
        """
        expected = data.get("expected_sample_count")
        metrics = data.get("metrics")
        return cls(
            scenario_id=data["scenario_id"],
            scenario_name=data["scenario_name"],
            evaluator=data["evaluator"],
            dataset_id=data.get("dataset_id"),
            expected_sample_count=None if expected is None else int(expected),
            # list(None) would raise TypeError; null means "no metrics".
            metrics=[] if metrics is None else list(metrics),
        )
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
@dataclass
class EvalScenarioCompletedPayload:
    """RFC-0001 — An evaluation scenario has completed.

    ``scenario_id``, ``status`` and ``duration_ms`` are required.  The
    remaining fields are optional and are omitted from :meth:`to_dict`
    output while they are ``None``.
    """

    scenario_id: str
    status: str  # "passed"|"failed"|"error"|"cancelled"
    duration_ms: float
    completed_sample_count: int | None = None
    scores_summary: dict[str, float] | None = None
    errors: list[str] | None = None

    def __post_init__(self) -> None:
        """Validate the required fields.

        Raises:
            ValueError: If *scenario_id* is falsy, *status* is not in
                ``_VALID_STATUSES``, or *duration_ms* is negative.
        """
        if not self.scenario_id:
            raise ValueError("EvalScenarioCompletedPayload.scenario_id must be non-empty")
        if self.status not in _VALID_STATUSES:
            raise ValueError(
                f"EvalScenarioCompletedPayload.status must be one of {sorted(_VALID_STATUSES)}"
            )
        if self.duration_ms < 0:
            raise ValueError("EvalScenarioCompletedPayload.duration_ms must be non-negative")

    def to_dict(self) -> dict[str, Any]:
        """Serialise the payload to a plain ``dict``.

        ``scores_summary`` and ``errors`` are shallow-copied so the caller
        cannot mutate the payload through the returned mapping.
        """
        d: dict[str, Any] = {
            "scenario_id": self.scenario_id,
            "status": self.status,
            "duration_ms": self.duration_ms,
        }
        if self.completed_sample_count is not None:
            d["completed_sample_count"] = self.completed_sample_count
        if self.scores_summary is not None:
            d["scores_summary"] = dict(self.scores_summary)
        if self.errors is not None:
            d["errors"] = list(self.errors)
        return d

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> EvalScenarioCompletedPayload:
        """Deserialise from a plain ``dict``.

        A missing optional key and an explicit ``None`` (JSON ``null``) are
        treated the same way: the field becomes ``None``.

        Raises:
            KeyError: If a required key is missing.
            TypeError, ValueError: If a value cannot be converted, or if
                validation in ``__post_init__`` fails.
        """
        sample_count = data.get("completed_sample_count")
        scores = data.get("scores_summary")
        errors = data.get("errors")
        return cls(
            scenario_id=data["scenario_id"],
            status=data["status"],
            duration_ms=float(data["duration_ms"]),
            # Guard on the value rather than key presence: int(None),
            # dict(None) and list(None) all raise TypeError.
            completed_sample_count=None if sample_count is None else int(sample_count),
            scores_summary=None if scores is None else dict(scores),
            errors=None if errors is None else list(errors),
        )
|
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
"""spanforge.namespaces.feedback — User feedback namespace payload types.
|
|
2
|
+
|
|
3
|
+
Provides dataclasses for the ``llm.feedback.*`` event namespace, covering
|
|
4
|
+
all supported feedback rating modalities:
|
|
5
|
+
|
|
6
|
+
1. **Thumbs** — binary thumbs-up / thumbs-down feedback.
|
|
7
|
+
2. **Star** — 1–5 star rating.
|
|
8
|
+
3. **Likert** — 1–5 Likert scale response.
|
|
9
|
+
4. **Free-text** — open-ended qualitative comment (stored hashed, not raw).
|
|
10
|
+
|
|
11
|
+
Classes
|
|
12
|
+
-------
|
|
13
|
+
FeedbackRating
|
|
14
|
+
Enum of supported rating types.
|
|
15
|
+
FeedbackSubmittedPayload
|
|
16
|
+
``llm.feedback.submitted`` events — the primary payload for any feedback.
|
|
17
|
+
FeedbackSummaryPayload
|
|
18
|
+
``llm.feedback.summary`` events — aggregated feedback for a session /
|
|
19
|
+
trace / response.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
from dataclasses import dataclass, field
|
|
25
|
+
from enum import Enum
|
|
26
|
+
from typing import Any
|
|
27
|
+
|
|
28
|
+
__all__ = [
|
|
29
|
+
"FeedbackRating",
|
|
30
|
+
"FeedbackSubmittedPayload",
|
|
31
|
+
"FeedbackSummaryPayload",
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# ---------------------------------------------------------------------------
|
|
36
|
+
# Rating type enum
|
|
37
|
+
# ---------------------------------------------------------------------------
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class FeedbackRating(str, Enum):
    """Closed set of feedback rating modalities.

    Members fall into four families, encoded in the member value:

    * ``thumbs_up`` / ``thumbs_down`` — binary positive / negative feedback.
    * ``star_1`` … ``star_5`` — N out of 5 stars.
    * ``likert_1`` … ``likert_5`` — Likert scale from strongly disagree (1)
      through neutral (3) to strongly agree (5).
    * ``free_text`` — open-ended qualitative comment.
    """

    THUMBS_UP = "thumbs_up"
    THUMBS_DOWN = "thumbs_down"
    STAR_1 = "star_1"
    STAR_2 = "star_2"
    STAR_3 = "star_3"
    STAR_4 = "star_4"
    STAR_5 = "star_5"
    LIKERT_1 = "likert_1"
    LIKERT_2 = "likert_2"
    LIKERT_3 = "likert_3"
    LIKERT_4 = "likert_4"
    LIKERT_5 = "likert_5"
    FREE_TEXT = "free_text"

    def numeric_value(self) -> float | None:
        """Return the rating normalised onto a 0.0–1.0 scale, if it has one.

        Thumbs map to ``1.0`` (up) and ``0.0`` (down).  Star and Likert
        ratings map to ``(level - 1) / 4``.  :attr:`FREE_TEXT` has no numeric
        meaning and yields ``None``.
        """
        # Every member value has the shape "<family>_<suffix>".
        family, _, suffix = self.value.rpartition("_")
        if family == "thumbs":
            return 1.0 if suffix == "up" else 0.0
        if family in ("star", "likert"):
            return (int(suffix) - 1) / 4
        return None
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
# ---------------------------------------------------------------------------
|
|
98
|
+
# Payload dataclasses
|
|
99
|
+
# ---------------------------------------------------------------------------
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
@dataclass
class FeedbackSubmittedPayload:
    """Payload for ``llm.feedback.submitted`` events.

    Raw free-text comments are **never stored**; when *rating* is
    ``FeedbackRating.FREE_TEXT`` the *comment_hash* field holds the SHA-256
    digest of the comment text.

    Attributes:
        feedback_id:   Unique identifier for this feedback record (ULID).
        session_id:    Session or conversation this feedback applies to.
        trace_id:      Trace ID of the specific LLM call being rated.
        rating:        The :class:`FeedbackRating` value.  A raw string value
                       is accepted and converted on construction.
        comment_hash:  SHA-256 hex digest of the free-text comment, or ``""``
                       when *rating* is not ``FREE_TEXT``.
        user_id_hash:  SHA-256 hex digest of the user identifier, or ``""``
                       when the submission is anonymous.
        source:        Feedback collection channel (e.g. ``"widget"``,
                       ``"api"``, ``"email"``).
        metadata:      Arbitrary key-value metadata (e.g. page URL, A/B variant).
        linked_trust_dimension:
                       Optional T.R.U.S.T. dimension this feedback should
                       influence (e.g. ``"reliability"``).
    """

    feedback_id: str
    session_id: str
    trace_id: str
    rating: FeedbackRating
    comment_hash: str = ""
    user_id_hash: str = ""
    source: str = "api"
    metadata: dict[str, Any] = field(default_factory=dict)
    linked_trust_dimension: str | None = None

    def __post_init__(self) -> None:
        """Validate identifiers and normalise *rating* to the enum.

        ``trace_id`` is not validated here and may be empty.

        Raises:
            ValueError: If *feedback_id* or *session_id* is falsy, or if
                *rating* is not a valid :class:`FeedbackRating` value.
        """
        if not self.feedback_id:
            raise ValueError("FeedbackSubmittedPayload.feedback_id must be non-empty")
        if not self.session_id:
            raise ValueError("FeedbackSubmittedPayload.session_id must be non-empty")
        if not isinstance(self.rating, FeedbackRating):
            # Accept raw string values for convenience.
            self.rating = FeedbackRating(self.rating)

    def to_dict(self) -> dict[str, Any]:
        """Serialise to a plain dict.

        ``rating`` is emitted as its string value.  ``metadata`` is
        shallow-copied so that mutating the returned dict cannot alter this
        payload.  ``linked_trust_dimension`` is emitted only when set.
        """
        d: dict[str, Any] = {
            "feedback_id": self.feedback_id,
            "session_id": self.session_id,
            "trace_id": self.trace_id,
            "rating": self.rating.value,
            "comment_hash": self.comment_hash,
            "user_id_hash": self.user_id_hash,
            "source": self.source,
            "metadata": dict(self.metadata),
        }
        if self.linked_trust_dimension is not None:
            d["linked_trust_dimension"] = self.linked_trust_dimension
        return d

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> FeedbackSubmittedPayload:
        """Deserialise from a plain dict.

        For the optional fields, a missing key and an explicit ``None``
        (JSON ``null``) both fall back to the field default, so a null never
        becomes the literal string ``"None"``.

        Raises:
            KeyError: If ``feedback_id``, ``session_id`` or ``rating`` is
                missing.
            ValueError: If ``rating`` is not a valid rating value, or if
                validation in ``__post_init__`` fails.
        """

        def _text(key: str, default: str) -> str:
            raw = data.get(key)
            return default if raw is None else str(raw)

        metadata = data.get("metadata")
        return cls(
            feedback_id=str(data["feedback_id"]),
            session_id=str(data["session_id"]),
            trace_id=_text("trace_id", ""),
            rating=FeedbackRating(data["rating"]),
            comment_hash=_text("comment_hash", ""),
            user_id_hash=_text("user_id_hash", ""),
            source=_text("source", "api"),
            metadata={} if metadata is None else dict(metadata),
            linked_trust_dimension=data.get("linked_trust_dimension"),
        )
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
@dataclass
class FeedbackSummaryPayload:
    """Payload for ``llm.feedback.summary`` events.

    Aggregated feedback statistics over a session or time window.

    Attributes:
        session_id:         Session or aggregation window identifier.
        total_feedback:     Total number of feedback events in the window.
        thumbs_up_count:    Count of ``THUMBS_UP`` ratings.
        thumbs_down_count:  Count of ``THUMBS_DOWN`` ratings.
        avg_star_rating:    Mean star rating (1–5); ``None`` if no star ratings.
        avg_likert_score:   Mean Likert score (1–5); ``None`` if no Likert ratings.
        free_text_count:    Number of free-text comments submitted.
        positive_rate:      Fraction of positive feedback (0.0–1.0) — computed
                            from all numeric ratings above the neutral threshold.
    """

    session_id: str
    total_feedback: int = 0
    thumbs_up_count: int = 0
    thumbs_down_count: int = 0
    avg_star_rating: float | None = None
    avg_likert_score: float | None = None
    free_text_count: int = 0
    positive_rate: float = 0.0

    def __post_init__(self) -> None:
        """Validate ``session_id`` and the range of ``positive_rate``.

        Raises:
            ValueError: If *session_id* is falsy or *positive_rate* is
                outside the closed interval ``[0, 1]``.
        """
        if not self.session_id:
            raise ValueError("FeedbackSummaryPayload.session_id must be non-empty")
        if not (0.0 <= self.positive_rate <= 1.0):
            raise ValueError(
                f"FeedbackSummaryPayload.positive_rate must be in [0, 1]; got {self.positive_rate}"
            )

    def to_dict(self) -> dict[str, Any]:
        """Serialise to a plain dict.

        The two averages are emitted only when they are not ``None``.
        """
        d: dict[str, Any] = {
            "session_id": self.session_id,
            "total_feedback": self.total_feedback,
            "thumbs_up_count": self.thumbs_up_count,
            "thumbs_down_count": self.thumbs_down_count,
            "free_text_count": self.free_text_count,
            "positive_rate": self.positive_rate,
        }
        if self.avg_star_rating is not None:
            d["avg_star_rating"] = self.avg_star_rating
        if self.avg_likert_score is not None:
            d["avg_likert_score"] = self.avg_likert_score
        return d

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> FeedbackSummaryPayload:
        """Deserialise from a plain dict.

        A missing key and an explicit ``None`` (JSON ``null``) are treated
        the same way: counters fall back to ``0``, ``positive_rate`` to
        ``0.0`` and the averages to ``None``.  Averages that are present are
        coerced with ``float()`` to match the declared field type.

        Raises:
            KeyError: If ``session_id`` is missing.
            TypeError, ValueError: If a numeric value cannot be converted,
                or if validation in ``__post_init__`` fails.
        """

        def _count(key: str) -> int:
            raw = data.get(key)
            return 0 if raw is None else int(raw)

        def _opt_float(key: str) -> float | None:
            raw = data.get(key)
            return None if raw is None else float(raw)

        rate = data.get("positive_rate")
        return cls(
            session_id=str(data["session_id"]),
            total_feedback=_count("total_feedback"),
            thumbs_up_count=_count("thumbs_up_count"),
            thumbs_down_count=_count("thumbs_down_count"),
            avg_star_rating=_opt_float("avg_star_rating"),
            avg_likert_score=_opt_float("avg_likert_score"),
            free_text_count=_count("free_text_count"),
            positive_rate=0.0 if rate is None else float(rate),
        )
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
"""spanforge.namespaces.fence — Fence payload types (RFC-0001).
|
|
2
|
+
|
|
3
|
+
Classes
|
|
4
|
+
-------
|
|
5
|
+
FenceValidatedPayload llm.fence.validated
|
|
6
|
+
FenceRetryTriggeredPayload llm.fence.retry.triggered
|
|
7
|
+
FenceMaxRetriesExceededPayload llm.fence.max_retries.exceeded
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
from spanforge.namespaces.trace import CostBreakdown
|
|
16
|
+
|
|
17
|
+
__all__ = [
|
|
18
|
+
"FenceMaxRetriesExceededPayload",
|
|
19
|
+
"FenceRetryTriggeredPayload",
|
|
20
|
+
"FenceValidatedPayload",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
# Closed set of values accepted for the optional ``output_type`` field of the fence payloads.
_VALID_OUTPUT_TYPES = frozenset({"json_schema", "pydantic", "regex", "xml", "custom"})
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class FenceValidatedPayload:
    """RFC-0001 — Structured output passed validation on a given attempt.

    ``fence_id``, ``schema_name`` and ``attempt`` are required.  The
    remaining fields are optional and are omitted from :meth:`to_dict`
    output while they are ``None``.
    """

    fence_id: str
    schema_name: str
    attempt: int
    output_type: str | None = None  # "json_schema"|"pydantic"|"regex"|"xml"|"custom"
    span_id: str | None = None
    validation_duration_ms: float | None = None

    def __post_init__(self) -> None:
        """Validate the required fields and, when given, ``output_type``.

        Raises:
            ValueError: If *fence_id* or *schema_name* is falsy, *attempt* is
                not an ``int`` >= 1, or *output_type* is set to a value
                outside ``_VALID_OUTPUT_TYPES``.
        """
        if not self.fence_id:
            raise ValueError("FenceValidatedPayload.fence_id must be non-empty")
        if not self.schema_name:
            raise ValueError("FenceValidatedPayload.schema_name must be non-empty")
        if not isinstance(self.attempt, int) or self.attempt < 1:
            raise ValueError("FenceValidatedPayload.attempt must be a positive int")
        if self.output_type is not None and self.output_type not in _VALID_OUTPUT_TYPES:
            raise ValueError(
                f"FenceValidatedPayload.output_type must be one of {sorted(_VALID_OUTPUT_TYPES)}"
            )

    def to_dict(self) -> dict[str, Any]:
        """Serialise the payload to a plain ``dict``.

        Optional fields appear only when they are not ``None``.
        """
        d: dict[str, Any] = {
            "fence_id": self.fence_id,
            "schema_name": self.schema_name,
            "attempt": self.attempt,
        }
        if self.output_type is not None:
            d["output_type"] = self.output_type
        if self.span_id is not None:
            d["span_id"] = self.span_id
        if self.validation_duration_ms is not None:
            d["validation_duration_ms"] = self.validation_duration_ms
        return d

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> FenceValidatedPayload:
        """Deserialise from a plain ``dict``.

        A missing ``validation_duration_ms`` and an explicit ``None`` (JSON
        ``null``) are treated the same way: the field becomes ``None``.

        Raises:
            KeyError: If a required key is missing.
            TypeError, ValueError: If ``attempt`` or
                ``validation_duration_ms`` cannot be converted, or if
                validation in ``__post_init__`` fails.
        """
        duration = data.get("validation_duration_ms")
        return cls(
            fence_id=data["fence_id"],
            schema_name=data["schema_name"],
            attempt=int(data["attempt"]),
            output_type=data.get("output_type"),
            span_id=data.get("span_id"),
            # float(None) would raise TypeError on an explicit null.
            validation_duration_ms=None if duration is None else float(duration),
        )
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
@dataclass
class FenceRetryTriggeredPayload:
    """RFC-0001 — A validation failure triggered a retry.

    The first five fields are mandatory; ``output_type`` and ``span_id`` are
    optional and only serialised when they carry a value.
    """

    fence_id: str
    schema_name: str
    attempt: int
    max_attempts: int
    violation_summary: str
    output_type: str | None = None
    span_id: str | None = None

    def __post_init__(self) -> None:
        """Check field invariants in declaration order.

        Raises:
            ValueError: On the first field that is empty, not a positive
                ``int``, or (for ``output_type``) not an allowed value.
        """
        if not self.fence_id:
            raise ValueError("FenceRetryTriggeredPayload.fence_id must be non-empty")
        if not self.schema_name:
            raise ValueError("FenceRetryTriggeredPayload.schema_name must be non-empty")

        attempt_ok = isinstance(self.attempt, int) and self.attempt >= 1
        if not attempt_ok:
            raise ValueError("FenceRetryTriggeredPayload.attempt must be a positive int")

        limit_ok = isinstance(self.max_attempts, int) and self.max_attempts >= 1
        if not limit_ok:
            raise ValueError("FenceRetryTriggeredPayload.max_attempts must be a positive int")

        if not self.violation_summary:
            raise ValueError("FenceRetryTriggeredPayload.violation_summary must be non-empty")

        kind = self.output_type
        if kind is None:
            return
        if kind not in _VALID_OUTPUT_TYPES:
            raise ValueError(
                f"FenceRetryTriggeredPayload.output_type must be one of {sorted(_VALID_OUTPUT_TYPES)}"
            )

    def to_dict(self) -> dict[str, Any]:
        """Return the payload as a plain ``dict``, skipping unset optionals."""
        payload: dict[str, Any] = {
            "fence_id": self.fence_id,
            "schema_name": self.schema_name,
            "attempt": self.attempt,
            "max_attempts": self.max_attempts,
            "violation_summary": self.violation_summary,
        }
        optionals = (("output_type", self.output_type), ("span_id", self.span_id))
        payload.update({key: value for key, value in optionals if value is not None})
        return payload

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> FenceRetryTriggeredPayload:
        """Build a payload from a plain ``dict``.

        ``attempt`` and ``max_attempts`` are coerced with ``int()``; the two
        optional keys default to ``None`` when absent.
        """
        fence_id = data["fence_id"]
        schema_name = data["schema_name"]
        attempt = int(data["attempt"])
        max_attempts = int(data["max_attempts"])
        summary = data["violation_summary"]
        return cls(
            fence_id,
            schema_name,
            attempt,
            max_attempts,
            summary,
            output_type=data.get("output_type"),
            span_id=data.get("span_id"),
        )
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
@dataclass
class FenceMaxRetriesExceededPayload:
    """RFC-0001 — All retry attempts exhausted; output remains invalid.

    ``fence_id``, ``schema_name``, ``attempts_made`` and
    ``final_violation_summary`` are required.  The remaining fields are
    optional and are omitted from :meth:`to_dict` output while they are
    ``None``.
    """

    fence_id: str
    schema_name: str
    attempts_made: int
    final_violation_summary: str
    output_type: str | None = None
    span_id: str | None = None
    total_extra_cost: CostBreakdown | None = None

    def __post_init__(self) -> None:
        """Validate the required fields and, when given, ``output_type``.

        Raises:
            ValueError: If *fence_id*, *schema_name* or
                *final_violation_summary* is falsy, *attempts_made* is not an
                ``int`` >= 1, or *output_type* is set to a value outside
                ``_VALID_OUTPUT_TYPES``.
        """
        if not self.fence_id:
            raise ValueError("FenceMaxRetriesExceededPayload.fence_id must be non-empty")
        if not self.schema_name:
            raise ValueError("FenceMaxRetriesExceededPayload.schema_name must be non-empty")
        if not isinstance(self.attempts_made, int) or self.attempts_made < 1:
            raise ValueError("FenceMaxRetriesExceededPayload.attempts_made must be a positive int")
        if not self.final_violation_summary:
            raise ValueError(
                "FenceMaxRetriesExceededPayload.final_violation_summary must be non-empty"
            )
        if self.output_type is not None and self.output_type not in _VALID_OUTPUT_TYPES:
            raise ValueError(
                f"FenceMaxRetriesExceededPayload.output_type must be one of {sorted(_VALID_OUTPUT_TYPES)}"
            )

    def to_dict(self) -> dict[str, Any]:
        """Serialise the payload to a plain ``dict``.

        Optional fields appear only when they are not ``None``;
        ``total_extra_cost`` is serialised through its own ``to_dict()``.
        """
        d: dict[str, Any] = {
            "fence_id": self.fence_id,
            "schema_name": self.schema_name,
            "attempts_made": self.attempts_made,
            "final_violation_summary": self.final_violation_summary,
        }
        if self.output_type is not None:
            d["output_type"] = self.output_type
        if self.span_id is not None:
            d["span_id"] = self.span_id
        if self.total_extra_cost is not None:
            d["total_extra_cost"] = self.total_extra_cost.to_dict()
        return d

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> FenceMaxRetriesExceededPayload:
        """Deserialise from a plain ``dict``.

        A missing ``total_extra_cost`` and an explicit ``None`` (JSON
        ``null``) are treated the same way: the field becomes ``None`` and
        ``CostBreakdown.from_dict`` is not called.

        Raises:
            KeyError: If a required key is missing.
            TypeError, ValueError: If ``attempts_made`` cannot be converted
                with ``int()``, or if validation in ``__post_init__`` fails.
        """
        extra_cost = data.get("total_extra_cost")
        return cls(
            fence_id=data["fence_id"],
            schema_name=data["schema_name"],
            attempts_made=int(data["attempts_made"]),
            final_violation_summary=data["final_violation_summary"],
            output_type=data.get("output_type"),
            span_id=data.get("span_id"),
            total_extra_cost=(
                None if extra_cost is None else CostBreakdown.from_dict(extra_cost)
            ),
        )
|