contexttrace 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,284 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import Any, Mapping, Optional
5
+
6
+
7
+ NO_FAILURE = "no_failure_detected"  # failure_type value that score_trace maps to failure_rate 0.0; any other value maps to 1.0
8
+
9
+
10
@dataclass(frozen=True)
class ReliabilityScore:
    """Immutable result of a reliability evaluation.

    The instance is frozen, so its fields cannot be reassigned after
    construction.  ``to_dict`` hands out copies of the container fields so
    that callers cannot mutate the instance through the exported dictionary.
    """

    # Overall numeric score.
    score: int
    # Letter grade associated with the score.
    grade: str
    # Human-readable findings; each defaults to a fresh empty container.
    strengths: list[str] = field(default_factory=list)
    weaknesses: list[str] = field(default_factory=list)
    recommendations: list[str] = field(default_factory=list)
    # Per-component integer values keyed by component name.
    components: dict[str, int] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return the score as a plain dictionary.

        The list and dict values are shallow copies, so mutating the returned
        dictionary does not alter this frozen instance.
        """
        return {
            "score": self.score,
            "grade": self.grade,
            "strengths": list(self.strengths),
            "weaknesses": list(self.weaknesses),
            "recommendations": list(self.recommendations),
            "components": dict(self.components),
        }
28
+
29
+
30
class ReliabilityScorer:
    """Practical, explainable diagnostic score for ContextTrace reports."""

    # Relative importance of each component.  The values sum to 1.10 rather
    # than 1.0; score() renormalizes over the components that are actually
    # available, so only the ratios between the weights matter.
    DEFAULT_WEIGHTS = {
        "citation_support": 0.35,
        "unsupported_claim_rate": 0.25,
        "failure_rate": 0.25,
        "retrieval_quality": 0.10,
        "abstention_quality": 0.10,
        "token_efficiency": 0.05,
    }

    def score(
        self,
        *,
        citation_support: Optional[float] = None,
        unsupported_claim_rate: Optional[float] = None,
        failure_rate: Optional[float] = None,
        retrieval_quality: Optional[float] = None,
        abstention_quality: Optional[float] = None,
        token_efficiency: Optional[float] = None,
    ) -> ReliabilityScore:
        """Combine the supplied metrics into a weighted 0-100 score.

        Every metric is clamped to the range [0, 1].  ``unsupported_claim_rate``
        and ``failure_rate`` are inverted (``1 - value``) so that a lower rate
        contributes a higher component.  Metrics passed as ``None`` are left
        out and the remaining weights are renormalized.

        Returns:
            A ``ReliabilityScore``.  When no metric is available the result
            has score 0, grade ``"F"`` and a single weakness/recommendation.
        """
        raw_components = {
            "citation_support": _optional_clamp(citation_support),
            "unsupported_claim_rate": _invert(unsupported_claim_rate),
            "failure_rate": _invert(failure_rate),
            "retrieval_quality": _optional_clamp(retrieval_quality),
            "abstention_quality": _optional_clamp(abstention_quality),
            "token_efficiency": _optional_clamp(token_efficiency),
        }
        available = {
            key: value for key, value in raw_components.items() if value is not None
        }
        if not available:
            return ReliabilityScore(
                score=0,
                grade="F",
                weaknesses=["No evaluation metrics are available yet."],
                recommendations=["Run citation verification before using the reliability score."],
            )

        # Renormalize so the weights of the available components sum to 1.
        total_weight = sum(self.DEFAULT_WEIGHTS[key] for key in available)
        score = round(
            sum(
                available[key] * (self.DEFAULT_WEIGHTS[key] / total_weight)
                for key in available
            )
            * 100
        )
        score = max(0, min(100, score))
        # _explain receives the raw (un-inverted) rates; it clamps them itself.
        strengths, weaknesses, recommendations = self._explain(
            citation_support=citation_support,
            unsupported_claim_rate=unsupported_claim_rate,
            failure_rate=failure_rate,
            retrieval_quality=retrieval_quality,
            abstention_quality=abstention_quality,
            token_efficiency=token_efficiency,
        )
        return ReliabilityScore(
            score=score,
            grade=_grade(score),
            strengths=strengths,
            weaknesses=weaknesses,
            recommendations=recommendations,
            components={key: round(value * 100) for key, value in available.items()},
        )

    def score_trace(self, trace: Mapping[str, Any]) -> ReliabilityScore:
        """Score a single trace mapping.

        If ``trace["evaluation"]["reliability"]`` is a mapping, it is treated
        as an already-computed result and rebuilt into a ``ReliabilityScore``;
        a missing, null or non-numeric stored score becomes 0 and a missing or
        empty grade becomes ``"F"``.

        Otherwise the metrics are derived from the trace:

        * citation metrics come from ``evaluation["scores"]`` or, when that is
          missing/empty, from the citation checks;
        * ``failure_rate`` is 0.0 only when the failure type equals
          ``NO_FAILURE``; a missing failure type is read as ``"unknown"`` and
          therefore counts as a failure;
        * ``abstention_quality`` is only reported (as 0.0) when the failure
          type is ``"should_have_abstained"``;
        * retrieval quality and token efficiency come from ``trace["chunks"]``
          and ``trace["answer"]["usage"]``.
        """
        evaluation = trace.get("evaluation") or {}
        reliability = evaluation.get("reliability")
        if isinstance(reliability, Mapping):
            return ReliabilityScore(
                score=self._coerce_score(reliability.get("score")),
                grade=str(reliability.get("grade") or "F"),
                strengths=list(reliability.get("strengths") or []),
                weaknesses=list(reliability.get("weaknesses") or []),
                recommendations=list(reliability.get("recommendations") or []),
                components=dict(reliability.get("components") or {}),
            )

        scores = evaluation.get("scores") or _scores_from_citation_checks(evaluation)
        failure = evaluation.get("failure") or {}
        failure_type = str(failure.get("failure_type") or "unknown")
        answer = trace.get("answer") or {}
        return self.score(
            citation_support=_as_float(scores.get("citation_support")),
            unsupported_claim_rate=_as_float(scores.get("unsupported_claim_rate")),
            failure_rate=0.0 if failure_type == NO_FAILURE else 1.0,
            retrieval_quality=_retrieval_quality(trace.get("chunks") or []),
            abstention_quality=0.0 if failure_type == "should_have_abstained" else None,
            token_efficiency=_token_efficiency(answer.get("usage") or {}),
        )

    @staticmethod
    def _coerce_score(value: Any) -> int:
        """Convert a stored score to ``int``; return 0 when it is unusable.

        ``int`` is tried first so every value the plain ``int(value)`` call
        accepts keeps its result; ``float`` is the fallback for strings such
        as ``"85.5"``.  ``None``, non-numeric values, NaN and infinities all
        yield 0 instead of raising.
        """
        for convert in (int, float):
            try:
                return int(convert(value))
            except (TypeError, ValueError, OverflowError):
                continue
        return 0

    def _explain(
        self,
        *,
        citation_support: Optional[float],
        unsupported_claim_rate: Optional[float],
        failure_rate: Optional[float],
        retrieval_quality: Optional[float],
        abstention_quality: Optional[float],
        token_efficiency: Optional[float],
    ) -> tuple[list[str], list[str], list[str]]:
        """Translate the raw metrics into human-readable findings.

        Each metric is clamped to [0, 1] and compared against fixed
        thresholds; metrics that are ``None`` produce no finding.

        Returns:
            ``(strengths, weaknesses, recommendations)``.  Recommendations are
            de-duplicated, and a generic monitoring recommendation is added
            when no specific one applies.
        """
        strengths: list[str] = []
        weaknesses: list[str] = []
        recommendations: list[str] = []

        citation_support = _optional_clamp(citation_support)
        unsupported_claim_rate = _optional_clamp(unsupported_claim_rate)
        failure_rate = _optional_clamp(failure_rate)
        retrieval_quality = _optional_clamp(retrieval_quality)
        abstention_quality = _optional_clamp(abstention_quality)
        token_efficiency = _optional_clamp(token_efficiency)

        if citation_support is not None:
            if citation_support >= 0.85:
                strengths.append("Citations are usually supported by the cited evidence.")
            elif citation_support < 0.60:
                weaknesses.append("Citation support is weak across evaluated claims.")
                recommendations.append("Add claim-level citation checks before returning answers.")
            else:
                weaknesses.append("Citation support is mixed and needs targeted review.")
                recommendations.append("Review low-support citations and improve source selection.")

        # The two rates below are "lower is better", so the comparisons are
        # the mirror image of the quality metrics.
        if unsupported_claim_rate is not None:
            if unsupported_claim_rate <= 0.10:
                strengths.append("Unsupported claims are uncommon in this evaluation.")
            elif unsupported_claim_rate >= 0.30:
                weaknesses.append("Unsupported claims appear frequently.")
                recommendations.append("Constrain generation to selected evidence and add abstention rules.")
            else:
                weaknesses.append("Unsupported claims are present at a noticeable rate.")
                recommendations.append("Tighten answer generation around cited chunks.")

        if failure_rate is not None:
            if failure_rate <= 0.05:
                strengths.append("Few evaluated traces produced classified failures.")
            else:
                weaknesses.append("A meaningful share of traces produced classified failures.")
                recommendations.append("Prioritize the most common failure type before tuning prompts.")

        if retrieval_quality is not None:
            if retrieval_quality >= 0.75:
                strengths.append("Retrieved or selected context has strong relevance signals.")
            else:
                weaknesses.append("Retrieval relevance signals are low.")
                recommendations.append("Try hybrid retrieval, reranking, or better metadata filters.")

        if abstention_quality is not None:
            if abstention_quality >= 0.75:
                strengths.append("Abstention behavior looks aligned with available evidence.")
            else:
                weaknesses.append("The system likely answered when it should have abstained.")
                recommendations.append("Add low-confidence abstention thresholds and user-facing uncertainty.")

        if token_efficiency is not None:
            if token_efficiency >= 0.75:
                strengths.append("Token usage is within a reasonable diagnostic budget.")
            else:
                weaknesses.append("Token usage is high relative to the logged answer.")
                recommendations.append("Reduce context size or use compression that preserves citation-bearing text.")

        if not recommendations:
            recommendations.append("Keep monitoring this score alongside the raw reliability metrics.")

        return strengths, weaknesses, _dedupe(recommendations)
196
+
197
+
198
def _scores_from_citation_checks(evaluation: Mapping[str, Any]) -> dict[str, float]:
    """Derive citation metrics from ``evaluation["citation_checks"]``.

    Returns a dict with two entries, both rounded to three decimals:

    * ``citation_support``: mean ``support_score`` over all checks.  A
      missing, empty or non-numeric ``support_score`` counts as 0.0 instead
      of aborting the whole computation.
    * ``unsupported_claim_rate``: share of checks whose ``verdict`` is
      ``"unsupported"``, ``"contradicted"`` or ``"not_enough_info"``.

    When there are no checks at all the worst case is reported
    (support 0.0, unsupported rate 1.0).
    """
    checks = evaluation.get("citation_checks") or []
    if not checks:
        return {"citation_support": 0.0, "unsupported_claim_rate": 1.0}
    # _as_float returns None for values float() cannot convert; those fall
    # back to 0.0 exactly like a missing score.
    support_scores = [_as_float(check.get("support_score")) or 0.0 for check in checks]
    unsupported_verdicts = {"unsupported", "contradicted", "not_enough_info"}
    unsupported_count = sum(
        1 for check in checks if check.get("verdict") in unsupported_verdicts
    )
    return {
        "citation_support": round(sum(support_scores) / len(support_scores), 3),
        "unsupported_claim_rate": round(unsupported_count / len(checks), 3),
    }
212
+
213
+
214
def _retrieval_quality(chunks: list[Mapping[str, Any]]) -> Optional[float]:
    """Average the chunks' ``relevance_score`` values, rounded to 3 decimals.

    Only chunks flagged ``selected`` are averaged when at least one of them
    carries a convertible score; otherwise every chunk with a convertible
    score is used.  Returns ``None`` when no chunk has a usable score.
    """
    every_score: list[float] = []
    chosen_scores: list[float] = []
    for chunk in chunks:
        relevance = _as_float(chunk.get("relevance_score"))
        if relevance is None:
            continue
        every_score.append(relevance)
        if chunk.get("selected"):
            chosen_scores.append(relevance)

    pool = chosen_scores if chosen_scores else every_score
    if not pool:
        return None
    return round(sum(pool) / len(pool), 3)
229
+
230
+
231
def _token_efficiency(usage: Mapping[str, Any]) -> Optional[float]:
    """Map ``usage["total_tokens"]`` onto an efficiency value.

    Up to 2000 tokens yields 1.0, 12000 tokens or more yields 0.0, and the
    range in between falls off linearly (rounded to three decimals).
    Returns ``None`` when the token count is missing or not convertible.
    """
    tokens = _as_float(usage.get("total_tokens"))
    if tokens is None:
        return None

    if tokens <= 2000:
        efficiency = 1.0
    elif tokens >= 12000:
        efficiency = 0.0
    else:
        efficiency = round(1.0 - ((tokens - 2000) / 10000), 3)
    return efficiency
240
+
241
+
242
def _invert(value: Optional[float]) -> Optional[float]:
    """Return ``1 - value`` after clamping, or ``None`` if there is no value."""
    bounded = _optional_clamp(value)
    return None if bounded is None else 1.0 - bounded
247
+
248
+
249
def _optional_clamp(value: Optional[float]) -> Optional[float]:
    """Convert *value* to a float limited to [0.0, 1.0].

    Returns ``None`` when the value is missing or cannot be converted.
    """
    number = _as_float(value)
    if number is None:
        return None
    # Cap at the top first, then at the bottom.
    capped = min(1.0, number)
    return max(0.0, capped)
254
+
255
+
256
+ def _as_float(value: Any) -> Optional[float]:
257
+ try:
258
+ if value is None:
259
+ return None
260
+ return float(value)
261
+ except (TypeError, ValueError):
262
+ return None
263
+
264
+
265
+ def _grade(score: int) -> str:
266
+ if score >= 90:
267
+ return "A"
268
+ if score >= 75:
269
+ return "B"
270
+ if score >= 60:
271
+ return "C"
272
+ if score >= 45:
273
+ return "D"
274
+ return "F"
275
+
276
+
277
+ def _dedupe(values: list[str]) -> list[str]:
278
+ seen = set()
279
+ unique = []
280
+ for value in values:
281
+ if value not in seen:
282
+ seen.add(value)
283
+ unique.append(value)
284
+ return unique