contexttrace 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contexttrace/__init__.py +36 -0
- contexttrace/_version.py +1 -0
- contexttrace/cli.py +474 -0
- contexttrace/client.py +1074 -0
- contexttrace/config.py +246 -0
- contexttrace/demo.py +311 -0
- contexttrace/demo_data.py +257 -0
- contexttrace/endpoint_eval.py +314 -0
- contexttrace/errors.py +14 -0
- contexttrace/evaluator.py +448 -0
- contexttrace/integrations/__init__.py +14 -0
- contexttrace/integrations/fastapi.py +311 -0
- contexttrace/integrations/langchain.py +440 -0
- contexttrace/integrations/langgraph.py +197 -0
- contexttrace/integrations/llamaindex.py +422 -0
- contexttrace/integrations/opentelemetry.py +111 -0
- contexttrace/local.py +325 -0
- contexttrace/py.typed +1 -0
- contexttrace/regression.py +123 -0
- contexttrace/reliability.py +284 -0
- contexttrace/report.py +550 -0
- contexttrace/storage/__init__.py +3 -0
- contexttrace/storage/sqlite_store.py +604 -0
- contexttrace/thresholds.py +50 -0
- contexttrace/transport.py +183 -0
- contexttrace/viewer.py +148 -0
- contexttrace-0.1.0.dist-info/METADATA +154 -0
- contexttrace-0.1.0.dist-info/RECORD +31 -0
- contexttrace-0.1.0.dist-info/WHEEL +5 -0
- contexttrace-0.1.0.dist-info/entry_points.txt +2 -0
- contexttrace-0.1.0.dist-info/top_level.txt +1 -0
contexttrace/local.py
ADDED
|
@@ -0,0 +1,325 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any, Optional
|
|
6
|
+
|
|
7
|
+
from contexttrace.errors import ContextTraceLocalError
|
|
8
|
+
from contexttrace.reliability import ReliabilityScorer
|
|
9
|
+
from contexttrace.storage import SQLiteTraceStore
|
|
10
|
+
|
|
11
|
+
# Module logger, obtained under the fixed name "contexttrace" (not __name__).
logger = logging.getLogger("contexttrace")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class LocalTransport:
    """Serve ContextTrace request paths in-process from a ``SQLiteTraceStore``.

    ``post`` and ``get`` accept the same path strings a remote API would
    (``/v1/traces/...``) and translate them into calls on ``self.store``.
    """

    def __init__(
        self,
        *,
        store_dir: str = ".contexttrace",
        storage_path: Optional[str] = None,
        debug: bool = False,
        log_chunk_text: bool = True,
        log_answer_text: bool = True,
    ) -> None:
        """Open the backing store.

        Args:
            store_dir: Directory used to derive the database path when
                ``storage_path`` is not given.
            storage_path: Explicit database path; wins over ``store_dir``.
            debug: When true, every request is logged at DEBUG level.
            log_chunk_text: When false, chunk content is replaced by a
                redaction placeholder before it is stored.
            log_answer_text: When false, answer text is replaced by a
                redaction placeholder before it is stored.
        """
        if not storage_path:
            storage_path = str(Path(store_dir) / "contexttrace.db")
        self.storage_path = storage_path
        self.store = SQLiteTraceStore(self.storage_path)
        self.debug = debug
        self.log_chunk_text = log_chunk_text
        self.log_answer_text = log_answer_text

    def _normalized_chunks(self, body: dict[str, Any], *, selected: bool) -> list[dict[str, Any]]:
        """Normalize ``body["chunks"]`` (if any) into store-ready chunk records."""
        normalized = []
        for position, raw in enumerate(body.get("chunks") or []):
            normalized.append(
                _chunk(
                    raw,
                    index=position,
                    selected=selected,
                    log_chunk_text=self.log_chunk_text,
                )
            )
        return normalized

    def post(self, path: str, payload: Optional[dict[str, Any]] = None) -> dict[str, Any]:
        """Handle a write request addressed by ``path``.

        Supported paths are ``/v1/traces/start`` and
        ``/v1/traces/<trace_id>/<action>`` where ``action`` is one of
        ``retrieval``, ``context``, ``answer``, ``citations``,
        ``agent-events`` or ``evaluate``.

        Raises:
            ContextTraceLocalError: If the path or action is not supported.
        """
        body = payload or {}
        self._debug("POST", path, body)

        if path == "/v1/traces/start":
            created = self.store.create_trace(
                project=body["project"],
                query=body["query"],
                metadata=body.get("metadata") or {},
            )
            return {"trace_id": created["id"], "project_id": created["project_id"]}

        trace_id, action = _parse_trace_action(path)

        if action == "retrieval":
            stored = self.store.upsert_chunks(
                trace_id,
                self._normalized_chunks(body, selected=False),
                selected=False,
            )
            return {"trace_id": trace_id, "accepted": stored}

        if action == "context":
            # Mark already-known chunk ids first, then upsert any inline chunks.
            stored = self.store.mark_context(trace_id, list(body.get("chunk_ids") or []))
            stored += self.store.upsert_chunks(
                trace_id,
                self._normalized_chunks(body, selected=True),
                selected=True,
            )
            return {"trace_id": trace_id, "accepted": stored}

        if action == "answer":
            if self.log_answer_text:
                text = body["answer"]
            else:
                text = "[answer text redacted]"
            record = {
                "answer": text,
                "model": body.get("model"),
                "usage": body.get("usage") or {},
                "metadata": body.get("metadata") or {},
            }
            self.store.save_answer(trace_id, record)
            return {"trace_id": trace_id, "accepted": 1}

        if action == "citations":
            stored = self.store.save_citations(trace_id, list(body.get("citations") or []))
            return {"trace_id": trace_id, "accepted": stored}

        if action == "agent-events":
            return self.store.add_agent_event(trace_id, body)

        if action == "evaluate":
            outcome = _evaluate_trace(self.store.get_trace(trace_id))
            self.store.save_evaluation(trace_id, outcome)
            return outcome

        raise ContextTraceLocalError("Unsupported local POST path: %s" % path)

    def get(self, path: str) -> dict[str, Any]:
        """Handle a read request addressed by ``path``.

        Raises:
            ContextTraceLocalError: If the path is not supported, or if
                ``/v1/traces/last`` is requested and the store has no traces.
        """
        self._debug("GET", path, None)

        if path == "/v1/traces" or path.startswith("/v1/traces?"):
            return {"traces": self.store.list_traces(limit=_query_int(path, "limit", default=20))}

        if path == "/v1/traces/last":
            latest = self.store.last_trace()
            if latest is not None:
                return latest
            raise ContextTraceLocalError("No local traces found.")

        if path == "/v1/status":
            latest_eval = self.store.last_eval_run()
            return {
                "storage_path": self.storage_path,
                "trace_count": self.store.trace_count(),
                "last_eval_run": latest_eval,
            }

        segments = path.strip("/").split("/")
        prefix = segments[:2]
        if len(segments) == 4 and prefix == ["v1", "traces"] and segments[3] == "agent-events":
            owner = self.store.get_trace(segments[2])
            return {"trace_id": segments[2], "events": owner.get("agent_events") or []}

        if len(segments) == 3 and prefix == ["v1", "eval-runs"]:
            return self.store.get_eval_run(segments[2])

        if path == "/v1/eval-runs":
            return {"eval_runs": self.store.list_eval_runs()}

        if path.startswith("/v1/traces/"):
            # Everything after the final slash is taken as the trace id.
            return self.store.get_trace(path.rsplit("/", 1)[-1])

        raise ContextTraceLocalError("Unsupported local GET path: %s" % path)

    def close(self) -> None:
        """No-op; present so the transport exposes a ``close`` method."""
        return None

    def _debug(self, method: str, path: str, payload: Optional[dict[str, Any]]) -> None:
        """Log ``method`` and ``path`` at DEBUG level when debugging is enabled.

        ``payload`` is accepted but not written to the log.
        """
        if not self.debug:
            return
        logger.debug("%s %s", method, path)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _parse_trace_action(path: str) -> tuple[str, str]:
|
|
136
|
+
parts = path.strip("/").split("/")
|
|
137
|
+
if len(parts) != 4 or parts[0] != "v1" or parts[1] != "traces":
|
|
138
|
+
raise ContextTraceLocalError("Unsupported local trace path: %s" % path)
|
|
139
|
+
return parts[2], parts[3]
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def _chunk(
|
|
143
|
+
chunk: dict[str, Any],
|
|
144
|
+
*,
|
|
145
|
+
index: int,
|
|
146
|
+
selected: bool,
|
|
147
|
+
log_chunk_text: bool,
|
|
148
|
+
) -> dict[str, Any]:
|
|
149
|
+
chunk_id = chunk.get("chunk_id") or chunk.get("id") or "chunk_%s" % index
|
|
150
|
+
content = str(chunk.get("content") or chunk.get("text") or chunk.get("page_content") or "")
|
|
151
|
+
return {
|
|
152
|
+
"chunk_id": str(chunk_id),
|
|
153
|
+
"content": content if log_chunk_text else "[chunk text redacted]",
|
|
154
|
+
"source": chunk.get("source"),
|
|
155
|
+
"metadata": chunk.get("metadata") or {},
|
|
156
|
+
"relevance_score": chunk.get("relevance_score") or chunk.get("score"),
|
|
157
|
+
"selected": selected,
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def _evaluate_trace(trace: dict[str, Any]) -> dict[str, Any]:
    """Build an evaluation payload for ``trace`` using lexical term overlap.

    Each entry in ``trace["citation_checks"]`` is scored against the content
    of the chunk it cites (an empty string when the cited chunk is not among
    the trace's chunks). The scored checks feed the failure classification,
    the score summary and the ``ReliabilityScorer``.

    Returns:
        A dict with keys ``scores``, ``reliability``, ``citation_checks``
        and ``failure``.
    """
    chunks = list(trace.get("chunks") or [])
    by_id = {}
    for entry in chunks:
        by_id[entry["chunk_id"]] = entry
    metadata = trace.get("metadata") or {}

    checks = []
    for citation in trace.get("citation_checks") or []:
        cited = by_id.get(citation["source_chunk_id"])
        cited_text = cited.get("content", "") if cited else ""
        support = _support_score(citation["claim"], cited_text)
        checks.append(
            {
                "claim": citation["claim"],
                "source_chunk_id": citation["source_chunk_id"],
                "verdict": _verdict(support),
                "support_score": support,
                "reason": "Local lexical support score %.2f." % support,
            }
        )

    # Anything short of "directly_supported" counts as unsupported here.
    not_direct = [item for item in checks if item["verdict"] != "directly_supported"]
    failure_type = _failure_type(trace, chunks, checks, not_direct, metadata)
    if failure_type == "no_failure_detected":
        severity = "none"
    else:
        severity = "medium"
    scores = _score_summary(checks)

    scorer_input = {
        **trace,
        "evaluation": {
            "scores": scores,
            "failure": {"failure_type": failure_type},
            "citation_checks": checks,
        },
    }
    reliability = ReliabilityScorer().score_trace(scorer_input).to_dict()

    return {
        "scores": scores,
        "reliability": reliability,
        "citation_checks": checks,
        "failure": {
            "failure_type": failure_type,
            "severity": severity,
            "root_cause": _root_cause(failure_type),
            "suggested_fix": _suggested_fix(failure_type),
        },
    }
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def _verdict(score: float) -> str:
|
|
207
|
+
if score >= 0.65:
|
|
208
|
+
return "directly_supported"
|
|
209
|
+
if score >= 0.35:
|
|
210
|
+
return "partially_supported"
|
|
211
|
+
return "unsupported"
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def _failure_type(
|
|
215
|
+
trace: dict[str, Any],
|
|
216
|
+
chunks: list[dict[str, Any]],
|
|
217
|
+
evaluated: list[dict[str, Any]],
|
|
218
|
+
unsupported: list[dict[str, Any]],
|
|
219
|
+
metadata: dict[str, Any],
|
|
220
|
+
) -> str:
|
|
221
|
+
expected_sources = set(metadata.get("expected_sources") or [])
|
|
222
|
+
retrieved_sources = {chunk.get("source") for chunk in chunks if chunk.get("source")}
|
|
223
|
+
if expected_sources and not (expected_sources & retrieved_sources):
|
|
224
|
+
return "retrieval_miss"
|
|
225
|
+
|
|
226
|
+
answer = ((trace.get("answer") or {}).get("answer") or "").lower()
|
|
227
|
+
if metadata.get("question_type") == "unanswerable" and answer and not _looks_like_abstention(answer):
|
|
228
|
+
return "should_have_abstained"
|
|
229
|
+
|
|
230
|
+
expected_failure = metadata.get("expected_failure")
|
|
231
|
+
if expected_failure and expected_failure != "no_failure_detected":
|
|
232
|
+
return str(expected_failure)
|
|
233
|
+
|
|
234
|
+
for check in unsupported:
|
|
235
|
+
best_score = max((_support_score(check["claim"], chunk.get("content", "")) for chunk in chunks), default=0.0)
|
|
236
|
+
if best_score >= 0.65:
|
|
237
|
+
return "citation_mismatch"
|
|
238
|
+
|
|
239
|
+
stances = {
|
|
240
|
+
((chunk.get("metadata") or {}).get("stance") or "")
|
|
241
|
+
for chunk in chunks
|
|
242
|
+
if ((chunk.get("metadata") or {}).get("stance") or "") not in {"", "neutral"}
|
|
243
|
+
}
|
|
244
|
+
if len(stances) > 1:
|
|
245
|
+
return "conflicting_sources"
|
|
246
|
+
|
|
247
|
+
if unsupported or not evaluated:
|
|
248
|
+
return "unsupported_answer"
|
|
249
|
+
return "no_failure_detected"
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def _looks_like_abstention(answer: str) -> bool:
|
|
253
|
+
return any(phrase in answer for phrase in ("do not state", "not enough", "cannot determine", "not mention"))
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def _root_cause(failure_type: str) -> str:
|
|
257
|
+
return {
|
|
258
|
+
"no_failure_detected": "Local lexical evaluation did not find evidence-level issues.",
|
|
259
|
+
"retrieval_miss": "Expected source evidence was not present in the retrieved chunks.",
|
|
260
|
+
"should_have_abstained": "The query appears unanswerable from the available context, but the answer made a claim.",
|
|
261
|
+
"conflicting_sources": "Retrieved context includes conflicting current and archived evidence.",
|
|
262
|
+
"citation_mismatch": "A cited chunk does not support the claim, although another retrieved chunk appears to support it.",
|
|
263
|
+
"unsupported_answer": "The answer contains claims that are not sufficiently supported by retrieved context.",
|
|
264
|
+
}.get(failure_type, "Local lexical evaluation completed.")
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def _suggested_fix(failure_type: str) -> str:
|
|
268
|
+
return {
|
|
269
|
+
"no_failure_detected": "No immediate fix suggested. Review raw evidence for high-stakes workflows.",
|
|
270
|
+
"retrieval_miss": "Tune retrieval, add query expansion, or verify the source is indexed.",
|
|
271
|
+
"should_have_abstained": "Add abstention rules when retrieved context lacks direct support.",
|
|
272
|
+
"conflicting_sources": "Filter archived sources or add source freshness/version ranking.",
|
|
273
|
+
"citation_mismatch": "Run sentence-level citation selection before returning the answer.",
|
|
274
|
+
"unsupported_answer": "Require claim-level support checks before final answer generation.",
|
|
275
|
+
}.get(failure_type, "Use a configured judge provider for deeper citation analysis.")
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
def _score_summary(evaluated_checks: list[dict[str, Any]]) -> dict[str, float]:
|
|
279
|
+
if not evaluated_checks:
|
|
280
|
+
return {
|
|
281
|
+
"citation_support": 0.0,
|
|
282
|
+
"unsupported_claim_rate": 1.0,
|
|
283
|
+
}
|
|
284
|
+
support_scores = [float(check.get("support_score") or 0.0) for check in evaluated_checks]
|
|
285
|
+
unsupported = [
|
|
286
|
+
check
|
|
287
|
+
for check in evaluated_checks
|
|
288
|
+
if check.get("verdict") in {"unsupported", "contradicted", "not_enough_info"}
|
|
289
|
+
]
|
|
290
|
+
return {
|
|
291
|
+
"citation_support": round(sum(support_scores) / len(support_scores), 3),
|
|
292
|
+
"unsupported_claim_rate": round(len(unsupported) / len(evaluated_checks), 3),
|
|
293
|
+
}
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def _support_score(claim: str, source: str) -> float:
    """Return the fraction of the claim's terms that also occur in ``source``.

    The result is rounded to three decimals; a claim that yields no terms
    scores 0.0.
    """
    wanted = _terms(claim)
    if len(wanted) == 0:
        return 0.0
    available = _terms(source)
    shared = wanted.intersection(available)
    return round(len(shared) / len(wanted), 3)
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
def _terms(text: str) -> set:
|
|
305
|
+
return {
|
|
306
|
+
token.strip(".,:;!?()[]{}\"'").lower().rstrip("s")
|
|
307
|
+
for token in text.split()
|
|
308
|
+
if len(token.strip(".,:;!?()[]{}\"'")) > 2
|
|
309
|
+
}
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def _query_int(path: str, key: str, *, default: int) -> int:
|
|
313
|
+
if "?" not in path:
|
|
314
|
+
return default
|
|
315
|
+
_, query = path.split("?", 1)
|
|
316
|
+
for part in query.split("&"):
|
|
317
|
+
if "=" not in part:
|
|
318
|
+
continue
|
|
319
|
+
name, value = part.split("=", 1)
|
|
320
|
+
if name == key:
|
|
321
|
+
try:
|
|
322
|
+
return int(value)
|
|
323
|
+
except ValueError:
|
|
324
|
+
return default
|
|
325
|
+
return default
|
contexttrace/py.typed
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any, Iterable
|
|
6
|
+
|
|
7
|
+
from contexttrace.client import ContextTrace
|
|
8
|
+
from contexttrace.demo import aggregate_trace_metrics, run_demo_dataset
|
|
9
|
+
from contexttrace.report import ReportGenerator
|
|
10
|
+
from contexttrace.thresholds import parse_thresholds, threshold_failures
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# Strategy names that run_local_benchmark iterates over when the caller does
# not pass its own ``strategies``.
BENCHMARK_STRATEGIES = (
    "dense_top_k",
    "bm25",
    "hybrid",
    "hybrid_rerank",
    "corrective",
    "adaptive",
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def run_local_benchmark(
    *,
    dataset: str,
    contexttrace: ContextTrace,
    output_dir: str = ".contexttrace/benchmarks",
    strategies: Iterable[str] = BENCHMARK_STRATEGIES,
    fail_on: Iterable[str] = (),
    report_path: str | None = None,
) -> dict[str, Any]:
    """Run the demo dataset once per strategy and write benchmark artifacts.

    For each strategy the demo dataset is run with its own HTML report, and
    the resulting traces are fetched back through ``contexttrace``. Metrics
    are then aggregated over all traces and checked against the ``fail_on``
    thresholds. A JSON results file, a Markdown summary and a combined HTML
    report are written under ``output_dir`` (the HTML report goes to
    ``report_path`` when given).

    Returns:
        The result dict: ``dataset``, ``strategies``, ``summary``,
        ``threshold_failures``, ``status`` (``"failed"`` when any threshold
        failed, else ``"passed"``) and the three artifact paths. The JSON
        file is written before the artifact paths are added to the dict.
    """
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    per_strategy: dict[str, Any] = {}
    collected: list[dict[str, Any]] = []
    for strategy in strategies:
        strategy_report = out_dir / ("%s_%s_report.html" % (Path(str(dataset)).name, strategy))
        run = run_demo_dataset(
            dataset=dataset,
            contexttrace=contexttrace,
            strategy=strategy,
            report_path=str(strategy_report),
        )
        fetched = [contexttrace.get_trace(trace_id) for trace_id in run.trace_ids]
        collected.extend(fetched)
        per_strategy[strategy] = {
            "summary": run.summary,
            "trace_ids": run.trace_ids,
            "report_path": run.report_path,
        }

    summary = aggregate_trace_metrics(collected)
    failures = threshold_failures(summary, parse_thresholds(fail_on))
    result = {
        "dataset": dataset,
        "strategies": per_strategy,
        "summary": summary,
        "threshold_failures": failures,
        "status": "failed" if failures else "passed",
    }

    results_path = out_dir / "benchmark_results.json"
    summary_path = out_dir / "benchmark_summary.md"
    final_report = report_path if report_path is not None else str(out_dir / "benchmark_report.html")

    results_path.write_text(json.dumps(result, indent=2), encoding="utf-8")
    summary_path.write_text(render_benchmark_summary(result), encoding="utf-8")
    eval_run = {
        "id": "benchmark",
        "dataset": dataset,
        "endpoint": "contexttrace-benchmark",
        "summary": summary,
    }
    ReportGenerator().generate_eval_report(eval_run, collected, path=final_report)

    result.update(
        results_path=str(results_path),
        summary_path=str(summary_path),
        report_path=final_report,
    )
    return result
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def render_benchmark_summary(result: dict[str, Any]) -> str:
    """Render a benchmark result dict as a Markdown summary.

    The document contains the overall status, a table of aggregate metrics
    from ``result["summary"]``, one table row per entry in
    ``result["strategies"]``, and a bullet list of threshold failures when
    there are any. Missing metrics are rendered as zero. The returned text
    ends with a newline.
    """
    overall = result["summary"]
    doc = [
        "# ContextTrace Benchmark Summary",
        "",
        "Status: **%s**" % result["status"],
        "",
        "| Metric | Value |",
        "| --- | ---: |",
    ]

    # These two are rendered as-is; the remaining metrics are formatted as floats.
    doc.append("| Questions tested | %s |" % overall.get("questions_tested", 0))
    doc.append("| Reliability score | %s |" % overall.get("reliability_score", 0))
    float_metrics = (
        ("Failure rate", "failure_rate", "%.3f"),
        ("Citation support", "citation_support", "%.3f"),
        ("Unsupported claim rate", "unsupported_claim_rate", "%.3f"),
        ("Retrieval miss rate", "retrieval_miss_rate", "%.3f"),
        ("Latency ms", "latency_ms", "%.1f"),
        ("Token count", "token_count", "%.1f"),
        ("Cost USD", "cost_usd", "%.6f"),
    )
    for label, key, spec in float_metrics:
        doc.append("| " + label + " | " + spec % float(overall.get(key, 0)) + " |")

    doc += [
        "",
        "## Strategy Summaries",
        "",
        "| Strategy | Failure Rate | Citation Support | Unsupported Claims | Retrieval Miss Rate |",
        "| --- | ---: | ---: | ---: | ---: |",
    ]
    strategy_columns = (
        "failure_rate",
        "citation_support",
        "unsupported_claim_rate",
        "retrieval_miss_rate",
    )
    for name, entry in result["strategies"].items():
        stats = entry["summary"]
        cells = ["%.3f" % float(stats.get(column, 0)) for column in strategy_columns]
        doc.append("| %s | %s |" % (name, " | ".join(cells)))

    if result["threshold_failures"]:
        doc += ["", "## Threshold Failures", ""]
        for failure in result["threshold_failures"]:
            doc.append("- %s" % failure)

    return "\n".join(doc) + "\n"
|