contexttrace 0.3.0__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {contexttrace-0.3.0 → contexttrace-0.4.0}/PKG-INFO +7 -2
- {contexttrace-0.3.0 → contexttrace-0.4.0}/README.md +6 -1
- contexttrace-0.4.0/contexttrace/_version.py +1 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/cli.py +64 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/verify/__init__.py +4 -0
- contexttrace-0.4.0/contexttrace/verify/compare.py +445 -0
- contexttrace-0.4.0/contexttrace/verify/compare_report.py +386 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace.egg-info/SOURCES.txt +2 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/pyproject.toml +1 -1
- contexttrace-0.3.0/contexttrace/_version.py +0 -1
- {contexttrace-0.3.0 → contexttrace-0.4.0}/MANIFEST.in +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/__init__.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/client.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/config.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/demo.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/demo_data.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/endpoint_eval.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/errors.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/evaluator.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/integrations/__init__.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/integrations/fastapi.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/integrations/langchain.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/integrations/langgraph.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/integrations/llamaindex.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/integrations/opentelemetry.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/local.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/py.typed +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/regression.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/reliability.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/report.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/storage/__init__.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/storage/sqlite_store.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/thresholds.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/transport.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/verify/abstention.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/verify/benchmark.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/verify/citations.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/verify/claims.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/verify/demos.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/verify/evidence.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/verify/external_benchmark_cases.json +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/verify/facts.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/verify/real_benchmark_cases.json +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/verify/report.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/verify/root_cause.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/verify/runner.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/verify/schema.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/verify/spans.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/verify/verdicts.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/viewer.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/setup.cfg +0 -0
- {contexttrace-0.3.0 → contexttrace-0.4.0}/setup.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: contexttrace
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: Local-first SDK and CLI for RAG and agent reliability tracing, citation checks, and failure diagnosis.
|
|
5
5
|
Author: ContextTrace contributors
|
|
6
6
|
License: MIT
|
|
@@ -147,6 +147,9 @@ contexttrace verify trace.json --fail-on unsupported --fail-on citation_mismatch
|
|
|
147
147
|
contexttrace verify-benchmark --mode semantic
|
|
148
148
|
contexttrace verify-benchmark --mode semantic --report
|
|
149
149
|
contexttrace verify-benchmark --case-set external --mode semantic --report
|
|
150
|
+
contexttrace compare baseline.json current.json
|
|
151
|
+
contexttrace compare baseline.json current.json --report
|
|
152
|
+
contexttrace compare baseline.json current.json --fail-on new_failure
|
|
150
153
|
```
|
|
151
154
|
|
|
152
155
|
Input requires `query`, `answer`, and `contexts` with `id` and `text`. Optional `citations` are checked to catch cited sources that do not actually support the matched claim.
|
|
@@ -159,7 +162,9 @@ Verification output includes evidence span offsets, stable span hashes, multiple
|
|
|
159
162
|
|
|
160
163
|
ContextTrace verifies whether each generated claim is actually supported by retrieved evidence. Instead of only showing a trace or a score, it tells you where the evidence chain broke: unsupported claim, citation mismatch, retrieval miss, answer overreach, conflicting context, or should-have-abstained.
|
|
161
164
|
|
|
162
|
-
|
|
165
|
+
Use `contexttrace compare baseline.json current.json` to diff two portable traces or saved `verify --json` outputs. It reports support-rate deltas, new unsupported claims, citation regressions, should-abstain flips, and new root causes, with `--fail-on` gates for CI.
|
|
166
|
+
|
|
167
|
+
The v0.4.0 verifier uses local lexical heuristics by default. Claim extraction is rule-based, contradiction detection is conservative, and semantic or LLM-judge support can be added later.
|
|
163
168
|
|
|
164
169
|
## What It Catches
|
|
165
170
|
|
|
@@ -90,6 +90,9 @@ contexttrace verify trace.json --fail-on unsupported --fail-on citation_mismatch
|
|
|
90
90
|
contexttrace verify-benchmark --mode semantic
|
|
91
91
|
contexttrace verify-benchmark --mode semantic --report
|
|
92
92
|
contexttrace verify-benchmark --case-set external --mode semantic --report
|
|
93
|
+
contexttrace compare baseline.json current.json
|
|
94
|
+
contexttrace compare baseline.json current.json --report
|
|
95
|
+
contexttrace compare baseline.json current.json --fail-on new_failure
|
|
93
96
|
```
|
|
94
97
|
|
|
95
98
|
Input requires `query`, `answer`, and `contexts` with `id` and `text`. Optional `citations` are checked to catch cited sources that do not actually support the matched claim.
|
|
@@ -102,7 +105,9 @@ Verification output includes evidence span offsets, stable span hashes, multiple
|
|
|
102
105
|
|
|
103
106
|
ContextTrace verifies whether each generated claim is actually supported by retrieved evidence. Instead of only showing a trace or a score, it tells you where the evidence chain broke: unsupported claim, citation mismatch, retrieval miss, answer overreach, conflicting context, or should-have-abstained.
|
|
104
107
|
|
|
105
|
-
|
|
108
|
+
Use `contexttrace compare baseline.json current.json` to diff two portable traces or saved `verify --json` outputs. It reports support-rate deltas, new unsupported claims, citation regressions, should-abstain flips, and new root causes, with `--fail-on` gates for CI.
|
|
109
|
+
|
|
110
|
+
The v0.4.0 verifier uses local lexical heuristics by default. Claim extraction is rule-based, contradiction detection is conservative, and semantic or LLM-judge support can be added later.
|
|
106
111
|
|
|
107
112
|
## What It Catches
|
|
108
113
|
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.4.0"
|
|
@@ -24,12 +24,15 @@ from contexttrace.storage import SQLiteTraceStore
|
|
|
24
24
|
from contexttrace.thresholds import parse_thresholds, threshold_failures
|
|
25
25
|
from contexttrace.verify import (
|
|
26
26
|
VerificationInputError,
|
|
27
|
+
compare_failures,
|
|
28
|
+
compare_trace_files,
|
|
27
29
|
list_verify_demos,
|
|
28
30
|
load_trace_file,
|
|
29
31
|
load_verify_demo,
|
|
30
32
|
verify_trace,
|
|
31
33
|
)
|
|
32
34
|
from contexttrace.verify.benchmark import run_verify_benchmark, write_verify_benchmark_report
|
|
35
|
+
from contexttrace.verify.compare_report import CompareReportGenerator
|
|
33
36
|
from contexttrace.verify.report import VerifyReportGenerator
|
|
34
37
|
from contexttrace.viewer import serve_viewer
|
|
35
38
|
|
|
@@ -340,6 +343,67 @@ def verify_benchmark_command(mode: str, case_set: str, json_output: bool, report
|
|
|
340
343
|
return 0
|
|
341
344
|
|
|
342
345
|
|
|
346
|
+
@cli.command("compare")
|
|
347
|
+
@click.argument("baseline_json")
|
|
348
|
+
@click.argument("current_json")
|
|
349
|
+
@click.option("--json", "json_output", is_flag=True, help="Print the full comparison result as JSON.")
|
|
350
|
+
@click.option("--report", is_flag=True, help="Generate a local HTML regression report.")
|
|
351
|
+
@click.option("--out", default=None, help="HTML report path. Implies --report when provided.")
|
|
352
|
+
@click.option("--mode", default="lexical", show_default=True, type=click.Choice(["lexical", "semantic"]), help="Evidence scoring mode for raw trace inputs.")
|
|
353
|
+
@click.option("--fail-on", multiple=True, help="Fail on new_failure, new_unsupported, new_citation_mismatch, should_abstain_flip, support_rate_drop, new_root_cause, or any_regression.")
|
|
354
|
+
def compare_command(
|
|
355
|
+
baseline_json: str,
|
|
356
|
+
current_json: str,
|
|
357
|
+
json_output: bool,
|
|
358
|
+
report: bool,
|
|
359
|
+
out: Optional[str],
|
|
360
|
+
mode: str,
|
|
361
|
+
fail_on: tuple[str, ...],
|
|
362
|
+
) -> int:
|
|
363
|
+
"""Compare two portable RAG traces or verification JSON outputs."""
|
|
364
|
+
|
|
365
|
+
try:
|
|
366
|
+
result = compare_trace_files(baseline_json, current_json, mode=mode)
|
|
367
|
+
except VerificationInputError as exc:
|
|
368
|
+
raise click.ClickException(str(exc)) from exc
|
|
369
|
+
|
|
370
|
+
written_report = None
|
|
371
|
+
if report or out:
|
|
372
|
+
default_name = "%s_vs_%s_compare.html" % (Path(baseline_json).stem, Path(current_json).stem)
|
|
373
|
+
output_path = out or str(Path(".contexttrace") / "reports" / default_name)
|
|
374
|
+
written_report = CompareReportGenerator().generate(result, path=output_path)
|
|
375
|
+
|
|
376
|
+
fail_messages = compare_failures(result, fail_on)
|
|
377
|
+
if json_output:
|
|
378
|
+
if written_report:
|
|
379
|
+
click.echo("Report: %s" % written_report, err=True)
|
|
380
|
+
click.echo(json.dumps(result, indent=2))
|
|
381
|
+
for message in fail_messages:
|
|
382
|
+
click.echo("Comparison failed: %s" % message, err=True)
|
|
383
|
+
return 1 if fail_messages else 0
|
|
384
|
+
|
|
385
|
+
summary = result["summary"]
|
|
386
|
+
click.echo("Regression: %s" % str(summary["regression"]).lower())
|
|
387
|
+
click.echo("Support rate: %.3f -> %.3f (%+.3f)" % (
|
|
388
|
+
float(summary.get("support_rate_before") or 0.0),
|
|
389
|
+
float(summary.get("support_rate_after") or 0.0),
|
|
390
|
+
float(summary.get("support_rate_delta") or 0.0),
|
|
391
|
+
))
|
|
392
|
+
click.echo("Unsupported claim rate delta: %+.3f" % float(summary.get("unsupported_claim_rate_delta") or 0.0))
|
|
393
|
+
click.echo("Citation mismatch delta: %+d" % int(summary.get("citation_mismatch_delta") or 0))
|
|
394
|
+
click.echo("New failures: %s" % summary["new_failures"])
|
|
395
|
+
click.echo("Resolved failures: %s" % summary["resolved_failures"])
|
|
396
|
+
click.echo("Added claims: %s" % summary["added_claims"])
|
|
397
|
+
click.echo("Removed claims: %s" % summary["removed_claims"])
|
|
398
|
+
click.echo("Changed claims: %s" % summary["changed_claims"])
|
|
399
|
+
click.echo("New root causes: %s" % (", ".join(summary.get("new_root_causes") or []) or "none"))
|
|
400
|
+
if written_report:
|
|
401
|
+
click.echo("Report: %s" % written_report)
|
|
402
|
+
for message in fail_messages:
|
|
403
|
+
click.echo("Comparison failed: %s" % message, err=True)
|
|
404
|
+
return 1 if fail_messages else 0
|
|
405
|
+
|
|
406
|
+
|
|
343
407
|
def _write_verify_report(
|
|
344
408
|
result: dict,
|
|
345
409
|
trace: object,
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
from contexttrace.verify.runner import verify_trace, verify_trace_file
|
|
2
|
+
from contexttrace.verify.compare import compare_failures, compare_trace_files, compare_verifications
|
|
2
3
|
from contexttrace.verify.schema import (
|
|
3
4
|
RAGTrace,
|
|
4
5
|
TraceCitation,
|
|
@@ -13,6 +14,9 @@ __all__ = [
|
|
|
13
14
|
"TraceCitation",
|
|
14
15
|
"TraceContext",
|
|
15
16
|
"VerificationInputError",
|
|
17
|
+
"compare_failures",
|
|
18
|
+
"compare_trace_files",
|
|
19
|
+
"compare_verifications",
|
|
16
20
|
"list_verify_demos",
|
|
17
21
|
"load_trace_file",
|
|
18
22
|
"load_verify_demo",
|
|
@@ -0,0 +1,445 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from contexttrace.verify.evidence import lexical_score
|
|
8
|
+
from contexttrace.verify.runner import verify_trace
|
|
9
|
+
from contexttrace.verify.schema import VerificationInputError, load_trace
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
FAILURE_VERDICTS = {"partially_supported", "unsupported", "contradicted", "unverifiable"}
|
|
13
|
+
BAD_CITATIONS = {
|
|
14
|
+
"cited_source_missing",
|
|
15
|
+
"cited_source_does_not_support_claim",
|
|
16
|
+
"claim_supported_by_different_source",
|
|
17
|
+
}
|
|
18
|
+
NO_ROOT_CAUSE = "no_failure_detected"
|
|
19
|
+
MATCH_THRESHOLD = 0.58
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def compare_trace_files(
|
|
23
|
+
baseline_path: str | Path,
|
|
24
|
+
current_path: str | Path,
|
|
25
|
+
*,
|
|
26
|
+
mode: str = "lexical",
|
|
27
|
+
) -> dict[str, Any]:
|
|
28
|
+
baseline = load_compare_input(baseline_path, mode=mode)
|
|
29
|
+
current = load_compare_input(current_path, mode=mode)
|
|
30
|
+
return compare_verifications(baseline, current, mode=mode)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def load_compare_input(path: str | Path, *, mode: str = "lexical") -> dict[str, Any]:
|
|
34
|
+
input_path = Path(path)
|
|
35
|
+
try:
|
|
36
|
+
payload = json.loads(input_path.read_text(encoding="utf-8"))
|
|
37
|
+
except OSError as exc:
|
|
38
|
+
raise VerificationInputError("Could not read compare input %s: %s" % (input_path, exc)) from exc
|
|
39
|
+
except json.JSONDecodeError as exc:
|
|
40
|
+
raise VerificationInputError(
|
|
41
|
+
"Invalid JSON in %s at line %s column %s: %s"
|
|
42
|
+
% (input_path, exc.lineno, exc.colno, exc.msg)
|
|
43
|
+
) from exc
|
|
44
|
+
|
|
45
|
+
if _looks_like_verification_result(payload):
|
|
46
|
+
return _normalize_verified_result(payload, source=str(input_path))
|
|
47
|
+
|
|
48
|
+
trace = load_trace(payload, source=str(input_path))
|
|
49
|
+
result = verify_trace(trace, mode=mode)
|
|
50
|
+
result.setdefault("metadata", {})
|
|
51
|
+
result["metadata"] = {
|
|
52
|
+
**dict(result.get("metadata") or {}),
|
|
53
|
+
"compare_input": str(input_path),
|
|
54
|
+
"compare_input_type": "raw_trace",
|
|
55
|
+
}
|
|
56
|
+
return result
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def compare_verifications(
|
|
60
|
+
baseline: dict[str, Any],
|
|
61
|
+
current: dict[str, Any],
|
|
62
|
+
*,
|
|
63
|
+
mode: str = "lexical",
|
|
64
|
+
) -> dict[str, Any]:
|
|
65
|
+
baseline_claims = list(baseline.get("claims") or [])
|
|
66
|
+
current_claims = list(current.get("claims") or [])
|
|
67
|
+
matches = _match_claims(baseline_claims, current_claims, mode=mode)
|
|
68
|
+
|
|
69
|
+
changes = []
|
|
70
|
+
matched_baseline = set()
|
|
71
|
+
matched_current = set()
|
|
72
|
+
for baseline_index, current_index, score in matches:
|
|
73
|
+
matched_baseline.add(baseline_index)
|
|
74
|
+
matched_current.add(current_index)
|
|
75
|
+
change = _matched_change(
|
|
76
|
+
baseline_claims[baseline_index],
|
|
77
|
+
current_claims[current_index],
|
|
78
|
+
match_score=score,
|
|
79
|
+
)
|
|
80
|
+
if change["status"] != "unchanged":
|
|
81
|
+
changes.append(change)
|
|
82
|
+
|
|
83
|
+
for index, claim in enumerate(current_claims):
|
|
84
|
+
if index in matched_current:
|
|
85
|
+
continue
|
|
86
|
+
changes.append(_single_change("added_failure" if _is_failure(claim) else "added_claim", after=claim))
|
|
87
|
+
|
|
88
|
+
for index, claim in enumerate(baseline_claims):
|
|
89
|
+
if index in matched_baseline:
|
|
90
|
+
continue
|
|
91
|
+
changes.append(_single_change("removed_failure" if _is_failure(claim) else "removed_claim", before=claim))
|
|
92
|
+
|
|
93
|
+
changes = sorted(changes, key=_change_sort_key)
|
|
94
|
+
summary = _summary(baseline, current, changes)
|
|
95
|
+
return {
|
|
96
|
+
"mode": mode,
|
|
97
|
+
"summary": summary,
|
|
98
|
+
"changes": changes,
|
|
99
|
+
"baseline": _run_snapshot(baseline),
|
|
100
|
+
"current": _run_snapshot(current),
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def compare_failures(result: dict[str, Any], fail_on: tuple[str, ...]) -> list[str]:
|
|
105
|
+
if not fail_on:
|
|
106
|
+
return []
|
|
107
|
+
summary = result.get("summary") or {}
|
|
108
|
+
messages = []
|
|
109
|
+
for raw_rule in fail_on:
|
|
110
|
+
rule = raw_rule.strip().lower().replace("-", "_")
|
|
111
|
+
if rule == "new_failure" and int(summary.get("new_failures") or 0) > 0:
|
|
112
|
+
messages.append("new verification failure detected")
|
|
113
|
+
elif rule == "new_unsupported" and int(summary.get("new_unsupported") or 0) > 0:
|
|
114
|
+
messages.append("new unsupported claim detected")
|
|
115
|
+
elif rule == "new_citation_mismatch" and int(summary.get("new_citation_mismatches") or 0) > 0:
|
|
116
|
+
messages.append("new citation mismatch detected")
|
|
117
|
+
elif rule == "should_abstain_flip" and bool(summary.get("should_abstain_regressed")):
|
|
118
|
+
messages.append("should-abstain changed from false to true")
|
|
119
|
+
elif rule == "support_rate_drop" and float(summary.get("support_rate_delta") or 0.0) < 0:
|
|
120
|
+
messages.append("support rate dropped")
|
|
121
|
+
elif rule in {"new_root_cause", "root_cause_regression"} and summary.get("new_root_causes"):
|
|
122
|
+
messages.append("new root cause detected")
|
|
123
|
+
elif rule == "any_regression" and bool(summary.get("regression")):
|
|
124
|
+
messages.append("verification regression detected")
|
|
125
|
+
elif rule not in {
|
|
126
|
+
"new_failure",
|
|
127
|
+
"new_unsupported",
|
|
128
|
+
"new_citation_mismatch",
|
|
129
|
+
"should_abstain_flip",
|
|
130
|
+
"support_rate_drop",
|
|
131
|
+
"new_root_cause",
|
|
132
|
+
"root_cause_regression",
|
|
133
|
+
"any_regression",
|
|
134
|
+
}:
|
|
135
|
+
messages.append("unknown --fail-on rule %s" % raw_rule)
|
|
136
|
+
return messages
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _looks_like_verification_result(payload: Any) -> bool:
|
|
140
|
+
return (
|
|
141
|
+
isinstance(payload, dict)
|
|
142
|
+
and isinstance(payload.get("summary"), dict)
|
|
143
|
+
and isinstance(payload.get("claims"), list)
|
|
144
|
+
)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _normalize_verified_result(payload: dict[str, Any], *, source: str) -> dict[str, Any]:
|
|
148
|
+
result = dict(payload)
|
|
149
|
+
result.setdefault("metadata", {})
|
|
150
|
+
result["metadata"] = {
|
|
151
|
+
**dict(result.get("metadata") or {}),
|
|
152
|
+
"compare_input": source,
|
|
153
|
+
"compare_input_type": "verification_result",
|
|
154
|
+
}
|
|
155
|
+
return result
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def _match_claims(
|
|
159
|
+
baseline_claims: list[dict[str, Any]],
|
|
160
|
+
current_claims: list[dict[str, Any]],
|
|
161
|
+
*,
|
|
162
|
+
mode: str,
|
|
163
|
+
) -> list[tuple[int, int, float]]:
|
|
164
|
+
candidates = []
|
|
165
|
+
for baseline_index, baseline_claim in enumerate(baseline_claims):
|
|
166
|
+
for current_index, current_claim in enumerate(current_claims):
|
|
167
|
+
score = _claim_similarity(
|
|
168
|
+
str(baseline_claim.get("claim") or ""),
|
|
169
|
+
str(current_claim.get("claim") or ""),
|
|
170
|
+
mode=mode,
|
|
171
|
+
)
|
|
172
|
+
if score >= MATCH_THRESHOLD:
|
|
173
|
+
candidates.append((score, baseline_index, current_index))
|
|
174
|
+
|
|
175
|
+
matches = []
|
|
176
|
+
used_baseline = set()
|
|
177
|
+
used_current = set()
|
|
178
|
+
for score, baseline_index, current_index in sorted(candidates, reverse=True):
|
|
179
|
+
if baseline_index in used_baseline or current_index in used_current:
|
|
180
|
+
continue
|
|
181
|
+
used_baseline.add(baseline_index)
|
|
182
|
+
used_current.add(current_index)
|
|
183
|
+
matches.append((baseline_index, current_index, score))
|
|
184
|
+
return matches
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def _claim_similarity(left: str, right: str, *, mode: str) -> float:
|
|
188
|
+
if _normalize_text(left) == _normalize_text(right):
|
|
189
|
+
return 1.0
|
|
190
|
+
forward, _ = lexical_score(left, right, mode=mode)
|
|
191
|
+
reverse, _ = lexical_score(right, left, mode=mode)
|
|
192
|
+
return max(forward, reverse)
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def _matched_change(
|
|
196
|
+
before_claim: dict[str, Any],
|
|
197
|
+
after_claim: dict[str, Any],
|
|
198
|
+
*,
|
|
199
|
+
match_score: float,
|
|
200
|
+
) -> dict[str, Any]:
|
|
201
|
+
before_failure = _is_failure(before_claim)
|
|
202
|
+
after_failure = _is_failure(after_claim)
|
|
203
|
+
before_severity = _severity(before_claim)
|
|
204
|
+
after_severity = _severity(after_claim)
|
|
205
|
+
before_citation = _citation_severity(before_claim)
|
|
206
|
+
after_citation = _citation_severity(after_claim)
|
|
207
|
+
before_root = _root_label(before_claim)
|
|
208
|
+
after_root = _root_label(after_claim)
|
|
209
|
+
|
|
210
|
+
if not before_failure and after_failure:
|
|
211
|
+
status = "new_failure"
|
|
212
|
+
elif before_failure and not after_failure:
|
|
213
|
+
status = "resolved_failure"
|
|
214
|
+
elif after_severity > before_severity:
|
|
215
|
+
status = "verdict_regressed"
|
|
216
|
+
elif after_severity < before_severity:
|
|
217
|
+
status = "verdict_improved"
|
|
218
|
+
elif after_citation > before_citation:
|
|
219
|
+
status = "citation_regressed"
|
|
220
|
+
elif after_citation < before_citation:
|
|
221
|
+
status = "citation_improved"
|
|
222
|
+
elif before_root != after_root and after_root != NO_ROOT_CAUSE:
|
|
223
|
+
status = "root_cause_regressed"
|
|
224
|
+
elif before_root != after_root:
|
|
225
|
+
status = "root_cause_changed"
|
|
226
|
+
elif _context_id(before_claim) != _context_id(after_claim):
|
|
227
|
+
status = "source_changed"
|
|
228
|
+
elif _normalize_text(str(before_claim.get("claim") or "")) != _normalize_text(str(after_claim.get("claim") or "")):
|
|
229
|
+
status = "claim_changed"
|
|
230
|
+
else:
|
|
231
|
+
status = "unchanged"
|
|
232
|
+
|
|
233
|
+
return {
|
|
234
|
+
"status": status,
|
|
235
|
+
"claim": str(after_claim.get("claim") or before_claim.get("claim") or ""),
|
|
236
|
+
"match_score": round(match_score, 3),
|
|
237
|
+
"before": _claim_snapshot(before_claim),
|
|
238
|
+
"after": _claim_snapshot(after_claim),
|
|
239
|
+
"suggested_fix": _suggested_fix(after_claim, status=status),
|
|
240
|
+
}
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def _single_change(
|
|
244
|
+
status: str,
|
|
245
|
+
*,
|
|
246
|
+
before: dict[str, Any] | None = None,
|
|
247
|
+
after: dict[str, Any] | None = None,
|
|
248
|
+
) -> dict[str, Any]:
|
|
249
|
+
claim = after or before or {}
|
|
250
|
+
return {
|
|
251
|
+
"status": status,
|
|
252
|
+
"claim": str(claim.get("claim") or ""),
|
|
253
|
+
"match_score": None,
|
|
254
|
+
"before": _claim_snapshot(before) if before else None,
|
|
255
|
+
"after": _claim_snapshot(after) if after else None,
|
|
256
|
+
"suggested_fix": _suggested_fix(claim, status=status),
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def _summary(
|
|
261
|
+
baseline: dict[str, Any],
|
|
262
|
+
current: dict[str, Any],
|
|
263
|
+
changes: list[dict[str, Any]],
|
|
264
|
+
) -> dict[str, Any]:
|
|
265
|
+
baseline_summary = dict(baseline.get("summary") or {})
|
|
266
|
+
current_summary = dict(current.get("summary") or {})
|
|
267
|
+
new_failures = [change for change in changes if change["status"] in {"new_failure", "added_failure", "verdict_regressed", "citation_regressed", "root_cause_regressed"}]
|
|
268
|
+
resolved_failures = [change for change in changes if change["status"] in {"resolved_failure", "removed_failure", "verdict_improved", "citation_improved"}]
|
|
269
|
+
new_unsupported = [
|
|
270
|
+
change
|
|
271
|
+
for change in new_failures
|
|
272
|
+
if ((change.get("after") or {}).get("verdict") in {"unsupported", "contradicted"})
|
|
273
|
+
]
|
|
274
|
+
new_citations = [
|
|
275
|
+
change
|
|
276
|
+
for change in new_failures
|
|
277
|
+
if _citation_status_from_snapshot(change.get("after")) in BAD_CITATIONS
|
|
278
|
+
]
|
|
279
|
+
before_abstain = bool((baseline.get("abstention") or {}).get("should_abstain") or baseline_summary.get("should_abstain"))
|
|
280
|
+
after_abstain = bool((current.get("abstention") or {}).get("should_abstain") or current_summary.get("should_abstain"))
|
|
281
|
+
support_delta = _delta(current_summary.get("support_rate"), baseline_summary.get("support_rate"))
|
|
282
|
+
unsupported_delta = _delta(current_summary.get("unsupported_claim_rate"), baseline_summary.get("unsupported_claim_rate"))
|
|
283
|
+
citation_delta = int(current_summary.get("citation_mismatches") or 0) - int(baseline_summary.get("citation_mismatches") or 0)
|
|
284
|
+
new_root_causes = sorted(
|
|
285
|
+
{
|
|
286
|
+
_root_from_snapshot(change.get("after"))
|
|
287
|
+
for change in new_failures
|
|
288
|
+
if _root_from_snapshot(change.get("after")) != NO_ROOT_CAUSE
|
|
289
|
+
}
|
|
290
|
+
)
|
|
291
|
+
resolved_root_causes = sorted(
|
|
292
|
+
{
|
|
293
|
+
_root_from_snapshot(change.get("before"))
|
|
294
|
+
for change in resolved_failures
|
|
295
|
+
if _root_from_snapshot(change.get("before")) != NO_ROOT_CAUSE
|
|
296
|
+
}
|
|
297
|
+
)
|
|
298
|
+
regression = bool(
|
|
299
|
+
new_failures
|
|
300
|
+
or support_delta < 0
|
|
301
|
+
or unsupported_delta > 0
|
|
302
|
+
or citation_delta > 0
|
|
303
|
+
or (not before_abstain and after_abstain)
|
|
304
|
+
)
|
|
305
|
+
return {
|
|
306
|
+
"regression": regression,
|
|
307
|
+
"improved": bool(resolved_failures and not regression),
|
|
308
|
+
"support_rate_before": _number(baseline_summary.get("support_rate")),
|
|
309
|
+
"support_rate_after": _number(current_summary.get("support_rate")),
|
|
310
|
+
"support_rate_delta": support_delta,
|
|
311
|
+
"unsupported_claim_rate_delta": unsupported_delta,
|
|
312
|
+
"citation_mismatch_delta": citation_delta,
|
|
313
|
+
"should_abstain_before": before_abstain,
|
|
314
|
+
"should_abstain_after": after_abstain,
|
|
315
|
+
"should_abstain_changed": before_abstain != after_abstain,
|
|
316
|
+
"should_abstain_regressed": (not before_abstain and after_abstain),
|
|
317
|
+
"new_failures": len(new_failures),
|
|
318
|
+
"resolved_failures": len(resolved_failures),
|
|
319
|
+
"new_unsupported": len(new_unsupported),
|
|
320
|
+
"new_citation_mismatches": len(new_citations),
|
|
321
|
+
"added_claims": len([change for change in changes if change["status"] in {"added_claim", "added_failure"}]),
|
|
322
|
+
"removed_claims": len([change for change in changes if change["status"] in {"removed_claim", "removed_failure"}]),
|
|
323
|
+
"changed_claims": len(changes),
|
|
324
|
+
"new_root_causes": new_root_causes,
|
|
325
|
+
"resolved_root_causes": resolved_root_causes,
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def _run_snapshot(result: dict[str, Any]) -> dict[str, Any]:
|
|
330
|
+
return {
|
|
331
|
+
"query": result.get("query"),
|
|
332
|
+
"answer": result.get("answer"),
|
|
333
|
+
"summary": result.get("summary") or {},
|
|
334
|
+
"abstention": result.get("abstention") or {},
|
|
335
|
+
"metadata": result.get("metadata") or {},
|
|
336
|
+
}
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
def _claim_snapshot(claim: dict[str, Any] | None) -> dict[str, Any] | None:
|
|
340
|
+
if claim is None:
|
|
341
|
+
return None
|
|
342
|
+
root = claim.get("root_cause") or {}
|
|
343
|
+
return {
|
|
344
|
+
"claim_id": claim.get("claim_id"),
|
|
345
|
+
"claim": claim.get("claim"),
|
|
346
|
+
"verdict": claim.get("verdict"),
|
|
347
|
+
"confidence": claim.get("confidence"),
|
|
348
|
+
"best_context_id": claim.get("best_context_id"),
|
|
349
|
+
"citation_status": claim.get("citation_status"),
|
|
350
|
+
"root_cause": root.get("label") if isinstance(root, dict) else None,
|
|
351
|
+
"missing_fact": root.get("missing_fact") if isinstance(root, dict) else None,
|
|
352
|
+
"closest_evidence": root.get("closest_evidence") if isinstance(root, dict) else claim.get("evidence"),
|
|
353
|
+
"suggested_fix": root.get("suggested_fix") if isinstance(root, dict) else None,
|
|
354
|
+
}
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
def _is_failure(claim: dict[str, Any]) -> bool:
|
|
358
|
+
return (
|
|
359
|
+
str(claim.get("verdict") or "") in FAILURE_VERDICTS
|
|
360
|
+
or str(claim.get("citation_status") or "") in BAD_CITATIONS
|
|
361
|
+
or _root_label(claim) != NO_ROOT_CAUSE
|
|
362
|
+
)
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
def _severity(claim: dict[str, Any]) -> int:
|
|
366
|
+
verdict = str(claim.get("verdict") or "")
|
|
367
|
+
if verdict in {"unsupported", "contradicted"}:
|
|
368
|
+
return 3
|
|
369
|
+
if verdict in {"partially_supported", "unverifiable"}:
|
|
370
|
+
return 2
|
|
371
|
+
return 0
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
def _citation_severity(claim: dict[str, Any]) -> int:
|
|
375
|
+
return 1 if str(claim.get("citation_status") or "") in BAD_CITATIONS else 0
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
def _root_label(claim: dict[str, Any]) -> str:
|
|
379
|
+
root = claim.get("root_cause") or {}
|
|
380
|
+
if isinstance(root, dict):
|
|
381
|
+
return str(root.get("label") or NO_ROOT_CAUSE)
|
|
382
|
+
return NO_ROOT_CAUSE
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
def _context_id(claim: dict[str, Any]) -> str:
|
|
386
|
+
return str(claim.get("best_context_id") or "")
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
def _root_from_snapshot(snapshot: dict[str, Any] | None) -> str:
|
|
390
|
+
if not snapshot:
|
|
391
|
+
return NO_ROOT_CAUSE
|
|
392
|
+
return str(snapshot.get("root_cause") or NO_ROOT_CAUSE)
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
def _citation_status_from_snapshot(snapshot: dict[str, Any] | None) -> str:
|
|
396
|
+
if not snapshot:
|
|
397
|
+
return ""
|
|
398
|
+
return str(snapshot.get("citation_status") or "")
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
def _suggested_fix(claim: dict[str, Any], *, status: str) -> str:
|
|
402
|
+
root = claim.get("root_cause") or {}
|
|
403
|
+
if isinstance(root, dict) and root.get("suggested_fix"):
|
|
404
|
+
return str(root["suggested_fix"])
|
|
405
|
+
if status in {"added_failure", "new_failure", "verdict_regressed"}:
|
|
406
|
+
return "Inspect the new claim and remove unsupported details or retrieve supporting evidence."
|
|
407
|
+
if status == "citation_regressed":
|
|
408
|
+
return "Regenerate claim-level citations and require cited source IDs to support the claim."
|
|
409
|
+
if status == "source_changed":
|
|
410
|
+
return "Check whether the new retrieved source is intentional and still supports the claim."
|
|
411
|
+
return "No automatic fix suggested."
|
|
412
|
+
|
|
413
|
+
|
|
414
|
+
def _change_sort_key(change: dict[str, Any]) -> tuple[int, str]:
|
|
415
|
+
priority = {
|
|
416
|
+
"added_failure": 0,
|
|
417
|
+
"new_failure": 1,
|
|
418
|
+
"verdict_regressed": 2,
|
|
419
|
+
"citation_regressed": 3,
|
|
420
|
+
"root_cause_regressed": 4,
|
|
421
|
+
"resolved_failure": 5,
|
|
422
|
+
"verdict_improved": 6,
|
|
423
|
+
"citation_improved": 7,
|
|
424
|
+
"removed_failure": 8,
|
|
425
|
+
"added_claim": 8,
|
|
426
|
+
"removed_claim": 9,
|
|
427
|
+
"source_changed": 10,
|
|
428
|
+
"claim_changed": 11,
|
|
429
|
+
}
|
|
430
|
+
return (priority.get(str(change.get("status")), 99), str(change.get("claim") or ""))
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
def _delta(current: Any, baseline: Any) -> float:
    """Return ``current - baseline`` rounded to three decimals.

    Both operands are first coerced with ``_number``.
    """
    difference = _number(current) - _number(baseline)
    return round(difference, 3)
|
|
435
|
+
|
|
436
|
+
|
|
437
|
+
def _number(value: Any) -> float:
|
|
438
|
+
try:
|
|
439
|
+
return round(float(value), 3)
|
|
440
|
+
except (TypeError, ValueError):
|
|
441
|
+
return 0.0
|
|
442
|
+
|
|
443
|
+
|
|
444
|
+
def _normalize_text(text: str) -> str:
|
|
445
|
+
return " ".join(str(text or "").lower().strip().strip(".!?").split())
|
|
@@ -0,0 +1,386 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from html import escape
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class CompareReportGenerator:
    """Render a comparison result dict as a standalone HTML regression report."""

    def generate(self, result: dict[str, Any], *, path: str) -> str:
        """Write the rendered report to ``path`` and return that path as a string.

        Missing parent directories are created first; the file is written as
        UTF-8 text.
        """
        destination = Path(path)
        destination.parent.mkdir(parents=True, exist_ok=True)
        document = self.render(result)
        destination.write_text(document, encoding="utf-8")
        return str(destination)

    def render(self, result: dict[str, Any]) -> str:
        """Fill ``HTML_TEMPLATE`` from ``result`` and return the HTML text.

        Reads the ``summary``, ``changes``, ``mode``, ``baseline`` and
        ``current`` keys; missing or falsy values fall back to empty
        containers (and ``mode`` to "lexical").
        """
        summary = result.get("summary") or {}
        changes = list(result.get("changes") or [])

        # Statuses listed under "New Failures" and "Resolved Failures".
        regression_statuses = {
            "added_failure",
            "new_failure",
            "verdict_regressed",
            "citation_regressed",
            "root_cause_regressed",
        }
        improvement_statuses = {
            "resolved_failure",
            "removed_failure",
            "verdict_improved",
            "citation_improved",
        }

        fields = {
            # The banner turns red as soon as the summary flags a regression.
            "verdict_class": "bad" if summary.get("regression") else "ok",
            "regression": escape(_string(summary.get("regression"))),
            "mode": escape(_string(result.get("mode") or "lexical")),
            "summary_cards": _summary_cards(summary),
            "change_rows": _change_rows(changes),
            "new_failures": _change_cards(
                changes,
                regression_statuses,
                empty="No new claim-level verification failures were detected.",
            ),
            "resolved_failures": _change_cards(
                changes,
                improvement_statuses,
                empty="No previously failing claims were resolved.",
            ),
            "root_changes": _root_changes(summary),
            "baseline_summary": _run_summary(result.get("baseline") or {}),
            "current_summary": _run_summary(result.get("current") or {}),
            # Escaped because the JSON is embedded inside a <pre> element.
            "raw_json": escape(json.dumps(_raw_summary(result), indent=2)),
        }
        return HTML_TEMPLATE.format(**fields)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _summary_cards(summary: dict[str, Any]) -> str:
    """Render the regression summary as a newline-joined series of card ``<div>``s.

    Delta metrics are formatted with ``_signed``; count metrics default to 0
    when absent. Labels and values are HTML-escaped.
    """
    card_template = (
        "\n"
        '<div class="card">\n'
        '<div class="label">{label}</div>\n'
        '<div class="value">{value}</div>\n'
        "</div>\n"
    )
    entries = [
        ("Regression", summary.get("regression")),
        ("Support Rate Delta", _signed(summary.get("support_rate_delta"))),
        ("Unsupported Rate Delta", _signed(summary.get("unsupported_claim_rate_delta"))),
        ("Citation Mismatch Delta", _signed(summary.get("citation_mismatch_delta"))),
        ("New Failures", summary.get("new_failures", 0)),
        ("Resolved Failures", summary.get("resolved_failures", 0)),
        ("New Unsupported", summary.get("new_unsupported", 0)),
        ("New Citation Mismatches", summary.get("new_citation_mismatches", 0)),
        ("Added Claims", summary.get("added_claims", 0)),
        ("Removed Claims", summary.get("removed_claims", 0)),
        ("Should Abstain Before", summary.get("should_abstain_before")),
        ("Should Abstain After", summary.get("should_abstain_after")),
    ]
    fragments = []
    for title, raw_value in entries:
        fragments.append(
            card_template.format(label=escape(title), value=escape(_string(raw_value)))
        )
    return "\n".join(fragments)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _change_rows(changes: list[dict[str, Any]]) -> str:
|
|
69
|
+
if not changes:
|
|
70
|
+
return "<tr><td colspan=\"7\" class=\"muted\">No claim-level changes detected.</td></tr>"
|
|
71
|
+
rows = []
|
|
72
|
+
for change in changes:
|
|
73
|
+
before = change.get("before") or {}
|
|
74
|
+
after = change.get("after") or {}
|
|
75
|
+
rows.append(
|
|
76
|
+
"""
|
|
77
|
+
<tr>
|
|
78
|
+
<td><span class="badge status-{status_class}">{status}</span></td>
|
|
79
|
+
<td>{claim}</td>
|
|
80
|
+
<td>{before_verdict}</td>
|
|
81
|
+
<td>{after_verdict}</td>
|
|
82
|
+
<td>{before_root}</td>
|
|
83
|
+
<td>{after_root}</td>
|
|
84
|
+
<td>{fix}</td>
|
|
85
|
+
</tr>
|
|
86
|
+
""".format(
|
|
87
|
+
status_class=escape(_css_token(change.get("status"))),
|
|
88
|
+
status=escape(_string(change.get("status"))),
|
|
89
|
+
claim=escape(_string(change.get("claim"))),
|
|
90
|
+
before_verdict=escape(_string(before.get("verdict") or "none")),
|
|
91
|
+
after_verdict=escape(_string(after.get("verdict") or "none")),
|
|
92
|
+
before_root=escape(_string(before.get("root_cause") or "none")),
|
|
93
|
+
after_root=escape(_string(after.get("root_cause") or "none")),
|
|
94
|
+
fix=escape(_string(change.get("suggested_fix"))),
|
|
95
|
+
)
|
|
96
|
+
)
|
|
97
|
+
return "\n".join(rows)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _change_cards(changes: list[dict[str, Any]], statuses: set[str], *, empty: str) -> str:
|
|
101
|
+
selected = [change for change in changes if change.get("status") in statuses]
|
|
102
|
+
if not selected:
|
|
103
|
+
return "<p class=\"muted\">%s</p>" % escape(empty)
|
|
104
|
+
return "\n".join(_change_card(change) for change in selected)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def _change_card(change: dict[str, Any]) -> str:
    """Render a single change as a detail ``<article>`` card.

    Shows the before/after verdict, citation status and root cause, plus the
    best context and closest evidence taken from the ``after`` snapshot (or
    from ``before`` when ``after`` is empty). All values are HTML-escaped and
    missing ones are shown as "none"; a missing match score is shown as "new".
    """
    card_template = (
        "\n"
        '<article class="item">\n'
        '<div class="item-meta">{status} | match {match_score}</div>\n'
        "<h3>{claim}</h3>\n"
        "<p><strong>Before:</strong> {before_verdict} | {before_citation} | {before_root}</p>\n"
        "<p><strong>After:</strong> {after_verdict} | {after_citation} | {after_root}</p>\n"
        "<p><strong>Best context:</strong> {context_id}</p>\n"
        "<p><strong>Closest evidence:</strong> {evidence}</p>\n"
        "<p><strong>Suggested fix:</strong> {fix}</p>\n"
        "</article>\n"
    )

    previous = change.get("before") or {}
    latest = change.get("after") or {}
    # Prefer the current run's snapshot for context/evidence details.
    shown = latest or previous

    score = change.get("match_score")
    if score is None:
        score = "new"

    values = {
        "status": escape(_string(change.get("status"))),
        "match_score": escape(_string(score)),
        "claim": escape(_string(change.get("claim"))),
        "before_verdict": escape(_string(previous.get("verdict") or "none")),
        "before_citation": escape(_string(previous.get("citation_status") or "none")),
        "before_root": escape(_string(previous.get("root_cause") or "none")),
        "after_verdict": escape(_string(latest.get("verdict") or "none")),
        "after_citation": escape(_string(latest.get("citation_status") or "none")),
        "after_root": escape(_string(latest.get("root_cause") or "none")),
        "context_id": escape(_string(shown.get("best_context_id") or "none")),
        "evidence": escape(_string(shown.get("closest_evidence") or "none")),
        "fix": escape(_string(change.get("suggested_fix"))),
    }
    return card_template.format(**values)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _root_changes(summary: dict[str, Any]) -> str:
|
|
138
|
+
new_roots = list(summary.get("new_root_causes") or [])
|
|
139
|
+
resolved_roots = list(summary.get("resolved_root_causes") or [])
|
|
140
|
+
if not new_roots and not resolved_roots:
|
|
141
|
+
return "<p class=\"muted\">No root-cause labels changed.</p>"
|
|
142
|
+
return """
|
|
143
|
+
<div class="grid-two">
|
|
144
|
+
<div class="item">
|
|
145
|
+
<div class="item-meta">New root causes</div>
|
|
146
|
+
<p>{new_roots}</p>
|
|
147
|
+
</div>
|
|
148
|
+
<div class="item">
|
|
149
|
+
<div class="item-meta">Resolved root causes</div>
|
|
150
|
+
<p>{resolved_roots}</p>
|
|
151
|
+
</div>
|
|
152
|
+
</div>
|
|
153
|
+
""".format(
|
|
154
|
+
new_roots=escape(", ".join(new_roots) or "none"),
|
|
155
|
+
resolved_roots=escape(", ".join(resolved_roots) or "none"),
|
|
156
|
+
)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def _run_summary(run: dict[str, Any]) -> str:
    """Render the headline fields of one verification run as card ``<div>``s.

    Values come from the run's ``query``, its ``summary`` mapping and the
    ``compare_input_type`` entry of its ``metadata`` mapping. Labels and
    values are HTML-escaped; missing values render as empty text.
    """
    card_template = (
        "\n"
        '<div class="card">\n'
        '<div class="label">{label}</div>\n'
        '<div class="small-value">{value}</div>\n'
        "</div>\n"
    )
    stats = run.get("summary") or {}
    meta = run.get("metadata") or {}
    entries = [
        ("Query", run.get("query")),
        ("Support Rate", stats.get("support_rate")),
        ("Unsupported Rate", stats.get("unsupported_claim_rate")),
        ("Citation Mismatches", stats.get("citation_mismatches")),
        ("Failure Type", stats.get("failure_type")),
        ("Primary Root Cause", stats.get("primary_root_cause")),
        ("Should Abstain", stats.get("should_abstain")),
        ("Input Type", meta.get("compare_input_type")),
    ]
    fragments = []
    for title, raw_value in entries:
        fragments.append(
            card_template.format(label=escape(title), value=escape(_string(raw_value)))
        )
    return "\n".join(fragments)
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _raw_summary(result: dict[str, Any]) -> dict[str, Any]:
|
|
184
|
+
return {
|
|
185
|
+
"mode": result.get("mode"),
|
|
186
|
+
"summary": result.get("summary"),
|
|
187
|
+
"changes": result.get("changes"),
|
|
188
|
+
"baseline": result.get("baseline"),
|
|
189
|
+
"current": result.get("current"),
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def _signed(value: Any) -> str:
    """Format a numeric delta rounded to three decimals with an explicit sign.

    Positive numbers get a leading "+"; zero and negative numbers are printed
    as-is. Values that ``float()`` rejects with ``TypeError`` or ``ValueError``
    are rendered as "0".
    """
    try:
        number = float(value)
    except (TypeError, ValueError):
        return "0"
    text = _string(round(number, 3))
    # The sign test uses the unrounded number, matching the rendered "+" prefix rule.
    return "+" + text if number > 0 else text
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def _css_token(value: Any) -> str:
    """Turn ``value`` into a token usable as a CSS class-name suffix.

    The text is lower-cased, underscores and spaces become hyphens, and any
    character that is neither alphanumeric nor a hyphen is dropped. An empty
    result is replaced by "unknown".
    """
    hyphenated = _string(value).lower().replace("_", "-").replace(" ", "-")
    kept = [char for char in hyphenated if char == "-" or char.isalnum()]
    return "".join(kept) or "unknown"
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def _string(value: Any) -> str:
|
|
209
|
+
if value is None:
|
|
210
|
+
return ""
|
|
211
|
+
return str(value)
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
HTML_TEMPLATE = """<!doctype html>
|
|
215
|
+
<html lang="en">
|
|
216
|
+
<head>
|
|
217
|
+
<meta charset="utf-8">
|
|
218
|
+
<meta name="viewport" content="width=device-width, initial-scale=1">
|
|
219
|
+
<title>ContextTrace Regression Report</title>
|
|
220
|
+
<style>
|
|
221
|
+
:root {{
|
|
222
|
+
color-scheme: light;
|
|
223
|
+
--bg: #f7f8fa;
|
|
224
|
+
--panel: #ffffff;
|
|
225
|
+
--subtle: #fbfcfe;
|
|
226
|
+
--text: #202832;
|
|
227
|
+
--muted: #657286;
|
|
228
|
+
--line: #d9e0ea;
|
|
229
|
+
--ok: #176f44;
|
|
230
|
+
--warn: #946200;
|
|
231
|
+
--bad: #b42318;
|
|
232
|
+
--accent: #2458d3;
|
|
233
|
+
}}
|
|
234
|
+
* {{ box-sizing: border-box; }}
|
|
235
|
+
body {{
|
|
236
|
+
margin: 0;
|
|
237
|
+
background: var(--bg);
|
|
238
|
+
color: var(--text);
|
|
239
|
+
font-family: Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
|
|
240
|
+
line-height: 1.5;
|
|
241
|
+
}}
|
|
242
|
+
main {{ max-width: 1160px; margin: 0 auto; padding: 32px 20px 56px; }}
|
|
243
|
+
header {{ border-bottom: 1px solid var(--line); margin-bottom: 22px; padding-bottom: 18px; }}
|
|
244
|
+
h1, h2, h3 {{ margin: 0; }}
|
|
245
|
+
h1 {{ font-size: 30px; }}
|
|
246
|
+
h2 {{ font-size: 18px; margin-bottom: 12px; }}
|
|
247
|
+
h3 {{ font-size: 15px; margin-bottom: 8px; }}
|
|
248
|
+
section {{
|
|
249
|
+
background: var(--panel);
|
|
250
|
+
border: 1px solid var(--line);
|
|
251
|
+
border-radius: 8px;
|
|
252
|
+
margin: 16px 0;
|
|
253
|
+
padding: 18px;
|
|
254
|
+
}}
|
|
255
|
+
.banner {{
|
|
256
|
+
border: 1px solid var(--line);
|
|
257
|
+
border-radius: 8px;
|
|
258
|
+
background: var(--subtle);
|
|
259
|
+
padding: 14px;
|
|
260
|
+
margin-top: 12px;
|
|
261
|
+
}}
|
|
262
|
+
.banner.ok {{ border-color: #a7dfbf; background: #edf9f1; }}
|
|
263
|
+
.banner.bad {{ border-color: #f3b1ac; background: #fff1f0; }}
|
|
264
|
+
.summary {{
|
|
265
|
+
display: grid;
|
|
266
|
+
gap: 12px;
|
|
267
|
+
grid-template-columns: repeat(auto-fit, minmax(155px, 1fr));
|
|
268
|
+
}}
|
|
269
|
+
.grid-two {{
|
|
270
|
+
display: grid;
|
|
271
|
+
gap: 12px;
|
|
272
|
+
grid-template-columns: repeat(auto-fit, minmax(260px, 1fr));
|
|
273
|
+
}}
|
|
274
|
+
.card, .item {{
|
|
275
|
+
border: 1px solid var(--line);
|
|
276
|
+
border-radius: 8px;
|
|
277
|
+
background: var(--subtle);
|
|
278
|
+
padding: 12px;
|
|
279
|
+
}}
|
|
280
|
+
.item + .item {{ margin-top: 10px; }}
|
|
281
|
+
.label, .item-meta {{
|
|
282
|
+
color: var(--muted);
|
|
283
|
+
font-size: 12px;
|
|
284
|
+
font-weight: 700;
|
|
285
|
+
text-transform: uppercase;
|
|
286
|
+
}}
|
|
287
|
+
.value {{ margin-top: 4px; font-size: 18px; overflow-wrap: anywhere; }}
|
|
288
|
+
.small-value {{ margin-top: 4px; font-size: 14px; overflow-wrap: anywhere; }}
|
|
289
|
+
.muted {{ color: var(--muted); }}
|
|
290
|
+
table {{ width: 100%; border-collapse: collapse; font-size: 14px; }}
|
|
291
|
+
th, td {{ border-bottom: 1px solid var(--line); padding: 10px; text-align: left; vertical-align: top; }}
|
|
292
|
+
th {{ color: var(--muted); font-size: 12px; text-transform: uppercase; }}
|
|
293
|
+
.badge {{
|
|
294
|
+
display: inline-block;
|
|
295
|
+
border-radius: 999px;
|
|
296
|
+
border: 1px solid var(--line);
|
|
297
|
+
background: #eef2f7;
|
|
298
|
+
padding: 3px 8px;
|
|
299
|
+
font-size: 12px;
|
|
300
|
+
font-weight: 700;
|
|
301
|
+
white-space: nowrap;
|
|
302
|
+
}}
|
|
303
|
+
.status-added-failure, .status-new-failure, .status-verdict-regressed,
|
|
304
|
+
.status-citation-regressed, .status-root-cause-regressed {{ color: var(--bad); background: #fdeceb; }}
|
|
305
|
+
.status-resolved-failure, .status-removed-failure, .status-verdict-improved,
|
|
306
|
+
.status-citation-improved {{ color: var(--ok); background: #e9f7ef; }}
|
|
307
|
+
.status-added-claim, .status-removed-claim, .status-source-changed,
|
|
308
|
+
.status-claim-changed, .status-root-cause-changed {{ color: var(--warn); background: #fff7df; }}
|
|
309
|
+
pre {{
|
|
310
|
+
margin: 0;
|
|
311
|
+
overflow: auto;
|
|
312
|
+
background: #101828;
|
|
313
|
+
color: #f8fafc;
|
|
314
|
+
border-radius: 8px;
|
|
315
|
+
padding: 14px;
|
|
316
|
+
font-size: 13px;
|
|
317
|
+
}}
|
|
318
|
+
</style>
|
|
319
|
+
</head>
|
|
320
|
+
<body>
|
|
321
|
+
<main>
|
|
322
|
+
<header>
|
|
323
|
+
<h1>ContextTrace Regression Report</h1>
|
|
324
|
+
<p class="muted">Local diff of two claim-level evidence verification runs.</p>
|
|
325
|
+
<div class="banner {verdict_class}">
|
|
326
|
+
<strong>Regression: {regression}</strong>
|
|
327
|
+
<span class="muted"> | mode {mode}</span>
|
|
328
|
+
</div>
|
|
329
|
+
</header>
|
|
330
|
+
|
|
331
|
+
<section>
|
|
332
|
+
<h2>Regression Summary</h2>
|
|
333
|
+
<div class="summary">{summary_cards}</div>
|
|
334
|
+
</section>
|
|
335
|
+
|
|
336
|
+
<section>
|
|
337
|
+
<h2>Claim Changes</h2>
|
|
338
|
+
<table>
|
|
339
|
+
<thead>
|
|
340
|
+
<tr>
|
|
341
|
+
<th>Status</th>
|
|
342
|
+
<th>Claim</th>
|
|
343
|
+
<th>Before Verdict</th>
|
|
344
|
+
<th>After Verdict</th>
|
|
345
|
+
<th>Before Root Cause</th>
|
|
346
|
+
<th>After Root Cause</th>
|
|
347
|
+
<th>Suggested Fix</th>
|
|
348
|
+
</tr>
|
|
349
|
+
</thead>
|
|
350
|
+
<tbody>{change_rows}</tbody>
|
|
351
|
+
</table>
|
|
352
|
+
</section>
|
|
353
|
+
|
|
354
|
+
<section>
|
|
355
|
+
<h2>New Failures</h2>
|
|
356
|
+
{new_failures}
|
|
357
|
+
</section>
|
|
358
|
+
|
|
359
|
+
<section>
|
|
360
|
+
<h2>Resolved Failures</h2>
|
|
361
|
+
{resolved_failures}
|
|
362
|
+
</section>
|
|
363
|
+
|
|
364
|
+
<section>
|
|
365
|
+
<h2>Root Cause Changes</h2>
|
|
366
|
+
{root_changes}
|
|
367
|
+
</section>
|
|
368
|
+
|
|
369
|
+
<section>
|
|
370
|
+
<h2>Baseline Summary</h2>
|
|
371
|
+
<div class="summary">{baseline_summary}</div>
|
|
372
|
+
</section>
|
|
373
|
+
|
|
374
|
+
<section>
|
|
375
|
+
<h2>Current Summary</h2>
|
|
376
|
+
<div class="summary">{current_summary}</div>
|
|
377
|
+
</section>
|
|
378
|
+
|
|
379
|
+
<section>
|
|
380
|
+
<h2>Raw JSON Summary</h2>
|
|
381
|
+
<pre>{raw_json}</pre>
|
|
382
|
+
</section>
|
|
383
|
+
</main>
|
|
384
|
+
</body>
|
|
385
|
+
</html>
|
|
386
|
+
"""
|
|
@@ -33,6 +33,8 @@ contexttrace/verify/abstention.py
|
|
|
33
33
|
contexttrace/verify/benchmark.py
|
|
34
34
|
contexttrace/verify/citations.py
|
|
35
35
|
contexttrace/verify/claims.py
|
|
36
|
+
contexttrace/verify/compare.py
|
|
37
|
+
contexttrace/verify/compare_report.py
|
|
36
38
|
contexttrace/verify/demos.py
|
|
37
39
|
contexttrace/verify/evidence.py
|
|
38
40
|
contexttrace/verify/external_benchmark_cases.json
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "contexttrace"
|
|
7
|
-
version = "0.3.0"
|
|
7
|
+
version = "0.4.0"
|
|
8
8
|
description = "Local-first SDK and CLI for RAG and agent reliability tracing, citation checks, and failure diagnosis."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "0.3.0"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|