contexttrace 0.3.0__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. {contexttrace-0.3.0 → contexttrace-0.4.0}/PKG-INFO +7 -2
  2. {contexttrace-0.3.0 → contexttrace-0.4.0}/README.md +6 -1
  3. contexttrace-0.4.0/contexttrace/_version.py +1 -0
  4. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/cli.py +64 -0
  5. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/verify/__init__.py +4 -0
  6. contexttrace-0.4.0/contexttrace/verify/compare.py +445 -0
  7. contexttrace-0.4.0/contexttrace/verify/compare_report.py +386 -0
  8. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace.egg-info/SOURCES.txt +2 -0
  9. {contexttrace-0.3.0 → contexttrace-0.4.0}/pyproject.toml +1 -1
  10. contexttrace-0.3.0/contexttrace/_version.py +0 -1
  11. {contexttrace-0.3.0 → contexttrace-0.4.0}/MANIFEST.in +0 -0
  12. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/__init__.py +0 -0
  13. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/client.py +0 -0
  14. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/config.py +0 -0
  15. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/demo.py +0 -0
  16. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/demo_data.py +0 -0
  17. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/endpoint_eval.py +0 -0
  18. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/errors.py +0 -0
  19. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/evaluator.py +0 -0
  20. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/integrations/__init__.py +0 -0
  21. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/integrations/fastapi.py +0 -0
  22. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/integrations/langchain.py +0 -0
  23. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/integrations/langgraph.py +0 -0
  24. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/integrations/llamaindex.py +0 -0
  25. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/integrations/opentelemetry.py +0 -0
  26. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/local.py +0 -0
  27. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/py.typed +0 -0
  28. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/regression.py +0 -0
  29. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/reliability.py +0 -0
  30. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/report.py +0 -0
  31. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/storage/__init__.py +0 -0
  32. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/storage/sqlite_store.py +0 -0
  33. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/thresholds.py +0 -0
  34. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/transport.py +0 -0
  35. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/verify/abstention.py +0 -0
  36. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/verify/benchmark.py +0 -0
  37. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/verify/citations.py +0 -0
  38. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/verify/claims.py +0 -0
  39. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/verify/demos.py +0 -0
  40. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/verify/evidence.py +0 -0
  41. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/verify/external_benchmark_cases.json +0 -0
  42. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/verify/facts.py +0 -0
  43. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/verify/real_benchmark_cases.json +0 -0
  44. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/verify/report.py +0 -0
  45. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/verify/root_cause.py +0 -0
  46. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/verify/runner.py +0 -0
  47. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/verify/schema.py +0 -0
  48. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/verify/spans.py +0 -0
  49. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/verify/verdicts.py +0 -0
  50. {contexttrace-0.3.0 → contexttrace-0.4.0}/contexttrace/viewer.py +0 -0
  51. {contexttrace-0.3.0 → contexttrace-0.4.0}/setup.cfg +0 -0
  52. {contexttrace-0.3.0 → contexttrace-0.4.0}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: contexttrace
3
- Version: 0.3.0
3
+ Version: 0.4.0
4
4
  Summary: Local-first SDK and CLI for RAG and agent reliability tracing, citation checks, and failure diagnosis.
5
5
  Author: ContextTrace contributors
6
6
  License: MIT
@@ -147,6 +147,9 @@ contexttrace verify trace.json --fail-on unsupported --fail-on citation_mismatch
147
147
  contexttrace verify-benchmark --mode semantic
148
148
  contexttrace verify-benchmark --mode semantic --report
149
149
  contexttrace verify-benchmark --case-set external --mode semantic --report
150
+ contexttrace compare baseline.json current.json
151
+ contexttrace compare baseline.json current.json --report
152
+ contexttrace compare baseline.json current.json --fail-on new_failure
150
153
  ```
151
154
 
152
155
  Input requires `query`, `answer`, and `contexts` with `id` and `text`. Optional `citations` are checked to catch cited sources that do not actually support the matched claim.
@@ -159,7 +162,9 @@ Verification output includes evidence span offsets, stable span hashes, multiple
159
162
 
160
163
  ContextTrace verifies whether each generated claim is actually supported by retrieved evidence. Instead of only showing a trace or a score, it tells you where the evidence chain broke: unsupported claim, citation mismatch, retrieval miss, answer overreach, conflicting context, or should-have-abstained.
161
164
 
162
- The v0.3.0 verifier uses local lexical heuristics by default. Claim extraction is rule-based, contradiction detection is conservative, and semantic or LLM-judge support can be added later.
165
+ Use `contexttrace compare baseline.json current.json` to diff two portable traces or saved `verify --json` outputs. It reports support-rate deltas, new unsupported claims, citation regressions, should-abstain flips, and new root causes, with `--fail-on` gates for CI.
166
+
167
+ The v0.4.0 verifier uses local lexical heuristics by default. Claim extraction is rule-based, contradiction detection is conservative, and semantic or LLM-judge support can be added later.
163
168
 
164
169
  ## What It Catches
165
170
 
@@ -90,6 +90,9 @@ contexttrace verify trace.json --fail-on unsupported --fail-on citation_mismatch
90
90
  contexttrace verify-benchmark --mode semantic
91
91
  contexttrace verify-benchmark --mode semantic --report
92
92
  contexttrace verify-benchmark --case-set external --mode semantic --report
93
+ contexttrace compare baseline.json current.json
94
+ contexttrace compare baseline.json current.json --report
95
+ contexttrace compare baseline.json current.json --fail-on new_failure
93
96
  ```
94
97
 
95
98
  Input requires `query`, `answer`, and `contexts` with `id` and `text`. Optional `citations` are checked to catch cited sources that do not actually support the matched claim.
@@ -102,7 +105,9 @@ Verification output includes evidence span offsets, stable span hashes, multiple
102
105
 
103
106
  ContextTrace verifies whether each generated claim is actually supported by retrieved evidence. Instead of only showing a trace or a score, it tells you where the evidence chain broke: unsupported claim, citation mismatch, retrieval miss, answer overreach, conflicting context, or should-have-abstained.
104
107
 
105
- The v0.3.0 verifier uses local lexical heuristics by default. Claim extraction is rule-based, contradiction detection is conservative, and semantic or LLM-judge support can be added later.
108
+ Use `contexttrace compare baseline.json current.json` to diff two portable traces or saved `verify --json` outputs. It reports support-rate deltas, new unsupported claims, citation regressions, should-abstain flips, and new root causes, with `--fail-on` gates for CI.
109
+
110
+ The v0.4.0 verifier uses local lexical heuristics by default. Claim extraction is rule-based, contradiction detection is conservative, and semantic or LLM-judge support can be added later.
106
111
 
107
112
  ## What It Catches
108
113
 
@@ -0,0 +1 @@
1
+ __version__ = "0.4.0"
@@ -24,12 +24,15 @@ from contexttrace.storage import SQLiteTraceStore
24
24
  from contexttrace.thresholds import parse_thresholds, threshold_failures
25
25
  from contexttrace.verify import (
26
26
  VerificationInputError,
27
+ compare_failures,
28
+ compare_trace_files,
27
29
  list_verify_demos,
28
30
  load_trace_file,
29
31
  load_verify_demo,
30
32
  verify_trace,
31
33
  )
32
34
  from contexttrace.verify.benchmark import run_verify_benchmark, write_verify_benchmark_report
35
+ from contexttrace.verify.compare_report import CompareReportGenerator
33
36
  from contexttrace.verify.report import VerifyReportGenerator
34
37
  from contexttrace.viewer import serve_viewer
35
38
 
@@ -340,6 +343,67 @@ def verify_benchmark_command(mode: str, case_set: str, json_output: bool, report
340
343
  return 0
341
344
 
342
345
 
346
@cli.command("compare")
@click.argument("baseline_json")
@click.argument("current_json")
@click.option("--json", "json_output", is_flag=True, help="Print the full comparison result as JSON.")
@click.option("--report", is_flag=True, help="Generate a local HTML regression report.")
@click.option("--out", default=None, help="HTML report path. Implies --report when provided.")
@click.option("--mode", default="lexical", show_default=True, type=click.Choice(["lexical", "semantic"]), help="Evidence scoring mode for raw trace inputs.")
@click.option("--fail-on", multiple=True, help="Fail on new_failure, new_unsupported, new_citation_mismatch, should_abstain_flip, support_rate_drop, new_root_cause, or any_regression.")
def compare_command(
    baseline_json: str,
    current_json: str,
    json_output: bool,
    report: bool,
    out: Optional[str],
    mode: str,
    fail_on: tuple[str, ...],
) -> int:
    """Compare two portable RAG traces or verification JSON outputs."""

    # Input problems are surfaced as a click error rather than a traceback.
    try:
        result = compare_trace_files(baseline_json, current_json, mode=mode)
    except VerificationInputError as exc:
        raise click.ClickException(str(exc)) from exc

    # An explicit --out path implies --report; otherwise the report lands in
    # .contexttrace/reports/ under a name derived from both input file stems.
    report_location = None
    if out or report:
        if out:
            destination = out
        else:
            fallback_name = "%s_vs_%s_compare.html" % (Path(baseline_json).stem, Path(current_json).stem)
            destination = str(Path(".contexttrace") / "reports" / fallback_name)
        report_location = CompareReportGenerator().generate(result, path=destination)

    fail_messages = compare_failures(result, fail_on)

    if json_output:
        # Keep stdout pure JSON: the report path goes to stderr in this mode.
        if report_location:
            click.echo("Report: %s" % report_location, err=True)
        click.echo(json.dumps(result, indent=2))
    else:
        summary = result["summary"]
        click.echo("Regression: %s" % str(summary["regression"]).lower())
        rate_before = float(summary.get("support_rate_before") or 0.0)
        rate_after = float(summary.get("support_rate_after") or 0.0)
        rate_delta = float(summary.get("support_rate_delta") or 0.0)
        click.echo("Support rate: %.3f -> %.3f (%+.3f)" % (rate_before, rate_after, rate_delta))
        click.echo("Unsupported claim rate delta: %+.3f" % float(summary.get("unsupported_claim_rate_delta") or 0.0))
        click.echo("Citation mismatch delta: %+d" % int(summary.get("citation_mismatch_delta") or 0))
        count_lines = (
            ("New failures: %s", "new_failures"),
            ("Resolved failures: %s", "resolved_failures"),
            ("Added claims: %s", "added_claims"),
            ("Removed claims: %s", "removed_claims"),
            ("Changed claims: %s", "changed_claims"),
        )
        for template, key in count_lines:
            click.echo(template % summary[key])
        root_causes = ", ".join(summary.get("new_root_causes") or [])
        click.echo("New root causes: %s" % (root_causes or "none"))
        if report_location:
            click.echo("Report: %s" % report_location)

    # Gate messages always go to stderr, after the main output of either mode.
    for message in fail_messages:
        click.echo("Comparison failed: %s" % message, err=True)
    return 1 if fail_messages else 0
405
+
406
+
343
407
  def _write_verify_report(
344
408
  result: dict,
345
409
  trace: object,
@@ -1,4 +1,5 @@
1
1
  from contexttrace.verify.runner import verify_trace, verify_trace_file
2
+ from contexttrace.verify.compare import compare_failures, compare_trace_files, compare_verifications
2
3
  from contexttrace.verify.schema import (
3
4
  RAGTrace,
4
5
  TraceCitation,
@@ -13,6 +14,9 @@ __all__ = [
13
14
  "TraceCitation",
14
15
  "TraceContext",
15
16
  "VerificationInputError",
17
+ "compare_failures",
18
+ "compare_trace_files",
19
+ "compare_verifications",
16
20
  "list_verify_demos",
17
21
  "load_trace_file",
18
22
  "load_verify_demo",
@@ -0,0 +1,445 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+ from contexttrace.verify.evidence import lexical_score
8
+ from contexttrace.verify.runner import verify_trace
9
+ from contexttrace.verify.schema import VerificationInputError, load_trace
10
+
11
+
12
+ FAILURE_VERDICTS = {"partially_supported", "unsupported", "contradicted", "unverifiable"}
13
+ BAD_CITATIONS = {
14
+ "cited_source_missing",
15
+ "cited_source_does_not_support_claim",
16
+ "claim_supported_by_different_source",
17
+ }
18
+ NO_ROOT_CAUSE = "no_failure_detected"
19
+ MATCH_THRESHOLD = 0.58
20
+
21
+
22
def compare_trace_files(
    baseline_path: str | Path,
    current_path: str | Path,
    *,
    mode: str = "lexical",
) -> dict[str, Any]:
    """Load two compare inputs from disk and return their comparison.

    Both paths are loaded with ``load_compare_input`` (baseline first), and the
    two loaded results are handed to ``compare_verifications`` with the same
    ``mode``.
    """
    baseline_result, current_result = [
        load_compare_input(source, mode=mode)
        for source in (baseline_path, current_path)
    ]
    return compare_verifications(baseline_result, current_result, mode=mode)
31
+
32
+
33
def load_compare_input(path: str | Path, *, mode: str = "lexical") -> dict[str, Any]:
    """Load one side of a comparison from a JSON file.

    The file may hold either a saved verification result (detected by
    ``_looks_like_verification_result``) or a raw trace. A raw trace is parsed
    with ``load_trace`` and verified with ``verify_trace`` using ``mode``.
    In both cases the returned result's ``metadata`` records the input path
    and the detected input type.

    Raises:
        VerificationInputError: if the file cannot be read, is not valid
            UTF-8, or does not contain valid JSON. (``load_trace`` may raise
            its own errors for malformed traces; not visible from here.)
    """
    input_path = Path(path)

    try:
        raw_text = input_path.read_text(encoding="utf-8")
    except OSError as exc:
        raise VerificationInputError("Could not read compare input %s: %s" % (input_path, exc)) from exc
    except UnicodeDecodeError as exc:
        # A non-UTF-8 file is neither an OSError nor a JSONDecodeError; without
        # this branch it would escape as a raw traceback instead of a clean
        # input error that callers already know how to report.
        raise VerificationInputError(
            "Could not decode compare input %s as UTF-8: %s" % (input_path, exc)
        ) from exc

    try:
        payload = json.loads(raw_text)
    except json.JSONDecodeError as exc:
        raise VerificationInputError(
            "Invalid JSON in %s at line %s column %s: %s"
            % (input_path, exc.lineno, exc.colno, exc.msg)
        ) from exc

    if _looks_like_verification_result(payload):
        return _normalize_verified_result(payload, source=str(input_path))

    trace = load_trace(payload, source=str(input_path))
    result = verify_trace(trace, mode=mode)
    # Build a fresh metadata dict so any existing metadata object is not mutated.
    result["metadata"] = {
        **dict(result.get("metadata") or {}),
        "compare_input": str(input_path),
        "compare_input_type": "raw_trace",
    }
    return result
57
+
58
+
59
def compare_verifications(
    baseline: dict[str, Any],
    current: dict[str, Any],
    *,
    mode: str = "lexical",
) -> dict[str, Any]:
    """Diff two verification results claim by claim.

    Claims are paired with ``_match_claims``. Paired claims that differ yield a
    change from ``_matched_change``; unpaired current claims are reported as
    added and unpaired baseline claims as removed. The returned dict carries
    the ``mode``, a ``summary``, the sorted ``changes`` and a snapshot of each
    run.
    """
    before_claims = list(baseline.get("claims") or [])
    after_claims = list(current.get("claims") or [])
    pairs = _match_claims(before_claims, after_claims, mode=mode)

    paired_before = {before_index for before_index, _, _ in pairs}
    paired_after = {after_index for _, after_index, _ in pairs}

    # Matched pairs first; identical pairs are dropped from the change list.
    changes = []
    for before_index, after_index, similarity in pairs:
        entry = _matched_change(
            before_claims[before_index],
            after_claims[after_index],
            match_score=similarity,
        )
        if entry["status"] != "unchanged":
            changes.append(entry)

    # Claims that only exist in the current run.
    changes.extend(
        _single_change("added_failure" if _is_failure(claim) else "added_claim", after=claim)
        for position, claim in enumerate(after_claims)
        if position not in paired_after
    )

    # Claims that only exist in the baseline run.
    changes.extend(
        _single_change("removed_failure" if _is_failure(claim) else "removed_claim", before=claim)
        for position, claim in enumerate(before_claims)
        if position not in paired_before
    )

    changes.sort(key=_change_sort_key)
    summary = _summary(baseline, current, changes)
    return {
        "mode": mode,
        "summary": summary,
        "changes": changes,
        "baseline": _run_snapshot(baseline),
        "current": _run_snapshot(current),
    }
102
+
103
+
104
def compare_failures(result: dict[str, Any], fail_on: tuple[str, ...]) -> list[str]:
    """Evaluate ``--fail-on`` rules against a comparison summary.

    Rule names are trimmed, lower-cased and have ``-`` replaced by ``_`` before
    lookup. Returns one message per rule that fired, in the order the rules
    were given; an unrecognised rule also yields a message (quoting the rule
    as originally written). An empty ``fail_on`` returns an empty list.
    """
    if not fail_on:
        return []

    summary = result.get("summary") or {}

    def _positive(key: str) -> bool:
        return int(summary.get(key) or 0) > 0

    def _root_cause_added() -> bool:
        return bool(summary.get("new_root_causes"))

    # Each rule maps to (lazy predicate, message). The predicate only runs for
    # a rule that was actually requested.
    rules = {
        "new_failure": (lambda: _positive("new_failures"), "new verification failure detected"),
        "new_unsupported": (lambda: _positive("new_unsupported"), "new unsupported claim detected"),
        "new_citation_mismatch": (lambda: _positive("new_citation_mismatches"), "new citation mismatch detected"),
        "should_abstain_flip": (
            lambda: bool(summary.get("should_abstain_regressed")),
            "should-abstain changed from false to true",
        ),
        "support_rate_drop": (
            lambda: float(summary.get("support_rate_delta") or 0.0) < 0,
            "support rate dropped",
        ),
        "new_root_cause": (_root_cause_added, "new root cause detected"),
        "root_cause_regression": (_root_cause_added, "new root cause detected"),
        "any_regression": (lambda: bool(summary.get("regression")), "verification regression detected"),
    }

    messages = []
    for raw_rule in fail_on:
        normalized = raw_rule.strip().lower().replace("-", "_")
        entry = rules.get(normalized)
        if entry is None:
            messages.append("unknown --fail-on rule %s" % raw_rule)
            continue
        triggered, text = entry
        if triggered():
            messages.append(text)
    return messages
137
+
138
+
139
+ def _looks_like_verification_result(payload: Any) -> bool:
140
+ return (
141
+ isinstance(payload, dict)
142
+ and isinstance(payload.get("summary"), dict)
143
+ and isinstance(payload.get("claims"), list)
144
+ )
145
+
146
+
147
+ def _normalize_verified_result(payload: dict[str, Any], *, source: str) -> dict[str, Any]:
148
+ result = dict(payload)
149
+ result.setdefault("metadata", {})
150
+ result["metadata"] = {
151
+ **dict(result.get("metadata") or {}),
152
+ "compare_input": source,
153
+ "compare_input_type": "verification_result",
154
+ }
155
+ return result
156
+
157
+
158
def _match_claims(
    baseline_claims: list[dict[str, Any]],
    current_claims: list[dict[str, Any]],
    *,
    mode: str,
) -> list[tuple[int, int, float]]:
    """Greedily pair baseline claims with current claims by text similarity.

    Every baseline/current pair is scored with ``_claim_similarity``; pairs
    scoring below ``MATCH_THRESHOLD`` are discarded. The remaining candidates
    are taken best-first, and each claim is used in at most one pair.

    Returns ``(baseline_index, current_index, score)`` tuples.
    """
    scored_pairs = []
    for before_pos, before_claim in enumerate(baseline_claims):
        before_text = str(before_claim.get("claim") or "")
        for after_pos, after_claim in enumerate(current_claims):
            similarity = _claim_similarity(
                before_text,
                str(after_claim.get("claim") or ""),
                mode=mode,
            )
            if similarity >= MATCH_THRESHOLD:
                scored_pairs.append((similarity, before_pos, after_pos))

    # Highest score first; equal scores fall back to descending index order.
    scored_pairs.sort(reverse=True)

    taken_before = set()
    taken_after = set()
    pairs = []
    for similarity, before_pos, after_pos in scored_pairs:
        if before_pos not in taken_before and after_pos not in taken_after:
            taken_before.add(before_pos)
            taken_after.add(after_pos)
            pairs.append((before_pos, after_pos, similarity))
    return pairs
185
+
186
+
187
def _claim_similarity(left: str, right: str, *, mode: str) -> float:
    """Score how alike two claim texts are.

    Texts that are equal after ``_normalize_text`` score 1.0. Otherwise
    ``lexical_score`` is evaluated in both directions and the larger of the
    two scores is returned.
    """
    texts_identical = _normalize_text(left) == _normalize_text(right)
    if texts_identical:
        return 1.0
    left_to_right, _ = lexical_score(left, right, mode=mode)
    right_to_left, _ = lexical_score(right, left, mode=mode)
    return max(left_to_right, right_to_left)
193
+
194
+
195
def _matched_change(
    before_claim: dict[str, Any],
    after_claim: dict[str, Any],
    *,
    match_score: float,
) -> dict[str, Any]:
    """Classify how a matched claim changed between baseline and current.

    Checks are applied in priority order and the first that applies decides
    the ``status``: failure flip, verdict severity, citation severity, root
    cause label, best context id, then claim wording. ``"unchanged"`` is used
    when none apply.
    """
    was_failing = _is_failure(before_claim)
    is_failing = _is_failure(after_claim)
    verdict_shift = _severity(after_claim) - _severity(before_claim)
    citation_shift = _citation_severity(after_claim) - _citation_severity(before_claim)
    old_root = _root_label(before_claim)
    new_root = _root_label(after_claim)

    if is_failing and not was_failing:
        status = "new_failure"
    elif was_failing and not is_failing:
        status = "resolved_failure"
    elif verdict_shift > 0:
        status = "verdict_regressed"
    elif verdict_shift < 0:
        status = "verdict_improved"
    elif citation_shift > 0:
        status = "citation_regressed"
    elif citation_shift < 0:
        status = "citation_improved"
    elif old_root != new_root:
        # A different, real root cause is a regression; the label disappearing
        # is only recorded as a change.
        status = "root_cause_regressed" if new_root != NO_ROOT_CAUSE else "root_cause_changed"
    elif _context_id(before_claim) != _context_id(after_claim):
        status = "source_changed"
    else:
        old_text = _normalize_text(str(before_claim.get("claim") or ""))
        new_text = _normalize_text(str(after_claim.get("claim") or ""))
        status = "claim_changed" if old_text != new_text else "unchanged"

    return {
        "status": status,
        "claim": str(after_claim.get("claim") or before_claim.get("claim") or ""),
        "match_score": round(match_score, 3),
        "before": _claim_snapshot(before_claim),
        "after": _claim_snapshot(after_claim),
        "suggested_fix": _suggested_fix(after_claim, status=status),
    }
241
+
242
+
243
def _single_change(
    status: str,
    *,
    before: dict[str, Any] | None = None,
    after: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Build a change entry for a claim present on only one side.

    The entry describes ``after`` when given, otherwise ``before``. The side
    that is missing (or empty) is reported as ``None`` and ``match_score`` is
    always ``None``.
    """
    subject = after or before or {}
    before_view = _claim_snapshot(before) if before else None
    after_view = _claim_snapshot(after) if after else None
    return {
        "status": status,
        "claim": str(subject.get("claim") or ""),
        "match_score": None,
        "before": before_view,
        "after": after_view,
        "suggested_fix": _suggested_fix(subject, status=status),
    }
258
+
259
+
260
def _summary(
    baseline: dict[str, Any],
    current: dict[str, Any],
    changes: list[dict[str, Any]],
) -> dict[str, Any]:
    """Aggregate claim-level changes and run summaries into comparison totals.

    Reads ``support_rate``, ``unsupported_claim_rate``, ``citation_mismatches``
    and ``should_abstain`` from each run's ``summary`` (plus the ``abstention``
    section for the abstain flag) and counts the ``changes`` by status.
    ``regression`` is true when any change counts as a new failure, the
    support rate fell, the unsupported rate or citation mismatches rose, or
    should-abstain flipped from false to true.
    """
    regressed_statuses = {
        "new_failure",
        "added_failure",
        "verdict_regressed",
        "citation_regressed",
        "root_cause_regressed",
    }
    improved_statuses = {"resolved_failure", "removed_failure", "verdict_improved", "citation_improved"}

    before_stats = dict(baseline.get("summary") or {})
    after_stats = dict(current.get("summary") or {})

    # The two status sets are disjoint, so one pass can split the changes.
    regressions = []
    improvements = []
    for change in changes:
        status = change["status"]
        if status in regressed_statuses:
            regressions.append(change)
        elif status in improved_statuses:
            improvements.append(change)

    unsupported_total = sum(
        1
        for change in regressions
        if (change.get("after") or {}).get("verdict") in {"unsupported", "contradicted"}
    )
    citation_total = sum(
        1
        for change in regressions
        if _citation_status_from_snapshot(change.get("after")) in BAD_CITATIONS
    )

    def _abstains(run: dict[str, Any], stats: dict[str, Any]) -> bool:
        # The flag may live in the abstention section or in the summary.
        return bool((run.get("abstention") or {}).get("should_abstain") or stats.get("should_abstain"))

    abstained_before = _abstains(baseline, before_stats)
    abstained_after = _abstains(current, after_stats)
    abstain_regressed = abstained_after and not abstained_before

    support_delta = _delta(after_stats.get("support_rate"), before_stats.get("support_rate"))
    unsupported_delta = _delta(
        after_stats.get("unsupported_claim_rate"),
        before_stats.get("unsupported_claim_rate"),
    )
    citation_delta = int(after_stats.get("citation_mismatches") or 0) - int(
        before_stats.get("citation_mismatches") or 0
    )

    introduced_roots = {_root_from_snapshot(change.get("after")) for change in regressions}
    introduced_roots.discard(NO_ROOT_CAUSE)
    cleared_roots = {_root_from_snapshot(change.get("before")) for change in improvements}
    cleared_roots.discard(NO_ROOT_CAUSE)

    regression = (
        bool(regressions)
        or support_delta < 0
        or unsupported_delta > 0
        or citation_delta > 0
        or abstain_regressed
    )

    added_total = sum(1 for change in changes if change["status"] in {"added_claim", "added_failure"})
    removed_total = sum(1 for change in changes if change["status"] in {"removed_claim", "removed_failure"})

    return {
        "regression": regression,
        "improved": bool(improvements) and not regression,
        "support_rate_before": _number(before_stats.get("support_rate")),
        "support_rate_after": _number(after_stats.get("support_rate")),
        "support_rate_delta": support_delta,
        "unsupported_claim_rate_delta": unsupported_delta,
        "citation_mismatch_delta": citation_delta,
        "should_abstain_before": abstained_before,
        "should_abstain_after": abstained_after,
        "should_abstain_changed": abstained_before != abstained_after,
        "should_abstain_regressed": abstain_regressed,
        "new_failures": len(regressions),
        "resolved_failures": len(improvements),
        "new_unsupported": unsupported_total,
        "new_citation_mismatches": citation_total,
        "added_claims": added_total,
        "removed_claims": removed_total,
        "changed_claims": len(changes),
        "new_root_causes": sorted(introduced_roots),
        "resolved_root_causes": sorted(cleared_roots),
    }
327
+
328
+
329
+ def _run_snapshot(result: dict[str, Any]) -> dict[str, Any]:
330
+ return {
331
+ "query": result.get("query"),
332
+ "answer": result.get("answer"),
333
+ "summary": result.get("summary") or {},
334
+ "abstention": result.get("abstention") or {},
335
+ "metadata": result.get("metadata") or {},
336
+ }
337
+
338
+
339
+ def _claim_snapshot(claim: dict[str, Any] | None) -> dict[str, Any] | None:
340
+ if claim is None:
341
+ return None
342
+ root = claim.get("root_cause") or {}
343
+ return {
344
+ "claim_id": claim.get("claim_id"),
345
+ "claim": claim.get("claim"),
346
+ "verdict": claim.get("verdict"),
347
+ "confidence": claim.get("confidence"),
348
+ "best_context_id": claim.get("best_context_id"),
349
+ "citation_status": claim.get("citation_status"),
350
+ "root_cause": root.get("label") if isinstance(root, dict) else None,
351
+ "missing_fact": root.get("missing_fact") if isinstance(root, dict) else None,
352
+ "closest_evidence": root.get("closest_evidence") if isinstance(root, dict) else claim.get("evidence"),
353
+ "suggested_fix": root.get("suggested_fix") if isinstance(root, dict) else None,
354
+ }
355
+
356
+
357
def _is_failure(claim: dict[str, Any]) -> bool:
    """Return True when a claim fails on verdict, citation status or root cause.

    A claim fails if its verdict is in ``FAILURE_VERDICTS``, its citation
    status is in ``BAD_CITATIONS``, or its root-cause label is anything other
    than ``NO_ROOT_CAUSE``.
    """
    if str(claim.get("verdict") or "") in FAILURE_VERDICTS:
        return True
    if str(claim.get("citation_status") or "") in BAD_CITATIONS:
        return True
    return _root_label(claim) != NO_ROOT_CAUSE
363
+
364
+
365
+ def _severity(claim: dict[str, Any]) -> int:
366
+ verdict = str(claim.get("verdict") or "")
367
+ if verdict in {"unsupported", "contradicted"}:
368
+ return 3
369
+ if verdict in {"partially_supported", "unverifiable"}:
370
+ return 2
371
+ return 0
372
+
373
+
374
def _citation_severity(claim: dict[str, Any]) -> int:
    """Return 1 when the claim's citation status is in ``BAD_CITATIONS``, else 0."""
    citation_status = str(claim.get("citation_status") or "")
    if citation_status in BAD_CITATIONS:
        return 1
    return 0
376
+
377
+
378
def _root_label(claim: dict[str, Any]) -> str:
    """Return the claim's root-cause label, or ``NO_ROOT_CAUSE`` when none is set.

    ``NO_ROOT_CAUSE`` is also returned when ``root_cause`` is not a dict or
    its ``label`` is missing or falsy.
    """
    root = claim.get("root_cause") or {}
    if not isinstance(root, dict):
        return NO_ROOT_CAUSE
    label = root.get("label")
    return str(label or NO_ROOT_CAUSE)
383
+
384
+
385
+ def _context_id(claim: dict[str, Any]) -> str:
386
+ return str(claim.get("best_context_id") or "")
387
+
388
+
389
def _root_from_snapshot(snapshot: dict[str, Any] | None) -> str:
    """Return the snapshot's ``root_cause`` as a string, or ``NO_ROOT_CAUSE``.

    ``NO_ROOT_CAUSE`` is returned for a missing/empty snapshot or a falsy
    ``root_cause`` value.
    """
    root_cause = snapshot.get("root_cause") if snapshot else None
    return str(root_cause or NO_ROOT_CAUSE)
393
+
394
+
395
+ def _citation_status_from_snapshot(snapshot: dict[str, Any] | None) -> str:
396
+ if not snapshot:
397
+ return ""
398
+ return str(snapshot.get("citation_status") or "")
399
+
400
+
401
+ def _suggested_fix(claim: dict[str, Any], *, status: str) -> str:
402
+ root = claim.get("root_cause") or {}
403
+ if isinstance(root, dict) and root.get("suggested_fix"):
404
+ return str(root["suggested_fix"])
405
+ if status in {"added_failure", "new_failure", "verdict_regressed"}:
406
+ return "Inspect the new claim and remove unsupported details or retrieve supporting evidence."
407
+ if status == "citation_regressed":
408
+ return "Regenerate claim-level citations and require cited source IDs to support the claim."
409
+ if status == "source_changed":
410
+ return "Check whether the new retrieved source is intentional and still supports the claim."
411
+ return "No automatic fix suggested."
412
+
413
+
414
+ def _change_sort_key(change: dict[str, Any]) -> tuple[int, str]:
415
+ priority = {
416
+ "added_failure": 0,
417
+ "new_failure": 1,
418
+ "verdict_regressed": 2,
419
+ "citation_regressed": 3,
420
+ "root_cause_regressed": 4,
421
+ "resolved_failure": 5,
422
+ "verdict_improved": 6,
423
+ "citation_improved": 7,
424
+ "removed_failure": 8,
425
+ "added_claim": 8,
426
+ "removed_claim": 9,
427
+ "source_changed": 10,
428
+ "claim_changed": 11,
429
+ }
430
+ return (priority.get(str(change.get("status")), 99), str(change.get("claim") or ""))
431
+
432
+
433
def _delta(current: Any, baseline: Any) -> float:
    """Return ``current - baseline`` rounded to three decimals, coercing both via ``_number``."""
    after_value = _number(current)
    before_value = _number(baseline)
    return round(after_value - before_value, 3)
435
+
436
+
437
+ def _number(value: Any) -> float:
438
+ try:
439
+ return round(float(value), 3)
440
+ except (TypeError, ValueError):
441
+ return 0.0
442
+
443
+
444
+ def _normalize_text(text: str) -> str:
445
+ return " ".join(str(text or "").lower().strip().strip(".!?").split())
@@ -0,0 +1,386 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from html import escape
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+
9
class CompareReportGenerator:
    """Render a run-comparison result dict as a standalone HTML report."""

    def generate(self, result: dict[str, Any], *, path: str) -> str:
        """Write the rendered report to ``path`` and return that path as a string.

        Missing parent directories are created and the file is written as UTF-8.
        """
        destination = Path(path)
        destination.parent.mkdir(parents=True, exist_ok=True)
        html_text = self.render(result)
        destination.write_text(html_text, encoding="utf-8")
        return str(destination)

    def render(self, result: dict[str, Any]) -> str:
        """Fill ``HTML_TEMPLATE`` from ``result`` and return the HTML text.

        Reads the ``summary``, ``changes``, ``mode``, ``baseline`` and
        ``current`` keys; missing or falsy values fall back to empty
        containers, and ``mode`` falls back to ``"lexical"``.
        """
        summary = result.get("summary") or {}
        changes = list(result.get("changes") or [])

        # Statuses listed under "New Failures" and "Resolved Failures".
        regressed_statuses = {
            "added_failure",
            "new_failure",
            "verdict_regressed",
            "citation_regressed",
            "root_cause_regressed",
        }
        improved_statuses = {
            "resolved_failure",
            "removed_failure",
            "verdict_improved",
            "citation_improved",
        }

        fields = {
            "verdict_class": "bad" if summary.get("regression") else "ok",
            "regression": escape(_string(summary.get("regression"))),
            "mode": escape(_string(result.get("mode") or "lexical")),
            "summary_cards": _summary_cards(summary),
            "change_rows": _change_rows(changes),
            "new_failures": _change_cards(
                changes,
                regressed_statuses,
                empty="No new claim-level verification failures were detected.",
            ),
            "resolved_failures": _change_cards(
                changes,
                improved_statuses,
                empty="No previously failing claims were resolved.",
            ),
            "root_changes": _root_changes(summary),
            "baseline_summary": _run_summary(result.get("baseline") or {}),
            "current_summary": _run_summary(result.get("current") or {}),
            "raw_json": escape(json.dumps(_raw_summary(result), indent=2)),
        }
        return HTML_TEMPLATE.format(**fields)
40
+
41
+
42
def _summary_cards(summary: dict[str, Any]) -> str:
    """Render the regression-summary metrics as HTML cards.

    Each metric becomes one ``<div class="card">``; labels and values are
    HTML-escaped and the cards are joined with newlines. Delta metrics are
    formatted through ``_signed``; count metrics default to ``0`` when absent.
    """
    card_template = """
        <div class="card">
          <div class="label">{label}</div>
          <div class="value">{value}</div>
        </div>
        """
    metrics = [
        ("Regression", summary.get("regression")),
        ("Support Rate Delta", _signed(summary.get("support_rate_delta"))),
        ("Unsupported Rate Delta", _signed(summary.get("unsupported_claim_rate_delta"))),
        ("Citation Mismatch Delta", _signed(summary.get("citation_mismatch_delta"))),
        ("New Failures", summary.get("new_failures", 0)),
        ("Resolved Failures", summary.get("resolved_failures", 0)),
        ("New Unsupported", summary.get("new_unsupported", 0)),
        ("New Citation Mismatches", summary.get("new_citation_mismatches", 0)),
        ("Added Claims", summary.get("added_claims", 0)),
        ("Removed Claims", summary.get("removed_claims", 0)),
        ("Should Abstain Before", summary.get("should_abstain_before")),
        ("Should Abstain After", summary.get("should_abstain_after")),
    ]
    rendered = []
    for title, raw in metrics:
        rendered.append(
            card_template.format(label=escape(title), value=escape(_string(raw)))
        )
    return "\n".join(rendered)
66
+
67
+
68
+ def _change_rows(changes: list[dict[str, Any]]) -> str:
69
+ if not changes:
70
+ return "<tr><td colspan=\"7\" class=\"muted\">No claim-level changes detected.</td></tr>"
71
+ rows = []
72
+ for change in changes:
73
+ before = change.get("before") or {}
74
+ after = change.get("after") or {}
75
+ rows.append(
76
+ """
77
+ <tr>
78
+ <td><span class="badge status-{status_class}">{status}</span></td>
79
+ <td>{claim}</td>
80
+ <td>{before_verdict}</td>
81
+ <td>{after_verdict}</td>
82
+ <td>{before_root}</td>
83
+ <td>{after_root}</td>
84
+ <td>{fix}</td>
85
+ </tr>
86
+ """.format(
87
+ status_class=escape(_css_token(change.get("status"))),
88
+ status=escape(_string(change.get("status"))),
89
+ claim=escape(_string(change.get("claim"))),
90
+ before_verdict=escape(_string(before.get("verdict") or "none")),
91
+ after_verdict=escape(_string(after.get("verdict") or "none")),
92
+ before_root=escape(_string(before.get("root_cause") or "none")),
93
+ after_root=escape(_string(after.get("root_cause") or "none")),
94
+ fix=escape(_string(change.get("suggested_fix"))),
95
+ )
96
+ )
97
+ return "\n".join(rows)
98
+
99
+
100
+ def _change_cards(changes: list[dict[str, Any]], statuses: set[str], *, empty: str) -> str:
101
+ selected = [change for change in changes if change.get("status") in statuses]
102
+ if not selected:
103
+ return "<p class=\"muted\">%s</p>" % escape(empty)
104
+ return "\n".join(_change_card(change) for change in selected)
105
+
106
+
107
def _change_card(change: dict[str, Any]) -> str:
    """Render one change as an ``<article class="item">`` detail card.

    Shows the before/after verdict, citation status and root cause, plus the
    best context id and closest evidence taken from the ``after`` snapshot
    (or from ``before`` when ``after`` is empty). Every value is HTML-escaped;
    missing values are shown as ``none`` and a missing match score as ``new``.
    """
    previous = change.get("before") or {}
    latest = change.get("after") or {}
    # Prefer the current snapshot for evidence details.
    shown = latest or previous

    score = change.get("match_score")
    if score is None:
        score = "new"

    card_template = """
        <article class="item">
          <div class="item-meta">{status} | match {match_score}</div>
          <h3>{claim}</h3>
          <p><strong>Before:</strong> {before_verdict} | {before_citation} | {before_root}</p>
          <p><strong>After:</strong> {after_verdict} | {after_citation} | {after_root}</p>
          <p><strong>Best context:</strong> {context_id}</p>
          <p><strong>Closest evidence:</strong> {evidence}</p>
          <p><strong>Suggested fix:</strong> {fix}</p>
        </article>
        """
    values = {
        "status": change.get("status"),
        "match_score": score,
        "claim": change.get("claim"),
        "before_verdict": previous.get("verdict") or "none",
        "before_citation": previous.get("citation_status") or "none",
        "before_root": previous.get("root_cause") or "none",
        "after_verdict": latest.get("verdict") or "none",
        "after_citation": latest.get("citation_status") or "none",
        "after_root": latest.get("root_cause") or "none",
        "context_id": shown.get("best_context_id") or "none",
        "evidence": shown.get("closest_evidence") or "none",
        "fix": change.get("suggested_fix"),
    }
    escaped = {name: escape(_string(raw)) for name, raw in values.items()}
    return card_template.format(**escaped)
135
+
136
+
137
+ def _root_changes(summary: dict[str, Any]) -> str:
138
+ new_roots = list(summary.get("new_root_causes") or [])
139
+ resolved_roots = list(summary.get("resolved_root_causes") or [])
140
+ if not new_roots and not resolved_roots:
141
+ return "<p class=\"muted\">No root-cause labels changed.</p>"
142
+ return """
143
+ <div class="grid-two">
144
+ <div class="item">
145
+ <div class="item-meta">New root causes</div>
146
+ <p>{new_roots}</p>
147
+ </div>
148
+ <div class="item">
149
+ <div class="item-meta">Resolved root causes</div>
150
+ <p>{resolved_roots}</p>
151
+ </div>
152
+ </div>
153
+ """.format(
154
+ new_roots=escape(", ".join(new_roots) or "none"),
155
+ resolved_roots=escape(", ".join(resolved_roots) or "none"),
156
+ )
157
+
158
+
159
def _run_summary(run: dict[str, Any]) -> str:
    """Render the headline fields of a single run as HTML cards.

    Values come from the run's ``query``, its ``summary`` dict and its
    ``metadata`` dict (each treated as empty when missing or falsy). Labels
    and values are HTML-escaped and the cards are joined with newlines.
    """
    stats = run.get("summary") or {}
    meta = run.get("metadata") or {}
    card_template = """
        <div class="card">
          <div class="label">{label}</div>
          <div class="small-value">{value}</div>
        </div>
        """
    fields = [
        ("Query", run.get("query")),
        ("Support Rate", stats.get("support_rate")),
        ("Unsupported Rate", stats.get("unsupported_claim_rate")),
        ("Citation Mismatches", stats.get("citation_mismatches")),
        ("Failure Type", stats.get("failure_type")),
        ("Primary Root Cause", stats.get("primary_root_cause")),
        ("Should Abstain", stats.get("should_abstain")),
        ("Input Type", meta.get("compare_input_type")),
    ]
    rendered = []
    for title, raw in fields:
        rendered.append(
            card_template.format(label=escape(title), value=escape(_string(raw)))
        )
    return "\n".join(rendered)
181
+
182
+
183
+ def _raw_summary(result: dict[str, Any]) -> dict[str, Any]:
184
+ return {
185
+ "mode": result.get("mode"),
186
+ "summary": result.get("summary"),
187
+ "changes": result.get("changes"),
188
+ "baseline": result.get("baseline"),
189
+ "current": result.get("current"),
190
+ }
191
+
192
+
193
+ def _signed(value: Any) -> str:
194
+ try:
195
+ number = float(value)
196
+ except (TypeError, ValueError):
197
+ return "0"
198
+ if number > 0:
199
+ return "+%s" % _string(round(number, 3))
200
+ return _string(round(number, 3))
201
+
202
+
203
def _css_token(value: Any) -> str:
    """Turn a status value into a CSS class suffix.

    The value is stringified via ``_string`` and lower-cased, underscores and
    spaces become hyphens, and every character that is neither alphanumeric
    nor a hyphen is dropped. An empty result becomes ``"unknown"``.
    """
    hyphenated = _string(value).lower().replace("_", "-").replace(" ", "-")
    kept = [ch for ch in hyphenated if ch == "-" or ch.isalnum()]
    return "".join(kept) or "unknown"
206
+
207
+
208
+ def _string(value: Any) -> str:
209
+ if value is None:
210
+ return ""
211
+ return str(value)
212
+
213
+
214
+ HTML_TEMPLATE = """<!doctype html>
215
+ <html lang="en">
216
+ <head>
217
+ <meta charset="utf-8">
218
+ <meta name="viewport" content="width=device-width, initial-scale=1">
219
+ <title>ContextTrace Regression Report</title>
220
+ <style>
221
+ :root {{
222
+ color-scheme: light;
223
+ --bg: #f7f8fa;
224
+ --panel: #ffffff;
225
+ --subtle: #fbfcfe;
226
+ --text: #202832;
227
+ --muted: #657286;
228
+ --line: #d9e0ea;
229
+ --ok: #176f44;
230
+ --warn: #946200;
231
+ --bad: #b42318;
232
+ --accent: #2458d3;
233
+ }}
234
+ * {{ box-sizing: border-box; }}
235
+ body {{
236
+ margin: 0;
237
+ background: var(--bg);
238
+ color: var(--text);
239
+ font-family: Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
240
+ line-height: 1.5;
241
+ }}
242
+ main {{ max-width: 1160px; margin: 0 auto; padding: 32px 20px 56px; }}
243
+ header {{ border-bottom: 1px solid var(--line); margin-bottom: 22px; padding-bottom: 18px; }}
244
+ h1, h2, h3 {{ margin: 0; }}
245
+ h1 {{ font-size: 30px; }}
246
+ h2 {{ font-size: 18px; margin-bottom: 12px; }}
247
+ h3 {{ font-size: 15px; margin-bottom: 8px; }}
248
+ section {{
249
+ background: var(--panel);
250
+ border: 1px solid var(--line);
251
+ border-radius: 8px;
252
+ margin: 16px 0;
253
+ padding: 18px;
254
+ }}
255
+ .banner {{
256
+ border: 1px solid var(--line);
257
+ border-radius: 8px;
258
+ background: var(--subtle);
259
+ padding: 14px;
260
+ margin-top: 12px;
261
+ }}
262
+ .banner.ok {{ border-color: #a7dfbf; background: #edf9f1; }}
263
+ .banner.bad {{ border-color: #f3b1ac; background: #fff1f0; }}
264
+ .summary {{
265
+ display: grid;
266
+ gap: 12px;
267
+ grid-template-columns: repeat(auto-fit, minmax(155px, 1fr));
268
+ }}
269
+ .grid-two {{
270
+ display: grid;
271
+ gap: 12px;
272
+ grid-template-columns: repeat(auto-fit, minmax(260px, 1fr));
273
+ }}
274
+ .card, .item {{
275
+ border: 1px solid var(--line);
276
+ border-radius: 8px;
277
+ background: var(--subtle);
278
+ padding: 12px;
279
+ }}
280
+ .item + .item {{ margin-top: 10px; }}
281
+ .label, .item-meta {{
282
+ color: var(--muted);
283
+ font-size: 12px;
284
+ font-weight: 700;
285
+ text-transform: uppercase;
286
+ }}
287
+ .value {{ margin-top: 4px; font-size: 18px; overflow-wrap: anywhere; }}
288
+ .small-value {{ margin-top: 4px; font-size: 14px; overflow-wrap: anywhere; }}
289
+ .muted {{ color: var(--muted); }}
290
+ table {{ width: 100%; border-collapse: collapse; font-size: 14px; }}
291
+ th, td {{ border-bottom: 1px solid var(--line); padding: 10px; text-align: left; vertical-align: top; }}
292
+ th {{ color: var(--muted); font-size: 12px; text-transform: uppercase; }}
293
+ .badge {{
294
+ display: inline-block;
295
+ border-radius: 999px;
296
+ border: 1px solid var(--line);
297
+ background: #eef2f7;
298
+ padding: 3px 8px;
299
+ font-size: 12px;
300
+ font-weight: 700;
301
+ white-space: nowrap;
302
+ }}
303
+ .status-added-failure, .status-new-failure, .status-verdict-regressed,
304
+ .status-citation-regressed, .status-root-cause-regressed {{ color: var(--bad); background: #fdeceb; }}
305
+ .status-resolved-failure, .status-removed-failure, .status-verdict-improved,
306
+ .status-citation-improved {{ color: var(--ok); background: #e9f7ef; }}
307
+ .status-added-claim, .status-removed-claim, .status-source-changed,
308
+ .status-claim-changed, .status-root-cause-changed {{ color: var(--warn); background: #fff7df; }}
309
+ pre {{
310
+ margin: 0;
311
+ overflow: auto;
312
+ background: #101828;
313
+ color: #f8fafc;
314
+ border-radius: 8px;
315
+ padding: 14px;
316
+ font-size: 13px;
317
+ }}
318
+ </style>
319
+ </head>
320
+ <body>
321
+ <main>
322
+ <header>
323
+ <h1>ContextTrace Regression Report</h1>
324
+ <p class="muted">Local diff of two claim-level evidence verification runs.</p>
325
+ <div class="banner {verdict_class}">
326
+ <strong>Regression: {regression}</strong>
327
+ <span class="muted"> | mode {mode}</span>
328
+ </div>
329
+ </header>
330
+
331
+ <section>
332
+ <h2>Regression Summary</h2>
333
+ <div class="summary">{summary_cards}</div>
334
+ </section>
335
+
336
+ <section>
337
+ <h2>Claim Changes</h2>
338
+ <table>
339
+ <thead>
340
+ <tr>
341
+ <th>Status</th>
342
+ <th>Claim</th>
343
+ <th>Before Verdict</th>
344
+ <th>After Verdict</th>
345
+ <th>Before Root Cause</th>
346
+ <th>After Root Cause</th>
347
+ <th>Suggested Fix</th>
348
+ </tr>
349
+ </thead>
350
+ <tbody>{change_rows}</tbody>
351
+ </table>
352
+ </section>
353
+
354
+ <section>
355
+ <h2>New Failures</h2>
356
+ {new_failures}
357
+ </section>
358
+
359
+ <section>
360
+ <h2>Resolved Failures</h2>
361
+ {resolved_failures}
362
+ </section>
363
+
364
+ <section>
365
+ <h2>Root Cause Changes</h2>
366
+ {root_changes}
367
+ </section>
368
+
369
+ <section>
370
+ <h2>Baseline Summary</h2>
371
+ <div class="summary">{baseline_summary}</div>
372
+ </section>
373
+
374
+ <section>
375
+ <h2>Current Summary</h2>
376
+ <div class="summary">{current_summary}</div>
377
+ </section>
378
+
379
+ <section>
380
+ <h2>Raw JSON Summary</h2>
381
+ <pre>{raw_json}</pre>
382
+ </section>
383
+ </main>
384
+ </body>
385
+ </html>
386
+ """
@@ -33,6 +33,8 @@ contexttrace/verify/abstention.py
33
33
  contexttrace/verify/benchmark.py
34
34
  contexttrace/verify/citations.py
35
35
  contexttrace/verify/claims.py
36
+ contexttrace/verify/compare.py
37
+ contexttrace/verify/compare_report.py
36
38
  contexttrace/verify/demos.py
37
39
  contexttrace/verify/evidence.py
38
40
  contexttrace/verify/external_benchmark_cases.json
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "contexttrace"
7
- version = "0.3.0"
7
+ version = "0.4.0"
8
8
  description = "Local-first SDK and CLI for RAG and agent reliability tracing, citation checks, and failure diagnosis."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -1 +0,0 @@
1
- __version__ = "0.3.0"
File without changes
File without changes
File without changes