contexttrace 0.3.0__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. {contexttrace-0.3.0 → contexttrace-0.5.0}/PKG-INFO +12 -2
  2. {contexttrace-0.3.0 → contexttrace-0.5.0}/README.md +11 -1
  3. contexttrace-0.5.0/contexttrace/_version.py +1 -0
  4. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/cli.py +124 -0
  5. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/verify/__init__.py +9 -0
  6. contexttrace-0.5.0/contexttrace/verify/audit.py +449 -0
  7. contexttrace-0.5.0/contexttrace/verify/audit_report.py +372 -0
  8. contexttrace-0.5.0/contexttrace/verify/compare.py +445 -0
  9. contexttrace-0.5.0/contexttrace/verify/compare_report.py +386 -0
  10. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace.egg-info/SOURCES.txt +4 -0
  11. {contexttrace-0.3.0 → contexttrace-0.5.0}/pyproject.toml +1 -1
  12. contexttrace-0.3.0/contexttrace/_version.py +0 -1
  13. {contexttrace-0.3.0 → contexttrace-0.5.0}/MANIFEST.in +0 -0
  14. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/__init__.py +0 -0
  15. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/client.py +0 -0
  16. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/config.py +0 -0
  17. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/demo.py +0 -0
  18. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/demo_data.py +0 -0
  19. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/endpoint_eval.py +0 -0
  20. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/errors.py +0 -0
  21. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/evaluator.py +0 -0
  22. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/integrations/__init__.py +0 -0
  23. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/integrations/fastapi.py +0 -0
  24. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/integrations/langchain.py +0 -0
  25. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/integrations/langgraph.py +0 -0
  26. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/integrations/llamaindex.py +0 -0
  27. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/integrations/opentelemetry.py +0 -0
  28. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/local.py +0 -0
  29. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/py.typed +0 -0
  30. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/regression.py +0 -0
  31. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/reliability.py +0 -0
  32. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/report.py +0 -0
  33. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/storage/__init__.py +0 -0
  34. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/storage/sqlite_store.py +0 -0
  35. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/thresholds.py +0 -0
  36. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/transport.py +0 -0
  37. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/verify/abstention.py +0 -0
  38. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/verify/benchmark.py +0 -0
  39. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/verify/citations.py +0 -0
  40. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/verify/claims.py +0 -0
  41. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/verify/demos.py +0 -0
  42. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/verify/evidence.py +0 -0
  43. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/verify/external_benchmark_cases.json +0 -0
  44. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/verify/facts.py +0 -0
  45. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/verify/real_benchmark_cases.json +0 -0
  46. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/verify/report.py +0 -0
  47. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/verify/root_cause.py +0 -0
  48. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/verify/runner.py +0 -0
  49. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/verify/schema.py +0 -0
  50. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/verify/spans.py +0 -0
  51. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/verify/verdicts.py +0 -0
  52. {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/viewer.py +0 -0
  53. {contexttrace-0.3.0 → contexttrace-0.5.0}/setup.cfg +0 -0
  54. {contexttrace-0.3.0 → contexttrace-0.5.0}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: contexttrace
3
- Version: 0.3.0
3
+ Version: 0.5.0
4
4
  Summary: Local-first SDK and CLI for RAG and agent reliability tracing, citation checks, and failure diagnosis.
5
5
  Author: ContextTrace contributors
6
6
  License: MIT
@@ -147,6 +147,12 @@ contexttrace verify trace.json --fail-on unsupported --fail-on citation_mismatch
147
147
  contexttrace verify-benchmark --mode semantic
148
148
  contexttrace verify-benchmark --mode semantic --report
149
149
  contexttrace verify-benchmark --case-set external --mode semantic --report
150
+ contexttrace compare baseline.json current.json
151
+ contexttrace compare baseline.json current.json --report
152
+ contexttrace compare baseline.json current.json --fail-on new_failure
153
+ contexttrace audit trace.json --corpus docs/
154
+ contexttrace audit trace.json --corpus docs/ --report
155
+ contexttrace audit trace.json --corpus docs/ --fail-on retrieval_miss
150
156
  ```
151
157
 
152
158
  Input requires `query`, `answer`, and `contexts` with `id` and `text`. Optional `citations` are checked to catch cited sources that do not actually support the matched claim.
@@ -159,7 +165,11 @@ Verification output includes evidence span offsets, stable span hashes, multiple
159
165
 
160
166
  ContextTrace verifies whether each generated claim is actually supported by retrieved evidence. Instead of only showing a trace or a score, it tells you where the evidence chain broke: unsupported claim, citation mismatch, retrieval miss, answer overreach, conflicting context, or should-have-abstained.
161
167
 
162
- The v0.3.0 verifier uses local lexical heuristics by default. Claim extraction is rule-based, contradiction detection is conservative, and semantic or LLM-judge support can be added later.
168
+ Use `contexttrace compare baseline.json current.json` to diff two portable traces or saved `verify --json` outputs. It reports support-rate deltas, new unsupported claims, citation regressions, should-abstain flips, and new root causes, with `--fail-on` gates for CI.
169
+
170
+ Use `contexttrace audit trace.json --corpus docs/` to diagnose whether an unsupported claim failed because retrieval missed evidence, chunking omitted the supporting span, the corpus lacks coverage, or generation overclaimed.
171
+
172
+ The v0.5.0 verifier uses local lexical heuristics by default. Claim extraction is rule-based, contradiction detection is conservative, and semantic or LLM-judge support can be added later.
163
173
 
164
174
  ## What It Catches
165
175
 
@@ -90,6 +90,12 @@ contexttrace verify trace.json --fail-on unsupported --fail-on citation_mismatch
90
90
  contexttrace verify-benchmark --mode semantic
91
91
  contexttrace verify-benchmark --mode semantic --report
92
92
  contexttrace verify-benchmark --case-set external --mode semantic --report
93
+ contexttrace compare baseline.json current.json
94
+ contexttrace compare baseline.json current.json --report
95
+ contexttrace compare baseline.json current.json --fail-on new_failure
96
+ contexttrace audit trace.json --corpus docs/
97
+ contexttrace audit trace.json --corpus docs/ --report
98
+ contexttrace audit trace.json --corpus docs/ --fail-on retrieval_miss
93
99
  ```
94
100
 
95
101
  Input requires `query`, `answer`, and `contexts` with `id` and `text`. Optional `citations` are checked to catch cited sources that do not actually support the matched claim.
@@ -102,7 +108,11 @@ Verification output includes evidence span offsets, stable span hashes, multiple
102
108
 
103
109
  ContextTrace verifies whether each generated claim is actually supported by retrieved evidence. Instead of only showing a trace or a score, it tells you where the evidence chain broke: unsupported claim, citation mismatch, retrieval miss, answer overreach, conflicting context, or should-have-abstained.
104
110
 
105
- The v0.3.0 verifier uses local lexical heuristics by default. Claim extraction is rule-based, contradiction detection is conservative, and semantic or LLM-judge support can be added later.
111
+ Use `contexttrace compare baseline.json current.json` to diff two portable traces or saved `verify --json` outputs. It reports support-rate deltas, new unsupported claims, citation regressions, should-abstain flips, and new root causes, with `--fail-on` gates for CI.
112
+
113
+ Use `contexttrace audit trace.json --corpus docs/` to diagnose whether an unsupported claim failed because retrieval missed evidence, chunking omitted the supporting span, the corpus lacks coverage, or generation overclaimed.
114
+
115
+ The v0.5.0 verifier uses local lexical heuristics by default. Claim extraction is rule-based, contradiction detection is conservative, and semantic or LLM-judge support can be added later.
106
116
 
107
117
  ## What It Catches
108
118
 
@@ -0,0 +1 @@
1
+ __version__ = "0.5.0"
@@ -24,12 +24,18 @@ from contexttrace.storage import SQLiteTraceStore
24
24
  from contexttrace.thresholds import parse_thresholds, threshold_failures
25
25
  from contexttrace.verify import (
26
26
  VerificationInputError,
27
+ audit_failures,
28
+ audit_trace,
29
+ compare_failures,
30
+ compare_trace_files,
27
31
  list_verify_demos,
28
32
  load_trace_file,
29
33
  load_verify_demo,
30
34
  verify_trace,
31
35
  )
32
36
  from contexttrace.verify.benchmark import run_verify_benchmark, write_verify_benchmark_report
37
+ from contexttrace.verify.audit_report import AuditReportGenerator
38
+ from contexttrace.verify.compare_report import CompareReportGenerator
33
39
  from contexttrace.verify.report import VerifyReportGenerator
34
40
  from contexttrace.viewer import serve_viewer
35
41
 
@@ -340,6 +346,124 @@ def verify_benchmark_command(mode: str, case_set: str, json_output: bool, report
340
346
  return 0
341
347
 
342
348
 
349
@cli.command("compare")
@click.argument("baseline_json")
@click.argument("current_json")
@click.option("--json", "json_output", is_flag=True, help="Print the full comparison result as JSON.")
@click.option("--report", is_flag=True, help="Generate a local HTML regression report.")
@click.option("--out", default=None, help="HTML report path. Implies --report when provided.")
@click.option("--mode", default="lexical", show_default=True, type=click.Choice(["lexical", "semantic"]), help="Evidence scoring mode for raw trace inputs.")
@click.option("--fail-on", multiple=True, help="Fail on new_failure, new_unsupported, new_citation_mismatch, should_abstain_flip, support_rate_drop, new_root_cause, or any_regression.")
def compare_command(
    baseline_json: str,
    current_json: str,
    json_output: bool,
    report: bool,
    out: Optional[str],
    mode: str,
    fail_on: tuple[str, ...],
) -> int:
    """Compare two portable RAG traces or verification JSON outputs."""

    # NOTE: the docstring above doubles as the Click help text, so the
    # implementation notes live in comments instead.
    #
    # Flow: build the comparison, optionally write the HTML report, evaluate
    # the --fail-on gates, then print either the raw JSON or a text summary.
    # Returns 1 when any gate produced a message, otherwise 0.
    try:
        result = compare_trace_files(baseline_json, current_json, mode=mode)
    except VerificationInputError as exc:
        # Surface input problems as a regular CLI error.
        raise click.ClickException(str(exc)) from exc

    report_location = None
    if report or out:
        # Without --out the report is named after both input file stems.
        fallback_name = "%s_vs_%s_compare.html" % (Path(baseline_json).stem, Path(current_json).stem)
        destination = out or str(Path(".contexttrace") / "reports" / fallback_name)
        report_location = CompareReportGenerator().generate(result, path=destination)

    gate_messages = compare_failures(result, fail_on)
    exit_code = 1 if gate_messages else 0

    if json_output:
        # Keep stdout pure JSON: the report path goes to stderr in this mode.
        if report_location:
            click.echo("Report: %s" % report_location, err=True)
        click.echo(json.dumps(result, indent=2))
    else:
        summary = result["summary"]
        click.echo("Regression: %s" % str(summary["regression"]).lower())
        click.echo("Support rate: %.3f -> %.3f (%+.3f)" % (
            float(summary.get("support_rate_before") or 0.0),
            float(summary.get("support_rate_after") or 0.0),
            float(summary.get("support_rate_delta") or 0.0),
        ))
        click.echo("Unsupported claim rate delta: %+.3f" % float(summary.get("unsupported_claim_rate_delta") or 0.0))
        click.echo("Citation mismatch delta: %+d" % int(summary.get("citation_mismatch_delta") or 0))
        required_lines = (
            ("New failures: %s", "new_failures"),
            ("Resolved failures: %s", "resolved_failures"),
            ("Added claims: %s", "added_claims"),
            ("Removed claims: %s", "removed_claims"),
            ("Changed claims: %s", "changed_claims"),
        )
        for template, key in required_lines:
            click.echo(template % summary[key])
        click.echo("New root causes: %s" % (", ".join(summary.get("new_root_causes") or []) or "none"))
        if report_location:
            click.echo("Report: %s" % report_location)

    # Gate failures are always reported on stderr, whatever the output mode.
    for message in gate_messages:
        click.echo("Comparison failed: %s" % message, err=True)
    return exit_code
408
+
409
+
410
@cli.command("audit")
@click.argument("trace_json")
@click.option("--corpus", "corpus_path", required=True, help="Local corpus directory or file to search for supporting evidence.")
@click.option("--json", "json_output", is_flag=True, help="Print the full audit result as JSON.")
@click.option("--report", is_flag=True, help="Generate a local HTML retrieval audit report.")
@click.option("--out", default=None, help="HTML report path. Implies --report when provided.")
@click.option("--mode", default="lexical", show_default=True, type=click.Choice(["lexical", "semantic"]), help="Evidence scoring mode.")
@click.option("--fail-on", multiple=True, help="Fail on retrieval_miss, reranking_failure, chunking_issue, corpus_gap, answer_overreach, stale_source, insufficient_context, or any_failure.")
def audit_command(
    trace_json: str,
    corpus_path: str,
    json_output: bool,
    report: bool,
    out: Optional[str],
    mode: str,
    fail_on: tuple[str, ...],
) -> int:
    """Audit a verified trace against a broader local corpus."""

    # NOTE: the docstring above doubles as the Click help text, so the
    # implementation notes live in comments instead.
    #
    # Flow: load the trace, audit it against the corpus, optionally write the
    # HTML report, evaluate the --fail-on gates, then print either the raw
    # JSON or a text summary. Returns 1 when any gate produced a message.
    try:
        trace = load_trace_file(trace_json)
        result = audit_trace(trace, corpus_path=corpus_path, mode=mode)
    except VerificationInputError as exc:
        # Surface input problems as a regular CLI error.
        raise click.ClickException(str(exc)) from exc

    report_location = None
    if report or out:
        # Without --out the report is named after the trace file stem.
        fallback_name = "%s_audit.html" % Path(trace_json).stem
        destination = out or str(Path(".contexttrace") / "reports" / fallback_name)
        report_location = AuditReportGenerator().generate(result, trace, path=destination)

    gate_messages = audit_failures(result, fail_on)
    exit_code = 1 if gate_messages else 0

    if json_output:
        # Keep stdout pure JSON: the report path goes to stderr in this mode.
        if report_location:
            click.echo("Report: %s" % report_location, err=True)
        click.echo(json.dumps(result, indent=2))
    else:
        summary = result["summary"]
        summary_lines = (
            ("Primary audit label: %s", "primary_audit_label"),
            ("Claims audited: %s", "total_claims"),
            ("Corpus documents: %s", "corpus_documents"),
            ("Retrieval misses: %s", "retrieval_miss"),
            ("Chunking issues: %s", "chunking_issue"),
            ("Reranking failures: %s", "reranking_failure"),
            ("Corpus gaps: %s", "corpus_gap"),
            ("Answer overreach: %s", "answer_overreach"),
            ("Insufficient context: %s", "insufficient_context"),
        )
        for template, key in summary_lines:
            click.echo(template % summary[key])
        if report_location:
            click.echo("Report: %s" % report_location)

    # Gate failures are always reported on stderr, whatever the output mode.
    for message in gate_messages:
        click.echo("Audit failed: %s" % message, err=True)
    return exit_code
465
+
466
+
343
467
  def _write_verify_report(
344
468
  result: dict,
345
469
  trace: object,
@@ -1,4 +1,6 @@
1
1
  from contexttrace.verify.runner import verify_trace, verify_trace_file
2
+ from contexttrace.verify.audit import audit_failures, audit_trace, audit_trace_file, load_corpus
3
+ from contexttrace.verify.compare import compare_failures, compare_trace_files, compare_verifications
2
4
  from contexttrace.verify.schema import (
3
5
  RAGTrace,
4
6
  TraceCitation,
@@ -13,7 +15,14 @@ __all__ = [
13
15
  "TraceCitation",
14
16
  "TraceContext",
15
17
  "VerificationInputError",
18
+ "audit_failures",
19
+ "audit_trace",
20
+ "audit_trace_file",
21
+ "compare_failures",
22
+ "compare_trace_files",
23
+ "compare_verifications",
16
24
  "list_verify_demos",
25
+ "load_corpus",
17
26
  "load_trace_file",
18
27
  "load_verify_demo",
19
28
  "verify_trace",
@@ -0,0 +1,449 @@
1
+ from __future__ import annotations
2
+
3
+ from collections import Counter
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+ from contexttrace.verify.claims import Claim
8
+ from contexttrace.verify.evidence import find_best_evidence
9
+ from contexttrace.verify.runner import verify_trace
10
+ from contexttrace.verify.schema import RAGTrace, TraceContext, VerificationInputError, load_trace_file
11
+ from contexttrace.verify.verdicts import classify_claim
12
+
13
+
14
# Audit labels. NO_FAILURE marks a claim that needs no retrieval/corpus fix;
# every other label names one way the evidence chain can break.
NO_FAILURE = "no_failure_detected"
RETRIEVAL_MISS = "retrieval_miss"
RERANKING_FAILURE = "reranking_failure"
CHUNKING_ISSUE = "chunking_issue"
CORPUS_GAP = "corpus_gap"
ANSWER_OVERREACH = "answer_overreach"
STALE_SOURCE = "stale_source"
INSUFFICIENT_CONTEXT = "insufficient_context"

# Every label that counts as an audit failure (everything except NO_FAILURE).
AUDIT_FAILURE_LABELS = {
    RETRIEVAL_MISS,
    RERANKING_FAILURE,
    CHUNKING_ISSUE,
    CORPUS_GAP,
    ANSWER_OVERREACH,
    STALE_SOURCE,
    INSUFFICIENT_CONTEXT,
}

# Citation statuses treated as a citation problem on a claim.
BAD_CITATIONS = {
    "cited_source_missing",
    "cited_source_does_not_support_claim",
    "claim_supported_by_different_source",
}

# Verdicts that mean the claim is backed by evidence.
SUPPORTED_VERDICTS = {"supported"}

# File suffixes (lower-case, with the dot) accepted when walking a corpus
# directory.
CORPUS_EXTENSIONS = {
    ".csv", ".html", ".json", ".jsonl",
    ".md", ".markdown", ".rst", ".text",
    ".tsv", ".txt", ".yaml", ".yml",
}

# Directory names that are never searched for corpus documents.
SKIP_DIRECTORIES = {
    ".contexttrace",
    ".git", ".hg", ".svn",
    ".mypy_cache", ".pytest_cache", ".ruff_cache",
    "__pycache__",
    "build", "dist",
    "node_modules",
}

# Files larger than this many bytes are skipped during the directory walk.
MAX_FILE_BYTES = 1_000_000

# 0-based position in the retrieved context list at or beyond which a
# matching source is reported as a reranking failure.
RERANKING_CUTOFF = 3
67
+
68
+
69
def audit_trace_file(
    trace_path: str | Path,
    *,
    corpus_path: str | Path,
    mode: str = "lexical",
) -> dict[str, Any]:
    """Load the trace stored at ``trace_path`` and audit it against a corpus.

    File-based convenience wrapper: the trace is read with
    ``load_trace_file`` and passed to ``audit_trace`` together with
    ``corpus_path`` and ``mode``. Returns whatever ``audit_trace`` returns.
    """
    return audit_trace(load_trace_file(trace_path), corpus_path=corpus_path, mode=mode)
77
+
78
+
79
def audit_trace(
    trace: RAGTrace,
    *,
    corpus_path: str | Path,
    mode: str = "lexical",
) -> dict[str, Any]:
    """Audit every verified claim of ``trace`` against a local corpus.

    The corpus is loaded first, then the trace is verified, and each claim
    from the verification result is diagnosed against the corpus documents.

    Returns a dict with the query, the answer, an aggregate ``summary``, the
    per-claim audits, selected sections of the verification result, basic
    corpus information and a copy of the trace metadata.
    """
    documents = load_corpus(corpus_path)
    verified = verify_trace(trace, mode=mode)

    audited: list[dict[str, Any]] = []
    for verified_claim in verified.get("claims") or []:
        audited.append(_audit_claim(verified_claim, trace, documents, mode=mode))

    overview = _summary(audited, verified, documents, mode=mode)

    # Only these verification sections are carried into the audit result;
    # a missing or empty section becomes an empty dict.
    verification_view = {
        "summary": verified.get("summary") or {},
        "abstention": verified.get("abstention") or {},
        "diagnostics": verified.get("diagnostics") or {},
    }
    corpus_view = {
        "path": str(Path(corpus_path)),
        "documents": len(documents),
    }
    return {
        "query": trace.query,
        "answer": trace.answer,
        "summary": overview,
        "claims": audited,
        "verification": verification_view,
        "corpus": corpus_view,
        "metadata": dict(trace.metadata),
    }
108
+
109
+
110
def load_corpus(corpus_path: str | Path) -> list[TraceContext]:
    """Read a corpus file or directory into a list of ``TraceContext`` objects.

    A file path is loaded as a single document; a directory is expanded with
    ``_corpus_files``. Documents whose text is empty or whitespace-only are
    dropped.

    Raises:
        VerificationInputError: if the path does not exist, or if no document
            with non-blank text could be read from it.
    """
    root = Path(corpus_path)
    if not root.exists():
        raise VerificationInputError("Corpus path %s does not exist." % root)

    candidates = [root] if root.is_file() else _corpus_files(root)

    documents: list[TraceContext] = []
    for candidate in candidates:
        body = _read_text(candidate)
        if body.strip():
            document_id = _context_id(candidate, root)
            document = TraceContext(
                id=document_id,
                text=body,
                metadata={
                    "path": str(candidate),
                    "source": document_id,
                    "size_bytes": candidate.stat().st_size,
                    "kind": "corpus_document",
                },
            )
            documents.append(document)

    if documents:
        return documents
    raise VerificationInputError("Corpus path %s did not contain readable text documents." % root)
138
+
139
+
140
def audit_failures(result: dict[str, Any], fail_on: tuple[str, ...]) -> list[str]:
    """Turn ``--fail-on`` rules into failure messages for an audit result.

    Each rule is normalised (trimmed, lower-cased, ``-`` replaced by ``_``)
    and checked against ``result["summary"]``: ``any_failure`` looks at
    ``has_audit_failures``, a label rule looks at that label's count.
    A rule that is neither is reported as unknown, using the raw text.

    Returns one message per triggered or unknown rule, in the order the
    rules were given; an empty list when ``fail_on`` is empty.
    """
    if not fail_on:
        return []

    detected_messages = {
        RETRIEVAL_MISS: "retrieval miss detected",
        RERANKING_FAILURE: "reranking failure detected",
        CHUNKING_ISSUE: "chunking issue detected",
        CORPUS_GAP: "corpus gap detected",
        ANSWER_OVERREACH: "answer overreach detected",
        STALE_SOURCE: "stale source detected",
        INSUFFICIENT_CONTEXT: "insufficient context detected",
    }
    summary = result.get("summary") or {}

    messages = []
    for requested in fail_on:
        rule = requested.strip().lower().replace("-", "_")
        if rule == "any_failure":
            triggered = bool(summary.get("has_audit_failures"))
            text = "audit failure detected"
        elif rule in detected_messages:
            triggered = int(summary.get(rule) or 0) > 0
            text = detected_messages[rule]
        else:
            # Unknown rules always produce a message so typos are not ignored.
            triggered = True
            text = "unknown --fail-on rule %s" % requested
        if triggered:
            messages.append(text)
    return messages
166
+
167
+
168
def _audit_claim(
    claim: dict[str, Any],
    trace: RAGTrace,
    corpus_contexts: list[TraceContext],
    *,
    mode: str,
) -> dict[str, Any]:
    """Build the audit record for one verified claim.

    The claim text is matched against the corpus documents with
    ``find_best_evidence`` and classified with ``classify_claim``; the
    retrieved-side facts in ``claim`` and the corpus-side result are then
    combined by ``_diagnose``.

    Returns a dict with the diagnosis fields plus a ``retrieved`` section
    (copied from ``claim``) and a ``corpus`` section (from the corpus match).
    """
    text = str(claim.get("claim") or "")
    identifier = str(claim.get("claim_id") or "")

    match = find_best_evidence(text, corpus_contexts, mode=mode)
    # An empty claim id is replaced by the placeholder "claim" for the Claim.
    corpus_claim = Claim(id=identifier or "claim", text=text)
    corpus_result = classify_claim(
        corpus_claim,
        match,
        has_contexts=bool(corpus_contexts),
    )
    diagnosis = _diagnose(claim, trace, match, corpus_result)

    audit: dict[str, Any] = {
        "claim_id": identifier,
        "claim": text,
        "audit_label": diagnosis["label"],
        "confidence": diagnosis["confidence"],
        "reason": diagnosis["reason"],
        "suggested_fix": diagnosis["suggested_fix"],
    }
    # What verification found in the contexts that were actually retrieved.
    audit["retrieved"] = {
        "verdict": claim.get("verdict"),
        "best_context_id": claim.get("best_context_id"),
        "best_score": claim.get("best_score"),
        "evidence": claim.get("evidence"),
        "matched_terms": list(claim.get("matched_terms") or []),
        "root_cause": (claim.get("root_cause") or {}).get("label"),
        "citation_status": claim.get("citation_status"),
    }
    # What the same claim looks like against the broader corpus.
    audit["corpus"] = {
        "verdict": corpus_result.verdict,
        "best_document_id": match.context_id,
        "best_score": match.score,
        "evidence": match.snippet,
        "matched_terms": list(match.matched_terms),
        "evidence_span": match.span_dict(),
        "supporting_spans": list(match.supporting_spans or []),
        "required_facts": list(corpus_result.required_facts),
        "matched_facts": list(corpus_result.matched_facts),
        "missing_facts": list(corpus_result.missing_facts),
        "conflicting_facts": list(corpus_result.conflicting_facts),
    }
    return audit
214
+
215
+
216
def _diagnose(
    claim: dict[str, Any],
    trace: RAGTrace,
    corpus_match: object,
    corpus_verification: object,
) -> dict[str, Any]:
    """Pick the audit label for one claim.

    Checks run in a fixed order and the first match wins:

    1. citation-only problem on a supported claim -> no failure
    2. claim is not a failure at all -> no failure
    3. contradiction, or stale/conflicting root cause -> stale source
    4. corpus supports the claim -> retrieval miss, reranking failure or
       chunking issue, depending on where the source sits in the trace
    5. answer overreach root cause / partial support -> answer overreach
    6. unverifiable on either side -> insufficient context
    7. bad citation with a corpus score of at least 0.35 -> insufficient context
    8. otherwise -> corpus gap

    Returns the dict produced by ``_result`` (label, confidence, reason,
    suggested fix).
    """
    retrieved_verdict = str(claim.get("verdict") or "")
    root_cause = claim.get("root_cause") or {}
    root_label = str(root_cause.get("label") or NO_FAILURE)
    citation_status = str(claim.get("citation_status") or "")
    corpus_verdict = str(getattr(corpus_verification, "verdict", ""))
    corpus_score = float(getattr(corpus_match, "score", 0.0) or 0.0)
    best_document_id = str(getattr(corpus_match, "context_id", "") or "")
    retrieved_rank = _same_source_retrieved_rank(best_document_id, trace)

    def bounded(floor: float, ceiling: float, value: float) -> float:
        # Keep a score-derived confidence inside [floor, ceiling].
        return max(floor, min(ceiling, value))

    if _is_citation_only_failure(claim):
        return _result(
            NO_FAILURE,
            0.92,
            "The claim is supported by retrieved evidence; the remaining issue is citation-level, not a retrieval or corpus failure.",
            "Fix the claim-level citation, but do not treat this as a retrieval miss.",
        )

    if not _is_failure(claim):
        return _result(
            NO_FAILURE,
            0.99,
            "The claim is already supported by the retrieved contexts.",
            "No fix needed for this claim.",
        )

    conflicting = (
        "contradicted" in (retrieved_verdict, corpus_verdict)
        or root_label in {"stale_context", "conflicting_contexts"}
    )
    if conflicting:
        return _result(
            STALE_SOURCE,
            0.86,
            "The claim appears to conflict with retrieved or corpus evidence.",
            "Resolve stale or conflicting sources before allowing the answer to use this fact.",
        )

    if corpus_verdict in SUPPORTED_VERDICTS:
        # The corpus can back the claim, so the retrieval pipeline is at
        # fault; which stage depends on whether and where the source was
        # retrieved.
        if retrieved_rank is None:
            return _result(
                RETRIEVAL_MISS,
                bounded(0.82, 0.98, corpus_score + 0.12),
                "The broader corpus contains evidence for this claim, but the retrieved contexts did not include it.",
                "Improve retrieval recall, filters, query rewriting, or top_k so this source is retrieved.",
            )
        pipeline_confidence = bounded(0.78, 0.95, corpus_score + 0.08)
        if retrieved_rank >= RERANKING_CUTOFF:
            return _result(
                RERANKING_FAILURE,
                pipeline_confidence,
                "A related source was retrieved, but it appeared too low in the retrieved context list for reliable generation.",
                "Add a reranker or raise high-evidence chunks from this source before generation.",
            )
        return _result(
            CHUNKING_ISSUE,
            pipeline_confidence,
            "The retrieved source appears related, but the retrieved chunk omitted the supporting span found in the corpus.",
            "Adjust chunk boundaries, overlap, or parent-document retrieval so the answerable span is included.",
        )

    if root_label == "answer_overreach" or retrieved_verdict == "partially_supported":
        return _result(
            ANSWER_OVERREACH,
            0.82,
            "The evidence supports part of the claim, but not every required fact.",
            "Remove unsupported details or retrieve evidence that explicitly supports each detail.",
        )

    if corpus_verdict == "partially_supported":
        return _result(
            ANSWER_OVERREACH,
            0.78,
            "The corpus supports only part of the claim, so the answer likely added unsupported detail.",
            "Split the claim and require support for every required fact before answering.",
        )

    if "unverifiable" in (corpus_verdict, retrieved_verdict):
        return _result(
            INSUFFICIENT_CONTEXT,
            0.72,
            "The closest corpus evidence is related but too weak or ambiguous to verify the claim.",
            "Retrieve more specific evidence or force the model to qualify/abstain.",
        )

    if citation_status in BAD_CITATIONS and corpus_score >= 0.35:
        return _result(
            INSUFFICIENT_CONTEXT,
            0.7,
            "The claim has a citation problem and the broader corpus evidence is still not strong enough.",
            "Regenerate claim-level citations and require cited sources to cover all required facts.",
        )

    # Nothing in the corpus helps: the weaker the best match, the more
    # confident the gap diagnosis.
    return _result(
        CORPUS_GAP,
        bounded(0.7, 0.95, 1.0 - corpus_score),
        "Neither the retrieved contexts nor the broader corpus provide enough support for this claim.",
        "Add the missing source to the corpus or make the answer abstain when the corpus lacks this fact.",
    )
313
+
314
+
315
def _summary(
    claim_audits: list[dict[str, Any]],
    verification: dict[str, Any],
    corpus_contexts: list[TraceContext],
    *,
    mode: str,
) -> dict[str, Any]:
    """Aggregate per-claim audits into the summary section of the result.

    Besides totals and the primary label, the summary carries one count per
    label: ``NO_FAILURE`` first, then the failure labels in sorted order.
    """
    label_counts = Counter(str(audit.get("audit_label") or NO_FAILURE) for audit in claim_audits)
    failing_total = sum(label_counts[label] for label in AUDIT_FAILURE_LABELS)
    flagged_total = sum(1 for audit in claim_audits if audit.get("audit_label") != NO_FAILURE)
    verification_summary_before = verification.get("summary") or {}

    summary: dict[str, Any] = {
        "mode": mode,
        "total_claims": len(claim_audits),
        "audited_claims": flagged_total,
        "corpus_documents": len(corpus_contexts),
        "has_audit_failures": failing_total > 0,
        "primary_audit_label": _primary_label(label_counts),
        "verification_failure_type": verification_summary_before.get("failure_type"),
        "verification_primary_root_cause": (verification.get("summary") or {}).get("primary_root_cause"),
    }
    # Per-label counts come last; labels that never occurred are reported as 0.
    for label in [NO_FAILURE] + sorted(AUDIT_FAILURE_LABELS):
        summary[label] = label_counts[label]
    return summary
336
+
337
+
338
def _primary_label(counts: Counter) -> str:
    """Return the dominant failure label, or ``NO_FAILURE`` when none occurred.

    The label with the highest count wins; equal counts are resolved by the
    fixed priority order below (earlier entries win).
    """
    priority = [
        RETRIEVAL_MISS,
        CHUNKING_ISSUE,
        RERANKING_FAILURE,
        CORPUS_GAP,
        ANSWER_OVERREACH,
        STALE_SOURCE,
        INSUFFICIENT_CONTEXT,
    ]
    occurring = {label: counts[label] for label in AUDIT_FAILURE_LABELS if counts[label]}
    if not occurring:
        return NO_FAILURE

    def sort_key(label: str) -> tuple[int, int]:
        # Highest count first, then the earliest position in the priority
        # list; a label missing from the list ranks after all listed ones.
        position = priority.index(label) if label in priority else len(priority)
        return (-occurring[label], position)

    return sorted(occurring, key=sort_key)[0]
358
+
359
+
360
def _is_failure(claim: dict[str, Any]) -> bool:
    """Return True when the verified claim shows any kind of problem.

    A claim is a failure if its verdict is not a supported one, if its
    citation status is a bad one, or if it carries a root-cause label other
    than ``NO_FAILURE``.
    """
    if str(claim.get("verdict") or "") not in SUPPORTED_VERDICTS:
        return True
    if str(claim.get("citation_status") or "") in BAD_CITATIONS:
        return True
    root_cause = claim.get("root_cause") or {}
    return str(root_cause.get("label") or NO_FAILURE) != NO_FAILURE
366
+
367
+
368
def _is_citation_only_failure(claim: dict[str, Any]) -> bool:
    """Return True when the only problem with a supported claim is its citation.

    Requires a supported verdict, a bad citation status, and a root-cause
    label that is either citation-related or absent.
    """
    if str(claim.get("verdict") or "") not in SUPPORTED_VERDICTS:
        return False
    if str(claim.get("citation_status") or "") not in BAD_CITATIONS:
        return False
    root_cause = claim.get("root_cause") or {}
    root_label = str(root_cause.get("label") or NO_FAILURE)
    return root_label in {"wrong_source_cited", "missing_cited_source", NO_FAILURE}
375
+
376
+
377
def _same_source_retrieved_rank(corpus_context_id: str, trace: RAGTrace) -> int | None:
    """Find where the corpus document's source appears among retrieved contexts.

    Each retrieved context is compared through its id and the ``source``,
    ``path``, ``file`` and ``document`` metadata entries.

    Returns the 0-based index of the first matching context in
    ``trace.contexts``, or ``None`` when the id is empty after normalisation
    or nothing matches.
    """
    wanted = _source_key(corpus_context_id)
    if not wanted:
        return None

    for position, retrieved in enumerate(trace.contexts):
        metadata = retrieved.metadata
        identifiers = (
            retrieved.id,
            metadata.get("source"),
            metadata.get("path"),
            metadata.get("file"),
            metadata.get("document"),
        )
        for identifier in identifiers:
            if _sources_match(wanted, _source_key(identifier)):
                return position
    return None
392
+
393
+
394
+ def _sources_match(left: str, right: str) -> bool:
395
+ if not left or not right:
396
+ return False
397
+ if left == right:
398
+ return True
399
+ return Path(left).name == Path(right).name
400
+
401
+
402
+ def _source_key(value: Any) -> str:
403
+ text = str(value or "").strip().replace("\\", "/").lower()
404
+ return text.strip("./")
405
+
406
+
407
+ def _result(label: str, confidence: float, reason: str, suggested_fix: str) -> dict[str, Any]:
408
+ return {
409
+ "label": label,
410
+ "confidence": round(confidence, 3),
411
+ "reason": reason,
412
+ "suggested_fix": suggested_fix,
413
+ }
414
+
415
+
416
def _corpus_files(root: Path) -> list[Path]:
    """Collect the corpus documents found underneath the directory ``root``.

    A path is kept when it is a regular file, none of the directories between
    ``root`` and the file is listed in ``SKIP_DIRECTORIES``, its lower-cased
    suffix is in ``CORPUS_EXTENSIONS``, and it is no larger than
    ``MAX_FILE_BYTES``.

    Returns the kept paths sorted case-insensitively by their string form.
    """
    files: list[Path] = []
    for path in root.rglob("*"):
        if not path.is_file():
            continue
        # Only directories *below* the corpus root are checked against the
        # skip list. Testing every component of the full path would discard
        # the whole corpus whenever the root itself, or one of its ancestors,
        # happens to be named e.g. "build" or "dist".
        nested_directories = path.relative_to(root).parts[:-1]
        if any(part in SKIP_DIRECTORIES for part in nested_directories):
            continue
        if path.suffix.lower() not in CORPUS_EXTENSIONS:
            continue
        if path.stat().st_size > MAX_FILE_BYTES:
            continue
        files.append(path)
    return sorted(files, key=lambda item: str(item).lower())
429
+
430
+
431
+ def _read_text(path: Path) -> str:
432
+ try:
433
+ return path.read_text(encoding="utf-8")
434
+ except UnicodeDecodeError:
435
+ try:
436
+ return path.read_text(encoding="utf-8", errors="ignore")
437
+ except OSError:
438
+ return ""
439
+ except OSError:
440
+ return ""
441
+
442
+
443
+ def _context_id(path: Path, root: Path) -> str:
444
+ if root.is_file():
445
+ return path.name
446
+ try:
447
+ return path.relative_to(root).as_posix()
448
+ except ValueError:
449
+ return path.name