contexttrace 0.4.0__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {contexttrace-0.4.0 → contexttrace-0.5.0}/PKG-INFO +7 -2
- {contexttrace-0.4.0 → contexttrace-0.5.0}/README.md +6 -1
- contexttrace-0.5.0/contexttrace/_version.py +1 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/cli.py +60 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/verify/__init__.py +5 -0
- contexttrace-0.5.0/contexttrace/verify/audit.py +449 -0
- contexttrace-0.5.0/contexttrace/verify/audit_report.py +372 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace.egg-info/SOURCES.txt +2 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/pyproject.toml +1 -1
- contexttrace-0.4.0/contexttrace/_version.py +0 -1
- {contexttrace-0.4.0 → contexttrace-0.5.0}/MANIFEST.in +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/__init__.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/client.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/config.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/demo.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/demo_data.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/endpoint_eval.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/errors.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/evaluator.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/integrations/__init__.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/integrations/fastapi.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/integrations/langchain.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/integrations/langgraph.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/integrations/llamaindex.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/integrations/opentelemetry.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/local.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/py.typed +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/regression.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/reliability.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/report.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/storage/__init__.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/storage/sqlite_store.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/thresholds.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/transport.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/verify/abstention.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/verify/benchmark.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/verify/citations.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/verify/claims.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/verify/compare.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/verify/compare_report.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/verify/demos.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/verify/evidence.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/verify/external_benchmark_cases.json +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/verify/facts.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/verify/real_benchmark_cases.json +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/verify/report.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/verify/root_cause.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/verify/runner.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/verify/schema.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/verify/spans.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/verify/verdicts.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/contexttrace/viewer.py +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/setup.cfg +0 -0
- {contexttrace-0.4.0 → contexttrace-0.5.0}/setup.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: contexttrace
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Summary: Local-first SDK and CLI for RAG and agent reliability tracing, citation checks, and failure diagnosis.
|
|
5
5
|
Author: ContextTrace contributors
|
|
6
6
|
License: MIT
|
|
@@ -150,6 +150,9 @@ contexttrace verify-benchmark --case-set external --mode semantic --report
|
|
|
150
150
|
contexttrace compare baseline.json current.json
|
|
151
151
|
contexttrace compare baseline.json current.json --report
|
|
152
152
|
contexttrace compare baseline.json current.json --fail-on new_failure
|
|
153
|
+
contexttrace audit trace.json --corpus docs/
|
|
154
|
+
contexttrace audit trace.json --corpus docs/ --report
|
|
155
|
+
contexttrace audit trace.json --corpus docs/ --fail-on retrieval_miss
|
|
153
156
|
```
|
|
154
157
|
|
|
155
158
|
Input requires `query`, `answer`, and `contexts` with `id` and `text`. Optional `citations` are checked to catch cited sources that do not actually support the matched claim.
|
|
@@ -164,7 +167,9 @@ ContextTrace verifies whether each generated claim is actually supported by retr
|
|
|
164
167
|
|
|
165
168
|
Use `contexttrace compare baseline.json current.json` to diff two portable traces or saved `verify --json` outputs. It reports support-rate deltas, new unsupported claims, citation regressions, should-abstain flips, and new root causes, with `--fail-on` gates for CI.
|
|
166
169
|
|
|
167
|
-
|
|
170
|
+
Use `contexttrace audit trace.json --corpus docs/` to diagnose whether an unsupported claim failed because retrieval missed evidence, chunking omitted the supporting span, the corpus lacks coverage, or generation overclaimed.
|
|
171
|
+
|
|
172
|
+
The v0.5.0 verifier uses local lexical heuristics by default. Claim extraction is rule-based, contradiction detection is conservative, and semantic or LLM-judge support can be added later.
|
|
168
173
|
|
|
169
174
|
## What It Catches
|
|
170
175
|
|
|
@@ -93,6 +93,9 @@ contexttrace verify-benchmark --case-set external --mode semantic --report
|
|
|
93
93
|
contexttrace compare baseline.json current.json
|
|
94
94
|
contexttrace compare baseline.json current.json --report
|
|
95
95
|
contexttrace compare baseline.json current.json --fail-on new_failure
|
|
96
|
+
contexttrace audit trace.json --corpus docs/
|
|
97
|
+
contexttrace audit trace.json --corpus docs/ --report
|
|
98
|
+
contexttrace audit trace.json --corpus docs/ --fail-on retrieval_miss
|
|
96
99
|
```
|
|
97
100
|
|
|
98
101
|
Input requires `query`, `answer`, and `contexts` with `id` and `text`. Optional `citations` are checked to catch cited sources that do not actually support the matched claim.
|
|
@@ -107,7 +110,9 @@ ContextTrace verifies whether each generated claim is actually supported by retr
|
|
|
107
110
|
|
|
108
111
|
Use `contexttrace compare baseline.json current.json` to diff two portable traces or saved `verify --json` outputs. It reports support-rate deltas, new unsupported claims, citation regressions, should-abstain flips, and new root causes, with `--fail-on` gates for CI.
|
|
109
112
|
|
|
110
|
-
|
|
113
|
+
Use `contexttrace audit trace.json --corpus docs/` to diagnose whether an unsupported claim failed because retrieval missed evidence, chunking omitted the supporting span, the corpus lacks coverage, or generation overclaimed.
|
|
114
|
+
|
|
115
|
+
The v0.5.0 verifier uses local lexical heuristics by default. Claim extraction is rule-based, contradiction detection is conservative, and semantic or LLM-judge support can be added later.
|
|
111
116
|
|
|
112
117
|
## What It Catches
|
|
113
118
|
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.5.0"
|
|
@@ -24,6 +24,8 @@ from contexttrace.storage import SQLiteTraceStore
|
|
|
24
24
|
from contexttrace.thresholds import parse_thresholds, threshold_failures
|
|
25
25
|
from contexttrace.verify import (
|
|
26
26
|
VerificationInputError,
|
|
27
|
+
audit_failures,
|
|
28
|
+
audit_trace,
|
|
27
29
|
compare_failures,
|
|
28
30
|
compare_trace_files,
|
|
29
31
|
list_verify_demos,
|
|
@@ -32,6 +34,7 @@ from contexttrace.verify import (
|
|
|
32
34
|
verify_trace,
|
|
33
35
|
)
|
|
34
36
|
from contexttrace.verify.benchmark import run_verify_benchmark, write_verify_benchmark_report
|
|
37
|
+
from contexttrace.verify.audit_report import AuditReportGenerator
|
|
35
38
|
from contexttrace.verify.compare_report import CompareReportGenerator
|
|
36
39
|
from contexttrace.verify.report import VerifyReportGenerator
|
|
37
40
|
from contexttrace.viewer import serve_viewer
|
|
@@ -404,6 +407,63 @@ def compare_command(
|
|
|
404
407
|
return 1 if fail_messages else 0
|
|
405
408
|
|
|
406
409
|
|
|
410
|
+
@cli.command("audit")
@click.argument("trace_json")
@click.option("--corpus", "corpus_path", required=True, help="Local corpus directory or file to search for supporting evidence.")
@click.option("--json", "json_output", is_flag=True, help="Print the full audit result as JSON.")
@click.option("--report", is_flag=True, help="Generate a local HTML retrieval audit report.")
@click.option("--out", default=None, help="HTML report path. Implies --report when provided.")
@click.option("--mode", default="lexical", show_default=True, type=click.Choice(["lexical", "semantic"]), help="Evidence scoring mode.")
@click.option("--fail-on", multiple=True, help="Fail on retrieval_miss, reranking_failure, chunking_issue, corpus_gap, answer_overreach, stale_source, insufficient_context, or any_failure.")
def audit_command(
    trace_json: str,
    corpus_path: str,
    json_output: bool,
    report: bool,
    out: Optional[str],
    mode: str,
    fail_on: tuple[str, ...],
) -> int:
    """Audit a verified trace against a broader local corpus."""
    # NOTE: the docstring above is what click shows as the command help, so it
    # is user-facing text rather than free-form documentation.

    # Input problems are reported through click instead of a traceback.
    try:
        trace = load_trace_file(trace_json)
        result = audit_trace(trace, corpus_path=corpus_path, mode=mode)
    except VerificationInputError as exc:
        raise click.ClickException(str(exc)) from exc

    # --out implies --report; without --out the report goes under
    # .contexttrace/reports/ and is named after the trace file's stem.
    written_report = None
    if report or out:
        fallback_path = Path(".contexttrace") / "reports" / ("%s_audit.html" % Path(trace_json).stem)
        written_report = AuditReportGenerator().generate(result, trace, path=out or str(fallback_path))

    fail_messages = audit_failures(result, fail_on)
    exit_code = 1 if fail_messages else 0

    if json_output:
        # Keep stdout pure JSON: the report location goes to stderr.
        if written_report:
            click.echo("Report: %s" % written_report, err=True)
        click.echo(json.dumps(result, indent=2))
    else:
        summary = result["summary"]
        summary_lines = (
            ("Primary audit label: %s", "primary_audit_label"),
            ("Claims audited: %s", "total_claims"),
            ("Corpus documents: %s", "corpus_documents"),
            ("Retrieval misses: %s", "retrieval_miss"),
            ("Chunking issues: %s", "chunking_issue"),
            ("Reranking failures: %s", "reranking_failure"),
            ("Corpus gaps: %s", "corpus_gap"),
            ("Answer overreach: %s", "answer_overreach"),
            ("Insufficient context: %s", "insufficient_context"),
        )
        for template, key in summary_lines:
            click.echo(template % summary[key])
        if written_report:
            click.echo("Report: %s" % written_report)

    # Gate failures always go to stderr, in both output modes.
    for message in fail_messages:
        click.echo("Audit failed: %s" % message, err=True)
    return exit_code
|
|
465
|
+
|
|
466
|
+
|
|
407
467
|
def _write_verify_report(
|
|
408
468
|
result: dict,
|
|
409
469
|
trace: object,
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
from contexttrace.verify.runner import verify_trace, verify_trace_file
|
|
2
|
+
from contexttrace.verify.audit import audit_failures, audit_trace, audit_trace_file, load_corpus
|
|
2
3
|
from contexttrace.verify.compare import compare_failures, compare_trace_files, compare_verifications
|
|
3
4
|
from contexttrace.verify.schema import (
|
|
4
5
|
RAGTrace,
|
|
@@ -14,10 +15,14 @@ __all__ = [
|
|
|
14
15
|
"TraceCitation",
|
|
15
16
|
"TraceContext",
|
|
16
17
|
"VerificationInputError",
|
|
18
|
+
"audit_failures",
|
|
19
|
+
"audit_trace",
|
|
20
|
+
"audit_trace_file",
|
|
17
21
|
"compare_failures",
|
|
18
22
|
"compare_trace_files",
|
|
19
23
|
"compare_verifications",
|
|
20
24
|
"list_verify_demos",
|
|
25
|
+
"load_corpus",
|
|
21
26
|
"load_trace_file",
|
|
22
27
|
"load_verify_demo",
|
|
23
28
|
"verify_trace",
|
|
@@ -0,0 +1,449 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections import Counter
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from contexttrace.verify.claims import Claim
|
|
8
|
+
from contexttrace.verify.evidence import find_best_evidence
|
|
9
|
+
from contexttrace.verify.runner import verify_trace
|
|
10
|
+
from contexttrace.verify.schema import RAGTrace, TraceContext, VerificationInputError, load_trace_file
|
|
11
|
+
from contexttrace.verify.verdicts import classify_claim
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Audit labels. NO_FAILURE marks a claim with nothing to fix; the others name
# the diagnosis attached to a claim by _diagnose.
NO_FAILURE = "no_failure_detected"
RETRIEVAL_MISS = "retrieval_miss"
RERANKING_FAILURE = "reranking_failure"
CHUNKING_ISSUE = "chunking_issue"
CORPUS_GAP = "corpus_gap"
ANSWER_OVERREACH = "answer_overreach"
STALE_SOURCE = "stale_source"
INSUFFICIENT_CONTEXT = "insufficient_context"

# Every label that counts as an audit failure in the summary and that is
# accepted as a --fail-on rule name by audit_failures.
AUDIT_FAILURE_LABELS = {
    RETRIEVAL_MISS,
    RERANKING_FAILURE,
    CHUNKING_ISSUE,
    CORPUS_GAP,
    ANSWER_OVERREACH,
    STALE_SOURCE,
    INSUFFICIENT_CONTEXT,
}
# citation_status values that are treated as a citation problem.
BAD_CITATIONS = {
    "cited_source_missing",
    "cited_source_does_not_support_claim",
    "claim_supported_by_different_source",
}
# Verdicts that count as "the claim is supported".
SUPPORTED_VERDICTS = {"supported"}
# File suffixes (compared lower-cased) accepted when walking a corpus directory.
CORPUS_EXTENSIONS = {
    ".csv",
    ".html",
    ".json",
    ".jsonl",
    ".md",
    ".markdown",
    ".rst",
    ".text",
    ".tsv",
    ".txt",
    ".yaml",
    ".yml",
}
# Files whose path contains one of these directory names are skipped when
# walking a corpus directory.
SKIP_DIRECTORIES = {
    ".contexttrace",
    ".git",
    ".hg",
    ".mypy_cache",
    ".pytest_cache",
    ".ruff_cache",
    ".svn",
    "__pycache__",
    "build",
    "dist",
    "node_modules",
}
# Files larger than this many bytes are skipped when walking a corpus directory.
MAX_FILE_BYTES = 1_000_000
# A supporting source found at this 0-based position or later in the retrieved
# context list is diagnosed as a reranking failure rather than a chunking issue.
RERANKING_CUTOFF = 3
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def audit_trace_file(
    trace_path: str | Path,
    *,
    corpus_path: str | Path,
    mode: str = "lexical",
) -> dict[str, Any]:
    """Load the trace stored at ``trace_path`` and audit it against a corpus.

    Convenience wrapper: the file is parsed with ``load_trace_file`` and the
    resulting trace is handed to ``audit_trace`` with the same ``corpus_path``
    and ``mode``. Returns whatever ``audit_trace`` returns.
    """
    return audit_trace(load_trace_file(trace_path), corpus_path=corpus_path, mode=mode)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def audit_trace(
    trace: RAGTrace,
    *,
    corpus_path: str | Path,
    mode: str = "lexical",
) -> dict[str, Any]:
    """Verify ``trace`` and diagnose each claim against a broader local corpus.

    The corpus is loaded first (so corpus errors surface before verification),
    then the trace is verified and every verified claim is audited.

    Returns a dict with the keys ``query``, ``answer``, ``summary``,
    ``claims``, ``verification``, ``corpus`` and ``metadata``.
    """
    documents = load_corpus(corpus_path)
    verified = verify_trace(trace, mode=mode)

    audited: list[dict[str, Any]] = []
    for verified_claim in verified.get("claims") or []:
        audited.append(_audit_claim(verified_claim, trace, documents, mode=mode))

    overview = _summary(audited, verified, documents, mode=mode)

    # Only these three sections of the verification output are carried over;
    # a missing or empty section becomes an empty dict.
    verification_view = {
        section: verified.get(section) or {}
        for section in ("summary", "abstention", "diagnostics")
    }
    corpus_view = {
        "path": str(Path(corpus_path)),
        "documents": len(documents),
    }
    return {
        "query": trace.query,
        "answer": trace.answer,
        "summary": overview,
        "claims": audited,
        "verification": verification_view,
        "corpus": corpus_view,
        "metadata": dict(trace.metadata),
    }
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def load_corpus(corpus_path: str | Path) -> list[TraceContext]:
    """Read a corpus file or directory into a list of ``TraceContext`` objects.

    A single file is used as-is; a directory is expanded through
    ``_corpus_files``. Documents whose text is empty or whitespace-only are
    dropped. Each context carries ``path``, ``source``, ``size_bytes`` and
    ``kind`` metadata.

    Raises:
        VerificationInputError: if the path does not exist, or if no document
            with non-blank text was found.
    """
    root = Path(corpus_path)
    if not root.exists():
        raise VerificationInputError("Corpus path %s does not exist." % root)

    candidates = _corpus_files(root) if not root.is_file() else [root]

    documents: list[TraceContext] = []
    for candidate in candidates:
        body = _read_text(candidate)
        if body.strip():
            document_id = _context_id(candidate, root)
            details = {
                "path": str(candidate),
                "source": document_id,
                "size_bytes": candidate.stat().st_size,
                "kind": "corpus_document",
            }
            documents.append(TraceContext(id=document_id, text=body, metadata=details))

    if documents:
        return documents
    raise VerificationInputError("Corpus path %s did not contain readable text documents." % root)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def audit_failures(result: dict[str, Any], fail_on: tuple[str, ...]) -> list[str]:
    """Translate ``--fail-on`` rules into failure messages for an audit result.

    Rules are normalised (trimmed, lower-cased, ``-`` turned into ``_``)
    before matching. ``any_failure`` fires when the summary flags any audit
    failure; a label rule fires when that label's count in the summary is
    positive; anything else is reported as an unknown rule.

    Returns one message per rule that fired, in the order the rules were
    given. An empty ``fail_on`` yields an empty list.
    """
    if not fail_on:
        return []

    summary = result.get("summary") or {}
    # Message emitted when the count for the given label is above zero.
    label_messages = {
        RETRIEVAL_MISS: "retrieval miss detected",
        RERANKING_FAILURE: "reranking failure detected",
        CHUNKING_ISSUE: "chunking issue detected",
        CORPUS_GAP: "corpus gap detected",
        ANSWER_OVERREACH: "answer overreach detected",
        STALE_SOURCE: "stale source detected",
        INSUFFICIENT_CONTEXT: "insufficient context detected",
    }

    messages = []
    for raw_rule in fail_on:
        rule = raw_rule.strip().lower().replace("-", "_")
        if rule == "any_failure":
            if bool(summary.get("has_audit_failures")):
                messages.append("audit failure detected")
        elif rule in label_messages:
            if int(summary.get(rule) or 0) > 0:
                messages.append(label_messages[rule])
        else:
            # The unknown rule is echoed exactly as the caller typed it.
            messages.append("unknown --fail-on rule %s" % raw_rule)
    return messages
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _audit_claim(
    claim: dict[str, Any],
    trace: RAGTrace,
    corpus_contexts: list[TraceContext],
    *,
    mode: str,
) -> dict[str, Any]:
    """Audit one verified claim against the corpus documents.

    The claim text is matched against ``corpus_contexts`` with
    ``find_best_evidence``, that match is classified with ``classify_claim``,
    and ``_diagnose`` combines the retrieved-side and corpus-side results.

    Returns a dict holding the claim, its diagnosis (``audit_label``,
    ``confidence``, ``reason``, ``suggested_fix``), a ``retrieved`` section
    copied from the incoming claim, and a ``corpus`` section describing the
    best corpus match.
    """
    text = str(claim.get("claim") or "")
    identifier = str(claim.get("claim_id") or "")

    best_match = find_best_evidence(text, corpus_contexts, mode=mode)
    # A claim without an id is classified under the placeholder id "claim".
    corpus_claim = Claim(id=identifier or "claim", text=text)
    corpus_verification = classify_claim(
        corpus_claim,
        best_match,
        has_contexts=bool(corpus_contexts),
    )
    diagnosis = _diagnose(claim, trace, best_match, corpus_verification)

    audit: dict[str, Any] = {
        "claim_id": identifier,
        "claim": text,
        "audit_label": diagnosis["label"],
        "confidence": diagnosis["confidence"],
        "reason": diagnosis["reason"],
        "suggested_fix": diagnosis["suggested_fix"],
    }
    # What verification found in the contexts the pipeline actually retrieved.
    audit["retrieved"] = {
        "verdict": claim.get("verdict"),
        "best_context_id": claim.get("best_context_id"),
        "best_score": claim.get("best_score"),
        "evidence": claim.get("evidence"),
        "matched_terms": list(claim.get("matched_terms") or []),
        "root_cause": (claim.get("root_cause") or {}).get("label"),
        "citation_status": claim.get("citation_status"),
    }
    # What the broader corpus offers for the same claim.
    audit["corpus"] = {
        "verdict": corpus_verification.verdict,
        "best_document_id": best_match.context_id,
        "best_score": best_match.score,
        "evidence": best_match.snippet,
        "matched_terms": list(best_match.matched_terms),
        "evidence_span": best_match.span_dict(),
        "supporting_spans": list(best_match.supporting_spans or []),
        "required_facts": list(corpus_verification.required_facts),
        "matched_facts": list(corpus_verification.matched_facts),
        "missing_facts": list(corpus_verification.missing_facts),
        "conflicting_facts": list(corpus_verification.conflicting_facts),
    }
    return audit
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def _diagnose(
    claim: dict[str, Any],
    trace: RAGTrace,
    corpus_match: object,
    corpus_verification: object,
) -> dict[str, Any]:
    """Pick the audit label for one claim.

    The checks run in a fixed order and the first one that applies wins:
    citation-only problem, no failure, contradiction/stale source, corpus
    supports the claim (retrieval miss, reranking failure or chunking issue
    depending on where the same source sits in the retrieved list), answer
    overreach, insufficient context, and finally corpus gap.

    Returns the dict produced by ``_result`` (label, confidence, reason,
    suggested_fix).
    """
    retrieved_verdict = str(claim.get("verdict") or "")
    root_cause = str((claim.get("root_cause") or {}).get("label") or NO_FAILURE)
    citation = str(claim.get("citation_status") or "")
    broader_verdict = str(getattr(corpus_verification, "verdict", ""))
    score = float(getattr(corpus_match, "score", 0.0) or 0.0)
    # Position of the corpus document's source among the retrieved contexts,
    # or None when that source was not retrieved at all.
    retrieved_rank = _same_source_retrieved_rank(str(getattr(corpus_match, "context_id", "") or ""), trace)

    if _is_citation_only_failure(claim):
        return _result(
            NO_FAILURE,
            0.92,
            "The claim is supported by retrieved evidence; the remaining issue is citation-level, not a retrieval or corpus failure.",
            "Fix the claim-level citation, but do not treat this as a retrieval miss.",
        )

    if not _is_failure(claim):
        return _result(
            NO_FAILURE,
            0.99,
            "The claim is already supported by the retrieved contexts.",
            "No fix needed for this claim.",
        )

    contradicted = "contradicted" in (retrieved_verdict, broader_verdict)
    if contradicted or root_cause in {"stale_context", "conflicting_contexts"}:
        return _result(
            STALE_SOURCE,
            0.86,
            "The claim appears to conflict with retrieved or corpus evidence.",
            "Resolve stale or conflicting sources before allowing the answer to use this fact.",
        )

    if broader_verdict in SUPPORTED_VERDICTS:
        if retrieved_rank is None:
            return _result(
                RETRIEVAL_MISS,
                max(0.82, min(0.98, score + 0.12)),
                "The broader corpus contains evidence for this claim, but the retrieved contexts did not include it.",
                "Improve retrieval recall, filters, query rewriting, or top_k so this source is retrieved.",
            )
        # Confidence shared by both "source was retrieved" diagnoses below.
        related_confidence = max(0.78, min(0.95, score + 0.08))
        if retrieved_rank >= RERANKING_CUTOFF:
            return _result(
                RERANKING_FAILURE,
                related_confidence,
                "A related source was retrieved, but it appeared too low in the retrieved context list for reliable generation.",
                "Add a reranker or raise high-evidence chunks from this source before generation.",
            )
        return _result(
            CHUNKING_ISSUE,
            related_confidence,
            "The retrieved source appears related, but the retrieved chunk omitted the supporting span found in the corpus.",
            "Adjust chunk boundaries, overlap, or parent-document retrieval so the answerable span is included.",
        )

    if root_cause == "answer_overreach" or retrieved_verdict == "partially_supported":
        return _result(
            ANSWER_OVERREACH,
            0.82,
            "The evidence supports part of the claim, but not every required fact.",
            "Remove unsupported details or retrieve evidence that explicitly supports each detail.",
        )

    if broader_verdict == "partially_supported":
        return _result(
            ANSWER_OVERREACH,
            0.78,
            "The corpus supports only part of the claim, so the answer likely added unsupported detail.",
            "Split the claim and require support for every required fact before answering.",
        )

    if "unverifiable" in (broader_verdict, retrieved_verdict):
        return _result(
            INSUFFICIENT_CONTEXT,
            0.72,
            "The closest corpus evidence is related but too weak or ambiguous to verify the claim.",
            "Retrieve more specific evidence or force the model to qualify/abstain.",
        )

    if citation in BAD_CITATIONS and score >= 0.35:
        return _result(
            INSUFFICIENT_CONTEXT,
            0.7,
            "The claim has a citation problem and the broader corpus evidence is still not strong enough.",
            "Regenerate claim-level citations and require cited sources to cover all required facts.",
        )

    # Nothing in the corpus helps: the weaker the best match, the more
    # confident the corpus-gap diagnosis (clamped to the range 0.7 to 0.95).
    return _result(
        CORPUS_GAP,
        max(0.7, min(0.95, 1.0 - score)),
        "Neither the retrieved contexts nor the broader corpus provide enough support for this claim.",
        "Add the missing source to the corpus or make the answer abstain when the corpus lacks this fact.",
    )
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
def _summary(
    claim_audits: list[dict[str, Any]],
    verification: dict[str, Any],
    corpus_contexts: list[TraceContext],
    *,
    mode: str,
) -> dict[str, Any]:
    """Aggregate per-claim audits into the top-level audit summary.

    The summary holds the scoring mode, claim and corpus-document counts, a
    failure flag, the primary audit label, two fields copied from the
    verification summary, and one count per audit label (``NO_FAILURE`` first,
    then the failure labels in sorted order).
    """
    tally = Counter(str(audit.get("audit_label") or NO_FAILURE) for audit in claim_audits)
    verification_summary = verification.get("summary") or {}
    failure_total = sum(tally[label] for label in AUDIT_FAILURE_LABELS)
    flagged = sum(1 for audit in claim_audits if audit.get("audit_label") != NO_FAILURE)

    overview: dict[str, Any] = {
        "mode": mode,
        "total_claims": len(claim_audits),
        "audited_claims": flagged,
        "corpus_documents": len(corpus_contexts),
        "has_audit_failures": failure_total > 0,
        "primary_audit_label": _primary_label(tally),
        "verification_failure_type": verification_summary.get("failure_type"),
        "verification_primary_root_cause": verification_summary.get("primary_root_cause"),
    }
    # Every label gets an entry, including labels that never occurred (count 0).
    for label in [NO_FAILURE] + sorted(AUDIT_FAILURE_LABELS):
        overview[label] = tally[label]
    return overview
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def _primary_label(counts: Counter) -> str:
    """Return the failure label that best characterises the whole audit.

    The label with the highest count wins; ties are broken by the fixed
    priority order below (earlier entries win). ``NO_FAILURE`` is returned
    when no failure label has a non-zero count.
    """
    observed = [label for label in AUDIT_FAILURE_LABELS if counts[label]]
    if not observed:
        return NO_FAILURE

    priority = [
        RETRIEVAL_MISS,
        CHUNKING_ISSUE,
        RERANKING_FAILURE,
        CORPUS_GAP,
        ANSWER_OVERREACH,
        STALE_SOURCE,
        INSUFFICIENT_CONTEXT,
    ]

    def sort_key(label: str) -> tuple:
        # Labels missing from the priority list rank after every listed one.
        position = priority.index(label) if label in priority else len(priority)
        return (counts[label], -position)

    return max(observed, key=sort_key)
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def _is_failure(claim: dict[str, Any]) -> bool:
    """Tell whether a verified claim has any problem worth diagnosing.

    A claim is a failure when its verdict is not a supported one, when its
    citation status is a bad-citation value, or when verification attached a
    root cause other than ``NO_FAILURE``.
    """
    if str(claim.get("verdict") or "") not in SUPPORTED_VERDICTS:
        return True
    if str(claim.get("citation_status") or "") in BAD_CITATIONS:
        return True
    root_cause = str((claim.get("root_cause") or {}).get("label") or NO_FAILURE)
    return root_cause != NO_FAILURE
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
def _is_citation_only_failure(claim: dict[str, Any]) -> bool:
    """Tell whether the only thing wrong with a claim is its citation.

    True when the verdict is a supported one, the citation status is a
    bad-citation value, and the root cause is either absent or one of the two
    citation-related root causes.
    """
    if str(claim.get("verdict") or "") not in SUPPORTED_VERDICTS:
        return False
    if str(claim.get("citation_status") or "") not in BAD_CITATIONS:
        return False
    root_cause = str((claim.get("root_cause") or {}).get("label") or NO_FAILURE)
    return root_cause in {"wrong_source_cited", "missing_cited_source", NO_FAILURE}
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
def _same_source_retrieved_rank(corpus_context_id: str, trace: RAGTrace) -> int | None:
    """Find where the corpus document's source appears in the retrieved list.

    Each retrieved context is compared by its id and by its ``source``,
    ``path``, ``file`` and ``document`` metadata values. Returns the 0-based
    index of the first retrieved context that matches, or ``None`` when the id
    normalises to an empty key or nothing matches.
    """
    target = _source_key(corpus_context_id)
    if not target:
        return None

    metadata_fields = ("source", "path", "file", "document")
    for rank, retrieved in enumerate(trace.contexts):
        candidates = [retrieved.id] + [retrieved.metadata.get(field) for field in metadata_fields]
        for candidate in candidates:
            if _sources_match(target, _source_key(candidate)):
                return rank
    return None
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
def _sources_match(left: str, right: str) -> bool:
|
|
395
|
+
if not left or not right:
|
|
396
|
+
return False
|
|
397
|
+
if left == right:
|
|
398
|
+
return True
|
|
399
|
+
return Path(left).name == Path(right).name
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
def _source_key(value: Any) -> str:
|
|
403
|
+
text = str(value or "").strip().replace("\\", "/").lower()
|
|
404
|
+
return text.strip("./")
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
def _result(label: str, confidence: float, reason: str, suggested_fix: str) -> dict[str, Any]:
|
|
408
|
+
return {
|
|
409
|
+
"label": label,
|
|
410
|
+
"confidence": round(confidence, 3),
|
|
411
|
+
"reason": reason,
|
|
412
|
+
"suggested_fix": suggested_fix,
|
|
413
|
+
}
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
def _corpus_files(root: Path) -> list[Path]:
|
|
417
|
+
files: list[Path] = []
|
|
418
|
+
for path in root.rglob("*"):
|
|
419
|
+
if not path.is_file():
|
|
420
|
+
continue
|
|
421
|
+
if any(part in SKIP_DIRECTORIES for part in path.parts):
|
|
422
|
+
continue
|
|
423
|
+
if path.suffix.lower() not in CORPUS_EXTENSIONS:
|
|
424
|
+
continue
|
|
425
|
+
if path.stat().st_size > MAX_FILE_BYTES:
|
|
426
|
+
continue
|
|
427
|
+
files.append(path)
|
|
428
|
+
return sorted(files, key=lambda item: str(item).lower())
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
def _read_text(path: Path) -> str:
|
|
432
|
+
try:
|
|
433
|
+
return path.read_text(encoding="utf-8")
|
|
434
|
+
except UnicodeDecodeError:
|
|
435
|
+
try:
|
|
436
|
+
return path.read_text(encoding="utf-8", errors="ignore")
|
|
437
|
+
except OSError:
|
|
438
|
+
return ""
|
|
439
|
+
except OSError:
|
|
440
|
+
return ""
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
def _context_id(path: Path, root: Path) -> str:
|
|
444
|
+
if root.is_file():
|
|
445
|
+
return path.name
|
|
446
|
+
try:
|
|
447
|
+
return path.relative_to(root).as_posix()
|
|
448
|
+
except ValueError:
|
|
449
|
+
return path.name
|
|
@@ -0,0 +1,372 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from html import escape
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from contexttrace.verify.schema import RAGTrace
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class AuditReportGenerator:
    """Renders a retrieval-audit result as a standalone HTML document."""

    def generate(self, result: dict[str, Any], trace: RAGTrace, *, path: str) -> str:
        """Render the report and write it to ``path`` as UTF-8.

        Missing parent directories are created first. Returns the output path
        as a string.
        """
        destination = Path(path)
        destination.parent.mkdir(parents=True, exist_ok=True)
        document = self.render(result, trace)
        destination.write_text(document, encoding="utf-8")
        return str(destination)

    def render(self, result: dict[str, Any], trace: RAGTrace) -> str:
        """Fill ``HTML_TEMPLATE`` from ``result`` and ``trace`` and return the HTML.

        Free text taken from ``result`` is HTML-escaped here; the section
        fragments come from the module helpers.
        """
        summary = result.get("summary") or {}
        claims = list(result.get("claims") or [])
        fields = {
            "query": escape(_string(result.get("query"))),
            "answer": escape(_string(result.get("answer"))),
            "summary_cards": _summary_cards(summary),
            "claim_rows": _claim_rows(claims),
            "retrieval_misses": _claim_cards(
                claims,
                {"retrieval_miss"},
                "No retrieval misses detected.",
            ),
            "chunking_issues": _claim_cards(
                claims,
                {"chunking_issue", "reranking_failure"},
                "No chunking or reranking failures detected.",
            ),
            "corpus_gaps": _claim_cards(
                claims,
                {"corpus_gap"},
                "No corpus coverage gaps detected.",
            ),
            "answer_overreach": _claim_cards(
                claims,
                {"answer_overreach", "insufficient_context", "stale_source"},
                "No answer overreach, stale source, or insufficient-context failures detected.",
            ),
            "retrieved_contexts": _retrieved_contexts(trace),
            "corpus_summary": escape(json.dumps(result.get("corpus") or {}, indent=2)),
            "why_failed": _why_failed(claims),
            "raw_json": escape(json.dumps(_raw_summary(result), indent=2)),
        }
        return HTML_TEMPLATE.format(**fields)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _summary_cards(summary: dict[str, Any]) -> str:
    """Render the audit summary as a newline-joined run of ``.card`` blocks.

    The two free-form entries (primary label, verification failure type)
    render as empty text when absent; every counter defaults to ``0``.
    """
    counters = (
        ("Total Claims", "total_claims"),
        ("Audited Failures", "audited_claims"),
        ("Corpus Documents", "corpus_documents"),
        ("Retrieval Misses", "retrieval_miss"),
        ("Chunking Issues", "chunking_issue"),
        ("Reranking Failures", "reranking_failure"),
        ("Corpus Gaps", "corpus_gap"),
        ("Answer Overreach", "answer_overreach"),
        ("Stale Sources", "stale_source"),
        ("Insufficient Context", "insufficient_context"),
    )
    entries = [("Primary Audit Label", summary.get("primary_audit_label"))]
    entries.extend((title, summary.get(key, 0)) for title, key in counters)
    entries.append(("Verification Failure", summary.get("verification_failure_type")))

    # Leading and trailing empty items keep the newline that surrounds each card.
    card_markup = "\n".join(
        (
            "",
            '<div class="card">',
            '<div class="label">{label}</div>',
            '<div class="value">{value}</div>',
            "</div>",
            "",
        )
    )
    return "\n".join(
        card_markup.format(label=escape(title), value=escape(_string(amount)))
        for title, amount in entries
    )
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _claim_rows(claims: list[dict[str, Any]]) -> str:
|
|
72
|
+
if not claims:
|
|
73
|
+
return "<tr><td colspan=\"7\" class=\"muted\">No factual claims were extracted.</td></tr>"
|
|
74
|
+
rows = []
|
|
75
|
+
for claim in claims:
|
|
76
|
+
retrieved = claim.get("retrieved") or {}
|
|
77
|
+
corpus = claim.get("corpus") or {}
|
|
78
|
+
label = _string(claim.get("audit_label"))
|
|
79
|
+
rows.append(
|
|
80
|
+
"""
|
|
81
|
+
<tr>
|
|
82
|
+
<td><span class="badge audit-{label_class}">{label}</span></td>
|
|
83
|
+
<td>{claim}</td>
|
|
84
|
+
<td>{retrieved_verdict}</td>
|
|
85
|
+
<td>{retrieved_context}</td>
|
|
86
|
+
<td>{corpus_verdict}</td>
|
|
87
|
+
<td>{corpus_document}</td>
|
|
88
|
+
<td>{fix}</td>
|
|
89
|
+
</tr>
|
|
90
|
+
""".format(
|
|
91
|
+
label_class=escape(_css_token(label)),
|
|
92
|
+
label=escape(label),
|
|
93
|
+
claim=escape(_string(claim.get("claim"))),
|
|
94
|
+
retrieved_verdict=escape(_string(retrieved.get("verdict"))),
|
|
95
|
+
retrieved_context=escape(_string(retrieved.get("best_context_id") or "none")),
|
|
96
|
+
corpus_verdict=escape(_string(corpus.get("verdict"))),
|
|
97
|
+
corpus_document=escape(_string(corpus.get("best_document_id") or "none")),
|
|
98
|
+
fix=escape(_string(claim.get("suggested_fix"))),
|
|
99
|
+
)
|
|
100
|
+
)
|
|
101
|
+
return "\n".join(rows)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _claim_cards(claims: list[dict[str, Any]], labels: set[str], empty: str) -> str:
|
|
105
|
+
selected = [claim for claim in claims if claim.get("audit_label") in labels]
|
|
106
|
+
if not selected:
|
|
107
|
+
return "<p class=\"muted\">%s</p>" % escape(empty)
|
|
108
|
+
return "\n".join(_claim_card(claim) for claim in selected)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _claim_card(claim: dict[str, Any]) -> str:
    """Render the detailed ``<article>`` card for a single audited claim.

    Every value is stringified and HTML-escaped. Missing evidence and missing
    context/document ids are shown as ``none``.
    """
    retrieved = claim.get("retrieved") or {}
    corpus = claim.get("corpus") or {}

    def text(value: Any) -> str:
        return escape(_string(value))

    # Leading and trailing empty items keep the newline that surrounds the card.
    card_markup = "\n".join(
        (
            "",
            '<article class="item">',
            '<div class="item-meta">{claim_id} | {label} | confidence {confidence}</div>',
            "<h3>{claim}</h3>",
            "<p><strong>Diagnosis:</strong> {reason}</p>",
            "<p><strong>Retrieved evidence:</strong> {retrieved_evidence}</p>",
            '<p class="muted">Retrieved context: {retrieved_context} | verdict {retrieved_verdict} | score {retrieved_score}</p>',
            "<p><strong>Corpus evidence:</strong> {corpus_evidence}</p>",
            '<p class="muted">Corpus document: {corpus_document} | verdict {corpus_verdict} | score {corpus_score}</p>',
            "<p><strong>Suggested fix:</strong> {fix}</p>",
            "</article>",
            "",
        )
    )
    values = {
        "claim_id": text(claim.get("claim_id")),
        "label": text(claim.get("audit_label")),
        "confidence": text(claim.get("confidence")),
        "claim": text(claim.get("claim")),
        "reason": text(claim.get("reason")),
        "retrieved_evidence": text(retrieved.get("evidence") or "none"),
        "retrieved_context": text(retrieved.get("best_context_id") or "none"),
        "retrieved_verdict": text(retrieved.get("verdict")),
        "retrieved_score": text(retrieved.get("best_score")),
        "corpus_evidence": text(corpus.get("evidence") or "none"),
        "corpus_document": text(corpus.get("best_document_id") or "none"),
        "corpus_verdict": text(corpus.get("verdict")),
        "corpus_score": text(corpus.get("best_score")),
        "fix": text(claim.get("suggested_fix")),
    }
    return card_markup.format(**values)
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _retrieved_contexts(trace: RAGTrace) -> str:
|
|
144
|
+
if not trace.contexts:
|
|
145
|
+
return "<p class=\"muted\">No retrieved contexts were supplied.</p>"
|
|
146
|
+
cards = []
|
|
147
|
+
for index, context in enumerate(trace.contexts, start=1):
|
|
148
|
+
cards.append(
|
|
149
|
+
"""
|
|
150
|
+
<article class="item">
|
|
151
|
+
<div class="item-meta">rank {rank} | {context_id} | {metadata}</div>
|
|
152
|
+
<p>{text}</p>
|
|
153
|
+
</article>
|
|
154
|
+
""".format(
|
|
155
|
+
rank=index,
|
|
156
|
+
context_id=escape(context.id),
|
|
157
|
+
metadata=escape(json.dumps(context.metadata, sort_keys=True) if context.metadata else "no metadata"),
|
|
158
|
+
text=escape(context.text),
|
|
159
|
+
)
|
|
160
|
+
)
|
|
161
|
+
return "\n".join(cards)
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def _why_failed(claims: list[dict[str, Any]]) -> str:
|
|
165
|
+
explanations = []
|
|
166
|
+
for claim in claims:
|
|
167
|
+
label = _string(claim.get("audit_label"))
|
|
168
|
+
if label == "no_failure_detected":
|
|
169
|
+
continue
|
|
170
|
+
explanations.append(
|
|
171
|
+
"%s: %s Suggested fix: %s"
|
|
172
|
+
% (
|
|
173
|
+
label,
|
|
174
|
+
_string(claim.get("reason")),
|
|
175
|
+
_string(claim.get("suggested_fix")),
|
|
176
|
+
)
|
|
177
|
+
)
|
|
178
|
+
if not explanations:
|
|
179
|
+
explanations.append("No corpus-level evidence-chain failure was detected.")
|
|
180
|
+
return "<ul>%s</ul>" % "\n".join("<li>%s</li>" % escape(item) for item in explanations)
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _raw_summary(result: dict[str, Any]) -> dict[str, Any]:
|
|
184
|
+
return {
|
|
185
|
+
"summary": result.get("summary"),
|
|
186
|
+
"claims": result.get("claims"),
|
|
187
|
+
"verification": result.get("verification"),
|
|
188
|
+
"corpus": result.get("corpus"),
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def _css_token(value: Any) -> str:
    """Turn ``value`` into a token usable as a CSS class-name suffix.

    The text is lower-cased, underscores and spaces become hyphens, and every
    character that is neither alphanumeric nor a hyphen is dropped. If nothing
    remains, ``unknown`` is returned.
    """
    lowered = _string(value).lower()
    hyphenated = lowered.replace("_", "-").replace(" ", "-")
    kept = [char for char in hyphenated if char == "-" or char.isalnum()]
    return "".join(kept) if kept else "unknown"
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def _string(value: Any) -> str:
|
|
198
|
+
if value is None:
|
|
199
|
+
return ""
|
|
200
|
+
return str(value)
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
HTML_TEMPLATE = """<!doctype html>
|
|
204
|
+
<html lang="en">
|
|
205
|
+
<head>
|
|
206
|
+
<meta charset="utf-8">
|
|
207
|
+
<meta name="viewport" content="width=device-width, initial-scale=1">
|
|
208
|
+
<title>ContextTrace Retrieval Audit Report</title>
|
|
209
|
+
<style>
|
|
210
|
+
:root {{
|
|
211
|
+
color-scheme: light;
|
|
212
|
+
--bg: #f7f8fa;
|
|
213
|
+
--panel: #ffffff;
|
|
214
|
+
--subtle: #fbfcfe;
|
|
215
|
+
--text: #202832;
|
|
216
|
+
--muted: #657286;
|
|
217
|
+
--line: #d9e0ea;
|
|
218
|
+
--ok: #176f44;
|
|
219
|
+
--warn: #946200;
|
|
220
|
+
--bad: #b42318;
|
|
221
|
+
--accent: #2458d3;
|
|
222
|
+
}}
|
|
223
|
+
* {{ box-sizing: border-box; }}
|
|
224
|
+
body {{
|
|
225
|
+
margin: 0;
|
|
226
|
+
background: var(--bg);
|
|
227
|
+
color: var(--text);
|
|
228
|
+
font-family: Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
|
|
229
|
+
line-height: 1.5;
|
|
230
|
+
}}
|
|
231
|
+
main {{ max-width: 1160px; margin: 0 auto; padding: 32px 20px 56px; }}
|
|
232
|
+
header {{ border-bottom: 1px solid var(--line); margin-bottom: 22px; padding-bottom: 18px; }}
|
|
233
|
+
h1, h2, h3 {{ margin: 0; }}
|
|
234
|
+
h1 {{ font-size: 30px; }}
|
|
235
|
+
h2 {{ font-size: 18px; margin-bottom: 12px; }}
|
|
236
|
+
h3 {{ font-size: 15px; margin-bottom: 8px; }}
|
|
237
|
+
section {{
|
|
238
|
+
background: var(--panel);
|
|
239
|
+
border: 1px solid var(--line);
|
|
240
|
+
border-radius: 8px;
|
|
241
|
+
margin: 16px 0;
|
|
242
|
+
padding: 18px;
|
|
243
|
+
}}
|
|
244
|
+
.summary {{
|
|
245
|
+
display: grid;
|
|
246
|
+
gap: 12px;
|
|
247
|
+
grid-template-columns: repeat(auto-fit, minmax(155px, 1fr));
|
|
248
|
+
}}
|
|
249
|
+
.card, .item {{
|
|
250
|
+
border: 1px solid var(--line);
|
|
251
|
+
border-radius: 8px;
|
|
252
|
+
background: var(--subtle);
|
|
253
|
+
padding: 12px;
|
|
254
|
+
}}
|
|
255
|
+
.item + .item {{ margin-top: 10px; }}
|
|
256
|
+
.label, .item-meta {{
|
|
257
|
+
color: var(--muted);
|
|
258
|
+
font-size: 12px;
|
|
259
|
+
font-weight: 700;
|
|
260
|
+
text-transform: uppercase;
|
|
261
|
+
}}
|
|
262
|
+
.value {{ margin-top: 4px; font-size: 18px; overflow-wrap: anywhere; }}
|
|
263
|
+
.muted {{ color: var(--muted); }}
|
|
264
|
+
.answer, .item p {{ white-space: pre-wrap; }}
|
|
265
|
+
table {{ width: 100%; border-collapse: collapse; font-size: 14px; }}
|
|
266
|
+
th, td {{ border-bottom: 1px solid var(--line); padding: 10px; text-align: left; vertical-align: top; }}
|
|
267
|
+
th {{ color: var(--muted); font-size: 12px; text-transform: uppercase; }}
|
|
268
|
+
.badge {{
|
|
269
|
+
display: inline-block;
|
|
270
|
+
border-radius: 999px;
|
|
271
|
+
border: 1px solid var(--line);
|
|
272
|
+
background: #eef2f7;
|
|
273
|
+
padding: 3px 8px;
|
|
274
|
+
font-size: 12px;
|
|
275
|
+
font-weight: 700;
|
|
276
|
+
white-space: nowrap;
|
|
277
|
+
}}
|
|
278
|
+
.audit-no-failure-detected {{ color: var(--ok); background: #e9f7ef; }}
|
|
279
|
+
.audit-retrieval-miss, .audit-corpus-gap, .audit-stale-source {{ color: var(--bad); background: #fdeceb; }}
|
|
280
|
+
.audit-chunking-issue, .audit-reranking-failure,
|
|
281
|
+
.audit-answer-overreach, .audit-insufficient-context {{ color: var(--warn); background: #fff7df; }}
|
|
282
|
+
pre {{
|
|
283
|
+
margin: 0;
|
|
284
|
+
overflow: auto;
|
|
285
|
+
background: #101828;
|
|
286
|
+
color: #f8fafc;
|
|
287
|
+
border-radius: 8px;
|
|
288
|
+
padding: 14px;
|
|
289
|
+
font-size: 13px;
|
|
290
|
+
}}
|
|
291
|
+
</style>
|
|
292
|
+
</head>
|
|
293
|
+
<body>
|
|
294
|
+
<main>
|
|
295
|
+
<header>
|
|
296
|
+
<h1>ContextTrace Retrieval Audit Report</h1>
|
|
297
|
+
<p class="muted">Local corpus-level diagnosis for claim evidence failures.</p>
|
|
298
|
+
</header>
|
|
299
|
+
|
|
300
|
+
<section>
|
|
301
|
+
<h2>Audit Summary</h2>
|
|
302
|
+
<div class="summary">{summary_cards}</div>
|
|
303
|
+
</section>
|
|
304
|
+
|
|
305
|
+
<section>
|
|
306
|
+
<h2>Query</h2>
|
|
307
|
+
<p>{query}</p>
|
|
308
|
+
<h2>Answer</h2>
|
|
309
|
+
<p class="answer">{answer}</p>
|
|
310
|
+
</section>
|
|
311
|
+
|
|
312
|
+
<section>
|
|
313
|
+
<h2>Claim Failure Diagnosis</h2>
|
|
314
|
+
<table>
|
|
315
|
+
<thead>
|
|
316
|
+
<tr>
|
|
317
|
+
<th>Audit Label</th>
|
|
318
|
+
<th>Claim</th>
|
|
319
|
+
<th>Retrieved Verdict</th>
|
|
320
|
+
<th>Retrieved Context</th>
|
|
321
|
+
<th>Corpus Verdict</th>
|
|
322
|
+
<th>Corpus Document</th>
|
|
323
|
+
<th>Suggested Fix</th>
|
|
324
|
+
</tr>
|
|
325
|
+
</thead>
|
|
326
|
+
<tbody>{claim_rows}</tbody>
|
|
327
|
+
</table>
|
|
328
|
+
</section>
|
|
329
|
+
|
|
330
|
+
<section>
|
|
331
|
+
<h2>Retrieval Misses</h2>
|
|
332
|
+
{retrieval_misses}
|
|
333
|
+
</section>
|
|
334
|
+
|
|
335
|
+
<section>
|
|
336
|
+
<h2>Chunking And Reranking Issues</h2>
|
|
337
|
+
{chunking_issues}
|
|
338
|
+
</section>
|
|
339
|
+
|
|
340
|
+
<section>
|
|
341
|
+
<h2>Corpus Gaps</h2>
|
|
342
|
+
{corpus_gaps}
|
|
343
|
+
</section>
|
|
344
|
+
|
|
345
|
+
<section>
|
|
346
|
+
<h2>Answer Overreach And Ambiguous Evidence</h2>
|
|
347
|
+
{answer_overreach}
|
|
348
|
+
</section>
|
|
349
|
+
|
|
350
|
+
<section>
|
|
351
|
+
<h2>Retrieved Contexts</h2>
|
|
352
|
+
{retrieved_contexts}
|
|
353
|
+
</section>
|
|
354
|
+
|
|
355
|
+
<section>
|
|
356
|
+
<h2>Corpus Summary</h2>
|
|
357
|
+
<pre>{corpus_summary}</pre>
|
|
358
|
+
</section>
|
|
359
|
+
|
|
360
|
+
<section>
|
|
361
|
+
<h2>Why This Failed</h2>
|
|
362
|
+
{why_failed}
|
|
363
|
+
</section>
|
|
364
|
+
|
|
365
|
+
<section>
|
|
366
|
+
<h2>Raw JSON Summary</h2>
|
|
367
|
+
<pre>{raw_json}</pre>
|
|
368
|
+
</section>
|
|
369
|
+
</main>
|
|
370
|
+
</body>
|
|
371
|
+
</html>
|
|
372
|
+
"""
|
|
@@ -30,6 +30,8 @@ contexttrace/storage/__init__.py
|
|
|
30
30
|
contexttrace/storage/sqlite_store.py
|
|
31
31
|
contexttrace/verify/__init__.py
|
|
32
32
|
contexttrace/verify/abstention.py
|
|
33
|
+
contexttrace/verify/audit.py
|
|
34
|
+
contexttrace/verify/audit_report.py
|
|
33
35
|
contexttrace/verify/benchmark.py
|
|
34
36
|
contexttrace/verify/citations.py
|
|
35
37
|
contexttrace/verify/claims.py
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "contexttrace"
|
|
7
|
-
version = "0.4.0"
|
|
7
|
+
version = "0.5.0"
|
|
8
8
|
description = "Local-first SDK and CLI for RAG and agent reliability tracing, citation checks, and failure diagnosis."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "0.4.0"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|