contexttrace 0.3.0__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {contexttrace-0.3.0 → contexttrace-0.5.0}/PKG-INFO +12 -2
- {contexttrace-0.3.0 → contexttrace-0.5.0}/README.md +11 -1
- contexttrace-0.5.0/contexttrace/_version.py +1 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/cli.py +124 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/verify/__init__.py +9 -0
- contexttrace-0.5.0/contexttrace/verify/audit.py +449 -0
- contexttrace-0.5.0/contexttrace/verify/audit_report.py +372 -0
- contexttrace-0.5.0/contexttrace/verify/compare.py +445 -0
- contexttrace-0.5.0/contexttrace/verify/compare_report.py +386 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace.egg-info/SOURCES.txt +4 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/pyproject.toml +1 -1
- contexttrace-0.3.0/contexttrace/_version.py +0 -1
- {contexttrace-0.3.0 → contexttrace-0.5.0}/MANIFEST.in +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/__init__.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/client.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/config.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/demo.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/demo_data.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/endpoint_eval.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/errors.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/evaluator.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/integrations/__init__.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/integrations/fastapi.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/integrations/langchain.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/integrations/langgraph.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/integrations/llamaindex.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/integrations/opentelemetry.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/local.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/py.typed +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/regression.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/reliability.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/report.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/storage/__init__.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/storage/sqlite_store.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/thresholds.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/transport.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/verify/abstention.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/verify/benchmark.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/verify/citations.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/verify/claims.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/verify/demos.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/verify/evidence.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/verify/external_benchmark_cases.json +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/verify/facts.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/verify/real_benchmark_cases.json +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/verify/report.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/verify/root_cause.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/verify/runner.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/verify/schema.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/verify/spans.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/verify/verdicts.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/contexttrace/viewer.py +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/setup.cfg +0 -0
- {contexttrace-0.3.0 → contexttrace-0.5.0}/setup.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: contexttrace
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Summary: Local-first SDK and CLI for RAG and agent reliability tracing, citation checks, and failure diagnosis.
|
|
5
5
|
Author: ContextTrace contributors
|
|
6
6
|
License: MIT
|
|
@@ -147,6 +147,12 @@ contexttrace verify trace.json --fail-on unsupported --fail-on citation_mismatch
|
|
|
147
147
|
contexttrace verify-benchmark --mode semantic
|
|
148
148
|
contexttrace verify-benchmark --mode semantic --report
|
|
149
149
|
contexttrace verify-benchmark --case-set external --mode semantic --report
|
|
150
|
+
contexttrace compare baseline.json current.json
|
|
151
|
+
contexttrace compare baseline.json current.json --report
|
|
152
|
+
contexttrace compare baseline.json current.json --fail-on new_failure
|
|
153
|
+
contexttrace audit trace.json --corpus docs/
|
|
154
|
+
contexttrace audit trace.json --corpus docs/ --report
|
|
155
|
+
contexttrace audit trace.json --corpus docs/ --fail-on retrieval_miss
|
|
150
156
|
```
|
|
151
157
|
|
|
152
158
|
Input requires `query`, `answer`, and `contexts` with `id` and `text`. Optional `citations` are checked to catch cited sources that do not actually support the matched claim.
|
|
@@ -159,7 +165,11 @@ Verification output includes evidence span offsets, stable span hashes, multiple
|
|
|
159
165
|
|
|
160
166
|
ContextTrace verifies whether each generated claim is actually supported by retrieved evidence. Instead of only showing a trace or a score, it tells you where the evidence chain broke: unsupported claim, citation mismatch, retrieval miss, answer overreach, conflicting context, or should-have-abstained.
|
|
161
167
|
|
|
162
|
-
|
|
168
|
+
Use `contexttrace compare baseline.json current.json` to diff two portable traces or saved `verify --json` outputs. It reports support-rate deltas, new unsupported claims, citation regressions, should-abstain flips, and new root causes, with `--fail-on` gates for CI.
|
|
169
|
+
|
|
170
|
+
Use `contexttrace audit trace.json --corpus docs/` to diagnose whether an unsupported claim failed because retrieval missed evidence, chunking omitted the supporting span, the corpus lacks coverage, or generation overclaimed.
|
|
171
|
+
|
|
172
|
+
The v0.5.0 verifier uses local lexical heuristics by default. Claim extraction is rule-based, contradiction detection is conservative, and semantic or LLM-judge support can be added later.
|
|
163
173
|
|
|
164
174
|
## What It Catches
|
|
165
175
|
|
|
@@ -90,6 +90,12 @@ contexttrace verify trace.json --fail-on unsupported --fail-on citation_mismatch
|
|
|
90
90
|
contexttrace verify-benchmark --mode semantic
|
|
91
91
|
contexttrace verify-benchmark --mode semantic --report
|
|
92
92
|
contexttrace verify-benchmark --case-set external --mode semantic --report
|
|
93
|
+
contexttrace compare baseline.json current.json
|
|
94
|
+
contexttrace compare baseline.json current.json --report
|
|
95
|
+
contexttrace compare baseline.json current.json --fail-on new_failure
|
|
96
|
+
contexttrace audit trace.json --corpus docs/
|
|
97
|
+
contexttrace audit trace.json --corpus docs/ --report
|
|
98
|
+
contexttrace audit trace.json --corpus docs/ --fail-on retrieval_miss
|
|
93
99
|
```
|
|
94
100
|
|
|
95
101
|
Input requires `query`, `answer`, and `contexts` with `id` and `text`. Optional `citations` are checked to catch cited sources that do not actually support the matched claim.
|
|
@@ -102,7 +108,11 @@ Verification output includes evidence span offsets, stable span hashes, multiple
|
|
|
102
108
|
|
|
103
109
|
ContextTrace verifies whether each generated claim is actually supported by retrieved evidence. Instead of only showing a trace or a score, it tells you where the evidence chain broke: unsupported claim, citation mismatch, retrieval miss, answer overreach, conflicting context, or should-have-abstained.
|
|
104
110
|
|
|
105
|
-
|
|
111
|
+
Use `contexttrace compare baseline.json current.json` to diff two portable traces or saved `verify --json` outputs. It reports support-rate deltas, new unsupported claims, citation regressions, should-abstain flips, and new root causes, with `--fail-on` gates for CI.
|
|
112
|
+
|
|
113
|
+
Use `contexttrace audit trace.json --corpus docs/` to diagnose whether an unsupported claim failed because retrieval missed evidence, chunking omitted the supporting span, the corpus lacks coverage, or generation overclaimed.
|
|
114
|
+
|
|
115
|
+
The v0.5.0 verifier uses local lexical heuristics by default. Claim extraction is rule-based, contradiction detection is conservative, and semantic or LLM-judge support can be added later.
|
|
106
116
|
|
|
107
117
|
## What It Catches
|
|
108
118
|
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.5.0"
|
|
@@ -24,12 +24,18 @@ from contexttrace.storage import SQLiteTraceStore
|
|
|
24
24
|
from contexttrace.thresholds import parse_thresholds, threshold_failures
|
|
25
25
|
from contexttrace.verify import (
|
|
26
26
|
VerificationInputError,
|
|
27
|
+
audit_failures,
|
|
28
|
+
audit_trace,
|
|
29
|
+
compare_failures,
|
|
30
|
+
compare_trace_files,
|
|
27
31
|
list_verify_demos,
|
|
28
32
|
load_trace_file,
|
|
29
33
|
load_verify_demo,
|
|
30
34
|
verify_trace,
|
|
31
35
|
)
|
|
32
36
|
from contexttrace.verify.benchmark import run_verify_benchmark, write_verify_benchmark_report
|
|
37
|
+
from contexttrace.verify.audit_report import AuditReportGenerator
|
|
38
|
+
from contexttrace.verify.compare_report import CompareReportGenerator
|
|
33
39
|
from contexttrace.verify.report import VerifyReportGenerator
|
|
34
40
|
from contexttrace.viewer import serve_viewer
|
|
35
41
|
|
|
@@ -340,6 +346,124 @@ def verify_benchmark_command(mode: str, case_set: str, json_output: bool, report
|
|
|
340
346
|
return 0
|
|
341
347
|
|
|
342
348
|
|
|
349
|
+
@cli.command("compare")
@click.argument("baseline_json")
@click.argument("current_json")
@click.option("--json", "json_output", is_flag=True, help="Print the full comparison result as JSON.")
@click.option("--report", is_flag=True, help="Generate a local HTML regression report.")
@click.option("--out", default=None, help="HTML report path. Implies --report when provided.")
@click.option("--mode", default="lexical", show_default=True, type=click.Choice(["lexical", "semantic"]), help="Evidence scoring mode for raw trace inputs.")
@click.option("--fail-on", multiple=True, help="Fail on new_failure, new_unsupported, new_citation_mismatch, should_abstain_flip, support_rate_drop, new_root_cause, or any_regression.")
def compare_command(
    baseline_json: str,
    current_json: str,
    json_output: bool,
    report: bool,
    out: Optional[str],
    mode: str,
    fail_on: tuple[str, ...],
) -> int:
    """Compare two portable RAG traces or verification JSON outputs."""
    # NOTE: click shows the docstring above as the command's --help text, so it
    # is runtime output and must stay short and user-facing.

    # Input problems are surfaced as a click error instead of a traceback.
    try:
        result = compare_trace_files(baseline_json, current_json, mode=mode)
    except VerificationInputError as exc:
        raise click.ClickException(str(exc)) from exc

    # An explicit --out path implies --report; otherwise the report lands in
    # .contexttrace/reports/<baseline>_vs_<current>_compare.html.
    report_path = None
    if report or out:
        stems = (Path(baseline_json).stem, Path(current_json).stem)
        fallback = str(Path(".contexttrace") / "reports" / ("%s_vs_%s_compare.html" % stems))
        report_path = CompareReportGenerator().generate(result, path=out or fallback)

    failures = compare_failures(result, fail_on)

    if json_output:
        # Keep stdout pure JSON: the report location goes to stderr.
        if report_path:
            click.echo("Report: %s" % report_path, err=True)
        click.echo(json.dumps(result, indent=2))
    else:
        summary = result["summary"]
        click.echo("Regression: %s" % str(summary["regression"]).lower())
        before = float(summary.get("support_rate_before") or 0.0)
        after = float(summary.get("support_rate_after") or 0.0)
        delta = float(summary.get("support_rate_delta") or 0.0)
        click.echo("Support rate: %.3f -> %.3f (%+.3f)" % (before, after, delta))
        click.echo("Unsupported claim rate delta: %+.3f" % float(summary.get("unsupported_claim_rate_delta") or 0.0))
        click.echo("Citation mismatch delta: %+d" % int(summary.get("citation_mismatch_delta") or 0))
        click.echo("New failures: %s" % summary["new_failures"])
        click.echo("Resolved failures: %s" % summary["resolved_failures"])
        click.echo("Added claims: %s" % summary["added_claims"])
        click.echo("Removed claims: %s" % summary["removed_claims"])
        click.echo("Changed claims: %s" % summary["changed_claims"])
        click.echo("New root causes: %s" % (", ".join(summary.get("new_root_causes") or []) or "none"))
        if report_path:
            click.echo("Report: %s" % report_path)

    # Gate messages always go to stderr; any message makes the command return 1.
    for message in failures:
        click.echo("Comparison failed: %s" % message, err=True)
    return 1 if failures else 0
|
|
408
|
+
|
|
409
|
+
|
|
410
|
+
@cli.command("audit")
@click.argument("trace_json")
@click.option("--corpus", "corpus_path", required=True, help="Local corpus directory or file to search for supporting evidence.")
@click.option("--json", "json_output", is_flag=True, help="Print the full audit result as JSON.")
@click.option("--report", is_flag=True, help="Generate a local HTML retrieval audit report.")
@click.option("--out", default=None, help="HTML report path. Implies --report when provided.")
@click.option("--mode", default="lexical", show_default=True, type=click.Choice(["lexical", "semantic"]), help="Evidence scoring mode.")
@click.option("--fail-on", multiple=True, help="Fail on retrieval_miss, reranking_failure, chunking_issue, corpus_gap, answer_overreach, stale_source, insufficient_context, or any_failure.")
def audit_command(
    trace_json: str,
    corpus_path: str,
    json_output: bool,
    report: bool,
    out: Optional[str],
    mode: str,
    fail_on: tuple[str, ...],
) -> int:
    """Audit a verified trace against a broader local corpus."""
    # NOTE: click shows the docstring above as the command's --help text, so it
    # is runtime output and must stay short and user-facing.

    # Both loading the trace and running the audit can reject bad input; either
    # way the user gets a click error instead of a traceback.
    try:
        trace = load_trace_file(trace_json)
        result = audit_trace(trace, corpus_path=corpus_path, mode=mode)
    except VerificationInputError as exc:
        raise click.ClickException(str(exc)) from exc

    # An explicit --out path implies --report; otherwise the report lands in
    # .contexttrace/reports/<trace>_audit.html.
    report_path = None
    if report or out:
        fallback = str(Path(".contexttrace") / "reports" / ("%s_audit.html" % Path(trace_json).stem))
        report_path = AuditReportGenerator().generate(result, trace, path=out or fallback)

    failures = audit_failures(result, fail_on)

    if json_output:
        # Keep stdout pure JSON: the report location goes to stderr.
        if report_path:
            click.echo("Report: %s" % report_path, err=True)
        click.echo(json.dumps(result, indent=2))
    else:
        summary = result["summary"]
        click.echo("Primary audit label: %s" % summary["primary_audit_label"])
        click.echo("Claims audited: %s" % summary["total_claims"])
        click.echo("Corpus documents: %s" % summary["corpus_documents"])
        click.echo("Retrieval misses: %s" % summary["retrieval_miss"])
        click.echo("Chunking issues: %s" % summary["chunking_issue"])
        click.echo("Reranking failures: %s" % summary["reranking_failure"])
        click.echo("Corpus gaps: %s" % summary["corpus_gap"])
        click.echo("Answer overreach: %s" % summary["answer_overreach"])
        click.echo("Insufficient context: %s" % summary["insufficient_context"])
        if report_path:
            click.echo("Report: %s" % report_path)

    # Gate messages always go to stderr; any message makes the command return 1.
    for message in failures:
        click.echo("Audit failed: %s" % message, err=True)
    return 1 if failures else 0
|
|
465
|
+
|
|
466
|
+
|
|
343
467
|
def _write_verify_report(
|
|
344
468
|
result: dict,
|
|
345
469
|
trace: object,
|
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
from contexttrace.verify.runner import verify_trace, verify_trace_file
|
|
2
|
+
from contexttrace.verify.audit import audit_failures, audit_trace, audit_trace_file, load_corpus
|
|
3
|
+
from contexttrace.verify.compare import compare_failures, compare_trace_files, compare_verifications
|
|
2
4
|
from contexttrace.verify.schema import (
|
|
3
5
|
RAGTrace,
|
|
4
6
|
TraceCitation,
|
|
@@ -13,7 +15,14 @@ __all__ = [
|
|
|
13
15
|
"TraceCitation",
|
|
14
16
|
"TraceContext",
|
|
15
17
|
"VerificationInputError",
|
|
18
|
+
"audit_failures",
|
|
19
|
+
"audit_trace",
|
|
20
|
+
"audit_trace_file",
|
|
21
|
+
"compare_failures",
|
|
22
|
+
"compare_trace_files",
|
|
23
|
+
"compare_verifications",
|
|
16
24
|
"list_verify_demos",
|
|
25
|
+
"load_corpus",
|
|
17
26
|
"load_trace_file",
|
|
18
27
|
"load_verify_demo",
|
|
19
28
|
"verify_trace",
|
|
@@ -0,0 +1,449 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections import Counter
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from contexttrace.verify.claims import Claim
|
|
8
|
+
from contexttrace.verify.evidence import find_best_evidence
|
|
9
|
+
from contexttrace.verify.runner import verify_trace
|
|
10
|
+
from contexttrace.verify.schema import RAGTrace, TraceContext, VerificationInputError, load_trace_file
|
|
11
|
+
from contexttrace.verify.verdicts import classify_claim
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Audit labels. NO_FAILURE marks a claim with nothing to diagnose; each of the
# other labels names one way the evidence chain can break.
NO_FAILURE = "no_failure_detected"
RETRIEVAL_MISS = "retrieval_miss"
RERANKING_FAILURE = "reranking_failure"
CHUNKING_ISSUE = "chunking_issue"
CORPUS_GAP = "corpus_gap"
ANSWER_OVERREACH = "answer_overreach"
STALE_SOURCE = "stale_source"
INSUFFICIENT_CONTEXT = "insufficient_context"

# Every label that counts as an audit failure (NO_FAILURE is deliberately absent).
AUDIT_FAILURE_LABELS = {
    RETRIEVAL_MISS,
    RERANKING_FAILURE,
    CHUNKING_ISSUE,
    CORPUS_GAP,
    ANSWER_OVERREACH,
    STALE_SOURCE,
    INSUFFICIENT_CONTEXT,
}

# citation_status values that mark a claim's citation as broken.
BAD_CITATIONS = {
    "cited_source_missing",
    "cited_source_does_not_support_claim",
    "claim_supported_by_different_source",
}

# Verdicts that count as "the claim is supported".
SUPPORTED_VERDICTS = {"supported"}

# File suffixes and directory names for corpus discovery.
# NOTE(review): the code that consumes these is not visible in this view;
# presumably the corpus walker includes the suffixes and skips the directories.
CORPUS_EXTENSIONS = {
    ".csv", ".html", ".json", ".jsonl",
    ".md", ".markdown", ".rst", ".text",
    ".tsv", ".txt", ".yaml", ".yml",
}
SKIP_DIRECTORIES = {
    ".contexttrace", ".git", ".hg", ".svn",
    ".mypy_cache", ".pytest_cache", ".ruff_cache", "__pycache__",
    "build", "dist", "node_modules",
}

# NOTE(review): presumably a per-file size limit for corpus documents - confirm
# against the file reader, which is not visible in this view.
MAX_FILE_BYTES = 1_000_000

# A supporting source whose retrieved rank is >= this value is diagnosed as a
# reranking failure rather than a chunking issue.
RERANKING_CUTOFF = 3
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def audit_trace_file(
    trace_path: str | Path,
    *,
    corpus_path: str | Path,
    mode: str = "lexical",
) -> dict[str, Any]:
    """Load the trace stored at *trace_path* and audit it against a corpus.

    Convenience wrapper: reads the trace with ``load_trace_file`` and passes
    it, together with *corpus_path* and *mode*, straight to ``audit_trace``.
    Returns whatever ``audit_trace`` returns.
    """
    return audit_trace(load_trace_file(trace_path), corpus_path=corpus_path, mode=mode)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def audit_trace(
    trace: RAGTrace,
    *,
    corpus_path: str | Path,
    mode: str = "lexical",
) -> dict[str, Any]:
    """Verify *trace*, then re-check every claim against a broader corpus.

    The corpus is loaded first, so a bad *corpus_path* raises
    ``VerificationInputError`` from ``load_corpus`` before any verification
    work is done.

    Returns a dict with the keys ``query``, ``answer``, ``summary``,
    ``claims`` (one audit entry per verified claim), ``verification`` (the
    verifier's summary, abstention and diagnostics sections), ``corpus``
    (path and document count) and ``metadata`` (a copy of the trace's).
    """
    corpus_contexts = load_corpus(corpus_path)
    verification = verify_trace(trace, mode=mode)

    claim_audits = []
    for verified_claim in verification.get("claims") or []:
        claim_audits.append(_audit_claim(verified_claim, trace, corpus_contexts, mode=mode))

    summary = _summary(claim_audits, verification, corpus_contexts, mode=mode)

    # Only these three sections of the verifier output are carried over;
    # a missing or empty section becomes an empty dict.
    verification_view = {
        "summary": verification.get("summary") or {},
        "abstention": verification.get("abstention") or {},
        "diagnostics": verification.get("diagnostics") or {},
    }
    corpus_view = {
        "path": str(Path(corpus_path)),
        "documents": len(corpus_contexts),
    }
    return {
        "query": trace.query,
        "answer": trace.answer,
        "summary": summary,
        "claims": claim_audits,
        "verification": verification_view,
        "corpus": corpus_view,
        "metadata": dict(trace.metadata),
    }
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def load_corpus(corpus_path: str | Path) -> list[TraceContext]:
    """Read a corpus file or directory into a list of ``TraceContext`` objects.

    A file path yields that single file. Any other existing path is handed to
    ``_corpus_files`` to enumerate candidates. Documents whose text is empty
    or whitespace-only are skipped.

    Raises:
        VerificationInputError: if *corpus_path* does not exist, or if no
            document with non-blank text was found.
    """
    root = Path(corpus_path)
    if not root.exists():
        raise VerificationInputError("Corpus path %s does not exist." % root)

    if root.is_file():
        candidates = [root]
    else:
        candidates = _corpus_files(root)

    documents: list[TraceContext] = []
    for candidate in candidates:
        body = _read_text(candidate)
        if body.strip():
            # The same id is used as the context id and as metadata["source"].
            doc_id = _context_id(candidate, root)
            details = {
                "path": str(candidate),
                "source": doc_id,
                "size_bytes": candidate.stat().st_size,
                "kind": "corpus_document",
            }
            documents.append(TraceContext(id=doc_id, text=body, metadata=details))

    if documents:
        return documents
    raise VerificationInputError("Corpus path %s did not contain readable text documents." % root)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def audit_failures(result: dict[str, Any], fail_on: tuple[str, ...]) -> list[str]:
    """Turn ``--fail-on`` rules into failure messages for an audit result.

    Each rule is normalised (stripped, lower-cased, ``-`` replaced by ``_``)
    and checked against ``result["summary"]``:

    * ``any_failure`` fires when ``has_audit_failures`` is truthy;
    * a failure label fires when its count in the summary is above zero;
    * anything else produces an "unknown --fail-on rule" message.

    Returns the messages in rule order; an empty list means nothing fired.
    """
    if not fail_on:
        return []

    # One message per failure label; the keys match AUDIT_FAILURE_LABELS.
    detected = {
        RETRIEVAL_MISS: "retrieval miss detected",
        RERANKING_FAILURE: "reranking failure detected",
        CHUNKING_ISSUE: "chunking issue detected",
        CORPUS_GAP: "corpus gap detected",
        ANSWER_OVERREACH: "answer overreach detected",
        STALE_SOURCE: "stale source detected",
        INSUFFICIENT_CONTEXT: "insufficient context detected",
    }

    summary = result.get("summary") or {}
    messages = []
    for raw_rule in fail_on:
        rule = raw_rule.strip().lower().replace("-", "_")
        if rule == "any_failure":
            if summary.get("has_audit_failures"):
                messages.append("audit failure detected")
        elif rule in detected:
            if int(summary.get(rule) or 0) > 0:
                messages.append(detected[rule])
        else:
            # The unknown rule is reported exactly as the user typed it.
            messages.append("unknown --fail-on rule %s" % raw_rule)
    return messages
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _audit_claim(
    claim: dict[str, Any],
    trace: RAGTrace,
    corpus_contexts: list[TraceContext],
    *,
    mode: str,
) -> dict[str, Any]:
    """Audit one verified claim against the corpus.

    The claim text is matched against *corpus_contexts* with
    ``find_best_evidence``, classified with ``classify_claim``, and the two
    views (retrieved contexts vs. corpus) are combined by ``_diagnose`` into
    an audit label.

    Returns a dict with the claim id and text, the diagnosis fields
    (``audit_label``, ``confidence``, ``reason``, ``suggested_fix``), a
    ``retrieved`` section copied from the verifier's claim entry, and a
    ``corpus`` section describing the best corpus match.
    """
    claim_text = str(claim.get("claim") or "")
    claim_id = str(claim.get("claim_id") or "")

    corpus_match = find_best_evidence(claim_text, corpus_contexts, mode=mode)
    # A claim without an id is classified under the placeholder id "claim".
    corpus_verification = classify_claim(
        Claim(id=claim_id or "claim", text=claim_text),
        corpus_match,
        has_contexts=bool(corpus_contexts),
    )
    diagnosis = _diagnose(claim, trace, corpus_match, corpus_verification)

    # What the verifier found in the contexts that were actually retrieved.
    retrieved_view = {
        "verdict": claim.get("verdict"),
        "best_context_id": claim.get("best_context_id"),
        "best_score": claim.get("best_score"),
        "evidence": claim.get("evidence"),
        "matched_terms": list(claim.get("matched_terms") or []),
        "root_cause": (claim.get("root_cause") or {}).get("label"),
        "citation_status": claim.get("citation_status"),
    }
    # What the same claim looks like against the broader corpus.
    corpus_view = {
        "verdict": corpus_verification.verdict,
        "best_document_id": corpus_match.context_id,
        "best_score": corpus_match.score,
        "evidence": corpus_match.snippet,
        "matched_terms": list(corpus_match.matched_terms),
        "evidence_span": corpus_match.span_dict(),
        "supporting_spans": list(corpus_match.supporting_spans or []),
        "required_facts": list(corpus_verification.required_facts),
        "matched_facts": list(corpus_verification.matched_facts),
        "missing_facts": list(corpus_verification.missing_facts),
        "conflicting_facts": list(corpus_verification.conflicting_facts),
    }
    return {
        "claim_id": claim_id,
        "claim": claim_text,
        "audit_label": diagnosis["label"],
        "confidence": diagnosis["confidence"],
        "reason": diagnosis["reason"],
        "suggested_fix": diagnosis["suggested_fix"],
        "retrieved": retrieved_view,
        "corpus": corpus_view,
    }
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def _diagnose(
    claim: dict[str, Any],
    trace: RAGTrace,
    corpus_match: object,
    corpus_verification: object,
) -> dict[str, Any]:
    """Pick the audit label for one claim.

    The checks run in a fixed order and the first match wins:

    1. citation-only problem on a supported claim -> no failure;
    2. claim is not a failure at all -> no failure;
    3. contradiction or a stale/conflicting root cause -> stale source;
    4. corpus supports the claim -> retrieval miss (source never retrieved),
       reranking failure (source retrieved at rank >= RERANKING_CUTOFF), or
       chunking issue (source retrieved within the cutoff);
    5. answer overreach / partially supported (retrieved, then corpus);
    6. unverifiable on either side -> insufficient context;
    7. bad citation with a corpus score of at least 0.35 -> insufficient context;
    8. otherwise -> corpus gap.

    Returns whatever ``_result`` builds from (label, confidence, reason,
    suggested fix).
    """

    def clamp(value: float, low: float, high: float) -> float:
        # Keep a score-derived confidence inside [low, high].
        return max(low, min(high, value))

    root_cause = claim.get("root_cause") or {}
    verdict = str(claim.get("verdict") or "")
    root_label = str(root_cause.get("label") or NO_FAILURE)
    citation_status = str(claim.get("citation_status") or "")
    corpus_verdict = str(getattr(corpus_verification, "verdict", ""))
    corpus_score = float(getattr(corpus_match, "score", 0.0) or 0.0)
    matched_id = str(getattr(corpus_match, "context_id", "") or "")
    # None when no retrieved context shares the matched document's source.
    # NOTE(review): presumably a 0-based position in trace.contexts - the
    # helper's return statement is outside this view.
    same_source_rank = _same_source_retrieved_rank(matched_id, trace)

    if _is_citation_only_failure(claim):
        return _result(
            NO_FAILURE,
            0.92,
            "The claim is supported by retrieved evidence; the remaining issue is citation-level, not a retrieval or corpus failure.",
            "Fix the claim-level citation, but do not treat this as a retrieval miss.",
        )

    if not _is_failure(claim):
        return _result(
            NO_FAILURE,
            0.99,
            "The claim is already supported by the retrieved contexts.",
            "No fix needed for this claim.",
        )

    if "contradicted" in (verdict, corpus_verdict) or root_label in {"stale_context", "conflicting_contexts"}:
        return _result(
            STALE_SOURCE,
            0.86,
            "The claim appears to conflict with retrieved or corpus evidence.",
            "Resolve stale or conflicting sources before allowing the answer to use this fact.",
        )

    if corpus_verdict in SUPPORTED_VERDICTS:
        # The corpus can support the claim, so the question is why the
        # retrieved contexts could not.
        if same_source_rank is None:
            return _result(
                RETRIEVAL_MISS,
                clamp(corpus_score + 0.12, 0.82, 0.98),
                "The broader corpus contains evidence for this claim, but the retrieved contexts did not include it.",
                "Improve retrieval recall, filters, query rewriting, or top_k so this source is retrieved.",
            )
        if same_source_rank >= RERANKING_CUTOFF:
            return _result(
                RERANKING_FAILURE,
                clamp(corpus_score + 0.08, 0.78, 0.95),
                "A related source was retrieved, but it appeared too low in the retrieved context list for reliable generation.",
                "Add a reranker or raise high-evidence chunks from this source before generation.",
            )
        return _result(
            CHUNKING_ISSUE,
            clamp(corpus_score + 0.08, 0.78, 0.95),
            "The retrieved source appears related, but the retrieved chunk omitted the supporting span found in the corpus.",
            "Adjust chunk boundaries, overlap, or parent-document retrieval so the answerable span is included.",
        )

    if root_label == "answer_overreach" or verdict == "partially_supported":
        return _result(
            ANSWER_OVERREACH,
            0.82,
            "The evidence supports part of the claim, but not every required fact.",
            "Remove unsupported details or retrieve evidence that explicitly supports each detail.",
        )

    if corpus_verdict == "partially_supported":
        return _result(
            ANSWER_OVERREACH,
            0.78,
            "The corpus supports only part of the claim, so the answer likely added unsupported detail.",
            "Split the claim and require support for every required fact before answering.",
        )

    if "unverifiable" in (corpus_verdict, verdict):
        return _result(
            INSUFFICIENT_CONTEXT,
            0.72,
            "The closest corpus evidence is related but too weak or ambiguous to verify the claim.",
            "Retrieve more specific evidence or force the model to qualify/abstain.",
        )

    if citation_status in BAD_CITATIONS and corpus_score >= 0.35:
        return _result(
            INSUFFICIENT_CONTEXT,
            0.7,
            "The claim has a citation problem and the broader corpus evidence is still not strong enough.",
            "Regenerate claim-level citations and require cited sources to cover all required facts.",
        )

    # Nothing in the corpus helps: the weaker the best match, the more
    # confident the corpus-gap diagnosis.
    return _result(
        CORPUS_GAP,
        clamp(1.0 - corpus_score, 0.7, 0.95),
        "Neither the retrieved contexts nor the broader corpus provide enough support for this claim.",
        "Add the missing source to the corpus or make the answer abstain when the corpus lacks this fact.",
    )
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
def _summary(
    claim_audits: list[dict[str, Any]],
    verification: dict[str, Any],
    corpus_contexts: list[TraceContext],
    *,
    mode: str,
) -> dict[str, Any]:
    """Aggregate per-claim audit results into the audit summary dict.

    Besides the fixed keys, the summary has one count per audit label:
    ``NO_FAILURE`` first, then the failure labels in sorted order. Labels
    that never occurred are present with a count of 0.
    """
    # A missing or empty audit_label is counted as NO_FAILURE here ...
    counts = Counter(str(entry.get("audit_label") or NO_FAILURE) for entry in claim_audits)
    ordered_labels = [NO_FAILURE] + sorted(AUDIT_FAILURE_LABELS)
    failure_total = sum(counts[label] for label in AUDIT_FAILURE_LABELS)

    # ... whereas this tally compares the raw value, so an entry without a
    # label is included in it.
    audited = sum(1 for entry in claim_audits if entry.get("audit_label") != NO_FAILURE)

    summary = {
        "mode": mode,
        "total_claims": len(claim_audits),
        "audited_claims": audited,
        "corpus_documents": len(corpus_contexts),
        "has_audit_failures": failure_total > 0,
        "primary_audit_label": _primary_label(counts),
        "verification_failure_type": (verification.get("summary") or {}).get("failure_type"),
        "verification_primary_root_cause": (verification.get("summary") or {}).get("primary_root_cause"),
    }
    for label in ordered_labels:
        summary[label] = counts[label]
    return summary
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def _primary_label(counts: Counter) -> str:
    """Pick the single label that best represents the audit outcome.

    The failure label with the highest count wins. When counts are equal, the
    label appearing earlier in the tie-break order below is preferred; labels
    absent from that order rank after all listed ones.

    Args:
        counts: Number of claims per audit label.

    Returns:
        The chosen failure label, or ``NO_FAILURE`` when no failure label has a
        non-zero count.
    """
    observed = {}
    for label in AUDIT_FAILURE_LABELS:
        occurrences = counts[label]
        if occurrences:
            observed[label] = occurrences
    if not observed:
        return NO_FAILURE

    tie_break_order = (
        RETRIEVAL_MISS,
        CHUNKING_ISSUE,
        RERANKING_FAILURE,
        CORPUS_GAP,
        ANSWER_OVERREACH,
        STALE_SOURCE,
        INSUFFICIENT_CONTEXT,
    )
    # First position wins if a label were ever listed twice.
    rank: dict[str, int] = {}
    for position, label in enumerate(tie_break_order):
        rank.setdefault(label, position)
    unranked = len(tie_break_order)

    def _sort_key(label: str) -> tuple[int, int]:
        # Negated rank: a smaller position must compare as "greater" for max().
        return observed[label], -rank.get(label, unranked)

    return max(observed, key=_sort_key)
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def _is_failure(claim: dict[str, Any]) -> bool:
    """Report whether a verified claim shows any kind of failure.

    A claim fails when its verdict is not in ``SUPPORTED_VERDICTS``, when its
    citation status is in ``BAD_CITATIONS``, or when its root-cause label is
    anything other than ``NO_FAILURE``. Missing fields are treated as an empty
    string (verdict, citation status) or ``NO_FAILURE`` (root-cause label).
    """
    verdict = str(claim.get("verdict") or "")
    if verdict not in SUPPORTED_VERDICTS:
        return True

    citation_status = str(claim.get("citation_status") or "")
    if citation_status in BAD_CITATIONS:
        return True

    root_cause = claim.get("root_cause") or {}
    return str(root_cause.get("label") or NO_FAILURE) != NO_FAILURE
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
def _is_citation_only_failure(claim: dict[str, Any]) -> bool:
    """Report whether the only problem with a claim is its citation.

    True when the verdict is in ``SUPPORTED_VERDICTS``, the citation status is
    in ``BAD_CITATIONS``, and the root-cause label is either absent /
    ``NO_FAILURE`` or one of the two citation-specific labels.
    """
    verdict = str(claim.get("verdict") or "")
    if verdict not in SUPPORTED_VERDICTS:
        return False

    citation_status = str(claim.get("citation_status") or "")
    if citation_status not in BAD_CITATIONS:
        return False

    root_cause = claim.get("root_cause") or {}
    root_label = str(root_cause.get("label") or NO_FAILURE)
    citation_labels = {"wrong_source_cited", "missing_cited_source", NO_FAILURE}
    return root_label in citation_labels
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
def _same_source_retrieved_rank(corpus_context_id: str, trace: RAGTrace) -> int | None:
    """Find where a corpus document already appears among the retrieved contexts.

    Each retrieved context is compared by its ``id`` and by the ``source``,
    ``path``, ``file`` and ``document`` entries of its metadata, after
    normalising every value with ``_source_key``.

    Args:
        corpus_context_id: Identifier of the corpus document to look for.
        trace: Trace whose ``contexts`` sequence is searched in order.

    Returns:
        The zero-based position of the first matching context in
        ``trace.contexts``, or ``None`` when the identifier normalises to an
        empty key or nothing matches.
    """
    wanted = _source_key(corpus_context_id)
    if not wanted:
        return None

    metadata_fields = ("source", "path", "file", "document")
    for position, retrieved in enumerate(trace.contexts):
        identifiers = [retrieved.id]
        identifiers.extend(retrieved.metadata.get(field) for field in metadata_fields)
        for identifier in identifiers:
            if _sources_match(wanted, _source_key(identifier)):
                return position
    return None
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
def _sources_match(left: str, right: str) -> bool:
|
|
395
|
+
if not left or not right:
|
|
396
|
+
return False
|
|
397
|
+
if left == right:
|
|
398
|
+
return True
|
|
399
|
+
return Path(left).name == Path(right).name
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
def _source_key(value: Any) -> str:
|
|
403
|
+
text = str(value or "").strip().replace("\\", "/").lower()
|
|
404
|
+
return text.strip("./")
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
def _result(label: str, confidence: float, reason: str, suggested_fix: str) -> dict[str, Any]:
|
|
408
|
+
return {
|
|
409
|
+
"label": label,
|
|
410
|
+
"confidence": round(confidence, 3),
|
|
411
|
+
"reason": reason,
|
|
412
|
+
"suggested_fix": suggested_fix,
|
|
413
|
+
}
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
def _corpus_files(root: Path) -> list[Path]:
    """Collect the corpus files found underneath ``root``.

    A path is kept when it is a regular file, none of its components below
    ``root`` is named in ``SKIP_DIRECTORIES``, its lower-cased suffix is in
    ``CORPUS_EXTENSIONS`` and it is no larger than ``MAX_FILE_BYTES``.

    Args:
        root: Directory to walk recursively.

    Returns:
        The matching paths, sorted case-insensitively by their string form
        (with the exact string as a tie-breaker so the order is stable).
    """
    files: list[Path] = []
    for path in root.rglob("*"):
        if not path.is_file():
            continue
        # Only look at components below the corpus root: a corpus that itself
        # lives under a directory with a skipped name must not be discarded
        # wholesale.
        try:
            relative_parts = path.relative_to(root).parts
        except ValueError:
            relative_parts = path.parts
        if any(part in SKIP_DIRECTORIES for part in relative_parts):
            continue
        if path.suffix.lower() not in CORPUS_EXTENSIONS:
            continue
        # The file may vanish or become unreadable between listing and stat;
        # skip it rather than abort the whole walk.
        try:
            size = path.stat().st_size
        except OSError:
            continue
        if size > MAX_FILE_BYTES:
            continue
        files.append(path)
    return sorted(files, key=lambda item: (str(item).lower(), str(item)))
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
def _read_text(path: Path) -> str:
|
|
432
|
+
try:
|
|
433
|
+
return path.read_text(encoding="utf-8")
|
|
434
|
+
except UnicodeDecodeError:
|
|
435
|
+
try:
|
|
436
|
+
return path.read_text(encoding="utf-8", errors="ignore")
|
|
437
|
+
except OSError:
|
|
438
|
+
return ""
|
|
439
|
+
except OSError:
|
|
440
|
+
return ""
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
def _context_id(path: Path, root: Path) -> str:
|
|
444
|
+
if root.is_file():
|
|
445
|
+
return path.name
|
|
446
|
+
try:
|
|
447
|
+
return path.relative_to(root).as_posix()
|
|
448
|
+
except ValueError:
|
|
449
|
+
return path.name
|