contexttrace 0.1.0__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {contexttrace-0.1.0 → contexttrace-0.2.0}/PKG-INFO +25 -1
- {contexttrace-0.1.0 → contexttrace-0.2.0}/README.md +24 -0
- contexttrace-0.2.0/contexttrace/_version.py +1 -0
- {contexttrace-0.1.0 → contexttrace-0.2.0}/contexttrace/cli.py +217 -0
- contexttrace-0.2.0/contexttrace/verify/__init__.py +21 -0
- contexttrace-0.2.0/contexttrace/verify/abstention.py +70 -0
- contexttrace-0.2.0/contexttrace/verify/benchmark.py +229 -0
- contexttrace-0.2.0/contexttrace/verify/citations.py +81 -0
- contexttrace-0.2.0/contexttrace/verify/claims.py +221 -0
- contexttrace-0.2.0/contexttrace/verify/demos.py +165 -0
- contexttrace-0.2.0/contexttrace/verify/evidence.py +330 -0
- contexttrace-0.2.0/contexttrace/verify/report.py +436 -0
- contexttrace-0.2.0/contexttrace/verify/runner.py +141 -0
- contexttrace-0.2.0/contexttrace/verify/schema.py +150 -0
- contexttrace-0.2.0/contexttrace/verify/verdicts.py +170 -0
- {contexttrace-0.1.0 → contexttrace-0.2.0}/contexttrace.egg-info/SOURCES.txt +12 -1
- {contexttrace-0.1.0 → contexttrace-0.2.0}/pyproject.toml +1 -1
- contexttrace-0.1.0/contexttrace/_version.py +0 -1
- {contexttrace-0.1.0 → contexttrace-0.2.0}/MANIFEST.in +0 -0
- {contexttrace-0.1.0 → contexttrace-0.2.0}/contexttrace/__init__.py +0 -0
- {contexttrace-0.1.0 → contexttrace-0.2.0}/contexttrace/client.py +0 -0
- {contexttrace-0.1.0 → contexttrace-0.2.0}/contexttrace/config.py +0 -0
- {contexttrace-0.1.0 → contexttrace-0.2.0}/contexttrace/demo.py +0 -0
- {contexttrace-0.1.0 → contexttrace-0.2.0}/contexttrace/demo_data.py +0 -0
- {contexttrace-0.1.0 → contexttrace-0.2.0}/contexttrace/endpoint_eval.py +0 -0
- {contexttrace-0.1.0 → contexttrace-0.2.0}/contexttrace/errors.py +0 -0
- {contexttrace-0.1.0 → contexttrace-0.2.0}/contexttrace/evaluator.py +0 -0
- {contexttrace-0.1.0 → contexttrace-0.2.0}/contexttrace/integrations/__init__.py +0 -0
- {contexttrace-0.1.0 → contexttrace-0.2.0}/contexttrace/integrations/fastapi.py +0 -0
- {contexttrace-0.1.0 → contexttrace-0.2.0}/contexttrace/integrations/langchain.py +0 -0
- {contexttrace-0.1.0 → contexttrace-0.2.0}/contexttrace/integrations/langgraph.py +0 -0
- {contexttrace-0.1.0 → contexttrace-0.2.0}/contexttrace/integrations/llamaindex.py +0 -0
- {contexttrace-0.1.0 → contexttrace-0.2.0}/contexttrace/integrations/opentelemetry.py +0 -0
- {contexttrace-0.1.0 → contexttrace-0.2.0}/contexttrace/local.py +0 -0
- {contexttrace-0.1.0 → contexttrace-0.2.0}/contexttrace/py.typed +0 -0
- {contexttrace-0.1.0 → contexttrace-0.2.0}/contexttrace/regression.py +0 -0
- {contexttrace-0.1.0 → contexttrace-0.2.0}/contexttrace/reliability.py +0 -0
- {contexttrace-0.1.0 → contexttrace-0.2.0}/contexttrace/report.py +0 -0
- {contexttrace-0.1.0 → contexttrace-0.2.0}/contexttrace/storage/__init__.py +0 -0
- {contexttrace-0.1.0 → contexttrace-0.2.0}/contexttrace/storage/sqlite_store.py +0 -0
- {contexttrace-0.1.0 → contexttrace-0.2.0}/contexttrace/thresholds.py +0 -0
- {contexttrace-0.1.0 → contexttrace-0.2.0}/contexttrace/transport.py +0 -0
- {contexttrace-0.1.0 → contexttrace-0.2.0}/contexttrace/viewer.py +0 -0
- {contexttrace-0.1.0 → contexttrace-0.2.0}/setup.cfg +0 -0
- {contexttrace-0.1.0 → contexttrace-0.2.0}/setup.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: contexttrace
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: Local-first SDK and CLI for RAG and agent reliability tracing, citation checks, and failure diagnosis.
|
|
5
5
|
Author: ContextTrace contributors
|
|
6
6
|
License: MIT
|
|
@@ -133,6 +133,30 @@ contexttrace eval \
|
|
|
133
133
|
--fail-on "failure_rate>0.25"
|
|
134
134
|
```
|
|
135
135
|
|
|
136
|
+
## Claim-Level Evidence Verification
|
|
137
|
+
|
|
138
|
+
Verify a portable RAG trace artifact without a hosted dashboard:
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
contexttrace verify-demo unsupported_claim --report
|
|
142
|
+
contexttrace verify trace.json
|
|
143
|
+
contexttrace verify trace.json --json
|
|
144
|
+
contexttrace verify trace.json --report --out reports/example.html
|
|
145
|
+
contexttrace verify trace.json --mode semantic
|
|
146
|
+
contexttrace verify trace.json --fail-on unsupported --fail-on citation_mismatch
|
|
147
|
+
contexttrace verify-benchmark --mode semantic
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
Input requires `query`, `answer`, and `contexts` with `id` and `text`. Optional `citations` are checked to catch cited sources that do not actually support the matched claim.
|
|
151
|
+
|
|
152
|
+
`verify-demo` uses bundled demo traces, so it works immediately after `pip install contexttrace`. Available demos include `unsupported_claim`, `partial_support`, `citation_mismatch`, `should_abstain`, and `supported_answer`.
|
|
153
|
+
|
|
154
|
+
Use `--mode semantic` for local paraphrase-aware matching, and `verify-benchmark` to inspect bundled precision/recall metrics.
|
|
155
|
+
|
|
156
|
+
ContextTrace verifies whether each generated claim is actually supported by retrieved evidence. Instead of only showing a trace or a score, it tells you where the evidence chain broke: unsupported claim, citation mismatch, insufficient context, or should-have-abstained.
|
|
157
|
+
|
|
158
|
+
The v0.2.0 verifier uses local lexical heuristics by default; pass `--mode semantic` for local paraphrase-aware matching. Claim extraction is rule-based, contradiction detection is conservative, and LLM-judge support can be added later.
|
|
159
|
+
|
|
136
160
|
## What It Catches
|
|
137
161
|
|
|
138
162
|
- `retrieval_miss`
|
|
@@ -76,6 +76,30 @@ contexttrace eval \
|
|
|
76
76
|
--fail-on "failure_rate>0.25"
|
|
77
77
|
```
|
|
78
78
|
|
|
79
|
+
## Claim-Level Evidence Verification
|
|
80
|
+
|
|
81
|
+
Verify a portable RAG trace artifact without a hosted dashboard:
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
contexttrace verify-demo unsupported_claim --report
|
|
85
|
+
contexttrace verify trace.json
|
|
86
|
+
contexttrace verify trace.json --json
|
|
87
|
+
contexttrace verify trace.json --report --out reports/example.html
|
|
88
|
+
contexttrace verify trace.json --mode semantic
|
|
89
|
+
contexttrace verify trace.json --fail-on unsupported --fail-on citation_mismatch
|
|
90
|
+
contexttrace verify-benchmark --mode semantic
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
Input requires `query`, `answer`, and `contexts` with `id` and `text`. Optional `citations` are checked to catch cited sources that do not actually support the matched claim.
|
|
94
|
+
|
|
95
|
+
`verify-demo` uses bundled demo traces, so it works immediately after `pip install contexttrace`. Available demos include `unsupported_claim`, `partial_support`, `citation_mismatch`, `should_abstain`, and `supported_answer`.
|
|
96
|
+
|
|
97
|
+
Use `--mode semantic` for local paraphrase-aware matching, and `verify-benchmark` to inspect bundled precision/recall metrics.
|
|
98
|
+
|
|
99
|
+
ContextTrace verifies whether each generated claim is actually supported by retrieved evidence. Instead of only showing a trace or a score, it tells you where the evidence chain broke: unsupported claim, citation mismatch, insufficient context, or should-have-abstained.
|
|
100
|
+
|
|
101
|
+
The v0.2.0 verifier uses local lexical heuristics by default; pass `--mode semantic` for local paraphrase-aware matching. Claim extraction is rule-based, contradiction detection is conservative, and LLM-judge support can be added later.
|
|
102
|
+
|
|
79
103
|
## What It Catches
|
|
80
104
|
|
|
81
105
|
- `retrieval_miss`
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.2.0"
|
|
@@ -22,6 +22,15 @@ from contexttrace.regression import BENCHMARK_STRATEGIES, run_local_benchmark
|
|
|
22
22
|
from contexttrace.report import ReportGenerator
|
|
23
23
|
from contexttrace.storage import SQLiteTraceStore
|
|
24
24
|
from contexttrace.thresholds import parse_thresholds, threshold_failures
|
|
25
|
+
from contexttrace.verify import (
|
|
26
|
+
VerificationInputError,
|
|
27
|
+
list_verify_demos,
|
|
28
|
+
load_trace_file,
|
|
29
|
+
load_verify_demo,
|
|
30
|
+
verify_trace,
|
|
31
|
+
)
|
|
32
|
+
from contexttrace.verify.benchmark import run_verify_benchmark
|
|
33
|
+
from contexttrace.verify.report import VerifyReportGenerator
|
|
25
34
|
from contexttrace.viewer import serve_viewer
|
|
26
35
|
|
|
27
36
|
|
|
@@ -198,6 +207,214 @@ def report(
|
|
|
198
207
|
webbrowser.open(Path(written).resolve().as_uri())
|
|
199
208
|
|
|
200
209
|
|
|
210
|
+
@cli.command("verify")
@click.argument("trace_json")
@click.option("--json", "json_output", is_flag=True, help="Print the full verification result as JSON.")
@click.option("--report", is_flag=True, help="Generate a local HTML verification report.")
@click.option("--out", default=None, help="HTML report path. Implies --report when provided.")
@click.option("--mode", default="lexical", show_default=True, type=click.Choice(["lexical", "semantic"]), help="Evidence scoring mode.")
@click.option("--fail-on", multiple=True, help="Fail on unsupported, partial_support, citation_mismatch, should_abstain, contradicted, unverifiable, no_citation, or any_failure.")
def verify_command(
    trace_json: str,
    json_output: bool,
    report: bool,
    out: Optional[str],
    mode: str,
    fail_on: tuple[str, ...],
) -> int:
    """Verify claim-level evidence support for a portable RAG trace JSON file."""
    # NOTE: the docstring above doubles as the click help text, so it is kept short.
    #
    # Flow: load the trace file, run verification in the requested mode,
    # optionally write the HTML report, then print the result. The returned
    # int is 1 when a --fail-on rule matched and 0 otherwise.

    # Input problems are surfaced as a click error instead of a traceback.
    try:
        loaded_trace = load_trace_file(trace_json)
    except VerificationInputError as exc:
        raise click.ClickException(str(exc)) from exc

    verification = verify_trace(loaded_trace, mode=mode)

    # The default report file name is derived from the input file's stem.
    report_name = "%s_verify.html" % Path(trace_json).stem
    report_path = _write_verify_report(
        verification,
        loaded_trace,
        report=report,
        out=out,
        default_name=report_name,
    )

    return _print_verify_result(
        verification,
        json_output=json_output,
        written_report=report_path,
        fail_on=fail_on,
    )
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
@cli.command("verify-demo")
@click.argument("demo_name", required=False, default="unsupported_claim")
@click.option("--json", "json_output", is_flag=True, help="Print the full verification result as JSON.")
@click.option("--report", is_flag=True, help="Generate a local HTML verification report.")
@click.option("--out", default=None, help="HTML report path. Implies --report when provided.")
@click.option("--mode", default="lexical", show_default=True, type=click.Choice(["lexical", "semantic"]), help="Evidence scoring mode.")
@click.option("--fail-on", multiple=True, help="Fail on unsupported, partial_support, citation_mismatch, should_abstain, contradicted, unverifiable, no_citation, or any_failure.")
def verify_demo_command(
    demo_name: str,
    json_output: bool,
    report: bool,
    out: Optional[str],
    mode: str,
    fail_on: tuple[str, ...],
) -> int:
    """Run a bundled claim-level verification demo."""
    # NOTE: the docstring above doubles as the click help text, so it is kept short.
    #
    # Same flow as the "verify" command, except the trace comes from a named
    # bundled demo rather than a file on disk.

    try:
        demo_trace = load_verify_demo(demo_name)
    except KeyError as exc:
        # An unknown demo name is reported together with the valid choices.
        available = ", ".join(list_verify_demos())
        raise click.ClickException(
            "Unknown verify demo %s. Available demos: %s" % (demo_name, available)
        ) from exc

    verification = verify_trace(demo_trace, mode=mode)

    report_name = "%s_verify_demo.html" % demo_name
    report_path = _write_verify_report(
        verification,
        demo_trace,
        report=report,
        out=out,
        default_name=report_name,
    )

    return _print_verify_result(
        verification,
        json_output=json_output,
        written_report=report_path,
        fail_on=fail_on,
    )
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
@cli.command("verify-benchmark")
@click.option("--mode", default="lexical", show_default=True, type=click.Choice(["lexical", "semantic"]), help="Evidence scoring mode.")
@click.option("--json", "json_output", is_flag=True, help="Print benchmark results as JSON.")
def verify_benchmark_command(mode: str, json_output: bool) -> int:
    """Run the bundled verification precision/recall benchmark."""
    # NOTE: the docstring above doubles as the click help text, so it is kept short.
    #
    # Prints either the raw benchmark dict as JSON, or a tab-separated
    # per-label table followed by the cases whose predicted labels did not
    # exactly match the expected ones. Always returns 0.

    benchmark = run_verify_benchmark(mode=mode)

    if json_output:
        click.echo(json.dumps(benchmark, indent=2))
        return 0

    click.echo("Mode: %s" % benchmark["mode"])
    click.echo("Cases: %s" % benchmark["cases"])
    click.echo("Exact match rate: %.3f" % float(benchmark["exact_match_rate"]))

    # One tab-separated row per label, under a matching header line.
    click.echo("label\tprecision\trecall\tf1\ttp\tfp\tfn")
    for label, stats in benchmark["per_label"].items():
        row_values = (
            label,
            float(stats["precision"]),
            float(stats["recall"]),
            float(stats["f1"]),
            stats["tp"],
            stats["fp"],
            stats["fn"],
        )
        click.echo("%s\t%.3f\t%.3f\t%.3f\t%s\t%s\t%s" % row_values)

    mismatched = [row for row in benchmark["rows"] if not row["exact_match"]]
    if not mismatched:
        return 0

    click.echo("Mismatches:")
    for row in mismatched:
        expected = ",".join(row["expected"])
        predicted = ",".join(row["predicted"])
        click.echo("- %s expected=%s predicted=%s" % (row["id"], expected, predicted))
    return 0
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
def _write_verify_report(
|
|
329
|
+
result: dict,
|
|
330
|
+
trace: object,
|
|
331
|
+
*,
|
|
332
|
+
report: bool,
|
|
333
|
+
out: Optional[str],
|
|
334
|
+
default_name: str,
|
|
335
|
+
) -> Optional[str]:
|
|
336
|
+
if not report and not out:
|
|
337
|
+
return None
|
|
338
|
+
output_path = out or str(Path(".contexttrace") / "reports" / default_name)
|
|
339
|
+
return VerifyReportGenerator().generate(result, trace, path=output_path)
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
def _print_verify_result(
    result: dict,
    *,
    json_output: bool,
    written_report: Optional[str],
    fail_on: tuple[str, ...] = (),
) -> int:
    """Print a verification result and compute the command's return code.

    In JSON mode the result dict is dumped to stdout and the report path (if
    any) goes to stderr, so stdout stays pure JSON. In text mode a
    human-readable summary is printed to stdout, followed by the report path.
    In both modes each triggered ``--fail-on`` message is written to stderr
    last.

    Returns 1 when at least one ``--fail-on`` rule produced a message, else 0.
    """
    failures = _verify_failures(result, fail_on)
    exit_code = 1 if failures else 0

    if json_output:
        if written_report:
            click.echo("Report: %s" % written_report, err=True)
        click.echo(json.dumps(result, indent=2))
    else:
        summary = result["summary"]
        click.echo("Claims verified: %s" % summary["total_claims"])
        counts_line = "Supported: {supported} | Partial: {partially_supported} | Unsupported: {unsupported} | Unverifiable: {unverifiable} | Contradicted: {contradicted}".format(
            **summary
        )
        click.echo(counts_line)
        click.echo("Support rate: %.3f" % float(summary["support_rate"]))
        click.echo("Unsupported claim rate: %.3f" % float(summary["unsupported_claim_rate"]))
        click.echo("Citation mismatches: %s" % summary["citation_mismatches"])
        click.echo("Failure type: %s" % summary["failure_type"])
        click.echo("Should abstain: %s" % str(summary["should_abstain"]).lower())
        click.echo("Suggested fix: %s" % summary["suggested_fix"])
        if written_report:
            click.echo("Report: %s" % written_report)

    # Failure messages always go to stderr, after the main output.
    for failure in failures:
        click.echo("Verification failed: %s" % failure, err=True)
    return exit_code
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
def _verify_failures(result: dict, fail_on: tuple[str, ...]) -> list[str]:
|
|
378
|
+
if not fail_on:
|
|
379
|
+
return []
|
|
380
|
+
summary = result.get("summary") or {}
|
|
381
|
+
claims = result.get("claims") or []
|
|
382
|
+
failure_types = set(summary.get("failure_types") or [])
|
|
383
|
+
messages = []
|
|
384
|
+
for raw_rule in fail_on:
|
|
385
|
+
rule = raw_rule.strip().lower().replace("-", "_")
|
|
386
|
+
if rule == "unsupported" and int(summary.get("unsupported") or 0) > 0:
|
|
387
|
+
messages.append("unsupported claim detected")
|
|
388
|
+
elif rule in {"partial", "partial_support", "partially_supported"} and int(summary.get("partially_supported") or 0) > 0:
|
|
389
|
+
messages.append("partially supported claim detected")
|
|
390
|
+
elif rule == "citation_mismatch" and "citation_mismatch" in failure_types:
|
|
391
|
+
messages.append("citation mismatch detected")
|
|
392
|
+
elif rule == "should_abstain" and bool(summary.get("should_abstain")):
|
|
393
|
+
messages.append("answer should have abstained")
|
|
394
|
+
elif rule == "contradicted" and int(summary.get("contradicted") or 0) > 0:
|
|
395
|
+
messages.append("contradicted claim detected")
|
|
396
|
+
elif rule == "unverifiable" and int(summary.get("unverifiable") or 0) > 0:
|
|
397
|
+
messages.append("unverifiable claim detected")
|
|
398
|
+
elif rule == "no_citation" and any(claim.get("citation_status") == "claim_has_no_citation" for claim in claims):
|
|
399
|
+
messages.append("claim without citation detected")
|
|
400
|
+
elif rule == "any_failure" and failure_types != {"no_failure_detected"}:
|
|
401
|
+
messages.append("verification failure detected")
|
|
402
|
+
elif rule not in {
|
|
403
|
+
"unsupported",
|
|
404
|
+
"partial",
|
|
405
|
+
"partial_support",
|
|
406
|
+
"partially_supported",
|
|
407
|
+
"citation_mismatch",
|
|
408
|
+
"should_abstain",
|
|
409
|
+
"contradicted",
|
|
410
|
+
"unverifiable",
|
|
411
|
+
"no_citation",
|
|
412
|
+
"any_failure",
|
|
413
|
+
}:
|
|
414
|
+
messages.append("unknown --fail-on rule %s" % raw_rule)
|
|
415
|
+
return messages
|
|
416
|
+
|
|
417
|
+
|
|
201
418
|
@cli.command("eval")
|
|
202
419
|
@click.option("--dataset", required=True, help="Path to eval questions JSON.")
|
|
203
420
|
@click.option("--endpoint", default=None, help="RAG endpoint URL. Defaults to config eval_endpoint.")
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from contexttrace.verify.runner import verify_trace, verify_trace_file
|
|
2
|
+
from contexttrace.verify.schema import (
|
|
3
|
+
RAGTrace,
|
|
4
|
+
TraceCitation,
|
|
5
|
+
TraceContext,
|
|
6
|
+
VerificationInputError,
|
|
7
|
+
load_trace_file,
|
|
8
|
+
)
|
|
9
|
+
from contexttrace.verify.demos import list_verify_demos, load_verify_demo
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"RAGTrace",
|
|
13
|
+
"TraceCitation",
|
|
14
|
+
"TraceContext",
|
|
15
|
+
"VerificationInputError",
|
|
16
|
+
"list_verify_demos",
|
|
17
|
+
"load_trace_file",
|
|
18
|
+
"load_verify_demo",
|
|
19
|
+
"verify_trace",
|
|
20
|
+
"verify_trace_file",
|
|
21
|
+
]
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from contexttrace.verify.claims import Claim
|
|
4
|
+
from contexttrace.verify.evidence import find_best_evidence
|
|
5
|
+
from contexttrace.verify.schema import TraceContext
|
|
6
|
+
from contexttrace.verify.verdicts import ClaimVerification
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def judge_abstention(
    *,
    query: str,
    claims: list[Claim],
    contexts: list[TraceContext],
    verifications: list[ClaimVerification],
    mode: str = "lexical",
) -> dict[str, object]:
    """Decide whether the answer should have abstained instead of responding.

    The checks run in order and the first match wins:

    1. No claims: nothing needs evidence, so no abstention.
    2. Claims but no contexts: abstain.
    3. No supported claim and the query's best evidence score is below 0.18:
       abstain.
    4. At least half of the verified claims are unsupported or contradicted:
       abstain.
    5. Any partially supported claim: do not abstain, but qualify the answer.
    6. Otherwise: do not abstain.

    Args:
        query: The user query; scored against ``contexts`` for check 3.
        claims: Claims extracted from the answer.
        contexts: Retrieved contexts.
        verifications: Per-claim verdicts; only ``.verdict`` is read.
        mode: Evidence scoring mode forwarded to ``find_best_evidence``.

    Returns:
        A dict with ``should_abstain`` (bool) and ``reason`` (str).
    """
    if not claims:
        return {
            "should_abstain": False,
            "reason": "The answer does not contain factual claims that require evidence support.",
        }

    if not contexts:
        return {
            "should_abstain": True,
            "reason": "The answer contains factual claims, but no retrieved contexts were provided.",
        }

    total_claims = len(verifications)
    supported = sum(1 for item in verifications if item.verdict == "supported")
    unsupported_like = sum(
        1 for item in verifications if item.verdict in {"unsupported", "contradicted"}
    )

    query_match = find_best_evidence(query, contexts, mode=mode)
    if supported == 0 and query_match.score < 0.18:
        return {
            "should_abstain": True,
            "reason": (
                "The query asks for information that does not appear in the retrieved contexts, "
                "but the answer still gives a factual response."
            ),
        }

    # Guard the ratio: claims may be non-empty while no verifications were
    # supplied, which previously raised ZeroDivisionError here.
    if total_claims and unsupported_like / total_claims >= 0.5:
        return {
            "should_abstain": True,
            "reason": (
                "The answer contains factual claims, but most important claims are unsupported "
                "or contradicted by the retrieved contexts."
            ),
        }

    if any(item.verdict == "partially_supported" for item in verifications):
        return {
            "should_abstain": False,
            "reason": (
                "At least one claim is only partially supported; the answer should remove "
                "or qualify unsupported details rather than fully abstain."
            ),
        }

    return {
        "should_abstain": False,
        "reason": "Most generated claims are supported by retrieved evidence.",
    }
|
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from contexttrace.verify.runner import verify_trace
|
|
7
|
+
from contexttrace.verify.schema import RAGTrace, load_trace
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass(frozen=True)
class VerifyBenchmarkCase:
    """One bundled benchmark case: a trace plus the labels expected for it."""

    # Case identifier; reported as the row "id" in benchmark results.
    id: str
    # Parsed trace that the benchmark passes to verify_trace.
    trace: RAGTrace
    # Labels compared for exact equality against the predicted failure types.
    expected_labels: set[str]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def benchmark_cases() -> list[VerifyBenchmarkCase]:
    """Return the bundled benchmark cases.

    Each case pairs an inline trace payload (query, answer, contexts and
    optional citations) with the set of labels the verifier is expected to
    report for it. The list is rebuilt on every call.
    """
    return [
        # Answer closely restates the cited context.
        _case(
            "supported_refund_window",
            {
                "query": "What is the refund policy?",
                "answer": "Refunds are allowed within 30 days of purchase.",
                "contexts": [
                    {
                        "id": "policy",
                        "text": "Customers may request refunds within 30 days of purchase.",
                    }
                ],
                "citations": [
                    {
                        "claim": "Refunds are allowed within 30 days of purchase.",
                        "source_id": "policy",
                    }
                ],
            },
            {"no_failure_detected"},
        ),
        # Paraphrase: "money back" / "thirty days" versus "refunds" / "30 days".
        _case(
            "semantic_money_back_window",
            {
                "query": "What is the refund policy?",
                "answer": "Refunds are allowed within 30 days.",
                "contexts": [
                    {
                        "id": "policy",
                        "text": "Customers can request money back within thirty days of purchase.",
                    }
                ],
            },
            {"no_failure_detected"},
        ),
        # Paraphrase: "order number" versus "order ID".
        _case(
            "semantic_order_id",
            {
                "query": "What do refund requests need?",
                "answer": "Refund requests must include an order number.",
                "contexts": [
                    {
                        "id": "policy",
                        "text": "Refund requests require an order ID.",
                    }
                ],
            },
            {"no_failure_detected"},
        ),
        # The context says nothing about processing time.
        _case(
            "unsupported_processing_time",
            {
                "query": "How long does refund processing take?",
                "answer": "Refunds are processed within 5 business days.",
                "contexts": [
                    {
                        "id": "policy",
                        "text": "Customers may request refunds within 30 days of purchase.",
                    }
                ],
            },
            {"should_have_abstained", "unsupported_answer"},
        ),
        # The 30-day window is in the context; the manager-approval detail is not.
        _case(
            "partial_manager_approval",
            {
                "query": "Can customers request refunds within 30 days?",
                "answer": "Refunds within 30 days require manager approval.",
                "contexts": [
                    {
                        "id": "policy",
                        "text": "Customers may request refunds within 30 days of purchase.",
                    }
                ],
            },
            {"partial_support"},
        ),
        # The citation points at "archive", but the matching text is in "current_policy".
        _case(
            "citation_wrong_source",
            {
                "query": "What is the current refund window?",
                "answer": "Refunds are allowed within 30 days of purchase.",
                "contexts": [
                    {
                        "id": "archive",
                        "text": "Customers may exchange eligible items within 14 days of purchase.",
                    },
                    {
                        "id": "current_policy",
                        "text": "Customers may request refunds within 30 days of purchase.",
                    },
                ],
                "citations": [
                    {
                        "claim": "Refunds are allowed within 30 days of purchase.",
                        "source_id": "archive",
                    }
                ],
            },
            {"citation_mismatch"},
        ),
        # The only context is about shipping, not refunds.
        _case(
            "should_abstain_vip",
            {
                "query": "What refund exception applies to VIP customers?",
                "answer": "VIP customers receive cash refunds up to 90 days after purchase.",
                "contexts": [
                    {
                        "id": "shipping",
                        "text": "Standard shipping takes 3 to 5 business days.",
                    }
                ],
            },
            {"should_have_abstained", "unsupported_answer"},
        ),
        # The context negates the answer ("not allowed").
        _case(
            "contradicted_refund_allowed",
            {
                "query": "Are refunds allowed within 30 days?",
                "answer": "Refunds are allowed within 30 days of purchase.",
                "contexts": [
                    {
                        "id": "policy",
                        "text": "Refunds are not allowed within 30 days of purchase.",
                    }
                ],
            },
            {"should_have_abstained", "contradicted_answer"},
        ),
        # Two-sentence answer; each sentence is cited to the same context.
        _case(
            "supported_two_claims",
            {
                "query": "What is the refund policy?",
                "answer": "Refunds are allowed within 30 days of purchase. Refund requests must include an order number.",
                "contexts": [
                    {
                        "id": "policy",
                        "text": "Customers may request refunds within 30 days of purchase. Refund requests must include an order number.",
                    }
                ],
                "citations": [
                    {
                        "claim": "Refunds are allowed within 30 days of purchase.",
                        "source_id": "policy",
                    },
                    {
                        "claim": "Refund requests must include an order number.",
                        "source_id": "policy",
                    },
                ],
            },
            {"no_failure_detected"},
        ),
    ]
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def run_verify_benchmark(*, mode: str = "lexical") -> dict[str, Any]:
    """Run every bundled benchmark case and score predicted labels.

    Each case is verified in the given ``mode``; its predicted label set is
    compared with the expected one. Per-label true/false positives and false
    negatives are counted over all cases, and precision, recall and F1 are
    derived from them (0.0 whenever a denominator is zero).

    Returns:
        A dict with ``mode``, ``cases`` (count), ``exact_match_rate``,
        ``per_label`` metrics keyed by label, and the per-case ``rows``.
        Rates and metrics are rounded to three decimals.
    """
    rows = []
    seen_labels = set()
    for case in benchmark_cases():
        outcome = verify_trace(case.trace, mode=mode)
        predicted = _predicted_labels(outcome)
        seen_labels.update(case.expected_labels, predicted)
        rows.append(
            {
                "id": case.id,
                "expected": sorted(case.expected_labels),
                "predicted": sorted(predicted),
                "exact_match": predicted == case.expected_labels,
                "summary": outcome.get("summary") or {},
            }
        )

    per_label = {}
    for label in sorted(seen_labels):
        # Classify each row for this label in a single pass.
        tp = fp = fn = 0
        for row in rows:
            was_expected = label in row["expected"]
            was_predicted = label in row["predicted"]
            if was_expected and was_predicted:
                tp += 1
            elif was_predicted:
                fp += 1
            elif was_expected:
                fn += 1

        precision = tp / (tp + fp) if tp + fp else 0.0
        recall = tp / (tp + fn) if tp + fn else 0.0
        if precision + recall:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0.0

        per_label[label] = {
            "tp": tp,
            "fp": fp,
            "fn": fn,
            "precision": round(precision, 3),
            "recall": round(recall, 3),
            "f1": round(f1, 3),
        }

    exact_matches = sum(1 for row in rows if row["exact_match"])
    exact_match_rate = round(exact_matches / len(rows), 3) if rows else 0.0
    return {
        "mode": mode,
        "cases": len(rows),
        "exact_match_rate": exact_match_rate,
        "per_label": per_label,
        "rows": rows,
    }
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def _case(case_id: str, payload: dict[str, Any], expected_labels: set[str]) -> VerifyBenchmarkCase:
    """Build a benchmark case from an inline trace payload.

    The payload is parsed with ``load_trace``; the case id is included in the
    ``source`` label passed along with it.
    """
    source_label = "benchmark case %s" % case_id
    parsed_trace = load_trace(payload, source=source_label)
    return VerifyBenchmarkCase(id=case_id, trace=parsed_trace, expected_labels=expected_labels)
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def _predicted_labels(result: dict[str, Any]) -> set[str]:
|
|
228
|
+
labels = set((result.get("summary") or {}).get("failure_types") or [])
|
|
229
|
+
return labels or {"no_failure_detected"}
|