contexttrace 0.1.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. {contexttrace-0.1.0 → contexttrace-0.3.0}/MANIFEST.in +1 -0
  2. {contexttrace-0.1.0 → contexttrace-0.3.0}/PKG-INFO +29 -1
  3. {contexttrace-0.1.0 → contexttrace-0.3.0}/README.md +28 -0
  4. contexttrace-0.3.0/contexttrace/_version.py +1 -0
  5. {contexttrace-0.1.0 → contexttrace-0.3.0}/contexttrace/cli.py +233 -0
  6. contexttrace-0.3.0/contexttrace/verify/__init__.py +21 -0
  7. contexttrace-0.3.0/contexttrace/verify/abstention.py +70 -0
  8. contexttrace-0.3.0/contexttrace/verify/benchmark.py +574 -0
  9. contexttrace-0.3.0/contexttrace/verify/citations.py +99 -0
  10. contexttrace-0.3.0/contexttrace/verify/claims.py +281 -0
  11. contexttrace-0.3.0/contexttrace/verify/demos.py +165 -0
  12. contexttrace-0.3.0/contexttrace/verify/evidence.py +433 -0
  13. contexttrace-0.3.0/contexttrace/verify/external_benchmark_cases.json +311 -0
  14. contexttrace-0.3.0/contexttrace/verify/facts.py +387 -0
  15. contexttrace-0.3.0/contexttrace/verify/real_benchmark_cases.json +713 -0
  16. contexttrace-0.3.0/contexttrace/verify/report.py +557 -0
  17. contexttrace-0.3.0/contexttrace/verify/root_cause.py +218 -0
  18. contexttrace-0.3.0/contexttrace/verify/runner.py +151 -0
  19. contexttrace-0.3.0/contexttrace/verify/schema.py +150 -0
  20. contexttrace-0.3.0/contexttrace/verify/spans.py +103 -0
  21. contexttrace-0.3.0/contexttrace/verify/verdicts.py +250 -0
  22. {contexttrace-0.1.0 → contexttrace-0.3.0}/contexttrace.egg-info/SOURCES.txt +17 -1
  23. {contexttrace-0.1.0 → contexttrace-0.3.0}/pyproject.toml +2 -2
  24. contexttrace-0.1.0/contexttrace/_version.py +0 -1
  25. {contexttrace-0.1.0 → contexttrace-0.3.0}/contexttrace/__init__.py +0 -0
  26. {contexttrace-0.1.0 → contexttrace-0.3.0}/contexttrace/client.py +0 -0
  27. {contexttrace-0.1.0 → contexttrace-0.3.0}/contexttrace/config.py +0 -0
  28. {contexttrace-0.1.0 → contexttrace-0.3.0}/contexttrace/demo.py +0 -0
  29. {contexttrace-0.1.0 → contexttrace-0.3.0}/contexttrace/demo_data.py +0 -0
  30. {contexttrace-0.1.0 → contexttrace-0.3.0}/contexttrace/endpoint_eval.py +0 -0
  31. {contexttrace-0.1.0 → contexttrace-0.3.0}/contexttrace/errors.py +0 -0
  32. {contexttrace-0.1.0 → contexttrace-0.3.0}/contexttrace/evaluator.py +0 -0
  33. {contexttrace-0.1.0 → contexttrace-0.3.0}/contexttrace/integrations/__init__.py +0 -0
  34. {contexttrace-0.1.0 → contexttrace-0.3.0}/contexttrace/integrations/fastapi.py +0 -0
  35. {contexttrace-0.1.0 → contexttrace-0.3.0}/contexttrace/integrations/langchain.py +0 -0
  36. {contexttrace-0.1.0 → contexttrace-0.3.0}/contexttrace/integrations/langgraph.py +0 -0
  37. {contexttrace-0.1.0 → contexttrace-0.3.0}/contexttrace/integrations/llamaindex.py +0 -0
  38. {contexttrace-0.1.0 → contexttrace-0.3.0}/contexttrace/integrations/opentelemetry.py +0 -0
  39. {contexttrace-0.1.0 → contexttrace-0.3.0}/contexttrace/local.py +0 -0
  40. {contexttrace-0.1.0 → contexttrace-0.3.0}/contexttrace/py.typed +0 -0
  41. {contexttrace-0.1.0 → contexttrace-0.3.0}/contexttrace/regression.py +0 -0
  42. {contexttrace-0.1.0 → contexttrace-0.3.0}/contexttrace/reliability.py +0 -0
  43. {contexttrace-0.1.0 → contexttrace-0.3.0}/contexttrace/report.py +0 -0
  44. {contexttrace-0.1.0 → contexttrace-0.3.0}/contexttrace/storage/__init__.py +0 -0
  45. {contexttrace-0.1.0 → contexttrace-0.3.0}/contexttrace/storage/sqlite_store.py +0 -0
  46. {contexttrace-0.1.0 → contexttrace-0.3.0}/contexttrace/thresholds.py +0 -0
  47. {contexttrace-0.1.0 → contexttrace-0.3.0}/contexttrace/transport.py +0 -0
  48. {contexttrace-0.1.0 → contexttrace-0.3.0}/contexttrace/viewer.py +0 -0
  49. {contexttrace-0.1.0 → contexttrace-0.3.0}/setup.cfg +0 -0
  50. {contexttrace-0.1.0 → contexttrace-0.3.0}/setup.py +0 -0
@@ -3,6 +3,7 @@ include pyproject.toml
3
3
  include setup.py
4
4
  include contexttrace/py.typed
5
5
  recursive-include contexttrace *.py
6
+ recursive-include contexttrace/verify *.json
6
7
  prune build
7
8
  prune dist
8
9
  prune tests
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: contexttrace
3
- Version: 0.1.0
3
+ Version: 0.3.0
4
4
  Summary: Local-first SDK and CLI for RAG and agent reliability tracing, citation checks, and failure diagnosis.
5
5
  Author: ContextTrace contributors
6
6
  License: MIT
@@ -133,6 +133,34 @@ contexttrace eval \
133
133
  --fail-on "failure_rate>0.25"
134
134
  ```
135
135
 
136
+ ## Claim-Level Evidence Verification
137
+
138
+ Verify a portable RAG trace artifact without a hosted dashboard:
139
+
140
+ ```bash
141
+ contexttrace verify-demo unsupported_claim --report
142
+ contexttrace verify trace.json
143
+ contexttrace verify trace.json --json
144
+ contexttrace verify trace.json --report --out reports/example.html
145
+ contexttrace verify trace.json --mode semantic
146
+ contexttrace verify trace.json --fail-on unsupported --fail-on citation_mismatch
147
+ contexttrace verify-benchmark --mode semantic
148
+ contexttrace verify-benchmark --mode semantic --report
149
+ contexttrace verify-benchmark --case-set external --mode semantic --report
150
+ ```
151
+
152
+ Input requires `query`, `answer`, and `contexts` with `id` and `text`. Optional `citations` are checked to catch cited sources that do not actually support the matched claim.
153
+
154
+ `verify-demo` uses bundled demo traces, so it works immediately after `pip install contexttrace`. Available demos include `unsupported_claim`, `partial_support`, `citation_mismatch`, `should_abstain`, and `supported_answer`.
155
+
156
+ Use `--mode semantic` for local paraphrase-aware matching, and `verify-benchmark` to inspect bundled precision/recall metrics. The default benchmark includes 32 real ContextTrace docs and release-artifact cases. `--case-set external` adds public OSS documentation and GitHub issue cases from Qdrant, Chroma, Haystack, and LangChain, while `--case-set all` runs both packs. `--report` writes an HTML report with misses to inspect.
157
+
158
+ Verification output includes evidence span offsets, stable span hashes, multiple supporting spans, typed matched/missing facts, and claim-level root causes so partial support failures are easier to inspect.
159
+
160
+ ContextTrace verifies whether each generated claim is actually supported by retrieved evidence. Instead of only showing a trace or a score, it tells you where the evidence chain broke: unsupported claim, citation mismatch, retrieval miss, answer overreach, conflicting context, or should-have-abstained.
161
+
162
+ The v0.3.0 verifier uses local lexical heuristics by default; `--mode semantic` opts in to local paraphrase-aware matching. Claim extraction is rule-based, contradiction detection is conservative, and deeper semantic or LLM-judge support can be added later.
163
+
136
164
  ## What It Catches
137
165
 
138
166
  - `retrieval_miss`
@@ -76,6 +76,34 @@ contexttrace eval \
76
76
  --fail-on "failure_rate>0.25"
77
77
  ```
78
78
 
79
+ ## Claim-Level Evidence Verification
80
+
81
+ Verify a portable RAG trace artifact without a hosted dashboard:
82
+
83
+ ```bash
84
+ contexttrace verify-demo unsupported_claim --report
85
+ contexttrace verify trace.json
86
+ contexttrace verify trace.json --json
87
+ contexttrace verify trace.json --report --out reports/example.html
88
+ contexttrace verify trace.json --mode semantic
89
+ contexttrace verify trace.json --fail-on unsupported --fail-on citation_mismatch
90
+ contexttrace verify-benchmark --mode semantic
91
+ contexttrace verify-benchmark --mode semantic --report
92
+ contexttrace verify-benchmark --case-set external --mode semantic --report
93
+ ```
94
+
95
+ Input requires `query`, `answer`, and `contexts` with `id` and `text`. Optional `citations` are checked to catch cited sources that do not actually support the matched claim.
96
+
97
+ `verify-demo` uses bundled demo traces, so it works immediately after `pip install contexttrace`. Available demos include `unsupported_claim`, `partial_support`, `citation_mismatch`, `should_abstain`, and `supported_answer`.
98
+
99
+ Use `--mode semantic` for local paraphrase-aware matching, and `verify-benchmark` to inspect bundled precision/recall metrics. The default benchmark includes 32 real ContextTrace docs and release-artifact cases. `--case-set external` adds public OSS documentation and GitHub issue cases from Qdrant, Chroma, Haystack, and LangChain, while `--case-set all` runs both packs. `--report` writes an HTML report with misses to inspect.
100
+
101
+ Verification output includes evidence span offsets, stable span hashes, multiple supporting spans, typed matched/missing facts, and claim-level root causes so partial support failures are easier to inspect.
102
+
103
+ ContextTrace verifies whether each generated claim is actually supported by retrieved evidence. Instead of only showing a trace or a score, it tells you where the evidence chain broke: unsupported claim, citation mismatch, retrieval miss, answer overreach, conflicting context, or should-have-abstained.
104
+
105
+ The v0.3.0 verifier uses local lexical heuristics by default; `--mode semantic` opts in to local paraphrase-aware matching. Claim extraction is rule-based, contradiction detection is conservative, and deeper semantic or LLM-judge support can be added later.
106
+
79
107
  ## What It Catches
80
108
 
81
109
  - `retrieval_miss`
@@ -0,0 +1 @@
1
+ __version__ = "0.3.0"
@@ -22,6 +22,15 @@ from contexttrace.regression import BENCHMARK_STRATEGIES, run_local_benchmark
22
22
  from contexttrace.report import ReportGenerator
23
23
  from contexttrace.storage import SQLiteTraceStore
24
24
  from contexttrace.thresholds import parse_thresholds, threshold_failures
25
+ from contexttrace.verify import (
26
+ VerificationInputError,
27
+ list_verify_demos,
28
+ load_trace_file,
29
+ load_verify_demo,
30
+ verify_trace,
31
+ )
32
+ from contexttrace.verify.benchmark import run_verify_benchmark, write_verify_benchmark_report
33
+ from contexttrace.verify.report import VerifyReportGenerator
25
34
  from contexttrace.viewer import serve_viewer
26
35
 
27
36
 
@@ -198,6 +207,230 @@ def report(
198
207
  webbrowser.open(Path(written).resolve().as_uri())
199
208
 
200
209
 
210
@cli.command("verify")
@click.argument("trace_json")
@click.option("--json", "json_output", is_flag=True, help="Print the full verification result as JSON.")
@click.option("--report", is_flag=True, help="Generate a local HTML verification report.")
@click.option("--out", default=None, help="HTML report path. Implies --report when provided.")
@click.option("--mode", default="lexical", show_default=True, type=click.Choice(["lexical", "semantic"]), help="Evidence scoring mode.")
@click.option("--fail-on", multiple=True, help="Fail on unsupported, partial_support, citation_mismatch, should_abstain, contradicted, unverifiable, no_citation, or any_failure.")
def verify_command(
    trace_json: str,
    json_output: bool,
    report: bool,
    out: Optional[str],
    mode: str,
    fail_on: tuple[str, ...],
) -> int:
    """Verify claim-level evidence support for a portable RAG trace JSON file."""
    # The docstring above is also the Click help text for this command, so
    # implementation notes live in comments instead.
    #
    # Returns 1 when any --fail-on rule matched, otherwise 0.
    # NOTE(review): Click drops a command's return value in its default
    # standalone mode -- confirm the entry point forwards this int as the
    # process exit status, otherwise --fail-on cannot fail a CI job.

    # Re-raise input problems as ClickException so Click reports them.
    try:
        trace = load_trace_file(trace_json)
    except VerificationInputError as exc:
        raise click.ClickException(str(exc)) from exc

    result = verify_trace(trace, mode=mode)

    # When --out is not given, the report is named after the trace file stem.
    report_name = "%s_verify.html" % Path(trace_json).stem
    written_report = _write_verify_report(
        result,
        trace,
        report=report,
        out=out,
        default_name=report_name,
    )

    exit_code = _print_verify_result(
        result,
        json_output=json_output,
        written_report=written_report,
        fail_on=fail_on,
    )
    return exit_code
246
+
247
+
248
@cli.command("verify-demo")
@click.argument("demo_name", required=False, default="unsupported_claim")
@click.option("--json", "json_output", is_flag=True, help="Print the full verification result as JSON.")
@click.option("--report", is_flag=True, help="Generate a local HTML verification report.")
@click.option("--out", default=None, help="HTML report path. Implies --report when provided.")
@click.option("--mode", default="lexical", show_default=True, type=click.Choice(["lexical", "semantic"]), help="Evidence scoring mode.")
@click.option("--fail-on", multiple=True, help="Fail on unsupported, partial_support, citation_mismatch, should_abstain, contradicted, unverifiable, no_citation, or any_failure.")
def verify_demo_command(
    demo_name: str,
    json_output: bool,
    report: bool,
    out: Optional[str],
    mode: str,
    fail_on: tuple[str, ...],
) -> int:
    """Run a bundled claim-level verification demo."""
    # The docstring above is also the Click help text for this command, so
    # implementation notes live in comments instead.
    #
    # Returns 1 when any --fail-on rule matched, otherwise 0.
    # NOTE(review): Click drops a command's return value in its default
    # standalone mode -- confirm the entry point forwards this int as the
    # process exit status.

    try:
        trace = load_verify_demo(demo_name)
    except KeyError as exc:
        # An unknown demo name is answered with the list of valid names.
        available = ", ".join(list_verify_demos())
        raise click.ClickException(
            "Unknown verify demo %s. Available demos: %s" % (demo_name, available)
        ) from exc

    result = verify_trace(trace, mode=mode)

    # When --out is not given, the report is named after the demo.
    report_name = "%s_verify_demo.html" % demo_name
    written_report = _write_verify_report(
        result,
        trace,
        report=report,
        out=out,
        default_name=report_name,
    )

    exit_code = _print_verify_result(
        result,
        json_output=json_output,
        written_report=written_report,
        fail_on=fail_on,
    )
    return exit_code
287
+
288
+
289
@cli.command("verify-benchmark")
@click.option("--mode", default="lexical", show_default=True, type=click.Choice(["lexical", "semantic"]), help="Evidence scoring mode.")
@click.option("--case-set", default="contexttrace", show_default=True, type=click.Choice(["contexttrace", "external", "all"]), help="Benchmark case set to run.")
@click.option("--json", "json_output", is_flag=True, help="Print benchmark results as JSON.")
@click.option("--report", is_flag=True, help="Generate a local HTML benchmark report.")
@click.option("--out", default=None, help="HTML benchmark report path. Implies --report when provided.")
def verify_benchmark_command(mode: str, case_set: str, json_output: bool, report: bool, out: Optional[str]) -> int:
    """Run the bundled verification precision/recall benchmark."""
    # The docstring above is also the Click help text for this command.
    # This command always returns 0; benchmark mismatches are only printed.

    result = run_verify_benchmark(mode=mode, case_set=case_set)

    written_report = None
    if report or out:
        if out:
            output_path = out
        else:
            # NOTE(review): the default file name depends on --mode only, so
            # runs with different --case-set values share one path -- confirm
            # that overwriting is intended.
            output_path = str(Path(".contexttrace") / "reports" / ("verify_benchmark_%s.html" % mode))
        written_report = write_verify_benchmark_report(result, path=output_path)

    if json_output:
        # The report path goes to stderr so stdout carries only the JSON.
        if written_report:
            click.echo("Report: %s" % written_report, err=True)
        click.echo(json.dumps(result, indent=2))
        return 0

    # Plain-text summary: run identity first, then the aggregate rates.
    identity_lines = (
        ("Mode: %s", "mode"),
        ("Case source: %s", "case_source"),
        ("Cases: %s", "cases"),
    )
    for template, key in identity_lines:
        click.echo(template % result[key])

    rate_lines = (
        ("Exact match rate: %.3f", "exact_match_rate"),
        ("Verdict match rate: %.3f", "verdict_match_rate"),
        ("Citation match rate: %.3f", "citation_match_rate"),
        ("Abstention match rate: %.3f", "abstention_match_rate"),
    )
    for template, key in rate_lines:
        click.echo(template % float(result[key]))

    # Tab-separated per-label table.
    click.echo("label\tprecision\trecall\tf1\ttp\tfp\tfn")
    for label, metrics in result["per_label"].items():
        row_values = (
            label,
            float(metrics["precision"]),
            float(metrics["recall"]),
            float(metrics["f1"]),
            metrics["tp"],
            metrics["fp"],
            metrics["fn"],
        )
        click.echo("%s\t%.3f\t%.3f\t%.3f\t%s\t%s\t%s" % row_values)

    # List every case whose prediction did not match exactly.
    mismatched = [row for row in result["rows"] if not row["exact_match"]]
    if mismatched:
        click.echo("Mismatches:")
        for row in mismatched:
            row_id = row["id"]
            expected = ",".join(row["expected"])
            predicted = ",".join(row["predicted"])
            click.echo("- %s expected=%s predicted=%s" % (row_id, expected, predicted))

    if written_report:
        click.echo("Report: %s" % written_report)
    return 0
341
+
342
+
343
+ def _write_verify_report(
344
+ result: dict,
345
+ trace: object,
346
+ *,
347
+ report: bool,
348
+ out: Optional[str],
349
+ default_name: str,
350
+ ) -> Optional[str]:
351
+ if not report and not out:
352
+ return None
353
+ output_path = out or str(Path(".contexttrace") / "reports" / default_name)
354
+ return VerifyReportGenerator().generate(result, trace, path=output_path)
355
+
356
+
357
def _print_verify_result(
    result: dict,
    *,
    json_output: bool,
    written_report: Optional[str],
    fail_on: tuple[str, ...] = (),
) -> int:
    """Print a verification result and report whether a ``--fail-on`` rule hit.

    In JSON mode the full result is dumped to stdout and the report path, if
    any, goes to stderr. Otherwise a text summary is printed to stdout,
    followed by the report path. Failure messages go to stderr in both modes.

    Args:
        result: Verification result; text mode reads its ``"summary"`` entry.
        json_output: Print the raw result as JSON instead of the summary.
        written_report: Path of a report that was written, if any.
        fail_on: Rule names forwarded to ``_verify_failures``.

    Returns:
        1 when at least one failure message was produced, otherwise 0.
    """
    failures = _verify_failures(result, fail_on)
    exit_code = 1 if failures else 0

    if json_output:
        if written_report:
            click.echo("Report: %s" % written_report, err=True)
        click.echo(json.dumps(result, indent=2))
    else:
        summary = result["summary"]
        click.echo("Claims verified: %s" % summary["total_claims"])
        verdict_counts = "Supported: {supported} | Partial: {partially_supported} | Unsupported: {unsupported} | Unverifiable: {unverifiable} | Contradicted: {contradicted}".format(
            **summary
        )
        click.echo(verdict_counts)
        click.echo("Support rate: %.3f" % float(summary["support_rate"]))
        click.echo("Unsupported claim rate: %.3f" % float(summary["unsupported_claim_rate"]))
        click.echo("Citation mismatches: %s" % summary["citation_mismatches"])
        click.echo("Failure type: %s" % summary["failure_type"])
        # The root cause is the only summary field read with a fallback here.
        click.echo("Primary root cause: %s" % summary.get("primary_root_cause", "unknown"))
        click.echo("Should abstain: %s" % str(summary["should_abstain"]).lower())
        click.echo("Suggested fix: %s" % summary["suggested_fix"])
        if written_report:
            click.echo("Report: %s" % written_report)

    for message in failures:
        click.echo("Verification failed: %s" % message, err=True)
    return exit_code
391
+
392
+
393
+ def _verify_failures(result: dict, fail_on: tuple[str, ...]) -> list[str]:
394
+ if not fail_on:
395
+ return []
396
+ summary = result.get("summary") or {}
397
+ claims = result.get("claims") or []
398
+ failure_types = set(summary.get("failure_types") or [])
399
+ messages = []
400
+ for raw_rule in fail_on:
401
+ rule = raw_rule.strip().lower().replace("-", "_")
402
+ if rule == "unsupported" and int(summary.get("unsupported") or 0) > 0:
403
+ messages.append("unsupported claim detected")
404
+ elif rule in {"partial", "partial_support", "partially_supported"} and int(summary.get("partially_supported") or 0) > 0:
405
+ messages.append("partially supported claim detected")
406
+ elif rule == "citation_mismatch" and "citation_mismatch" in failure_types:
407
+ messages.append("citation mismatch detected")
408
+ elif rule == "should_abstain" and bool(summary.get("should_abstain")):
409
+ messages.append("answer should have abstained")
410
+ elif rule == "contradicted" and int(summary.get("contradicted") or 0) > 0:
411
+ messages.append("contradicted claim detected")
412
+ elif rule == "unverifiable" and int(summary.get("unverifiable") or 0) > 0:
413
+ messages.append("unverifiable claim detected")
414
+ elif rule == "no_citation" and any(claim.get("citation_status") == "claim_has_no_citation" for claim in claims):
415
+ messages.append("claim without citation detected")
416
+ elif rule == "any_failure" and failure_types != {"no_failure_detected"}:
417
+ messages.append("verification failure detected")
418
+ elif rule not in {
419
+ "unsupported",
420
+ "partial",
421
+ "partial_support",
422
+ "partially_supported",
423
+ "citation_mismatch",
424
+ "should_abstain",
425
+ "contradicted",
426
+ "unverifiable",
427
+ "no_citation",
428
+ "any_failure",
429
+ }:
430
+ messages.append("unknown --fail-on rule %s" % raw_rule)
431
+ return messages
432
+
433
+
201
434
  @cli.command("eval")
202
435
  @click.option("--dataset", required=True, help="Path to eval questions JSON.")
203
436
  @click.option("--endpoint", default=None, help="RAG endpoint URL. Defaults to config eval_endpoint.")
@@ -0,0 +1,21 @@
1
+ from contexttrace.verify.runner import verify_trace, verify_trace_file
2
+ from contexttrace.verify.schema import (
3
+ RAGTrace,
4
+ TraceCitation,
5
+ TraceContext,
6
+ VerificationInputError,
7
+ load_trace_file,
8
+ )
9
+ from contexttrace.verify.demos import list_verify_demos, load_verify_demo
10
+
11
+ __all__ = [
12
+ "RAGTrace",
13
+ "TraceCitation",
14
+ "TraceContext",
15
+ "VerificationInputError",
16
+ "list_verify_demos",
17
+ "load_trace_file",
18
+ "load_verify_demo",
19
+ "verify_trace",
20
+ "verify_trace_file",
21
+ ]
@@ -0,0 +1,70 @@
1
+ from __future__ import annotations
2
+
3
+ from contexttrace.verify.claims import Claim
4
+ from contexttrace.verify.evidence import find_best_evidence
5
+ from contexttrace.verify.schema import TraceContext
6
+ from contexttrace.verify.verdicts import ClaimVerification
7
+
8
+
9
def judge_abstention(
    *,
    query: str,
    claims: list[Claim],
    contexts: list[TraceContext],
    verifications: list[ClaimVerification],
    mode: str = "lexical",
) -> dict[str, object]:
    """Decide whether the answer should have abstained instead of answering.

    The checks run in this order and the first match wins:

    1. No claims: nothing needs evidence, so no abstention.
    2. No contexts: abstain.
    3. No supported claim and the query's best evidence score is below 0.18:
       abstain.
    4. No verifications: claim support cannot be assessed, so no abstention
       is recommended.
    5. At least half of the verified claims are unsupported or contradicted:
       abstain.
    6. Any partially supported claim: do not abstain, but qualify the answer.
    7. Otherwise: do not abstain.

    Args:
        query: The user query, scored against ``contexts``.
        claims: Claims extracted from the answer.
        contexts: Retrieved contexts.
        verifications: Per-claim results; each item's ``verdict`` is read.
        mode: Scoring mode forwarded to ``find_best_evidence``.

    Returns:
        A dict with ``"should_abstain"`` (bool) and ``"reason"`` (str).
    """
    if not claims:
        return {
            "should_abstain": False,
            "reason": "The answer does not contain factual claims that require evidence support.",
        }

    if not contexts:
        return {
            "should_abstain": True,
            "reason": "The answer contains factual claims, but no retrieved contexts were provided.",
        }

    total_verified = len(verifications)
    supported = sum(1 for item in verifications if item.verdict == "supported")
    unsupported_like = sum(
        1 for item in verifications if item.verdict in {"unsupported", "contradicted"}
    )

    query_match = find_best_evidence(query, contexts, mode=mode)
    if supported == 0 and query_match.score < 0.18:
        return {
            "should_abstain": True,
            "reason": (
                "The query asks for information that does not appear in the retrieved contexts, "
                "but the answer still gives a factual response."
            ),
        }

    if total_verified == 0:
        # Claims exist but nothing was verified. The ratio below would divide
        # by zero, and the fall-through reason would wrongly say that most
        # claims are supported.
        return {
            "should_abstain": False,
            "reason": "No claim verifications were provided, so claim support could not be assessed.",
        }

    if unsupported_like / total_verified >= 0.5:
        return {
            "should_abstain": True,
            "reason": (
                "The answer contains factual claims, but most important claims are unsupported "
                "or contradicted by the retrieved contexts."
            ),
        }

    if any(item.verdict == "partially_supported" for item in verifications):
        return {
            "should_abstain": False,
            "reason": (
                "At least one claim is only partially supported; the answer should remove "
                "or qualify unsupported details rather than fully abstain."
            ),
        }

    return {
        "should_abstain": False,
        "reason": "Most generated claims are supported by retrieved evidence.",
    }