contexttrace 0.2.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. {contexttrace-0.2.0 → contexttrace-0.3.0}/MANIFEST.in +1 -0
  2. {contexttrace-0.2.0 → contexttrace-0.3.0}/PKG-INFO +8 -4
  3. {contexttrace-0.2.0 → contexttrace-0.3.0}/README.md +7 -3
  4. contexttrace-0.3.0/contexttrace/_version.py +1 -0
  5. {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/cli.py +19 -3
  6. contexttrace-0.3.0/contexttrace/verify/benchmark.py +574 -0
  7. {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/verify/citations.py +19 -1
  8. {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/verify/claims.py +62 -2
  9. {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/verify/evidence.py +113 -10
  10. contexttrace-0.3.0/contexttrace/verify/external_benchmark_cases.json +311 -0
  11. contexttrace-0.3.0/contexttrace/verify/facts.py +387 -0
  12. contexttrace-0.3.0/contexttrace/verify/real_benchmark_cases.json +713 -0
  13. {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/verify/report.py +122 -1
  14. contexttrace-0.3.0/contexttrace/verify/root_cause.py +218 -0
  15. {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/verify/runner.py +11 -1
  16. contexttrace-0.3.0/contexttrace/verify/spans.py +103 -0
  17. {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/verify/verdicts.py +82 -2
  18. {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace.egg-info/SOURCES.txt +5 -0
  19. {contexttrace-0.2.0 → contexttrace-0.3.0}/pyproject.toml +2 -2
  20. contexttrace-0.2.0/contexttrace/_version.py +0 -1
  21. contexttrace-0.2.0/contexttrace/verify/benchmark.py +0 -229
  22. {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/__init__.py +0 -0
  23. {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/client.py +0 -0
  24. {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/config.py +0 -0
  25. {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/demo.py +0 -0
  26. {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/demo_data.py +0 -0
  27. {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/endpoint_eval.py +0 -0
  28. {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/errors.py +0 -0
  29. {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/evaluator.py +0 -0
  30. {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/integrations/__init__.py +0 -0
  31. {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/integrations/fastapi.py +0 -0
  32. {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/integrations/langchain.py +0 -0
  33. {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/integrations/langgraph.py +0 -0
  34. {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/integrations/llamaindex.py +0 -0
  35. {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/integrations/opentelemetry.py +0 -0
  36. {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/local.py +0 -0
  37. {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/py.typed +0 -0
  38. {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/regression.py +0 -0
  39. {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/reliability.py +0 -0
  40. {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/report.py +0 -0
  41. {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/storage/__init__.py +0 -0
  42. {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/storage/sqlite_store.py +0 -0
  43. {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/thresholds.py +0 -0
  44. {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/transport.py +0 -0
  45. {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/verify/__init__.py +0 -0
  46. {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/verify/abstention.py +0 -0
  47. {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/verify/demos.py +0 -0
  48. {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/verify/schema.py +0 -0
  49. {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/viewer.py +0 -0
  50. {contexttrace-0.2.0 → contexttrace-0.3.0}/setup.cfg +0 -0
  51. {contexttrace-0.2.0 → contexttrace-0.3.0}/setup.py +0 -0
@@ -3,6 +3,7 @@ include pyproject.toml
3
3
  include setup.py
4
4
  include contexttrace/py.typed
5
5
  recursive-include contexttrace *.py
6
+ recursive-include contexttrace/verify *.json
6
7
  prune build
7
8
  prune dist
8
9
  prune tests
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: contexttrace
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: Local-first SDK and CLI for RAG and agent reliability tracing, citation checks, and failure diagnosis.
5
5
  Author: ContextTrace contributors
6
6
  License: MIT
@@ -145,17 +145,21 @@ contexttrace verify trace.json --report --out reports/example.html
145
145
  contexttrace verify trace.json --mode semantic
146
146
  contexttrace verify trace.json --fail-on unsupported --fail-on citation_mismatch
147
147
  contexttrace verify-benchmark --mode semantic
148
+ contexttrace verify-benchmark --mode semantic --report
149
+ contexttrace verify-benchmark --case-set external --mode semantic --report
148
150
  ```
149
151
 
150
152
  Input requires `query`, `answer`, and `contexts` with `id` and `text`. Optional `citations` are checked to catch cited sources that do not actually support the matched claim.
151
153
 
152
154
  `verify-demo` uses bundled demo traces, so it works immediately after `pip install contexttrace`. Available demos include `unsupported_claim`, `partial_support`, `citation_mismatch`, `should_abstain`, and `supported_answer`.
153
155
 
154
- Use `--mode semantic` for local paraphrase-aware matching, and `verify-benchmark` to inspect bundled precision/recall metrics.
156
+ Use `--mode semantic` for local paraphrase-aware matching, and `verify-benchmark` to inspect bundled precision/recall metrics. The default benchmark includes 32 real ContextTrace docs and release-artifact cases. `--case-set external` adds public OSS documentation and GitHub issue cases from Qdrant, Chroma, Haystack, and LangChain, while `--case-set all` runs both packs. `--report` writes an HTML report with misses to inspect.
155
157
 
156
- ContextTrace verifies whether each generated claim is actually supported by retrieved evidence. Instead of only showing a trace or a score, it tells you where the evidence chain broke: unsupported claim, citation mismatch, insufficient context, or should-have-abstained.
158
+ Verification output includes evidence span offsets, stable span hashes, multiple supporting spans, typed matched/missing facts, and claim-level root causes so partial support failures are easier to inspect.
157
159
 
158
- The v0.2.0 verifier uses local lexical heuristics by default. Claim extraction is rule-based, contradiction detection is conservative, and semantic or LLM-judge support can be added later.
160
+ ContextTrace verifies whether each generated claim is actually supported by retrieved evidence. Instead of only showing a trace or a score, it tells you where the evidence chain broke: unsupported claim, citation mismatch, retrieval miss, answer overreach, conflicting context, or should-have-abstained.
161
+
162
+ The v0.3.0 verifier uses local lexical heuristics by default. Claim extraction is rule-based, contradiction detection is conservative, and semantic or LLM-judge support can be added later.
159
163
 
160
164
  ## What It Catches
161
165
 
@@ -88,17 +88,21 @@ contexttrace verify trace.json --report --out reports/example.html
88
88
  contexttrace verify trace.json --mode semantic
89
89
  contexttrace verify trace.json --fail-on unsupported --fail-on citation_mismatch
90
90
  contexttrace verify-benchmark --mode semantic
91
+ contexttrace verify-benchmark --mode semantic --report
92
+ contexttrace verify-benchmark --case-set external --mode semantic --report
91
93
  ```
92
94
 
93
95
  Input requires `query`, `answer`, and `contexts` with `id` and `text`. Optional `citations` are checked to catch cited sources that do not actually support the matched claim.
94
96
 
95
97
  `verify-demo` uses bundled demo traces, so it works immediately after `pip install contexttrace`. Available demos include `unsupported_claim`, `partial_support`, `citation_mismatch`, `should_abstain`, and `supported_answer`.
96
98
 
97
- Use `--mode semantic` for local paraphrase-aware matching, and `verify-benchmark` to inspect bundled precision/recall metrics.
99
+ Use `--mode semantic` for local paraphrase-aware matching, and `verify-benchmark` to inspect bundled precision/recall metrics. The default benchmark includes 32 real ContextTrace docs and release-artifact cases. `--case-set external` adds public OSS documentation and GitHub issue cases from Qdrant, Chroma, Haystack, and LangChain, while `--case-set all` runs both packs. `--report` writes an HTML report with misses to inspect.
98
100
 
99
- ContextTrace verifies whether each generated claim is actually supported by retrieved evidence. Instead of only showing a trace or a score, it tells you where the evidence chain broke: unsupported claim, citation mismatch, insufficient context, or should-have-abstained.
101
+ Verification output includes evidence span offsets, stable span hashes, multiple supporting spans, typed matched/missing facts, and claim-level root causes so partial support failures are easier to inspect.
100
102
 
101
- The v0.2.0 verifier uses local lexical heuristics by default. Claim extraction is rule-based, contradiction detection is conservative, and semantic or LLM-judge support can be added later.
103
+ ContextTrace verifies whether each generated claim is actually supported by retrieved evidence. Instead of only showing a trace or a score, it tells you where the evidence chain broke: unsupported claim, citation mismatch, retrieval miss, answer overreach, conflicting context, or should-have-abstained.
104
+
105
+ The v0.3.0 verifier uses local lexical heuristics by default. Claim extraction is rule-based, contradiction detection is conservative, and semantic or LLM-judge support can be added later.
102
106
 
103
107
  ## What It Catches
104
108
 
@@ -0,0 +1 @@
1
+ __version__ = "0.3.0"
@@ -29,7 +29,7 @@ from contexttrace.verify import (
29
29
  load_verify_demo,
30
30
  verify_trace,
31
31
  )
32
- from contexttrace.verify.benchmark import run_verify_benchmark
32
+ from contexttrace.verify.benchmark import run_verify_benchmark, write_verify_benchmark_report
33
33
  from contexttrace.verify.report import VerifyReportGenerator
34
34
  from contexttrace.viewer import serve_viewer
35
35
 
@@ -288,18 +288,31 @@ def verify_demo_command(
288
288
 
289
289
  @cli.command("verify-benchmark")
290
290
  @click.option("--mode", default="lexical", show_default=True, type=click.Choice(["lexical", "semantic"]), help="Evidence scoring mode.")
291
+ @click.option("--case-set", default="contexttrace", show_default=True, type=click.Choice(["contexttrace", "external", "all"]), help="Benchmark case set to run.")
291
292
  @click.option("--json", "json_output", is_flag=True, help="Print benchmark results as JSON.")
292
- def verify_benchmark_command(mode: str, json_output: bool) -> int:
293
+ @click.option("--report", is_flag=True, help="Generate a local HTML benchmark report.")
294
+ @click.option("--out", default=None, help="HTML benchmark report path. Implies --report when provided.")
295
+ def verify_benchmark_command(mode: str, case_set: str, json_output: bool, report: bool, out: Optional[str]) -> int:
293
296
  """Run the bundled verification precision/recall benchmark."""
294
297
 
295
- result = run_verify_benchmark(mode=mode)
298
+ result = run_verify_benchmark(mode=mode, case_set=case_set)
299
+ written_report = None
300
+ if report or out:
301
+ output_path = out or str(Path(".contexttrace") / "reports" / ("verify_benchmark_%s.html" % mode))
302
+ written_report = write_verify_benchmark_report(result, path=output_path)
296
303
  if json_output:
304
+ if written_report:
305
+ click.echo("Report: %s" % written_report, err=True)
297
306
  click.echo(json.dumps(result, indent=2))
298
307
  return 0
299
308
 
300
309
  click.echo("Mode: %s" % result["mode"])
310
+ click.echo("Case source: %s" % result["case_source"])
301
311
  click.echo("Cases: %s" % result["cases"])
302
312
  click.echo("Exact match rate: %.3f" % float(result["exact_match_rate"]))
313
+ click.echo("Verdict match rate: %.3f" % float(result["verdict_match_rate"]))
314
+ click.echo("Citation match rate: %.3f" % float(result["citation_match_rate"]))
315
+ click.echo("Abstention match rate: %.3f" % float(result["abstention_match_rate"]))
303
316
  click.echo("label\tprecision\trecall\tf1\ttp\tfp\tfn")
304
317
  for label, metrics in result["per_label"].items():
305
318
  click.echo(
@@ -322,6 +335,8 @@ def verify_benchmark_command(mode: str, json_output: bool) -> int:
322
335
  "- %s expected=%s predicted=%s"
323
336
  % (row["id"], ",".join(row["expected"]), ",".join(row["predicted"]))
324
337
  )
338
+ if written_report:
339
+ click.echo("Report: %s" % written_report)
325
340
  return 0
326
341
 
327
342
 
@@ -365,6 +380,7 @@ def _print_verify_result(
365
380
  click.echo("Unsupported claim rate: %.3f" % float(summary["unsupported_claim_rate"]))
366
381
  click.echo("Citation mismatches: %s" % summary["citation_mismatches"])
367
382
  click.echo("Failure type: %s" % summary["failure_type"])
383
+ click.echo("Primary root cause: %s" % summary.get("primary_root_cause", "unknown"))
368
384
  click.echo("Should abstain: %s" % str(summary["should_abstain"]).lower())
369
385
  click.echo("Suggested fix: %s" % summary["suggested_fix"])
370
386
  if written_report: