contexttrace 0.2.0__tar.gz → 0.4.0__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
Files changed (53)
  1. {contexttrace-0.2.0 → contexttrace-0.4.0}/MANIFEST.in +1 -0
  2. {contexttrace-0.2.0 → contexttrace-0.4.0}/PKG-INFO +13 -4
  3. {contexttrace-0.2.0 → contexttrace-0.4.0}/README.md +12 -3
  4. contexttrace-0.4.0/contexttrace/_version.py +1 -0
  5. {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/cli.py +83 -3
  6. {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/verify/__init__.py +4 -0
  7. contexttrace-0.4.0/contexttrace/verify/benchmark.py +574 -0
  8. {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/verify/citations.py +19 -1
  9. {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/verify/claims.py +62 -2
  10. contexttrace-0.4.0/contexttrace/verify/compare.py +445 -0
  11. contexttrace-0.4.0/contexttrace/verify/compare_report.py +386 -0
  12. {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/verify/evidence.py +113 -10
  13. contexttrace-0.4.0/contexttrace/verify/external_benchmark_cases.json +311 -0
  14. contexttrace-0.4.0/contexttrace/verify/facts.py +387 -0
  15. contexttrace-0.4.0/contexttrace/verify/real_benchmark_cases.json +713 -0
  16. {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/verify/report.py +122 -1
  17. contexttrace-0.4.0/contexttrace/verify/root_cause.py +218 -0
  18. {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/verify/runner.py +11 -1
  19. contexttrace-0.4.0/contexttrace/verify/spans.py +103 -0
  20. {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/verify/verdicts.py +82 -2
  21. {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace.egg-info/SOURCES.txt +7 -0
  22. {contexttrace-0.2.0 → contexttrace-0.4.0}/pyproject.toml +2 -2
  23. contexttrace-0.2.0/contexttrace/_version.py +0 -1
  24. contexttrace-0.2.0/contexttrace/verify/benchmark.py +0 -229
  25. {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/__init__.py +0 -0
  26. {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/client.py +0 -0
  27. {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/config.py +0 -0
  28. {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/demo.py +0 -0
  29. {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/demo_data.py +0 -0
  30. {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/endpoint_eval.py +0 -0
  31. {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/errors.py +0 -0
  32. {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/evaluator.py +0 -0
  33. {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/integrations/__init__.py +0 -0
  34. {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/integrations/fastapi.py +0 -0
  35. {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/integrations/langchain.py +0 -0
  36. {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/integrations/langgraph.py +0 -0
  37. {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/integrations/llamaindex.py +0 -0
  38. {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/integrations/opentelemetry.py +0 -0
  39. {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/local.py +0 -0
  40. {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/py.typed +0 -0
  41. {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/regression.py +0 -0
  42. {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/reliability.py +0 -0
  43. {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/report.py +0 -0
  44. {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/storage/__init__.py +0 -0
  45. {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/storage/sqlite_store.py +0 -0
  46. {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/thresholds.py +0 -0
  47. {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/transport.py +0 -0
  48. {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/verify/abstention.py +0 -0
  49. {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/verify/demos.py +0 -0
  50. {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/verify/schema.py +0 -0
  51. {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/viewer.py +0 -0
  52. {contexttrace-0.2.0 → contexttrace-0.4.0}/setup.cfg +0 -0
  53. {contexttrace-0.2.0 → contexttrace-0.4.0}/setup.py +0 -0
@@ -3,6 +3,7 @@ include pyproject.toml
3
3
  include setup.py
4
4
  include contexttrace/py.typed
5
5
  recursive-include contexttrace *.py
6
+ recursive-include contexttrace/verify *.json
6
7
  prune build
7
8
  prune dist
8
9
  prune tests
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: contexttrace
3
- Version: 0.2.0
3
+ Version: 0.4.0
4
4
  Summary: Local-first SDK and CLI for RAG and agent reliability tracing, citation checks, and failure diagnosis.
5
5
  Author: ContextTrace contributors
6
6
  License: MIT
@@ -145,17 +145,26 @@ contexttrace verify trace.json --report --out reports/example.html
145
145
  contexttrace verify trace.json --mode semantic
146
146
  contexttrace verify trace.json --fail-on unsupported --fail-on citation_mismatch
147
147
  contexttrace verify-benchmark --mode semantic
148
+ contexttrace verify-benchmark --mode semantic --report
149
+ contexttrace verify-benchmark --case-set external --mode semantic --report
150
+ contexttrace compare baseline.json current.json
151
+ contexttrace compare baseline.json current.json --report
152
+ contexttrace compare baseline.json current.json --fail-on new_failure
148
153
  ```
149
154
 
150
155
  Input requires `query`, `answer`, and `contexts` with `id` and `text`. Optional `citations` are checked to catch cited sources that do not actually support the matched claim.
151
156
 
152
157
  `verify-demo` uses bundled demo traces, so it works immediately after `pip install contexttrace`. Available demos include `unsupported_claim`, `partial_support`, `citation_mismatch`, `should_abstain`, and `supported_answer`.
153
158
 
154
- Use `--mode semantic` for local paraphrase-aware matching, and `verify-benchmark` to inspect bundled precision/recall metrics.
159
+ Use `--mode semantic` for local paraphrase-aware matching, and `verify-benchmark` to inspect bundled precision/recall metrics. The default benchmark includes 32 real ContextTrace docs and release-artifact cases. `--case-set external` adds public OSS documentation and GitHub issue cases from Qdrant, Chroma, Haystack, and LangChain, while `--case-set all` runs both packs. `--report` writes an HTML report with misses to inspect.
155
160
 
156
- ContextTrace verifies whether each generated claim is actually supported by retrieved evidence. Instead of only showing a trace or a score, it tells you where the evidence chain broke: unsupported claim, citation mismatch, insufficient context, or should-have-abstained.
161
+ Verification output includes evidence span offsets, stable span hashes, multiple supporting spans, typed matched/missing facts, and claim-level root causes so partial support failures are easier to inspect.
157
162
 
158
- The v0.2.0 verifier uses local lexical heuristics by default. Claim extraction is rule-based, contradiction detection is conservative, and semantic or LLM-judge support can be added later.
163
+ ContextTrace verifies whether each generated claim is actually supported by retrieved evidence. Instead of only showing a trace or a score, it tells you where the evidence chain broke: unsupported claim, citation mismatch, retrieval miss, answer overreach, conflicting context, or should-have-abstained.
164
+
165
+ Use `contexttrace compare baseline.json current.json` to diff two portable traces or saved `verify --json` outputs. It reports support-rate deltas, new unsupported claims, citation regressions, should-abstain flips, and new root causes, with `--fail-on` gates for CI.
166
+
167
+ The v0.4.0 verifier uses local lexical heuristics by default. Claim extraction is rule-based, contradiction detection is conservative, and semantic or LLM-judge support can be added later.
159
168
 
160
169
  ## What It Catches
161
170
 
@@ -88,17 +88,26 @@ contexttrace verify trace.json --report --out reports/example.html
88
88
  contexttrace verify trace.json --mode semantic
89
89
  contexttrace verify trace.json --fail-on unsupported --fail-on citation_mismatch
90
90
  contexttrace verify-benchmark --mode semantic
91
+ contexttrace verify-benchmark --mode semantic --report
92
+ contexttrace verify-benchmark --case-set external --mode semantic --report
93
+ contexttrace compare baseline.json current.json
94
+ contexttrace compare baseline.json current.json --report
95
+ contexttrace compare baseline.json current.json --fail-on new_failure
91
96
  ```
92
97
 
93
98
  Input requires `query`, `answer`, and `contexts` with `id` and `text`. Optional `citations` are checked to catch cited sources that do not actually support the matched claim.
94
99
 
95
100
  `verify-demo` uses bundled demo traces, so it works immediately after `pip install contexttrace`. Available demos include `unsupported_claim`, `partial_support`, `citation_mismatch`, `should_abstain`, and `supported_answer`.
96
101
 
97
- Use `--mode semantic` for local paraphrase-aware matching, and `verify-benchmark` to inspect bundled precision/recall metrics.
102
+ Use `--mode semantic` for local paraphrase-aware matching, and `verify-benchmark` to inspect bundled precision/recall metrics. The default benchmark includes 32 real ContextTrace docs and release-artifact cases. `--case-set external` adds public OSS documentation and GitHub issue cases from Qdrant, Chroma, Haystack, and LangChain, while `--case-set all` runs both packs. `--report` writes an HTML report with misses to inspect.
98
103
 
99
- ContextTrace verifies whether each generated claim is actually supported by retrieved evidence. Instead of only showing a trace or a score, it tells you where the evidence chain broke: unsupported claim, citation mismatch, insufficient context, or should-have-abstained.
104
+ Verification output includes evidence span offsets, stable span hashes, multiple supporting spans, typed matched/missing facts, and claim-level root causes so partial support failures are easier to inspect.
100
105
 
101
- The v0.2.0 verifier uses local lexical heuristics by default. Claim extraction is rule-based, contradiction detection is conservative, and semantic or LLM-judge support can be added later.
106
+ ContextTrace verifies whether each generated claim is actually supported by retrieved evidence. Instead of only showing a trace or a score, it tells you where the evidence chain broke: unsupported claim, citation mismatch, retrieval miss, answer overreach, conflicting context, or should-have-abstained.
107
+
108
+ Use `contexttrace compare baseline.json current.json` to diff two portable traces or saved `verify --json` outputs. It reports support-rate deltas, new unsupported claims, citation regressions, should-abstain flips, and new root causes, with `--fail-on` gates for CI.
109
+
110
+ The v0.4.0 verifier uses local lexical heuristics by default. Claim extraction is rule-based, contradiction detection is conservative, and semantic or LLM-judge support can be added later.
102
111
 
103
112
  ## What It Catches
104
113
 
@@ -0,0 +1 @@
1
+ __version__ = "0.4.0"
@@ -24,12 +24,15 @@ from contexttrace.storage import SQLiteTraceStore
24
24
  from contexttrace.thresholds import parse_thresholds, threshold_failures
25
25
  from contexttrace.verify import (
26
26
  VerificationInputError,
27
+ compare_failures,
28
+ compare_trace_files,
27
29
  list_verify_demos,
28
30
  load_trace_file,
29
31
  load_verify_demo,
30
32
  verify_trace,
31
33
  )
32
- from contexttrace.verify.benchmark import run_verify_benchmark
34
+ from contexttrace.verify.benchmark import run_verify_benchmark, write_verify_benchmark_report
35
+ from contexttrace.verify.compare_report import CompareReportGenerator
33
36
  from contexttrace.verify.report import VerifyReportGenerator
34
37
  from contexttrace.viewer import serve_viewer
35
38
 
@@ -288,18 +291,31 @@ def verify_demo_command(
288
291
 
289
292
  @cli.command("verify-benchmark")
290
293
  @click.option("--mode", default="lexical", show_default=True, type=click.Choice(["lexical", "semantic"]), help="Evidence scoring mode.")
294
+ @click.option("--case-set", default="contexttrace", show_default=True, type=click.Choice(["contexttrace", "external", "all"]), help="Benchmark case set to run.")
291
295
  @click.option("--json", "json_output", is_flag=True, help="Print benchmark results as JSON.")
292
- def verify_benchmark_command(mode: str, json_output: bool) -> int:
296
+ @click.option("--report", is_flag=True, help="Generate a local HTML benchmark report.")
297
+ @click.option("--out", default=None, help="HTML benchmark report path. Implies --report when provided.")
298
+ def verify_benchmark_command(mode: str, case_set: str, json_output: bool, report: bool, out: Optional[str]) -> int:
293
299
  """Run the bundled verification precision/recall benchmark."""
294
300
 
295
- result = run_verify_benchmark(mode=mode)
301
+ result = run_verify_benchmark(mode=mode, case_set=case_set)
302
+ written_report = None
303
+ if report or out:
304
+ output_path = out or str(Path(".contexttrace") / "reports" / ("verify_benchmark_%s.html" % mode))
305
+ written_report = write_verify_benchmark_report(result, path=output_path)
296
306
  if json_output:
307
+ if written_report:
308
+ click.echo("Report: %s" % written_report, err=True)
297
309
  click.echo(json.dumps(result, indent=2))
298
310
  return 0
299
311
 
300
312
  click.echo("Mode: %s" % result["mode"])
313
+ click.echo("Case source: %s" % result["case_source"])
301
314
  click.echo("Cases: %s" % result["cases"])
302
315
  click.echo("Exact match rate: %.3f" % float(result["exact_match_rate"]))
316
+ click.echo("Verdict match rate: %.3f" % float(result["verdict_match_rate"]))
317
+ click.echo("Citation match rate: %.3f" % float(result["citation_match_rate"]))
318
+ click.echo("Abstention match rate: %.3f" % float(result["abstention_match_rate"]))
303
319
  click.echo("label\tprecision\trecall\tf1\ttp\tfp\tfn")
304
320
  for label, metrics in result["per_label"].items():
305
321
  click.echo(
@@ -322,9 +338,72 @@ def verify_benchmark_command(mode: str, json_output: bool) -> int:
322
338
  "- %s expected=%s predicted=%s"
323
339
  % (row["id"], ",".join(row["expected"]), ",".join(row["predicted"]))
324
340
  )
341
+ if written_report:
342
+ click.echo("Report: %s" % written_report)
325
343
  return 0
326
344
 
327
345
 
346
+ @cli.command("compare")
347
+ @click.argument("baseline_json")
348
+ @click.argument("current_json")
349
+ @click.option("--json", "json_output", is_flag=True, help="Print the full comparison result as JSON.")
350
+ @click.option("--report", is_flag=True, help="Generate a local HTML regression report.")
351
+ @click.option("--out", default=None, help="HTML report path. Implies --report when provided.")
352
+ @click.option("--mode", default="lexical", show_default=True, type=click.Choice(["lexical", "semantic"]), help="Evidence scoring mode for raw trace inputs.")
353
+ @click.option("--fail-on", multiple=True, help="Fail on new_failure, new_unsupported, new_citation_mismatch, should_abstain_flip, support_rate_drop, new_root_cause, or any_regression.")
354
+ def compare_command(
355
+ baseline_json: str,
356
+ current_json: str,
357
+ json_output: bool,
358
+ report: bool,
359
+ out: Optional[str],
360
+ mode: str,
361
+ fail_on: tuple[str, ...],
362
+ ) -> int:
363
+ """Compare two portable RAG traces or verification JSON outputs."""
364
+
365
+ try:
366
+ result = compare_trace_files(baseline_json, current_json, mode=mode)
367
+ except VerificationInputError as exc:
368
+ raise click.ClickException(str(exc)) from exc
369
+
370
+ written_report = None
371
+ if report or out:
372
+ default_name = "%s_vs_%s_compare.html" % (Path(baseline_json).stem, Path(current_json).stem)
373
+ output_path = out or str(Path(".contexttrace") / "reports" / default_name)
374
+ written_report = CompareReportGenerator().generate(result, path=output_path)
375
+
376
+ fail_messages = compare_failures(result, fail_on)
377
+ if json_output:
378
+ if written_report:
379
+ click.echo("Report: %s" % written_report, err=True)
380
+ click.echo(json.dumps(result, indent=2))
381
+ for message in fail_messages:
382
+ click.echo("Comparison failed: %s" % message, err=True)
383
+ return 1 if fail_messages else 0
384
+
385
+ summary = result["summary"]
386
+ click.echo("Regression: %s" % str(summary["regression"]).lower())
387
+ click.echo("Support rate: %.3f -> %.3f (%+.3f)" % (
388
+ float(summary.get("support_rate_before") or 0.0),
389
+ float(summary.get("support_rate_after") or 0.0),
390
+ float(summary.get("support_rate_delta") or 0.0),
391
+ ))
392
+ click.echo("Unsupported claim rate delta: %+.3f" % float(summary.get("unsupported_claim_rate_delta") or 0.0))
393
+ click.echo("Citation mismatch delta: %+d" % int(summary.get("citation_mismatch_delta") or 0))
394
+ click.echo("New failures: %s" % summary["new_failures"])
395
+ click.echo("Resolved failures: %s" % summary["resolved_failures"])
396
+ click.echo("Added claims: %s" % summary["added_claims"])
397
+ click.echo("Removed claims: %s" % summary["removed_claims"])
398
+ click.echo("Changed claims: %s" % summary["changed_claims"])
399
+ click.echo("New root causes: %s" % (", ".join(summary.get("new_root_causes") or []) or "none"))
400
+ if written_report:
401
+ click.echo("Report: %s" % written_report)
402
+ for message in fail_messages:
403
+ click.echo("Comparison failed: %s" % message, err=True)
404
+ return 1 if fail_messages else 0
405
+
406
+
328
407
  def _write_verify_report(
329
408
  result: dict,
330
409
  trace: object,
@@ -365,6 +444,7 @@ def _print_verify_result(
365
444
  click.echo("Unsupported claim rate: %.3f" % float(summary["unsupported_claim_rate"]))
366
445
  click.echo("Citation mismatches: %s" % summary["citation_mismatches"])
367
446
  click.echo("Failure type: %s" % summary["failure_type"])
447
+ click.echo("Primary root cause: %s" % summary.get("primary_root_cause", "unknown"))
368
448
  click.echo("Should abstain: %s" % str(summary["should_abstain"]).lower())
369
449
  click.echo("Suggested fix: %s" % summary["suggested_fix"])
370
450
  if written_report:
@@ -1,4 +1,5 @@
1
1
  from contexttrace.verify.runner import verify_trace, verify_trace_file
2
+ from contexttrace.verify.compare import compare_failures, compare_trace_files, compare_verifications
2
3
  from contexttrace.verify.schema import (
3
4
  RAGTrace,
4
5
  TraceCitation,
@@ -13,6 +14,9 @@ __all__ = [
13
14
  "TraceCitation",
14
15
  "TraceContext",
15
16
  "VerificationInputError",
17
+ "compare_failures",
18
+ "compare_trace_files",
19
+ "compare_verifications",
16
20
  "list_verify_demos",
17
21
  "load_trace_file",
18
22
  "load_verify_demo",