contexttrace 0.2.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {contexttrace-0.2.0 → contexttrace-0.3.0}/MANIFEST.in +1 -0
- {contexttrace-0.2.0 → contexttrace-0.3.0}/PKG-INFO +8 -4
- {contexttrace-0.2.0 → contexttrace-0.3.0}/README.md +7 -3
- contexttrace-0.3.0/contexttrace/_version.py +1 -0
- {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/cli.py +19 -3
- contexttrace-0.3.0/contexttrace/verify/benchmark.py +574 -0
- {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/verify/citations.py +19 -1
- {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/verify/claims.py +62 -2
- {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/verify/evidence.py +113 -10
- contexttrace-0.3.0/contexttrace/verify/external_benchmark_cases.json +311 -0
- contexttrace-0.3.0/contexttrace/verify/facts.py +387 -0
- contexttrace-0.3.0/contexttrace/verify/real_benchmark_cases.json +713 -0
- {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/verify/report.py +122 -1
- contexttrace-0.3.0/contexttrace/verify/root_cause.py +218 -0
- {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/verify/runner.py +11 -1
- contexttrace-0.3.0/contexttrace/verify/spans.py +103 -0
- {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/verify/verdicts.py +82 -2
- {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace.egg-info/SOURCES.txt +5 -0
- {contexttrace-0.2.0 → contexttrace-0.3.0}/pyproject.toml +2 -2
- contexttrace-0.2.0/contexttrace/_version.py +0 -1
- contexttrace-0.2.0/contexttrace/verify/benchmark.py +0 -229
- {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/__init__.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/client.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/config.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/demo.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/demo_data.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/endpoint_eval.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/errors.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/evaluator.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/integrations/__init__.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/integrations/fastapi.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/integrations/langchain.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/integrations/langgraph.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/integrations/llamaindex.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/integrations/opentelemetry.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/local.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/py.typed +0 -0
- {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/regression.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/reliability.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/report.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/storage/__init__.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/storage/sqlite_store.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/thresholds.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/transport.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/verify/__init__.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/verify/abstention.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/verify/demos.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/verify/schema.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.3.0}/contexttrace/viewer.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.3.0}/setup.cfg +0 -0
- {contexttrace-0.2.0 → contexttrace-0.3.0}/setup.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: contexttrace
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Local-first SDK and CLI for RAG and agent reliability tracing, citation checks, and failure diagnosis.
|
|
5
5
|
Author: ContextTrace contributors
|
|
6
6
|
License: MIT
|
|
@@ -145,17 +145,21 @@ contexttrace verify trace.json --report --out reports/example.html
|
|
|
145
145
|
contexttrace verify trace.json --mode semantic
|
|
146
146
|
contexttrace verify trace.json --fail-on unsupported --fail-on citation_mismatch
|
|
147
147
|
contexttrace verify-benchmark --mode semantic
|
|
148
|
+
contexttrace verify-benchmark --mode semantic --report
|
|
149
|
+
contexttrace verify-benchmark --case-set external --mode semantic --report
|
|
148
150
|
```
|
|
149
151
|
|
|
150
152
|
Input requires `query`, `answer`, and `contexts` with `id` and `text`. Optional `citations` are checked to catch cited sources that do not actually support the matched claim.
|
|
151
153
|
|
|
152
154
|
`verify-demo` uses bundled demo traces, so it works immediately after `pip install contexttrace`. Available demos include `unsupported_claim`, `partial_support`, `citation_mismatch`, `should_abstain`, and `supported_answer`.
|
|
153
155
|
|
|
154
|
-
Use `--mode semantic` for local paraphrase-aware matching, and `verify-benchmark` to inspect bundled precision/recall metrics.
|
|
156
|
+
Use `--mode semantic` for local paraphrase-aware matching, and `verify-benchmark` to inspect bundled precision/recall metrics. The default benchmark includes 32 real ContextTrace docs and release-artifact cases. `--case-set external` adds public OSS documentation and GitHub issue cases from Qdrant, Chroma, Haystack, and LangChain, while `--case-set all` runs both packs. `--report` writes an HTML report with misses to inspect.
|
|
155
157
|
|
|
156
|
-
|
|
158
|
+
Verification output includes evidence span offsets, stable span hashes, multiple supporting spans, typed matched/missing facts, and claim-level root causes so partial support failures are easier to inspect.
|
|
157
159
|
|
|
158
|
-
|
|
160
|
+
ContextTrace verifies whether each generated claim is actually supported by retrieved evidence. Instead of only showing a trace or a score, it tells you where the evidence chain broke: unsupported claim, citation mismatch, retrieval miss, answer overreach, conflicting context, or should-have-abstained.
|
|
161
|
+
|
|
162
|
+
The v0.3.0 verifier uses local lexical heuristics by default. Claim extraction is rule-based, contradiction detection is conservative, and semantic or LLM-judge support can be added later.
|
|
159
163
|
|
|
160
164
|
## What It Catches
|
|
161
165
|
|
|
@@ -88,17 +88,21 @@ contexttrace verify trace.json --report --out reports/example.html
|
|
|
88
88
|
contexttrace verify trace.json --mode semantic
|
|
89
89
|
contexttrace verify trace.json --fail-on unsupported --fail-on citation_mismatch
|
|
90
90
|
contexttrace verify-benchmark --mode semantic
|
|
91
|
+
contexttrace verify-benchmark --mode semantic --report
|
|
92
|
+
contexttrace verify-benchmark --case-set external --mode semantic --report
|
|
91
93
|
```
|
|
92
94
|
|
|
93
95
|
Input requires `query`, `answer`, and `contexts` with `id` and `text`. Optional `citations` are checked to catch cited sources that do not actually support the matched claim.
|
|
94
96
|
|
|
95
97
|
`verify-demo` uses bundled demo traces, so it works immediately after `pip install contexttrace`. Available demos include `unsupported_claim`, `partial_support`, `citation_mismatch`, `should_abstain`, and `supported_answer`.
|
|
96
98
|
|
|
97
|
-
Use `--mode semantic` for local paraphrase-aware matching, and `verify-benchmark` to inspect bundled precision/recall metrics.
|
|
99
|
+
Use `--mode semantic` for local paraphrase-aware matching, and `verify-benchmark` to inspect bundled precision/recall metrics. The default benchmark includes 32 real ContextTrace docs and release-artifact cases. `--case-set external` adds public OSS documentation and GitHub issue cases from Qdrant, Chroma, Haystack, and LangChain, while `--case-set all` runs both packs. `--report` writes an HTML report with misses to inspect.
|
|
98
100
|
|
|
99
|
-
|
|
101
|
+
Verification output includes evidence span offsets, stable span hashes, multiple supporting spans, typed matched/missing facts, and claim-level root causes so partial support failures are easier to inspect.
|
|
100
102
|
|
|
101
|
-
|
|
103
|
+
ContextTrace verifies whether each generated claim is actually supported by retrieved evidence. Instead of only showing a trace or a score, it tells you where the evidence chain broke: unsupported claim, citation mismatch, retrieval miss, answer overreach, conflicting context, or should-have-abstained.
|
|
104
|
+
|
|
105
|
+
The v0.3.0 verifier uses local lexical heuristics by default. Claim extraction is rule-based, contradiction detection is conservative, and semantic or LLM-judge support can be added later.
|
|
102
106
|
|
|
103
107
|
## What It Catches
|
|
104
108
|
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.3.0"
|
|
@@ -29,7 +29,7 @@ from contexttrace.verify import (
|
|
|
29
29
|
load_verify_demo,
|
|
30
30
|
verify_trace,
|
|
31
31
|
)
|
|
32
|
-
from contexttrace.verify.benchmark import run_verify_benchmark
|
|
32
|
+
from contexttrace.verify.benchmark import run_verify_benchmark, write_verify_benchmark_report
|
|
33
33
|
from contexttrace.verify.report import VerifyReportGenerator
|
|
34
34
|
from contexttrace.viewer import serve_viewer
|
|
35
35
|
|
|
@@ -288,18 +288,31 @@ def verify_demo_command(
|
|
|
288
288
|
|
|
289
289
|
@cli.command("verify-benchmark")
|
|
290
290
|
@click.option("--mode", default="lexical", show_default=True, type=click.Choice(["lexical", "semantic"]), help="Evidence scoring mode.")
|
|
291
|
+
@click.option("--case-set", default="contexttrace", show_default=True, type=click.Choice(["contexttrace", "external", "all"]), help="Benchmark case set to run.")
|
|
291
292
|
@click.option("--json", "json_output", is_flag=True, help="Print benchmark results as JSON.")
|
|
292
|
-
def verify_benchmark_command(mode: str, json_output: bool) -> int:
|
|
293
|
+
@click.option("--report", is_flag=True, help="Generate a local HTML benchmark report.")
|
|
294
|
+
@click.option("--out", default=None, help="HTML benchmark report path. Implies --report when provided.")
|
|
295
|
+
def verify_benchmark_command(mode: str, case_set: str, json_output: bool, report: bool, out: Optional[str]) -> int:
|
|
293
296
|
"""Run the bundled verification precision/recall benchmark."""
|
|
294
297
|
|
|
295
|
-
result = run_verify_benchmark(mode=mode)
|
|
298
|
+
result = run_verify_benchmark(mode=mode, case_set=case_set)
|
|
299
|
+
written_report = None
|
|
300
|
+
if report or out:
|
|
301
|
+
output_path = out or str(Path(".contexttrace") / "reports" / ("verify_benchmark_%s.html" % mode))
|
|
302
|
+
written_report = write_verify_benchmark_report(result, path=output_path)
|
|
296
303
|
if json_output:
|
|
304
|
+
if written_report:
|
|
305
|
+
click.echo("Report: %s" % written_report, err=True)
|
|
297
306
|
click.echo(json.dumps(result, indent=2))
|
|
298
307
|
return 0
|
|
299
308
|
|
|
300
309
|
click.echo("Mode: %s" % result["mode"])
|
|
310
|
+
click.echo("Case source: %s" % result["case_source"])
|
|
301
311
|
click.echo("Cases: %s" % result["cases"])
|
|
302
312
|
click.echo("Exact match rate: %.3f" % float(result["exact_match_rate"]))
|
|
313
|
+
click.echo("Verdict match rate: %.3f" % float(result["verdict_match_rate"]))
|
|
314
|
+
click.echo("Citation match rate: %.3f" % float(result["citation_match_rate"]))
|
|
315
|
+
click.echo("Abstention match rate: %.3f" % float(result["abstention_match_rate"]))
|
|
303
316
|
click.echo("label\tprecision\trecall\tf1\ttp\tfp\tfn")
|
|
304
317
|
for label, metrics in result["per_label"].items():
|
|
305
318
|
click.echo(
|
|
@@ -322,6 +335,8 @@ def verify_benchmark_command(mode: str, json_output: bool) -> int:
|
|
|
322
335
|
"- %s expected=%s predicted=%s"
|
|
323
336
|
% (row["id"], ",".join(row["expected"]), ",".join(row["predicted"]))
|
|
324
337
|
)
|
|
338
|
+
if written_report:
|
|
339
|
+
click.echo("Report: %s" % written_report)
|
|
325
340
|
return 0
|
|
326
341
|
|
|
327
342
|
|
|
@@ -365,6 +380,7 @@ def _print_verify_result(
|
|
|
365
380
|
click.echo("Unsupported claim rate: %.3f" % float(summary["unsupported_claim_rate"]))
|
|
366
381
|
click.echo("Citation mismatches: %s" % summary["citation_mismatches"])
|
|
367
382
|
click.echo("Failure type: %s" % summary["failure_type"])
|
|
383
|
+
click.echo("Primary root cause: %s" % summary.get("primary_root_cause", "unknown"))
|
|
368
384
|
click.echo("Should abstain: %s" % str(summary["should_abstain"]).lower())
|
|
369
385
|
click.echo("Suggested fix: %s" % summary["suggested_fix"])
|
|
370
386
|
if written_report:
|