contexttrace 0.2.0__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {contexttrace-0.2.0 → contexttrace-0.4.0}/MANIFEST.in +1 -0
- {contexttrace-0.2.0 → contexttrace-0.4.0}/PKG-INFO +13 -4
- {contexttrace-0.2.0 → contexttrace-0.4.0}/README.md +12 -3
- contexttrace-0.4.0/contexttrace/_version.py +1 -0
- {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/cli.py +83 -3
- {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/verify/__init__.py +4 -0
- contexttrace-0.4.0/contexttrace/verify/benchmark.py +574 -0
- {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/verify/citations.py +19 -1
- {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/verify/claims.py +62 -2
- contexttrace-0.4.0/contexttrace/verify/compare.py +445 -0
- contexttrace-0.4.0/contexttrace/verify/compare_report.py +386 -0
- {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/verify/evidence.py +113 -10
- contexttrace-0.4.0/contexttrace/verify/external_benchmark_cases.json +311 -0
- contexttrace-0.4.0/contexttrace/verify/facts.py +387 -0
- contexttrace-0.4.0/contexttrace/verify/real_benchmark_cases.json +713 -0
- {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/verify/report.py +122 -1
- contexttrace-0.4.0/contexttrace/verify/root_cause.py +218 -0
- {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/verify/runner.py +11 -1
- contexttrace-0.4.0/contexttrace/verify/spans.py +103 -0
- {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/verify/verdicts.py +82 -2
- {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace.egg-info/SOURCES.txt +7 -0
- {contexttrace-0.2.0 → contexttrace-0.4.0}/pyproject.toml +2 -2
- contexttrace-0.2.0/contexttrace/_version.py +0 -1
- contexttrace-0.2.0/contexttrace/verify/benchmark.py +0 -229
- {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/__init__.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/client.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/config.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/demo.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/demo_data.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/endpoint_eval.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/errors.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/evaluator.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/integrations/__init__.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/integrations/fastapi.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/integrations/langchain.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/integrations/langgraph.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/integrations/llamaindex.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/integrations/opentelemetry.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/local.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/py.typed +0 -0
- {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/regression.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/reliability.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/report.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/storage/__init__.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/storage/sqlite_store.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/thresholds.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/transport.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/verify/abstention.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/verify/demos.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/verify/schema.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.4.0}/contexttrace/viewer.py +0 -0
- {contexttrace-0.2.0 → contexttrace-0.4.0}/setup.cfg +0 -0
- {contexttrace-0.2.0 → contexttrace-0.4.0}/setup.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: contexttrace
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: Local-first SDK and CLI for RAG and agent reliability tracing, citation checks, and failure diagnosis.
|
|
5
5
|
Author: ContextTrace contributors
|
|
6
6
|
License: MIT
|
|
@@ -145,17 +145,26 @@ contexttrace verify trace.json --report --out reports/example.html
|
|
|
145
145
|
contexttrace verify trace.json --mode semantic
|
|
146
146
|
contexttrace verify trace.json --fail-on unsupported --fail-on citation_mismatch
|
|
147
147
|
contexttrace verify-benchmark --mode semantic
|
|
148
|
+
contexttrace verify-benchmark --mode semantic --report
|
|
149
|
+
contexttrace verify-benchmark --case-set external --mode semantic --report
|
|
150
|
+
contexttrace compare baseline.json current.json
|
|
151
|
+
contexttrace compare baseline.json current.json --report
|
|
152
|
+
contexttrace compare baseline.json current.json --fail-on new_failure
|
|
148
153
|
```
|
|
149
154
|
|
|
150
155
|
Input requires `query`, `answer`, and `contexts` with `id` and `text`. Optional `citations` are checked to catch cited sources that do not actually support the matched claim.
|
|
151
156
|
|
|
152
157
|
`verify-demo` uses bundled demo traces, so it works immediately after `pip install contexttrace`. Available demos include `unsupported_claim`, `partial_support`, `citation_mismatch`, `should_abstain`, and `supported_answer`.
|
|
153
158
|
|
|
154
|
-
Use `--mode semantic` for local paraphrase-aware matching, and `verify-benchmark` to inspect bundled precision/recall metrics.
|
|
159
|
+
Use `--mode semantic` for local paraphrase-aware matching, and `verify-benchmark` to inspect bundled precision/recall metrics. The default benchmark includes 32 real ContextTrace docs and release-artifact cases. `--case-set external` adds public OSS documentation and GitHub issue cases from Qdrant, Chroma, Haystack, and LangChain, while `--case-set all` runs both packs. `--report` writes an HTML report with misses to inspect.
|
|
155
160
|
|
|
156
|
-
|
|
161
|
+
Verification output includes evidence span offsets, stable span hashes, multiple supporting spans, typed matched/missing facts, and claim-level root causes so partial support failures are easier to inspect.
|
|
157
162
|
|
|
158
|
-
|
|
163
|
+
ContextTrace verifies whether each generated claim is actually supported by retrieved evidence. Instead of only showing a trace or a score, it tells you where the evidence chain broke: unsupported claim, citation mismatch, retrieval miss, answer overreach, conflicting context, or should-have-abstained.
|
|
164
|
+
|
|
165
|
+
Use `contexttrace compare baseline.json current.json` to diff two portable traces or saved `verify --json` outputs. It reports support-rate deltas, new unsupported claims, citation regressions, should-abstain flips, and new root causes, with `--fail-on` gates for CI.
|
|
166
|
+
|
|
167
|
+
The v0.4.0 verifier uses local lexical heuristics by default. Claim extraction is rule-based, contradiction detection is conservative, and semantic or LLM-judge support can be added later.
|
|
159
168
|
|
|
160
169
|
## What It Catches
|
|
161
170
|
|
|
@@ -88,17 +88,26 @@ contexttrace verify trace.json --report --out reports/example.html
|
|
|
88
88
|
contexttrace verify trace.json --mode semantic
|
|
89
89
|
contexttrace verify trace.json --fail-on unsupported --fail-on citation_mismatch
|
|
90
90
|
contexttrace verify-benchmark --mode semantic
|
|
91
|
+
contexttrace verify-benchmark --mode semantic --report
|
|
92
|
+
contexttrace verify-benchmark --case-set external --mode semantic --report
|
|
93
|
+
contexttrace compare baseline.json current.json
|
|
94
|
+
contexttrace compare baseline.json current.json --report
|
|
95
|
+
contexttrace compare baseline.json current.json --fail-on new_failure
|
|
91
96
|
```
|
|
92
97
|
|
|
93
98
|
Input requires `query`, `answer`, and `contexts` with `id` and `text`. Optional `citations` are checked to catch cited sources that do not actually support the matched claim.
|
|
94
99
|
|
|
95
100
|
`verify-demo` uses bundled demo traces, so it works immediately after `pip install contexttrace`. Available demos include `unsupported_claim`, `partial_support`, `citation_mismatch`, `should_abstain`, and `supported_answer`.
|
|
96
101
|
|
|
97
|
-
Use `--mode semantic` for local paraphrase-aware matching, and `verify-benchmark` to inspect bundled precision/recall metrics.
|
|
102
|
+
Use `--mode semantic` for local paraphrase-aware matching, and `verify-benchmark` to inspect bundled precision/recall metrics. The default benchmark includes 32 real ContextTrace docs and release-artifact cases. `--case-set external` adds public OSS documentation and GitHub issue cases from Qdrant, Chroma, Haystack, and LangChain, while `--case-set all` runs both packs. `--report` writes an HTML report with misses to inspect.
|
|
98
103
|
|
|
99
|
-
|
|
104
|
+
Verification output includes evidence span offsets, stable span hashes, multiple supporting spans, typed matched/missing facts, and claim-level root causes so partial support failures are easier to inspect.
|
|
100
105
|
|
|
101
|
-
|
|
106
|
+
ContextTrace verifies whether each generated claim is actually supported by retrieved evidence. Instead of only showing a trace or a score, it tells you where the evidence chain broke: unsupported claim, citation mismatch, retrieval miss, answer overreach, conflicting context, or should-have-abstained.
|
|
107
|
+
|
|
108
|
+
Use `contexttrace compare baseline.json current.json` to diff two portable traces or saved `verify --json` outputs. It reports support-rate deltas, new unsupported claims, citation regressions, should-abstain flips, and new root causes, with `--fail-on` gates for CI.
|
|
109
|
+
|
|
110
|
+
The v0.4.0 verifier uses local lexical heuristics by default. Claim extraction is rule-based, contradiction detection is conservative, and semantic or LLM-judge support can be added later.
|
|
102
111
|
|
|
103
112
|
## What It Catches
|
|
104
113
|
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.4.0"
|
|
@@ -24,12 +24,15 @@ from contexttrace.storage import SQLiteTraceStore
|
|
|
24
24
|
from contexttrace.thresholds import parse_thresholds, threshold_failures
|
|
25
25
|
from contexttrace.verify import (
|
|
26
26
|
VerificationInputError,
|
|
27
|
+
compare_failures,
|
|
28
|
+
compare_trace_files,
|
|
27
29
|
list_verify_demos,
|
|
28
30
|
load_trace_file,
|
|
29
31
|
load_verify_demo,
|
|
30
32
|
verify_trace,
|
|
31
33
|
)
|
|
32
|
-
from contexttrace.verify.benchmark import run_verify_benchmark
|
|
34
|
+
from contexttrace.verify.benchmark import run_verify_benchmark, write_verify_benchmark_report
|
|
35
|
+
from contexttrace.verify.compare_report import CompareReportGenerator
|
|
33
36
|
from contexttrace.verify.report import VerifyReportGenerator
|
|
34
37
|
from contexttrace.viewer import serve_viewer
|
|
35
38
|
|
|
@@ -288,18 +291,31 @@ def verify_demo_command(
|
|
|
288
291
|
|
|
289
292
|
@cli.command("verify-benchmark")
|
|
290
293
|
@click.option("--mode", default="lexical", show_default=True, type=click.Choice(["lexical", "semantic"]), help="Evidence scoring mode.")
|
|
294
|
+
@click.option("--case-set", default="contexttrace", show_default=True, type=click.Choice(["contexttrace", "external", "all"]), help="Benchmark case set to run.")
|
|
291
295
|
@click.option("--json", "json_output", is_flag=True, help="Print benchmark results as JSON.")
|
|
292
|
-
def verify_benchmark_command(mode: str, json_output: bool) -> int:
|
|
296
|
+
@click.option("--report", is_flag=True, help="Generate a local HTML benchmark report.")
|
|
297
|
+
@click.option("--out", default=None, help="HTML benchmark report path. Implies --report when provided.")
|
|
298
|
+
def verify_benchmark_command(mode: str, case_set: str, json_output: bool, report: bool, out: Optional[str]) -> int:
|
|
293
299
|
"""Run the bundled verification precision/recall benchmark."""
|
|
294
300
|
|
|
295
|
-
result = run_verify_benchmark(mode=mode)
|
|
301
|
+
result = run_verify_benchmark(mode=mode, case_set=case_set)
|
|
302
|
+
written_report = None
|
|
303
|
+
if report or out:
|
|
304
|
+
output_path = out or str(Path(".contexttrace") / "reports" / ("verify_benchmark_%s.html" % mode))
|
|
305
|
+
written_report = write_verify_benchmark_report(result, path=output_path)
|
|
296
306
|
if json_output:
|
|
307
|
+
if written_report:
|
|
308
|
+
click.echo("Report: %s" % written_report, err=True)
|
|
297
309
|
click.echo(json.dumps(result, indent=2))
|
|
298
310
|
return 0
|
|
299
311
|
|
|
300
312
|
click.echo("Mode: %s" % result["mode"])
|
|
313
|
+
click.echo("Case source: %s" % result["case_source"])
|
|
301
314
|
click.echo("Cases: %s" % result["cases"])
|
|
302
315
|
click.echo("Exact match rate: %.3f" % float(result["exact_match_rate"]))
|
|
316
|
+
click.echo("Verdict match rate: %.3f" % float(result["verdict_match_rate"]))
|
|
317
|
+
click.echo("Citation match rate: %.3f" % float(result["citation_match_rate"]))
|
|
318
|
+
click.echo("Abstention match rate: %.3f" % float(result["abstention_match_rate"]))
|
|
303
319
|
click.echo("label\tprecision\trecall\tf1\ttp\tfp\tfn")
|
|
304
320
|
for label, metrics in result["per_label"].items():
|
|
305
321
|
click.echo(
|
|
@@ -322,9 +338,72 @@ def verify_benchmark_command(mode: str, json_output: bool) -> int:
|
|
|
322
338
|
"- %s expected=%s predicted=%s"
|
|
323
339
|
% (row["id"], ",".join(row["expected"]), ",".join(row["predicted"]))
|
|
324
340
|
)
|
|
341
|
+
if written_report:
|
|
342
|
+
click.echo("Report: %s" % written_report)
|
|
325
343
|
return 0
|
|
326
344
|
|
|
327
345
|
|
|
346
|
+
@cli.command("compare")
|
|
347
|
+
@click.argument("baseline_json")
|
|
348
|
+
@click.argument("current_json")
|
|
349
|
+
@click.option("--json", "json_output", is_flag=True, help="Print the full comparison result as JSON.")
|
|
350
|
+
@click.option("--report", is_flag=True, help="Generate a local HTML regression report.")
|
|
351
|
+
@click.option("--out", default=None, help="HTML report path. Implies --report when provided.")
|
|
352
|
+
@click.option("--mode", default="lexical", show_default=True, type=click.Choice(["lexical", "semantic"]), help="Evidence scoring mode for raw trace inputs.")
|
|
353
|
+
@click.option("--fail-on", multiple=True, help="Fail on new_failure, new_unsupported, new_citation_mismatch, should_abstain_flip, support_rate_drop, new_root_cause, or any_regression.")
|
|
354
|
+
def compare_command(
|
|
355
|
+
baseline_json: str,
|
|
356
|
+
current_json: str,
|
|
357
|
+
json_output: bool,
|
|
358
|
+
report: bool,
|
|
359
|
+
out: Optional[str],
|
|
360
|
+
mode: str,
|
|
361
|
+
fail_on: tuple[str, ...],
|
|
362
|
+
) -> int:
|
|
363
|
+
"""Compare two portable RAG traces or verification JSON outputs."""
|
|
364
|
+
|
|
365
|
+
try:
|
|
366
|
+
result = compare_trace_files(baseline_json, current_json, mode=mode)
|
|
367
|
+
except VerificationInputError as exc:
|
|
368
|
+
raise click.ClickException(str(exc)) from exc
|
|
369
|
+
|
|
370
|
+
written_report = None
|
|
371
|
+
if report or out:
|
|
372
|
+
default_name = "%s_vs_%s_compare.html" % (Path(baseline_json).stem, Path(current_json).stem)
|
|
373
|
+
output_path = out or str(Path(".contexttrace") / "reports" / default_name)
|
|
374
|
+
written_report = CompareReportGenerator().generate(result, path=output_path)
|
|
375
|
+
|
|
376
|
+
fail_messages = compare_failures(result, fail_on)
|
|
377
|
+
if json_output:
|
|
378
|
+
if written_report:
|
|
379
|
+
click.echo("Report: %s" % written_report, err=True)
|
|
380
|
+
click.echo(json.dumps(result, indent=2))
|
|
381
|
+
for message in fail_messages:
|
|
382
|
+
click.echo("Comparison failed: %s" % message, err=True)
|
|
383
|
+
return 1 if fail_messages else 0
|
|
384
|
+
|
|
385
|
+
summary = result["summary"]
|
|
386
|
+
click.echo("Regression: %s" % str(summary["regression"]).lower())
|
|
387
|
+
click.echo("Support rate: %.3f -> %.3f (%+.3f)" % (
|
|
388
|
+
float(summary.get("support_rate_before") or 0.0),
|
|
389
|
+
float(summary.get("support_rate_after") or 0.0),
|
|
390
|
+
float(summary.get("support_rate_delta") or 0.0),
|
|
391
|
+
))
|
|
392
|
+
click.echo("Unsupported claim rate delta: %+.3f" % float(summary.get("unsupported_claim_rate_delta") or 0.0))
|
|
393
|
+
click.echo("Citation mismatch delta: %+d" % int(summary.get("citation_mismatch_delta") or 0))
|
|
394
|
+
click.echo("New failures: %s" % summary["new_failures"])
|
|
395
|
+
click.echo("Resolved failures: %s" % summary["resolved_failures"])
|
|
396
|
+
click.echo("Added claims: %s" % summary["added_claims"])
|
|
397
|
+
click.echo("Removed claims: %s" % summary["removed_claims"])
|
|
398
|
+
click.echo("Changed claims: %s" % summary["changed_claims"])
|
|
399
|
+
click.echo("New root causes: %s" % (", ".join(summary.get("new_root_causes") or []) or "none"))
|
|
400
|
+
if written_report:
|
|
401
|
+
click.echo("Report: %s" % written_report)
|
|
402
|
+
for message in fail_messages:
|
|
403
|
+
click.echo("Comparison failed: %s" % message, err=True)
|
|
404
|
+
return 1 if fail_messages else 0
|
|
405
|
+
|
|
406
|
+
|
|
328
407
|
def _write_verify_report(
|
|
329
408
|
result: dict,
|
|
330
409
|
trace: object,
|
|
@@ -365,6 +444,7 @@ def _print_verify_result(
|
|
|
365
444
|
click.echo("Unsupported claim rate: %.3f" % float(summary["unsupported_claim_rate"]))
|
|
366
445
|
click.echo("Citation mismatches: %s" % summary["citation_mismatches"])
|
|
367
446
|
click.echo("Failure type: %s" % summary["failure_type"])
|
|
447
|
+
click.echo("Primary root cause: %s" % summary.get("primary_root_cause", "unknown"))
|
|
368
448
|
click.echo("Should abstain: %s" % str(summary["should_abstain"]).lower())
|
|
369
449
|
click.echo("Suggested fix: %s" % summary["suggested_fix"])
|
|
370
450
|
if written_report:
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
from contexttrace.verify.runner import verify_trace, verify_trace_file
|
|
2
|
+
from contexttrace.verify.compare import compare_failures, compare_trace_files, compare_verifications
|
|
2
3
|
from contexttrace.verify.schema import (
|
|
3
4
|
RAGTrace,
|
|
4
5
|
TraceCitation,
|
|
@@ -13,6 +14,9 @@ __all__ = [
|
|
|
13
14
|
"TraceCitation",
|
|
14
15
|
"TraceContext",
|
|
15
16
|
"VerificationInputError",
|
|
17
|
+
"compare_failures",
|
|
18
|
+
"compare_trace_files",
|
|
19
|
+
"compare_verifications",
|
|
16
20
|
"list_verify_demos",
|
|
17
21
|
"load_trace_file",
|
|
18
22
|
"load_verify_demo",
|