contexttrace 0.6.0__tar.gz → 0.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {contexttrace-0.6.0 → contexttrace-0.7.0}/PKG-INFO +10 -2
- {contexttrace-0.6.0 → contexttrace-0.7.0}/README.md +19 -11
- contexttrace-0.7.0/contexttrace/_version.py +1 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/cli.py +362 -28
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/__init__.py +18 -18
- contexttrace-0.7.0/contexttrace/verify/suite.py +662 -0
- contexttrace-0.7.0/contexttrace/verify/suite_report.py +316 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace.egg-info/SOURCES.txt +2 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/pyproject.toml +1 -1
- contexttrace-0.6.0/contexttrace/_version.py +0 -1
- {contexttrace-0.6.0 → contexttrace-0.7.0}/MANIFEST.in +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/__init__.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/capture.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/capture_endpoint.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/client.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/config.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/demo.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/demo_data.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/endpoint_eval.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/errors.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/evaluator.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/integrations/__init__.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/integrations/fastapi.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/integrations/langchain.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/integrations/langgraph.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/integrations/llamaindex.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/integrations/opentelemetry.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/local.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/py.typed +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/regression.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/reliability.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/report.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/storage/__init__.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/storage/sqlite_store.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/thresholds.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/transport.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/abstention.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/audit.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/audit_benchmark.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/audit_benchmark_cases.json +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/audit_report.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/benchmark.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/citations.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/claims.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/compare.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/compare_report.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/demos.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/evidence.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/external_benchmark_cases.json +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/facts.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/qa.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/qa_report.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/real_benchmark_cases.json +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/report.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/root_cause.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/runner.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/schema.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/spans.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/trace_inspect.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/verdicts.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/viewer.py +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/setup.cfg +0 -0
- {contexttrace-0.6.0 → contexttrace-0.7.0}/setup.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: contexttrace
|
|
3
|
-
Version: 0.6.0
|
|
3
|
+
Version: 0.7.0
|
|
4
4
|
Summary: Local-first SDK and CLI for RAG and agent reliability tracing, citation checks, and failure diagnosis.
|
|
5
5
|
Author: ContextTrace contributors
|
|
6
6
|
License: MIT
|
|
@@ -176,6 +176,12 @@ contexttrace verify-benchmark --case-set external --mode semantic --report
|
|
|
176
176
|
contexttrace compare baseline.json current.json
|
|
177
177
|
contexttrace compare baseline.json current.json --report
|
|
178
178
|
contexttrace compare baseline.json current.json --fail-on new_failure
|
|
179
|
+
contexttrace suite create traces/*.json --out contexttrace-suite.json
|
|
180
|
+
contexttrace suite add contexttrace-suite.json traces/new_failure.json
|
|
181
|
+
contexttrace suite list contexttrace-suite.json
|
|
182
|
+
contexttrace suite run contexttrace-suite.json --endpoint http://localhost:8000/query --report
|
|
183
|
+
contexttrace suite prune contexttrace-suite.json --results .contexttrace/suites/contexttrace-regression-suite_results.json
|
|
184
|
+
contexttrace suite report .contexttrace/suites/contexttrace-regression-suite_results.json
|
|
179
185
|
contexttrace audit trace.json --corpus docs/
|
|
180
186
|
contexttrace audit trace.json --corpus docs/ --report
|
|
181
187
|
contexttrace audit trace.json --corpus docs/ --fail-on retrieval_miss
|
|
@@ -204,11 +210,13 @@ write_rag_trace(trace, "trace.json")
|
|
|
204
210
|
|
|
205
211
|
Use `contexttrace compare baseline.json current.json` to diff two portable traces or saved `verify --json` outputs. It reports support-rate deltas, new unsupported claims, citation regressions, should-abstain flips, and new root causes, with `--fail-on` gates for CI.
|
|
206
212
|
|
|
213
|
+
Use `contexttrace suite create`, `suite add`, and `suite run` to turn saved failures into replayable endpoint tests. Suite runs call your current RAG endpoint with the saved query, verify the new answer, compare it with the baseline trace, and exit non-zero when a saved failure still reproduces or a good case regresses. Use `suite list`, `suite remove`, and `suite prune` to manage the suite as failures are fixed or retired.
|
|
214
|
+
|
|
207
215
|
Use `contexttrace audit trace.json --corpus docs/` to diagnose whether an unsupported claim failed because retrieval missed evidence, reranking buried it, chunking omitted the supporting span, the corpus lacks coverage, or generation overclaimed. Audit output includes failure stages, diagnostic signals, and prioritized next actions.
|
|
208
216
|
|
|
209
217
|
Use `contexttrace audit-benchmark --case-set real --mode semantic` to test retrieval-audit labels against bundled public OSS documentation and GitHub issue snippets from Qdrant, Chroma, Haystack, LangChain, and ContextTrace docs.
|
|
210
218
|
|
|
211
|
-
The v0.
|
|
219
|
+
The v0.7.0 verifier uses local lexical heuristics by default. Claim extraction is rule-based, contradiction detection is conservative, and semantic or LLM-judge support can be added later.
|
|
212
220
|
|
|
213
221
|
## What It Catches
|
|
214
222
|
|
|
@@ -116,12 +116,18 @@ contexttrace verify trace.json --fail-on unsupported --fail-on citation_mismatch
|
|
|
116
116
|
contexttrace verify-benchmark --mode semantic
|
|
117
117
|
contexttrace verify-benchmark --mode semantic --report
|
|
118
118
|
contexttrace verify-benchmark --case-set external --mode semantic --report
|
|
119
|
-
contexttrace compare baseline.json current.json
|
|
120
|
-
contexttrace compare baseline.json current.json --report
|
|
121
|
-
contexttrace compare baseline.json current.json --fail-on new_failure
|
|
122
|
-
contexttrace audit trace.json --corpus docs/
|
|
123
|
-
contexttrace audit trace.json --corpus docs/ --report
|
|
124
|
-
contexttrace audit trace.json --corpus docs/ --fail-on retrieval_miss
|
|
119
|
+
contexttrace compare baseline.json current.json
|
|
120
|
+
contexttrace compare baseline.json current.json --report
|
|
121
|
+
contexttrace compare baseline.json current.json --fail-on new_failure
|
|
122
|
+
contexttrace suite create traces/*.json --out contexttrace-suite.json
|
|
123
|
+
contexttrace suite add contexttrace-suite.json traces/new_failure.json
|
|
124
|
+
contexttrace suite list contexttrace-suite.json
|
|
125
|
+
contexttrace suite run contexttrace-suite.json --endpoint http://localhost:8000/query --report
|
|
126
|
+
contexttrace suite prune contexttrace-suite.json --results .contexttrace/suites/contexttrace-regression-suite_results.json
|
|
127
|
+
contexttrace suite report .contexttrace/suites/contexttrace-regression-suite_results.json
|
|
128
|
+
contexttrace audit trace.json --corpus docs/
|
|
129
|
+
contexttrace audit trace.json --corpus docs/ --report
|
|
130
|
+
contexttrace audit trace.json --corpus docs/ --fail-on retrieval_miss
|
|
125
131
|
contexttrace audit-benchmark --case-set real --mode semantic
|
|
126
132
|
contexttrace audit-benchmark --case-set real --mode semantic --report
|
|
127
133
|
```
|
|
@@ -144,14 +150,16 @@ from contexttrace import capture_rag_trace, write_rag_trace
|
|
|
144
150
|
trace = capture_rag_trace(query=question, answer=answer, contexts=retrieved_docs)
|
|
145
151
|
write_rag_trace(trace, "trace.json")
|
|
146
152
|
```
|
|
147
|
-
|
|
148
|
-
Use `contexttrace compare baseline.json current.json` to diff two portable traces or saved `verify --json` outputs. It reports support-rate deltas, new unsupported claims, citation regressions, should-abstain flips, and new root causes, with `--fail-on` gates for CI.
|
|
149
|
-
|
|
150
|
-
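Use `contexttrace audit trace.json --corpus docs/` to diagnose whether an unsupported claim failed because retrieval missed evidence, reranking buried it, chunking omitted the supporting span, the corpus lacks coverage, or generation overclaimed. Audit output includes failure stages, diagnostic signals, and prioritized next actions.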
|
|
153
|
+
|
|
154
|
+
Use `contexttrace compare baseline.json current.json` to diff two portable traces or saved `verify --json` outputs. It reports support-rate deltas, new unsupported claims, citation regressions, should-abstain flips, and new root causes, with `--fail-on` gates for CI.
|
|
155
|
+
|
|
156
|
+
Use `contexttrace suite create`, `suite add`, and `suite run` to turn saved failures into replayable endpoint tests. Suite runs call your current RAG endpoint with the saved query, verify the new answer, compare it with the baseline trace, and exit non-zero when a saved failure still reproduces or a good case regresses. Use `suite list`, `suite remove`, and `suite prune` to manage the suite as failures are fixed or retired.
|
|
157
|
+
|
|
158
|
+
Use `contexttrace audit trace.json --corpus docs/` to diagnose whether an unsupported claim failed because retrieval missed evidence, reranking buried it, chunking omitted the supporting span, the corpus lacks coverage, or generation overclaimed. Audit output includes failure stages, diagnostic signals, and prioritized next actions.
|
|
151
159
|
|
|
152
160
|
Use `contexttrace audit-benchmark --case-set real --mode semantic` to test retrieval-audit labels against bundled public OSS documentation and GitHub issue snippets from Qdrant, Chroma, Haystack, LangChain, and ContextTrace docs.
|
|
153
161
|
|
|
154
|
-
The v0.
|
|
162
|
+
The v0.7.0 verifier uses local lexical heuristics by default. Claim extraction is rule-based, contradiction detection is conservative, and semantic or LLM-judge support can be added later.
|
|
155
163
|
|
|
156
164
|
## What It Catches
|
|
157
165
|
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.7.0"
|
|
@@ -25,24 +25,38 @@ from contexttrace.report import ReportGenerator
|
|
|
25
25
|
from contexttrace.storage import SQLiteTraceStore
|
|
26
26
|
from contexttrace.thresholds import parse_thresholds, threshold_failures
|
|
27
27
|
from contexttrace.verify import (
|
|
28
|
-
VerificationInputError,
|
|
29
|
-
audit_failures,
|
|
30
|
-
audit_trace,
|
|
31
|
-
compare_failures,
|
|
32
|
-
compare_trace_files,
|
|
33
|
-
list_verify_demos,
|
|
34
|
-
load_trace_file,
|
|
35
|
-
load_verify_demo,
|
|
36
|
-
verify_trace,
|
|
37
|
-
)
|
|
28
|
+
VerificationInputError,
|
|
29
|
+
audit_failures,
|
|
30
|
+
audit_trace,
|
|
31
|
+
compare_failures,
|
|
32
|
+
compare_trace_files,
|
|
33
|
+
list_verify_demos,
|
|
34
|
+
load_trace_file,
|
|
35
|
+
load_verify_demo,
|
|
36
|
+
verify_trace,
|
|
37
|
+
)
|
|
38
38
|
from contexttrace.verify.benchmark import run_verify_benchmark, write_verify_benchmark_report
|
|
39
39
|
from contexttrace.verify.audit_benchmark import run_audit_benchmark, write_audit_benchmark_report
|
|
40
40
|
from contexttrace.verify.audit_report import AuditReportGenerator
|
|
41
41
|
from contexttrace.verify.compare_report import CompareReportGenerator
|
|
42
|
-
from contexttrace.verify.qa import qa_failures, qa_trace
|
|
43
|
-
from contexttrace.verify.qa_report import QAReportGenerator
|
|
44
|
-
from contexttrace.verify.report import VerifyReportGenerator
|
|
45
|
-
from contexttrace.verify.trace_inspect import inspect_trace
|
|
42
|
+
from contexttrace.verify.qa import qa_failures, qa_trace
|
|
43
|
+
from contexttrace.verify.qa_report import QAReportGenerator
|
|
44
|
+
from contexttrace.verify.report import VerifyReportGenerator
|
|
45
|
+
from contexttrace.verify.suite import (
|
|
46
|
+
add_trace_files_to_suite,
|
|
47
|
+
create_suite_from_trace_files,
|
|
48
|
+
list_suite_cases,
|
|
49
|
+
load_suite_file,
|
|
50
|
+
load_suite_result_file,
|
|
51
|
+
prune_suite_cases,
|
|
52
|
+
remove_suite_cases,
|
|
53
|
+
run_suite,
|
|
54
|
+
suite_failures,
|
|
55
|
+
write_suite_file,
|
|
56
|
+
write_suite_result,
|
|
57
|
+
)
|
|
58
|
+
from contexttrace.verify.suite_report import SuiteReportGenerator
|
|
59
|
+
from contexttrace.verify.trace_inspect import inspect_trace
|
|
46
60
|
from contexttrace.viewer import serve_viewer
|
|
47
61
|
|
|
48
62
|
|
|
@@ -315,7 +329,7 @@ def inspect_command(trace_json: str, json_output: bool) -> int:
|
|
|
315
329
|
@click.option("--out", default=None, help="HTML report path. Implies --report when provided.")
|
|
316
330
|
@click.option("--mode", default="lexical", show_default=True, type=click.Choice(["lexical", "semantic"]), help="Evidence scoring mode.")
|
|
317
331
|
@click.option("--fail-on", multiple=True, help="Fail on high_risk, medium_risk, any_risk, unsupported, should_abstain, audit_failure, or inspect_warning.")
|
|
318
|
-
def qa_command(
|
|
332
|
+
def qa_command(
|
|
319
333
|
trace_json: str,
|
|
320
334
|
corpus_path: Optional[str],
|
|
321
335
|
json_output: bool,
|
|
@@ -376,11 +390,297 @@ def qa_command(
|
|
|
376
390
|
click.echo("Report: %s" % written_report)
|
|
377
391
|
for message in fail_messages:
|
|
378
392
|
click.echo("QA failed: %s" % message, err=True)
|
|
379
|
-
return 1 if fail_messages else 0
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
@cli.command("verify-demo")
|
|
383
|
-
|
|
393
|
+
return 1 if fail_messages else 0
|
|
394
|
+
|
|
395
|
+
|
|
396
|
+
# Parent group for the `contexttrace suite ...` subcommands.
# click renders the docstring as the group's --help text, so it is kept short.
@cli.group("suite")
def suite_group() -> None:
    """Create and run local RAG regression suites."""
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
@suite_group.command("create")
@click.argument("trace_json", nargs=-1, required=True)
@click.option("--out", default="contexttrace-suite.json", show_default=True, help="Suite JSON file to write.")
@click.option("--name", default=None, help="Suite name.")
@click.option("--mode", default="lexical", show_default=True, type=click.Choice(["lexical", "semantic"]), help="Evidence scoring mode for baseline QA.")
@click.option("--corpus", "corpus_path", default=None, help="Optional local corpus directory or file for baseline retrieval/corpus audit.")
def suite_create_command(
    trace_json: tuple[str, ...],
    out: str,
    name: Optional[str],
    mode: str,
    corpus_path: Optional[str],
) -> int:
    """Create a suite from saved portable RAG trace files."""
    # The docstring is the command's --help text, so details live in comments.
    # Build the suite from the given trace files and write it to `out`;
    # invalid input is surfaced as a click error instead of a traceback.
    try:
        new_suite = create_suite_from_trace_files(
            trace_json,
            name=name,
            mode=mode,
            corpus_path=corpus_path,
        )
        suite_path = write_suite_file(new_suite, out)
    except VerificationInputError as err:
        raise click.ClickException(str(err)) from err

    click.echo("Suite: %s" % suite_path)
    case_list = new_suite.get("cases") or []
    click.echo("Cases: %s" % len(case_list))
    click.echo("Policy: saved cases must pass on replay")
    return 0
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
@suite_group.command("add")
@click.argument("suite_json")
@click.argument("trace_json", nargs=-1, required=True)
@click.option("--out", default=None, help="Suite JSON file to write. Defaults to overwriting suite_json.")
@click.option("--mode", default=None, type=click.Choice(["lexical", "semantic"]), help="Evidence scoring mode for added baselines. Defaults to the suite mode.")
@click.option("--corpus", "corpus_path", default=None, help="Optional local corpus directory or file for baseline retrieval/corpus audit.")
@click.option("--replace", is_flag=True, help="Replace existing cases with the same generated case IDs.")
def suite_add_command(
    suite_json: str,
    trace_json: tuple[str, ...],
    out: Optional[str],
    mode: Optional[str],
    corpus_path: Optional[str],
    replace: bool,
) -> int:
    """Add saved portable RAG traces to an existing suite."""
    # The docstring is the command's --help text, so details live in comments.
    # Load the suite, merge the new traces, and write the result back
    # (to `out` when given, otherwise over the original suite file).
    try:
        existing_suite = load_suite_file(suite_json)
        outcome = add_trace_files_to_suite(
            existing_suite,
            trace_json,
            mode=mode,
            corpus_path=corpus_path,
            replace=replace,
        )
        destination = out or suite_json
        suite_path = write_suite_file(outcome["suite"], destination)
    except VerificationInputError as err:
        raise click.ClickException(str(err)) from err

    click.echo("Suite: %s" % suite_path)
    added_ids = outcome["added_case_ids"]
    click.echo("Added: %s" % len(added_ids))
    if added_ids:
        click.echo("Added case IDs: %s" % ", ".join(added_ids))
    click.echo("Replaced: %s" % outcome["replaced"])
    click.echo("Cases: %s" % len(outcome["suite"].get("cases") or []))
    return 0
|
|
470
|
+
|
|
471
|
+
|
|
472
|
+
@suite_group.command("list")
@click.argument("suite_json")
@click.option("--json", "json_output", is_flag=True, help="Print cases as JSON.")
def suite_list_command(suite_json: str, json_output: bool) -> int:
    """List cases in a local regression suite."""
    # The docstring is the command's --help text, so details live in comments.
    try:
        suite = load_suite_file(suite_json)
        case_rows = list_suite_cases(suite)
    except VerificationInputError as err:
        raise click.ClickException(str(err)) from err

    # Machine-readable output: suite name plus the row dicts, nothing else.
    if json_output:
        payload = {"suite": suite.get("name"), "cases": case_rows}
        click.echo(json.dumps(payload, indent=2))
        return 0

    # Human-readable output: a header followed by one tab-separated row per case.
    click.echo("Suite: %s" % (suite.get("name") or suite_json))
    click.echo("Cases: %s" % len(case_rows))
    click.echo("id\tbaseline_risk\tbaseline_issue\tsupport_rate\tquery")
    for entry in case_rows:
        support_rate = entry.get("baseline_support_rate")
        columns = (
            entry.get("id"),
            entry.get("baseline_risk_level") or "",
            entry.get("baseline_primary_issue") or "",
            # Keep a real 0 visible; only a missing rate renders as blank.
            "" if support_rate is None else support_rate,
            _preview(entry.get("query"), limit=90),
        )
        click.echo("%s\t%s\t%s\t%s\t%s" % columns)
    return 0
|
|
503
|
+
|
|
504
|
+
|
|
505
|
+
@suite_group.command("remove")
@click.argument("suite_json")
@click.argument("case_id", nargs=-1, required=True)
@click.option("--out", default=None, help="Suite JSON file to write. Defaults to overwriting suite_json.")
def suite_remove_command(suite_json: str, case_id: tuple[str, ...], out: Optional[str]) -> int:
    """Remove one or more case IDs from a suite."""
    # The docstring is the command's --help text, so details live in comments.
    try:
        loaded_suite = load_suite_file(suite_json)
        outcome = remove_suite_cases(loaded_suite, case_id)
        destination = out or suite_json
        suite_path = write_suite_file(outcome["suite"], destination)
    except VerificationInputError as err:
        raise click.ClickException(str(err)) from err

    click.echo("Suite: %s" % suite_path)
    removed_ids = outcome["removed_case_ids"]
    click.echo("Removed: %s" % len(removed_ids))
    if removed_ids:
        click.echo("Removed case IDs: %s" % ", ".join(removed_ids))
    missing_ids = outcome["missing_case_ids"]
    if missing_ids:
        click.echo("Missing case IDs: %s" % ", ".join(missing_ids))
    click.echo("Cases: %s" % len(outcome["suite"].get("cases") or []))
    # Requested IDs that were not in the suite make the command return 1.
    return 1 if missing_ids else 0
|
|
527
|
+
|
|
528
|
+
|
|
529
|
+
@suite_group.command("prune")
@click.argument("suite_json")
@click.option("--results", "results_json", required=True, help="Suite result JSON from `contexttrace suite run`.")
@click.option("--status", "statuses", multiple=True, default=("passed",), show_default=True, help="Result status to remove. May be repeated.")
@click.option("--out", default=None, help="Suite JSON file to write. Defaults to overwriting suite_json.")
def suite_prune_command(
    suite_json: str,
    results_json: str,
    statuses: tuple[str, ...],
    out: Optional[str],
) -> int:
    """Remove cases by status from a saved suite result."""
    # The docstring is the command's --help text, so details live in comments.
    # Load the suite and a saved run result, prune by the requested statuses,
    # then write the suite back (to `out` when given, else over suite_json).
    try:
        loaded_suite = load_suite_file(suite_json)
        run_result = load_suite_result_file(results_json)
        outcome = prune_suite_cases(loaded_suite, run_result, statuses=statuses)
        destination = out or suite_json
        suite_path = write_suite_file(outcome["suite"], destination)
    except VerificationInputError as err:
        raise click.ClickException(str(err)) from err

    click.echo("Suite: %s" % suite_path)
    click.echo("Pruned statuses: %s" % ", ".join(outcome["statuses"]))
    removed_ids = outcome["removed_case_ids"]
    click.echo("Removed: %s" % len(removed_ids))
    if removed_ids:
        click.echo("Removed case IDs: %s" % ", ".join(removed_ids))
    click.echo("Cases: %s" % len(outcome["suite"].get("cases") or []))
    return 0
|
|
557
|
+
|
|
558
|
+
|
|
559
|
+
@suite_group.command("run")
@click.argument("suite_json")
@click.option("--endpoint", default=None, help="RAG endpoint URL. Defaults to config eval_endpoint.")
@click.option("--method", default="POST", type=click.Choice(["GET", "POST"], case_sensitive=False), help="Endpoint method.")
@click.option("--input-key", default="question", show_default=True, help="Request body/query key for the question.")
@click.option("--answer-path", default="$.answer", show_default=True, help="JSONPath for answer extraction.")
@click.option("--contexts-path", default="$.contexts", show_default=True, help="JSONPath for context extraction.")
@click.option("--citations-path", default="$.citations", show_default=True, help="JSONPath for citation extraction.")
@click.option("--metadata-path", default="$.metadata", show_default=True, help="JSONPath for response metadata extraction.")
@click.option("--body-template", default=None, help="JSON body template. Use {{query}} where the question should be inserted.")
@click.option("--endpoint-header", multiple=True, help="Header formatted as Name:Value. May be repeated.")
@click.option("--timeout", default=30.0, show_default=True, type=float, help="Per-request timeout.")
@click.option("--corpus", "corpus_path", default=None, help="Optional local corpus directory or file for retrieval/corpus audit.")
@click.option("--out", default=None, help="Suite result JSON path.")
@click.option("--json", "json_output", is_flag=True, help="Print the full suite result as JSON.")
@click.option("--report", is_flag=True, help="Generate a local HTML suite report.")
@click.option("--report-out", default=None, help="HTML report path. Implies --report when provided.")
@click.option("--mode", default=None, type=click.Choice(["lexical", "semantic"]), help="Evidence scoring mode. Defaults to the suite mode.")
@click.option("--fail-on", multiple=True, help="Fail on failed_case, regression, unsupported, should_abstain, high_risk, medium_risk, error, or any_failure.")
@click.pass_context
def suite_run_command(
    ctx: click.Context,
    suite_json: str,
    endpoint: Optional[str],
    method: str,
    input_key: str,
    answer_path: str,
    contexts_path: str,
    citations_path: str,
    metadata_path: str,
    body_template: Optional[str],
    endpoint_header: tuple[str, ...],
    timeout: float,
    corpus_path: Optional[str],
    out: Optional[str],
    json_output: bool,
    report: bool,
    report_out: Optional[str],
    mode: Optional[str],
    fail_on: tuple[str, ...],
) -> int:
    """Replay a regression suite against a running RAG endpoint."""
    # The docstring is the command's --help text, so details live in comments.
    #
    # Flow: resolve the endpoint (flag, then config), replay the suite, write
    # the result JSON, optionally write an HTML report, then apply the
    # --fail-on gates. Returns 1 when any gate produced a message, else 0.

    config = _load(ctx)
    resolved_endpoint = endpoint or config.eval_endpoint
    if not resolved_endpoint:
        raise click.ClickException("--endpoint or eval_endpoint in contexttrace.yaml is required.")

    # Parse the template in its own try block. json.JSONDecodeError is a
    # ValueError subclass, so when this parse shared a try with the suite load
    # and run, any decode error raised there was reported as a bad
    # --body-template. An invalid template is now reported before the suite
    # file is read.
    body = None
    if body_template:
        try:
            body = json.loads(body_template)
        except json.JSONDecodeError as exc:
            raise click.ClickException(
                "Invalid --body-template JSON at line %s column %s: %s"
                % (exc.lineno, exc.colno, exc.msg)
            ) from exc

    try:
        suite = load_suite_file(suite_json)
        result = run_suite(
            suite,
            endpoint=resolved_endpoint,
            method=method,
            headers=_parse_headers(list(endpoint_header)),
            body_template=body,
            input_key=input_key,
            answer_path=answer_path,
            contexts_path=contexts_path,
            citations_path=citations_path,
            metadata_path=metadata_path,
            timeout=timeout,
            corpus_path=corpus_path,
            mode=mode,
        )
    except (RuntimeError, ValueError, VerificationInputError) as exc:
        raise click.ClickException(str(exc)) from exc

    # Default artifact names derive from the suite name, falling back to the
    # suite file's stem when the result carries no name.
    file_stem = _safe_filename(str(result.get("suite_name") or Path(suite_json).stem))

    output_path = out or str(Path(".contexttrace") / "suites" / ("%s_results.json" % file_stem))
    written_result = write_suite_result(result, output_path)

    written_report = None
    if report or report_out:
        report_path = report_out or str(Path(".contexttrace") / "reports" / ("%s_suite.html" % file_stem))
        written_report = SuiteReportGenerator().generate(result, path=report_path)

    # With no explicit --fail-on, failed cases and errors gate the exit status.
    effective_fail_on = fail_on or ("failed_case", "error")
    fail_messages = suite_failures(result, effective_fail_on)

    if json_output:
        # Keep stdout pure JSON; artifact paths go to stderr.
        if written_report:
            click.echo("Report: %s" % written_report, err=True)
        click.echo("Results: %s" % written_result, err=True)
        click.echo(json.dumps(result, indent=2))
    else:
        _print_suite_result(result, written_result=written_result, written_report=written_report)

    for message in fail_messages:
        click.echo("Suite failed: %s" % message, err=True)
    return 1 if fail_messages else 0
|
|
664
|
+
|
|
665
|
+
|
|
666
|
+
@suite_group.command("report")
@click.argument("results_json")
@click.option("--out", default=None, help="HTML report path.")
def suite_report_command(results_json: str, out: Optional[str]) -> int:
    """Generate a local HTML report from a suite result JSON file."""
    # The docstring is the command's --help text, so details live in comments.
    try:
        loaded_result = load_suite_result_file(results_json)
    except VerificationInputError as err:
        raise click.ClickException(str(err)) from err

    if out:
        target = out
    else:
        # Default: .contexttrace/reports/<results file stem>.html
        default_name = "%s.html" % Path(results_json).stem
        target = str(Path(".contexttrace") / "reports" / default_name)

    report_path = SuiteReportGenerator().generate(loaded_result, path=target)
    click.echo("Report: %s" % report_path)
    return 0
|
|
680
|
+
|
|
681
|
+
|
|
682
|
+
@cli.command("verify-demo")
|
|
683
|
+
@click.argument("demo_name", required=False, default="unsupported_claim")
|
|
384
684
|
@click.option("--json", "json_output", is_flag=True, help="Print the full verification result as JSON.")
|
|
385
685
|
@click.option("--report", is_flag=True, help="Generate a local HTML verification report.")
|
|
386
686
|
@click.option("--out", default=None, help="HTML report path. Implies --report when provided.")
|
|
@@ -1144,7 +1444,7 @@ def viewer(ctx: click.Context, host: str, port: int) -> None:
|
|
|
1144
1444
|
serve_viewer(storage_path=config.storage_path, host=host, port=port)
|
|
1145
1445
|
|
|
1146
1446
|
|
|
1147
|
-
def main(argv: Optional[list[str]] = None) -> int:
|
|
1447
|
+
def main(argv: Optional[list[str]] = None) -> int:
|
|
1148
1448
|
try:
|
|
1149
1449
|
result = cli.main(args=argv, prog_name="contexttrace", standalone_mode=False)
|
|
1150
1450
|
return int(result or 0)
|
|
@@ -1156,13 +1456,47 @@ def main(argv: Optional[list[str]] = None) -> int:
|
|
|
1156
1456
|
except ContextTraceError as exc:
|
|
1157
1457
|
click.echo("ContextTrace failed: %s" % exc, err=True)
|
|
1158
1458
|
return 2
|
|
1159
|
-
except ValueError as exc:
|
|
1160
|
-
click.echo("ContextTrace failed: %s" % exc, err=True)
|
|
1161
|
-
return 2
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
def _load(ctx: click.Context) -> ContextTraceConfig:
|
|
1165
|
-
|
|
1459
|
+
except ValueError as exc:
|
|
1460
|
+
click.echo("ContextTrace failed: %s" % exc, err=True)
|
|
1461
|
+
return 2
|
|
1462
|
+
|
|
1463
|
+
|
|
1464
|
+
def _print_suite_result(
    result: dict,
    *,
    written_result: str,
    written_report: Optional[str],
) -> None:
    """Print a plain-text summary of a suite run.

    Emits the suite name, the counters from ``result["summary"]``, the
    result/report paths, and one line per case whose status is ``failed``
    or ``error``.
    """
    summary = result.get("summary") or {}
    click.echo("Suite: %s" % result.get("suite_name"))

    # (format, summary key) pairs, printed in this order.
    summary_lines = (
        ("Status: %s", "status"),
        ("Cases: %s", "total_cases"),
        ("Passed: %s", "passed"),
        ("Failed: %s", "failed"),
        ("Errors: %s", "errors"),
        ("Regressions: %s", "regressions"),
        ("Resolved failures: %s", "resolved_failures"),
    )
    for template, key in summary_lines:
        click.echo(template % summary.get(key))

    # A missing or None rate is shown as 0.000.
    average = float(summary.get("average_support_rate") or 0.0)
    click.echo("Average support rate: %.3f" % average)
    click.echo("Results: %s" % written_result)
    if written_report:
        click.echo("Report: %s" % written_report)

    problem_cases = [
        case
        for case in result.get("cases") or []
        if case.get("status") in {"failed", "error"}
    ]
    if not problem_cases:
        return

    click.echo("Failed cases:")
    for case in problem_cases:
        reasons = "; ".join(str(item) for item in case.get("failures") or [])
        click.echo("- %s: %s" % (case.get("id"), reasons or "unknown failure"))
|
|
1490
|
+
|
|
1491
|
+
|
|
1492
|
+
def _safe_filename(value: str) -> str:
|
|
1493
|
+
cleaned = "".join(char if char.isalnum() or char in {"-", "_"} else "_" for char in value.strip().lower())
|
|
1494
|
+
cleaned = "_".join(part for part in cleaned.split("_") if part)
|
|
1495
|
+
return cleaned[:80] or "contexttrace"
|
|
1496
|
+
|
|
1497
|
+
|
|
1498
|
+
def _load(ctx: click.Context) -> ContextTraceConfig:
    """Load the config, using ``config_path`` from ``ctx.obj`` when it is set."""
    # ctx.obj may be None, in which case config_path is passed as None.
    context_obj = ctx.obj or {}
    config_path = context_obj.get("config_path")
    return load_config(config_path=config_path)
|
|
1166
1500
|
|
|
1167
1501
|
|
|
1168
1502
|
def _client(ctx: click.Context) -> ContextTrace:
|
|
@@ -10,10 +10,10 @@ from contexttrace.verify.schema import (
|
|
|
10
10
|
load_trace_file,
|
|
11
11
|
)
|
|
12
12
|
from contexttrace.verify.demos import list_verify_demos, load_verify_demo
|
|
13
|
-
from contexttrace.verify.qa import qa_failures, qa_trace
|
|
14
|
-
from contexttrace.verify.trace_inspect import inspect_trace
|
|
15
|
-
|
|
16
|
-
__all__ = [
|
|
13
|
+
from contexttrace.verify.qa import qa_failures, qa_trace
|
|
14
|
+
from contexttrace.verify.trace_inspect import inspect_trace
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
17
|
"RAGTrace",
|
|
18
18
|
"TraceCitation",
|
|
19
19
|
"TraceContext",
|
|
@@ -22,17 +22,17 @@ __all__ = [
|
|
|
22
22
|
"audit_trace",
|
|
23
23
|
"audit_trace_file",
|
|
24
24
|
"audit_trace_with_corpus",
|
|
25
|
-
"compare_failures",
|
|
26
|
-
"compare_trace_files",
|
|
27
|
-
"compare_verifications",
|
|
28
|
-
"inspect_trace",
|
|
29
|
-
"list_verify_demos",
|
|
30
|
-
"load_corpus",
|
|
31
|
-
"load_trace_file",
|
|
32
|
-
"load_verify_demo",
|
|
33
|
-
"qa_failures",
|
|
34
|
-
"qa_trace",
|
|
35
|
-
"run_audit_benchmark",
|
|
36
|
-
"verify_trace",
|
|
37
|
-
"verify_trace_file",
|
|
38
|
-
]
|
|
25
|
+
"compare_failures",
|
|
26
|
+
"compare_trace_files",
|
|
27
|
+
"compare_verifications",
|
|
28
|
+
"inspect_trace",
|
|
29
|
+
"list_verify_demos",
|
|
30
|
+
"load_corpus",
|
|
31
|
+
"load_trace_file",
|
|
32
|
+
"load_verify_demo",
|
|
33
|
+
"qa_failures",
|
|
34
|
+
"qa_trace",
|
|
35
|
+
"run_audit_benchmark",
|
|
36
|
+
"verify_trace",
|
|
37
|
+
"verify_trace_file",
|
|
38
|
+
]
|