contexttrace 0.6.0__tar.gz → 0.7.0__tar.gz

This diff compares the contents of two publicly released versions of a package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
Files changed (63)
  1. {contexttrace-0.6.0 → contexttrace-0.7.0}/PKG-INFO +10 -2
  2. {contexttrace-0.6.0 → contexttrace-0.7.0}/README.md +19 -11
  3. contexttrace-0.7.0/contexttrace/_version.py +1 -0
  4. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/cli.py +362 -28
  5. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/__init__.py +18 -18
  6. contexttrace-0.7.0/contexttrace/verify/suite.py +662 -0
  7. contexttrace-0.7.0/contexttrace/verify/suite_report.py +316 -0
  8. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace.egg-info/SOURCES.txt +2 -0
  9. {contexttrace-0.6.0 → contexttrace-0.7.0}/pyproject.toml +1 -1
  10. contexttrace-0.6.0/contexttrace/_version.py +0 -1
  11. {contexttrace-0.6.0 → contexttrace-0.7.0}/MANIFEST.in +0 -0
  12. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/__init__.py +0 -0
  13. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/capture.py +0 -0
  14. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/capture_endpoint.py +0 -0
  15. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/client.py +0 -0
  16. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/config.py +0 -0
  17. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/demo.py +0 -0
  18. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/demo_data.py +0 -0
  19. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/endpoint_eval.py +0 -0
  20. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/errors.py +0 -0
  21. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/evaluator.py +0 -0
  22. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/integrations/__init__.py +0 -0
  23. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/integrations/fastapi.py +0 -0
  24. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/integrations/langchain.py +0 -0
  25. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/integrations/langgraph.py +0 -0
  26. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/integrations/llamaindex.py +0 -0
  27. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/integrations/opentelemetry.py +0 -0
  28. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/local.py +0 -0
  29. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/py.typed +0 -0
  30. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/regression.py +0 -0
  31. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/reliability.py +0 -0
  32. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/report.py +0 -0
  33. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/storage/__init__.py +0 -0
  34. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/storage/sqlite_store.py +0 -0
  35. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/thresholds.py +0 -0
  36. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/transport.py +0 -0
  37. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/abstention.py +0 -0
  38. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/audit.py +0 -0
  39. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/audit_benchmark.py +0 -0
  40. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/audit_benchmark_cases.json +0 -0
  41. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/audit_report.py +0 -0
  42. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/benchmark.py +0 -0
  43. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/citations.py +0 -0
  44. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/claims.py +0 -0
  45. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/compare.py +0 -0
  46. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/compare_report.py +0 -0
  47. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/demos.py +0 -0
  48. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/evidence.py +0 -0
  49. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/external_benchmark_cases.json +0 -0
  50. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/facts.py +0 -0
  51. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/qa.py +0 -0
  52. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/qa_report.py +0 -0
  53. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/real_benchmark_cases.json +0 -0
  54. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/report.py +0 -0
  55. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/root_cause.py +0 -0
  56. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/runner.py +0 -0
  57. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/schema.py +0 -0
  58. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/spans.py +0 -0
  59. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/trace_inspect.py +0 -0
  60. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/verify/verdicts.py +0 -0
  61. {contexttrace-0.6.0 → contexttrace-0.7.0}/contexttrace/viewer.py +0 -0
  62. {contexttrace-0.6.0 → contexttrace-0.7.0}/setup.cfg +0 -0
  63. {contexttrace-0.6.0 → contexttrace-0.7.0}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: contexttrace
3
- Version: 0.6.0
3
+ Version: 0.7.0
4
4
  Summary: Local-first SDK and CLI for RAG and agent reliability tracing, citation checks, and failure diagnosis.
5
5
  Author: ContextTrace contributors
6
6
  License: MIT
@@ -176,6 +176,12 @@ contexttrace verify-benchmark --case-set external --mode semantic --report
176
176
  contexttrace compare baseline.json current.json
177
177
  contexttrace compare baseline.json current.json --report
178
178
  contexttrace compare baseline.json current.json --fail-on new_failure
179
+ contexttrace suite create traces/*.json --out contexttrace-suite.json
180
+ contexttrace suite add contexttrace-suite.json traces/new_failure.json
181
+ contexttrace suite list contexttrace-suite.json
182
+ contexttrace suite run contexttrace-suite.json --endpoint http://localhost:8000/query --report
183
+ contexttrace suite prune contexttrace-suite.json --results .contexttrace/suites/contexttrace-regression-suite_results.json
184
+ contexttrace suite report .contexttrace/suites/contexttrace-regression-suite_results.json
179
185
  contexttrace audit trace.json --corpus docs/
180
186
  contexttrace audit trace.json --corpus docs/ --report
181
187
  contexttrace audit trace.json --corpus docs/ --fail-on retrieval_miss
@@ -204,11 +210,13 @@ write_rag_trace(trace, "trace.json")
204
210
 
205
211
  Use `contexttrace compare baseline.json current.json` to diff two portable traces or saved `verify --json` outputs. It reports support-rate deltas, new unsupported claims, citation regressions, should-abstain flips, and new root causes, with `--fail-on` gates for CI.
206
212
 
213
+ Use `contexttrace suite create`, `suite add`, and `suite run` to turn saved failures into replayable endpoint tests. Suite runs call your current RAG endpoint with the saved query, verify the new answer, compare it with the baseline trace, and exit non-zero when a saved failure still reproduces or a good case regresses. Use `suite list`, `suite remove`, and `suite prune` to manage the suite as failures are fixed or retired.
214
+
207
215
  Use `contexttrace audit trace.json --corpus docs/` to diagnose whether an unsupported claim failed because retrieval missed evidence, reranking buried it, chunking omitted the supporting span, the corpus lacks coverage, or generation overclaimed. Audit output includes failure stages, diagnostic signals, and prioritized next actions.
208
216
 
209
217
  Use `contexttrace audit-benchmark --case-set real --mode semantic` to test retrieval-audit labels against bundled public OSS documentation and GitHub issue snippets from Qdrant, Chroma, Haystack, LangChain, and ContextTrace docs.
210
218
 
211
- The v0.6.0 verifier uses local lexical heuristics by default. Claim extraction is rule-based, contradiction detection is conservative, and semantic or LLM-judge support can be added later.
219
+ The v0.7.0 verifier uses local lexical heuristics by default. Claim extraction is rule-based, contradiction detection is conservative, and semantic or LLM-judge support can be added later.
212
220
 
213
221
  ## What It Catches
214
222
 
@@ -116,12 +116,18 @@ contexttrace verify trace.json --fail-on unsupported --fail-on citation_mismatch
116
116
  contexttrace verify-benchmark --mode semantic
117
117
  contexttrace verify-benchmark --mode semantic --report
118
118
  contexttrace verify-benchmark --case-set external --mode semantic --report
119
- contexttrace compare baseline.json current.json
120
- contexttrace compare baseline.json current.json --report
121
- contexttrace compare baseline.json current.json --fail-on new_failure
122
- contexttrace audit trace.json --corpus docs/
123
- contexttrace audit trace.json --corpus docs/ --report
124
- contexttrace audit trace.json --corpus docs/ --fail-on retrieval_miss
119
+ contexttrace compare baseline.json current.json
120
+ contexttrace compare baseline.json current.json --report
121
+ contexttrace compare baseline.json current.json --fail-on new_failure
122
+ contexttrace suite create traces/*.json --out contexttrace-suite.json
123
+ contexttrace suite add contexttrace-suite.json traces/new_failure.json
124
+ contexttrace suite list contexttrace-suite.json
125
+ contexttrace suite run contexttrace-suite.json --endpoint http://localhost:8000/query --report
126
+ contexttrace suite prune contexttrace-suite.json --results .contexttrace/suites/contexttrace-regression-suite_results.json
127
+ contexttrace suite report .contexttrace/suites/contexttrace-regression-suite_results.json
128
+ contexttrace audit trace.json --corpus docs/
129
+ contexttrace audit trace.json --corpus docs/ --report
130
+ contexttrace audit trace.json --corpus docs/ --fail-on retrieval_miss
125
131
  contexttrace audit-benchmark --case-set real --mode semantic
126
132
  contexttrace audit-benchmark --case-set real --mode semantic --report
127
133
  ```
@@ -144,14 +150,16 @@ from contexttrace import capture_rag_trace, write_rag_trace
144
150
  trace = capture_rag_trace(query=question, answer=answer, contexts=retrieved_docs)
145
151
  write_rag_trace(trace, "trace.json")
146
152
  ```
147
-
148
- Use `contexttrace compare baseline.json current.json` to diff two portable traces or saved `verify --json` outputs. It reports support-rate deltas, new unsupported claims, citation regressions, should-abstain flips, and new root causes, with `--fail-on` gates for CI.
149
-
150
- Use `contexttrace audit trace.json --corpus docs/` to diagnose whether an unsupported claim failed because retrieval missed evidence, reranking buried it, chunking omitted the supporting span, the corpus lacks coverage, or generation overclaimed. Audit output includes failure stages, diagnostic signals, and prioritized next actions.
153
+
154
+ Use `contexttrace compare baseline.json current.json` to diff two portable traces or saved `verify --json` outputs. It reports support-rate deltas, new unsupported claims, citation regressions, should-abstain flips, and new root causes, with `--fail-on` gates for CI.
155
+
156
+ Use `contexttrace suite create`, `suite add`, and `suite run` to turn saved failures into replayable endpoint tests. Suite runs call your current RAG endpoint with the saved query, verify the new answer, compare it with the baseline trace, and exit non-zero when a saved failure still reproduces or a good case regresses. Use `suite list`, `suite remove`, and `suite prune` to manage the suite as failures are fixed or retired.
157
+
158
+ Use `contexttrace audit trace.json --corpus docs/` to diagnose whether an unsupported claim failed because retrieval missed evidence, reranking buried it, chunking omitted the supporting span, the corpus lacks coverage, or generation overclaimed. Audit output includes failure stages, diagnostic signals, and prioritized next actions.
151
159
 
152
160
  Use `contexttrace audit-benchmark --case-set real --mode semantic` to test retrieval-audit labels against bundled public OSS documentation and GitHub issue snippets from Qdrant, Chroma, Haystack, LangChain, and ContextTrace docs.
153
161
 
154
- The v0.6.0 verifier uses local lexical heuristics by default. Claim extraction is rule-based, contradiction detection is conservative, and semantic or LLM-judge support can be added later.
162
+ The v0.7.0 verifier uses local lexical heuristics by default. Claim extraction is rule-based, contradiction detection is conservative, and semantic or LLM-judge support can be added later.
155
163
 
156
164
  ## What It Catches
157
165
 
@@ -0,0 +1 @@
1
+ __version__ = "0.7.0"
@@ -25,24 +25,38 @@ from contexttrace.report import ReportGenerator
25
25
  from contexttrace.storage import SQLiteTraceStore
26
26
  from contexttrace.thresholds import parse_thresholds, threshold_failures
27
27
  from contexttrace.verify import (
28
- VerificationInputError,
29
- audit_failures,
30
- audit_trace,
31
- compare_failures,
32
- compare_trace_files,
33
- list_verify_demos,
34
- load_trace_file,
35
- load_verify_demo,
36
- verify_trace,
37
- )
28
+ VerificationInputError,
29
+ audit_failures,
30
+ audit_trace,
31
+ compare_failures,
32
+ compare_trace_files,
33
+ list_verify_demos,
34
+ load_trace_file,
35
+ load_verify_demo,
36
+ verify_trace,
37
+ )
38
38
  from contexttrace.verify.benchmark import run_verify_benchmark, write_verify_benchmark_report
39
39
  from contexttrace.verify.audit_benchmark import run_audit_benchmark, write_audit_benchmark_report
40
40
  from contexttrace.verify.audit_report import AuditReportGenerator
41
41
  from contexttrace.verify.compare_report import CompareReportGenerator
42
- from contexttrace.verify.qa import qa_failures, qa_trace
43
- from contexttrace.verify.qa_report import QAReportGenerator
44
- from contexttrace.verify.report import VerifyReportGenerator
45
- from contexttrace.verify.trace_inspect import inspect_trace
42
+ from contexttrace.verify.qa import qa_failures, qa_trace
43
+ from contexttrace.verify.qa_report import QAReportGenerator
44
+ from contexttrace.verify.report import VerifyReportGenerator
45
+ from contexttrace.verify.suite import (
46
+ add_trace_files_to_suite,
47
+ create_suite_from_trace_files,
48
+ list_suite_cases,
49
+ load_suite_file,
50
+ load_suite_result_file,
51
+ prune_suite_cases,
52
+ remove_suite_cases,
53
+ run_suite,
54
+ suite_failures,
55
+ write_suite_file,
56
+ write_suite_result,
57
+ )
58
+ from contexttrace.verify.suite_report import SuiteReportGenerator
59
+ from contexttrace.verify.trace_inspect import inspect_trace
46
60
  from contexttrace.viewer import serve_viewer
47
61
 
48
62
 
@@ -315,7 +329,7 @@ def inspect_command(trace_json: str, json_output: bool) -> int:
315
329
  @click.option("--out", default=None, help="HTML report path. Implies --report when provided.")
316
330
  @click.option("--mode", default="lexical", show_default=True, type=click.Choice(["lexical", "semantic"]), help="Evidence scoring mode.")
317
331
  @click.option("--fail-on", multiple=True, help="Fail on high_risk, medium_risk, any_risk, unsupported, should_abstain, audit_failure, or inspect_warning.")
318
- def qa_command(
332
+ def qa_command(
319
333
  trace_json: str,
320
334
  corpus_path: Optional[str],
321
335
  json_output: bool,
@@ -376,11 +390,297 @@ def qa_command(
376
390
  click.echo("Report: %s" % written_report)
377
391
  for message in fail_messages:
378
392
  click.echo("QA failed: %s" % message, err=True)
379
- return 1 if fail_messages else 0
380
-
381
-
382
- @cli.command("verify-demo")
383
- @click.argument("demo_name", required=False, default="unsupported_claim")
393
+ return 1 if fail_messages else 0
394
+
395
+
396
+ @cli.group("suite")
397
+ def suite_group() -> None:
398
+ """Create and run local RAG regression suites."""
399
+
400
+
401
+ @suite_group.command("create")
402
+ @click.argument("trace_json", nargs=-1, required=True)
403
+ @click.option("--out", default="contexttrace-suite.json", show_default=True, help="Suite JSON file to write.")
404
+ @click.option("--name", default=None, help="Suite name.")
405
+ @click.option("--mode", default="lexical", show_default=True, type=click.Choice(["lexical", "semantic"]), help="Evidence scoring mode for baseline QA.")
406
+ @click.option("--corpus", "corpus_path", default=None, help="Optional local corpus directory or file for baseline retrieval/corpus audit.")
407
+ def suite_create_command(
408
+ trace_json: tuple[str, ...],
409
+ out: str,
410
+ name: Optional[str],
411
+ mode: str,
412
+ corpus_path: Optional[str],
413
+ ) -> int:
414
+ """Create a suite from saved portable RAG trace files."""
415
+
416
+ try:
417
+ suite = create_suite_from_trace_files(
418
+ trace_json,
419
+ name=name,
420
+ mode=mode,
421
+ corpus_path=corpus_path,
422
+ )
423
+ written = write_suite_file(suite, out)
424
+ except VerificationInputError as exc:
425
+ raise click.ClickException(str(exc)) from exc
426
+
427
+ click.echo("Suite: %s" % written)
428
+ click.echo("Cases: %s" % len(suite.get("cases") or []))
429
+ click.echo("Policy: saved cases must pass on replay")
430
+ return 0
431
+
432
+
433
+ @suite_group.command("add")
434
+ @click.argument("suite_json")
435
+ @click.argument("trace_json", nargs=-1, required=True)
436
+ @click.option("--out", default=None, help="Suite JSON file to write. Defaults to overwriting suite_json.")
437
+ @click.option("--mode", default=None, type=click.Choice(["lexical", "semantic"]), help="Evidence scoring mode for added baselines. Defaults to the suite mode.")
438
+ @click.option("--corpus", "corpus_path", default=None, help="Optional local corpus directory or file for baseline retrieval/corpus audit.")
439
+ @click.option("--replace", is_flag=True, help="Replace existing cases with the same generated case IDs.")
440
+ def suite_add_command(
441
+ suite_json: str,
442
+ trace_json: tuple[str, ...],
443
+ out: Optional[str],
444
+ mode: Optional[str],
445
+ corpus_path: Optional[str],
446
+ replace: bool,
447
+ ) -> int:
448
+ """Add saved portable RAG traces to an existing suite."""
449
+
450
+ try:
451
+ suite = load_suite_file(suite_json)
452
+ result = add_trace_files_to_suite(
453
+ suite,
454
+ trace_json,
455
+ mode=mode,
456
+ corpus_path=corpus_path,
457
+ replace=replace,
458
+ )
459
+ written = write_suite_file(result["suite"], out or suite_json)
460
+ except VerificationInputError as exc:
461
+ raise click.ClickException(str(exc)) from exc
462
+
463
+ click.echo("Suite: %s" % written)
464
+ click.echo("Added: %s" % len(result["added_case_ids"]))
465
+ if result["added_case_ids"]:
466
+ click.echo("Added case IDs: %s" % ", ".join(result["added_case_ids"]))
467
+ click.echo("Replaced: %s" % result["replaced"])
468
+ click.echo("Cases: %s" % len(result["suite"].get("cases") or []))
469
+ return 0
470
+
471
+
472
+ @suite_group.command("list")
473
+ @click.argument("suite_json")
474
+ @click.option("--json", "json_output", is_flag=True, help="Print cases as JSON.")
475
+ def suite_list_command(suite_json: str, json_output: bool) -> int:
476
+ """List cases in a local regression suite."""
477
+
478
+ try:
479
+ suite = load_suite_file(suite_json)
480
+ rows = list_suite_cases(suite)
481
+ except VerificationInputError as exc:
482
+ raise click.ClickException(str(exc)) from exc
483
+
484
+ if json_output:
485
+ click.echo(json.dumps({"suite": suite.get("name"), "cases": rows}, indent=2))
486
+ return 0
487
+
488
+ click.echo("Suite: %s" % (suite.get("name") or suite_json))
489
+ click.echo("Cases: %s" % len(rows))
490
+ click.echo("id\tbaseline_risk\tbaseline_issue\tsupport_rate\tquery")
491
+ for row in rows:
492
+ click.echo(
493
+ "%s\t%s\t%s\t%s\t%s"
494
+ % (
495
+ row.get("id"),
496
+ row.get("baseline_risk_level") or "",
497
+ row.get("baseline_primary_issue") or "",
498
+ row.get("baseline_support_rate") if row.get("baseline_support_rate") is not None else "",
499
+ _preview(row.get("query"), limit=90),
500
+ )
501
+ )
502
+ return 0
503
+
504
+
505
+ @suite_group.command("remove")
506
+ @click.argument("suite_json")
507
+ @click.argument("case_id", nargs=-1, required=True)
508
+ @click.option("--out", default=None, help="Suite JSON file to write. Defaults to overwriting suite_json.")
509
+ def suite_remove_command(suite_json: str, case_id: tuple[str, ...], out: Optional[str]) -> int:
510
+ """Remove one or more case IDs from a suite."""
511
+
512
+ try:
513
+ suite = load_suite_file(suite_json)
514
+ result = remove_suite_cases(suite, case_id)
515
+ written = write_suite_file(result["suite"], out or suite_json)
516
+ except VerificationInputError as exc:
517
+ raise click.ClickException(str(exc)) from exc
518
+
519
+ click.echo("Suite: %s" % written)
520
+ click.echo("Removed: %s" % len(result["removed_case_ids"]))
521
+ if result["removed_case_ids"]:
522
+ click.echo("Removed case IDs: %s" % ", ".join(result["removed_case_ids"]))
523
+ if result["missing_case_ids"]:
524
+ click.echo("Missing case IDs: %s" % ", ".join(result["missing_case_ids"]))
525
+ click.echo("Cases: %s" % len(result["suite"].get("cases") or []))
526
+ return 1 if result["missing_case_ids"] else 0
527
+
528
+
529
+ @suite_group.command("prune")
530
+ @click.argument("suite_json")
531
+ @click.option("--results", "results_json", required=True, help="Suite result JSON from `contexttrace suite run`.")
532
+ @click.option("--status", "statuses", multiple=True, default=("passed",), show_default=True, help="Result status to remove. May be repeated.")
533
+ @click.option("--out", default=None, help="Suite JSON file to write. Defaults to overwriting suite_json.")
534
+ def suite_prune_command(
535
+ suite_json: str,
536
+ results_json: str,
537
+ statuses: tuple[str, ...],
538
+ out: Optional[str],
539
+ ) -> int:
540
+ """Remove cases by status from a saved suite result."""
541
+
542
+ try:
543
+ suite = load_suite_file(suite_json)
544
+ result_payload = load_suite_result_file(results_json)
545
+ result = prune_suite_cases(suite, result_payload, statuses=statuses)
546
+ written = write_suite_file(result["suite"], out or suite_json)
547
+ except VerificationInputError as exc:
548
+ raise click.ClickException(str(exc)) from exc
549
+
550
+ click.echo("Suite: %s" % written)
551
+ click.echo("Pruned statuses: %s" % ", ".join(result["statuses"]))
552
+ click.echo("Removed: %s" % len(result["removed_case_ids"]))
553
+ if result["removed_case_ids"]:
554
+ click.echo("Removed case IDs: %s" % ", ".join(result["removed_case_ids"]))
555
+ click.echo("Cases: %s" % len(result["suite"].get("cases") or []))
556
+ return 0
557
+
558
+
559
+ @suite_group.command("run")
560
+ @click.argument("suite_json")
561
+ @click.option("--endpoint", default=None, help="RAG endpoint URL. Defaults to config eval_endpoint.")
562
+ @click.option("--method", default="POST", type=click.Choice(["GET", "POST"], case_sensitive=False), help="Endpoint method.")
563
+ @click.option("--input-key", default="question", show_default=True, help="Request body/query key for the question.")
564
+ @click.option("--answer-path", default="$.answer", show_default=True, help="JSONPath for answer extraction.")
565
+ @click.option("--contexts-path", default="$.contexts", show_default=True, help="JSONPath for context extraction.")
566
+ @click.option("--citations-path", default="$.citations", show_default=True, help="JSONPath for citation extraction.")
567
+ @click.option("--metadata-path", default="$.metadata", show_default=True, help="JSONPath for response metadata extraction.")
568
+ @click.option("--body-template", default=None, help="JSON body template. Use {{query}} where the question should be inserted.")
569
+ @click.option("--endpoint-header", multiple=True, help="Header formatted as Name:Value. May be repeated.")
570
+ @click.option("--timeout", default=30.0, show_default=True, type=float, help="Per-request timeout.")
571
+ @click.option("--corpus", "corpus_path", default=None, help="Optional local corpus directory or file for retrieval/corpus audit.")
572
+ @click.option("--out", default=None, help="Suite result JSON path.")
573
+ @click.option("--json", "json_output", is_flag=True, help="Print the full suite result as JSON.")
574
+ @click.option("--report", is_flag=True, help="Generate a local HTML suite report.")
575
+ @click.option("--report-out", default=None, help="HTML report path. Implies --report when provided.")
576
+ @click.option("--mode", default=None, type=click.Choice(["lexical", "semantic"]), help="Evidence scoring mode. Defaults to the suite mode.")
577
+ @click.option("--fail-on", multiple=True, help="Fail on failed_case, regression, unsupported, should_abstain, high_risk, medium_risk, error, or any_failure.")
578
+ @click.pass_context
579
+ def suite_run_command(
580
+ ctx: click.Context,
581
+ suite_json: str,
582
+ endpoint: Optional[str],
583
+ method: str,
584
+ input_key: str,
585
+ answer_path: str,
586
+ contexts_path: str,
587
+ citations_path: str,
588
+ metadata_path: str,
589
+ body_template: Optional[str],
590
+ endpoint_header: tuple[str, ...],
591
+ timeout: float,
592
+ corpus_path: Optional[str],
593
+ out: Optional[str],
594
+ json_output: bool,
595
+ report: bool,
596
+ report_out: Optional[str],
597
+ mode: Optional[str],
598
+ fail_on: tuple[str, ...],
599
+ ) -> int:
600
+ """Replay a regression suite against a running RAG endpoint."""
601
+
602
+ config = _load(ctx)
603
+ resolved_endpoint = endpoint or config.eval_endpoint
604
+ if not resolved_endpoint:
605
+ raise click.ClickException("--endpoint or eval_endpoint in contexttrace.yaml is required.")
606
+
607
+ try:
608
+ suite = load_suite_file(suite_json)
609
+ body = json.loads(body_template) if body_template else None
610
+ result = run_suite(
611
+ suite,
612
+ endpoint=resolved_endpoint,
613
+ method=method,
614
+ headers=_parse_headers(list(endpoint_header)),
615
+ body_template=body,
616
+ input_key=input_key,
617
+ answer_path=answer_path,
618
+ contexts_path=contexts_path,
619
+ citations_path=citations_path,
620
+ metadata_path=metadata_path,
621
+ timeout=timeout,
622
+ corpus_path=corpus_path,
623
+ mode=mode,
624
+ )
625
+ except json.JSONDecodeError as exc:
626
+ raise click.ClickException(
627
+ "Invalid --body-template JSON at line %s column %s: %s"
628
+ % (exc.lineno, exc.colno, exc.msg)
629
+ ) from exc
630
+ except (RuntimeError, ValueError, VerificationInputError) as exc:
631
+ raise click.ClickException(str(exc)) from exc
632
+
633
+ output_path = out or str(
634
+ Path(".contexttrace")
635
+ / "suites"
636
+ / ("%s_results.json" % _safe_filename(str(result.get("suite_name") or Path(suite_json).stem)))
637
+ )
638
+ written_result = write_suite_result(result, output_path)
639
+
640
+ written_report = None
641
+ if report or report_out:
642
+ report_path = report_out or str(
643
+ Path(".contexttrace")
644
+ / "reports"
645
+ / ("%s_suite.html" % _safe_filename(str(result.get("suite_name") or Path(suite_json).stem)))
646
+ )
647
+ written_report = SuiteReportGenerator().generate(result, path=report_path)
648
+
649
+ effective_fail_on = fail_on or ("failed_case", "error")
650
+ fail_messages = suite_failures(result, effective_fail_on)
651
+ if json_output:
652
+ if written_report:
653
+ click.echo("Report: %s" % written_report, err=True)
654
+ click.echo("Results: %s" % written_result, err=True)
655
+ click.echo(json.dumps(result, indent=2))
656
+ for message in fail_messages:
657
+ click.echo("Suite failed: %s" % message, err=True)
658
+ return 1 if fail_messages else 0
659
+
660
+ _print_suite_result(result, written_result=written_result, written_report=written_report)
661
+ for message in fail_messages:
662
+ click.echo("Suite failed: %s" % message, err=True)
663
+ return 1 if fail_messages else 0
664
+
665
+
666
+ @suite_group.command("report")
667
+ @click.argument("results_json")
668
+ @click.option("--out", default=None, help="HTML report path.")
669
+ def suite_report_command(results_json: str, out: Optional[str]) -> int:
670
+ """Generate a local HTML report from a suite result JSON file."""
671
+
672
+ try:
673
+ result = load_suite_result_file(results_json)
674
+ except VerificationInputError as exc:
675
+ raise click.ClickException(str(exc)) from exc
676
+ output_path = out or str(Path(".contexttrace") / "reports" / ("%s.html" % Path(results_json).stem))
677
+ written = SuiteReportGenerator().generate(result, path=output_path)
678
+ click.echo("Report: %s" % written)
679
+ return 0
680
+
681
+
682
+ @cli.command("verify-demo")
683
+ @click.argument("demo_name", required=False, default="unsupported_claim")
384
684
  @click.option("--json", "json_output", is_flag=True, help="Print the full verification result as JSON.")
385
685
  @click.option("--report", is_flag=True, help="Generate a local HTML verification report.")
386
686
  @click.option("--out", default=None, help="HTML report path. Implies --report when provided.")
@@ -1144,7 +1444,7 @@ def viewer(ctx: click.Context, host: str, port: int) -> None:
1144
1444
  serve_viewer(storage_path=config.storage_path, host=host, port=port)
1145
1445
 
1146
1446
 
1147
- def main(argv: Optional[list[str]] = None) -> int:
1447
+ def main(argv: Optional[list[str]] = None) -> int:
1148
1448
  try:
1149
1449
  result = cli.main(args=argv, prog_name="contexttrace", standalone_mode=False)
1150
1450
  return int(result or 0)
@@ -1156,13 +1456,47 @@ def main(argv: Optional[list[str]] = None) -> int:
1156
1456
  except ContextTraceError as exc:
1157
1457
  click.echo("ContextTrace failed: %s" % exc, err=True)
1158
1458
  return 2
1159
- except ValueError as exc:
1160
- click.echo("ContextTrace failed: %s" % exc, err=True)
1161
- return 2
1162
-
1163
-
1164
- def _load(ctx: click.Context) -> ContextTraceConfig:
1165
- return load_config(config_path=(ctx.obj or {}).get("config_path"))
1459
+ except ValueError as exc:
1460
+ click.echo("ContextTrace failed: %s" % exc, err=True)
1461
+ return 2
1462
+
1463
+
1464
+ def _print_suite_result(
1465
+ result: dict,
1466
+ *,
1467
+ written_result: str,
1468
+ written_report: Optional[str],
1469
+ ) -> None:
1470
+ summary = result.get("summary") or {}
1471
+ click.echo("Suite: %s" % result.get("suite_name"))
1472
+ click.echo("Status: %s" % summary.get("status"))
1473
+ click.echo("Cases: %s" % summary.get("total_cases"))
1474
+ click.echo("Passed: %s" % summary.get("passed"))
1475
+ click.echo("Failed: %s" % summary.get("failed"))
1476
+ click.echo("Errors: %s" % summary.get("errors"))
1477
+ click.echo("Regressions: %s" % summary.get("regressions"))
1478
+ click.echo("Resolved failures: %s" % summary.get("resolved_failures"))
1479
+ click.echo("Average support rate: %.3f" % float(summary.get("average_support_rate") or 0.0))
1480
+ click.echo("Results: %s" % written_result)
1481
+ if written_report:
1482
+ click.echo("Report: %s" % written_report)
1483
+
1484
+ failed_cases = [case for case in result.get("cases") or [] if case.get("status") in {"failed", "error"}]
1485
+ if failed_cases:
1486
+ click.echo("Failed cases:")
1487
+ for case in failed_cases:
1488
+ failures = "; ".join(str(item) for item in case.get("failures") or []) or "unknown failure"
1489
+ click.echo("- %s: %s" % (case.get("id"), failures))
1490
+
1491
+
1492
+ def _safe_filename(value: str) -> str:
1493
+ cleaned = "".join(char if char.isalnum() or char in {"-", "_"} else "_" for char in value.strip().lower())
1494
+ cleaned = "_".join(part for part in cleaned.split("_") if part)
1495
+ return cleaned[:80] or "contexttrace"
1496
+
1497
+
1498
+ def _load(ctx: click.Context) -> ContextTraceConfig:
1499
+ return load_config(config_path=(ctx.obj or {}).get("config_path"))
1166
1500
 
1167
1501
 
1168
1502
  def _client(ctx: click.Context) -> ContextTrace:
@@ -10,10 +10,10 @@ from contexttrace.verify.schema import (
10
10
  load_trace_file,
11
11
  )
12
12
  from contexttrace.verify.demos import list_verify_demos, load_verify_demo
13
- from contexttrace.verify.qa import qa_failures, qa_trace
14
- from contexttrace.verify.trace_inspect import inspect_trace
15
-
16
- __all__ = [
13
+ from contexttrace.verify.qa import qa_failures, qa_trace
14
+ from contexttrace.verify.trace_inspect import inspect_trace
15
+
16
+ __all__ = [
17
17
  "RAGTrace",
18
18
  "TraceCitation",
19
19
  "TraceContext",
@@ -22,17 +22,17 @@ __all__ = [
22
22
  "audit_trace",
23
23
  "audit_trace_file",
24
24
  "audit_trace_with_corpus",
25
- "compare_failures",
26
- "compare_trace_files",
27
- "compare_verifications",
28
- "inspect_trace",
29
- "list_verify_demos",
30
- "load_corpus",
31
- "load_trace_file",
32
- "load_verify_demo",
33
- "qa_failures",
34
- "qa_trace",
35
- "run_audit_benchmark",
36
- "verify_trace",
37
- "verify_trace_file",
38
- ]
25
+ "compare_failures",
26
+ "compare_trace_files",
27
+ "compare_verifications",
28
+ "inspect_trace",
29
+ "list_verify_demos",
30
+ "load_corpus",
31
+ "load_trace_file",
32
+ "load_verify_demo",
33
+ "qa_failures",
34
+ "qa_trace",
35
+ "run_audit_benchmark",
36
+ "verify_trace",
37
+ "verify_trace_file",
38
+ ]