contexttrace 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contexttrace/__init__.py +36 -0
- contexttrace/_version.py +1 -0
- contexttrace/cli.py +474 -0
- contexttrace/client.py +1074 -0
- contexttrace/config.py +246 -0
- contexttrace/demo.py +311 -0
- contexttrace/demo_data.py +257 -0
- contexttrace/endpoint_eval.py +314 -0
- contexttrace/errors.py +14 -0
- contexttrace/evaluator.py +448 -0
- contexttrace/integrations/__init__.py +14 -0
- contexttrace/integrations/fastapi.py +311 -0
- contexttrace/integrations/langchain.py +440 -0
- contexttrace/integrations/langgraph.py +197 -0
- contexttrace/integrations/llamaindex.py +422 -0
- contexttrace/integrations/opentelemetry.py +111 -0
- contexttrace/local.py +325 -0
- contexttrace/py.typed +1 -0
- contexttrace/regression.py +123 -0
- contexttrace/reliability.py +284 -0
- contexttrace/report.py +550 -0
- contexttrace/storage/__init__.py +3 -0
- contexttrace/storage/sqlite_store.py +604 -0
- contexttrace/thresholds.py +50 -0
- contexttrace/transport.py +183 -0
- contexttrace/viewer.py +148 -0
- contexttrace-0.1.0.dist-info/METADATA +154 -0
- contexttrace-0.1.0.dist-info/RECORD +31 -0
- contexttrace-0.1.0.dist-info/WHEEL +5 -0
- contexttrace-0.1.0.dist-info/entry_points.txt +2 -0
- contexttrace-0.1.0.dist-info/top_level.txt +1 -0
contexttrace/__init__.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from contexttrace._version import __version__
|
|
2
|
+
from contexttrace.client import AsyncContextTrace, ContextTrace
|
|
3
|
+
from contexttrace.config import ContextTraceConfig
|
|
4
|
+
from contexttrace.errors import (
|
|
5
|
+
ContextTraceConfigError,
|
|
6
|
+
ContextTraceError,
|
|
7
|
+
ContextTraceHTTPError,
|
|
8
|
+
ContextTraceLocalError,
|
|
9
|
+
)
|
|
10
|
+
from contexttrace.integrations.fastapi import ContextTraceFastAPIMiddleware
|
|
11
|
+
from contexttrace.integrations.langchain import ContextTraceCallbackHandler
|
|
12
|
+
from contexttrace.integrations.langgraph import ContextTraceLangGraphTracer
|
|
13
|
+
from contexttrace.integrations.llamaindex import ContextTraceLlamaIndexCallbackHandler
|
|
14
|
+
from contexttrace.integrations.opentelemetry import OpenTelemetryExporter, export_contexttrace_trace
|
|
15
|
+
from contexttrace.reliability import ReliabilityScore, ReliabilityScorer
|
|
16
|
+
from contexttrace.report import ReportGenerator
|
|
17
|
+
|
|
18
|
+
# Names exported by ``from contexttrace import *``; each entry is imported
# at the top of this module.
__all__ = [
    "AsyncContextTrace",
    "ContextTrace",
    "ContextTraceConfig",
    "ContextTraceConfigError",
    "ContextTraceCallbackHandler",
    "ContextTraceError",
    "ContextTraceFastAPIMiddleware",
    "ContextTraceHTTPError",
    "ContextTraceLocalError",
    "ContextTraceLangGraphTracer",
    "ContextTraceLlamaIndexCallbackHandler",
    "OpenTelemetryExporter",
    "ReliabilityScore",
    "ReliabilityScorer",
    "ReportGenerator",
    "export_contexttrace_trace",
    "__version__",
]
|
contexttrace/_version.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Package version string.
__version__ = "0.1.0"
|
contexttrace/cli.py
ADDED
|
@@ -0,0 +1,474 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import sys
|
|
5
|
+
import urllib.error
|
|
6
|
+
import urllib.request
|
|
7
|
+
import webbrowser
|
|
8
|
+
from dataclasses import asdict
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Optional
|
|
11
|
+
|
|
12
|
+
import click
|
|
13
|
+
|
|
14
|
+
from contexttrace._version import __version__
|
|
15
|
+
from contexttrace.client import ContextTrace
|
|
16
|
+
from contexttrace.config import ContextTraceConfig, load_config, write_default_config
|
|
17
|
+
from contexttrace.demo import run_demo_dataset
|
|
18
|
+
from contexttrace.demo_data import list_demo_datasets
|
|
19
|
+
from contexttrace.endpoint_eval import run_endpoint_eval
|
|
20
|
+
from contexttrace.errors import ContextTraceError
|
|
21
|
+
from contexttrace.regression import BENCHMARK_STRATEGIES, run_local_benchmark
|
|
22
|
+
from contexttrace.report import ReportGenerator
|
|
23
|
+
from contexttrace.storage import SQLiteTraceStore
|
|
24
|
+
from contexttrace.thresholds import parse_thresholds, threshold_failures
|
|
25
|
+
from contexttrace.viewer import serve_viewer
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# Starter eval dataset that ``init`` serialises to JSON under ``evals/``.
SAMPLE_QUESTIONS = [
    {
        "id": "refund_policy",
        "query": "What is the refund policy?",
        "expected_sources": ["refund_policy.md"],
    }
]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# Root command group. It only records the --config path; subcommands read it
# back from ctx.obj. No docstring on purpose: click would print it as help.
@click.group(context_settings={"help_option_names": ["-h", "--help"]})
@click.option("--config", "config_path", default=None, help="Path to contexttrace.yaml.")
@click.version_option(version=__version__, prog_name="contexttrace")
@click.pass_context
def cli(ctx: click.Context, config_path: Optional[str]) -> None:
    shared_state = {"config_path": config_path}
    ctx.obj = shared_state
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
# Scaffolds a project: writes the config file, creates the local store and
# report directories, seeds the sample eval datasets and opens the SQLite store.
@cli.command()
@click.option("--path", default="contexttrace.yaml", help="Configuration file to write.")
@click.option("--force", is_flag=True, help="Overwrite an existing config file.")
def init(path: str, force: bool) -> None:
    config_path = write_default_config(path, overwrite=force)
    config = load_config(config_path=config_path)

    # Directories are created in the same order as before: store, reports, evals.
    store_dir = Path(config.local_store_dir)
    evals_dir = Path("evals")
    for directory in (store_dir, store_dir / "reports", evals_dir):
        directory.mkdir(parents=True, exist_ok=True)

    # The same sample payload is written under the current and the legacy
    # file name; existing files are only replaced when --force is given.
    sample_path = evals_dir / "questions.json"
    legacy_sample_path = evals_dir / "sample_questions.json"
    for target in (sample_path, legacy_sample_path):
        if force or not target.exists():
            target.write_text(json.dumps(SAMPLE_QUESTIONS, indent=2), encoding="utf-8")

    # Instantiated only for its side effect; the instance is not kept.
    SQLiteTraceStore(config.storage_path)

    click.echo("Wrote %s" % config_path)
    click.echo("Initialized local trace store: %s" % config.storage_path)
    click.echo("Created sample eval dataset: %s" % sample_path)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
# Prints a short summary of the resolved configuration and the local store.
@cli.command()
@click.pass_context
def status(ctx: click.Context) -> None:
    settings = _load(ctx)
    trace_store = SQLiteTraceStore(settings.storage_path)
    latest_run = trace_store.last_eval_run()

    click.echo("Project: %s" % settings.project)
    click.echo("Mode: %s" % settings.mode)
    click.echo("Local DB: %s" % settings.storage_path)
    click.echo("Trace count: %s" % trace_store.trace_count())

    # A missing run, or a run without an id, is reported as the text "None".
    latest_run_id = (latest_run or {}).get("id") or "None"
    click.echo("Last eval run: %s" % latest_run_id)
    click.echo("Judge provider: %s" % settings.judge_provider)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# Sub-group that hosts the ``config show`` command; the docstring below is
# what click displays as the group's help text.
@cli.group()
def config() -> None:
    """Inspect ContextTrace configuration."""
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
# Dumps the resolved configuration as sorted, indented JSON. The api_key
# value is masked unless --show-secrets is passed.
@config.command("show")
@click.option("--show-secrets", is_flag=True, help="Show API keys instead of masking them.")
@click.pass_context
def config_show(ctx: click.Context, show_secrets: bool) -> None:
    payload = asdict(_load(ctx))
    hide_secret = not show_secrets
    if payload.get("api_key") and hide_secret:
        masked = _mask_secret(str(payload["api_key"]))
        payload["api_key"] = masked
    rendered = json.dumps(payload, indent=2, sort_keys=True)
    click.echo(rendered)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
# Sub-group that hosts ``traces list`` and ``traces show``; the docstring
# below is what click displays as the group's help text.
@cli.group()
def traces() -> None:
    """Inspect local traces."""
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
# Prints up to --limit traces as tab-separated rows preceded by a header row.
@traces.command("list")
@click.option("--limit", default=20, show_default=True, help="Maximum traces to show.")
@click.pass_context
def traces_list(ctx: click.Context, limit: int) -> None:
    rows = _client(ctx).list_traces(limit=limit)
    if not rows:
        click.echo("No traces found.")
        return

    click.echo("trace_id\tquery\tfailure_type\tcitation_support\tcreated_at")
    for trace in rows:
        # Any missing or null nested section is treated as an empty mapping.
        evaluation = trace.get("evaluation") or {}
        failure = evaluation.get("failure") or {}
        scores = evaluation.get("scores") or {}
        columns = (
            trace.get("id") or trace.get("trace_id"),
            _preview(trace.get("query")),
            failure.get("failure_type") or "not_evaluated",
            scores.get("citation_support", ""),
            trace.get("created_at") or "",
        )
        # str() per column matches what the "%s" placeholders produced.
        click.echo("\t".join(str(column) for column in columns))
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
# Prints a human-readable summary of one trace.
# Changes from the previous version:
#   * a None lookup result becomes a ClickException instead of an
#     AttributeError (the report command already treats get_trace() as
#     possibly returning None);
#   * the identifier falls back to the "trace_id" key, as traces_list does.
@traces.command("show")
@click.argument("trace_id")
@click.pass_context
def traces_show(ctx: click.Context, trace_id: str) -> None:
    trace = _client(ctx).get_trace(trace_id)
    if trace is None:
        raise click.ClickException("Trace not found: %s" % trace_id)

    # Any missing or null nested section is treated as an empty mapping.
    answer = trace.get("answer") or {}
    evaluation = trace.get("evaluation") or {}
    failure = evaluation.get("failure") or {}
    scores = evaluation.get("scores") or {}

    click.echo("Trace: %s" % (trace.get("id") or trace.get("trace_id")))
    click.echo("Project: %s" % trace.get("project"))
    click.echo("Query: %s" % trace.get("query"))
    click.echo("Answer: %s" % _preview(answer.get("answer"), limit=500))
    click.echo("Failure type: %s" % (failure.get("failure_type") or "not_evaluated"))
    click.echo("Severity: %s" % (failure.get("severity") or "unknown"))
    click.echo("Citation support: %s" % scores.get("citation_support", ""))
    click.echo("Unsupported claim rate: %s" % scores.get("unsupported_claim_rate", ""))
    click.echo("Chunks: %s" % len(trace.get("chunks") or []))
    click.echo("Citation checks: %s" % len(trace.get("citation_checks") or []))
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
# ``contexttrace trace ...`` group; the docstring below is its help text.
@cli.group("trace")
def trace_alias() -> None:
    """Backward-compatible alias for traces."""


# Register the same command objects used by the ``traces`` group.
trace_alias.add_command(traces_list, "list")
trace_alias.add_command(traces_show, "show")
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
# Writes an HTML report for an eval run (--eval-run) or for a single trace
# (--trace-id, otherwise the most recent trace), optionally opening it.
@cli.command()
@click.option("--last", is_flag=True, help="Export the most recent trace.")
@click.option("--trace-id", default=None, help="Trace ID to export.")
@click.option("--eval-run", default=None, help="Eval run ID to export.")
@click.option("--output", default=None, help="HTML file to write.")
@click.option("--open", "open_browser", is_flag=True, help="Open the report in the default browser.")
@click.pass_context
def report(
    ctx: click.Context,
    last: bool,
    trace_id: Optional[str],
    eval_run: Optional[str],
    output: Optional[str],
    open_browser: bool,
) -> None:
    config = _load(ctx)
    client = _client(ctx)
    report_dir = Path(config.local_store_dir) / "reports"
    report_dir.mkdir(parents=True, exist_ok=True)

    if not eval_run:
        # --last wins when given; it is also the default without --trace-id.
        use_last = last or not trace_id
        if use_last:
            selected = client.last_trace()
        else:
            selected = client.get_trace(str(trace_id))
        if selected is None:
            raise click.ClickException("No traces found.")
        output_path = output or str(report_dir / ("%s.html" % selected["id"]))
        written = ReportGenerator().generate(selected, path=output_path)
    else:
        store = SQLiteTraceStore(config.storage_path)
        run = store.get_eval_run(eval_run)
        # Only questions that carry a trace_id contribute a trace.
        traces_for_run = []
        for question in run.get("questions") or []:
            if question.get("trace_id"):
                traces_for_run.append(store.get_trace(question["trace_id"]))
        output_path = output or str(report_dir / ("%s.html" % eval_run))
        written = ReportGenerator().generate_eval_report(run, traces_for_run, path=output_path)

    click.echo("Wrote %s" % written)
    if open_browser:
        report_uri = Path(written).resolve().as_uri()
        webbrowser.open(report_uri)
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
# Runs an eval dataset against a RAG endpoint, prints the aggregate metrics,
# optionally writes markdown/JSON outputs, and returns 1 when any threshold
# is violated (0 otherwise). main() turns that return value into the exit code.
# Changes from the previous version:
#   * --summary-path now creates its parent directory, as --results-path does;
#   * the return annotation is int, matching the returned exit code.
# No docstring on purpose: click would print it as the command's help text.
@cli.command("eval")
@click.option("--dataset", required=True, help="Path to eval questions JSON.")
@click.option("--endpoint", default=None, help="RAG endpoint URL. Defaults to config eval_endpoint.")
@click.option("--method", default="POST", type=click.Choice(["GET", "POST"], case_sensitive=False), help="Endpoint method.")
@click.option("--input-key", default="question", show_default=True, help="Request body/query key for the question.")
@click.option("--answer-path", default="$.answer", show_default=True, help="JSONPath for answer extraction.")
@click.option("--contexts-path", default="$.contexts", show_default=True, help="JSONPath for context extraction.")
@click.option("--citations-path", default="$.citations", show_default=True, help="JSONPath for citation extraction.")
@click.option("--body-template", default=None, help="JSON body template. Use {{query}} where the question should be inserted.")
@click.option("--endpoint-header", multiple=True, help="Header formatted as Name:Value. May be repeated.")
@click.option("--timeout", default=30.0, show_default=True, type=float, help="Per-request timeout.")
@click.option("--report-path", default=None, help="HTML report path. Defaults to .contexttrace/reports/eval_<id>.html.")
@click.option("--api-key", default=None, help="Accepted for compatibility; local mode does not require it.")
@click.option("--contexttrace-url", default=None, help="Accepted for compatibility; local mode stores traces locally.")
@click.option("--min-citation-support", default=0.0, show_default=True, type=float, help="Fail when average citation support is below this value.")
@click.option("--max-unsupported-claim-rate", default=1.0, show_default=True, type=float, help="Fail when unsupported claim rate is above this value.")
@click.option("--max-failure-rate", default=1.0, show_default=True, type=float, help="Fail when failure rate is above this value.")
@click.option("--summary-path", default=None, help="Optional markdown summary output path.")
@click.option("--fail-on", multiple=True, help="Threshold rule such as failure_rate>0.25. May be repeated.")
@click.option("--results-path", default=None, help="Optional JSON results output path.")
@click.pass_context
def eval_command(
    ctx: click.Context,
    dataset: str,
    endpoint: Optional[str],
    method: str,
    input_key: str,
    answer_path: str,
    contexts_path: str,
    citations_path: str,
    body_template: Optional[str],
    endpoint_header: tuple[str, ...],
    timeout: float,
    report_path: Optional[str],
    api_key: Optional[str],
    contexttrace_url: Optional[str],
    min_citation_support: float,
    max_unsupported_claim_rate: float,
    max_failure_rate: float,
    summary_path: Optional[str],
    fail_on: tuple[str, ...],
    results_path: Optional[str],
) -> int:
    # api_key and contexttrace_url are accepted but never read here (their
    # help text says they exist for compatibility).
    config = _load(ctx)
    resolved_endpoint = endpoint or config.eval_endpoint
    if not resolved_endpoint:
        raise click.ClickException("--endpoint or eval_endpoint in contexttrace.yaml is required.")

    # Invalid JSON raises json.JSONDecodeError, a ValueError subclass.
    body = json.loads(body_template) if body_template else None
    result = run_endpoint_eval(
        dataset_path=dataset,
        endpoint=resolved_endpoint,
        contexttrace=_client(ctx),
        method=method,
        headers=_parse_headers(list(endpoint_header)),
        body_template=body,
        input_key=input_key,
        answer_path=answer_path,
        contexts_path=contexts_path,
        citations_path=citations_path,
        timeout=timeout,
        report_path=report_path,
    )

    click.echo("Questions tested: %s" % result.questions_tested)
    click.echo("Reliability score: %s" % result.reliability_score)
    click.echo("Failure rate: %s" % result.failure_rate)
    click.echo("Avg citation support: %s" % result.avg_citation_support)
    click.echo("Unsupported claim rate: %s" % result.unsupported_claim_rate)
    click.echo("Top failures: %s" % (", ".join(result.top_failures) or "None"))
    if result.report_path:
        click.echo("Report: %s" % result.report_path)

    if summary_path:
        summary_file = Path(summary_path)
        # Create the target directory first, mirroring --results-path below.
        summary_file.parent.mkdir(parents=True, exist_ok=True)
        summary_file.write_text(_eval_markdown(result), encoding="utf-8")
        click.echo("Summary: %s" % summary_path)
    if results_path:
        results_file = Path(results_path)
        results_file.parent.mkdir(parents=True, exist_ok=True)
        results_file.write_text(json.dumps(result.to_dict(), indent=2), encoding="utf-8")
        click.echo("Results: %s" % results_path)

    # Metric names usable in --fail-on rules; citation support is exposed
    # under both "citation_support" and "avg_citation_support".
    metrics = {
        "failure_rate": result.failure_rate,
        "citation_support": result.avg_citation_support,
        "avg_citation_support": result.avg_citation_support,
        "unsupported_claim_rate": result.unsupported_claim_rate,
        "reliability_score": result.reliability_score,
    }
    fail_on_messages = threshold_failures(metrics, parse_thresholds(fail_on))
    for message in fail_on_messages:
        click.echo("Threshold failed: %s" % message, err=True)

    failed = (
        result.avg_citation_support < min_citation_support
        or result.unsupported_claim_rate > max_unsupported_claim_rate
        or result.failure_rate > max_failure_rate
        or bool(fail_on_messages)
    )
    return 1 if failed else 0
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
# Runs a bundled (or path-specified) demo dataset and prints its summary.
@cli.command()
@click.option("--dataset", default="refund_policy", show_default=True, help="Demo dataset name or path.")
@click.option("--strategy", default="adaptive", show_default=True, help="Demo retrieval strategy.")
@click.pass_context
def demo(ctx: click.Context, dataset: str, strategy: str) -> None:
    client = _client(ctx)
    config = _load(ctx)

    # Only the final path component of --dataset is used for the file name.
    report_name = "%s_demo.html" % Path(dataset).name
    report_path = Path(config.local_store_dir) / "reports" / report_name

    result = run_demo_dataset(
        dataset=dataset,
        contexttrace=client,
        strategy=strategy,
        report_path=str(report_path),
    )

    click.echo("Dataset: %s" % result.dataset)
    click.echo("Traces created: %s" % len(result.trace_ids))
    click.echo("Reliability score: %s" % result.summary.get("reliability_score"))
    click.echo("Failure rate: %s" % result.summary.get("failure_rate"))
    click.echo("Citation support: %s" % result.summary.get("citation_support"))
    failure_names = result.summary.get("top_failures") or []
    click.echo("Top failures: %s" % (", ".join(failure_names) or "None"))
    click.echo("Report: %s" % result.report_path)
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
# Runs the local benchmark over one or more strategies, prints the summary
# and returns 1 when any threshold failed (0 otherwise). main() turns that
# return value into the exit code, so the return annotation is now int
# instead of the previous, incorrect None.
# No docstring on purpose: click would print it as the command's help text.
@cli.command()
@click.option("--dataset", required=True, help="Demo dataset name or path.")
@click.option("--strategy", "strategies", multiple=True, help="Strategy to run. May be repeated.")
@click.option("--output-dir", default=".contexttrace/benchmarks", show_default=True, help="Benchmark output directory.")
@click.option("--fail-on", multiple=True, help="Threshold rule such as failure_rate>0.25. May be repeated.")
@click.option("--report-path", default=None, help="Optional benchmark HTML report path.")
@click.pass_context
def benchmark(
    ctx: click.Context,
    dataset: str,
    strategies: tuple[str, ...],
    output_dir: str,
    fail_on: tuple[str, ...],
    report_path: Optional[str],
) -> int:
    result = run_local_benchmark(
        dataset=dataset,
        contexttrace=_client(ctx),
        output_dir=output_dir,
        # No --strategy given means the full default strategy set.
        strategies=strategies or BENCHMARK_STRATEGIES,
        fail_on=fail_on,
        report_path=report_path,
    )
    summary = result["summary"]

    click.echo("Status: %s" % result["status"])
    click.echo("Questions tested: %s" % summary.get("questions_tested"))
    click.echo("Reliability score: %s" % summary.get("reliability_score"))
    click.echo("Failure rate: %s" % summary.get("failure_rate"))
    click.echo("Citation support: %s" % summary.get("citation_support"))
    click.echo("Unsupported claim rate: %s" % summary.get("unsupported_claim_rate"))
    click.echo("Results: %s" % result["results_path"])
    click.echo("Summary: %s" % result["summary_path"])
    click.echo("Report: %s" % result["report_path"])

    failures = result["threshold_failures"]
    for failure in failures:
        click.echo("Threshold failed: %s" % failure, err=True)
    return 1 if failures else 0
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
# Runs environment checks, prints OK/FAIL per check, and raises a
# ClickException when at least one check failed.
@cli.command()
@click.pass_context
def doctor(ctx: click.Context) -> None:
    config_path = Path(ctx.obj.get("config_path") or "contexttrace.yaml")
    config = _load(ctx)

    checks = [("config exists", config_path.exists())]

    # Any exception while opening the store counts as "not writable".
    try:
        SQLiteTraceStore(config.storage_path)
    except Exception:
        sqlite_ok = False
    else:
        sqlite_ok = True
    checks.append(("SQLite writable", sqlite_ok))

    checks.append(("demo datasets available", bool(list_demo_datasets())))

    # Only the OpenAI-style judge providers need a key; others pass trivially.
    needs_key = config.judge_provider in {"openai", "openai-compatible"}
    key_ok = bool(config.api_key) if needs_key else True
    checks.append(("LLM API key present", key_ok))

    if config.eval_endpoint:
        checks.append(("endpoint reachable", _endpoint_reachable(config.eval_endpoint)))

    failed_count = 0
    for name, ok in checks:
        click.echo("%s: %s" % ("OK" if ok else "FAIL", name))
        if not ok:
            failed_count += 1
    if failed_count:
        raise click.ClickException("Doctor found %s failed check(s)." % failed_count)
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
# Starts the local trace viewer against the configured storage path.
@cli.command()
@click.option("--host", default="127.0.0.1", show_default=True)
@click.option("--port", default=8765, show_default=True, type=int)
@click.pass_context
def viewer(ctx: click.Context, host: str, port: int) -> None:
    storage_path = _load(ctx).storage_path
    serve_viewer(storage_path=storage_path, host=host, port=port)
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
def main(argv: Optional[list[str]] = None) -> int:
    """Run the CLI and return a process exit code instead of exiting.

    The click group is invoked with ``standalone_mode=False`` so exceptions
    reach this function and the invoked command's return value becomes the
    result.

    Args:
        argv: Argument list passed to click as ``args``; ``None`` is forwarded
            unchanged.

    Returns:
        The command's integer result (``None`` maps to 0), click's own exit
        code for ``Exit``/``ClickException``, 1 when the user aborts, and 2
        for ``ContextTraceError`` or ``ValueError``.
    """
    try:
        result = cli.main(args=argv, prog_name="contexttrace", standalone_mode=False)
        return int(result or 0)
    except click.exceptions.Exit as exc:
        return int(exc.exit_code or 0)
    except click.ClickException as exc:
        exc.show(file=sys.stderr)
        return int(exc.exit_code)
    except click.Abort:
        # click re-raises Abort (Ctrl-C / EOF) when standalone_mode is False;
        # report it the way standalone mode does rather than with a traceback.
        click.echo("Aborted!", err=True)
        return 1
    except (ContextTraceError, ValueError) as exc:
        click.echo("ContextTrace failed: %s" % exc, err=True)
        return 2
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
def _load(ctx: click.Context) -> ContextTraceConfig:
    """Load the configuration using the ``config_path`` stored on ``ctx.obj``.

    A missing or empty ``ctx.obj`` results in ``config_path=None``.
    """
    shared_state = ctx.obj or {}
    config_path = shared_state.get("config_path")
    return load_config(config_path=config_path)
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
def _client(ctx: click.Context) -> ContextTrace:
    """Build a ``ContextTrace`` client from the ``config_path`` on ``ctx.obj``.

    A missing or empty ``ctx.obj`` results in ``config_path=None``.
    """
    shared_state = ctx.obj or {}
    return ContextTrace(config_path=shared_state.get("config_path"))
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
def _parse_headers(values: list[str]) -> dict[str, str]:
|
|
425
|
+
headers = {}
|
|
426
|
+
for value in values:
|
|
427
|
+
if ":" not in value:
|
|
428
|
+
raise click.ClickException("Endpoint headers must be formatted as Name:Value.")
|
|
429
|
+
name, header_value = value.split(":", 1)
|
|
430
|
+
headers[name.strip()] = header_value.strip()
|
|
431
|
+
return headers
|
|
432
|
+
|
|
433
|
+
|
|
434
|
+
def _mask_secret(value: str) -> str:
|
|
435
|
+
if len(value) <= 6:
|
|
436
|
+
return "***"
|
|
437
|
+
return "%s***%s" % (value[:3], value[-3:])
|
|
438
|
+
|
|
439
|
+
|
|
440
|
+
def _preview(value: object, *, limit: int = 100) -> str:
|
|
441
|
+
text = "" if value is None else str(value).replace("\n", " ")
|
|
442
|
+
return text if len(text) <= limit else text[: limit - 1] + "..."
|
|
443
|
+
|
|
444
|
+
|
|
445
|
+
def _endpoint_reachable(endpoint: str) -> bool:
|
|
446
|
+
request = urllib.request.Request(endpoint, method="GET")
|
|
447
|
+
try:
|
|
448
|
+
urllib.request.urlopen(request, timeout=2).close()
|
|
449
|
+
return True
|
|
450
|
+
except urllib.error.HTTPError:
|
|
451
|
+
return True
|
|
452
|
+
except urllib.error.URLError:
|
|
453
|
+
return False
|
|
454
|
+
|
|
455
|
+
|
|
456
|
+
def _eval_markdown(result: object) -> str:
|
|
457
|
+
return "\n".join(
|
|
458
|
+
[
|
|
459
|
+
"# ContextTrace Local Eval Summary",
|
|
460
|
+
"",
|
|
461
|
+
"- Questions tested: %s" % getattr(result, "questions_tested", 0),
|
|
462
|
+
"- Reliability score: %s" % getattr(result, "reliability_score", 0),
|
|
463
|
+
"- Failure rate: %s" % getattr(result, "failure_rate", 0),
|
|
464
|
+
"- Average citation support: %s" % getattr(result, "avg_citation_support", 0),
|
|
465
|
+
"- Unsupported claim rate: %s" % getattr(result, "unsupported_claim_rate", 0),
|
|
466
|
+
"- Top failures: %s" % (", ".join(getattr(result, "top_failures", []) or []) or "None"),
|
|
467
|
+
"- Report: %s" % (getattr(result, "report_path", None) or "Not generated"),
|
|
468
|
+
"",
|
|
469
|
+
]
|
|
470
|
+
)
|
|
471
|
+
|
|
472
|
+
|
|
473
|
+
# Script entry point: exit the interpreter with the code returned by main().
if __name__ == "__main__":  # pragma: no cover
    sys.exit(main())
|