contexttrace 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
contexttrace/local.py ADDED
@@ -0,0 +1,325 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from pathlib import Path
5
+ from typing import Any, Optional
6
+
7
+ from contexttrace.errors import ContextTraceLocalError
8
+ from contexttrace.reliability import ReliabilityScorer
9
+ from contexttrace.storage import SQLiteTraceStore
10
+
11
+ logger = logging.getLogger("contexttrace")
12
+
13
+
14
class LocalTransport:
    """Dispatch ``/v1/...`` request paths to a local ``SQLiteTraceStore``.

    ``post`` handles writes (trace creation, retrieval/context chunks, answer,
    citations, agent events, evaluation) and ``get`` handles reads.
    """

    def __init__(
        self,
        *,
        store_dir: str = ".contexttrace",
        storage_path: Optional[str] = None,
        debug: bool = False,
        log_chunk_text: bool = True,
        log_answer_text: bool = True,
    ) -> None:
        """Open the trace store.

        Args:
            store_dir: Directory that holds the default ``contexttrace.db``.
            storage_path: Explicit database path; when given it takes
                precedence over ``store_dir``.
            debug: When true, each request's method and path are logged at
                DEBUG level.
            log_chunk_text: When false, chunk content is replaced by a
                redaction placeholder before being stored.
            log_answer_text: When false, answer text is replaced by a
                redaction placeholder before being stored.
        """
        self.storage_path = storage_path or str(Path(store_dir) / "contexttrace.db")
        self.store = SQLiteTraceStore(self.storage_path)
        self.debug = debug
        self.log_chunk_text = log_chunk_text
        self.log_answer_text = log_answer_text

    def post(self, path: str, payload: Optional[dict[str, Any]] = None) -> dict[str, Any]:
        """Handle a write request addressed by ``path``.

        ``/v1/traces/start`` creates a trace. Every other path must have the
        shape ``/v1/traces/<trace_id>/<action>`` with action one of
        ``retrieval``, ``context``, ``answer``, ``citations``,
        ``agent-events`` or ``evaluate``.

        Raises:
            ContextTraceLocalError: If the path or action is not supported.
            KeyError: If a required payload key is missing (``project`` and
                ``query`` for start; ``answer`` for the answer action when
                answer text logging is enabled).
        """
        payload = payload or {}
        self._debug("POST", path, payload)
        if path == "/v1/traces/start":
            trace = self.store.create_trace(
                project=payload["project"],
                query=payload["query"],
                metadata=payload.get("metadata") or {},
            )
            return {"trace_id": trace["id"], "project_id": trace["project_id"]}

        trace_id, action = _parse_trace_action(path)
        if action == "retrieval":
            chunks = [
                _chunk(
                    chunk,
                    index=index,
                    selected=False,
                    log_chunk_text=self.log_chunk_text,
                )
                for index, chunk in enumerate(payload.get("chunks") or [])
            ]
            accepted = self.store.upsert_chunks(trace_id, chunks, selected=False)
            return {"trace_id": trace_id, "accepted": accepted}

        if action == "context":
            # Context can be given as ids of already-stored chunks, as full
            # chunk payloads, or both; the accepted counts are summed.
            accepted = self.store.mark_context(trace_id, list(payload.get("chunk_ids") or []))
            chunks = [
                _chunk(
                    chunk,
                    index=index,
                    selected=True,
                    log_chunk_text=self.log_chunk_text,
                )
                for index, chunk in enumerate(payload.get("chunks") or [])
            ]
            accepted += self.store.upsert_chunks(trace_id, chunks, selected=True)
            return {"trace_id": trace_id, "accepted": accepted}

        if action == "answer":
            answer_text = payload["answer"] if self.log_answer_text else "[answer text redacted]"
            self.store.save_answer(
                trace_id,
                {
                    "answer": answer_text,
                    "model": payload.get("model"),
                    "usage": payload.get("usage") or {},
                    "metadata": payload.get("metadata") or {},
                },
            )
            return {"trace_id": trace_id, "accepted": 1}

        if action == "citations":
            accepted = self.store.save_citations(trace_id, list(payload.get("citations") or []))
            return {"trace_id": trace_id, "accepted": accepted}

        if action == "agent-events":
            return self.store.add_agent_event(trace_id, payload)

        if action == "evaluate":
            trace = self.store.get_trace(trace_id)
            evaluation = _evaluate_trace(trace)
            self.store.save_evaluation(trace_id, evaluation)
            return evaluation

        raise ContextTraceLocalError("Unsupported local POST path: %s" % path)

    def get(self, path: str) -> dict[str, Any]:
        """Handle a read request addressed by ``path``.

        Supported paths: ``/v1/traces`` (optional ``?limit=N``),
        ``/v1/traces/last``, ``/v1/status``,
        ``/v1/traces/<trace_id>/agent-events``, ``/v1/eval-runs``,
        ``/v1/eval-runs/<run_id>`` and ``/v1/traces/<trace_id>``.

        Raises:
            ContextTraceLocalError: If the path is not supported, or if
                ``/v1/traces/last`` is requested and no trace exists.
        """
        self._debug("GET", path, None)
        if path.startswith("/v1/traces?") or path == "/v1/traces":
            limit = _query_int(path, "limit", default=20)
            return {"traces": self.store.list_traces(limit=limit)}
        if path == "/v1/traces/last":
            trace = self.store.last_trace()
            if trace is None:
                raise ContextTraceLocalError("No local traces found.")
            return trace
        if path == "/v1/status":
            last_eval = self.store.last_eval_run()
            return {
                "storage_path": self.storage_path,
                "trace_count": self.store.trace_count(),
                "last_eval_run": last_eval,
            }
        parts = path.strip("/").split("/")
        if len(parts) == 4 and parts[:2] == ["v1", "traces"] and parts[3] == "agent-events":
            trace = self.store.get_trace(parts[2])
            return {"trace_id": parts[2], "events": trace.get("agent_events") or []}
        if len(parts) == 3 and parts[:2] == ["v1", "eval-runs"]:
            return self.store.get_eval_run(parts[2])
        if path == "/v1/eval-runs":
            return {"eval_runs": self.store.list_eval_runs()}
        # Only an exact /v1/traces/<trace_id> path is a single-trace lookup.
        # Matching on segments (instead of taking whatever follows the last
        # "/") keeps a trailing slash from producing an empty id and keeps
        # deeper unknown paths from being looked up by their last segment.
        if len(parts) == 3 and parts[:2] == ["v1", "traces"]:
            return self.store.get_trace(parts[2])
        raise ContextTraceLocalError("Unsupported local GET path: %s" % path)

    def close(self) -> None:
        """Do nothing; present so callers can close this transport uniformly."""
        return None

    def _debug(self, method: str, path: str, payload: Optional[dict[str, Any]]) -> None:
        """Log the request method and path when debug mode is on.

        ``payload`` is accepted but intentionally not written to the log.
        """
        if self.debug:
            logger.debug("%s %s", method, path)
+
134
+
135
+ def _parse_trace_action(path: str) -> tuple[str, str]:
136
+ parts = path.strip("/").split("/")
137
+ if len(parts) != 4 or parts[0] != "v1" or parts[1] != "traces":
138
+ raise ContextTraceLocalError("Unsupported local trace path: %s" % path)
139
+ return parts[2], parts[3]
140
+
141
+
142
+ def _chunk(
143
+ chunk: dict[str, Any],
144
+ *,
145
+ index: int,
146
+ selected: bool,
147
+ log_chunk_text: bool,
148
+ ) -> dict[str, Any]:
149
+ chunk_id = chunk.get("chunk_id") or chunk.get("id") or "chunk_%s" % index
150
+ content = str(chunk.get("content") or chunk.get("text") or chunk.get("page_content") or "")
151
+ return {
152
+ "chunk_id": str(chunk_id),
153
+ "content": content if log_chunk_text else "[chunk text redacted]",
154
+ "source": chunk.get("source"),
155
+ "metadata": chunk.get("metadata") or {},
156
+ "relevance_score": chunk.get("relevance_score") or chunk.get("score"),
157
+ "selected": selected,
158
+ }
159
+
160
+
161
def _evaluate_trace(trace: dict[str, Any]) -> dict[str, Any]:
    """Score a stored trace's citations and classify its failure mode.

    Each entry in ``trace["citation_checks"]`` is scored against the content
    of the chunk it cites (empty text when the chunk is unknown), then the
    overall failure type, score summary and reliability result are derived.

    Returns:
        A dict with ``scores``, ``reliability``, ``citation_checks`` and a
        ``failure`` dict (``failure_type``, ``severity``, ``root_cause``,
        ``suggested_fix``).
    """
    chunk_lookup = {}
    for item in trace.get("chunks") or []:
        chunk_lookup[item["chunk_id"]] = item
    retrieved = list(trace.get("chunks") or [])
    trace_metadata = trace.get("metadata") or {}

    checks = []
    for citation in trace.get("citation_checks") or []:
        cited_chunk = chunk_lookup.get(citation["source_chunk_id"])
        claim = citation["claim"]
        cited_text = cited_chunk.get("content", "") if cited_chunk else ""
        score = _support_score(claim, cited_text)
        checks.append(
            {
                "claim": citation["claim"],
                "source_chunk_id": citation["source_chunk_id"],
                "verdict": _verdict(score),
                "support_score": score,
                "reason": "Local lexical support score %.2f." % score,
            }
        )

    # Anything short of direct support counts as unsupported for failure
    # classification purposes.
    not_directly_supported = [
        entry for entry in checks if entry["verdict"] != "directly_supported"
    ]
    failure_type = _failure_type(
        trace, retrieved, checks, not_directly_supported, trace_metadata
    )
    if failure_type == "no_failure_detected":
        severity = "none"
    else:
        severity = "medium"
    scores = _score_summary(checks)

    scoring_input = {**trace}
    scoring_input["evaluation"] = {
        "scores": scores,
        "failure": {"failure_type": failure_type},
        "citation_checks": checks,
    }
    reliability = ReliabilityScorer().score_trace(scoring_input).to_dict()

    failure = {
        "failure_type": failure_type,
        "severity": severity,
        "root_cause": _root_cause(failure_type),
        "suggested_fix": _suggested_fix(failure_type),
    }
    return {
        "scores": scores,
        "reliability": reliability,
        "citation_checks": checks,
        "failure": failure,
    }
204
+
205
+
206
+ def _verdict(score: float) -> str:
207
+ if score >= 0.65:
208
+ return "directly_supported"
209
+ if score >= 0.35:
210
+ return "partially_supported"
211
+ return "unsupported"
212
+
213
+
214
+ def _failure_type(
215
+ trace: dict[str, Any],
216
+ chunks: list[dict[str, Any]],
217
+ evaluated: list[dict[str, Any]],
218
+ unsupported: list[dict[str, Any]],
219
+ metadata: dict[str, Any],
220
+ ) -> str:
221
+ expected_sources = set(metadata.get("expected_sources") or [])
222
+ retrieved_sources = {chunk.get("source") for chunk in chunks if chunk.get("source")}
223
+ if expected_sources and not (expected_sources & retrieved_sources):
224
+ return "retrieval_miss"
225
+
226
+ answer = ((trace.get("answer") or {}).get("answer") or "").lower()
227
+ if metadata.get("question_type") == "unanswerable" and answer and not _looks_like_abstention(answer):
228
+ return "should_have_abstained"
229
+
230
+ expected_failure = metadata.get("expected_failure")
231
+ if expected_failure and expected_failure != "no_failure_detected":
232
+ return str(expected_failure)
233
+
234
+ for check in unsupported:
235
+ best_score = max((_support_score(check["claim"], chunk.get("content", "")) for chunk in chunks), default=0.0)
236
+ if best_score >= 0.65:
237
+ return "citation_mismatch"
238
+
239
+ stances = {
240
+ ((chunk.get("metadata") or {}).get("stance") or "")
241
+ for chunk in chunks
242
+ if ((chunk.get("metadata") or {}).get("stance") or "") not in {"", "neutral"}
243
+ }
244
+ if len(stances) > 1:
245
+ return "conflicting_sources"
246
+
247
+ if unsupported or not evaluated:
248
+ return "unsupported_answer"
249
+ return "no_failure_detected"
250
+
251
+
252
+ def _looks_like_abstention(answer: str) -> bool:
253
+ return any(phrase in answer for phrase in ("do not state", "not enough", "cannot determine", "not mention"))
254
+
255
+
256
+ def _root_cause(failure_type: str) -> str:
257
+ return {
258
+ "no_failure_detected": "Local lexical evaluation did not find evidence-level issues.",
259
+ "retrieval_miss": "Expected source evidence was not present in the retrieved chunks.",
260
+ "should_have_abstained": "The query appears unanswerable from the available context, but the answer made a claim.",
261
+ "conflicting_sources": "Retrieved context includes conflicting current and archived evidence.",
262
+ "citation_mismatch": "A cited chunk does not support the claim, although another retrieved chunk appears to support it.",
263
+ "unsupported_answer": "The answer contains claims that are not sufficiently supported by retrieved context.",
264
+ }.get(failure_type, "Local lexical evaluation completed.")
265
+
266
+
267
+ def _suggested_fix(failure_type: str) -> str:
268
+ return {
269
+ "no_failure_detected": "No immediate fix suggested. Review raw evidence for high-stakes workflows.",
270
+ "retrieval_miss": "Tune retrieval, add query expansion, or verify the source is indexed.",
271
+ "should_have_abstained": "Add abstention rules when retrieved context lacks direct support.",
272
+ "conflicting_sources": "Filter archived sources or add source freshness/version ranking.",
273
+ "citation_mismatch": "Run sentence-level citation selection before returning the answer.",
274
+ "unsupported_answer": "Require claim-level support checks before final answer generation.",
275
+ }.get(failure_type, "Use a configured judge provider for deeper citation analysis.")
276
+
277
+
278
+ def _score_summary(evaluated_checks: list[dict[str, Any]]) -> dict[str, float]:
279
+ if not evaluated_checks:
280
+ return {
281
+ "citation_support": 0.0,
282
+ "unsupported_claim_rate": 1.0,
283
+ }
284
+ support_scores = [float(check.get("support_score") or 0.0) for check in evaluated_checks]
285
+ unsupported = [
286
+ check
287
+ for check in evaluated_checks
288
+ if check.get("verdict") in {"unsupported", "contradicted", "not_enough_info"}
289
+ ]
290
+ return {
291
+ "citation_support": round(sum(support_scores) / len(support_scores), 3),
292
+ "unsupported_claim_rate": round(len(unsupported) / len(evaluated_checks), 3),
293
+ }
294
+
295
+
296
def _support_score(claim: str, source: str) -> float:
    """Return the fraction of the claim's terms that also occur in ``source``.

    The result is rounded to three decimals; a claim with no terms scores 0.0.
    """
    wanted = _terms(claim)
    if not wanted:
        return 0.0
    matched = wanted & _terms(source)
    coverage = len(matched) / len(wanted)
    return round(coverage, 3)
302
+
303
+
304
+ def _terms(text: str) -> set:
305
+ return {
306
+ token.strip(".,:;!?()[]{}\"'").lower().rstrip("s")
307
+ for token in text.split()
308
+ if len(token.strip(".,:;!?()[]{}\"'")) > 2
309
+ }
310
+
311
+
312
+ def _query_int(path: str, key: str, *, default: int) -> int:
313
+ if "?" not in path:
314
+ return default
315
+ _, query = path.split("?", 1)
316
+ for part in query.split("&"):
317
+ if "=" not in part:
318
+ continue
319
+ name, value = part.split("=", 1)
320
+ if name == key:
321
+ try:
322
+ return int(value)
323
+ except ValueError:
324
+ return default
325
+ return default
contexttrace/py.typed ADDED
@@ -0,0 +1 @@
1
+
@@ -0,0 +1,123 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from pathlib import Path
5
+ from typing import Any, Iterable
6
+
7
+ from contexttrace.client import ContextTrace
8
+ from contexttrace.demo import aggregate_trace_metrics, run_demo_dataset
9
+ from contexttrace.report import ReportGenerator
10
+ from contexttrace.thresholds import parse_thresholds, threshold_failures
11
+
12
+
13
# Strategy names that run_local_benchmark runs, in this order, when the
# caller does not pass its own ``strategies``.
BENCHMARK_STRATEGIES = ("dense_top_k", "bm25", "hybrid", "hybrid_rerank", "corrective", "adaptive")
21
+
22
+
23
def run_local_benchmark(
    *,
    dataset: str,
    contexttrace: ContextTrace,
    output_dir: str = ".contexttrace/benchmarks",
    strategies: Iterable[str] = BENCHMARK_STRATEGIES,
    fail_on: Iterable[str] = (),
    report_path: str | None = None,
) -> dict[str, Any]:
    """Run the demo dataset once per strategy and write benchmark artifacts.

    For each strategy the dataset is run via ``run_demo_dataset`` and the
    resulting traces are fetched back from ``contexttrace``. Metrics are then
    aggregated over all traces and checked against the ``fail_on`` thresholds.

    Files written under ``output_dir`` (created if needed): one HTML report
    per strategy, ``benchmark_results.json``, ``benchmark_summary.md`` and
    the combined HTML report (``benchmark_report.html`` unless
    ``report_path`` is given).

    Returns:
        A dict with ``dataset``, ``strategies``, ``summary``,
        ``threshold_failures``, ``status`` (``"failed"`` when any threshold
        failed, else ``"passed"``) plus ``results_path``, ``summary_path``
        and ``report_path``. The three path keys are added after the JSON
        file is written, so they are not part of the JSON on disk.
    """
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    dataset_name = Path(str(dataset)).name

    per_strategy: dict[str, Any] = {}
    collected: list[dict[str, Any]] = []
    for name in strategies:
        strategy_report = out_dir / ("%s_%s_report.html" % (dataset_name, name))
        run = run_demo_dataset(
            dataset=dataset,
            contexttrace=contexttrace,
            strategy=name,
            report_path=str(strategy_report),
        )
        fetched = []
        for trace_id in run.trace_ids:
            fetched.append(contexttrace.get_trace(trace_id))
        collected.extend(fetched)
        per_strategy[name] = {
            "summary": run.summary,
            "trace_ids": run.trace_ids,
            "report_path": run.report_path,
        }

    summary = aggregate_trace_metrics(collected)
    failures = threshold_failures(summary, parse_thresholds(fail_on))
    if failures:
        status = "failed"
    else:
        status = "passed"
    result = {
        "dataset": dataset,
        "strategies": per_strategy,
        "summary": summary,
        "threshold_failures": failures,
        "status": status,
    }

    results_file = out_dir / "benchmark_results.json"
    summary_file = out_dir / "benchmark_summary.md"
    if report_path is None:
        report_path = str(out_dir / "benchmark_report.html")

    results_file.write_text(json.dumps(result, indent=2), encoding="utf-8")
    summary_file.write_text(render_benchmark_summary(result), encoding="utf-8")
    eval_run = {
        "id": "benchmark",
        "dataset": dataset,
        "endpoint": "contexttrace-benchmark",
        "summary": summary,
    }
    ReportGenerator().generate_eval_report(eval_run, collected, path=report_path)

    result["results_path"] = str(results_file)
    result["summary_path"] = str(summary_file)
    result["report_path"] = report_path
    return result
82
+
83
+
84
def render_benchmark_summary(result: dict[str, Any]) -> str:
    """Render a benchmark result dict as a Markdown summary.

    The output has an overall metrics table, a per-strategy table and, when
    ``result["threshold_failures"]`` is non-empty, a bulleted list of those
    failures. Metrics missing from a summary are shown as 0. The text always
    ends with a newline.
    """
    summary = result["summary"]
    out = [
        "# ContextTrace Benchmark Summary",
        "",
        "Status: **%s**" % result["status"],
        "",
        "| Metric | Value |",
        "| --- | ---: |",
    ]
    out.append("| Questions tested | %s |" % summary.get("questions_tested", 0))
    out.append("| Reliability score | %s |" % summary.get("reliability_score", 0))

    # (row template, summary key) for every metric rendered as a float.
    numeric_rows = (
        ("| Failure rate | %.3f |", "failure_rate"),
        ("| Citation support | %.3f |", "citation_support"),
        ("| Unsupported claim rate | %.3f |", "unsupported_claim_rate"),
        ("| Retrieval miss rate | %.3f |", "retrieval_miss_rate"),
        ("| Latency ms | %.1f |", "latency_ms"),
        ("| Token count | %.1f |", "token_count"),
        ("| Cost USD | %.6f |", "cost_usd"),
    )
    for template, key in numeric_rows:
        out.append(template % float(summary.get(key, 0)))

    out += [
        "",
        "## Strategy Summaries",
        "",
        "| Strategy | Failure Rate | Citation Support | Unsupported Claims | Retrieval Miss Rate |",
        "| --- | ---: | ---: | ---: | ---: |",
    ]
    strategy_columns = (
        "failure_rate",
        "citation_support",
        "unsupported_claim_rate",
        "retrieval_miss_rate",
    )
    for strategy, payload in result["strategies"].items():
        metrics = payload["summary"]
        cells = tuple(float(metrics.get(column, 0)) for column in strategy_columns)
        out.append("| %s | %.3f | %.3f | %.3f | %.3f |" % ((strategy,) + cells))

    if result["threshold_failures"]:
        out.extend(["", "## Threshold Failures", ""])
        for failure in result["threshold_failures"]:
            out.append("- %s" % failure)
    return "\n".join(out) + "\n"