contexttrace 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contexttrace/__init__.py +36 -0
- contexttrace/_version.py +1 -0
- contexttrace/cli.py +474 -0
- contexttrace/client.py +1074 -0
- contexttrace/config.py +246 -0
- contexttrace/demo.py +311 -0
- contexttrace/demo_data.py +257 -0
- contexttrace/endpoint_eval.py +314 -0
- contexttrace/errors.py +14 -0
- contexttrace/evaluator.py +448 -0
- contexttrace/integrations/__init__.py +14 -0
- contexttrace/integrations/fastapi.py +311 -0
- contexttrace/integrations/langchain.py +440 -0
- contexttrace/integrations/langgraph.py +197 -0
- contexttrace/integrations/llamaindex.py +422 -0
- contexttrace/integrations/opentelemetry.py +111 -0
- contexttrace/local.py +325 -0
- contexttrace/py.typed +1 -0
- contexttrace/regression.py +123 -0
- contexttrace/reliability.py +284 -0
- contexttrace/report.py +550 -0
- contexttrace/storage/__init__.py +3 -0
- contexttrace/storage/sqlite_store.py +604 -0
- contexttrace/thresholds.py +50 -0
- contexttrace/transport.py +183 -0
- contexttrace/viewer.py +148 -0
- contexttrace-0.1.0.dist-info/METADATA +154 -0
- contexttrace-0.1.0.dist-info/RECORD +31 -0
- contexttrace-0.1.0.dist-info/WHEEL +5 -0
- contexttrace-0.1.0.dist-info/entry_points.txt +2 -0
- contexttrace-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# Bundled demo corpora, keyed by dataset name.
#
# Every dataset is a dict with four entries:
#   "documents"        - maps a markdown file name to the document text.
#   "questions"        - list of dicts with "id", "query" and "type", plus an
#                        optional "expected_failure" label.
#   "expected_answers" - maps a question id to its reference answer.
#   "expected_sources" - maps a question id to the list of document file names
#                        that support the answer (empty for unanswerable ones).
DEMO_DATASETS: dict[str, dict[str, Any]] = {
    "refund_policy": {
        "documents": {
            "refund_policy.md": """# Refund Policy

## Standard refunds
Customers may request a refund within 30 days of purchase when the product has not been consumed or customized. Refunds are returned to the original payment method.

## Processing time
Approved refunds are processed within 7 business days after the support team confirms eligibility.

## Non-refundable items
Shipping upgrades, gift cards, and consumed digital credits are non-refundable.
""",
            "exchange_policy.md": """# Exchange Policy

## Exchanges
Customers may exchange an unopened item within 45 days of purchase. Exchanges use store credit when the original item is no longer available.

## Defective items
Defective items may be replaced after support receives a photo of the defect and the order number.
""",
            "legacy_refund_memo.md": """# Legacy Refund Memo

## Archived policy
Before March 2024, refunds were limited to 14 days after purchase. This memo is archived and should not be used for current customer answers.
""",
            "subscription_terms.md": """# Subscription Terms

## Renewal
Monthly subscriptions renew automatically until canceled. Cancellation stops future renewals but does not refund prior months unless support grants an exception.
""",
        },
        "questions": [
            {"id": "refund_q1", "query": "How long do customers have to request a refund?", "type": "answerable"},
            {"id": "refund_q2", "query": "How long do approved refunds take to process?", "type": "citation-sensitive", "expected_failure": "citation_mismatch"},
            {"id": "refund_q3", "query": "Can shipping upgrades be refunded?", "type": "answerable"},
            {"id": "refund_q4", "query": "Can customers refund consumed digital credits?", "type": "answerable"},
            {"id": "refund_q5", "query": "Does the company provide refunds after 90 days for loyalty members?", "type": "unanswerable", "expected_failure": "should_have_abstained"},
            {"id": "refund_q6", "query": "What was the old refund window and what is the current window?", "type": "conflicting", "expected_failure": "conflicting_sources"},
            {"id": "refund_q7", "query": "Can an unopened item be exchanged after 40 days?", "type": "multi-hop"},
            {"id": "refund_q8", "query": "Are defective items replaceable?", "type": "answerable"},
            {"id": "refund_q9", "query": "Which payment method receives approved refunds?", "type": "answerable", "expected_failure": "retrieval_miss"},
            {"id": "refund_q10", "query": "Are subscription renewals automatically charged?", "type": "edge-case", "expected_failure": "unsupported_answer"},
        ],
        "expected_answers": {
            "refund_q1": "Customers have 30 days from purchase to request a refund.",
            "refund_q2": "Approved refunds are processed within 7 business days.",
            "refund_q3": "No. Shipping upgrades are non-refundable.",
            "refund_q4": "No. Consumed digital credits are non-refundable.",
            "refund_q5": "The documents do not state a 90-day loyalty-member refund exception.",
            "refund_q6": "The archived memo says 14 days before March 2024, while the current policy says 30 days.",
            "refund_q7": "Yes. Unopened items may be exchanged within 45 days.",
            "refund_q8": "Yes. Defective items may be replaced after support receives a photo and order number.",
            "refund_q9": "Approved refunds are returned to the original payment method.",
            "refund_q10": "Monthly subscriptions renew automatically until canceled.",
        },
        "expected_sources": {
            "refund_q1": ["refund_policy.md"],
            "refund_q2": ["refund_policy.md"],
            "refund_q3": ["refund_policy.md"],
            "refund_q4": ["refund_policy.md"],
            "refund_q5": [],
            "refund_q6": ["refund_policy.md", "legacy_refund_memo.md"],
            "refund_q7": ["exchange_policy.md"],
            "refund_q8": ["exchange_policy.md"],
            "refund_q9": ["refund_policy.md"],
            "refund_q10": ["subscription_terms.md"],
        },
    },
    "employee_handbook": {
        "documents": {
            "pto_policy.md": """# PTO Policy

## Annual allowance
Full-time employees receive 18 days of paid time off each calendar year. Unused PTO does not roll over unless a state law requires it.

## Approval
PTO requests for more than five consecutive business days require manager approval at least two weeks before the first day away.
""",
            "remote_work.md": """# Remote Work Policy

## Eligibility
Employees may work remotely up to three days per week with manager approval. Security training must be completed before remote work begins.

## Equipment
The company reimburses up to 600 USD for approved home-office equipment after receipts are submitted.
""",
            "security_handbook.md": """# Security Handbook

## Data handling
Customer data must not be copied into personal accounts, public AI tools, or unmanaged devices. Lost devices must be reported within one hour.
""",
            "old_remote_memo.md": """# Archived Remote Work Memo

## Archived rule
In 2021, employees could work remotely one day per week. This memo is archived and no longer applies.
""",
        },
        "questions": [
            {"id": "emp_q1", "query": "How many PTO days do full-time employees receive?", "type": "answerable"},
            {"id": "emp_q2", "query": "When is manager approval required for long PTO?", "type": "citation-sensitive", "expected_failure": "citation_mismatch"},
            {"id": "emp_q3", "query": "How many remote days are allowed now?", "type": "answerable"},
            {"id": "emp_q4", "query": "What is the home-office equipment reimbursement limit?", "type": "answerable"},
            {"id": "emp_q5", "query": "Can employees paste customer data into a public AI tool?", "type": "answerable"},
            {"id": "emp_q6", "query": "Does the handbook mention a four-day workweek benefit?", "type": "unanswerable", "expected_failure": "should_have_abstained"},
            {"id": "emp_q7", "query": "Compare the old and current remote work allowance.", "type": "conflicting", "expected_failure": "conflicting_sources"},
            {"id": "emp_q8", "query": "When must a lost device be reported?", "type": "answerable", "expected_failure": "retrieval_miss"},
            {"id": "emp_q9", "query": "Can unused PTO roll over everywhere?", "type": "edge-case"},
            {"id": "emp_q10", "query": "Can interns expense 1200 USD for chairs?", "type": "edge-case", "expected_failure": "unsupported_answer"},
        ],
        "expected_answers": {
            "emp_q1": "Full-time employees receive 18 days of PTO each calendar year.",
            "emp_q2": "PTO requests over five consecutive business days require manager approval at least two weeks in advance.",
            "emp_q3": "Employees may work remotely up to three days per week with manager approval.",
            "emp_q4": "The reimbursement limit is 600 USD for approved home-office equipment.",
            "emp_q5": "No. Customer data must not be copied into public AI tools.",
            "emp_q6": "The handbook does not mention a four-day workweek benefit.",
            "emp_q7": "The archived memo allowed one remote day per week; the current policy allows up to three days with approval.",
            "emp_q8": "Lost devices must be reported within one hour.",
            "emp_q9": "Unused PTO generally does not roll over unless state law requires it.",
            "emp_q10": "The handbook only mentions up to 600 USD for approved equipment after receipts are submitted.",
        },
        "expected_sources": {
            "emp_q1": ["pto_policy.md"],
            "emp_q2": ["pto_policy.md"],
            "emp_q3": ["remote_work.md"],
            "emp_q4": ["remote_work.md"],
            "emp_q5": ["security_handbook.md"],
            "emp_q6": [],
            "emp_q7": ["remote_work.md", "old_remote_memo.md"],
            "emp_q8": ["security_handbook.md"],
            "emp_q9": ["pto_policy.md"],
            "emp_q10": ["remote_work.md"],
        },
    },
    "ai_paper_qa": {
        "documents": {
            "retrieval_paper.md": """# Synthetic Retrieval Paper

## Method
The paper evaluates hybrid retrieval by combining sparse BM25 scores with dense embedding similarity before reranking the top candidates.

## Findings
Hybrid reranking improved answer citation support from 0.71 to 0.84 on the policy QA set, with a 120 ms median latency increase.
""",
            "chunking_paper.md": """# Synthetic Chunking Paper

## Chunk size
The authors found that 350-token chunks with 60-token overlap reduced boundary errors on multi-hop questions.

## Compression
Aggressive context compression removed important qualifier sentences in 18 percent of citation-sensitive answers.
""",
            "agent_memory_paper.md": """# Synthetic Agent Memory Paper

## Memory
The study warns that stale memory caused agents to reuse obsolete policy facts even when retrieval returned newer evidence.
""",
            "old_retrieval_note.md": """# Archived Retrieval Note

## Archived result
An early draft claimed dense-only retrieval outperformed hybrid retrieval. The authors later retracted this note after finding a dataset labeling bug.
""",
        },
        "questions": [
            {"id": "paper_q1", "query": "What retrieval strategy did the paper evaluate?", "type": "answerable"},
            {"id": "paper_q2", "query": "How much did hybrid reranking improve citation support?", "type": "citation-sensitive", "expected_failure": "citation_mismatch"},
            {"id": "paper_q3", "query": "What chunk size reduced boundary errors?", "type": "answerable"},
            {"id": "paper_q4", "query": "What problem did aggressive compression create?", "type": "answerable"},
            {"id": "paper_q5", "query": "Did the paper evaluate German legal contracts?", "type": "unanswerable", "expected_failure": "should_have_abstained"},
            {"id": "paper_q6", "query": "Compare the archived dense-only claim with the final retrieval result.", "type": "conflicting", "expected_failure": "conflicting_sources"},
            {"id": "paper_q7", "query": "Why can stale memory hurt agents?", "type": "answerable"},
            {"id": "paper_q8", "query": "What latency cost did reranking add?", "type": "answerable", "expected_failure": "retrieval_miss"},
            {"id": "paper_q9", "query": "What overlap did the chunking paper use?", "type": "multi-hop"},
            {"id": "paper_q10", "query": "Did hybrid retrieval reduce latency by 500 ms?", "type": "edge-case", "expected_failure": "unsupported_answer"},
        ],
        "expected_answers": {
            "paper_q1": "It evaluated hybrid retrieval combining BM25 scores with dense embedding similarity before reranking.",
            "paper_q2": "Hybrid reranking improved citation support from 0.71 to 0.84.",
            "paper_q3": "The paper used 350-token chunks.",
            "paper_q4": "Aggressive compression removed important qualifier sentences.",
            "paper_q5": "The documents do not say the paper evaluated German legal contracts.",
            "paper_q6": "The archived note claimed dense-only was better, but the final paper found hybrid reranking improved citation support.",
            "paper_q7": "Stale memory can cause agents to reuse obsolete policy facts even when newer evidence is retrieved.",
            "paper_q8": "Hybrid reranking added a 120 ms median latency increase.",
            "paper_q9": "The chunking paper used 60-token overlap.",
            "paper_q10": "No. The paper reports a 120 ms latency increase, not a 500 ms reduction.",
        },
        "expected_sources": {
            "paper_q1": ["retrieval_paper.md"],
            "paper_q2": ["retrieval_paper.md"],
            "paper_q3": ["chunking_paper.md"],
            "paper_q4": ["chunking_paper.md"],
            "paper_q5": [],
            "paper_q6": ["retrieval_paper.md", "old_retrieval_note.md"],
            "paper_q7": ["agent_memory_paper.md"],
            "paper_q8": ["retrieval_paper.md"],
            "paper_q9": ["chunking_paper.md"],
            "paper_q10": ["retrieval_paper.md"],
        },
    },
}
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def load_demo_dataset(name_or_path: str) -> dict[str, Any]:
    """Return a demo dataset, preferring one found on disk over a bundled one.

    Args:
        name_or_path: Either a filesystem location understood by
            ``_resolve_dataset_path`` or a key of ``DEMO_DATASETS``.

    Returns:
        The dataset dict. For a bundled dataset this is a new dict holding
        ``"name"`` followed by the entries of the ``DEMO_DATASETS`` record.

    Raises:
        FileNotFoundError: If nothing on disk matches and the name is not a
            bundled dataset.
    """
    on_disk = _resolve_dataset_path(name_or_path)
    if on_disk is not None:
        return _load_dataset_from_path(on_disk)

    bundled = DEMO_DATASETS.get(name_or_path)
    if bundled is None:
        raise FileNotFoundError("Demo dataset not found: %s" % name_or_path)

    # "name" goes in first so the bundled record's own keys follow it.
    dataset: dict[str, Any] = {"name": name_or_path}
    dataset.update(bundled)
    return dataset
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def list_demo_datasets() -> list[str]:
    """Return the names of the bundled demo datasets in ascending order."""
    names = list(DEMO_DATASETS.keys())
    names.sort()
    return names
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def _resolve_dataset_path(name_or_path: str) -> Path | None:
|
|
227
|
+
direct = Path(name_or_path)
|
|
228
|
+
if direct.exists():
|
|
229
|
+
return direct
|
|
230
|
+
cwd_dataset = Path("datasets") / "demo" / name_or_path
|
|
231
|
+
if cwd_dataset.exists():
|
|
232
|
+
return cwd_dataset
|
|
233
|
+
return None
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def _load_dataset_from_path(path: Path) -> dict[str, Any]:
    """Assemble a dataset dict from a dataset directory.

    Reads every ``*.md`` file under ``<path>/documents`` (sorted), the
    required ``questions.json``, and the optional ``expected_answers.json``
    and ``expected_sources.json``.

    Returns:
        A dict with ``"name"`` (the directory's final path component),
        ``"documents"``, ``"questions"``, ``"expected_answers"`` and
        ``"expected_sources"``.
    """
    documents: dict[str, str] = {}
    for markdown_file in sorted((path / "documents").glob("*.md")):
        documents[markdown_file.name] = markdown_file.read_text(encoding="utf-8")

    questions_text = (path / "questions.json").read_text(encoding="utf-8")

    return {
        "name": path.name,
        "documents": documents,
        "questions": json.loads(questions_text),
        "expected_answers": _read_optional(path / "expected_answers.json"),
        "expected_sources": _read_optional(path / "expected_sources.json"),
    }
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def _read_optional(path: Path) -> dict[str, Any]:
    """Parse the UTF-8 JSON file at *path*, or return ``{}`` if it is absent."""
    if path.exists():
        raw_text = path.read_text(encoding="utf-8")
        return json.loads(raw_text)
    return {}
|
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import time
|
|
5
|
+
import urllib.error
|
|
6
|
+
import urllib.parse
|
|
7
|
+
import urllib.request
|
|
8
|
+
from collections.abc import Iterable as CollectionsIterable
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any, Callable, Dict, Optional
|
|
12
|
+
|
|
13
|
+
from contexttrace.client import ContextTrace
|
|
14
|
+
from contexttrace.report import ReportGenerator
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# Pluggable transport used by run_endpoint_eval. It is invoked positionally as
# (endpoint, method, headers, body, timeout) and returns the decoded response
# payload; _default_caller is the implementation used when none is supplied.
EndpointCaller = Callable[[str, str, Dict[str, str], Optional[Dict[str, Any]], float], Dict[str, Any]]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass(frozen=True)
|
|
21
|
+
class EndpointEvalResult:
|
|
22
|
+
eval_run_id: str | None
|
|
23
|
+
trace_ids: list[str]
|
|
24
|
+
questions_tested: int
|
|
25
|
+
reliability_score: float
|
|
26
|
+
failure_rate: float
|
|
27
|
+
avg_citation_support: float
|
|
28
|
+
unsupported_claim_rate: float
|
|
29
|
+
top_failures: list[str]
|
|
30
|
+
report_path: str | None
|
|
31
|
+
|
|
32
|
+
def to_dict(self) -> dict[str, Any]:
|
|
33
|
+
return {
|
|
34
|
+
"eval_run_id": self.eval_run_id,
|
|
35
|
+
"trace_ids": self.trace_ids,
|
|
36
|
+
"questions_tested": self.questions_tested,
|
|
37
|
+
"reliability_score": self.reliability_score,
|
|
38
|
+
"failure_rate": self.failure_rate,
|
|
39
|
+
"avg_citation_support": self.avg_citation_support,
|
|
40
|
+
"unsupported_claim_rate": self.unsupported_claim_rate,
|
|
41
|
+
"top_failures": self.top_failures,
|
|
42
|
+
"report_path": self.report_path,
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def run_endpoint_eval(
    *,
    dataset_path: str,
    endpoint: str,
    contexttrace: ContextTrace,
    method: str = "POST",
    headers: Optional[dict[str, str]] = None,
    body_template: Optional[dict[str, Any]] = None,
    input_key: str = "question",
    answer_path: str = "$.answer",
    contexts_path: str = "$.contexts",
    citations_path: str = "$.citations",
    timeout: float = 30.0,
    caller: EndpointCaller | None = None,
    generate_report: bool = True,
    report_path: str | None = None,
) -> EndpointEvalResult:
    """Send every dataset question to a RAG endpoint and score the responses.

    Each question with a non-empty ``query`` (or ``question``) is sent to the
    endpoint, the response is recorded as a trace on *contexttrace*, and the
    trace is evaluated. Aggregates over all evaluated questions are returned.

    Args:
        dataset_path: JSON file of questions, read by ``_load_dataset``.
        endpoint: URL handed to the caller for every question.
        contexttrace: Client used to open, evaluate and later fetch traces.
        method: HTTP method; upper-cased before use.
        headers: Extra request headers; ``None`` means no extra headers.
        body_template: Request body in which ``{{query}}`` is substituted.
            When ``None`` the body is ``{input_key: query}``.
        input_key: Body key for the query when no template is given.
        answer_path: ``$.``-style path of the answer in the response.
        contexts_path: Path of the retrieved contexts; ``$.retrieved_chunks``
            is tried when this path yields nothing.
        citations_path: Path of the citations in the response.
        timeout: Passed through to the caller.
        caller: Replacement for ``_default_caller`` (e.g. for tests).
        generate_report: Whether to write an HTML report when at least one
            question was evaluated.
        report_path: Report destination; a path under
            ``.contexttrace/reports`` is derived when ``None``.

    Returns:
        An ``EndpointEvalResult``. ``report_path`` echoes the requested or
        derived path even when no report was written.

    Raises:
        Whatever the caller raises; ``_default_caller`` raises
        ``RuntimeError`` on URL errors.
    """
    questions = _load_dataset(dataset_path)
    headers = headers or {}
    method = method.upper()
    call_endpoint = caller or _default_caller
    trace_ids: list[str] = []
    failures: list[str] = []
    supports: list[float] = []
    unsupported_rates: list[float] = []
    reliability_scores: list[float] = []
    eval_run_id: str | None = None
    question_records: list[dict[str, Any]] = []

    def build_summary() -> dict[str, Any]:
        # Single definition of the aggregate metrics. A fresh dict is built on
        # every call so the store, the report and the result share no state.
        return {
            "questions_tested": len(trace_ids),
            "failure_rate": _rate(len(failures), len(trace_ids)),
            "avg_citation_support": _avg(supports),
            "unsupported_claim_rate": _avg(unsupported_rates),
            "reliability_score": _avg(reliability_scores),
            "top_failures": _top_failures(failures),
        }

    for index, question in enumerate(questions):
        query = str(question.get("query") or question.get("question") or "")
        if not query:
            continue
        body = _render_body(body_template, query=query) if body_template is not None else {input_key: query}
        start = time.perf_counter()
        response = call_endpoint(endpoint, method, headers, body, timeout)
        latency_ms = round((time.perf_counter() - start) * 1000, 2)

        # Only a JSON object can carry "model"/"usage". Other payload shapes
        # (for instance a bare string read with answer_path="$") are accepted
        # by _extract, so they must not crash the metadata lookup below.
        response_fields = response if isinstance(response, dict) else {}

        answer = _extract(response, answer_path) or ""
        raw_contexts = _extract(response, contexts_path) or _extract(response, "$.retrieved_chunks") or []
        raw_citations = _extract(response, citations_path) or []
        chunks = _normalize_chunks(raw_contexts)
        citations = _normalize_citations(raw_citations, answer=answer, chunks=chunks)

        with contexttrace.trace(
            query=query,
            metadata={
                "source": "byo_rag_endpoint",
                "dataset_id": question.get("id"),
                "endpoint": endpoint,
                "latency_ms": latency_ms,
            },
        ) as trace:
            if chunks:
                trace.log_retrieval(chunks, metadata={"endpoint": endpoint})
                trace.log_context(chunks)
            trace.log_answer(
                str(answer),
                model=str(response_fields.get("model") or "external-rag-endpoint"),
                usage=response_fields.get("usage") or {},
                metadata={"latency_ms": latency_ms},
            )
            if citations:
                trace.log_citations(citations)
            evaluation = trace.evaluate()
            trace_ids.append(str(trace.trace_id))

        scores = evaluation.get("scores") or {}
        failure = evaluation.get("failure") or {}
        reliability = evaluation.get("reliability") or {}
        # An evaluation without any failure label is recorded as "unknown".
        failure_type = failure.get("failure_type") or failure.get("type") or "unknown"
        if failure_type != "no_failure_detected":
            failures.append(str(failure_type))
        supports.append(float(scores.get("citation_support") or 0.0))
        unsupported_rates.append(float(scores.get("unsupported_claim_rate") or 0.0))
        reliability_scores.append(float(reliability.get("score") or 0.0))
        question_records.append({"question": question, "trace_id": trace.trace_id, "position": index})

    # Persist the run only when the client exposes a store on its transport.
    store = getattr(getattr(contexttrace, "_transport", None), "store", None)
    if store is not None:
        eval_run_id = store.create_eval_run(dataset=dataset_path, endpoint=endpoint, summary=build_summary())
        for record in question_records:
            store.add_eval_question(
                eval_run_id=eval_run_id,
                question=record["question"],
                trace_id=record["trace_id"],
                position=record["position"],
            )

    output_path = report_path
    if generate_report and trace_ids:
        if output_path is None:
            output_path = str(Path(".contexttrace") / "reports" / ("eval_%s.html" % (eval_run_id or trace_ids[-1])))
        traces = [contexttrace.get_trace(trace_id) for trace_id in trace_ids]
        ReportGenerator().generate_eval_report(
            {
                "id": eval_run_id or "local-eval",
                "dataset": dataset_path,
                "endpoint": endpoint,
                "summary": build_summary(),
            },
            traces,
            path=output_path,
        )

    final_summary = build_summary()
    return EndpointEvalResult(
        eval_run_id=eval_run_id,
        trace_ids=trace_ids,
        questions_tested=final_summary["questions_tested"],
        reliability_score=final_summary["reliability_score"],
        failure_rate=final_summary["failure_rate"],
        avg_citation_support=final_summary["avg_citation_support"],
        unsupported_claim_rate=final_summary["unsupported_claim_rate"],
        top_failures=final_summary["top_failures"],
        report_path=output_path,
    )
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _load_dataset(path: str) -> list[dict[str, Any]]:
    """Read a question dataset from a UTF-8 JSON file.

    The file may hold a list of questions or an object with a ``"questions"``
    entry. String questions become ``{"id": "q<position>", "query": text}``
    with a 1-based position; dict questions are shallow-copied; entries of
    any other type are dropped.
    """
    payload = json.loads(Path(path).read_text(encoding="utf-8"))

    raw_questions: Any = []
    if isinstance(payload, list):
        raw_questions = payload
    elif isinstance(payload, dict):
        raw_questions = payload.get("questions") or []

    def as_question(position: int, entry: Any) -> dict[str, Any]:
        # Only called for str or dict entries (see the filter below).
        if isinstance(entry, str):
            return {"id": "q%s" % position, "query": entry}
        return dict(entry)

    return [
        as_question(position, entry)
        for position, entry in enumerate(raw_questions, start=1)
        if isinstance(entry, (str, dict))
    ]
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _default_caller(
|
|
197
|
+
endpoint: str,
|
|
198
|
+
method: str,
|
|
199
|
+
headers: dict[str, str],
|
|
200
|
+
body: dict[str, Any] | None,
|
|
201
|
+
timeout: float,
|
|
202
|
+
) -> dict[str, Any]:
|
|
203
|
+
request_headers = {"Content-Type": "application/json", **headers}
|
|
204
|
+
url = endpoint
|
|
205
|
+
data = None
|
|
206
|
+
if method == "GET":
|
|
207
|
+
query = urllib.parse.urlencode(body or {})
|
|
208
|
+
separator = "&" if "?" in endpoint else "?"
|
|
209
|
+
url = endpoint + (separator + query if query else "")
|
|
210
|
+
else:
|
|
211
|
+
data = json.dumps(body or {}).encode("utf-8")
|
|
212
|
+
request = urllib.request.Request(url, data=data, headers=request_headers, method=method)
|
|
213
|
+
try:
|
|
214
|
+
with urllib.request.urlopen(request, timeout=timeout) as response:
|
|
215
|
+
return json.loads(response.read().decode("utf-8"))
|
|
216
|
+
except urllib.error.URLError as exc:
|
|
217
|
+
raise RuntimeError("RAG endpoint request failed: %s" % exc) from exc
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def _render_body(template: Any, *, query: str) -> Any:
    """Return a copy of *template* with ``{{query}}`` replaced by *query*.

    Dicts and lists are rebuilt recursively and strings are substituted.
    Dict keys and values of any other type are passed through untouched.
    """
    if isinstance(template, dict):
        rendered: dict[Any, Any] = {}
        for key, value in template.items():
            rendered[key] = _render_body(value, query=query)
        return rendered
    if isinstance(template, list):
        return [_render_body(element, query=query) for element in template]
    if isinstance(template, str):
        return template.replace("{{query}}", query)
    return template
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def _extract(payload: Any, path: str) -> Any:
    """Resolve a minimal ``$.a.b[0].c`` style path against *payload*.

    Args:
        payload: Decoded JSON value to read from.
        path: ``""`` or ``"$"`` selects the whole payload. Otherwise the path
            must start with ``"$."`` and consist of dot-separated keys, each
            optionally followed by a single ``[index]``.

    Returns:
        The selected value, or ``None`` when the path is malformed or any
        segment cannot be resolved. This function never raises for a missing
        key or index.
    """
    if path in {"", "$"}:
        return payload
    if not path.startswith("$."):
        return None
    value = payload
    for part in path[2:].split("."):
        if value is None:
            return None
        if "[" in part and part.endswith("]"):
            name, raw_index = part[:-1].split("[", 1)
            value = value.get(name) if isinstance(value, dict) else None
            try:
                value = value[int(raw_index)]
            except (TypeError, ValueError, LookupError):
                # LookupError covers IndexError (index out of range) and
                # KeyError (the keyed value is itself a dict, which has no
                # integer positions). Both mean the path is unresolvable.
                return None
        elif isinstance(value, dict):
            value = value.get(part)
        else:
            return None
    return value
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def _normalize_chunks(raw_contexts: Any) -> list[dict[str, Any]]:
    """Convert an endpoint's retrieved contexts into chunk dicts.

    Args:
        raw_contexts: ``None``, a single string or dict, or an iterable of
            strings and/or dicts. Anything else produces no chunks.

    Returns:
        One dict per usable entry. String entries become
        ``{"chunk_id", "content"}``; dict entries also carry ``"source"``,
        ``"metadata"`` and ``"relevance_score"``. Missing ids default to
        ``chunk_<position>`` (1-based position in the input). Entries whose
        content is empty are dropped.
    """
    if raw_contexts is None:
        return []
    if isinstance(raw_contexts, (str, dict)):
        raw_contexts = [raw_contexts]
    if not isinstance(raw_contexts, CollectionsIterable):
        return []

    chunks: list[dict[str, Any]] = []
    for index, item in enumerate(raw_contexts):
        fallback_id = "chunk_%s" % (index + 1)
        if isinstance(item, str):
            chunks.append({"chunk_id": fallback_id, "content": item})
        elif isinstance(item, dict):
            # Fall back to "score" only when "relevance_score" is absent.
            # A truthiness test would throw away a legitimate score of 0.
            relevance = item.get("relevance_score")
            if relevance is None:
                relevance = item.get("score")
            chunks.append(
                {
                    "chunk_id": str(item.get("chunk_id") or item.get("id") or fallback_id),
                    "content": str(item.get("content") or item.get("text") or item.get("page_content") or ""),
                    "source": item.get("source"),
                    "metadata": item.get("metadata") or {},
                    "relevance_score": relevance,
                }
            )
    return [chunk for chunk in chunks if chunk.get("content")]
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def _normalize_citations(
    raw_citations: Any,
    *,
    answer: Any,
    chunks: list[dict[str, Any]],
) -> list[dict[str, Any]]:
    """Convert an endpoint's citations into ``{"claim", "source_chunk_id"}`` dicts.

    Args:
        raw_citations: ``None``, a single dict, or a non-string iterable of
            dicts and/or strings. Anything else is treated as no citations.
        answer: The endpoint's answer, used as the claim when a citation
            carries none and for the synthesized fallback citation.
        chunks: Normalized chunks for the same response.

    Returns:
        Citations with a non-empty claim and chunk id. A dict citation is
        attached to its explicit chunk id, otherwise to the first chunk whose
        ``"source"`` equals the citation's ``"source"``, otherwise to the
        first chunk. When nothing usable was supplied but an answer and
        chunks exist, one citation tying the answer to the first chunk is
        synthesized.
    """
    if isinstance(raw_citations, dict):
        raw_citations = [raw_citations]
    is_sequence_like = isinstance(raw_citations, CollectionsIterable) and not isinstance(raw_citations, str)
    items = raw_citations if is_sequence_like else []
    default_chunk_id = chunks[0]["chunk_id"] if chunks else ""

    citations: list[dict[str, Any]] = []
    for item in items:
        if isinstance(item, dict):
            source_chunk_id = item.get("source_chunk_id") or item.get("chunk_id") or item.get("id")
            if not source_chunk_id and item.get("source"):
                # Resolve a source-only citation to the chunk retrieved from
                # that source instead of blindly blaming the first chunk.
                source_chunk_id = next(
                    (chunk["chunk_id"] for chunk in chunks if chunk.get("source") == item["source"]),
                    None,
                )
            citations.append(
                {
                    "claim": str(item.get("claim") or item.get("text") or answer or ""),
                    "source_chunk_id": str(source_chunk_id or default_chunk_id),
                }
            )
        elif isinstance(item, str) and chunks:
            citations.append({"claim": item, "source_chunk_id": chunks[0]["chunk_id"]})
    if not citations and answer and chunks:
        citations.append({"claim": str(answer), "source_chunk_id": chunks[0]["chunk_id"]})
    return [citation for citation in citations if citation["claim"] and citation["source_chunk_id"]]
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
def _avg(values: list[float]) -> float:
    """Return the mean of *values* rounded to 3 decimals, or 0.0 if empty."""
    if not values:
        return 0.0
    mean = sum(values) / len(values)
    return round(mean, 3)
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
def _rate(count: int, total: int) -> float:
    """Return ``count / total`` rounded to 3 decimals, or 0.0 when *total* is 0."""
    if not total:
        return 0.0
    ratio = count / total
    return round(ratio, 3)
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def _top_failures(failures: list[str]) -> list[str]:
    """Return up to five distinct failure labels, most frequent first.

    Labels with the same frequency are ordered alphabetically.
    """
    tally: dict[str, int] = {}
    for label in failures:
        tally[label] = tally.get(label, 0) + 1
    ranked = sorted(tally, key=lambda label: (-tally[label], label))
    return ranked[:5]
|
contexttrace/errors.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
class ContextTraceError(Exception):
    """Base class of every SDK error defined in this module.

    Catching it also catches the configuration, HTTP and local-storage
    errors that derive from it.
    """
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class ContextTraceConfigError(ContextTraceError):
    """Raised when SDK configuration is invalid.

    Subclass of ``ContextTraceError``.
    """
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ContextTraceHTTPError(ContextTraceError):
    """Raised when the ContextTrace API request fails.

    Subclass of ``ContextTraceError``.
    """
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ContextTraceLocalError(ContextTraceError):
    """Raised when local trace storage cannot satisfy a request.

    Subclass of ``ContextTraceError``.
    """
|