trajscore 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agenteval/__init__.py +78 -0
- agenteval/advanced.py +614 -0
- agenteval/evaluator.py +113 -0
- agenteval/exceptions.py +25 -0
- agenteval/metrics/__init__.py +16 -0
- agenteval/metrics/answer_faithfulness.py +51 -0
- agenteval/metrics/base.py +34 -0
- agenteval/metrics/goal_completion.py +52 -0
- agenteval/metrics/loop_detection.py +52 -0
- agenteval/metrics/reasoning_coherence.py +53 -0
- agenteval/metrics/step_efficiency.py +47 -0
- agenteval/metrics/tool_accuracy.py +61 -0
- agenteval/models.py +71 -0
- agenteval/watcher.py +63 -0
- trajscore-1.0.0.dist-info/METADATA +277 -0
- trajscore-1.0.0.dist-info/RECORD +18 -0
- trajscore-1.0.0.dist-info/WHEEL +5 -0
- trajscore-1.0.0.dist-info/top_level.txt +1 -0
agenteval/__init__.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""
|
|
2
|
+
agenteval — Agentic Multi-Step Trajectory Evaluation.
|
|
3
|
+
|
|
4
|
+
Evaluate any AI agent trajectory with production-grade metrics:
|
|
5
|
+
goal completion, tool accuracy, step efficiency, reasoning coherence,
|
|
6
|
+
loop detection, and answer faithfulness.
|
|
7
|
+
"""
|
|
8
|
+
from agenteval.models import (
|
|
9
|
+
StepType,
|
|
10
|
+
TrajectoryStep,
|
|
11
|
+
Trajectory,
|
|
12
|
+
StepScore,
|
|
13
|
+
TrajectoryScore,
|
|
14
|
+
EvaluationResult,
|
|
15
|
+
)
|
|
16
|
+
from agenteval.evaluator import TrajectoryEvaluator
|
|
17
|
+
from agenteval.watcher import TrajectoryWatcher
|
|
18
|
+
from agenteval.exceptions import (
|
|
19
|
+
AgentEvalError,
|
|
20
|
+
TrajectoryValidationError,
|
|
21
|
+
EvaluationError,
|
|
22
|
+
MetricNotFoundError,
|
|
23
|
+
BudgetExceededError,
|
|
24
|
+
SchemaViolationError,
|
|
25
|
+
)
|
|
26
|
+
from agenteval.metrics import (
|
|
27
|
+
GoalCompletionMetric,
|
|
28
|
+
ToolAccuracyMetric,
|
|
29
|
+
StepEfficiencyMetric,
|
|
30
|
+
ReasoningCoherenceMetric,
|
|
31
|
+
LoopDetectionMetric,
|
|
32
|
+
AnswerFaithfulnessMetric,
|
|
33
|
+
)
|
|
34
|
+
from agenteval.advanced import (
|
|
35
|
+
TrajectoryCache,
|
|
36
|
+
EvalPipeline,
|
|
37
|
+
TrajectoryRule,
|
|
38
|
+
TrajectoryValidator,
|
|
39
|
+
ConfidenceScorer,
|
|
40
|
+
RateLimiter,
|
|
41
|
+
CancellationToken,
|
|
42
|
+
abatch_evaluate,
|
|
43
|
+
batch_evaluate,
|
|
44
|
+
evaluate_with_budget,
|
|
45
|
+
EvaluationProfiler,
|
|
46
|
+
DriftDetector,
|
|
47
|
+
EvaluationReport,
|
|
48
|
+
stream_scores,
|
|
49
|
+
scores_to_ndjson,
|
|
50
|
+
ScoreDiff,
|
|
51
|
+
diff_results,
|
|
52
|
+
RegressionTracker,
|
|
53
|
+
AuditLog,
|
|
54
|
+
CostLedger,
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
__version__ = "1.0.0"
|
|
58
|
+
|
|
59
|
+
__all__ = [
|
|
60
|
+
# Models
|
|
61
|
+
"StepType", "TrajectoryStep", "Trajectory", "StepScore",
|
|
62
|
+
"TrajectoryScore", "EvaluationResult",
|
|
63
|
+
# Core
|
|
64
|
+
"TrajectoryEvaluator", "TrajectoryWatcher",
|
|
65
|
+
# Exceptions
|
|
66
|
+
"AgentEvalError", "TrajectoryValidationError", "EvaluationError",
|
|
67
|
+
"MetricNotFoundError", "BudgetExceededError", "SchemaViolationError",
|
|
68
|
+
# Metrics
|
|
69
|
+
"GoalCompletionMetric", "ToolAccuracyMetric", "StepEfficiencyMetric",
|
|
70
|
+
"ReasoningCoherenceMetric", "LoopDetectionMetric", "AnswerFaithfulnessMetric",
|
|
71
|
+
# Advanced
|
|
72
|
+
"TrajectoryCache", "EvalPipeline", "TrajectoryRule", "TrajectoryValidator",
|
|
73
|
+
"ConfidenceScorer", "RateLimiter", "CancellationToken",
|
|
74
|
+
"abatch_evaluate", "batch_evaluate", "evaluate_with_budget",
|
|
75
|
+
"EvaluationProfiler", "DriftDetector", "EvaluationReport",
|
|
76
|
+
"stream_scores", "scores_to_ndjson", "ScoreDiff", "diff_results",
|
|
77
|
+
"RegressionTracker", "AuditLog", "CostLedger",
|
|
78
|
+
]
|
agenteval/advanced.py
ADDED
|
@@ -0,0 +1,614 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Advanced features for agenteval — 2026 Standard.
|
|
3
|
+
|
|
4
|
+
Covers: Caching, Pipeline, Validation & Schema, Async & Concurrency,
|
|
5
|
+
Observability, Streaming & Storage, Diff & Regression, Security & Cost.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import asyncio
|
|
10
|
+
import csv
|
|
11
|
+
import hashlib
|
|
12
|
+
import io
|
|
13
|
+
import json
|
|
14
|
+
import logging
|
|
15
|
+
import threading
|
|
16
|
+
import time
|
|
17
|
+
from collections import defaultdict, deque
|
|
18
|
+
from dataclasses import dataclass, field
|
|
19
|
+
from typing import Any, Callable, Dict, Generator, List, Optional, Sequence, Tuple
|
|
20
|
+
|
|
21
|
+
from agenteval.exceptions import BudgetExceededError, EvaluationError
|
|
22
|
+
from agenteval.models import EvaluationResult, Trajectory, TrajectoryScore
|
|
23
|
+
|
|
24
|
+
logger = logging.getLogger(__name__)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# ─────────────────────────────────────────────
|
|
28
|
+
# 1. CACHING
|
|
29
|
+
# ─────────────────────────────────────────────
|
|
30
|
+
|
|
31
|
+
class TrajectoryCache:
    """LRU + TTL cache for TrajectoryScore results, keyed by SHA-256.

    The key is the SHA-256 hex digest of the trajectory's ``model_dump()``
    serialised as sorted-key JSON.  All access to the internal store is
    guarded by a single lock.

    The store is a plain ``dict`` whose insertion order doubles as the
    recency order: the first key is the least recently used entry.  Keeping
    one structure (instead of a dict plus a separate order queue) means an
    expired or overwritten key can never linger in the eviction order.
    """

    def __init__(self, max_size: int = 256, ttl: float = 300.0) -> None:
        """Create a cache.

        Args:
            max_size: Maximum number of entries kept.  A value ``<= 0``
                disables storage (``put`` becomes a no-op).
            ttl: Entry lifetime in seconds, measured with ``time.time()``.
        """
        self.max_size = max_size
        self.ttl = ttl
        self._store: Dict[str, Tuple[TrajectoryScore, float]] = {}
        self._lock = threading.Lock()
        self._hits = 0
        self._misses = 0

    @staticmethod
    def _key(trajectory: Trajectory) -> str:
        """Return the SHA-256 hex digest identifying *trajectory*."""
        raw = json.dumps(trajectory.model_dump(), sort_keys=True, default=str)
        return hashlib.sha256(raw.encode()).hexdigest()

    def _evict_to(self, limit: int) -> None:
        """Drop least-recently-used entries until at most *limit* remain.

        The caller must hold ``self._lock``.
        """
        while self._store and len(self._store) > limit:
            self._store.pop(next(iter(self._store)))

    def get(self, trajectory: Trajectory) -> Optional[TrajectoryScore]:
        """Return cached score or None if expired / not found.

        A hit marks the entry as most recently used.  An expired entry is
        removed and counted as a miss.
        """
        k = self._key(trajectory)
        with self._lock:
            entry = self._store.pop(k, None)
            if entry is not None:
                score, ts = entry
                if time.time() - ts <= self.ttl:
                    # Re-inserting moves the key to the "most recent" end
                    # while keeping its original timestamp.
                    self._store[k] = entry
                    self._hits += 1
                    return score
                # Expired: the pop above already removed it completely.
            self._misses += 1
            return None

    def put(self, trajectory: Trajectory, score: TrajectoryScore) -> None:
        """Insert or update a cached score, evicting LRU if at capacity."""
        k = self._key(trajectory)
        with self._lock:
            # Remove any previous value so the key is re-inserted at the end.
            self._store.pop(k, None)
            if self.max_size <= 0:
                return
            self._evict_to(self.max_size - 1)
            self._store[k] = (score, time.time())

    def memoize(self, evaluate_fn: Callable[[Trajectory], TrajectoryScore]) -> Callable[[Trajectory], TrajectoryScore]:
        """Decorator: cache the result of evaluate_fn.

        The lookup and the insert are separate locked operations, so two
        threads that miss at the same time may both call ``evaluate_fn``.
        """
        import functools

        @functools.wraps(evaluate_fn)
        def wrapper(trajectory: Trajectory) -> TrajectoryScore:
            cached = self.get(trajectory)
            if cached is not None:
                return cached
            result = evaluate_fn(trajectory)
            self.put(trajectory, result)
            return result
        return wrapper

    def stats(self) -> Dict[str, Any]:
        """Return cache statistics (size, limits, hits, misses, hit rate)."""
        with self._lock:
            total = self._hits + self._misses
            return {
                "size": len(self._store),
                "max_size": self.max_size,
                "ttl": self.ttl,
                "hits": self._hits,
                "misses": self._misses,
                "hit_rate": self._hits / total if total > 0 else 0.0,
            }

    def save(self, path: str) -> None:
        """Persist cache to a JSON file.

        The entries are snapshotted under the lock; the file is written
        after the lock is released so slow I/O does not block other users.
        """
        with self._lock:
            data = {k: (v[0].model_dump(), v[1]) for k, v in self._store.items()}
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f)
        logger.info("TrajectoryCache saved to %s", path)

    def load(self, path: str) -> None:
        """Load cache from a JSON file.

        Loaded entries overwrite existing ones with the same key and keep
        their stored timestamps.  The result is trimmed to ``max_size``.
        """
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        with self._lock:
            for k, (score_dict, ts) in data.items():
                # Pop first so an existing key is not left at a stale position.
                self._store.pop(k, None)
                self._store[k] = (TrajectoryScore(**score_dict), ts)
            self._evict_to(max(self.max_size, 0))
        logger.info("TrajectoryCache loaded from %s", path)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
# ─────────────────────────────────────────────
|
|
117
|
+
# 2. PIPELINE
|
|
118
|
+
# ─────────────────────────────────────────────
|
|
119
|
+
|
|
120
|
+
@dataclass
class _PipelineStep:
    """One named stage of an ``EvalPipeline``."""

    # Label recorded in the audit log and matched by ``with_retry``.
    name: str
    # Batch transform: receives the current list and returns the next one.
    fn: Callable[[List[Trajectory]], List[Trajectory]]
    # Number of extra attempts made after a failing first attempt.
    retries: int = 0


class EvalPipeline:
    """Fluent, auditable pipeline for trajectory pre-processing and evaluation.

    Steps are appended with ``map`` / ``filter`` / ``branch`` (each returns
    the pipeline itself so calls can be chained) and executed in order by
    ``run`` / ``arun``.  Every completed step adds one entry to the audit log.
    """

    def __init__(self) -> None:
        self._steps: List[_PipelineStep] = []
        self._audit_log: List[Dict[str, Any]] = []

    def map(self, name: str, fn: Callable[[Trajectory], Trajectory]) -> "EvalPipeline":
        """Apply fn to each trajectory."""
        self._steps.append(_PipelineStep(name, lambda ts: [fn(t) for t in ts]))
        return self

    def filter(self, name: str, fn: Callable[[Trajectory], bool]) -> "EvalPipeline":
        """Keep only trajectories for which fn returns True."""
        self._steps.append(_PipelineStep(name, lambda ts: [t for t in ts if fn(t)]))
        return self

    def branch(self, condition: Callable[[Trajectory], bool],
               true_fn: Callable[[Trajectory], Trajectory],
               false_fn: Callable[[Trajectory], Trajectory],
               name: str = "branch") -> "EvalPipeline":
        """Route trajectories to true_fn or false_fn based on condition.

        Args:
            condition: Predicate evaluated once per trajectory.
            true_fn: Transform applied when ``condition`` is truthy.
            false_fn: Transform applied otherwise.
            name: Step label for the audit log and ``with_retry``.  Defaults
                to ``"branch"``; give distinct names when a pipeline has
                several branches so they can be told apart.
        """
        def _branch(ts: List[Trajectory]) -> List[Trajectory]:
            return [true_fn(t) if condition(t) else false_fn(t) for t in ts]
        self._steps.append(_PipelineStep(name, _branch))
        return self

    def with_retry(self, step_name: str, retries: int = 3) -> "EvalPipeline":
        """Set retry count for a named step.

        Applies to every step already added whose name equals ``step_name``.
        A warning is logged when no step matches.

        Raises:
            ValueError: If ``retries`` is negative.
        """
        if retries < 0:
            raise ValueError(f"retries must be >= 0, got {retries}")
        matched = False
        for step in self._steps:
            if step.name == step_name:
                step.retries = retries
                matched = True
        if not matched:
            logger.warning("with_retry: no pipeline step named '%s'", step_name)
        return self

    def run(self, trajectories: List[Trajectory]) -> List[Trajectory]:
        """Execute the pipeline synchronously.

        Each step receives the output of the previous one.  A failing step is
        retried up to its ``retries`` count; the input list is unchanged
        between attempts because the result is only replaced on success.

        Raises:
            EvaluationError: If a step fails on every attempt.  The last
                underlying exception is attached as the cause.
        """
        result = list(trajectories)
        for step in self._steps:
            # perf_counter is monotonic, so the elapsed time cannot go
            # negative if the system clock is adjusted mid-run.
            start = time.perf_counter()
            # Always make at least one attempt, even if ``retries`` was set
            # to a negative number directly on the step.
            attempts = max(1, step.retries + 1)
            for attempt in range(1, attempts + 1):
                try:
                    result = step.fn(result)
                    break
                except Exception as exc:
                    logger.warning("Pipeline step '%s' attempt %d failed: %s", step.name, attempt, exc)
                    if attempt == attempts:
                        raise EvaluationError(f"Pipeline step '{step.name}' failed after {attempts} attempts") from exc
            elapsed = time.perf_counter() - start
            self._audit_log.append({"step": step.name, "output_count": len(result), "elapsed_s": elapsed})
        return result

    async def arun(self, trajectories: List[Trajectory]) -> List[Trajectory]:
        """Execute the pipeline asynchronously.

        Runs ``run`` in the event loop's default executor so the loop is not
        blocked while the steps execute.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.run, trajectories)

    @property
    def audit_log(self) -> List[Dict[str, Any]]:
        """Return step audit log (a copy of the list of entries)."""
        return list(self._audit_log)
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
# ─────────────────────────────────────────────
|
|
193
|
+
# 3. VALIDATION & SCHEMA
|
|
194
|
+
# ─────────────────────────────────────────────
|
|
195
|
+
|
|
196
|
+
@dataclass
class TrajectoryRule:
    """Named predicate over a trajectory, plus the message for a failure.

    Instances are plain data; ``TrajectoryValidator`` is what evaluates them.
    """

    # Identifier of the rule; the validator quotes it when ``check`` raises.
    name: str
    # Predicate called with the trajectory; a falsy result is a violation.
    check: Callable[[Trajectory], bool]
    # Text the validator reports when ``check`` returns a falsy value.
    message: str
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
class TrajectoryValidator:
    """Runs an ordered collection of ``TrajectoryRule`` checks."""

    def __init__(self) -> None:
        self._rules: List[TrajectoryRule] = []

    def add_rule(self, rule: TrajectoryRule) -> "TrajectoryValidator":
        """Append *rule* to the rule list and return this validator."""
        self._rules.append(rule)
        return self

    @staticmethod
    def _apply(rule: TrajectoryRule, trajectory: Trajectory) -> List[str]:
        """Evaluate one rule; return zero or one violation message."""
        try:
            return [] if rule.check(trajectory) else [rule.message]
        except Exception as exc:
            # A rule that raises is reported as a violation, not propagated.
            return [f"Rule '{rule.name}' error: {exc}"]

    def validate(self, trajectory: Trajectory) -> List[str]:
        """Collect the violation messages for *trajectory*, in rule order.

        An empty list means every rule passed.
        """
        found: List[str] = []
        for rule in self._rules:
            found.extend(self._apply(rule, trajectory))
        return found

    def is_valid(self, trajectory: Trajectory) -> bool:
        """Tell whether *trajectory* produces no violations."""
        return not self.validate(trajectory)
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
class ConfidenceScorer:
    """Heuristic confidence value derived from a TrajectoryScore's metrics."""

    def score(self, ts: TrajectoryScore) -> float:
        """Return ``1 - variance`` of the metric scores, floored at 0.

        The variance is the population variance of ``ts.metric_scores``
        values, so metrics that agree with each other give a value near 1.
        The result is rounded to four decimal places.  With no metric
        scores at all the confidence is ``0.0``.
        """
        metric_scores = ts.metric_scores
        if not metric_scores:
            return 0.0
        values = list(metric_scores.values())
        count = len(values)
        mean = sum(values) / count
        squared_gaps = [(value - mean) ** 2 for value in values]
        variance = sum(squared_gaps) / count
        # Lower spread between metrics means higher confidence.
        return round(max(0.0, 1.0 - variance), 4)
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
# ─────────────────────────────────────────────
|
|
247
|
+
# 4. ASYNC & CONCURRENCY
|
|
248
|
+
# ─────────────────────────────────────────────
|
|
249
|
+
|
|
250
|
+
class RateLimiter:
    """Token-bucket rate limiter (sync + async).

    The bucket starts full.  Tokens are replenished lazily, on each acquire,
    from the time elapsed on the monotonic clock, and never exceed
    ``capacity``.  Acquisition never waits: a request is either granted
    immediately or denied.
    """

    def __init__(self, rate: float, capacity: float) -> None:
        """Create a limiter.

        Args:
            rate: Tokens added per second.  ``0`` means the bucket is never
                refilled.
            capacity: Maximum (and initial) number of tokens.

        Raises:
            ValueError: If ``rate`` or ``capacity`` is negative.
        """
        if rate < 0:
            raise ValueError(f"rate must be >= 0, got {rate}")
        if capacity < 0:
            raise ValueError(f"capacity must be >= 0, got {capacity}")
        self.rate = rate
        self.capacity = capacity
        self._tokens = capacity
        self._last = time.monotonic()
        self._lock = threading.Lock()

    def _refill(self) -> None:
        """Credit tokens for the time since the last refill.

        The caller must hold ``self._lock``.
        """
        now = time.monotonic()
        elapsed = now - self._last
        self._tokens = min(self.capacity, self._tokens + elapsed * self.rate)
        self._last = now

    def acquire(self, tokens: float = 1.0) -> bool:
        """Synchronously acquire tokens. Returns False if denied.

        Raises:
            ValueError: If ``tokens`` is negative.  A negative request would
                otherwise add tokens to the bucket and lift it above
                ``capacity``.
        """
        if tokens < 0:
            raise ValueError(f"tokens must be >= 0, got {tokens}")
        with self._lock:
            self._refill()
            if self._tokens >= tokens:
                self._tokens -= tokens
                return True
            return False

    async def aacquire(self, tokens: float = 1.0) -> bool:
        """Async acquire (non-blocking).

        Delegates to ``acquire`` in the running loop's default executor and
        returns its result.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.acquire, tokens)
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
class CancellationToken:
    """One-way flag a caller can raise to ask a batch evaluation to stop.

    The token only records the request; code that accepts a token is
    responsible for reading ``is_cancelled`` and acting on it.
    """

    def __init__(self) -> None:
        # Starts clear; flips to True on cancel() and is never reset.
        self._cancelled = False

    @property
    def is_cancelled(self) -> bool:
        """Whether ``cancel`` has been called on this token."""
        return self._cancelled

    def cancel(self) -> None:
        """Request cancellation.  Calling it again has no further effect."""
        self._cancelled = True
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
async def abatch_evaluate(
    trajectories: List[Trajectory],
    evaluate_fn: Callable[[Trajectory], TrajectoryScore],
    concurrency: int = 8,
    token: Optional[CancellationToken] = None,
) -> List[TrajectoryScore]:
    """Async concurrent trajectory evaluation with optional cancellation.

    Each trajectory is evaluated by calling ``evaluate_fn`` in the running
    loop's default executor, with at most ``concurrency`` evaluations in
    flight at once.  Results are returned in input order.

    Args:
        trajectories: Items to evaluate.
        evaluate_fn: Synchronous scoring function.
        concurrency: Maximum number of simultaneous evaluations (>= 1).
        token: Optional cancellation token.  It is consulted when an item
            obtains its concurrency slot, i.e. immediately before its
            evaluation would start, so a cancellation issued while the batch
            is running stops the items that are still queued.  Evaluations
            that have already started are not interrupted.

    Raises:
        ValueError: If ``concurrency`` is less than 1 (a zero-slot semaphore
            would make every item wait forever).
        asyncio.CancelledError: If the token is cancelled before an item
            starts.
    """
    if concurrency < 1:
        raise ValueError(f"concurrency must be >= 1, got {concurrency}")
    sem = asyncio.Semaphore(concurrency)
    loop = asyncio.get_running_loop()

    async def _eval(t: Trajectory) -> TrajectoryScore:
        async with sem:
            # Checked after the slot is acquired: items waiting on the
            # semaphore must observe a cancellation raised in the meantime.
            if token is not None and token.is_cancelled:
                raise asyncio.CancelledError()
            return await loop.run_in_executor(None, evaluate_fn, t)

    return list(await asyncio.gather(*(_eval(t) for t in trajectories)))
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
def batch_evaluate(
    trajectories: List[Trajectory],
    evaluate_fn: Callable[[Trajectory], TrajectoryScore],
    max_workers: int = 4,
) -> List[TrajectoryScore]:
    """Score trajectories on a thread pool and return results in input order.

    Args:
        trajectories: Items to evaluate.
        evaluate_fn: Scoring function called once per trajectory.
        max_workers: Size of the thread pool.
    """
    from concurrent.futures import ThreadPoolExecutor

    executor = ThreadPoolExecutor(max_workers=max_workers)
    with executor:
        # map() yields results in submission order; draining it inside the
        # ``with`` keeps collection ahead of the pool shutdown.
        ordered_scores = executor.map(evaluate_fn, trajectories)
        return [*ordered_scores]
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
def evaluate_with_budget(
    trajectories: List[Trajectory],
    evaluate_fn: Callable[[Trajectory], TrajectoryScore],
    budget_seconds: float = 30.0,
) -> List[TrajectoryScore]:
    """Score trajectories one after another under a time limit.

    The limit is checked on the monotonic clock before each evaluation
    starts; an evaluation that is already running is never interrupted.

    Raises:
        BudgetExceededError: If the limit has been reached when the next
            trajectory is about to be evaluated.  Scores collected so far
            are not returned.
    """
    stop_at = time.monotonic() + budget_seconds
    scored: List[TrajectoryScore] = []
    for trajectory in trajectories:
        out_of_time = time.monotonic() >= stop_at
        if out_of_time:
            raise BudgetExceededError(f"Time budget of {budget_seconds}s exceeded after {len(scored)} trajectories.")
        scored.append(evaluate_fn(trajectory))
    return scored
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
# ─────────────────────────────────────────────
|
|
342
|
+
# 5. OBSERVABILITY
|
|
343
|
+
# ─────────────────────────────────────────────
|
|
344
|
+
|
|
345
|
+
class EvaluationProfiler:
    """Tracks wall-clock timing per evaluation call.

    Wrap an evaluation function with ``profile``; each successful call adds
    one record holding the trajectory id, the elapsed time, and the score's
    ``overall_score`` and ``passed`` values.
    """

    def __init__(self) -> None:
        self._records: List[Dict[str, Any]] = []

    def profile(self, evaluate_fn: Callable[[Trajectory], TrajectoryScore]) -> Callable[[Trajectory], TrajectoryScore]:
        """Decorator that records call timing.

        The wrapper returns the wrapped function's result unchanged.  If the
        wrapped function raises, the exception propagates and no record is
        added.
        """
        import functools

        @functools.wraps(evaluate_fn)
        def wrapper(trajectory: Trajectory) -> TrajectoryScore:
            start = time.perf_counter()
            result = evaluate_fn(trajectory)
            elapsed = time.perf_counter() - start
            self._records.append({
                "trajectory_id": trajectory.trajectory_id,
                "elapsed_s": round(elapsed, 6),
                "overall_score": result.overall_score,
                "passed": result.passed,
            })
            return result
        return wrapper

    def report(self) -> Dict[str, Any]:
        """Return profiling summary.

        With no recorded calls the result is ``{"calls": 0}``.  Otherwise it
        holds the call count, the mean / max / min elapsed seconds, and a
        copy of the per-call records, so editing the returned report cannot
        alter the profiler's own history.
        """
        if not self._records:
            return {"calls": 0}
        elapsed_vals = [r["elapsed_s"] for r in self._records]
        return {
            "calls": len(self._records),
            "mean_elapsed_s": sum(elapsed_vals) / len(elapsed_vals),
            "max_elapsed_s": max(elapsed_vals),
            "min_elapsed_s": min(elapsed_vals),
            "records": [dict(r) for r in self._records],
        }
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
class DriftDetector:
    """Flags metrics whose mean moved away from a stored baseline."""

    def __init__(self, threshold: float = 0.1) -> None:
        # Minimum absolute change in a metric mean that counts as drift.
        self.threshold = threshold
        self._baseline: Optional[Dict[str, float]] = None

    def set_baseline(self, result: EvaluationResult) -> None:
        """Store a copy of ``result.metric_means`` as the reference values."""
        snapshot = dict(result.metric_means)
        self._baseline = snapshot
        logger.info("DriftDetector baseline set: %s", self._baseline)

    def detect(self, result: EvaluationResult) -> Dict[str, float]:
        """Compare ``result.metric_means`` with the baseline.

        Returns a mapping of metric name to absolute difference (rounded to
        four places) for every metric whose difference is at least
        ``threshold``; each one is also logged as a warning.  A metric
        absent from the baseline is compared with itself, giving a
        difference of zero.  Returns an empty dict when no baseline (or an
        empty one) has been set.
        """
        baseline = self._baseline
        if not baseline:
            return {}
        flagged: Dict[str, float] = {}
        for name, current in result.metric_means.items():
            reference = baseline.get(name, current)
            gap = abs(current - reference)
            if not gap >= self.threshold:
                continue
            flagged[name] = round(gap, 4)
            logger.warning("Drift detected in '%s': %.4f", name, gap)
        return flagged
|
|
404
|
+
|
|
405
|
+
|
|
406
|
+
class EvaluationReport:
    """Export EvaluationResult to JSON, CSV, or Markdown."""

    def __init__(self, result: EvaluationResult) -> None:
        self._result = result

    def to_json(self, indent: int = 2) -> str:
        """Serialize to JSON string.

        Dumps ``result.model_dump()``; values JSON cannot encode natively
        are converted with ``str``.
        """
        return json.dumps(self._result.model_dump(), indent=indent, default=str)

    def to_csv(self) -> str:
        """Serialize per-trajectory scores to CSV.

        The header is ``trajectory_id, task, overall_score, passed`` followed
        by one column per metric.  Metric columns are the union of the
        metrics found on all scores, in first-seen order, so a metric that
        is missing from the first score is still exported.  A score lacking
        a metric gets an empty cell.  Returns an empty string when there are
        no scores.
        """
        buf = io.StringIO()
        writer = csv.writer(buf)
        scores = self._result.scores
        if scores:
            # dict.fromkeys de-duplicates while preserving first-seen order.
            metric_names = list(dict.fromkeys(m for s in scores for m in s.metric_scores))
            writer.writerow(["trajectory_id", "task", "overall_score", "passed"] + metric_names)
            for s in scores:
                row = [s.trajectory_id, s.task, s.overall_score, s.passed]
                row += [s.metric_scores.get(m, "") for m in metric_names]
                writer.writerow(row)
        return buf.getvalue()

    def to_markdown(self) -> str:
        """Render a Markdown summary table.

        Contains the evaluated count, mean overall score, pass rate, and a
        two-column table of metric means formatted to three decimals.
        """
        r = self._result
        lines = [
            "# Evaluation Report",
            f"**Trajectories evaluated:** {r.trajectories_evaluated}",
            f"**Mean overall score:** {r.mean_overall:.3f}",
            f"**Pass rate:** {r.pass_rate:.1%}",
            "",
            "## Metric Means",
            "| Metric | Mean Score |",
            "|--------|-----------|",
        ]
        lines.extend(f"| {metric} | {val:.3f} |" for metric, val in r.metric_means.items())
        return "\n".join(lines)
|
|
445
|
+
|
|
446
|
+
|
|
447
|
+
# ─────────────────────────────────────────────
|
|
448
|
+
# 6. STREAMING & STORAGE
|
|
449
|
+
# ─────────────────────────────────────────────
|
|
450
|
+
|
|
451
|
+
def stream_scores(
    trajectories: List[Trajectory],
    evaluate_fn: Callable[[Trajectory], TrajectoryScore],
) -> Generator[TrajectoryScore, None, None]:
    """Lazily evaluate trajectories, yielding each score as it is produced.

    Nothing is evaluated until the generator is advanced, and each
    trajectory is scored only when its result is requested.
    """
    yield from (evaluate_fn(trajectory) for trajectory in trajectories)
|
|
458
|
+
|
|
459
|
+
|
|
460
|
+
def scores_to_ndjson(
    trajectories: List[Trajectory],
    evaluate_fn: Callable[[Trajectory], TrajectoryScore],
) -> Generator[str, None, None]:
    """Yield one JSON document per evaluated trajectory.

    Each yielded string is the score's ``model_dump()`` encoded as JSON,
    with values JSON cannot encode natively converted via ``str``.  The
    strings carry no trailing newline; the caller adds line separators.
    """
    yield from (
        json.dumps(score.model_dump(), default=str)
        for score in stream_scores(trajectories, evaluate_fn)
    )
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
# ─────────────────────────────────────────────
|
|
470
|
+
# 7. DIFF & REGRESSION
|
|
471
|
+
# ─────────────────────────────────────────────
|
|
472
|
+
|
|
473
|
+
@dataclass
class ScoreDiff:
    """Outcome of comparing two evaluation results by trajectory id."""

    # Ids present only in the newer result.
    added_trajectories: List[str]
    # Ids present only in the older result.
    removed_trajectories: List[str]
    # trajectory_id -> score delta for trajectories whose score went up.
    improved: Dict[str, float]
    # trajectory_id -> score delta for trajectories whose score went down.
    regressed: Dict[str, float]
    # Ids classified as neither improved nor regressed.
    unchanged: List[str]

    def summary(self) -> str:
        """Return a one-line count of each category."""
        categories = (
            ("Added", self.added_trajectories),
            ("Removed", self.removed_trajectories),
            ("Improved", self.improved),
            ("Regressed", self.regressed),
            ("Unchanged", self.unchanged),
        )
        return ", ".join(f"{label}: {len(members)}" for label, members in categories)

    def to_json(self) -> str:
        """Return the diff as a JSON object indented by two spaces."""
        payload = {
            "added": self.added_trajectories,
            "removed": self.removed_trajectories,
            "improved": self.improved,
            "regressed": self.regressed,
            "unchanged": self.unchanged,
        }
        return json.dumps(payload, indent=2)
|
|
499
|
+
|
|
500
|
+
|
|
501
|
+
def diff_results(a: EvaluationResult, b: EvaluationResult) -> ScoreDiff:
    """Classify how each trajectory's overall score changed from *a* to *b*.

    Trajectories are matched by ``trajectory_id``.  For ids present in both
    results the delta is ``b - a``: above ``0.01`` counts as improved, below
    ``-0.01`` as regressed, anything else as unchanged.  Recorded deltas are
    rounded to four decimal places.
    """
    before = {score.trajectory_id: score.overall_score for score in a.scores}
    after = {score.trajectory_id: score.overall_score for score in b.scores}

    only_in_b = [tid for tid in after if tid not in before]
    only_in_a = [tid for tid in before if tid not in after]

    gains: Dict[str, float] = {}
    losses: Dict[str, float] = {}
    steady: List[str] = []
    for tid, old_score in before.items():
        if tid not in after:
            continue
        delta = after[tid] - old_score
        if delta > 0.01:
            gains[tid] = round(delta, 4)
        elif delta < -0.01:
            losses[tid] = round(delta, 4)
        else:
            steady.append(tid)

    return ScoreDiff(
        added_trajectories=only_in_b,
        removed_trajectories=only_in_a,
        improved=gains,
        regressed=losses,
        unchanged=steady,
    )
|
|
524
|
+
|
|
525
|
+
|
|
526
|
+
class RegressionTracker:
    """Keeps the most recent evaluation results and summarises their trend."""

    def __init__(self, window: int = 10) -> None:
        # Only the last ``window`` results are retained; older ones fall off.
        self.window = window
        self._history: deque = deque(maxlen=window)

    def record(self, result: EvaluationResult) -> None:
        """Add *result* as the newest entry in the history."""
        self._history.append(result)

    def trend(self) -> str:
        """Classify the retained history as improving, declining or stable.

        Uses the average change in ``mean_overall`` between consecutive
        results: above ``0.01`` is ``'improving'``, below ``-0.01`` is
        ``'declining'``, otherwise ``'stable'``.  With fewer than two
        results the trend is ``'stable'``.
        """
        if len(self._history) < 2:
            return "stable"
        means = [run.mean_overall for run in self._history]
        steps = [later - earlier for earlier, later in zip(means, means[1:])]
        average_step = sum(steps) / len(steps)
        if average_step > 0.01:
            return "improving"
        return "declining" if average_step < -0.01 else "stable"

    def latest_regression(self) -> Optional[ScoreDiff]:
        """Diff the two newest results; None when fewer than two exist."""
        if len(self._history) < 2:
            return None
        previous, current = list(self._history)[-2:]
        return diff_results(previous, current)
|
|
556
|
+
|
|
557
|
+
|
|
558
|
+
# ─────────────────────────────────────────────
|
|
559
|
+
# 8. SECURITY & COST
|
|
560
|
+
# ─────────────────────────────────────────────
|
|
561
|
+
|
|
562
|
+
class AuditLog:
    """In-memory list of evaluation events with lock-guarded access.

    The class offers no method to remove or edit entries.
    """

    def __init__(self) -> None:
        self._entries: List[Dict[str, Any]] = []
        self._lock = threading.Lock()

    def log(self, event: str, data: Dict[str, Any]) -> None:
        """Record *event* with the current time and the fields of *data*.

        Keys in *data* are merged last, so a ``"event"`` or ``"timestamp"``
        key in *data* replaces the generated value.
        """
        record: Dict[str, Any] = {"event": event, "timestamp": time.time()}
        record.update(data)
        with self._lock:
            self._entries.append(record)

    @property
    def entries(self) -> List[Dict[str, Any]]:
        """A new list holding the recorded entries, oldest first."""
        with self._lock:
            snapshot = list(self._entries)
        return snapshot

    def to_json(self, indent: int = 2) -> str:
        """Encode all entries as a JSON array; non-JSON values go through ``str``."""
        with self._lock:
            return json.dumps(self._entries, indent=indent, default=str)
|
|
583
|
+
|
|
584
|
+
|
|
585
|
+
@dataclass
class CostLedger:
    """Track evaluation cost per run (token counts or arbitrary units).

    Entries are kept in memory; every read and write of the entry list
    happens while holding ``_lock``.
    """

    # One dict per recorded call: trajectory_id, tokens, cost_usd, timestamp.
    _entries: List[Dict[str, Any]] = field(default_factory=list)
    # Non-reentrant lock guarding ``_entries``.
    _lock: threading.Lock = field(default_factory=threading.Lock)

    def _sum(self, key: str) -> Any:
        """Sum *key* over all entries.  The caller must hold ``_lock``."""
        return sum(e[key] for e in self._entries)

    def record(self, trajectory_id: str, tokens: int, cost_usd: float) -> None:
        """Append one cost entry stamped with the current time."""
        with self._lock:
            self._entries.append({
                "trajectory_id": trajectory_id,
                "tokens": tokens,
                "cost_usd": cost_usd,
                "timestamp": time.time(),
            })

    def total_cost(self) -> float:
        """Return the sum of ``cost_usd`` over all entries."""
        with self._lock:
            return self._sum("cost_usd")

    def total_tokens(self) -> int:
        """Return the sum of ``tokens`` over all entries."""
        with self._lock:
            return self._sum("tokens")

    def summary(self) -> Dict[str, Any]:
        """Return call count, total tokens and total cost as one snapshot.

        The totals are computed directly under a single lock acquisition.
        Calling ``total_tokens()`` / ``total_cost()`` from here would try to
        take the same non-reentrant lock again and block forever.
        """
        with self._lock:
            return {
                "calls": len(self._entries),
                "total_tokens": self._sum("tokens"),
                "total_cost_usd": self._sum("cost_usd"),
            }
|