trajscore 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
agenteval/__init__.py ADDED
@@ -0,0 +1,78 @@
1
+ """
2
+ agenteval — Agentic Multi-Step Trajectory Evaluation.
3
+
4
+ Evaluate any AI agent trajectory with production-grade metrics:
5
+ goal completion, tool accuracy, step efficiency, reasoning coherence,
6
+ loop detection, and answer faithfulness.
7
+ """
8
+ from agenteval.models import (
9
+ StepType,
10
+ TrajectoryStep,
11
+ Trajectory,
12
+ StepScore,
13
+ TrajectoryScore,
14
+ EvaluationResult,
15
+ )
16
+ from agenteval.evaluator import TrajectoryEvaluator
17
+ from agenteval.watcher import TrajectoryWatcher
18
+ from agenteval.exceptions import (
19
+ AgentEvalError,
20
+ TrajectoryValidationError,
21
+ EvaluationError,
22
+ MetricNotFoundError,
23
+ BudgetExceededError,
24
+ SchemaViolationError,
25
+ )
26
+ from agenteval.metrics import (
27
+ GoalCompletionMetric,
28
+ ToolAccuracyMetric,
29
+ StepEfficiencyMetric,
30
+ ReasoningCoherenceMetric,
31
+ LoopDetectionMetric,
32
+ AnswerFaithfulnessMetric,
33
+ )
34
+ from agenteval.advanced import (
35
+ TrajectoryCache,
36
+ EvalPipeline,
37
+ TrajectoryRule,
38
+ TrajectoryValidator,
39
+ ConfidenceScorer,
40
+ RateLimiter,
41
+ CancellationToken,
42
+ abatch_evaluate,
43
+ batch_evaluate,
44
+ evaluate_with_budget,
45
+ EvaluationProfiler,
46
+ DriftDetector,
47
+ EvaluationReport,
48
+ stream_scores,
49
+ scores_to_ndjson,
50
+ ScoreDiff,
51
+ diff_results,
52
+ RegressionTracker,
53
+ AuditLog,
54
+ CostLedger,
55
+ )
56
+
57
__version__ = "1.0.0"

# Public API of the package, grouped by the submodule each name comes from.
__all__ = [
    # Models
    "StepType",
    "TrajectoryStep",
    "Trajectory",
    "StepScore",
    "TrajectoryScore",
    "EvaluationResult",
    # Core
    "TrajectoryEvaluator",
    "TrajectoryWatcher",
    # Exceptions
    "AgentEvalError",
    "TrajectoryValidationError",
    "EvaluationError",
    "MetricNotFoundError",
    "BudgetExceededError",
    "SchemaViolationError",
    # Metrics
    "GoalCompletionMetric",
    "ToolAccuracyMetric",
    "StepEfficiencyMetric",
    "ReasoningCoherenceMetric",
    "LoopDetectionMetric",
    "AnswerFaithfulnessMetric",
    # Advanced
    "TrajectoryCache",
    "EvalPipeline",
    "TrajectoryRule",
    "TrajectoryValidator",
    "ConfidenceScorer",
    "RateLimiter",
    "CancellationToken",
    "abatch_evaluate",
    "batch_evaluate",
    "evaluate_with_budget",
    "EvaluationProfiler",
    "DriftDetector",
    "EvaluationReport",
    "stream_scores",
    "scores_to_ndjson",
    "ScoreDiff",
    "diff_results",
    "RegressionTracker",
    "AuditLog",
    "CostLedger",
]
agenteval/advanced.py ADDED
@@ -0,0 +1,614 @@
1
+ """
2
+ Advanced features for agenteval — 2026 Standard.
3
+
4
+ Covers: Caching, Pipeline, Validation & Schema, Async & Concurrency,
5
+ Observability, Streaming & Storage, Diff & Regression, Security & Cost.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import asyncio
10
+ import csv
11
+ import hashlib
12
+ import io
13
+ import json
14
+ import logging
15
+ import threading
16
+ import time
17
+ from collections import defaultdict, deque
18
+ from dataclasses import dataclass, field
19
+ from typing import Any, Callable, Dict, Generator, List, Optional, Sequence, Tuple
20
+
21
+ from agenteval.exceptions import BudgetExceededError, EvaluationError
22
+ from agenteval.models import EvaluationResult, Trajectory, TrajectoryScore
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
+ # ─────────────────────────────────────────────
28
+ # 1. CACHING
29
+ # ─────────────────────────────────────────────
30
+
31
class TrajectoryCache:
    """LRU + TTL cache for TrajectoryScore results, keyed by SHA-256.

    Entries live in a single dict whose insertion order doubles as the
    recency order: the first key is the least recently used one and the
    last key is the most recently used one. A successful ``get`` and every
    ``put`` move the key to the most-recent end, so eviction is true LRU.

    Args:
        max_size: Maximum number of entries kept. Values <= 0 disable
            storage entirely (``put`` becomes a no-op).
        ttl: Lifetime of an entry in seconds, measured with ``time.time()``.
    """

    def __init__(self, max_size: int = 256, ttl: float = 300.0) -> None:
        self.max_size = max_size
        self.ttl = ttl
        # key -> (score, insertion timestamp); dict order == recency order.
        self._store: Dict[str, Tuple[TrajectoryScore, float]] = {}
        self._lock = threading.Lock()
        self._hits = 0
        self._misses = 0

    @staticmethod
    def _key(trajectory: Trajectory) -> str:
        """Return the SHA-256 hex digest of the trajectory's JSON dump."""
        raw = json.dumps(trajectory.model_dump(), sort_keys=True, default=str)
        return hashlib.sha256(raw.encode()).hexdigest()

    def _evict_to(self, limit: int) -> None:
        """Drop least-recently-used entries until at most ``limit`` remain.

        Must be called with ``self._lock`` held.
        """
        while len(self._store) > limit:
            del self._store[next(iter(self._store))]

    def get(self, trajectory: Trajectory) -> Optional[TrajectoryScore]:
        """Return cached score or None if expired / not found.

        A hit refreshes the entry's recency; an expired entry is removed.
        Both expired and missing lookups count as misses.
        """
        k = self._key(trajectory)
        with self._lock:
            # Pop first: an expired entry is then already gone, and a live
            # one is re-inserted below at the most-recent end.
            entry = self._store.pop(k, None)
            if entry is not None:
                score, ts = entry
                if time.time() - ts <= self.ttl:
                    self._store[k] = entry
                    self._hits += 1
                    return score
            self._misses += 1
            return None

    def put(self, trajectory: Trajectory, score: TrajectoryScore) -> None:
        """Insert or update a cached score, evicting LRU if at capacity."""
        k = self._key(trajectory)
        with self._lock:
            self._store.pop(k, None)
            if self.max_size <= 0:
                # Nothing can be stored; avoid evicting from an empty cache.
                return
            self._evict_to(self.max_size - 1)
            self._store[k] = (score, time.time())

    def memoize(self, evaluate_fn: Callable[[Trajectory], TrajectoryScore]) -> Callable[[Trajectory], TrajectoryScore]:
        """Decorator: cache the result of evaluate_fn."""
        def wrapper(trajectory: Trajectory) -> TrajectoryScore:
            cached = self.get(trajectory)
            if cached is not None:
                return cached
            result = evaluate_fn(trajectory)
            self.put(trajectory, result)
            return result
        return wrapper

    def stats(self) -> Dict[str, Any]:
        """Return cache statistics (size, limits, hits, misses, hit rate)."""
        with self._lock:
            total = self._hits + self._misses
            return {
                "size": len(self._store),
                "max_size": self.max_size,
                "ttl": self.ttl,
                "hits": self._hits,
                "misses": self._misses,
                "hit_rate": self._hits / total if total > 0 else 0.0,
            }

    def save(self, path: str) -> None:
        """Persist cache to a JSON file (entries are written in recency order)."""
        with self._lock:
            data = {k: (v[0].model_dump(), v[1]) for k, v in self._store.items()}
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f)
        logger.info("TrajectoryCache saved to %s", path)

    def load(self, path: str) -> None:
        """Load cache from a JSON file, merging into the current contents.

        Loaded keys replace existing ones, and the result is trimmed back to
        ``max_size`` by evicting the least recently used entries.
        """
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        with self._lock:
            for k, (score_dict, ts) in data.items():
                self._store.pop(k, None)
                self._store[k] = (TrajectoryScore(**score_dict), ts)
            self._evict_to(max(self.max_size, 0))
        logger.info("TrajectoryCache loaded from %s", path)
114
+
115
+
116
+ # ─────────────────────────────────────────────
117
+ # 2. PIPELINE
118
+ # ─────────────────────────────────────────────
119
+
120
@dataclass
class _PipelineStep:
    """One named pipeline stage operating on the whole trajectory list."""
    name: str
    fn: Callable[[List[Trajectory]], List[Trajectory]]
    retries: int = 0  # extra attempts after the first failure


class EvalPipeline:
    """Fluent, auditable pipeline for trajectory pre-processing and evaluation.

    Steps are appended with ``map`` / ``filter`` / ``branch`` and executed in
    order by ``run``; every completed step adds one entry to ``audit_log``.
    """

    def __init__(self) -> None:
        self._steps: List[_PipelineStep] = []
        self._audit_log: List[Dict[str, Any]] = []

    def map(self, name: str, fn: Callable[[Trajectory], Trajectory]) -> "EvalPipeline":
        """Apply fn to each trajectory."""
        self._steps.append(_PipelineStep(name, lambda ts: [fn(t) for t in ts]))
        return self

    def filter(self, name: str, fn: Callable[[Trajectory], bool]) -> "EvalPipeline":
        """Keep only trajectories for which fn returns True."""
        self._steps.append(_PipelineStep(name, lambda ts: [t for t in ts if fn(t)]))
        return self

    def branch(self, condition: Callable[[Trajectory], bool],
               true_fn: Callable[[Trajectory], Trajectory],
               false_fn: Callable[[Trajectory], Trajectory],
               name: str = "branch") -> "EvalPipeline":
        """Route trajectories to true_fn or false_fn based on condition.

        ``name`` labels the step in the audit log and for ``with_retry``;
        it defaults to ``"branch"``.
        """
        def _branch(ts: List[Trajectory]) -> List[Trajectory]:
            return [true_fn(t) if condition(t) else false_fn(t) for t in ts]
        self._steps.append(_PipelineStep(name, _branch))
        return self

    def with_retry(self, step_name: str, retries: int = 3) -> "EvalPipeline":
        """Set retry count for every step named ``step_name``.

        Negative values are clamped to 0 so a step always runs at least once.
        A warning is logged when no step carries that name.
        """
        retries = max(retries, 0)
        matched = False
        for step in self._steps:
            if step.name == step_name:
                step.retries = retries
                matched = True
        if not matched:
            logger.warning("with_retry: no pipeline step named '%s'", step_name)
        return self

    def run(self, trajectories: List[Trajectory]) -> List[Trajectory]:
        """Execute the pipeline synchronously.

        Returns:
            The trajectory list produced by the last step.

        Raises:
            EvaluationError: if a step still fails after all its attempts;
                the last underlying exception is chained as the cause.
        """
        result = list(trajectories)
        for step in self._steps:
            start = time.time()
            # Always allow at least one attempt, even if retries is negative.
            max_attempts = max(step.retries, 0) + 1
            last_exc: Optional[Exception] = None
            for attempt in range(1, max_attempts + 1):
                try:
                    result = step.fn(result)
                    break
                except Exception as exc:
                    last_exc = exc
                    logger.warning("Pipeline step '%s' attempt %d failed: %s", step.name, attempt, exc)
            else:
                # Loop ended without ``break``: every attempt failed.
                raise EvaluationError(f"Pipeline step '{step.name}' failed after {max_attempts} attempts") from last_exc
            elapsed = time.time() - start
            self._audit_log.append({"step": step.name, "output_count": len(result), "elapsed_s": elapsed})
        return result

    async def arun(self, trajectories: List[Trajectory]) -> List[Trajectory]:
        """Execute the pipeline in the default executor and await the result."""
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.run, trajectories)

    @property
    def audit_log(self) -> List[Dict[str, Any]]:
        """Return a copy of the step audit log."""
        return list(self._audit_log)
190
+
191
+
192
+ # ─────────────────────────────────────────────
193
+ # 3. VALIDATION & SCHEMA
194
+ # ─────────────────────────────────────────────
195
+
196
@dataclass
class TrajectoryRule:
    """Declarative check applied to a trajectory by TrajectoryValidator."""
    name: str  # identifier used when the check itself raises
    check: Callable[[Trajectory], bool]  # truthy result means the rule passes
    message: str  # violation text reported when the check is falsy


class TrajectoryValidator:
    """Runs an ordered collection of TrajectoryRule checks."""

    def __init__(self) -> None:
        self._rules: List[TrajectoryRule] = []

    def add_rule(self, rule: TrajectoryRule) -> "TrajectoryValidator":
        """Append ``rule`` and return this validator so calls can be chained."""
        self._rules.append(rule)
        return self

    def validate(self, trajectory: Trajectory) -> List[str]:
        """Evaluate every rule in registration order.

        Returns:
            One message per failed rule; an empty list means the trajectory
            is valid. A rule that raises is reported as a violation that
            carries the rule name and the exception text.
        """
        problems = []
        for rule in self._rules:
            try:
                if rule.check(trajectory):
                    continue
                problems.append(rule.message)
            except Exception as exc:
                problems.append(f"Rule '{rule.name}' error: {exc}")
        return problems

    def is_valid(self, trajectory: Trajectory) -> bool:
        """Return True when ``validate`` reports no violations."""
        return not self.validate(trajectory)
229
+
230
+
231
class ConfidenceScorer:
    """Heuristic confidence estimate for a TrajectoryScore."""

    def score(self, ts: TrajectoryScore) -> float:
        """Return ``1 - population variance`` of the metric scores.

        The result is floored at 0.0 and rounded to four decimals, so tightly
        clustered metric scores give a value close to 1.0. A score object
        without any metric scores yields 0.0.
        """
        if not ts.metric_scores:
            return 0.0
        values = list(ts.metric_scores.values())
        count = len(values)
        center = sum(values) / count
        spread = sum((v - center) ** 2 for v in values) / count
        # The more the metrics disagree, the lower the confidence.
        return round(max(0.0, 1.0 - spread), 4)
244
+
245
+
246
+ # ─────────────────────────────────────────────
247
+ # 4. ASYNC & CONCURRENCY
248
+ # ─────────────────────────────────────────────
249
+
250
class RateLimiter:
    """Token-bucket rate limiter (sync + async).

    Args:
        rate: Tokens added per second of elapsed monotonic time.
        capacity: Maximum number of tokens the bucket can hold; the bucket
            starts full.
    """

    def __init__(self, rate: float, capacity: float) -> None:
        self.rate = rate
        self.capacity = capacity
        self._tokens = capacity
        self._last = time.monotonic()
        self._lock = threading.Lock()

    def _refill(self) -> None:
        """Credit tokens for the time elapsed since the last refill.

        Must be called with ``self._lock`` held.
        """
        now = time.monotonic()
        elapsed = now - self._last
        self._tokens = min(self.capacity, self._tokens + elapsed * self.rate)
        self._last = now

    def acquire(self, tokens: float = 1.0) -> bool:
        """Try to take ``tokens`` from the bucket without waiting.

        Returns:
            True if the tokens were deducted, False if the bucket did not
            hold enough (nothing is deducted in that case).
        """
        with self._lock:
            self._refill()
            if self._tokens >= tokens:
                self._tokens -= tokens
                return True
            return False

    async def aacquire(self, tokens: float = 1.0) -> bool:
        """Async variant of ``acquire``; runs it in the default executor.

        Like ``acquire`` it never waits for tokens to become available.
        """
        # get_running_loop() is the supported way to obtain the loop from
        # inside a coroutine; get_event_loop() is deprecated for this use.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.acquire, tokens)
279
+
280
+
281
class CancellationToken:
    """Cooperative cancellation flag shared between a caller and a batch run.

    The token only records the request; code that receives it is expected to
    poll ``is_cancelled`` and stop on its own.
    """

    def __init__(self) -> None:
        self._cancelled = False

    @property
    def is_cancelled(self) -> bool:
        """True once ``cancel`` has been called."""
        return self._cancelled

    def cancel(self) -> None:
        """Request cancellation; calling it more than once is harmless."""
        self._cancelled = True
294
+
295
+
296
async def abatch_evaluate(
    trajectories: List[Trajectory],
    evaluate_fn: Callable[[Trajectory], TrajectoryScore],
    concurrency: int = 8,
    token: Optional[CancellationToken] = None,
) -> List[TrajectoryScore]:
    """Async concurrent trajectory evaluation with optional cancellation.

    Each trajectory is scored by ``evaluate_fn`` in the event loop's default
    executor, with at most ``concurrency`` evaluations in flight.

    Args:
        trajectories: Items to evaluate.
        evaluate_fn: Synchronous scoring function applied to each item.
        concurrency: Maximum number of simultaneous evaluations.
        token: Optional cancellation token; once it reports
            ``is_cancelled``, evaluations that have not started are skipped.

    Returns:
        Scores in the same order as ``trajectories``.

    Raises:
        asyncio.CancelledError: if the token is cancelled before every
            trajectory has started evaluating.
    """
    sem = asyncio.Semaphore(concurrency)
    loop = asyncio.get_running_loop()

    def _raise_if_cancelled() -> None:
        if token is not None and token.is_cancelled:
            raise asyncio.CancelledError()

    async def _eval(t: Trajectory) -> TrajectoryScore:
        _raise_if_cancelled()
        async with sem:
            # All tasks start at once and queue on the semaphore, so the
            # token must be re-checked when a slot is actually obtained;
            # otherwise a cancel issued mid-batch would never take effect.
            _raise_if_cancelled()
            return await loop.run_in_executor(None, evaluate_fn, t)

    return list(await asyncio.gather(*(_eval(t) for t in trajectories)))
313
+
314
+
315
def batch_evaluate(
    trajectories: List[Trajectory],
    evaluate_fn: Callable[[Trajectory], TrajectoryScore],
    max_workers: int = 4,
) -> List[TrajectoryScore]:
    """Score trajectories concurrently on a thread pool.

    Args:
        trajectories: Items to evaluate.
        evaluate_fn: Scoring function applied to each item.
        max_workers: Size of the thread pool.

    Returns:
        Scores in the same order as ``trajectories``.
    """
    from concurrent.futures import ThreadPoolExecutor

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Executor.map yields results in input order, not completion order.
        ordered_scores = executor.map(evaluate_fn, trajectories)
        return [*ordered_scores]
324
+
325
+
326
def evaluate_with_budget(
    trajectories: List[Trajectory],
    evaluate_fn: Callable[[Trajectory], TrajectoryScore],
    budget_seconds: float = 30.0,
) -> List[TrajectoryScore]:
    """Score trajectories sequentially until a wall-clock budget runs out.

    The budget is checked before each evaluation starts; an evaluation that
    is already running is never interrupted.

    Args:
        trajectories: Items to evaluate, in order.
        evaluate_fn: Scoring function applied to each item.
        budget_seconds: Time allowance measured with ``time.monotonic()``.

    Returns:
        One score per trajectory when everything finishes within budget.

    Raises:
        BudgetExceededError: when the deadline has passed before an
            evaluation could start; scores gathered so far are not returned.
    """
    cutoff = time.monotonic() + budget_seconds
    scored: List[TrajectoryScore] = []
    for trajectory in trajectories:
        out_of_time = time.monotonic() >= cutoff
        if out_of_time:
            raise BudgetExceededError(f"Time budget of {budget_seconds}s exceeded after {len(scored)} trajectories.")
        scored.append(evaluate_fn(trajectory))
    return scored
339
+
340
+
341
+ # ─────────────────────────────────────────────
342
+ # 5. OBSERVABILITY
343
+ # ─────────────────────────────────────────────
344
+
345
class EvaluationProfiler:
    """Collects wall-clock timings for wrapped evaluation calls."""

    def __init__(self) -> None:
        self._records: List[Dict[str, Any]] = []

    def profile(self, evaluate_fn: Callable[[Trajectory], TrajectoryScore]) -> Callable[[Trajectory], TrajectoryScore]:
        """Wrap ``evaluate_fn`` so that each successful call is recorded.

        The record holds the trajectory id, the elapsed time rounded to
        microseconds, and the score's ``overall_score`` / ``passed`` values.
        Calls that raise are not recorded.
        """
        def wrapper(trajectory: Trajectory) -> TrajectoryScore:
            began = time.perf_counter()
            outcome = evaluate_fn(trajectory)
            took = time.perf_counter() - began
            entry = dict(
                trajectory_id=trajectory.trajectory_id,
                elapsed_s=round(took, 6),
                overall_score=outcome.overall_score,
                passed=outcome.passed,
            )
            self._records.append(entry)
            return outcome
        return wrapper

    def report(self) -> Dict[str, Any]:
        """Summarise recorded calls: count, mean/max/min elapsed, raw records.

        Only ``{"calls": 0}`` is returned when nothing has been recorded.
        """
        if not self._records:
            return {"calls": 0}
        durations = [rec["elapsed_s"] for rec in self._records]
        call_count = len(self._records)
        return {
            "calls": call_count,
            "mean_elapsed_s": sum(durations) / len(durations),
            "max_elapsed_s": max(durations),
            "min_elapsed_s": min(durations),
            "records": self._records,
        }
378
+
379
+
380
class DriftDetector:
    """Flags metrics whose mean moved away from a stored baseline."""

    def __init__(self, threshold: float = 0.1) -> None:
        self.threshold = threshold
        self._baseline: Optional[Dict[str, float]] = None

    def set_baseline(self, result: EvaluationResult) -> None:
        """Store a copy of ``result.metric_means`` as the reference values."""
        self._baseline = dict(result.metric_means)
        logger.info("DriftDetector baseline set: %s", self._baseline)

    def detect(self, result: EvaluationResult) -> Dict[str, float]:
        """Compare ``result.metric_means`` against the baseline.

        Returns:
            Mapping of metric name to absolute drift (rounded to four
            decimals) for every metric whose drift reaches ``threshold``.
            Empty when no baseline is set or the baseline is empty. A metric
            missing from the baseline is compared against itself, so it
            never counts as drifted.
        """
        if not self._baseline:
            return {}
        flagged: Dict[str, float] = {}
        for name, current in result.metric_means.items():
            reference = self._baseline.get(name, current)
            shift = abs(current - reference)
            if not (shift >= self.threshold):
                continue
            flagged[name] = round(shift, 4)
            logger.warning("Drift detected in '%s': %.4f", name, shift)
        return flagged
404
+
405
+
406
class EvaluationReport:
    """Export EvaluationResult to JSON, CSV, or Markdown."""

    def __init__(self, result: EvaluationResult) -> None:
        self._result = result

    def to_json(self, indent: int = 2) -> str:
        """Serialize the result's ``model_dump()`` to a JSON string.

        Values that JSON cannot encode natively are stringified.
        """
        return json.dumps(self._result.model_dump(), indent=indent, default=str)

    def to_csv(self) -> str:
        """Serialize per-trajectory scores to CSV.

        The metric columns are the union of metric names across all scores,
        in first-seen order, so a metric that only appears on a later score
        still gets a column. Missing metrics are written as empty cells.
        An empty string is returned when there are no scores.
        """
        buf = io.StringIO()
        writer = csv.writer(buf)
        scores = self._result.scores
        if scores:
            # dict.fromkeys de-duplicates while preserving first-seen order.
            metric_names = list(dict.fromkeys(m for s in scores for m in s.metric_scores))
            writer.writerow(["trajectory_id", "task", "overall_score", "passed"] + metric_names)
            for s in scores:
                row = [s.trajectory_id, s.task, s.overall_score, s.passed]
                row += [s.metric_scores.get(m, "") for m in metric_names]
                writer.writerow(row)
        return buf.getvalue()

    def to_markdown(self) -> str:
        """Render a Markdown summary with headline figures and metric means."""
        r = self._result
        lines = [
            "# Evaluation Report",
            f"**Trajectories evaluated:** {r.trajectories_evaluated}",
            f"**Mean overall score:** {r.mean_overall:.3f}",
            f"**Pass rate:** {r.pass_rate:.1%}",
            "",
            "## Metric Means",
            "| Metric | Mean Score |",
            "|--------|-----------|",
        ]
        lines.extend(f"| {metric} | {val:.3f} |" for metric, val in r.metric_means.items())
        return "\n".join(lines)
445
+
446
+
447
+ # ─────────────────────────────────────────────
448
+ # 6. STREAMING & STORAGE
449
+ # ─────────────────────────────────────────────
450
+
451
def stream_scores(
    trajectories: List[Trajectory],
    evaluate_fn: Callable[[Trajectory], TrajectoryScore],
) -> Generator[TrajectoryScore, None, None]:
    """Lazily evaluate trajectories, yielding one TrajectoryScore at a time.

    Nothing is evaluated until the generator is advanced, and each step
    evaluates exactly one trajectory.
    """
    yield from map(evaluate_fn, trajectories)


def scores_to_ndjson(
    trajectories: List[Trajectory],
    evaluate_fn: Callable[[Trajectory], TrajectoryScore],
) -> Generator[str, None, None]:
    """Lazily yield one JSON document per evaluated trajectory (NDJSON lines).

    Each line is the score's ``model_dump()`` encoded as JSON, with values
    that are not natively encodable stringified. No trailing newline is added.
    """
    scores = stream_scores(trajectories, evaluate_fn)
    yield from (json.dumps(item.model_dump(), default=str) for item in scores)
467
+
468
+
469
+ # ─────────────────────────────────────────────
470
+ # 7. DIFF & REGRESSION
471
+ # ─────────────────────────────────────────────
472
+
473
@dataclass
class ScoreDiff:
    """Diff between two EvaluationResults."""
    added_trajectories: List[str]    # ids present only in the newer result
    removed_trajectories: List[str]  # ids present only in the older result
    improved: Dict[str, float]       # trajectory_id -> positive score delta
    regressed: Dict[str, float]      # trajectory_id -> negative score delta
    unchanged: List[str]             # ids whose delta stayed within tolerance

    def summary(self) -> str:
        """Return a one-line count of every category."""
        return (
            f"Added: {len(self.added_trajectories)}, "
            f"Removed: {len(self.removed_trajectories)}, "
            f"Improved: {len(self.improved)}, "
            f"Regressed: {len(self.regressed)}, "
            f"Unchanged: {len(self.unchanged)}"
        )

    def to_json(self) -> str:
        """Serialize the diff to an indented JSON string."""
        return json.dumps({
            "added": self.added_trajectories,
            "removed": self.removed_trajectories,
            "improved": self.improved,
            "regressed": self.regressed,
            "unchanged": self.unchanged,
        }, indent=2)


def diff_results(a: EvaluationResult, b: EvaluationResult, tolerance: float = 0.01) -> ScoreDiff:
    """Compute diff between two EvaluationResults.

    Trajectories are matched by ``trajectory_id`` and compared on
    ``overall_score``; if an id occurs more than once in a result, its last
    score is used.

    Args:
        a: The older (baseline) result.
        b: The newer result.
        tolerance: Dead-band around zero; a delta must exceed it in absolute
            value to count as improved or regressed. Defaults to 0.01.

    Returns:
        A ScoreDiff whose deltas are ``b - a`` rounded to four decimals.

    Raises:
        ValueError: if ``tolerance`` is negative.
    """
    if tolerance < 0:
        raise ValueError("tolerance must be non-negative")

    a_map = {s.trajectory_id: s.overall_score for s in a.scores}
    b_map = {s.trajectory_id: s.overall_score for s in b.scores}

    added = [tid for tid in b_map if tid not in a_map]
    removed = [tid for tid in a_map if tid not in b_map]
    improved: Dict[str, float] = {}
    regressed: Dict[str, float] = {}
    unchanged: List[str] = []

    for tid, old_score in a_map.items():
        if tid not in b_map:
            continue
        delta = b_map[tid] - old_score
        if delta > tolerance:
            improved[tid] = round(delta, 4)
        elif delta < -tolerance:
            regressed[tid] = round(delta, 4)
        else:
            unchanged.append(tid)

    return ScoreDiff(added_trajectories=added, removed_trajectories=removed,
                     improved=improved, regressed=regressed, unchanged=unchanged)
524
+
525
+
526
class RegressionTracker:
    """Keeps a sliding window of evaluation runs and summarises their trend."""

    def __init__(self, window: int = 10) -> None:
        self.window = window
        # Bounded deque: once full, recording a run drops the oldest one.
        self._history: deque = deque(maxlen=window)

    def record(self, result: EvaluationResult) -> None:
        """Add ``result`` as the newest run in the window."""
        self._history.append(result)

    def trend(self) -> str:
        """Classify the average run-to-run change of ``mean_overall``.

        Returns:
            ``'improving'`` when the average change exceeds +0.01,
            ``'declining'`` when it is below -0.01, otherwise ``'stable'``
            (also returned when fewer than two runs are recorded).
        """
        if len(self._history) < 2:
            return "stable"
        means = [run.mean_overall for run in self._history]
        steps = [later - earlier for earlier, later in zip(means, means[1:])]
        average_step = sum(steps) / len(steps)
        if average_step > 0.01:
            return "improving"
        return "declining" if average_step < -0.01 else "stable"

    def latest_regression(self) -> Optional[ScoreDiff]:
        """Return the diff between the two newest runs, or None if fewer than 2."""
        if len(self._history) < 2:
            return None
        previous, current = self._history[-2], self._history[-1]
        return diff_results(previous, current)
556
+
557
+
558
+ # ─────────────────────────────────────────────
559
+ # 8. SECURITY & COST
560
+ # ─────────────────────────────────────────────
561
+
562
class AuditLog:
    """In-memory, lock-protected list of timestamped evaluation events."""

    def __init__(self) -> None:
        self._entries: List[Dict[str, Any]] = []
        self._lock = threading.Lock()

    def log(self, event: str, data: Dict[str, Any]) -> None:
        """Append one entry built from ``event``, the current time, and ``data``.

        Keys in ``data`` are merged last, so an ``"event"`` or ``"timestamp"``
        key supplied by the caller replaces the generated value.
        """
        stamped = {"event": event, "timestamp": time.time()}
        entry = {**stamped, **data}
        with self._lock:
            self._entries.append(entry)

    @property
    def entries(self) -> List[Dict[str, Any]]:
        """A shallow copy of the recorded entries, oldest first."""
        with self._lock:
            return list(self._entries)

    def to_json(self, indent: int = 2) -> str:
        """Serialize all entries to JSON; non-encodable values are stringified."""
        with self._lock:
            return json.dumps(self._entries, indent=indent, default=str)
583
+
584
+
585
@dataclass
class CostLedger:
    """Track evaluation cost per run (token counts or arbitrary units).

    All access to the entry list is guarded by a non-reentrant lock, so no
    method may call another locking method while holding it.
    """
    _entries: List[Dict[str, Any]] = field(default_factory=list)
    # The lock is bookkeeping only: keep it out of repr() and equality.
    _lock: threading.Lock = field(default_factory=threading.Lock, repr=False, compare=False)

    def record(self, trajectory_id: str, tokens: int, cost_usd: float) -> None:
        """Append one cost entry stamped with the current time."""
        with self._lock:
            self._entries.append({
                "trajectory_id": trajectory_id,
                "tokens": tokens,
                "cost_usd": cost_usd,
                "timestamp": time.time(),
            })

    def total_cost(self) -> float:
        """Return the sum of ``cost_usd`` over all entries."""
        with self._lock:
            return sum(e["cost_usd"] for e in self._entries)

    def total_tokens(self) -> int:
        """Return the sum of ``tokens`` over all entries."""
        with self._lock:
            return sum(e["tokens"] for e in self._entries)

    def summary(self) -> Dict[str, Any]:
        """Return call count, total tokens, and total cost from one snapshot."""
        # Snapshot under the lock, then aggregate outside it. Calling
        # total_tokens()/total_cost() here while holding the lock would
        # deadlock, because threading.Lock cannot be re-acquired by its owner.
        with self._lock:
            snapshot = list(self._entries)
        return {
            "calls": len(snapshot),
            "total_tokens": sum(e["tokens"] for e in snapshot),
            "total_cost_usd": sum(e["cost_usd"] for e in snapshot),
        }