evalvault 1.63.1__py3-none-any.whl → 1.65.0__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (51)
  1. evalvault/adapters/inbound/api/main.py +147 -9
  2. evalvault/adapters/inbound/api/routers/config.py +6 -1
  3. evalvault/adapters/inbound/api/routers/knowledge.py +62 -6
  4. evalvault/adapters/inbound/cli/commands/__init__.py +14 -7
  5. evalvault/adapters/inbound/cli/commands/artifacts.py +107 -0
  6. evalvault/adapters/inbound/cli/commands/calibrate_judge.py +283 -0
  7. evalvault/adapters/inbound/cli/commands/compare.py +290 -0
  8. evalvault/adapters/inbound/cli/commands/history.py +13 -85
  9. evalvault/adapters/inbound/cli/commands/ops.py +110 -0
  10. evalvault/adapters/inbound/cli/commands/profile_difficulty.py +160 -0
  11. evalvault/adapters/inbound/cli/commands/regress.py +251 -0
  12. evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py +49 -0
  13. evalvault/adapters/outbound/artifact_fs.py +16 -0
  14. evalvault/adapters/outbound/filesystem/__init__.py +3 -0
  15. evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py +50 -0
  16. evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py +13 -0
  17. evalvault/adapters/outbound/judge_calibration_adapter.py +36 -0
  18. evalvault/adapters/outbound/judge_calibration_reporter.py +57 -0
  19. evalvault/adapters/outbound/methods/external_command.py +22 -1
  20. evalvault/adapters/outbound/tracker/langfuse_adapter.py +40 -15
  21. evalvault/adapters/outbound/tracker/log_sanitizer.py +93 -0
  22. evalvault/adapters/outbound/tracker/mlflow_adapter.py +3 -2
  23. evalvault/adapters/outbound/tracker/phoenix_adapter.py +90 -37
  24. evalvault/config/secret_manager.py +118 -0
  25. evalvault/config/settings.py +141 -1
  26. evalvault/domain/entities/__init__.py +10 -0
  27. evalvault/domain/entities/judge_calibration.py +50 -0
  28. evalvault/domain/entities/stage.py +11 -3
  29. evalvault/domain/services/artifact_lint_service.py +268 -0
  30. evalvault/domain/services/benchmark_runner.py +1 -6
  31. evalvault/domain/services/dataset_preprocessor.py +26 -0
  32. evalvault/domain/services/difficulty_profile_reporter.py +25 -0
  33. evalvault/domain/services/difficulty_profiling_service.py +304 -0
  34. evalvault/domain/services/evaluator.py +2 -0
  35. evalvault/domain/services/judge_calibration_service.py +495 -0
  36. evalvault/domain/services/ops_snapshot_service.py +159 -0
  37. evalvault/domain/services/regression_gate_service.py +199 -0
  38. evalvault/domain/services/run_comparison_service.py +159 -0
  39. evalvault/domain/services/stage_event_builder.py +6 -1
  40. evalvault/domain/services/stage_metric_service.py +83 -18
  41. evalvault/ports/outbound/__init__.py +4 -0
  42. evalvault/ports/outbound/artifact_fs_port.py +12 -0
  43. evalvault/ports/outbound/comparison_pipeline_port.py +22 -0
  44. evalvault/ports/outbound/difficulty_profile_port.py +15 -0
  45. evalvault/ports/outbound/judge_calibration_port.py +22 -0
  46. evalvault/ports/outbound/ops_snapshot_port.py +8 -0
  47. {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/METADATA +8 -1
  48. {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/RECORD +51 -23
  49. {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/WHEEL +0 -0
  50. {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/entry_points.txt +0 -0
  51. {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/domain/services/judge_calibration_service.py
@@ -0,0 +1,495 @@
+from __future__ import annotations
+
+import logging
+import math
+from collections.abc import Iterable
+from dataclasses import asdict
+from datetime import datetime
+
+from evalvault.domain.entities import EvaluationRun, SatisfactionFeedback
+from evalvault.domain.entities.judge_calibration import (
+    JudgeCalibrationCase,
+    JudgeCalibrationMetric,
+    JudgeCalibrationResult,
+    JudgeCalibrationSummary,
+)
+from evalvault.ports.outbound.judge_calibration_port import JudgeCalibrationPort
+
+logger = logging.getLogger(__name__)
+
+
+class JudgeCalibrationService(JudgeCalibrationPort):
+    def calibrate(
+        self,
+        run: EvaluationRun,
+        feedbacks: list[SatisfactionFeedback],
+        *,
+        labels_source: str,
+        method: str,
+        metrics: list[str],
+        holdout_ratio: float,
+        seed: int,
+        parallel: bool = False,
+        concurrency: int = 8,
+    ) -> JudgeCalibrationResult:
+        resolved_metrics = self._resolve_metrics(run, metrics)
+        logger.info(
+            "Judge 보정 시작: run_id=%s metrics=%s method=%s parallel=%s concurrency=%s",
+            run.run_id,
+            ",".join(resolved_metrics),
+            method,
+            parallel,
+            concurrency,
+        )
+
+        feedback_index = self._build_feedback_index(feedbacks)
+        total_labels = 0
+        case_results: dict[str, list[JudgeCalibrationCase]] = {}
+        metric_results: list[JudgeCalibrationMetric] = []
+        warnings: list[str] = []
+        gate_threshold = 0.6
+        gate_passed = True
+        if labels_source == "gold":
+            warning = "gold 라벨 소스는 아직 지원되지 않습니다."
+            warnings.append(warning)
+            logger.error("Judge 보정 실패: %s", warning)
+            summary = JudgeCalibrationSummary(
+                run_id=run.run_id,
+                labels_source=labels_source,
+                method=method,
+                metrics=resolved_metrics,
+                holdout_ratio=holdout_ratio,
+                seed=seed,
+                total_labels=0,
+                total_samples=len(run.results),
+                gate_passed=False,
+                gate_threshold=gate_threshold,
+                notes=warnings,
+            )
+            logger.info(
+                "Judge 보정 종료: run_id=%s gate_passed=%s",
+                run.run_id,
+                summary.gate_passed,
+            )
+            return JudgeCalibrationResult(
+                summary=summary,
+                metrics=[],
+                case_results={},
+                warnings=warnings,
+            )
+
+        for metric in resolved_metrics:
+            scores, labels, label_sources, sample_ids = self._collect_metric_samples(
+                run,
+                feedback_index,
+                metric,
+                labels_source,
+            )
+            if not labels:
+                warning = f"{metric} 라벨이 없어 보정을 건너뜁니다."
+                warnings.append(warning)
+                metric_results.append(
+                    JudgeCalibrationMetric(
+                        metric=metric,
+                        method=method,
+                        sample_count=0,
+                        label_count=0,
+                        mae=None,
+                        pearson=None,
+                        spearman=None,
+                        temperature=None,
+                        parameters={},
+                        gate_passed=False,
+                        warning=warning,
+                    )
+                )
+                gate_passed = False
+                continue
+            total_labels += len(labels)
+            if not scores:
+                warning = f"{metric} 점수가 없어 보정을 건너뜁니다."
+                warnings.append(warning)
+                metric_results.append(
+                    JudgeCalibrationMetric(
+                        metric=metric,
+                        method=method,
+                        sample_count=0,
+                        label_count=len(labels),
+                        mae=None,
+                        pearson=None,
+                        spearman=None,
+                        temperature=None,
+                        parameters={},
+                        gate_passed=False,
+                        warning=warning,
+                    )
+                )
+                gate_passed = False
+                continue
+
+            fit = self._fit_calibration(
+                scores,
+                labels,
+                method=method,
+                holdout_ratio=holdout_ratio,
+                seed=seed,
+            )
+            calibrated_scores = fit[0]
+            mae, pearson, spearman = fit[1], fit[2], fit[3]
+            parameters = fit[4]
+            temperature = parameters.get("temperature") if parameters else None
+            gate_metric_pass = self._passes_gate(pearson, spearman, gate_threshold)
+            warning = None
+            if len(labels) < 2:
+                warning = f"{metric} 라벨이 부족해 보정 품질을 계산하지 못했습니다."
+                warnings.append(warning)
+                gate_metric_pass = False
+
+            if not gate_metric_pass:
+                gate_passed = False
+
+            metric_results.append(
+                JudgeCalibrationMetric(
+                    metric=metric,
+                    method=method,
+                    sample_count=len(scores),
+                    label_count=len(labels),
+                    mae=mae,
+                    pearson=pearson,
+                    spearman=spearman,
+                    temperature=temperature,
+                    parameters=parameters,
+                    gate_passed=gate_metric_pass,
+                    warning=warning,
+                )
+            )
+
+            case_entries = []
+            label_count = len(labels)
+            for idx, (test_case_id, raw_score, calibrated, label_source) in enumerate(
+                zip(sample_ids, scores, calibrated_scores, label_sources, strict=False)
+            ):
+                label_value = labels[idx] if idx < label_count else None
+                case_entries.append(
+                    JudgeCalibrationCase(
+                        test_case_id=test_case_id,
+                        raw_score=raw_score,
+                        calibrated_score=calibrated,
+                        label=label_value,
+                        label_source=label_source,
+                    )
+                )
+            case_results[metric] = case_entries
+
+        summary = JudgeCalibrationSummary(
+            run_id=run.run_id,
+            labels_source=labels_source,
+            method=method,
+            metrics=resolved_metrics,
+            holdout_ratio=holdout_ratio,
+            seed=seed,
+            total_labels=total_labels,
+            total_samples=len(run.results),
+            gate_passed=gate_passed,
+            gate_threshold=gate_threshold,
+            notes=warnings,
+        )
+
+        logger.info(
+            "Judge 보정 종료: run_id=%s gate_passed=%s",
+            run.run_id,
+            gate_passed,
+        )
+        return JudgeCalibrationResult(
+            summary=summary,
+            metrics=metric_results,
+            case_results=case_results,
+            warnings=warnings,
+        )
+
+    def to_dict(self, result: JudgeCalibrationResult) -> dict[str, object]:
+        return {
+            "summary": asdict(result.summary),
+            "metrics": [asdict(metric) for metric in result.metrics],
+            "case_results": {
+                metric: [asdict(entry) for entry in entries]
+                for metric, entries in result.case_results.items()
+            },
+            "warnings": list(result.warnings),
+        }
+
+    def _resolve_metrics(self, run: EvaluationRun, metrics: list[str]) -> list[str]:
+        if metrics:
+            return list(dict.fromkeys(metrics))
+        return list(run.metrics_evaluated)
+
+    def _build_feedback_index(
+        self, feedbacks: list[SatisfactionFeedback]
+    ) -> dict[str, SatisfactionFeedback]:
+        latest: dict[str, SatisfactionFeedback] = {}
+        for feedback in feedbacks:
+            current = latest.get(feedback.test_case_id)
+            if current is None:
+                latest[feedback.test_case_id] = feedback
+                continue
+            current_time = current.created_at
+            feedback_time = feedback.created_at
+            if (feedback_time or datetime.min) >= (current_time or datetime.min):
+                latest[feedback.test_case_id] = feedback
+        return latest
+
+    def _collect_metric_samples(
+        self,
+        run: EvaluationRun,
+        feedback_index: dict[str, SatisfactionFeedback],
+        metric: str,
+        labels_source: str,
+    ) -> tuple[list[float], list[float], list[str | None], list[str]]:
+        scores: list[float] = []
+        labels: list[float] = []
+        label_sources: list[str | None] = []
+        sample_ids: list[str] = []
+        for result in run.results:
+            metric_score = result.get_metric(metric)
+            if metric_score is None or metric_score.score is None:
+                continue
+            scores.append(float(metric_score.score))
+            sample_ids.append(result.test_case_id)
+            label_value, label_source = self._resolve_label(
+                feedback_index.get(result.test_case_id),
+                labels_source=labels_source,
+            )
+            if label_value is not None:
+                labels.append(label_value)
+            label_sources.append(label_source)
+        return scores, labels, label_sources, sample_ids
+
+    def _resolve_label(
+        self,
+        feedback: SatisfactionFeedback | None,
+        *,
+        labels_source: str,
+    ) -> tuple[float | None, str | None]:
+        if feedback is None:
+            return None, None
+        if labels_source in {"feedback", "hybrid"}:
+            if feedback.satisfaction_score is not None:
+                return float(feedback.satisfaction_score), "feedback"
+            if feedback.thumb_feedback:
+                thumb = feedback.thumb_feedback.lower()
+                if thumb == "up":
+                    return 4.0, "thumb"
+                if thumb == "down":
+                    return 2.0, "thumb"
+        return None, None
+
+    def _fit_calibration(
+        self,
+        scores: list[float],
+        labels: list[float],
+        *,
+        method: str,
+        holdout_ratio: float,
+        seed: int,
+    ) -> tuple[list[float], float | None, float | None, float | None, dict[str, float | None]]:
+        if not labels:
+            return scores, None, None, None, {}
+        train_scores, train_labels, test_scores, test_labels = self._split_holdout(
+            scores,
+            labels,
+            holdout_ratio=holdout_ratio,
+            seed=seed,
+        )
+        if method == "none":
+            calibrated = scores
+            mae = self._mae(test_scores, test_labels)
+            pearson = self._pearson(test_scores, test_labels)
+            spearman = self._spearman(test_scores, test_labels)
+            return calibrated, mae, pearson, spearman, {}
+        if method == "temperature":
+            temperature = self._fit_temperature(train_scores, train_labels)
+            calibrated = [self._calibrate_temperature(score, temperature) for score in scores]
+            calibrated_test = [
+                self._calibrate_temperature(score, temperature) for score in test_scores
+            ]
+            mae = self._mae(calibrated_test, test_labels)
+            pearson = self._pearson(calibrated_test, test_labels)
+            spearman = self._spearman(calibrated_test, test_labels)
+            return (
+                calibrated,
+                mae,
+                pearson,
+                spearman,
+                {"temperature": temperature},
+            )
+        if method == "platt":
+            slope, intercept = self._fit_platt(train_scores, train_labels)
+            calibrated = [self._calibrate_platt(score, slope, intercept) for score in scores]
+            calibrated_test = [
+                self._calibrate_platt(score, slope, intercept) for score in test_scores
+            ]
+            mae = self._mae(calibrated_test, test_labels)
+            pearson = self._pearson(calibrated_test, test_labels)
+            spearman = self._spearman(calibrated_test, test_labels)
+            return (
+                calibrated,
+                mae,
+                pearson,
+                spearman,
+                {"slope": slope, "intercept": intercept},
+            )
+        if method == "isotonic":
+            calibrated = self._calibrate_isotonic(train_scores, train_labels, scores)
+            calibrated_test = self._calibrate_isotonic(train_scores, train_labels, test_scores)
+            mae = self._mae(calibrated_test, test_labels)
+            pearson = self._pearson(calibrated_test, test_labels)
+            spearman = self._spearman(calibrated_test, test_labels)
+            return calibrated, mae, pearson, spearman, {}
+        calibrated = scores
+        mae = self._mae(test_scores, test_labels)
+        pearson = self._pearson(test_scores, test_labels)
+        spearman = self._spearman(test_scores, test_labels)
+        return calibrated, mae, pearson, spearman, {}
+
+    def _split_holdout(
+        self,
+        scores: list[float],
+        labels: list[float],
+        *,
+        holdout_ratio: float,
+        seed: int,
+    ) -> tuple[list[float], list[float], list[float], list[float]]:
+        pair_count = min(len(scores), len(labels))
+        paired = list(zip(scores[:pair_count], labels[:pair_count], strict=False))
+        if holdout_ratio <= 0 or holdout_ratio >= 1 or len(paired) < 2:
+            return scores, labels, scores, labels
+        rng = self._random(seed)
+        rng.shuffle(paired)
+        cutoff = max(1, int(len(paired) * (1 - holdout_ratio)))
+        train = paired[:cutoff]
+        test = paired[cutoff:]
+        train_scores = [score for score, _ in train]
+        train_labels = [label for _, label in train]
+        test_scores = [score for score, _ in test] or train_scores
+        test_labels = [label for _, label in test] or train_labels
+        return train_scores, train_labels, test_scores, test_labels
+
+    def _fit_temperature(self, scores: list[float], labels: list[float]) -> float:
+        if not scores:
+            return 1.0
+        mean_score = sum(scores) / len(scores)
+        mean_label = sum(labels) / len(labels)
+        if mean_score <= 0:
+            return 1.0
+        return max(0.1, min(10.0, mean_label / mean_score))
+
+    def _calibrate_temperature(self, score: float, temperature: float) -> float:
+        return self._clip(score * temperature)
+
+    def _fit_platt(self, scores: list[float], labels: list[float]) -> tuple[float, float]:
+        if not scores:
+            return 1.0, 0.0
+        mean_score = sum(scores) / len(scores)
+        mean_label = sum(labels) / len(labels)
+        var_score = sum((score - mean_score) ** 2 for score in scores) / len(scores)
+        if var_score == 0:
+            return 1.0, 0.0
+        pair_count = min(len(scores), len(labels))
+        if pair_count == 0:
+            return 1.0, 0.0
+        cov = (
+            sum(
+                (score - mean_score) * (label - mean_label)
+                for score, label in zip(scores[:pair_count], labels[:pair_count], strict=False)
+            )
+            / pair_count
+        )
+        slope = cov / var_score
+        intercept = mean_label - slope * mean_score
+        return slope, intercept
+
+    def _calibrate_platt(self, score: float, slope: float, intercept: float) -> float:
+        return self._clip(score * slope + intercept)
+
+    def _calibrate_isotonic(
+        self, train_scores: list[float], train_labels: list[float], scores: list[float]
+    ) -> list[float]:
+        if not train_scores:
+            return [self._clip(score) for score in scores]
+        pairs = sorted(zip(train_scores, train_labels, strict=False), key=lambda x: x[0])
+        calibrated = []
+        for score in scores:
+            calibrated.append(self._calibrate_isotonic_point(score, pairs))
+        return calibrated
+
+    def _calibrate_isotonic_point(self, score: float, pairs: list[tuple[float, float]]) -> float:
+        if not pairs:
+            return self._clip(score)
+        prev_score, prev_label = pairs[0]
+        if score <= prev_score:
+            return self._clip(prev_label)
+        for current_score, current_label in pairs[1:]:
+            if score <= current_score:
+                ratio = (score - prev_score) / (current_score - prev_score)
+                value = prev_label + ratio * (current_label - prev_label)
+                return self._clip(value)
+            prev_score, prev_label = current_score, current_label
+        return self._clip(pairs[-1][1])
+
+    def _mae(self, scores: Iterable[float], labels: Iterable[float]) -> float | None:
+        values = list(zip(scores, labels, strict=False))
+        if not values:
+            return None
+        return sum(abs(score - label) for score, label in values) / len(values)
+
+    def _pearson(self, scores: Iterable[float], labels: Iterable[float]) -> float | None:
+        values = list(zip(scores, labels, strict=False))
+        if len(values) < 2:
+            return None
+        score_vals = [score for score, _ in values]
+        label_vals = [label for _, label in values]
+        mean_score = sum(score_vals) / len(score_vals)
+        mean_label = sum(label_vals) / len(label_vals)
+        numerator = sum(
+            (score - mean_score) * (label - mean_label)
+            for score, label in zip(score_vals, label_vals, strict=False)
+        )
+        denom_score = math.sqrt(sum((score - mean_score) ** 2 for score in score_vals))
+        denom_label = math.sqrt(sum((label - mean_label) ** 2 for label in label_vals))
+        if denom_score == 0 or denom_label == 0:
+            return None
+        return numerator / (denom_score * denom_label)
+
+    def _spearman(self, scores: Iterable[float], labels: Iterable[float]) -> float | None:
+        values = list(zip(scores, labels, strict=False))
+        if len(values) < 2:
+            return None
+        score_vals = [score for score, _ in values]
+        label_vals = [label for _, label in values]
+        score_ranks = self._rank(score_vals)
+        label_ranks = self._rank(label_vals)
+        return self._pearson(score_ranks, label_ranks)
+
+    def _rank(self, values: list[float]) -> list[float]:
+        sorted_vals = sorted(enumerate(values), key=lambda item: item[1])
+        ranks = [0.0] * len(values)
+        for rank, (index, _) in enumerate(sorted_vals, start=1):
+            ranks[index] = float(rank)
+        return ranks
+
+    def _clip(self, value: float) -> float:
+        return max(0.0, min(1.0, value))
+
+    def _passes_gate(
+        self, pearson: float | None, spearman: float | None, gate_threshold: float
+    ) -> bool:
+        candidates = [metric for metric in (pearson, spearman) if metric is not None]
+        if not candidates:
+            return False
+        return max(candidates) >= gate_threshold
+
+    def _random(self, seed: int):
+        import random
+
+        rng = random.Random(seed)
+        return rng
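
The new service fits one of three calibration methods against satisfaction feedback and gates each metric on the better of Pearson or Spearman correlation against a 0.6 threshold. As implemented, `temperature` is a single ratio-of-means scale factor clamped to [0.1, 10.0], `platt` is a least-squares linear rescale of label on score (not logistic Platt scaling), and `isotonic` interpolates piecewise-linearly over the sorted training pairs. A minimal standalone sketch of the temperature and Platt arithmetic, using toy scores and labels that are not taken from the package:

```python
# Standalone sketch of the fitting arithmetic in the hunk above.
# The toy scores/labels are hypothetical; they are not from the package.
scores = [0.40, 0.55, 0.70, 0.90]   # raw judge scores in [0, 1]
labels = [0.50, 0.60, 0.80, 0.95]   # human satisfaction labels on the same scale


def clip(value: float) -> float:
    """Clamp to [0, 1], mirroring JudgeCalibrationService._clip."""
    return max(0.0, min(1.0, value))


mean_s = sum(scores) / len(scores)
mean_l = sum(labels) / len(labels)

# "temperature": ratio of means, clamped to [0.1, 10.0] (see _fit_temperature).
temperature = max(0.1, min(10.0, mean_l / mean_s))

# "platt": least-squares line label ~ slope * score + intercept (see _fit_platt).
var_s = sum((s - mean_s) ** 2 for s in scores) / len(scores)
cov = sum((s - mean_s) * (lab - mean_l) for s, lab in zip(scores, labels)) / len(scores)
slope = cov / var_s
intercept = mean_l - slope * mean_s

print([round(clip(s * temperature), 3) for s in scores])        # temperature-calibrated
print([round(clip(s * slope + intercept), 3) for s in scores])  # platt-calibrated
```

Calibrated outputs are clamped to [0, 1], matching `_clip` in the service.
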
evalvault/domain/services/ops_snapshot_service.py
@@ -0,0 +1,159 @@
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass, field
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Any
+
+from evalvault.config.model_config import get_model_config
+from evalvault.config.settings import Settings, apply_profile
+from evalvault.domain.entities import EvaluationRun
+from evalvault.ports.outbound.ops_snapshot_port import OpsSnapshotWriterPort
+from evalvault.ports.outbound.storage_port import StoragePort
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True)
+class OpsSnapshotRequest:
+    run_id: str
+    profile: str | None
+    db_path: Path | None
+    include_model_config: bool
+    include_env: bool
+    redact_keys: tuple[str, ...] = field(default_factory=tuple)
+
+
+@dataclass(frozen=True)
+class OpsSnapshotEnvelope:
+    command: str
+    version: int
+    status: str
+    started_at: str
+    finished_at: str
+    duration_ms: int
+    artifacts: dict[str, Any]
+    data: dict[str, Any]
+
+
+class OpsSnapshotService:
+    def __init__(
+        self,
+        *,
+        storage: StoragePort,
+        writer: OpsSnapshotWriterPort,
+        settings: Settings,
+        output_path: Path,
+    ) -> None:
+        self._storage = storage
+        self._writer = writer
+        self._settings = settings
+        self._output_path = output_path
+
+    def collect(self, request: OpsSnapshotRequest) -> OpsSnapshotEnvelope:
+        started_at = datetime.now(UTC)
+        logger.info("ops snapshot started", extra={"run_id": request.run_id})
+
+        try:
+            run = self._storage.get_run(request.run_id)
+        except KeyError:
+            logger.error("ops snapshot run missing", extra={"run_id": request.run_id})
+            raise
+
+        settings = self._settings
+        if request.profile:
+            settings = apply_profile(settings, request.profile)
+
+        data = {
+            "run": _build_run_snapshot(run),
+            "profile": request.profile or settings.evalvault_profile,
+            "db_path": str(request.db_path) if request.db_path else None,
+        }
+
+        if request.include_model_config:
+            data["model_config"] = _build_model_config_snapshot(request.profile)
+
+        if request.include_env:
+            data["env"] = _build_env_snapshot(settings, request.redact_keys)
+
+        finished_at = datetime.now(UTC)
+        duration_ms = int((finished_at - started_at).total_seconds() * 1000)
+        payload = OpsSnapshotEnvelope(
+            command="ops_snapshot",
+            version=1,
+            status="ok",
+            started_at=started_at.isoformat(),
+            finished_at=finished_at.isoformat(),
+            duration_ms=duration_ms,
+            artifacts={},
+            data=data,
+        )
+
+        self._writer.write_snapshot(self._output_path, _serialize_envelope(payload))
+        logger.info("ops snapshot finished", extra={"run_id": request.run_id})
+        return payload
+
+
+def _build_run_snapshot(run: EvaluationRun) -> dict[str, Any]:
+    return {
+        "run_id": run.run_id,
+        "dataset_name": run.dataset_name,
+        "dataset_version": run.dataset_version,
+        "model_name": run.model_name,
+        "metrics_evaluated": list(run.metrics_evaluated),
+        "started_at": run.started_at.isoformat() if run.started_at else None,
+        "finished_at": run.finished_at.isoformat() if run.finished_at else None,
+        "duration_seconds": run.duration_seconds,
+        "total_test_cases": run.total_test_cases,
+        "pass_rate": run.pass_rate,
+        "metric_pass_rate": run.metric_pass_rate,
+        "thresholds": run.thresholds,
+        "tracker_metadata": run.tracker_metadata,
+        "retrieval_metadata": run.retrieval_metadata,
+    }
+
+
+def _build_model_config_snapshot(profile: str | None) -> dict[str, Any] | None:
+    try:
+        config = get_model_config()
+    except FileNotFoundError:
+        return None
+
+    if profile:
+        try:
+            profile_config = config.get_profile(profile)
+        except KeyError:
+            return {"available_profiles": sorted(config.profiles.keys())}
+        return {
+            "profile": profile,
+            "description": profile_config.description,
+            "llm": profile_config.llm.model_dump(),
+            "embedding": profile_config.embedding.model_dump(),
+        }
+
+    return {
+        "profiles": {name: entry.model_dump() for name, entry in config.profiles.items()},
+    }
+
+
+def _build_env_snapshot(settings: Settings, redact_keys: tuple[str, ...]) -> dict[str, Any]:
+    data = settings.model_dump()
+    normalized_redact = {key.upper() for key in redact_keys}
+    for key in list(data.keys()):
+        if key.upper() in normalized_redact:
+            data[key] = "[redacted]"
+    return data
+
+
+def _serialize_envelope(envelope: OpsSnapshotEnvelope) -> dict[str, Any]:
+    return {
+        "command": envelope.command,
+        "version": envelope.version,
+        "status": envelope.status,
+        "started_at": envelope.started_at,
+        "finished_at": envelope.finished_at,
+        "duration_ms": envelope.duration_ms,
+        "artifacts": envelope.artifacts,
+        "data": envelope.data,
+    }
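
`OpsSnapshotService.collect` loads the run from storage, optionally applies a settings profile, and hands a serialized envelope to the writer port. A sketch of the dict `_serialize_envelope` produces for `write_snapshot`; every concrete value below (run ID, timestamps, profile name, env keys) is an illustrative placeholder, and `model_config`/`env` appear only when the corresponding include flags are set:

```python
# Illustrative shape of the dict passed to OpsSnapshotWriterPort.write_snapshot.
# All concrete values are placeholders, not taken from the package.
snapshot = {
    "command": "ops_snapshot",
    "version": 1,
    "status": "ok",
    "started_at": "2025-01-01T00:00:00+00:00",
    "finished_at": "2025-01-01T00:00:01+00:00",
    "duration_ms": 1000,
    "artifacts": {},
    "data": {
        "run": {
            # _build_run_snapshot keys: run_id, dataset_name, dataset_version,
            # model_name, metrics_evaluated, started_at, finished_at,
            # duration_seconds, total_test_cases, pass_rate, metric_pass_rate,
            # thresholds, tracker_metadata, retrieval_metadata
            "run_id": "run-123",
            "pass_rate": 0.9,
        },
        "profile": "dev",  # request.profile or settings.evalvault_profile
        "db_path": None,
        # present only when include_model_config=True:
        "model_config": {"profile": "dev", "description": "...", "llm": {}, "embedding": {}},
        # present only when include_env=True; settings fields whose upper-cased
        # name appears in redact_keys are replaced with "[redacted]":
        "env": {"some_api_key": "[redacted]"},
    },
}
```
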