evalvault 1.63.1__py3-none-any.whl → 1.65.0__py3-none-any.whl
This diff shows the changes between two publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
- evalvault/adapters/inbound/api/main.py +147 -9
- evalvault/adapters/inbound/api/routers/config.py +6 -1
- evalvault/adapters/inbound/api/routers/knowledge.py +62 -6
- evalvault/adapters/inbound/cli/commands/__init__.py +14 -7
- evalvault/adapters/inbound/cli/commands/artifacts.py +107 -0
- evalvault/adapters/inbound/cli/commands/calibrate_judge.py +283 -0
- evalvault/adapters/inbound/cli/commands/compare.py +290 -0
- evalvault/adapters/inbound/cli/commands/history.py +13 -85
- evalvault/adapters/inbound/cli/commands/ops.py +110 -0
- evalvault/adapters/inbound/cli/commands/profile_difficulty.py +160 -0
- evalvault/adapters/inbound/cli/commands/regress.py +251 -0
- evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py +49 -0
- evalvault/adapters/outbound/artifact_fs.py +16 -0
- evalvault/adapters/outbound/filesystem/__init__.py +3 -0
- evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py +50 -0
- evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py +13 -0
- evalvault/adapters/outbound/judge_calibration_adapter.py +36 -0
- evalvault/adapters/outbound/judge_calibration_reporter.py +57 -0
- evalvault/adapters/outbound/methods/external_command.py +22 -1
- evalvault/adapters/outbound/tracker/langfuse_adapter.py +40 -15
- evalvault/adapters/outbound/tracker/log_sanitizer.py +93 -0
- evalvault/adapters/outbound/tracker/mlflow_adapter.py +3 -2
- evalvault/adapters/outbound/tracker/phoenix_adapter.py +90 -37
- evalvault/config/secret_manager.py +118 -0
- evalvault/config/settings.py +141 -1
- evalvault/domain/entities/__init__.py +10 -0
- evalvault/domain/entities/judge_calibration.py +50 -0
- evalvault/domain/entities/stage.py +11 -3
- evalvault/domain/services/artifact_lint_service.py +268 -0
- evalvault/domain/services/benchmark_runner.py +1 -6
- evalvault/domain/services/dataset_preprocessor.py +26 -0
- evalvault/domain/services/difficulty_profile_reporter.py +25 -0
- evalvault/domain/services/difficulty_profiling_service.py +304 -0
- evalvault/domain/services/evaluator.py +2 -0
- evalvault/domain/services/judge_calibration_service.py +495 -0
- evalvault/domain/services/ops_snapshot_service.py +159 -0
- evalvault/domain/services/regression_gate_service.py +199 -0
- evalvault/domain/services/run_comparison_service.py +159 -0
- evalvault/domain/services/stage_event_builder.py +6 -1
- evalvault/domain/services/stage_metric_service.py +83 -18
- evalvault/ports/outbound/__init__.py +4 -0
- evalvault/ports/outbound/artifact_fs_port.py +12 -0
- evalvault/ports/outbound/comparison_pipeline_port.py +22 -0
- evalvault/ports/outbound/difficulty_profile_port.py +15 -0
- evalvault/ports/outbound/judge_calibration_port.py +22 -0
- evalvault/ports/outbound/ops_snapshot_port.py +8 -0
- {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/METADATA +8 -1
- {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/RECORD +51 -23
- {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/WHEEL +0 -0
- {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/licenses/LICENSE.md +0 -0

evalvault/domain/services/judge_calibration_service.py (new file)
@@ -0,0 +1,495 @@
+from __future__ import annotations
+
+import logging
+import math
+from collections.abc import Iterable
+from dataclasses import asdict
+from datetime import datetime
+
+from evalvault.domain.entities import EvaluationRun, SatisfactionFeedback
+from evalvault.domain.entities.judge_calibration import (
+    JudgeCalibrationCase,
+    JudgeCalibrationMetric,
+    JudgeCalibrationResult,
+    JudgeCalibrationSummary,
+)
+from evalvault.ports.outbound.judge_calibration_port import JudgeCalibrationPort
+
+logger = logging.getLogger(__name__)
+
+
+class JudgeCalibrationService(JudgeCalibrationPort):
+    def calibrate(
+        self,
+        run: EvaluationRun,
+        feedbacks: list[SatisfactionFeedback],
+        *,
+        labels_source: str,
+        method: str,
+        metrics: list[str],
+        holdout_ratio: float,
+        seed: int,
+        parallel: bool = False,
+        concurrency: int = 8,
+    ) -> JudgeCalibrationResult:
+        resolved_metrics = self._resolve_metrics(run, metrics)
+        logger.info(
+            "Judge 보정 시작: run_id=%s metrics=%s method=%s parallel=%s concurrency=%s",
+            run.run_id,
+            ",".join(resolved_metrics),
+            method,
+            parallel,
+            concurrency,
+        )
+
+        feedback_index = self._build_feedback_index(feedbacks)
+        total_labels = 0
+        case_results: dict[str, list[JudgeCalibrationCase]] = {}
+        metric_results: list[JudgeCalibrationMetric] = []
+        warnings: list[str] = []
+        gate_threshold = 0.6
+        gate_passed = True
+        if labels_source == "gold":
+            warning = "gold 라벨 소스는 아직 지원되지 않습니다."
+            warnings.append(warning)
+            logger.error("Judge 보정 실패: %s", warning)
+            summary = JudgeCalibrationSummary(
+                run_id=run.run_id,
+                labels_source=labels_source,
+                method=method,
+                metrics=resolved_metrics,
+                holdout_ratio=holdout_ratio,
+                seed=seed,
+                total_labels=0,
+                total_samples=len(run.results),
+                gate_passed=False,
+                gate_threshold=gate_threshold,
+                notes=warnings,
+            )
+            logger.info(
+                "Judge 보정 종료: run_id=%s gate_passed=%s",
+                run.run_id,
+                summary.gate_passed,
+            )
+            return JudgeCalibrationResult(
+                summary=summary,
+                metrics=[],
+                case_results={},
+                warnings=warnings,
+            )
+
+        for metric in resolved_metrics:
+            scores, labels, label_sources, sample_ids = self._collect_metric_samples(
+                run,
+                feedback_index,
+                metric,
+                labels_source,
+            )
+            if not labels:
+                warning = f"{metric} 라벨이 없어 보정을 건너뜁니다."
+                warnings.append(warning)
+                metric_results.append(
+                    JudgeCalibrationMetric(
+                        metric=metric,
+                        method=method,
+                        sample_count=0,
+                        label_count=0,
+                        mae=None,
+                        pearson=None,
+                        spearman=None,
+                        temperature=None,
+                        parameters={},
+                        gate_passed=False,
+                        warning=warning,
+                    )
+                )
+                gate_passed = False
+                continue
+            total_labels += len(labels)
+            if not scores:
+                warning = f"{metric} 점수가 없어 보정을 건너뜁니다."
+                warnings.append(warning)
+                metric_results.append(
+                    JudgeCalibrationMetric(
+                        metric=metric,
+                        method=method,
+                        sample_count=0,
+                        label_count=len(labels),
+                        mae=None,
+                        pearson=None,
+                        spearman=None,
+                        temperature=None,
+                        parameters={},
+                        gate_passed=False,
+                        warning=warning,
+                    )
+                )
+                gate_passed = False
+                continue
+
+            fit = self._fit_calibration(
+                scores,
+                labels,
+                method=method,
+                holdout_ratio=holdout_ratio,
+                seed=seed,
+            )
+            calibrated_scores = fit[0]
+            mae, pearson, spearman = fit[1], fit[2], fit[3]
+            parameters = fit[4]
+            temperature = parameters.get("temperature") if parameters else None
+            gate_metric_pass = self._passes_gate(pearson, spearman, gate_threshold)
+            warning = None
+            if len(labels) < 2:
+                warning = f"{metric} 라벨이 부족해 보정 품질을 계산하지 못했습니다."
+                warnings.append(warning)
+                gate_metric_pass = False
+
+            if not gate_metric_pass:
+                gate_passed = False
+
+            metric_results.append(
+                JudgeCalibrationMetric(
+                    metric=metric,
+                    method=method,
+                    sample_count=len(scores),
+                    label_count=len(labels),
+                    mae=mae,
+                    pearson=pearson,
+                    spearman=spearman,
+                    temperature=temperature,
+                    parameters=parameters,
+                    gate_passed=gate_metric_pass,
+                    warning=warning,
+                )
+            )
+
+            case_entries = []
+            label_count = len(labels)
+            for idx, (test_case_id, raw_score, calibrated, label_source) in enumerate(
+                zip(sample_ids, scores, calibrated_scores, label_sources, strict=False)
+            ):
+                label_value = labels[idx] if idx < label_count else None
+                case_entries.append(
+                    JudgeCalibrationCase(
+                        test_case_id=test_case_id,
+                        raw_score=raw_score,
+                        calibrated_score=calibrated,
+                        label=label_value,
+                        label_source=label_source,
+                    )
+                )
+            case_results[metric] = case_entries
+
+        summary = JudgeCalibrationSummary(
+            run_id=run.run_id,
+            labels_source=labels_source,
+            method=method,
+            metrics=resolved_metrics,
+            holdout_ratio=holdout_ratio,
+            seed=seed,
+            total_labels=total_labels,
+            total_samples=len(run.results),
+            gate_passed=gate_passed,
+            gate_threshold=gate_threshold,
+            notes=warnings,
+        )
+
+        logger.info(
+            "Judge 보정 종료: run_id=%s gate_passed=%s",
+            run.run_id,
+            gate_passed,
+        )
+        return JudgeCalibrationResult(
+            summary=summary,
+            metrics=metric_results,
+            case_results=case_results,
+            warnings=warnings,
+        )
+
+    def to_dict(self, result: JudgeCalibrationResult) -> dict[str, object]:
+        return {
+            "summary": asdict(result.summary),
+            "metrics": [asdict(metric) for metric in result.metrics],
+            "case_results": {
+                metric: [asdict(entry) for entry in entries]
+                for metric, entries in result.case_results.items()
+            },
+            "warnings": list(result.warnings),
+        }
+
+    def _resolve_metrics(self, run: EvaluationRun, metrics: list[str]) -> list[str]:
+        if metrics:
+            return list(dict.fromkeys(metrics))
+        return list(run.metrics_evaluated)
+
+    def _build_feedback_index(
+        self, feedbacks: list[SatisfactionFeedback]
+    ) -> dict[str, SatisfactionFeedback]:
+        latest: dict[str, SatisfactionFeedback] = {}
+        for feedback in feedbacks:
+            current = latest.get(feedback.test_case_id)
+            if current is None:
+                latest[feedback.test_case_id] = feedback
+                continue
+            current_time = current.created_at
+            feedback_time = feedback.created_at
+            if (feedback_time or datetime.min) >= (current_time or datetime.min):
+                latest[feedback.test_case_id] = feedback
+        return latest
+
+    def _collect_metric_samples(
+        self,
+        run: EvaluationRun,
+        feedback_index: dict[str, SatisfactionFeedback],
+        metric: str,
+        labels_source: str,
+    ) -> tuple[list[float], list[float], list[str | None], list[str]]:
+        scores: list[float] = []
+        labels: list[float] = []
+        label_sources: list[str | None] = []
+        sample_ids: list[str] = []
+        for result in run.results:
+            metric_score = result.get_metric(metric)
+            if metric_score is None or metric_score.score is None:
+                continue
+            scores.append(float(metric_score.score))
+            sample_ids.append(result.test_case_id)
+            label_value, label_source = self._resolve_label(
+                feedback_index.get(result.test_case_id),
+                labels_source=labels_source,
+            )
+            if label_value is not None:
+                labels.append(label_value)
+            label_sources.append(label_source)
+        return scores, labels, label_sources, sample_ids
+
+    def _resolve_label(
+        self,
+        feedback: SatisfactionFeedback | None,
+        *,
+        labels_source: str,
+    ) -> tuple[float | None, str | None]:
+        if feedback is None:
+            return None, None
+        if labels_source in {"feedback", "hybrid"}:
+            if feedback.satisfaction_score is not None:
+                return float(feedback.satisfaction_score), "feedback"
+            if feedback.thumb_feedback:
+                thumb = feedback.thumb_feedback.lower()
+                if thumb == "up":
+                    return 4.0, "thumb"
+                if thumb == "down":
+                    return 2.0, "thumb"
+        return None, None
+
+    def _fit_calibration(
+        self,
+        scores: list[float],
+        labels: list[float],
+        *,
+        method: str,
+        holdout_ratio: float,
+        seed: int,
+    ) -> tuple[list[float], float | None, float | None, float | None, dict[str, float | None]]:
+        if not labels:
+            return scores, None, None, None, {}
+        train_scores, train_labels, test_scores, test_labels = self._split_holdout(
+            scores,
+            labels,
+            holdout_ratio=holdout_ratio,
+            seed=seed,
+        )
+        if method == "none":
+            calibrated = scores
+            mae = self._mae(test_scores, test_labels)
+            pearson = self._pearson(test_scores, test_labels)
+            spearman = self._spearman(test_scores, test_labels)
+            return calibrated, mae, pearson, spearman, {}
+        if method == "temperature":
+            temperature = self._fit_temperature(train_scores, train_labels)
+            calibrated = [self._calibrate_temperature(score, temperature) for score in scores]
+            calibrated_test = [
+                self._calibrate_temperature(score, temperature) for score in test_scores
+            ]
+            mae = self._mae(calibrated_test, test_labels)
+            pearson = self._pearson(calibrated_test, test_labels)
+            spearman = self._spearman(calibrated_test, test_labels)
+            return (
+                calibrated,
+                mae,
+                pearson,
+                spearman,
+                {"temperature": temperature},
+            )
+        if method == "platt":
+            slope, intercept = self._fit_platt(train_scores, train_labels)
+            calibrated = [self._calibrate_platt(score, slope, intercept) for score in scores]
+            calibrated_test = [
+                self._calibrate_platt(score, slope, intercept) for score in test_scores
+            ]
+            mae = self._mae(calibrated_test, test_labels)
+            pearson = self._pearson(calibrated_test, test_labels)
+            spearman = self._spearman(calibrated_test, test_labels)
+            return (
+                calibrated,
+                mae,
+                pearson,
+                spearman,
+                {"slope": slope, "intercept": intercept},
+            )
+        if method == "isotonic":
+            calibrated = self._calibrate_isotonic(train_scores, train_labels, scores)
+            calibrated_test = self._calibrate_isotonic(train_scores, train_labels, test_scores)
+            mae = self._mae(calibrated_test, test_labels)
+            pearson = self._pearson(calibrated_test, test_labels)
+            spearman = self._spearman(calibrated_test, test_labels)
+            return calibrated, mae, pearson, spearman, {}
+        calibrated = scores
+        mae = self._mae(test_scores, test_labels)
+        pearson = self._pearson(test_scores, test_labels)
+        spearman = self._spearman(test_scores, test_labels)
+        return calibrated, mae, pearson, spearman, {}
+
+    def _split_holdout(
+        self,
+        scores: list[float],
+        labels: list[float],
+        *,
+        holdout_ratio: float,
+        seed: int,
+    ) -> tuple[list[float], list[float], list[float], list[float]]:
+        pair_count = min(len(scores), len(labels))
+        paired = list(zip(scores[:pair_count], labels[:pair_count], strict=False))
+        if holdout_ratio <= 0 or holdout_ratio >= 1 or len(paired) < 2:
+            return scores, labels, scores, labels
+        rng = self._random(seed)
+        rng.shuffle(paired)
+        cutoff = max(1, int(len(paired) * (1 - holdout_ratio)))
+        train = paired[:cutoff]
+        test = paired[cutoff:]
+        train_scores = [score for score, _ in train]
+        train_labels = [label for _, label in train]
+        test_scores = [score for score, _ in test] or train_scores
+        test_labels = [label for _, label in test] or train_labels
+        return train_scores, train_labels, test_scores, test_labels
+
+    def _fit_temperature(self, scores: list[float], labels: list[float]) -> float:
+        if not scores:
+            return 1.0
+        mean_score = sum(scores) / len(scores)
+        mean_label = sum(labels) / len(labels)
+        if mean_score <= 0:
+            return 1.0
+        return max(0.1, min(10.0, mean_label / mean_score))
+
+    def _calibrate_temperature(self, score: float, temperature: float) -> float:
+        return self._clip(score * temperature)
+
+    def _fit_platt(self, scores: list[float], labels: list[float]) -> tuple[float, float]:
+        if not scores:
+            return 1.0, 0.0
+        mean_score = sum(scores) / len(scores)
+        mean_label = sum(labels) / len(labels)
+        var_score = sum((score - mean_score) ** 2 for score in scores) / len(scores)
+        if var_score == 0:
+            return 1.0, 0.0
+        pair_count = min(len(scores), len(labels))
+        if pair_count == 0:
+            return 1.0, 0.0
+        cov = (
+            sum(
+                (score - mean_score) * (label - mean_label)
+                for score, label in zip(scores[:pair_count], labels[:pair_count], strict=False)
+            )
+            / pair_count
+        )
+        slope = cov / var_score
+        intercept = mean_label - slope * mean_score
+        return slope, intercept
+
+    def _calibrate_platt(self, score: float, slope: float, intercept: float) -> float:
+        return self._clip(score * slope + intercept)
+
+    def _calibrate_isotonic(
+        self, train_scores: list[float], train_labels: list[float], scores: list[float]
+    ) -> list[float]:
+        if not train_scores:
+            return [self._clip(score) for score in scores]
+        pairs = sorted(zip(train_scores, train_labels, strict=False), key=lambda x: x[0])
+        calibrated = []
+        for score in scores:
+            calibrated.append(self._calibrate_isotonic_point(score, pairs))
+        return calibrated
+
+    def _calibrate_isotonic_point(self, score: float, pairs: list[tuple[float, float]]) -> float:
+        if not pairs:
+            return self._clip(score)
+        prev_score, prev_label = pairs[0]
+        if score <= prev_score:
+            return self._clip(prev_label)
+        for current_score, current_label in pairs[1:]:
+            if score <= current_score:
+                ratio = (score - prev_score) / (current_score - prev_score)
+                value = prev_label + ratio * (current_label - prev_label)
+                return self._clip(value)
+            prev_score, prev_label = current_score, current_label
+        return self._clip(pairs[-1][1])
+
+    def _mae(self, scores: Iterable[float], labels: Iterable[float]) -> float | None:
+        values = list(zip(scores, labels, strict=False))
+        if not values:
+            return None
+        return sum(abs(score - label) for score, label in values) / len(values)
+
+    def _pearson(self, scores: Iterable[float], labels: Iterable[float]) -> float | None:
+        values = list(zip(scores, labels, strict=False))
+        if len(values) < 2:
+            return None
+        score_vals = [score for score, _ in values]
+        label_vals = [label for _, label in values]
+        mean_score = sum(score_vals) / len(score_vals)
+        mean_label = sum(label_vals) / len(label_vals)
+        numerator = sum(
+            (score - mean_score) * (label - mean_label)
+            for score, label in zip(score_vals, label_vals, strict=False)
+        )
+        denom_score = math.sqrt(sum((score - mean_score) ** 2 for score in score_vals))
+        denom_label = math.sqrt(sum((label - mean_label) ** 2 for label in label_vals))
+        if denom_score == 0 or denom_label == 0:
+            return None
+        return numerator / (denom_score * denom_label)
+
+    def _spearman(self, scores: Iterable[float], labels: Iterable[float]) -> float | None:
+        values = list(zip(scores, labels, strict=False))
+        if len(values) < 2:
+            return None
+        score_vals = [score for score, _ in values]
+        label_vals = [label for _, label in values]
+        score_ranks = self._rank(score_vals)
+        label_ranks = self._rank(label_vals)
+        return self._pearson(score_ranks, label_ranks)
+
+    def _rank(self, values: list[float]) -> list[float]:
+        sorted_vals = sorted(enumerate(values), key=lambda item: item[1])
+        ranks = [0.0] * len(values)
+        for rank, (index, _) in enumerate(sorted_vals, start=1):
+            ranks[index] = float(rank)
+        return ranks
+
+    def _clip(self, value: float) -> float:
+        return max(0.0, min(1.0, value))
+
+    def _passes_gate(
+        self, pearson: float | None, spearman: float | None, gate_threshold: float
+    ) -> bool:
+        candidates = [metric for metric in (pearson, spearman) if metric is not None]
+        if not candidates:
+            return False
+        return max(candidates) >= gate_threshold
+
+    def _random(self, seed: int):
+        import random
+
+        rng = random.Random(seed)
+        return rng
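
The hunk above adds JudgeCalibrationService, which maps raw judge scores onto human-feedback labels using a temperature, Platt-style, or isotonic-style fit and gates on Pearson/Spearman correlation against a 0.6 threshold. The following is a minimal sketch of the fitting helpers on toy data: the score and label values are invented, the private helpers are called directly only to illustrate the arithmetic, and it assumes JudgeCalibrationPort declares no abstract methods beyond those implemented in this file.

from evalvault.domain.services.judge_calibration_service import JudgeCalibrationService

service = JudgeCalibrationService()

# Hypothetical raw judge scores and human labels, both on a 0..1 scale
# (calibrated values are clipped to [0, 1] by the service).
scores = [0.42, 0.55, 0.61, 0.78, 0.90]
labels = [0.40, 0.60, 0.65, 0.70, 0.95]

# Platt-style fit: a least-squares slope/intercept over the paired samples.
slope, intercept = service._fit_platt(scores, labels)
calibrated = [service._calibrate_platt(s, slope, intercept) for s in scores]

# The same quality measures the service reports per metric.
print("mae:", service._mae(calibrated, labels))
print("pearson:", service._pearson(calibrated, labels))
print("spearman:", service._spearman(calibrated, labels))

In the service itself the same fit is applied via _fit_calibration, which additionally splits a holdout set (seeded shuffle) and computes the quality measures on the held-out pairs only.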

evalvault/domain/services/ops_snapshot_service.py (new file)
@@ -0,0 +1,159 @@
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass, field
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Any
+
+from evalvault.config.model_config import get_model_config
+from evalvault.config.settings import Settings, apply_profile
+from evalvault.domain.entities import EvaluationRun
+from evalvault.ports.outbound.ops_snapshot_port import OpsSnapshotWriterPort
+from evalvault.ports.outbound.storage_port import StoragePort
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True)
+class OpsSnapshotRequest:
+    run_id: str
+    profile: str | None
+    db_path: Path | None
+    include_model_config: bool
+    include_env: bool
+    redact_keys: tuple[str, ...] = field(default_factory=tuple)
+
+
+@dataclass(frozen=True)
+class OpsSnapshotEnvelope:
+    command: str
+    version: int
+    status: str
+    started_at: str
+    finished_at: str
+    duration_ms: int
+    artifacts: dict[str, Any]
+    data: dict[str, Any]
+
+
+class OpsSnapshotService:
+    def __init__(
+        self,
+        *,
+        storage: StoragePort,
+        writer: OpsSnapshotWriterPort,
+        settings: Settings,
+        output_path: Path,
+    ) -> None:
+        self._storage = storage
+        self._writer = writer
+        self._settings = settings
+        self._output_path = output_path
+
+    def collect(self, request: OpsSnapshotRequest) -> OpsSnapshotEnvelope:
+        started_at = datetime.now(UTC)
+        logger.info("ops snapshot started", extra={"run_id": request.run_id})
+
+        try:
+            run = self._storage.get_run(request.run_id)
+        except KeyError:
+            logger.error("ops snapshot run missing", extra={"run_id": request.run_id})
+            raise
+
+        settings = self._settings
+        if request.profile:
+            settings = apply_profile(settings, request.profile)
+
+        data = {
+            "run": _build_run_snapshot(run),
+            "profile": request.profile or settings.evalvault_profile,
+            "db_path": str(request.db_path) if request.db_path else None,
+        }
+
+        if request.include_model_config:
+            data["model_config"] = _build_model_config_snapshot(request.profile)
+
+        if request.include_env:
+            data["env"] = _build_env_snapshot(settings, request.redact_keys)
+
+        finished_at = datetime.now(UTC)
+        duration_ms = int((finished_at - started_at).total_seconds() * 1000)
+        payload = OpsSnapshotEnvelope(
+            command="ops_snapshot",
+            version=1,
+            status="ok",
+            started_at=started_at.isoformat(),
+            finished_at=finished_at.isoformat(),
+            duration_ms=duration_ms,
+            artifacts={},
+            data=data,
+        )
+
+        self._writer.write_snapshot(self._output_path, _serialize_envelope(payload))
+        logger.info("ops snapshot finished", extra={"run_id": request.run_id})
+        return payload
+
+
+def _build_run_snapshot(run: EvaluationRun) -> dict[str, Any]:
+    return {
+        "run_id": run.run_id,
+        "dataset_name": run.dataset_name,
+        "dataset_version": run.dataset_version,
+        "model_name": run.model_name,
+        "metrics_evaluated": list(run.metrics_evaluated),
+        "started_at": run.started_at.isoformat() if run.started_at else None,
+        "finished_at": run.finished_at.isoformat() if run.finished_at else None,
+        "duration_seconds": run.duration_seconds,
+        "total_test_cases": run.total_test_cases,
+        "pass_rate": run.pass_rate,
+        "metric_pass_rate": run.metric_pass_rate,
+        "thresholds": run.thresholds,
+        "tracker_metadata": run.tracker_metadata,
+        "retrieval_metadata": run.retrieval_metadata,
+    }
+
+
+def _build_model_config_snapshot(profile: str | None) -> dict[str, Any] | None:
+    try:
+        config = get_model_config()
+    except FileNotFoundError:
+        return None
+
+    if profile:
+        try:
+            profile_config = config.get_profile(profile)
+        except KeyError:
+            return {"available_profiles": sorted(config.profiles.keys())}
+        return {
+            "profile": profile,
+            "description": profile_config.description,
+            "llm": profile_config.llm.model_dump(),
+            "embedding": profile_config.embedding.model_dump(),
+        }
+
+    return {
+        "profiles": {name: entry.model_dump() for name, entry in config.profiles.items()},
+    }
+
+
+def _build_env_snapshot(settings: Settings, redact_keys: tuple[str, ...]) -> dict[str, Any]:
+    data = settings.model_dump()
+    normalized_redact = {key.upper() for key in redact_keys}
+    for key in list(data.keys()):
+        if key.upper() in normalized_redact:
+            data[key] = "[redacted]"
+    return data
+
+
+def _serialize_envelope(envelope: OpsSnapshotEnvelope) -> dict[str, Any]:
+    return {
+        "command": envelope.command,
+        "version": envelope.version,
+        "status": envelope.status,
+        "started_at": envelope.started_at,
+        "finished_at": envelope.finished_at,
+        "duration_ms": envelope.duration_ms,
+        "artifacts": envelope.artifacts,
+        "data": envelope.data,
+    }
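
This second hunk adds OpsSnapshotService, which loads a stored run, optionally attaches model-config and redacted environment data, and writes a versioned snapshot envelope through a writer port. Below is a minimal wiring sketch with hypothetical in-memory stubs for the storage and writer ports; it assumes Settings() can be constructed from defaults and exposes the evalvault_profile field read by collect(), and that the ports require only the get_run and write_snapshot methods used in this file.

import json
from pathlib import Path

from evalvault.config.settings import Settings
from evalvault.domain.services.ops_snapshot_service import (
    OpsSnapshotRequest,
    OpsSnapshotService,
)


class InMemoryStorage:
    """Hypothetical stub that returns a pre-loaded EvaluationRun by id."""

    def __init__(self, run):
        self._run = run

    def get_run(self, run_id: str):
        if run_id != self._run.run_id:
            raise KeyError(run_id)
        return self._run


class JsonSnapshotWriter:
    """Hypothetical stub that writes the serialized envelope as JSON."""

    def write_snapshot(self, path: Path, payload: dict) -> None:
        path.write_text(json.dumps(payload, indent=2, default=str))


def snapshot_run(run, out_path: Path):
    # `run` is a stored EvaluationRun; all other objects here are illustrative.
    service = OpsSnapshotService(
        storage=InMemoryStorage(run),
        writer=JsonSnapshotWriter(),
        settings=Settings(),
        output_path=out_path,
    )
    request = OpsSnapshotRequest(
        run_id=run.run_id,
        profile=None,
        db_path=None,
        include_model_config=False,
        include_env=False,
    )
    envelope = service.collect(request)
    return envelope.status, envelope.duration_ms

The stubs stand in for the SQLite-backed storage adapter and the ops_snapshot_writer filesystem adapter listed in the file summary above; the service itself only duck-types against get_run and write_snapshot.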