evalvault 1.64.0-py3-none-any.whl → 1.66.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/adapter.py +14 -0
- evalvault/adapters/inbound/api/main.py +14 -4
- evalvault/adapters/inbound/api/routers/chat.py +543 -0
- evalvault/adapters/inbound/cli/commands/__init__.py +14 -7
- evalvault/adapters/inbound/cli/commands/artifacts.py +107 -0
- evalvault/adapters/inbound/cli/commands/calibrate_judge.py +283 -0
- evalvault/adapters/inbound/cli/commands/compare.py +290 -0
- evalvault/adapters/inbound/cli/commands/history.py +13 -85
- evalvault/adapters/inbound/cli/commands/ops.py +110 -0
- evalvault/adapters/inbound/cli/commands/profile_difficulty.py +160 -0
- evalvault/adapters/inbound/cli/commands/regress.py +251 -0
- evalvault/adapters/inbound/cli/commands/run.py +14 -0
- evalvault/adapters/inbound/cli/commands/run_helpers.py +21 -2
- evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py +49 -0
- evalvault/adapters/outbound/artifact_fs.py +16 -0
- evalvault/adapters/outbound/filesystem/__init__.py +3 -0
- evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py +50 -0
- evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py +13 -0
- evalvault/adapters/outbound/judge_calibration_adapter.py +36 -0
- evalvault/adapters/outbound/judge_calibration_reporter.py +57 -0
- evalvault/adapters/outbound/report/llm_report_generator.py +13 -1
- evalvault/adapters/outbound/storage/base_sql.py +41 -1
- evalvault/adapters/outbound/tracker/langfuse_adapter.py +13 -7
- evalvault/adapters/outbound/tracker/mlflow_adapter.py +5 -0
- evalvault/adapters/outbound/tracker/phoenix_adapter.py +68 -14
- evalvault/config/settings.py +21 -0
- evalvault/domain/entities/__init__.py +10 -0
- evalvault/domain/entities/judge_calibration.py +50 -0
- evalvault/domain/entities/prompt.py +1 -1
- evalvault/domain/entities/stage.py +11 -3
- evalvault/domain/metrics/__init__.py +8 -0
- evalvault/domain/metrics/registry.py +39 -3
- evalvault/domain/metrics/summary_accuracy.py +189 -0
- evalvault/domain/metrics/summary_needs_followup.py +45 -0
- evalvault/domain/metrics/summary_non_definitive.py +41 -0
- evalvault/domain/metrics/summary_risk_coverage.py +45 -0
- evalvault/domain/services/artifact_lint_service.py +268 -0
- evalvault/domain/services/benchmark_runner.py +1 -6
- evalvault/domain/services/custom_metric_snapshot.py +233 -0
- evalvault/domain/services/dataset_preprocessor.py +26 -0
- evalvault/domain/services/difficulty_profile_reporter.py +25 -0
- evalvault/domain/services/difficulty_profiling_service.py +304 -0
- evalvault/domain/services/evaluator.py +282 -27
- evalvault/domain/services/judge_calibration_service.py +495 -0
- evalvault/domain/services/ops_snapshot_service.py +159 -0
- evalvault/domain/services/prompt_registry.py +39 -10
- evalvault/domain/services/regression_gate_service.py +199 -0
- evalvault/domain/services/run_comparison_service.py +159 -0
- evalvault/domain/services/stage_event_builder.py +6 -1
- evalvault/domain/services/stage_metric_service.py +83 -18
- evalvault/domain/services/threshold_profiles.py +4 -0
- evalvault/domain/services/visual_space_service.py +79 -4
- evalvault/ports/outbound/__init__.py +4 -0
- evalvault/ports/outbound/artifact_fs_port.py +12 -0
- evalvault/ports/outbound/comparison_pipeline_port.py +22 -0
- evalvault/ports/outbound/difficulty_profile_port.py +15 -0
- evalvault/ports/outbound/judge_calibration_port.py +22 -0
- evalvault/ports/outbound/ops_snapshot_port.py +8 -0
- {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/METADATA +25 -1
- {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/RECORD +63 -31
- {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/WHEEL +0 -0
- {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/licenses/LICENSE.md +0 -0

evalvault/domain/services/difficulty_profiling_service.py (new file, +304 lines):

```diff
@@ -0,0 +1,304 @@
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Any
+
+from evalvault.domain.entities import EvaluationRun, MetricScore, TestCaseResult
+from evalvault.domain.services.difficulty_profile_reporter import DifficultyProfileReporter
+from evalvault.ports.outbound.storage_port import StoragePort
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True)
+class DifficultyProfileRequest:
+    dataset_name: str | None
+    run_id: str | None
+    limit_runs: int | None
+    metrics: tuple[str, ...] | None
+    bucket_count: int
+    min_samples: int
+    output_path: Path
+    artifacts_dir: Path
+    parallel: bool
+    concurrency: int | None
+
+
+@dataclass(frozen=True)
+class DifficultyCaseProfile:
+    run_id: str
+    test_case_id: str
+    metric_scores: dict[str, float]
+    avg_score: float
+    difficulty_score: float
+    bucket: str
+    passed: bool
+
+
+@dataclass(frozen=True)
+class DifficultyBucketSummary:
+    label: str
+    count: int
+    ratio: float
+    avg_score: float | None
+    pass_rate: float | None
+
+
+class DifficultyProfilingService:
+    def __init__(self, *, storage: StoragePort, reporter: DifficultyProfileReporter) -> None:
+        self._storage = storage
+        self._reporter = reporter
+
+    def profile(self, request: DifficultyProfileRequest) -> dict[str, Any]:
+        started_at = datetime.now(UTC)
+        logger.info(
+            "difficulty profiling started",
+            extra={
+                "dataset_name": request.dataset_name,
+                "run_id": request.run_id,
+                "bucket_count": request.bucket_count,
+                "parallel": request.parallel,
+            },
+        )
+
+        runs = self._load_runs(request)
+        metrics = self._resolve_metrics(request, runs)
+        cases = self._collect_cases(runs, metrics)
+
+        if len(cases) < request.min_samples:
+            logger.warning(
+                "difficulty profiling aborted: insufficient samples",
+                extra={"sample_count": len(cases), "min_samples": request.min_samples},
+            )
+            raise ValueError("insufficient history to build difficulty profile")
+
+        case_profiles, bucket_summaries = self._assign_buckets(
+            cases, bucket_count=request.bucket_count
+        )
+        breakdown = _build_breakdown(bucket_summaries)
+        failure_concentration = _build_failure_concentration(bucket_summaries)
+
+        data = {
+            "run_id": request.run_id,
+            "dataset_name": request.dataset_name,
+            "run_ids": sorted({case.run_id for case in case_profiles}),
+            "metrics": list(metrics),
+            "bucket_count": request.bucket_count,
+            "min_samples": request.min_samples,
+            "total_cases": len(case_profiles),
+            "dataset_difficulty_distribution": breakdown,
+            "accuracy_by_difficulty": _build_accuracy(bucket_summaries),
+            "failure_concentration": failure_concentration,
+            "buckets": [
+                {
+                    "label": bucket.label,
+                    "count": bucket.count,
+                    "ratio": bucket.ratio,
+                    "avg_score": bucket.avg_score,
+                    "pass_rate": bucket.pass_rate,
+                }
+                for bucket in bucket_summaries
+            ],
+        }
+
+        finished_at = datetime.now(UTC)
+        duration_ms = int((finished_at - started_at).total_seconds() * 1000)
+        envelope = {
+            "command": "profile-difficulty",
+            "version": 1,
+            "status": "ok",
+            "started_at": started_at.isoformat(),
+            "finished_at": finished_at.isoformat(),
+            "duration_ms": duration_ms,
+            "artifacts": {},
+            "data": data,
+        }
+
+        artifacts_payload = {
+            "breakdown": {
+                "run_id": request.run_id,
+                "dataset_difficulty_distribution": breakdown,
+                "accuracy_by_difficulty": _build_accuracy(bucket_summaries),
+                "failure_concentration": failure_concentration,
+            },
+            "cases": [
+                {
+                    "run_id": case.run_id,
+                    "test_case_id": case.test_case_id,
+                    "metric_scores": case.metric_scores,
+                    "avg_score": case.avg_score,
+                    "difficulty_score": case.difficulty_score,
+                    "bucket": case.bucket,
+                    "passed": case.passed,
+                }
+                for case in case_profiles
+            ],
+        }
+
+        artifacts_index = self._reporter.write(
+            output_path=request.output_path,
+            artifacts_dir=request.artifacts_dir,
+            envelope=envelope,
+            artifacts=artifacts_payload,
+        )
+        envelope["artifacts"] = artifacts_index
+        logger.info(
+            "difficulty profiling completed",
+            extra={"artifact_dir": artifacts_index.get("dir"), "case_count": len(case_profiles)},
+        )
+        return envelope
+
+    def _load_runs(self, request: DifficultyProfileRequest) -> list[EvaluationRun]:
+        if request.run_id:
+            run = self._storage.get_run(request.run_id)
+            return [run]
+        if not request.dataset_name:
+            raise ValueError("dataset_name or run_id is required")
+        limit = request.limit_runs or 50
+        runs = self._storage.list_runs(limit=limit, dataset_name=request.dataset_name)
+        if not runs:
+            raise ValueError("no runs found for dataset")
+        return runs
+
+    def _resolve_metrics(
+        self, request: DifficultyProfileRequest, runs: list[EvaluationRun]
+    ) -> tuple[str, ...]:
+        if request.metrics:
+            return request.metrics
+        metrics: set[str] = set()
+        for run in runs:
+            metrics.update(run.metrics_evaluated)
+        if not metrics:
+            raise ValueError("no metrics available for difficulty profiling")
+        return tuple(sorted(metrics))
+
+    def _collect_cases(
+        self, runs: list[EvaluationRun], metrics: tuple[str, ...]
+    ) -> list[tuple[str, TestCaseResult, dict[str, float], bool]]:
+        cases = []
+        for run in runs:
+            for result in run.results:
+                metric_scores, passed = _extract_metric_scores(result, metrics)
+                if not metric_scores:
+                    continue
+                cases.append((run.run_id, result, metric_scores, passed))
+        return cases
+
+    def _assign_buckets(
+        self, cases: list[tuple[str, TestCaseResult, dict[str, float], bool]], *, bucket_count: int
+    ) -> tuple[list[DifficultyCaseProfile], list[DifficultyBucketSummary]]:
+        sorted_cases = sorted(cases, key=lambda item: _difficulty_score(item[2]))
+        total_cases = len(sorted_cases)
+        if total_cases == 0:
+            return [], []
+        labels = _bucket_labels(bucket_count)
+
+        bucket_map: dict[str, list[DifficultyCaseProfile]] = {label: [] for label in labels}
+
+        for index, (run_id, result, metric_scores, passed) in enumerate(sorted_cases):
+            bucket_index = min(int(index / total_cases * bucket_count), bucket_count - 1)
+            label = labels[bucket_index]
+            avg_score = sum(metric_scores.values()) / len(metric_scores)
+            difficulty_score = _difficulty_score(metric_scores)
+            profile = DifficultyCaseProfile(
+                run_id=run_id,
+                test_case_id=result.test_case_id,
+                metric_scores=metric_scores,
+                avg_score=avg_score,
+                difficulty_score=difficulty_score,
+                bucket=label,
+                passed=passed,
+            )
+            bucket_map[label].append(profile)
+
+        bucket_summaries: list[DifficultyBucketSummary] = []
+        case_profiles: list[DifficultyCaseProfile] = []
+        for label in labels:
+            bucket_cases = bucket_map[label]
+            case_profiles.extend(bucket_cases)
+            count = len(bucket_cases)
+            ratio = count / total_cases if total_cases else 0.0
+            avg_score = _safe_average([case.avg_score for case in bucket_cases])
+            pass_rate = _safe_average([1.0 if case.passed else 0.0 for case in bucket_cases])
+            bucket_summaries.append(
+                DifficultyBucketSummary(
+                    label=label,
+                    count=count,
+                    ratio=ratio,
+                    avg_score=avg_score,
+                    pass_rate=pass_rate,
+                )
+            )
+        return case_profiles, bucket_summaries
+
+
+def _extract_metric_scores(
+    result: TestCaseResult, metrics: tuple[str, ...]
+) -> tuple[dict[str, float], bool]:
+    scores: dict[str, float] = {}
+    passed_all = True
+    for metric_name in metrics:
+        metric: MetricScore | None = result.get_metric(metric_name)
+        if metric is None:
+            continue
+        scores[metric_name] = float(metric.score)
+        passed_all = passed_all and metric.score >= metric.threshold
+    if not scores:
+        passed_all = False
+    return scores, passed_all
+
+
+def _difficulty_score(metric_scores: dict[str, float]) -> float:
+    if not metric_scores:
+        return 1.0
+    avg_score = sum(metric_scores.values()) / len(metric_scores)
+    score = 1.0 - avg_score
+    return min(max(score, 0.0), 1.0)
+
+
+def _safe_average(values: list[float]) -> float | None:
+    if not values:
+        return None
+    return sum(values) / len(values)
+
+
+def _bucket_labels(bucket_count: int) -> list[str]:
+    if bucket_count == 3:
+        return ["easy", "medium", "hard"]
+    return [f"bucket_{index}" for index in range(1, bucket_count + 1)]
+
+
+def _build_breakdown(buckets: list[DifficultyBucketSummary]) -> dict[str, float]:
+    return {bucket.label: bucket.ratio for bucket in buckets}
+
+
+def _build_accuracy(buckets: list[DifficultyBucketSummary]) -> dict[str, float | None]:
+    return {bucket.label: bucket.pass_rate for bucket in buckets}
+
+
+def _build_failure_concentration(buckets: list[DifficultyBucketSummary]) -> dict[str, Any]:
+    if not buckets:
+        return {
+            "primary_difficulty": None,
+            "primary_flags": [],
+            "actionable_insight": "난이도 분포 데이터가 없습니다.",
+        }
+    primary = max(buckets, key=lambda bucket: (1 - (bucket.pass_rate or 0.0), bucket.count))
+    flags: list[str] = []
+    if primary.pass_rate is not None and primary.pass_rate < 0.5:
+        flags.append("low_pass_rate")
+    if primary.avg_score is not None and primary.avg_score < 0.5:
+        flags.append("low_avg_score")
+    insight = "난이도 분포에 큰 편차가 없습니다."
+    if "low_pass_rate" in flags:
+        insight = "해당 난이도 구간에서 정답률이 낮습니다."
+    elif "low_avg_score" in flags:
+        insight = "메트릭 평균 점수가 낮습니다."
+    return {
+        "primary_difficulty": primary.label,
+        "primary_flags": flags,
+        "actionable_insight": insight,
+    }
```