evalvault 1.64.0-py3-none-any.whl → 1.66.0-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Files changed (63)
  1. evalvault/adapters/inbound/api/adapter.py +14 -0
  2. evalvault/adapters/inbound/api/main.py +14 -4
  3. evalvault/adapters/inbound/api/routers/chat.py +543 -0
  4. evalvault/adapters/inbound/cli/commands/__init__.py +14 -7
  5. evalvault/adapters/inbound/cli/commands/artifacts.py +107 -0
  6. evalvault/adapters/inbound/cli/commands/calibrate_judge.py +283 -0
  7. evalvault/adapters/inbound/cli/commands/compare.py +290 -0
  8. evalvault/adapters/inbound/cli/commands/history.py +13 -85
  9. evalvault/adapters/inbound/cli/commands/ops.py +110 -0
  10. evalvault/adapters/inbound/cli/commands/profile_difficulty.py +160 -0
  11. evalvault/adapters/inbound/cli/commands/regress.py +251 -0
  12. evalvault/adapters/inbound/cli/commands/run.py +14 -0
  13. evalvault/adapters/inbound/cli/commands/run_helpers.py +21 -2
  14. evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py +49 -0
  15. evalvault/adapters/outbound/artifact_fs.py +16 -0
  16. evalvault/adapters/outbound/filesystem/__init__.py +3 -0
  17. evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py +50 -0
  18. evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py +13 -0
  19. evalvault/adapters/outbound/judge_calibration_adapter.py +36 -0
  20. evalvault/adapters/outbound/judge_calibration_reporter.py +57 -0
  21. evalvault/adapters/outbound/report/llm_report_generator.py +13 -1
  22. evalvault/adapters/outbound/storage/base_sql.py +41 -1
  23. evalvault/adapters/outbound/tracker/langfuse_adapter.py +13 -7
  24. evalvault/adapters/outbound/tracker/mlflow_adapter.py +5 -0
  25. evalvault/adapters/outbound/tracker/phoenix_adapter.py +68 -14
  26. evalvault/config/settings.py +21 -0
  27. evalvault/domain/entities/__init__.py +10 -0
  28. evalvault/domain/entities/judge_calibration.py +50 -0
  29. evalvault/domain/entities/prompt.py +1 -1
  30. evalvault/domain/entities/stage.py +11 -3
  31. evalvault/domain/metrics/__init__.py +8 -0
  32. evalvault/domain/metrics/registry.py +39 -3
  33. evalvault/domain/metrics/summary_accuracy.py +189 -0
  34. evalvault/domain/metrics/summary_needs_followup.py +45 -0
  35. evalvault/domain/metrics/summary_non_definitive.py +41 -0
  36. evalvault/domain/metrics/summary_risk_coverage.py +45 -0
  37. evalvault/domain/services/artifact_lint_service.py +268 -0
  38. evalvault/domain/services/benchmark_runner.py +1 -6
  39. evalvault/domain/services/custom_metric_snapshot.py +233 -0
  40. evalvault/domain/services/dataset_preprocessor.py +26 -0
  41. evalvault/domain/services/difficulty_profile_reporter.py +25 -0
  42. evalvault/domain/services/difficulty_profiling_service.py +304 -0
  43. evalvault/domain/services/evaluator.py +282 -27
  44. evalvault/domain/services/judge_calibration_service.py +495 -0
  45. evalvault/domain/services/ops_snapshot_service.py +159 -0
  46. evalvault/domain/services/prompt_registry.py +39 -10
  47. evalvault/domain/services/regression_gate_service.py +199 -0
  48. evalvault/domain/services/run_comparison_service.py +159 -0
  49. evalvault/domain/services/stage_event_builder.py +6 -1
  50. evalvault/domain/services/stage_metric_service.py +83 -18
  51. evalvault/domain/services/threshold_profiles.py +4 -0
  52. evalvault/domain/services/visual_space_service.py +79 -4
  53. evalvault/ports/outbound/__init__.py +4 -0
  54. evalvault/ports/outbound/artifact_fs_port.py +12 -0
  55. evalvault/ports/outbound/comparison_pipeline_port.py +22 -0
  56. evalvault/ports/outbound/difficulty_profile_port.py +15 -0
  57. evalvault/ports/outbound/judge_calibration_port.py +22 -0
  58. evalvault/ports/outbound/ops_snapshot_port.py +8 -0
  59. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/METADATA +25 -1
  60. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/RECORD +63 -31
  61. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/WHEEL +0 -0
  62. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/entry_points.txt +0 -0
  63. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/licenses/LICENSE.md +0 -0
--- /dev/null
+++ b/evalvault/domain/services/difficulty_profiling_service.py
@@ -0,0 +1,304 @@
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Any
+
+from evalvault.domain.entities import EvaluationRun, MetricScore, TestCaseResult
+from evalvault.domain.services.difficulty_profile_reporter import DifficultyProfileReporter
+from evalvault.ports.outbound.storage_port import StoragePort
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True)
+class DifficultyProfileRequest:
+    dataset_name: str | None
+    run_id: str | None
+    limit_runs: int | None
+    metrics: tuple[str, ...] | None
+    bucket_count: int
+    min_samples: int
+    output_path: Path
+    artifacts_dir: Path
+    parallel: bool
+    concurrency: int | None
+
+
+@dataclass(frozen=True)
+class DifficultyCaseProfile:
+    run_id: str
+    test_case_id: str
+    metric_scores: dict[str, float]
+    avg_score: float
+    difficulty_score: float
+    bucket: str
+    passed: bool
+
+
+@dataclass(frozen=True)
+class DifficultyBucketSummary:
+    label: str
+    count: int
+    ratio: float
+    avg_score: float | None
+    pass_rate: float | None
+
+
+class DifficultyProfilingService:
+    def __init__(self, *, storage: StoragePort, reporter: DifficultyProfileReporter) -> None:
+        self._storage = storage
+        self._reporter = reporter
+
+    def profile(self, request: DifficultyProfileRequest) -> dict[str, Any]:
+        started_at = datetime.now(UTC)
+        logger.info(
+            "difficulty profiling started",
+            extra={
+                "dataset_name": request.dataset_name,
+                "run_id": request.run_id,
+                "bucket_count": request.bucket_count,
+                "parallel": request.parallel,
+            },
+        )
+
+        runs = self._load_runs(request)
+        metrics = self._resolve_metrics(request, runs)
+        cases = self._collect_cases(runs, metrics)
+
+        if len(cases) < request.min_samples:
+            logger.warning(
+                "difficulty profiling aborted: insufficient samples",
+                extra={"sample_count": len(cases), "min_samples": request.min_samples},
+            )
+            raise ValueError("insufficient history to build difficulty profile")
+
+        case_profiles, bucket_summaries = self._assign_buckets(
+            cases, bucket_count=request.bucket_count
+        )
+        breakdown = _build_breakdown(bucket_summaries)
+        failure_concentration = _build_failure_concentration(bucket_summaries)
+
+        data = {
+            "run_id": request.run_id,
+            "dataset_name": request.dataset_name,
+            "run_ids": sorted({case.run_id for case in case_profiles}),
+            "metrics": list(metrics),
+            "bucket_count": request.bucket_count,
+            "min_samples": request.min_samples,
+            "total_cases": len(case_profiles),
+            "dataset_difficulty_distribution": breakdown,
+            "accuracy_by_difficulty": _build_accuracy(bucket_summaries),
+            "failure_concentration": failure_concentration,
+            "buckets": [
+                {
+                    "label": bucket.label,
+                    "count": bucket.count,
+                    "ratio": bucket.ratio,
+                    "avg_score": bucket.avg_score,
+                    "pass_rate": bucket.pass_rate,
+                }
+                for bucket in bucket_summaries
+            ],
+        }
+
+        finished_at = datetime.now(UTC)
+        duration_ms = int((finished_at - started_at).total_seconds() * 1000)
+        envelope = {
+            "command": "profile-difficulty",
+            "version": 1,
+            "status": "ok",
+            "started_at": started_at.isoformat(),
+            "finished_at": finished_at.isoformat(),
+            "duration_ms": duration_ms,
+            "artifacts": {},
+            "data": data,
+        }
+
+        artifacts_payload = {
+            "breakdown": {
+                "run_id": request.run_id,
+                "dataset_difficulty_distribution": breakdown,
+                "accuracy_by_difficulty": _build_accuracy(bucket_summaries),
+                "failure_concentration": failure_concentration,
+            },
+            "cases": [
+                {
+                    "run_id": case.run_id,
+                    "test_case_id": case.test_case_id,
+                    "metric_scores": case.metric_scores,
+                    "avg_score": case.avg_score,
+                    "difficulty_score": case.difficulty_score,
+                    "bucket": case.bucket,
+                    "passed": case.passed,
+                }
+                for case in case_profiles
+            ],
+        }
+
+        artifacts_index = self._reporter.write(
+            output_path=request.output_path,
+            artifacts_dir=request.artifacts_dir,
+            envelope=envelope,
+            artifacts=artifacts_payload,
+        )
+        envelope["artifacts"] = artifacts_index
+        logger.info(
+            "difficulty profiling completed",
+            extra={"artifact_dir": artifacts_index.get("dir"), "case_count": len(case_profiles)},
+        )
+        return envelope
+
+    def _load_runs(self, request: DifficultyProfileRequest) -> list[EvaluationRun]:
+        if request.run_id:
+            run = self._storage.get_run(request.run_id)
+            return [run]
+        if not request.dataset_name:
+            raise ValueError("dataset_name or run_id is required")
+        limit = request.limit_runs or 50
+        runs = self._storage.list_runs(limit=limit, dataset_name=request.dataset_name)
+        if not runs:
+            raise ValueError("no runs found for dataset")
+        return runs
+
+    def _resolve_metrics(
+        self, request: DifficultyProfileRequest, runs: list[EvaluationRun]
+    ) -> tuple[str, ...]:
+        if request.metrics:
+            return request.metrics
+        metrics: set[str] = set()
+        for run in runs:
+            metrics.update(run.metrics_evaluated)
+        if not metrics:
+            raise ValueError("no metrics available for difficulty profiling")
+        return tuple(sorted(metrics))
+
+    def _collect_cases(
+        self, runs: list[EvaluationRun], metrics: tuple[str, ...]
+    ) -> list[tuple[str, TestCaseResult, dict[str, float], bool]]:
+        cases = []
+        for run in runs:
+            for result in run.results:
+                metric_scores, passed = _extract_metric_scores(result, metrics)
+                if not metric_scores:
+                    continue
+                cases.append((run.run_id, result, metric_scores, passed))
+        return cases
+
+    def _assign_buckets(
+        self, cases: list[tuple[str, TestCaseResult, dict[str, float], bool]], *, bucket_count: int
+    ) -> tuple[list[DifficultyCaseProfile], list[DifficultyBucketSummary]]:
+        sorted_cases = sorted(cases, key=lambda item: _difficulty_score(item[2]))
+        total_cases = len(sorted_cases)
+        if total_cases == 0:
+            return [], []
+        labels = _bucket_labels(bucket_count)
+
+        bucket_map: dict[str, list[DifficultyCaseProfile]] = {label: [] for label in labels}
+
+        for index, (run_id, result, metric_scores, passed) in enumerate(sorted_cases):
+            bucket_index = min(int(index / total_cases * bucket_count), bucket_count - 1)
+            label = labels[bucket_index]
+            avg_score = sum(metric_scores.values()) / len(metric_scores)
+            difficulty_score = _difficulty_score(metric_scores)
+            profile = DifficultyCaseProfile(
+                run_id=run_id,
+                test_case_id=result.test_case_id,
+                metric_scores=metric_scores,
+                avg_score=avg_score,
+                difficulty_score=difficulty_score,
+                bucket=label,
+                passed=passed,
+            )
+            bucket_map[label].append(profile)
+
+        bucket_summaries: list[DifficultyBucketSummary] = []
+        case_profiles: list[DifficultyCaseProfile] = []
+        for label in labels:
+            bucket_cases = bucket_map[label]
+            case_profiles.extend(bucket_cases)
+            count = len(bucket_cases)
+            ratio = count / total_cases if total_cases else 0.0
+            avg_score = _safe_average([case.avg_score for case in bucket_cases])
+            pass_rate = _safe_average([1.0 if case.passed else 0.0 for case in bucket_cases])
+            bucket_summaries.append(
+                DifficultyBucketSummary(
+                    label=label,
+                    count=count,
+                    ratio=ratio,
+                    avg_score=avg_score,
+                    pass_rate=pass_rate,
+                )
+            )
+        return case_profiles, bucket_summaries
+
+
+def _extract_metric_scores(
+    result: TestCaseResult, metrics: tuple[str, ...]
+) -> tuple[dict[str, float], bool]:
+    scores: dict[str, float] = {}
+    passed_all = True
+    for metric_name in metrics:
+        metric: MetricScore | None = result.get_metric(metric_name)
+        if metric is None:
+            continue
+        scores[metric_name] = float(metric.score)
+        passed_all = passed_all and metric.score >= metric.threshold
+    if not scores:
+        passed_all = False
+    return scores, passed_all
+
+
+def _difficulty_score(metric_scores: dict[str, float]) -> float:
+    if not metric_scores:
+        return 1.0
+    avg_score = sum(metric_scores.values()) / len(metric_scores)
+    score = 1.0 - avg_score
+    return min(max(score, 0.0), 1.0)
+
+
+def _safe_average(values: list[float]) -> float | None:
+    if not values:
+        return None
+    return sum(values) / len(values)
+
+
+def _bucket_labels(bucket_count: int) -> list[str]:
+    if bucket_count == 3:
+        return ["easy", "medium", "hard"]
+    return [f"bucket_{index}" for index in range(1, bucket_count + 1)]
+
+
+def _build_breakdown(buckets: list[DifficultyBucketSummary]) -> dict[str, float]:
+    return {bucket.label: bucket.ratio for bucket in buckets}
+
+
+def _build_accuracy(buckets: list[DifficultyBucketSummary]) -> dict[str, float | None]:
+    return {bucket.label: bucket.pass_rate for bucket in buckets}
+
+
+def _build_failure_concentration(buckets: list[DifficultyBucketSummary]) -> dict[str, Any]:
+    if not buckets:
+        return {
+            "primary_difficulty": None,
+            "primary_flags": [],
+            "actionable_insight": "No difficulty distribution data is available.",
+        }
+    primary = max(buckets, key=lambda bucket: (1 - (bucket.pass_rate or 0.0), bucket.count))
+    flags: list[str] = []
+    if primary.pass_rate is not None and primary.pass_rate < 0.5:
+        flags.append("low_pass_rate")
+    if primary.avg_score is not None and primary.avg_score < 0.5:
+        flags.append("low_avg_score")
+    insight = "No significant variance across difficulty buckets."
+    if "low_pass_rate" in flags:
+        insight = "Pass rate is low in this difficulty bucket."
+    elif "low_avg_score" in flags:
+        insight = "Average metric scores are low."
+    return {
+        "primary_difficulty": primary.label,
+        "primary_flags": flags,
+        "actionable_insight": insight,
+    }
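
For orientation, below is a minimal sketch of how the new DifficultyProfilingService could be driven outside the `profile-difficulty` CLI command (added in commands/profile_difficulty.py above). The `storage` and `reporter` arguments, the dataset name, and the output paths are illustrative assumptions rather than values taken from the package; in practice the CLI adapter wires in concrete StoragePort and DifficultyProfileReporter implementations.

    # Illustrative sketch only: `storage` and `reporter` stand in for whatever concrete
    # StoragePort / DifficultyProfileReporter implementations the application composes.
    from pathlib import Path

    from evalvault.domain.services.difficulty_profiling_service import (
        DifficultyProfileRequest,
        DifficultyProfilingService,
    )


    def build_difficulty_profile(storage, reporter) -> dict:
        service = DifficultyProfilingService(storage=storage, reporter=reporter)
        request = DifficultyProfileRequest(
            dataset_name="example-dataset",  # profile recent runs of one dataset...
            run_id=None,                     # ...or pin a single run_id instead
            limit_runs=20,                   # None falls back to the service default of 50
            metrics=None,                    # None -> union of metrics seen across the runs
            bucket_count=3,                  # 3 buckets are labeled easy/medium/hard
            min_samples=30,                  # fewer collected cases raises ValueError
            output_path=Path("reports/difficulty.json"),
            artifacts_dir=Path("artifacts"),
            parallel=False,
            concurrency=None,
        )
        # Returns the envelope dict ("command": "profile-difficulty", "data": {...}),
        # with "artifacts" filled in from the index returned by the reporter's write().
        return service.profile(request)

Per the service code, each case's difficulty is 1 minus its average metric score, clamped to [0, 1], and cases are rank-ordered into equal-width quantile buckets, so the hardest cases land in the last bucket.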