evalvault 1.63.1__py3-none-any.whl → 1.65.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. evalvault/adapters/inbound/api/main.py +147 -9
  2. evalvault/adapters/inbound/api/routers/config.py +6 -1
  3. evalvault/adapters/inbound/api/routers/knowledge.py +62 -6
  4. evalvault/adapters/inbound/cli/commands/__init__.py +14 -7
  5. evalvault/adapters/inbound/cli/commands/artifacts.py +107 -0
  6. evalvault/adapters/inbound/cli/commands/calibrate_judge.py +283 -0
  7. evalvault/adapters/inbound/cli/commands/compare.py +290 -0
  8. evalvault/adapters/inbound/cli/commands/history.py +13 -85
  9. evalvault/adapters/inbound/cli/commands/ops.py +110 -0
  10. evalvault/adapters/inbound/cli/commands/profile_difficulty.py +160 -0
  11. evalvault/adapters/inbound/cli/commands/regress.py +251 -0
  12. evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py +49 -0
  13. evalvault/adapters/outbound/artifact_fs.py +16 -0
  14. evalvault/adapters/outbound/filesystem/__init__.py +3 -0
  15. evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py +50 -0
  16. evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py +13 -0
  17. evalvault/adapters/outbound/judge_calibration_adapter.py +36 -0
  18. evalvault/adapters/outbound/judge_calibration_reporter.py +57 -0
  19. evalvault/adapters/outbound/methods/external_command.py +22 -1
  20. evalvault/adapters/outbound/tracker/langfuse_adapter.py +40 -15
  21. evalvault/adapters/outbound/tracker/log_sanitizer.py +93 -0
  22. evalvault/adapters/outbound/tracker/mlflow_adapter.py +3 -2
  23. evalvault/adapters/outbound/tracker/phoenix_adapter.py +90 -37
  24. evalvault/config/secret_manager.py +118 -0
  25. evalvault/config/settings.py +141 -1
  26. evalvault/domain/entities/__init__.py +10 -0
  27. evalvault/domain/entities/judge_calibration.py +50 -0
  28. evalvault/domain/entities/stage.py +11 -3
  29. evalvault/domain/services/artifact_lint_service.py +268 -0
  30. evalvault/domain/services/benchmark_runner.py +1 -6
  31. evalvault/domain/services/dataset_preprocessor.py +26 -0
  32. evalvault/domain/services/difficulty_profile_reporter.py +25 -0
  33. evalvault/domain/services/difficulty_profiling_service.py +304 -0
  34. evalvault/domain/services/evaluator.py +2 -0
  35. evalvault/domain/services/judge_calibration_service.py +495 -0
  36. evalvault/domain/services/ops_snapshot_service.py +159 -0
  37. evalvault/domain/services/regression_gate_service.py +199 -0
  38. evalvault/domain/services/run_comparison_service.py +159 -0
  39. evalvault/domain/services/stage_event_builder.py +6 -1
  40. evalvault/domain/services/stage_metric_service.py +83 -18
  41. evalvault/ports/outbound/__init__.py +4 -0
  42. evalvault/ports/outbound/artifact_fs_port.py +12 -0
  43. evalvault/ports/outbound/comparison_pipeline_port.py +22 -0
  44. evalvault/ports/outbound/difficulty_profile_port.py +15 -0
  45. evalvault/ports/outbound/judge_calibration_port.py +22 -0
  46. evalvault/ports/outbound/ops_snapshot_port.py +8 -0
  47. {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/METADATA +8 -1
  48. {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/RECORD +51 -23
  49. {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/WHEEL +0 -0
  50. {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/entry_points.txt +0 -0
  51. {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/domain/services/artifact_lint_service.py
@@ -0,0 +1,268 @@
+ from __future__ import annotations
+
+ import json
+ import logging
+ from dataclasses import dataclass
+ from datetime import UTC, datetime
+ from pathlib import Path
+ from typing import Literal
+
+ from evalvault.ports.outbound.artifact_fs_port import ArtifactFileSystemPort
+
+ logger = logging.getLogger(__name__)
+
+
+ LintLevel = Literal["error", "warning"]
+ LintStatus = Literal["ok", "warning", "error"]
+
+
+ @dataclass(frozen=True)
+ class ArtifactLintIssue:
+     level: LintLevel
+     code: str
+     message: str
+     path: str | None = None
+
+
+ @dataclass(frozen=True)
+ class ArtifactLintSummary:
+     status: LintStatus
+     issues: list[ArtifactLintIssue]
+     artifacts_dir: Path
+     index_path: Path
+     started_at: datetime
+     finished_at: datetime
+     duration_ms: int
+     strict: bool
+
+
+ class ArtifactLintService:
+     def __init__(self, fs: ArtifactFileSystemPort) -> None:
+         self._fs = fs
+
+     def lint(self, artifacts_dir: Path, *, strict: bool = False) -> ArtifactLintSummary:
+         started_at = datetime.now(UTC)
+         issues: list[ArtifactLintIssue] = []
+         index_path = artifacts_dir / "index.json"
+         logger.info("Artifact lint started: %s", artifacts_dir)
+
+         try:
+             self._validate_dir(artifacts_dir, issues)
+             if not self._fs.exists(index_path):
+                 issues.append(
+                     ArtifactLintIssue(
+                         "error",
+                         "artifacts.index.missing",
+                         "index.json is missing.",
+                         path=str(index_path),
+                     )
+                 )
+             elif self._fs.exists(artifacts_dir) and self._fs.is_dir(artifacts_dir):
+                 index_payload = self._load_index(index_path, issues)
+                 if index_payload is not None:
+                     self._validate_index(
+                         index_payload,
+                         artifacts_dir,
+                         issues,
+                         strict=strict,
+                     )
+         except Exception as exc:
+             logger.exception("Artifact lint failed: %s", artifacts_dir)
+             issues.append(
+                 ArtifactLintIssue(
+                     "error",
+                     "artifacts.lint.exception",
+                     f"Unexpected error: {exc}",
+                 )
+             )
+
+         finished_at = datetime.now(UTC)
+         duration_ms = int((finished_at - started_at).total_seconds() * 1000)
+         status = _resolve_status(issues)
+         logger.info("Artifact lint finished: %s (%s)", artifacts_dir, status)
+         return ArtifactLintSummary(
+             status=status,
+             issues=issues,
+             artifacts_dir=artifacts_dir,
+             index_path=index_path,
+             started_at=started_at,
+             finished_at=finished_at,
+             duration_ms=duration_ms,
+             strict=strict,
+         )
+
+     def _validate_dir(self, artifacts_dir: Path, issues: list[ArtifactLintIssue]) -> None:
+         if not self._fs.exists(artifacts_dir):
+             issues.append(
+                 ArtifactLintIssue(
+                     "error",
+                     "artifacts.dir.missing",
+                     "Artifacts directory is missing.",
+                     path=str(artifacts_dir),
+                 )
+             )
+             return
+         if not self._fs.is_dir(artifacts_dir):
+             issues.append(
+                 ArtifactLintIssue(
+                     "error",
+                     "artifacts.dir.not_directory",
+                     "Artifacts path is not a directory.",
+                     path=str(artifacts_dir),
+                 )
+             )
+
+     def _load_index(
+         self,
+         index_path: Path,
+         issues: list[ArtifactLintIssue],
+     ) -> dict[str, object] | None:
+         try:
+             payload = json.loads(self._fs.read_text(index_path))
+         except json.JSONDecodeError as exc:
+             issues.append(
+                 ArtifactLintIssue(
+                     "error",
+                     "artifacts.index.invalid_json",
+                     f"index.json parse failed: {exc}",
+                     path=str(index_path),
+                 )
+             )
+             return None
+         except OSError as exc:
+             issues.append(
+                 ArtifactLintIssue(
+                     "error",
+                     "artifacts.index.read_failed",
+                     f"index.json read failed: {exc}",
+                     path=str(index_path),
+                 )
+             )
+             return None
+
+         if not isinstance(payload, dict):
+             issues.append(
+                 ArtifactLintIssue(
+                     "error",
+                     "artifacts.index.invalid_schema",
+                     "index.json root must be an object.",
+                     path=str(index_path),
+                 )
+             )
+             return None
+         return payload
+
+     def _validate_index(
+         self,
+         payload: dict[str, object],
+         artifacts_dir: Path,
+         issues: list[ArtifactLintIssue],
+         *,
+         strict: bool,
+     ) -> None:
+         pipeline_id = payload.get("pipeline_id")
+         if not isinstance(pipeline_id, str) or not pipeline_id.strip():
+             issues.append(
+                 ArtifactLintIssue(
+                     "error",
+                     "artifacts.index.pipeline_id.missing",
+                     "pipeline_id is missing.",
+                 )
+             )
+
+         nodes = payload.get("nodes")
+         if not isinstance(nodes, list):
+             issues.append(
+                 ArtifactLintIssue(
+                     "error",
+                     "artifacts.index.nodes.invalid",
+                     "nodes list is missing or invalid.",
+                 )
+             )
+             return
+
+         for idx, node in enumerate(nodes, start=1):
+             if not isinstance(node, dict):
+                 issues.append(
+                     ArtifactLintIssue(
+                         "error",
+                         "artifacts.index.node.invalid",
+                         f"nodes[{idx}] entry must be an object.",
+                     )
+                 )
+                 continue
+             node_id = node.get("node_id")
+             if not isinstance(node_id, str) or not node_id.strip():
+                 issues.append(
+                     ArtifactLintIssue(
+                         "error",
+                         "artifacts.index.node_id.missing",
+                         f"nodes[{idx}] node_id is missing.",
+                     )
+                 )
+             path_value = node.get("path")
+             self._validate_path(
+                 path_value,
+                 artifacts_dir,
+                 issues,
+                 strict=strict,
+                 code="artifacts.index.node.path.missing",
+                 message=f"nodes[{idx}] path is missing.",
+             )
+
+         final_output = payload.get("final_output_path")
+         if final_output:
+             self._validate_path(
+                 final_output,
+                 artifacts_dir,
+                 issues,
+                 strict=strict,
+                 code="artifacts.index.final_output.missing",
+                 message="final_output_path is missing.",
+             )
+
+     def _validate_path(
+         self,
+         path_value: object,
+         artifacts_dir: Path,
+         issues: list[ArtifactLintIssue],
+         *,
+         strict: bool,
+         code: str,
+         message: str,
+     ) -> None:
+         if not isinstance(path_value, str) or not path_value.strip():
+             issues.append(
+                 ArtifactLintIssue(
+                     "error",
+                     code,
+                     message,
+                 )
+             )
+             return
+
+         resolved = _resolve_artifact_path(artifacts_dir, Path(path_value))
+         if self._fs.exists(resolved):
+             return
+         issues.append(
+             ArtifactLintIssue(
+                 "error" if strict else "warning",
+                 code,
+                 "Artifact file is missing.",
+                 path=str(resolved),
+             )
+         )
+
+
+ def _resolve_artifact_path(base_dir: Path, candidate: Path) -> Path:
+     if candidate.is_absolute():
+         return candidate
+     return base_dir / candidate
+
+
+ def _resolve_status(issues: list[ArtifactLintIssue]) -> LintStatus:
+     if any(issue.level == "error" for issue in issues):
+         return "error"
+     if any(issue.level == "warning" for issue in issues):
+         return "warning"
+     return "ok"
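For orientation, a minimal usage sketch (not part of the diff) for the new ArtifactLintService. It assumes a hypothetical local-disk adapter that implements only the ArtifactFileSystemPort calls the service actually makes (exists, is_dir, read_text); the package's real adapter is the new evalvault/adapters/outbound/artifact_fs.py, and the path below is illustrative.

    # Hypothetical sketch: a minimal stand-in for ArtifactFileSystemPort plus one lint pass.
    from pathlib import Path

    from evalvault.domain.services.artifact_lint_service import ArtifactLintService


    class LocalArtifactFS:
        """Assumed adapter; exposes only the methods ArtifactLintService calls."""

        def exists(self, path: Path) -> bool:
            return path.exists()

        def is_dir(self, path: Path) -> bool:
            return path.is_dir()

        def read_text(self, path: Path) -> str:
            return path.read_text(encoding="utf-8")


    service = ArtifactLintService(fs=LocalArtifactFS())
    summary = service.lint(Path("artifacts/run-001"), strict=True)  # illustrative path
    for issue in summary.issues:
        print(issue.level, issue.code, issue.path or "")
    print(summary.status, f"{summary.duration_ms}ms")

With strict=True, artifact files referenced by index.json but missing on disk are recorded as errors rather than warnings, so the overall status resolves to "error" instead of "warning".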
evalvault/domain/services/benchmark_runner.py
@@ -414,12 +414,7 @@ class KoreanRAGBenchmarkRunner:
          try:
              # 형태소 분석 기반 검색
              if retriever:
-                 if self.use_hybrid_search and hasattr(retriever, "has_embeddings"):
-                     results = retriever.search(
-                         query, top_k=recall_k, use_dense=retriever.has_embeddings
-                     )
-                 else:
-                     results = retriever.search(query, top_k=recall_k)
+                 results = retriever.search(query, top_k=recall_k)
                  retrieved_doc_ids = [
                      resolve_doc_id(getattr(res, "doc_id", None), doc_ids, idx)
                      for idx, res in enumerate(results, start=1)
evalvault/domain/services/dataset_preprocessor.py
@@ -17,9 +17,22 @@ REFERENCE_REQUIRED_METRICS = {
  }

  _WHITESPACE_RE = re.compile(r"\s+")
+ _PUNCT_ONLY_RE = re.compile(r"^[\W_]+$")
  _HANGUL_RE = re.compile(r"[\uac00-\ud7a3]")
  _LATIN_RE = re.compile(r"[A-Za-z]")

+ _PLACEHOLDER_TEXT = {
+     "n/a",
+     "na",
+     "none",
+     "null",
+     "nil",
+     "unknown",
+     "tbd",
+     "todo",
+     "undefined",
+ }
+

  @dataclass(frozen=True)
  class DatasetPreprocessConfig:
@@ -205,8 +218,18 @@ class DatasetPreprocessor:
          if self._config.trim_whitespace:
              text = text.replace("\u00a0", " ")
              text = _WHITESPACE_RE.sub(" ", text).strip()
+         if self._is_noise_text(text):
+             return ""
          return text

+     def _is_noise_text(self, text: str) -> bool:
+         if not text:
+             return True
+         if _PUNCT_ONLY_RE.fullmatch(text):
+             return True
+         lower_text = text.casefold()
+         return lower_text in _PLACEHOLDER_TEXT
+
      def _normalize_contexts(self, contexts: Any) -> tuple[list[str], dict[str, int]]:
          removed = 0
          deduped = 0
@@ -292,6 +315,9 @@ class DatasetPreprocessor:
          elif source == "context":
              filled_from_context = 1

+         if reference:
+             reference = self._normalize_text(reference)
+
          if reference and self._config.max_reference_chars > 0:
              reference, did_truncate = self._truncate_text(
                  reference, self._config.max_reference_chars
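To make the new noise filter concrete, here is a self-contained sketch (not part of the diff) that mirrors DatasetPreprocessor._is_noise_text using the same regex and placeholder set added above; the helper name check_noise is illustrative only.

    import re

    _PUNCT_ONLY_RE = re.compile(r"^[\W_]+$")
    _PLACEHOLDER_TEXT = {"n/a", "na", "none", "null", "nil", "unknown", "tbd", "todo", "undefined"}


    def check_noise(text: str) -> bool:
        """Mirror of the new _is_noise_text rule, for illustration."""
        if not text:
            return True
        if _PUNCT_ONLY_RE.fullmatch(text):
            return True
        return text.casefold() in _PLACEHOLDER_TEXT


    assert check_noise("") is True          # empty after whitespace normalization
    assert check_noise("---") is True       # punctuation/underscore only
    assert check_noise("N/A") is True       # placeholder, matched case-insensitively via casefold()
    assert check_noise("retrieval works") is False  # real content is kept

In the preprocessor itself, _normalize_text now returns "" for such strings, and the same normalization is also applied to reference texts before truncation.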
evalvault/domain/services/difficulty_profile_reporter.py
@@ -0,0 +1,25 @@
+ from __future__ import annotations
+
+ from pathlib import Path
+
+ from evalvault.ports.outbound.difficulty_profile_port import DifficultyProfileWriterPort
+
+
+ class DifficultyProfileReporter:
+     def __init__(self, writer: DifficultyProfileWriterPort) -> None:
+         self._writer = writer
+
+     def write(
+         self,
+         *,
+         output_path: Path,
+         artifacts_dir: Path,
+         envelope: dict[str, object],
+         artifacts: dict[str, object],
+     ) -> dict[str, object]:
+         return self._writer.write_profile(
+             output_path=output_path,
+             artifacts_dir=artifacts_dir,
+             envelope=envelope,
+             artifacts=artifacts,
+         )
evalvault/domain/services/difficulty_profiling_service.py
@@ -0,0 +1,304 @@
+ from __future__ import annotations
+
+ import logging
+ from dataclasses import dataclass
+ from datetime import UTC, datetime
+ from pathlib import Path
+ from typing import Any
+
+ from evalvault.domain.entities import EvaluationRun, MetricScore, TestCaseResult
+ from evalvault.domain.services.difficulty_profile_reporter import DifficultyProfileReporter
+ from evalvault.ports.outbound.storage_port import StoragePort
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass(frozen=True)
+ class DifficultyProfileRequest:
+     dataset_name: str | None
+     run_id: str | None
+     limit_runs: int | None
+     metrics: tuple[str, ...] | None
+     bucket_count: int
+     min_samples: int
+     output_path: Path
+     artifacts_dir: Path
+     parallel: bool
+     concurrency: int | None
+
+
+ @dataclass(frozen=True)
+ class DifficultyCaseProfile:
+     run_id: str
+     test_case_id: str
+     metric_scores: dict[str, float]
+     avg_score: float
+     difficulty_score: float
+     bucket: str
+     passed: bool
+
+
+ @dataclass(frozen=True)
+ class DifficultyBucketSummary:
+     label: str
+     count: int
+     ratio: float
+     avg_score: float | None
+     pass_rate: float | None
+
+
+ class DifficultyProfilingService:
+     def __init__(self, *, storage: StoragePort, reporter: DifficultyProfileReporter) -> None:
+         self._storage = storage
+         self._reporter = reporter
+
+     def profile(self, request: DifficultyProfileRequest) -> dict[str, Any]:
+         started_at = datetime.now(UTC)
+         logger.info(
+             "difficulty profiling started",
+             extra={
+                 "dataset_name": request.dataset_name,
+                 "run_id": request.run_id,
+                 "bucket_count": request.bucket_count,
+                 "parallel": request.parallel,
+             },
+         )
+
+         runs = self._load_runs(request)
+         metrics = self._resolve_metrics(request, runs)
+         cases = self._collect_cases(runs, metrics)
+
+         if len(cases) < request.min_samples:
+             logger.warning(
+                 "difficulty profiling aborted: insufficient samples",
+                 extra={"sample_count": len(cases), "min_samples": request.min_samples},
+             )
+             raise ValueError("insufficient history to build difficulty profile")
+
+         case_profiles, bucket_summaries = self._assign_buckets(
+             cases, bucket_count=request.bucket_count
+         )
+         breakdown = _build_breakdown(bucket_summaries)
+         failure_concentration = _build_failure_concentration(bucket_summaries)
+
+         data = {
+             "run_id": request.run_id,
+             "dataset_name": request.dataset_name,
+             "run_ids": sorted({case.run_id for case in case_profiles}),
+             "metrics": list(metrics),
+             "bucket_count": request.bucket_count,
+             "min_samples": request.min_samples,
+             "total_cases": len(case_profiles),
+             "dataset_difficulty_distribution": breakdown,
+             "accuracy_by_difficulty": _build_accuracy(bucket_summaries),
+             "failure_concentration": failure_concentration,
+             "buckets": [
+                 {
+                     "label": bucket.label,
+                     "count": bucket.count,
+                     "ratio": bucket.ratio,
+                     "avg_score": bucket.avg_score,
+                     "pass_rate": bucket.pass_rate,
+                 }
+                 for bucket in bucket_summaries
+             ],
+         }
+
+         finished_at = datetime.now(UTC)
+         duration_ms = int((finished_at - started_at).total_seconds() * 1000)
+         envelope = {
+             "command": "profile-difficulty",
+             "version": 1,
+             "status": "ok",
+             "started_at": started_at.isoformat(),
+             "finished_at": finished_at.isoformat(),
+             "duration_ms": duration_ms,
+             "artifacts": {},
+             "data": data,
+         }
+
+         artifacts_payload = {
+             "breakdown": {
+                 "run_id": request.run_id,
+                 "dataset_difficulty_distribution": breakdown,
+                 "accuracy_by_difficulty": _build_accuracy(bucket_summaries),
+                 "failure_concentration": failure_concentration,
+             },
+             "cases": [
+                 {
+                     "run_id": case.run_id,
+                     "test_case_id": case.test_case_id,
+                     "metric_scores": case.metric_scores,
+                     "avg_score": case.avg_score,
+                     "difficulty_score": case.difficulty_score,
+                     "bucket": case.bucket,
+                     "passed": case.passed,
+                 }
+                 for case in case_profiles
+             ],
+         }
+
+         artifacts_index = self._reporter.write(
+             output_path=request.output_path,
+             artifacts_dir=request.artifacts_dir,
+             envelope=envelope,
+             artifacts=artifacts_payload,
+         )
+         envelope["artifacts"] = artifacts_index
+         logger.info(
+             "difficulty profiling completed",
+             extra={"artifact_dir": artifacts_index.get("dir"), "case_count": len(case_profiles)},
+         )
+         return envelope
+
+     def _load_runs(self, request: DifficultyProfileRequest) -> list[EvaluationRun]:
+         if request.run_id:
+             run = self._storage.get_run(request.run_id)
+             return [run]
+         if not request.dataset_name:
+             raise ValueError("dataset_name or run_id is required")
+         limit = request.limit_runs or 50
+         runs = self._storage.list_runs(limit=limit, dataset_name=request.dataset_name)
+         if not runs:
+             raise ValueError("no runs found for dataset")
+         return runs
+
+     def _resolve_metrics(
+         self, request: DifficultyProfileRequest, runs: list[EvaluationRun]
+     ) -> tuple[str, ...]:
+         if request.metrics:
+             return request.metrics
+         metrics: set[str] = set()
+         for run in runs:
+             metrics.update(run.metrics_evaluated)
+         if not metrics:
+             raise ValueError("no metrics available for difficulty profiling")
+         return tuple(sorted(metrics))
+
+     def _collect_cases(
+         self, runs: list[EvaluationRun], metrics: tuple[str, ...]
+     ) -> list[tuple[str, TestCaseResult, dict[str, float], bool]]:
+         cases = []
+         for run in runs:
+             for result in run.results:
+                 metric_scores, passed = _extract_metric_scores(result, metrics)
+                 if not metric_scores:
+                     continue
+                 cases.append((run.run_id, result, metric_scores, passed))
+         return cases
+
+     def _assign_buckets(
+         self, cases: list[tuple[str, TestCaseResult, dict[str, float], bool]], *, bucket_count: int
+     ) -> tuple[list[DifficultyCaseProfile], list[DifficultyBucketSummary]]:
+         sorted_cases = sorted(cases, key=lambda item: _difficulty_score(item[2]))
+         total_cases = len(sorted_cases)
+         if total_cases == 0:
+             return [], []
+         labels = _bucket_labels(bucket_count)
+
+         bucket_map: dict[str, list[DifficultyCaseProfile]] = {label: [] for label in labels}
+
+         for index, (run_id, result, metric_scores, passed) in enumerate(sorted_cases):
+             bucket_index = min(int(index / total_cases * bucket_count), bucket_count - 1)
+             label = labels[bucket_index]
+             avg_score = sum(metric_scores.values()) / len(metric_scores)
+             difficulty_score = _difficulty_score(metric_scores)
+             profile = DifficultyCaseProfile(
+                 run_id=run_id,
+                 test_case_id=result.test_case_id,
+                 metric_scores=metric_scores,
+                 avg_score=avg_score,
+                 difficulty_score=difficulty_score,
+                 bucket=label,
+                 passed=passed,
+             )
+             bucket_map[label].append(profile)
+
+         bucket_summaries: list[DifficultyBucketSummary] = []
+         case_profiles: list[DifficultyCaseProfile] = []
+         for label in labels:
+             bucket_cases = bucket_map[label]
+             case_profiles.extend(bucket_cases)
+             count = len(bucket_cases)
+             ratio = count / total_cases if total_cases else 0.0
+             avg_score = _safe_average([case.avg_score for case in bucket_cases])
+             pass_rate = _safe_average([1.0 if case.passed else 0.0 for case in bucket_cases])
+             bucket_summaries.append(
+                 DifficultyBucketSummary(
+                     label=label,
+                     count=count,
+                     ratio=ratio,
+                     avg_score=avg_score,
+                     pass_rate=pass_rate,
+                 )
+             )
+         return case_profiles, bucket_summaries
+
+
+ def _extract_metric_scores(
+     result: TestCaseResult, metrics: tuple[str, ...]
+ ) -> tuple[dict[str, float], bool]:
+     scores: dict[str, float] = {}
+     passed_all = True
+     for metric_name in metrics:
+         metric: MetricScore | None = result.get_metric(metric_name)
+         if metric is None:
+             continue
+         scores[metric_name] = float(metric.score)
+         passed_all = passed_all and metric.score >= metric.threshold
+     if not scores:
+         passed_all = False
+     return scores, passed_all
+
+
+ def _difficulty_score(metric_scores: dict[str, float]) -> float:
+     if not metric_scores:
+         return 1.0
+     avg_score = sum(metric_scores.values()) / len(metric_scores)
+     score = 1.0 - avg_score
+     return min(max(score, 0.0), 1.0)
+
+
+ def _safe_average(values: list[float]) -> float | None:
+     if not values:
+         return None
+     return sum(values) / len(values)
+
+
+ def _bucket_labels(bucket_count: int) -> list[str]:
+     if bucket_count == 3:
+         return ["easy", "medium", "hard"]
+     return [f"bucket_{index}" for index in range(1, bucket_count + 1)]
+
+
+ def _build_breakdown(buckets: list[DifficultyBucketSummary]) -> dict[str, float]:
+     return {bucket.label: bucket.ratio for bucket in buckets}
+
+
+ def _build_accuracy(buckets: list[DifficultyBucketSummary]) -> dict[str, float | None]:
+     return {bucket.label: bucket.pass_rate for bucket in buckets}
+
+
+ def _build_failure_concentration(buckets: list[DifficultyBucketSummary]) -> dict[str, Any]:
+     if not buckets:
+         return {
+             "primary_difficulty": None,
+             "primary_flags": [],
+             "actionable_insight": "난이도 분포 데이터가 없습니다.",
+         }
+     primary = max(buckets, key=lambda bucket: (1 - (bucket.pass_rate or 0.0), bucket.count))
+     flags: list[str] = []
+     if primary.pass_rate is not None and primary.pass_rate < 0.5:
+         flags.append("low_pass_rate")
+     if primary.avg_score is not None and primary.avg_score < 0.5:
+         flags.append("low_avg_score")
+     insight = "난이도 분포에 큰 편차가 없습니다."
+     if "low_pass_rate" in flags:
+         insight = "해당 난이도 구간에서 정답률이 낮습니다."
+     elif "low_avg_score" in flags:
+         insight = "메트릭 평균 점수가 낮습니다."
+     return {
+         "primary_difficulty": primary.label,
+         "primary_flags": flags,
+         "actionable_insight": insight,
+     }
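A short sketch (not part of the diff) of the bucketing arithmetic in _assign_buckets: each case gets difficulty_score = 1 - mean(metric scores), clamped to [0, 1], cases are sorted by that score, and buckets are assigned by quantile index. The scores below are fabricated for illustration, using bucket_count=3 so the easy/medium/hard labels apply.

    # Illustrative only: replicate the bucket-index formula for six made-up cases.
    avg_scores = [0.95, 0.90, 0.70, 0.60, 0.40, 0.20]  # hypothetical per-case metric averages
    difficulties = sorted(min(max(1.0 - s, 0.0), 1.0) for s in avg_scores)
    bucket_count = 3
    labels = ["easy", "medium", "hard"]

    for index, score in enumerate(difficulties):
        bucket_index = min(int(index / len(difficulties) * bucket_count), bucket_count - 1)
        print(f"difficulty={score:.2f} -> {labels[bucket_index]}")
    # The two lowest-difficulty cases land in "easy", the middle two in "medium",
    # and the two hardest in "hard".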
evalvault/domain/services/evaluator.py
@@ -330,6 +330,8 @@ class RagasEvaluator:
          self._active_llm_model = llm.get_model_name()
          self._active_llm = llm
          self._prompt_language = self._normalize_language_hint(language) if language else None
+         if self._prompt_language is None:
+             self._prompt_language = self._resolve_dataset_language(dataset)
          # Resolve thresholds: CLI > dataset > default(0.7)
          resolved_thresholds = {}
          for metric in metrics: