evalvault 1.63.1-py3-none-any.whl → 1.65.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/main.py +147 -9
- evalvault/adapters/inbound/api/routers/config.py +6 -1
- evalvault/adapters/inbound/api/routers/knowledge.py +62 -6
- evalvault/adapters/inbound/cli/commands/__init__.py +14 -7
- evalvault/adapters/inbound/cli/commands/artifacts.py +107 -0
- evalvault/adapters/inbound/cli/commands/calibrate_judge.py +283 -0
- evalvault/adapters/inbound/cli/commands/compare.py +290 -0
- evalvault/adapters/inbound/cli/commands/history.py +13 -85
- evalvault/adapters/inbound/cli/commands/ops.py +110 -0
- evalvault/adapters/inbound/cli/commands/profile_difficulty.py +160 -0
- evalvault/adapters/inbound/cli/commands/regress.py +251 -0
- evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py +49 -0
- evalvault/adapters/outbound/artifact_fs.py +16 -0
- evalvault/adapters/outbound/filesystem/__init__.py +3 -0
- evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py +50 -0
- evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py +13 -0
- evalvault/adapters/outbound/judge_calibration_adapter.py +36 -0
- evalvault/adapters/outbound/judge_calibration_reporter.py +57 -0
- evalvault/adapters/outbound/methods/external_command.py +22 -1
- evalvault/adapters/outbound/tracker/langfuse_adapter.py +40 -15
- evalvault/adapters/outbound/tracker/log_sanitizer.py +93 -0
- evalvault/adapters/outbound/tracker/mlflow_adapter.py +3 -2
- evalvault/adapters/outbound/tracker/phoenix_adapter.py +90 -37
- evalvault/config/secret_manager.py +118 -0
- evalvault/config/settings.py +141 -1
- evalvault/domain/entities/__init__.py +10 -0
- evalvault/domain/entities/judge_calibration.py +50 -0
- evalvault/domain/entities/stage.py +11 -3
- evalvault/domain/services/artifact_lint_service.py +268 -0
- evalvault/domain/services/benchmark_runner.py +1 -6
- evalvault/domain/services/dataset_preprocessor.py +26 -0
- evalvault/domain/services/difficulty_profile_reporter.py +25 -0
- evalvault/domain/services/difficulty_profiling_service.py +304 -0
- evalvault/domain/services/evaluator.py +2 -0
- evalvault/domain/services/judge_calibration_service.py +495 -0
- evalvault/domain/services/ops_snapshot_service.py +159 -0
- evalvault/domain/services/regression_gate_service.py +199 -0
- evalvault/domain/services/run_comparison_service.py +159 -0
- evalvault/domain/services/stage_event_builder.py +6 -1
- evalvault/domain/services/stage_metric_service.py +83 -18
- evalvault/ports/outbound/__init__.py +4 -0
- evalvault/ports/outbound/artifact_fs_port.py +12 -0
- evalvault/ports/outbound/comparison_pipeline_port.py +22 -0
- evalvault/ports/outbound/difficulty_profile_port.py +15 -0
- evalvault/ports/outbound/judge_calibration_port.py +22 -0
- evalvault/ports/outbound/ops_snapshot_port.py +8 -0
- {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/METADATA +8 -1
- {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/RECORD +51 -23
- {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/WHEEL +0 -0
- {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/domain/services/artifact_lint_service.py (new file)
@@ -0,0 +1,268 @@
+from __future__ import annotations
+
+import json
+import logging
+from dataclasses import dataclass
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Literal
+
+from evalvault.ports.outbound.artifact_fs_port import ArtifactFileSystemPort
+
+logger = logging.getLogger(__name__)
+
+
+LintLevel = Literal["error", "warning"]
+LintStatus = Literal["ok", "warning", "error"]
+
+
+@dataclass(frozen=True)
+class ArtifactLintIssue:
+    level: LintLevel
+    code: str
+    message: str
+    path: str | None = None
+
+
+@dataclass(frozen=True)
+class ArtifactLintSummary:
+    status: LintStatus
+    issues: list[ArtifactLintIssue]
+    artifacts_dir: Path
+    index_path: Path
+    started_at: datetime
+    finished_at: datetime
+    duration_ms: int
+    strict: bool
+
+
+class ArtifactLintService:
+    def __init__(self, fs: ArtifactFileSystemPort) -> None:
+        self._fs = fs
+
+    def lint(self, artifacts_dir: Path, *, strict: bool = False) -> ArtifactLintSummary:
+        started_at = datetime.now(UTC)
+        issues: list[ArtifactLintIssue] = []
+        index_path = artifacts_dir / "index.json"
+        logger.info("Artifact lint started: %s", artifacts_dir)
+
+        try:
+            self._validate_dir(artifacts_dir, issues)
+            if not self._fs.exists(index_path):
+                issues.append(
+                    ArtifactLintIssue(
+                        "error",
+                        "artifacts.index.missing",
+                        "index.json is missing.",
+                        path=str(index_path),
+                    )
+                )
+            elif self._fs.exists(artifacts_dir) and self._fs.is_dir(artifacts_dir):
+                index_payload = self._load_index(index_path, issues)
+                if index_payload is not None:
+                    self._validate_index(
+                        index_payload,
+                        artifacts_dir,
+                        issues,
+                        strict=strict,
+                    )
+        except Exception as exc:
+            logger.exception("Artifact lint failed: %s", artifacts_dir)
+            issues.append(
+                ArtifactLintIssue(
+                    "error",
+                    "artifacts.lint.exception",
+                    f"Unexpected error: {exc}",
+                )
+            )
+
+        finished_at = datetime.now(UTC)
+        duration_ms = int((finished_at - started_at).total_seconds() * 1000)
+        status = _resolve_status(issues)
+        logger.info("Artifact lint finished: %s (%s)", artifacts_dir, status)
+        return ArtifactLintSummary(
+            status=status,
+            issues=issues,
+            artifacts_dir=artifacts_dir,
+            index_path=index_path,
+            started_at=started_at,
+            finished_at=finished_at,
+            duration_ms=duration_ms,
+            strict=strict,
+        )
+
+    def _validate_dir(self, artifacts_dir: Path, issues: list[ArtifactLintIssue]) -> None:
+        if not self._fs.exists(artifacts_dir):
+            issues.append(
+                ArtifactLintIssue(
+                    "error",
+                    "artifacts.dir.missing",
+                    "Artifacts directory is missing.",
+                    path=str(artifacts_dir),
+                )
+            )
+            return
+        if not self._fs.is_dir(artifacts_dir):
+            issues.append(
+                ArtifactLintIssue(
+                    "error",
+                    "artifacts.dir.not_directory",
+                    "Artifacts path is not a directory.",
+                    path=str(artifacts_dir),
+                )
+            )
+
+    def _load_index(
+        self,
+        index_path: Path,
+        issues: list[ArtifactLintIssue],
+    ) -> dict[str, object] | None:
+        try:
+            payload = json.loads(self._fs.read_text(index_path))
+        except json.JSONDecodeError as exc:
+            issues.append(
+                ArtifactLintIssue(
+                    "error",
+                    "artifacts.index.invalid_json",
+                    f"index.json parse failed: {exc}",
+                    path=str(index_path),
+                )
+            )
+            return None
+        except OSError as exc:
+            issues.append(
+                ArtifactLintIssue(
+                    "error",
+                    "artifacts.index.read_failed",
+                    f"index.json read failed: {exc}",
+                    path=str(index_path),
+                )
+            )
+            return None
+
+        if not isinstance(payload, dict):
+            issues.append(
+                ArtifactLintIssue(
+                    "error",
+                    "artifacts.index.invalid_schema",
+                    "index.json root must be an object.",
+                    path=str(index_path),
+                )
+            )
+            return None
+        return payload
+
+    def _validate_index(
+        self,
+        payload: dict[str, object],
+        artifacts_dir: Path,
+        issues: list[ArtifactLintIssue],
+        *,
+        strict: bool,
+    ) -> None:
+        pipeline_id = payload.get("pipeline_id")
+        if not isinstance(pipeline_id, str) or not pipeline_id.strip():
+            issues.append(
+                ArtifactLintIssue(
+                    "error",
+                    "artifacts.index.pipeline_id.missing",
+                    "pipeline_id is missing.",
+                )
+            )
+
+        nodes = payload.get("nodes")
+        if not isinstance(nodes, list):
+            issues.append(
+                ArtifactLintIssue(
+                    "error",
+                    "artifacts.index.nodes.invalid",
+                    "nodes list is missing or invalid.",
+                )
+            )
+            return
+
+        for idx, node in enumerate(nodes, start=1):
+            if not isinstance(node, dict):
+                issues.append(
+                    ArtifactLintIssue(
+                        "error",
+                        "artifacts.index.node.invalid",
+                        f"nodes[{idx}] entry must be an object.",
+                    )
+                )
+                continue
+            node_id = node.get("node_id")
+            if not isinstance(node_id, str) or not node_id.strip():
+                issues.append(
+                    ArtifactLintIssue(
+                        "error",
+                        "artifacts.index.node_id.missing",
+                        f"nodes[{idx}] node_id is missing.",
+                    )
+                )
+            path_value = node.get("path")
+            self._validate_path(
+                path_value,
+                artifacts_dir,
+                issues,
+                strict=strict,
+                code="artifacts.index.node.path.missing",
+                message=f"nodes[{idx}] path is missing.",
+            )
+
+        final_output = payload.get("final_output_path")
+        if final_output:
+            self._validate_path(
+                final_output,
+                artifacts_dir,
+                issues,
+                strict=strict,
+                code="artifacts.index.final_output.missing",
+                message="final_output_path is missing.",
+            )
+
+    def _validate_path(
+        self,
+        path_value: object,
+        artifacts_dir: Path,
+        issues: list[ArtifactLintIssue],
+        *,
+        strict: bool,
+        code: str,
+        message: str,
+    ) -> None:
+        if not isinstance(path_value, str) or not path_value.strip():
+            issues.append(
+                ArtifactLintIssue(
+                    "error",
+                    code,
+                    message,
+                )
+            )
+            return
+
+        resolved = _resolve_artifact_path(artifacts_dir, Path(path_value))
+        if self._fs.exists(resolved):
+            return
+        issues.append(
+            ArtifactLintIssue(
+                "error" if strict else "warning",
+                code,
+                "Artifact file is missing.",
+                path=str(resolved),
+            )
+        )
+
+
+def _resolve_artifact_path(base_dir: Path, candidate: Path) -> Path:
+    if candidate.is_absolute():
+        return candidate
+    return base_dir / candidate
+
+
+def _resolve_status(issues: list[ArtifactLintIssue]) -> LintStatus:
+    if any(issue.level == "error" for issue in issues):
+        return "error"
+    if any(issue.level == "warning" for issue in issues):
+        return "warning"
+    return "ok"
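The lint service touches storage only through `ArtifactFileSystemPort`, so it can be exercised without the real adapter in `evalvault/adapters/outbound/artifact_fs.py`. A minimal sketch, assuming a hypothetical in-memory port (`DictFS` below is illustrative and not part of the package):

```python
# Sketch: exercising ArtifactLintService against a toy in-memory "filesystem".
from pathlib import Path

from evalvault.domain.services.artifact_lint_service import ArtifactLintService


class DictFS:
    """Hypothetical stand-in for ArtifactFileSystemPort, backed by a dict of file contents."""

    def __init__(self, files: dict[str, str]) -> None:
        self._files = files

    def exists(self, path: Path) -> bool:
        key = path.as_posix()
        return key in self._files or any(name.startswith(key + "/") for name in self._files)

    def is_dir(self, path: Path) -> bool:
        return any(name.startswith(path.as_posix() + "/") for name in self._files)

    def read_text(self, path: Path) -> str:
        return self._files[path.as_posix()]


fs = DictFS(
    {
        "artifacts/index.json": (
            '{"pipeline_id": "demo",'
            ' "nodes": [{"node_id": "n1", "path": "n1/output.json"}]}'
        ),
        # "artifacts/n1/output.json" is deliberately absent.
    }
)
summary = ArtifactLintService(fs).lint(Path("artifacts"))
print(summary.status)                            # "warning"
print([issue.code for issue in summary.issues])  # ["artifacts.index.node.path.missing"]
```

With `strict=True` the same missing-file finding is raised at `error` level (see `"error" if strict else "warning"` above), which is what lets a CI gate fail the run instead of merely warning.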
evalvault/domain/services/benchmark_runner.py
@@ -414,12 +414,7 @@ class KoreanRAGBenchmarkRunner:
         try:
             # 형태소 분석 기반 검색
             if retriever:
-
-                results = retriever.search(
-                    query, top_k=recall_k, use_dense=retriever.has_embeddings
-                )
-            else:
-                results = retriever.search(query, top_k=recall_k)
+                results = retriever.search(query, top_k=recall_k)
             retrieved_doc_ids = [
                 resolve_doc_id(getattr(res, "doc_id", None), doc_ids, idx)
                 for idx, res in enumerate(results, start=1)
evalvault/domain/services/dataset_preprocessor.py
@@ -17,9 +17,22 @@ REFERENCE_REQUIRED_METRICS = {
 }
 
 _WHITESPACE_RE = re.compile(r"\s+")
+_PUNCT_ONLY_RE = re.compile(r"^[\W_]+$")
 _HANGUL_RE = re.compile(r"[\uac00-\ud7a3]")
 _LATIN_RE = re.compile(r"[A-Za-z]")
 
+_PLACEHOLDER_TEXT = {
+    "n/a",
+    "na",
+    "none",
+    "null",
+    "nil",
+    "unknown",
+    "tbd",
+    "todo",
+    "undefined",
+}
+
 
 @dataclass(frozen=True)
 class DatasetPreprocessConfig:

@@ -205,8 +218,18 @@ class DatasetPreprocessor:
         if self._config.trim_whitespace:
             text = text.replace("\u00a0", " ")
             text = _WHITESPACE_RE.sub(" ", text).strip()
+        if self._is_noise_text(text):
+            return ""
         return text
 
+    def _is_noise_text(self, text: str) -> bool:
+        if not text:
+            return True
+        if _PUNCT_ONLY_RE.fullmatch(text):
+            return True
+        lower_text = text.casefold()
+        return lower_text in _PLACEHOLDER_TEXT
+
     def _normalize_contexts(self, contexts: Any) -> tuple[list[str], dict[str, int]]:
         removed = 0
         deduped = 0

@@ -292,6 +315,9 @@ class DatasetPreprocessor:
             elif source == "context":
                 filled_from_context = 1
 
+        if reference:
+            reference = self._normalize_text(reference)
+
         if reference and self._config.max_reference_chars > 0:
             reference, did_truncate = self._truncate_text(
                 reference, self._config.max_reference_chars
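The preprocessor now drops placeholder and punctuation-only answers before truncation. A standalone sketch of the same check, with the regexes and placeholder set copied from the hunk above (folding the whitespace normalization into one helper is a simplification of the real `_normalize_text` flow):

```python
import re

_WHITESPACE_RE = re.compile(r"\s+")
_PUNCT_ONLY_RE = re.compile(r"^[\W_]+$")
_PLACEHOLDER_TEXT = {"n/a", "na", "none", "null", "nil", "unknown", "tbd", "todo", "undefined"}


def is_noise_text(text: str) -> bool:
    """Mirror of DatasetPreprocessor._is_noise_text, applied after whitespace trimming."""
    text = _WHITESPACE_RE.sub(" ", text.replace("\u00a0", " ")).strip()
    if not text:
        return True
    if _PUNCT_ONLY_RE.fullmatch(text):
        return True
    return text.casefold() in _PLACEHOLDER_TEXT


assert is_noise_text("  N/A ")        # placeholder token
assert is_noise_text("---")           # punctuation only
assert not is_noise_text("서울특별시")  # real content is kept
```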
evalvault/domain/services/difficulty_profile_reporter.py (new file)
@@ -0,0 +1,25 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from evalvault.ports.outbound.difficulty_profile_port import DifficultyProfileWriterPort
+
+
+class DifficultyProfileReporter:
+    def __init__(self, writer: DifficultyProfileWriterPort) -> None:
+        self._writer = writer
+
+    def write(
+        self,
+        *,
+        output_path: Path,
+        artifacts_dir: Path,
+        envelope: dict[str, object],
+        artifacts: dict[str, object],
+    ) -> dict[str, object]:
+        return self._writer.write_profile(
+            output_path=output_path,
+            artifacts_dir=artifacts_dir,
+            envelope=envelope,
+            artifacts=artifacts,
+        )
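The reporter is a thin pass-through to whatever implements `DifficultyProfileWriterPort` (defined in `evalvault/ports/outbound/difficulty_profile_port.py`, +15 lines, not shown in this diff). A sketch of a compatible test stub, inferred only from the keyword arguments visible above; the real port and the filesystem writer adapter may differ:

```python
# Hypothetical stand-in for DifficultyProfileWriterPort, for wiring the reporter in tests.
from pathlib import Path

from evalvault.domain.services.difficulty_profile_reporter import DifficultyProfileReporter


class InMemoryProfileWriter:
    def write_profile(
        self,
        *,
        output_path: Path,
        artifacts_dir: Path,
        envelope: dict[str, object],
        artifacts: dict[str, object],
    ) -> dict[str, object]:
        # A real adapter would persist the envelope and per-case artifacts to disk;
        # here we only echo back an index-like mapping.
        return {"dir": str(artifacts_dir), "output": str(output_path), "files": sorted(artifacts)}


reporter = DifficultyProfileReporter(InMemoryProfileWriter())
index = reporter.write(
    output_path=Path("out/difficulty_profile.json"),
    artifacts_dir=Path("out/artifacts"),
    envelope={"command": "profile-difficulty", "version": 1},
    artifacts={"breakdown": {}, "cases": []},
)
print(index["dir"])  # "out/artifacts" on POSIX paths
```

The profiling service below only reads `artifacts_index.get("dir")` from the returned mapping and stores the whole mapping under `envelope["artifacts"]`, so any dict-shaped index is enough for a unit test.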
evalvault/domain/services/difficulty_profiling_service.py (new file)
@@ -0,0 +1,304 @@
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Any
+
+from evalvault.domain.entities import EvaluationRun, MetricScore, TestCaseResult
+from evalvault.domain.services.difficulty_profile_reporter import DifficultyProfileReporter
+from evalvault.ports.outbound.storage_port import StoragePort
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True)
+class DifficultyProfileRequest:
+    dataset_name: str | None
+    run_id: str | None
+    limit_runs: int | None
+    metrics: tuple[str, ...] | None
+    bucket_count: int
+    min_samples: int
+    output_path: Path
+    artifacts_dir: Path
+    parallel: bool
+    concurrency: int | None
+
+
+@dataclass(frozen=True)
+class DifficultyCaseProfile:
+    run_id: str
+    test_case_id: str
+    metric_scores: dict[str, float]
+    avg_score: float
+    difficulty_score: float
+    bucket: str
+    passed: bool
+
+
+@dataclass(frozen=True)
+class DifficultyBucketSummary:
+    label: str
+    count: int
+    ratio: float
+    avg_score: float | None
+    pass_rate: float | None
+
+
+class DifficultyProfilingService:
+    def __init__(self, *, storage: StoragePort, reporter: DifficultyProfileReporter) -> None:
+        self._storage = storage
+        self._reporter = reporter
+
+    def profile(self, request: DifficultyProfileRequest) -> dict[str, Any]:
+        started_at = datetime.now(UTC)
+        logger.info(
+            "difficulty profiling started",
+            extra={
+                "dataset_name": request.dataset_name,
+                "run_id": request.run_id,
+                "bucket_count": request.bucket_count,
+                "parallel": request.parallel,
+            },
+        )
+
+        runs = self._load_runs(request)
+        metrics = self._resolve_metrics(request, runs)
+        cases = self._collect_cases(runs, metrics)
+
+        if len(cases) < request.min_samples:
+            logger.warning(
+                "difficulty profiling aborted: insufficient samples",
+                extra={"sample_count": len(cases), "min_samples": request.min_samples},
+            )
+            raise ValueError("insufficient history to build difficulty profile")
+
+        case_profiles, bucket_summaries = self._assign_buckets(
+            cases, bucket_count=request.bucket_count
+        )
+        breakdown = _build_breakdown(bucket_summaries)
+        failure_concentration = _build_failure_concentration(bucket_summaries)
+
+        data = {
+            "run_id": request.run_id,
+            "dataset_name": request.dataset_name,
+            "run_ids": sorted({case.run_id for case in case_profiles}),
+            "metrics": list(metrics),
+            "bucket_count": request.bucket_count,
+            "min_samples": request.min_samples,
+            "total_cases": len(case_profiles),
+            "dataset_difficulty_distribution": breakdown,
+            "accuracy_by_difficulty": _build_accuracy(bucket_summaries),
+            "failure_concentration": failure_concentration,
+            "buckets": [
+                {
+                    "label": bucket.label,
+                    "count": bucket.count,
+                    "ratio": bucket.ratio,
+                    "avg_score": bucket.avg_score,
+                    "pass_rate": bucket.pass_rate,
+                }
+                for bucket in bucket_summaries
+            ],
+        }
+
+        finished_at = datetime.now(UTC)
+        duration_ms = int((finished_at - started_at).total_seconds() * 1000)
+        envelope = {
+            "command": "profile-difficulty",
+            "version": 1,
+            "status": "ok",
+            "started_at": started_at.isoformat(),
+            "finished_at": finished_at.isoformat(),
+            "duration_ms": duration_ms,
+            "artifacts": {},
+            "data": data,
+        }
+
+        artifacts_payload = {
+            "breakdown": {
+                "run_id": request.run_id,
+                "dataset_difficulty_distribution": breakdown,
+                "accuracy_by_difficulty": _build_accuracy(bucket_summaries),
+                "failure_concentration": failure_concentration,
+            },
+            "cases": [
+                {
+                    "run_id": case.run_id,
+                    "test_case_id": case.test_case_id,
+                    "metric_scores": case.metric_scores,
+                    "avg_score": case.avg_score,
+                    "difficulty_score": case.difficulty_score,
+                    "bucket": case.bucket,
+                    "passed": case.passed,
+                }
+                for case in case_profiles
+            ],
+        }
+
+        artifacts_index = self._reporter.write(
+            output_path=request.output_path,
+            artifacts_dir=request.artifacts_dir,
+            envelope=envelope,
+            artifacts=artifacts_payload,
+        )
+        envelope["artifacts"] = artifacts_index
+        logger.info(
+            "difficulty profiling completed",
+            extra={"artifact_dir": artifacts_index.get("dir"), "case_count": len(case_profiles)},
+        )
+        return envelope
+
+    def _load_runs(self, request: DifficultyProfileRequest) -> list[EvaluationRun]:
+        if request.run_id:
+            run = self._storage.get_run(request.run_id)
+            return [run]
+        if not request.dataset_name:
+            raise ValueError("dataset_name or run_id is required")
+        limit = request.limit_runs or 50
+        runs = self._storage.list_runs(limit=limit, dataset_name=request.dataset_name)
+        if not runs:
+            raise ValueError("no runs found for dataset")
+        return runs
+
+    def _resolve_metrics(
+        self, request: DifficultyProfileRequest, runs: list[EvaluationRun]
+    ) -> tuple[str, ...]:
+        if request.metrics:
+            return request.metrics
+        metrics: set[str] = set()
+        for run in runs:
+            metrics.update(run.metrics_evaluated)
+        if not metrics:
+            raise ValueError("no metrics available for difficulty profiling")
+        return tuple(sorted(metrics))
+
+    def _collect_cases(
+        self, runs: list[EvaluationRun], metrics: tuple[str, ...]
+    ) -> list[tuple[str, TestCaseResult, dict[str, float], bool]]:
+        cases = []
+        for run in runs:
+            for result in run.results:
+                metric_scores, passed = _extract_metric_scores(result, metrics)
+                if not metric_scores:
+                    continue
+                cases.append((run.run_id, result, metric_scores, passed))
+        return cases
+
+    def _assign_buckets(
+        self, cases: list[tuple[str, TestCaseResult, dict[str, float], bool]], *, bucket_count: int
+    ) -> tuple[list[DifficultyCaseProfile], list[DifficultyBucketSummary]]:
+        sorted_cases = sorted(cases, key=lambda item: _difficulty_score(item[2]))
+        total_cases = len(sorted_cases)
+        if total_cases == 0:
+            return [], []
+        labels = _bucket_labels(bucket_count)
+
+        bucket_map: dict[str, list[DifficultyCaseProfile]] = {label: [] for label in labels}
+
+        for index, (run_id, result, metric_scores, passed) in enumerate(sorted_cases):
+            bucket_index = min(int(index / total_cases * bucket_count), bucket_count - 1)
+            label = labels[bucket_index]
+            avg_score = sum(metric_scores.values()) / len(metric_scores)
+            difficulty_score = _difficulty_score(metric_scores)
+            profile = DifficultyCaseProfile(
+                run_id=run_id,
+                test_case_id=result.test_case_id,
+                metric_scores=metric_scores,
+                avg_score=avg_score,
+                difficulty_score=difficulty_score,
+                bucket=label,
+                passed=passed,
+            )
+            bucket_map[label].append(profile)
+
+        bucket_summaries: list[DifficultyBucketSummary] = []
+        case_profiles: list[DifficultyCaseProfile] = []
+        for label in labels:
+            bucket_cases = bucket_map[label]
+            case_profiles.extend(bucket_cases)
+            count = len(bucket_cases)
+            ratio = count / total_cases if total_cases else 0.0
+            avg_score = _safe_average([case.avg_score for case in bucket_cases])
+            pass_rate = _safe_average([1.0 if case.passed else 0.0 for case in bucket_cases])
+            bucket_summaries.append(
+                DifficultyBucketSummary(
+                    label=label,
+                    count=count,
+                    ratio=ratio,
+                    avg_score=avg_score,
+                    pass_rate=pass_rate,
+                )
+            )
+        return case_profiles, bucket_summaries
+
+
+def _extract_metric_scores(
+    result: TestCaseResult, metrics: tuple[str, ...]
+) -> tuple[dict[str, float], bool]:
+    scores: dict[str, float] = {}
+    passed_all = True
+    for metric_name in metrics:
+        metric: MetricScore | None = result.get_metric(metric_name)
+        if metric is None:
+            continue
+        scores[metric_name] = float(metric.score)
+        passed_all = passed_all and metric.score >= metric.threshold
+    if not scores:
+        passed_all = False
+    return scores, passed_all
+
+
+def _difficulty_score(metric_scores: dict[str, float]) -> float:
+    if not metric_scores:
+        return 1.0
+    avg_score = sum(metric_scores.values()) / len(metric_scores)
+    score = 1.0 - avg_score
+    return min(max(score, 0.0), 1.0)
+
+
+def _safe_average(values: list[float]) -> float | None:
+    if not values:
+        return None
+    return sum(values) / len(values)
+
+
+def _bucket_labels(bucket_count: int) -> list[str]:
+    if bucket_count == 3:
+        return ["easy", "medium", "hard"]
+    return [f"bucket_{index}" for index in range(1, bucket_count + 1)]
+
+
+def _build_breakdown(buckets: list[DifficultyBucketSummary]) -> dict[str, float]:
+    return {bucket.label: bucket.ratio for bucket in buckets}
+
+
+def _build_accuracy(buckets: list[DifficultyBucketSummary]) -> dict[str, float | None]:
+    return {bucket.label: bucket.pass_rate for bucket in buckets}
+
+
+def _build_failure_concentration(buckets: list[DifficultyBucketSummary]) -> dict[str, Any]:
+    if not buckets:
+        return {
+            "primary_difficulty": None,
+            "primary_flags": [],
+            "actionable_insight": "난이도 분포 데이터가 없습니다.",
+        }
+    primary = max(buckets, key=lambda bucket: (1 - (bucket.pass_rate or 0.0), bucket.count))
+    flags: list[str] = []
+    if primary.pass_rate is not None and primary.pass_rate < 0.5:
+        flags.append("low_pass_rate")
+    if primary.avg_score is not None and primary.avg_score < 0.5:
+        flags.append("low_avg_score")
+    insight = "난이도 분포에 큰 편차가 없습니다."
+    if "low_pass_rate" in flags:
+        insight = "해당 난이도 구간에서 정답률이 낮습니다."
+    elif "low_avg_score" in flags:
+        insight = "메트릭 평균 점수가 낮습니다."
+    return {
+        "primary_difficulty": primary.label,
+        "primary_flags": flags,
+        "actionable_insight": insight,
+    }
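Difficulty here is just `1 - mean(metric scores)` clamped to [0, 1], and cases are split into rank quantiles (labelled `easy` / `medium` / `hard` when `bucket_count == 3`). A toy walk-through of that arithmetic, mirroring `_difficulty_score`, `_bucket_labels`, and the bucket-index formula in `_assign_buckets`:

```python
# Toy walk-through of the bucketing arithmetic used by DifficultyProfilingService:
# cases are sorted by difficulty and split into equal-width rank quantiles.
cases = {
    "tc-1": {"faithfulness": 0.95, "answer_relevancy": 0.90},
    "tc-2": {"faithfulness": 0.70, "answer_relevancy": 0.60},
    "tc-3": {"faithfulness": 0.20, "answer_relevancy": 0.30},
}


def difficulty(scores: dict[str, float]) -> float:
    avg = sum(scores.values()) / len(scores)
    return min(max(1.0 - avg, 0.0), 1.0)


labels = ["easy", "medium", "hard"]  # _bucket_labels(3)
ranked = sorted(cases, key=lambda case_id: difficulty(cases[case_id]))
for index, case_id in enumerate(ranked):
    bucket = labels[min(int(index / len(ranked) * len(labels)), len(labels) - 1)]
    print(case_id, round(difficulty(cases[case_id]), 3), bucket)
# tc-1 0.075 easy
# tc-2 0.35 medium
# tc-3 0.75 hard
```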
evalvault/domain/services/evaluator.py
@@ -330,6 +330,8 @@ class RagasEvaluator:
         self._active_llm_model = llm.get_model_name()
         self._active_llm = llm
         self._prompt_language = self._normalize_language_hint(language) if language else None
+        if self._prompt_language is None:
+            self._prompt_language = self._resolve_dataset_language(dataset)
         # Resolve thresholds: CLI > dataset > default(0.7)
         resolved_thresholds = {}
         for metric in metrics: