evalvault-1.64.0-py3-none-any.whl → evalvault-1.66.0-py3-none-any.whl
This diff compares the contents of the publicly released 1.64.0 and 1.66.0 wheels as they appear in their public registry; it reflects only the changes between those two package versions.
- evalvault/adapters/inbound/api/adapter.py +14 -0
- evalvault/adapters/inbound/api/main.py +14 -4
- evalvault/adapters/inbound/api/routers/chat.py +543 -0
- evalvault/adapters/inbound/cli/commands/__init__.py +14 -7
- evalvault/adapters/inbound/cli/commands/artifacts.py +107 -0
- evalvault/adapters/inbound/cli/commands/calibrate_judge.py +283 -0
- evalvault/adapters/inbound/cli/commands/compare.py +290 -0
- evalvault/adapters/inbound/cli/commands/history.py +13 -85
- evalvault/adapters/inbound/cli/commands/ops.py +110 -0
- evalvault/adapters/inbound/cli/commands/profile_difficulty.py +160 -0
- evalvault/adapters/inbound/cli/commands/regress.py +251 -0
- evalvault/adapters/inbound/cli/commands/run.py +14 -0
- evalvault/adapters/inbound/cli/commands/run_helpers.py +21 -2
- evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py +49 -0
- evalvault/adapters/outbound/artifact_fs.py +16 -0
- evalvault/adapters/outbound/filesystem/__init__.py +3 -0
- evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py +50 -0
- evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py +13 -0
- evalvault/adapters/outbound/judge_calibration_adapter.py +36 -0
- evalvault/adapters/outbound/judge_calibration_reporter.py +57 -0
- evalvault/adapters/outbound/report/llm_report_generator.py +13 -1
- evalvault/adapters/outbound/storage/base_sql.py +41 -1
- evalvault/adapters/outbound/tracker/langfuse_adapter.py +13 -7
- evalvault/adapters/outbound/tracker/mlflow_adapter.py +5 -0
- evalvault/adapters/outbound/tracker/phoenix_adapter.py +68 -14
- evalvault/config/settings.py +21 -0
- evalvault/domain/entities/__init__.py +10 -0
- evalvault/domain/entities/judge_calibration.py +50 -0
- evalvault/domain/entities/prompt.py +1 -1
- evalvault/domain/entities/stage.py +11 -3
- evalvault/domain/metrics/__init__.py +8 -0
- evalvault/domain/metrics/registry.py +39 -3
- evalvault/domain/metrics/summary_accuracy.py +189 -0
- evalvault/domain/metrics/summary_needs_followup.py +45 -0
- evalvault/domain/metrics/summary_non_definitive.py +41 -0
- evalvault/domain/metrics/summary_risk_coverage.py +45 -0
- evalvault/domain/services/artifact_lint_service.py +268 -0
- evalvault/domain/services/benchmark_runner.py +1 -6
- evalvault/domain/services/custom_metric_snapshot.py +233 -0
- evalvault/domain/services/dataset_preprocessor.py +26 -0
- evalvault/domain/services/difficulty_profile_reporter.py +25 -0
- evalvault/domain/services/difficulty_profiling_service.py +304 -0
- evalvault/domain/services/evaluator.py +282 -27
- evalvault/domain/services/judge_calibration_service.py +495 -0
- evalvault/domain/services/ops_snapshot_service.py +159 -0
- evalvault/domain/services/prompt_registry.py +39 -10
- evalvault/domain/services/regression_gate_service.py +199 -0
- evalvault/domain/services/run_comparison_service.py +159 -0
- evalvault/domain/services/stage_event_builder.py +6 -1
- evalvault/domain/services/stage_metric_service.py +83 -18
- evalvault/domain/services/threshold_profiles.py +4 -0
- evalvault/domain/services/visual_space_service.py +79 -4
- evalvault/ports/outbound/__init__.py +4 -0
- evalvault/ports/outbound/artifact_fs_port.py +12 -0
- evalvault/ports/outbound/comparison_pipeline_port.py +22 -0
- evalvault/ports/outbound/difficulty_profile_port.py +15 -0
- evalvault/ports/outbound/judge_calibration_port.py +22 -0
- evalvault/ports/outbound/ops_snapshot_port.py +8 -0
- {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/METADATA +25 -1
- {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/RECORD +63 -31
- {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/WHEEL +0 -0
- {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/licenses/LICENSE.md +0 -0
--- /dev/null
+++ b/evalvault/domain/services/artifact_lint_service.py
@@ -0,0 +1,268 @@
+from __future__ import annotations
+
+import json
+import logging
+from dataclasses import dataclass
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Literal
+
+from evalvault.ports.outbound.artifact_fs_port import ArtifactFileSystemPort
+
+logger = logging.getLogger(__name__)
+
+
+LintLevel = Literal["error", "warning"]
+LintStatus = Literal["ok", "warning", "error"]
+
+
+@dataclass(frozen=True)
+class ArtifactLintIssue:
+    level: LintLevel
+    code: str
+    message: str
+    path: str | None = None
+
+
+@dataclass(frozen=True)
+class ArtifactLintSummary:
+    status: LintStatus
+    issues: list[ArtifactLintIssue]
+    artifacts_dir: Path
+    index_path: Path
+    started_at: datetime
+    finished_at: datetime
+    duration_ms: int
+    strict: bool
+
+
+class ArtifactLintService:
+    def __init__(self, fs: ArtifactFileSystemPort) -> None:
+        self._fs = fs
+
+    def lint(self, artifacts_dir: Path, *, strict: bool = False) -> ArtifactLintSummary:
+        started_at = datetime.now(UTC)
+        issues: list[ArtifactLintIssue] = []
+        index_path = artifacts_dir / "index.json"
+        logger.info("Artifact lint started: %s", artifacts_dir)
+
+        try:
+            self._validate_dir(artifacts_dir, issues)
+            if not self._fs.exists(index_path):
+                issues.append(
+                    ArtifactLintIssue(
+                        "error",
+                        "artifacts.index.missing",
+                        "index.json is missing.",
+                        path=str(index_path),
+                    )
+                )
+            elif self._fs.exists(artifacts_dir) and self._fs.is_dir(artifacts_dir):
+                index_payload = self._load_index(index_path, issues)
+                if index_payload is not None:
+                    self._validate_index(
+                        index_payload,
+                        artifacts_dir,
+                        issues,
+                        strict=strict,
+                    )
+        except Exception as exc:
+            logger.exception("Artifact lint failed: %s", artifacts_dir)
+            issues.append(
+                ArtifactLintIssue(
+                    "error",
+                    "artifacts.lint.exception",
+                    f"Unexpected error: {exc}",
+                )
+            )
+
+        finished_at = datetime.now(UTC)
+        duration_ms = int((finished_at - started_at).total_seconds() * 1000)
+        status = _resolve_status(issues)
+        logger.info("Artifact lint finished: %s (%s)", artifacts_dir, status)
+        return ArtifactLintSummary(
+            status=status,
+            issues=issues,
+            artifacts_dir=artifacts_dir,
+            index_path=index_path,
+            started_at=started_at,
+            finished_at=finished_at,
+            duration_ms=duration_ms,
+            strict=strict,
+        )
+
+    def _validate_dir(self, artifacts_dir: Path, issues: list[ArtifactLintIssue]) -> None:
+        if not self._fs.exists(artifacts_dir):
+            issues.append(
+                ArtifactLintIssue(
+                    "error",
+                    "artifacts.dir.missing",
+                    "Artifacts directory is missing.",
+                    path=str(artifacts_dir),
+                )
+            )
+            return
+        if not self._fs.is_dir(artifacts_dir):
+            issues.append(
+                ArtifactLintIssue(
+                    "error",
+                    "artifacts.dir.not_directory",
+                    "Artifacts path is not a directory.",
+                    path=str(artifacts_dir),
+                )
+            )
+
+    def _load_index(
+        self,
+        index_path: Path,
+        issues: list[ArtifactLintIssue],
+    ) -> dict[str, object] | None:
+        try:
+            payload = json.loads(self._fs.read_text(index_path))
+        except json.JSONDecodeError as exc:
+            issues.append(
+                ArtifactLintIssue(
+                    "error",
+                    "artifacts.index.invalid_json",
+                    f"index.json parse failed: {exc}",
+                    path=str(index_path),
+                )
+            )
+            return None
+        except OSError as exc:
+            issues.append(
+                ArtifactLintIssue(
+                    "error",
+                    "artifacts.index.read_failed",
+                    f"index.json read failed: {exc}",
+                    path=str(index_path),
+                )
+            )
+            return None
+
+        if not isinstance(payload, dict):
+            issues.append(
+                ArtifactLintIssue(
+                    "error",
+                    "artifacts.index.invalid_schema",
+                    "index.json root must be an object.",
+                    path=str(index_path),
+                )
+            )
+            return None
+        return payload
+
+    def _validate_index(
+        self,
+        payload: dict[str, object],
+        artifacts_dir: Path,
+        issues: list[ArtifactLintIssue],
+        *,
+        strict: bool,
+    ) -> None:
+        pipeline_id = payload.get("pipeline_id")
+        if not isinstance(pipeline_id, str) or not pipeline_id.strip():
+            issues.append(
+                ArtifactLintIssue(
+                    "error",
+                    "artifacts.index.pipeline_id.missing",
+                    "pipeline_id is missing.",
+                )
+            )
+
+        nodes = payload.get("nodes")
+        if not isinstance(nodes, list):
+            issues.append(
+                ArtifactLintIssue(
+                    "error",
+                    "artifacts.index.nodes.invalid",
+                    "nodes list is missing or invalid.",
+                )
+            )
+            return
+
+        for idx, node in enumerate(nodes, start=1):
+            if not isinstance(node, dict):
+                issues.append(
+                    ArtifactLintIssue(
+                        "error",
+                        "artifacts.index.node.invalid",
+                        f"nodes[{idx}] entry must be an object.",
+                    )
+                )
+                continue
+            node_id = node.get("node_id")
+            if not isinstance(node_id, str) or not node_id.strip():
+                issues.append(
+                    ArtifactLintIssue(
+                        "error",
+                        "artifacts.index.node_id.missing",
+                        f"nodes[{idx}] node_id is missing.",
+                    )
+                )
+            path_value = node.get("path")
+            self._validate_path(
+                path_value,
+                artifacts_dir,
+                issues,
+                strict=strict,
+                code="artifacts.index.node.path.missing",
+                message=f"nodes[{idx}] path is missing.",
+            )
+
+        final_output = payload.get("final_output_path")
+        if final_output:
+            self._validate_path(
+                final_output,
+                artifacts_dir,
+                issues,
+                strict=strict,
+                code="artifacts.index.final_output.missing",
+                message="final_output_path is missing.",
+            )
+
+    def _validate_path(
+        self,
+        path_value: object,
+        artifacts_dir: Path,
+        issues: list[ArtifactLintIssue],
+        *,
+        strict: bool,
+        code: str,
+        message: str,
+    ) -> None:
+        if not isinstance(path_value, str) or not path_value.strip():
+            issues.append(
+                ArtifactLintIssue(
+                    "error",
+                    code,
+                    message,
+                )
+            )
+            return
+
+        resolved = _resolve_artifact_path(artifacts_dir, Path(path_value))
+        if self._fs.exists(resolved):
+            return
+        issues.append(
+            ArtifactLintIssue(
+                "error" if strict else "warning",
+                code,
+                "Artifact file is missing.",
+                path=str(resolved),
+            )
+        )
+
+
+def _resolve_artifact_path(base_dir: Path, candidate: Path) -> Path:
+    if candidate.is_absolute():
+        return candidate
+    return base_dir / candidate
+
+
+def _resolve_status(issues: list[ArtifactLintIssue]) -> LintStatus:
+    if any(issue.level == "error" for issue in issues):
+        return "error"
+    if any(issue.level == "warning" for issue in issues):
+        return "warning"
+    return "ok"
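For orientation, here is a minimal usage sketch of the new lint service. It is not part of the package: InMemoryArtifactFS is a hypothetical stub standing in for an ArtifactFileSystemPort adapter, implementing only the three calls the service actually makes (exists, is_dir, read_text), and POSIX path separators are assumed for the dictionary keys.

from pathlib import Path

from evalvault.domain.services.artifact_lint_service import ArtifactLintService

class InMemoryArtifactFS:
    """Hypothetical ArtifactFileSystemPort stub covering only exists/is_dir/read_text."""

    def __init__(self, files: dict[str, str], dirs: set[str]) -> None:
        self._files = files
        self._dirs = dirs

    def exists(self, path: Path) -> bool:
        return str(path) in self._files or str(path) in self._dirs

    def is_dir(self, path: Path) -> bool:
        return str(path) in self._dirs

    def read_text(self, path: Path) -> str:
        return self._files[str(path)]

fs = InMemoryArtifactFS(
    files={"artifacts/index.json": '{"pipeline_id": "p-1", "nodes": []}'},
    dirs={"artifacts"},
)
summary = ArtifactLintService(fs).lint(Path("artifacts"), strict=True)
print(summary.status)  # "ok": pipeline_id is set and an empty nodes list is valid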
--- a/evalvault/domain/services/benchmark_runner.py
+++ b/evalvault/domain/services/benchmark_runner.py
@@ -414,12 +414,7 @@ class KoreanRAGBenchmarkRunner:
         try:
             # 형태소 분석 기반 검색 (morphological-analysis-based retrieval)
             if retriever:
-
-                results = retriever.search(
-                    query, top_k=recall_k, use_dense=retriever.has_embeddings
-                )
-            else:
-                results = retriever.search(query, top_k=recall_k)
+                results = retriever.search(query, top_k=recall_k)
             retrieved_doc_ids = [
                 resolve_doc_id(getattr(res, "doc_id", None), doc_ids, idx)
                 for idx, res in enumerate(results, start=1)
--- /dev/null
+++ b/evalvault/domain/services/custom_metric_snapshot.py
@@ -0,0 +1,233 @@
+from __future__ import annotations
+
+import hashlib
+import inspect
+from collections.abc import Iterable
+from pathlib import Path
+from typing import Any
+
+from evalvault.domain.metrics.registry import get_metric_spec_map
+
+SCHEMA_VERSION = 1
+
+_CUSTOM_METRIC_DETAILS: dict[str, dict[str, Any]] = {
+    "entity_preservation": {
+        "evaluation_method": "rule-based",
+        "inputs": ["answer", "contexts"],
+        "output": "0.0-1.0 (preserved_entities / context_entities)",
+        "evaluation_process": "Extract numeric/keyword entities from contexts and measure how many appear in the summary.",
+        "rules": {
+            "numeric_entities": ["percent", "currency", "duration", "date"],
+            "keywords_ko": [
+                "면책",
+                "제외",
+                "단서",
+                "다만",
+                "조건",
+                "자기부담",
+                "한도",
+                "감액",
+            ],
+            "keywords_en": [
+                "exclusion",
+                "deductible",
+                "limit",
+                "cap",
+                "copay",
+                "coinsurance",
+            ],
+        },
+        "notes": "Insurance-risk oriented entity coverage check.",
+    },
+    "insurance_term_accuracy": {
+        "evaluation_method": "rule-based",
+        "inputs": ["answer", "contexts"],
+        "output": "0.0-1.0 (verified_terms / answer_terms)",
+        "evaluation_process": "Detect insurance terms in the answer and verify their presence in contexts.",
+        "rules": {"terms_dictionary": "terms_dictionary.json"},
+        "notes": "Insurance glossary matching with canonical/variant terms.",
+    },
+    "summary_accuracy": {
+        "evaluation_method": "rule-based",
+        "inputs": ["answer", "contexts"],
+        "output": "0.0-1.0 (supported_summary_entities / summary_entities)",
+        "evaluation_process": "Extract numeric/keyword entities from summary and verify their presence in contexts.",
+        "rules": {
+            "numeric_entities": ["percent", "currency", "duration", "date"],
+            "keywords_ko": ["면책", "제외", "단서", "다만", "조건", "자기부담", "한도", "감액"],
+            "keywords_en": ["exclusion", "deductible", "limit", "cap", "waiting period"],
+        },
+        "notes": "Penalizes summary entities not grounded in contexts.",
+    },
+    "summary_risk_coverage": {
+        "evaluation_method": "rule-based",
+        "inputs": ["answer", "metadata.summary_tags"],
+        "output": "0.0-1.0 (covered_tags / expected_tags)",
+        "evaluation_process": "Check if summary mentions expected insurance risk tags.",
+        "rules": {
+            "exclusion": ["면책", "보장 제외", "지급 불가", "exclusion"],
+            "deductible": ["자기부담", "본인부담금", "deductible", "copay"],
+            "limit": ["한도", "상한", "최대", "limit", "cap"],
+            "waiting_period": ["면책기간", "대기기간", "waiting period"],
+            "condition": ["조건", "단서", "다만", "condition"],
+            "documents_required": ["서류", "진단서", "영수증", "documents"],
+            "needs_followup": ["확인 필요", "추가 확인", "담당자 확인", "재문의", "follow up"],
+        },
+        "notes": "Uses metadata summary_tags to define expected coverage.",
+    },
+    "summary_non_definitive": {
+        "evaluation_method": "rule-based",
+        "inputs": ["answer"],
+        "output": "1.0 if definitive claims absent else 0.0",
+        "evaluation_process": "Detect definitive expressions that increase liability risk.",
+        "rules": {
+            "patterns_ko": ["무조건", "반드시", "100%", "전액 지급", "확실히", "분명히", "절대"],
+            "patterns_en": [
+                "always",
+                "guaranteed",
+                "definitely",
+                "certainly",
+                "absolutely",
+                "100%",
+            ],
+        },
+        "notes": "Higher is safer; penalizes absolute guarantees.",
+    },
+    "summary_needs_followup": {
+        "evaluation_method": "rule-based",
+        "inputs": ["answer", "metadata.summary_tags"],
+        "output": "1.0 if follow-up guidance matches expected need",
+        "evaluation_process": "Check follow-up guidance when needs_followup tag exists.",
+        "rules": {
+            "followup_keywords": [
+                "확인 필요",
+                "추가 확인",
+                "담당자 확인",
+                "재문의",
+                "추가 문의",
+                "follow up",
+            ]
+        },
+        "notes": "Requires tags to avoid false penalties.",
+    },
+    "no_answer_accuracy": {
+        "evaluation_method": "rule-based",
+        "inputs": ["answer", "ground_truth"],
+        "output": "1.0 if abstention behavior matches, else 0.0",
+        "evaluation_process": "Detect abstention patterns in answer and ground_truth and compare behavior.",
+        "rules": {"patterns": "Korean/English regex patterns"},
+        "notes": "Hallucination/abstention behavior check.",
+    },
+    "exact_match": {
+        "evaluation_method": "string-match",
+        "inputs": ["answer", "ground_truth"],
+        "output": "1.0 exact match else 0.0",
+        "evaluation_process": "Normalize text and compare exact match with optional strict number matching.",
+        "rules": {"normalize": True, "number_strict": True},
+        "notes": "Token/number strict matching for factual answers.",
+    },
+    "f1_score": {
+        "evaluation_method": "token-overlap",
+        "inputs": ["answer", "ground_truth"],
+        "output": "0.0-1.0 (weighted F1)",
+        "evaluation_process": "Tokenize, compute weighted precision/recall/F1 with number emphasis.",
+        "rules": {"number_weight": 2.0},
+        "notes": "Token-level overlap with numeric weighting.",
+    },
+    "mrr": {
+        "evaluation_method": "retrieval-rank",
+        "inputs": ["ground_truth", "contexts"],
+        "output": "0.0-1.0 (1/rank of first relevant context)",
+        "evaluation_process": "Compute relevance by token overlap and take reciprocal rank of first hit.",
+        "rules": {"relevance_threshold": 0.3},
+        "notes": "Ranking quality of retrieved contexts.",
+    },
+    "ndcg": {
+        "evaluation_method": "retrieval-rank",
+        "inputs": ["ground_truth", "contexts"],
+        "output": "0.0-1.0 (NDCG@K)",
+        "evaluation_process": "Compute graded relevance per context and calculate NDCG.",
+        "rules": {"k": 10, "use_graded": True},
+        "notes": "Ranking quality across all relevant contexts.",
+    },
+    "hit_rate": {
+        "evaluation_method": "retrieval-rank",
+        "inputs": ["ground_truth", "contexts"],
+        "output": "1.0 if any relevant context in top K else 0.0",
+        "evaluation_process": "Check whether top-K contexts contain a relevant hit.",
+        "rules": {"k": 10, "relevance_threshold": 0.3},
+        "notes": "Recall@K style coverage check.",
+    },
+    "confidence_score": {
+        "evaluation_method": "rule-based",
+        "inputs": ["answer", "ground_truth", "contexts"],
+        "output": "0.0-1.0 (weighted confidence)",
+        "evaluation_process": "Combine context coverage, answer specificity, and consistency scores.",
+        "rules": {"coverage": 0.4, "specificity": 0.3, "consistency": 0.3},
+        "notes": "Heuristic confidence signal for human escalation.",
+    },
+    "contextual_relevancy": {
+        "evaluation_method": "token-overlap",
+        "inputs": ["question", "contexts"],
+        "output": "0.0-1.0 (avg relevancy)",
+        "evaluation_process": "Measure question-context token overlap and average across contexts.",
+        "rules": {"relevance_threshold": 0.35},
+        "notes": "Reference-free context relevance check.",
+    },
+}
+
+
+def _hash_file(path: str | Path | None) -> str | None:
+    if not path:
+        return None
+    file_path = Path(path)
+    if not file_path.exists():
+        return None
+    payload = file_path.read_bytes()
+    return hashlib.sha256(payload).hexdigest()
+
+
+def _resolve_source_path(metric_class: type[Any]) -> str | None:
+    try:
+        source = inspect.getsourcefile(metric_class)
+    except TypeError:
+        return None
+    if not source:
+        return None
+    return str(Path(source).resolve())
+
+
+def build_custom_metric_snapshot(
+    metric_classes: dict[str, type[Any]],
+    metrics: Iterable[str],
+) -> dict[str, Any] | None:
+    custom_names = [name for name in metrics if name in metric_classes]
+    if not custom_names:
+        return None
+
+    spec_map = get_metric_spec_map()
+    rows: list[dict[str, Any]] = []
+    for metric_name in custom_names:
+        metric_class = metric_classes.get(metric_name)
+        if metric_class is None:
+            continue
+        source_path = _resolve_source_path(metric_class)
+        details = _CUSTOM_METRIC_DETAILS.get(metric_name, {})
+        spec = spec_map.get(metric_name)
+        rows.append(
+            {
+                "metric_name": metric_name,
+                "source": "custom",
+                "description": spec.description if spec else None,
+                "evaluation_method": details.get("evaluation_method"),
+                "inputs": details.get("inputs"),
+                "output": details.get("output"),
+                "evaluation_process": details.get("evaluation_process"),
+                "rules": details.get("rules"),
+                "notes": details.get("notes"),
+                "implementation_path": source_path,
+                "implementation_hash": _hash_file(source_path),
+            }
+        )
+
+    return {"schema_version": SCHEMA_VERSION, "metrics": rows}
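A sketch of how the snapshot builder might be driven, assuming evalvault is importable. DummyExactMatch is a hypothetical placeholder for a real registered metric class; names without a matching class in metric_classes are silently skipped.

from evalvault.domain.services.custom_metric_snapshot import build_custom_metric_snapshot

class DummyExactMatch:  # hypothetical stand-in for the package's metric class
    pass

snapshot = build_custom_metric_snapshot(
    metric_classes={"exact_match": DummyExactMatch},
    metrics=["exact_match", "faithfulness"],  # "faithfulness" has no custom class, so it is skipped
)
# Expected shape: {"schema_version": 1, "metrics": [{"metric_name": "exact_match",
#   "source": "custom", ..., "implementation_path": "<file>", "implementation_hash": "<sha256>"}]}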
--- a/evalvault/domain/services/dataset_preprocessor.py
+++ b/evalvault/domain/services/dataset_preprocessor.py
@@ -17,9 +17,22 @@ REFERENCE_REQUIRED_METRICS = {
 }
 
 _WHITESPACE_RE = re.compile(r"\s+")
+_PUNCT_ONLY_RE = re.compile(r"^[\W_]+$")
 _HANGUL_RE = re.compile(r"[\uac00-\ud7a3]")
 _LATIN_RE = re.compile(r"[A-Za-z]")
 
+_PLACEHOLDER_TEXT = {
+    "n/a",
+    "na",
+    "none",
+    "null",
+    "nil",
+    "unknown",
+    "tbd",
+    "todo",
+    "undefined",
+}
+
 
 @dataclass(frozen=True)
 class DatasetPreprocessConfig:
@@ -205,8 +218,18 @@ class DatasetPreprocessor:
         if self._config.trim_whitespace:
             text = text.replace("\u00a0", " ")
             text = _WHITESPACE_RE.sub(" ", text).strip()
+        if self._is_noise_text(text):
+            return ""
         return text
 
+    def _is_noise_text(self, text: str) -> bool:
+        if not text:
+            return True
+        if _PUNCT_ONLY_RE.fullmatch(text):
+            return True
+        lower_text = text.casefold()
+        return lower_text in _PLACEHOLDER_TEXT
+
     def _normalize_contexts(self, contexts: Any) -> tuple[list[str], dict[str, int]]:
         removed = 0
         deduped = 0
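The new noise filter can be exercised in isolation. The regex and placeholder set below are copied from the diff; wrapping them in a standalone function is purely for illustration.

import re

_PUNCT_ONLY_RE = re.compile(r"^[\W_]+$")
_PLACEHOLDER_TEXT = {"n/a", "na", "none", "null", "nil", "unknown", "tbd", "todo", "undefined"}

def is_noise_text(text: str) -> bool:
    # Empty strings, punctuation/underscore-only strings, and case-insensitive
    # placeholder tokens all count as noise and get normalized to "".
    if not text:
        return True
    if _PUNCT_ONLY_RE.fullmatch(text):
        return True
    return text.casefold() in _PLACEHOLDER_TEXT

assert is_noise_text("---")            # punctuation only
assert is_noise_text("N/A")            # casefolds to a placeholder token
assert not is_noise_text("한도 100만원")  # real content passes through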
@@ -292,6 +315,9 @@ class DatasetPreprocessor:
         elif source == "context":
             filled_from_context = 1
 
+        if reference:
+            reference = self._normalize_text(reference)
+
         if reference and self._config.max_reference_chars > 0:
             reference, did_truncate = self._truncate_text(
                 reference, self._config.max_reference_chars
--- /dev/null
+++ b/evalvault/domain/services/difficulty_profile_reporter.py
@@ -0,0 +1,25 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from evalvault.ports.outbound.difficulty_profile_port import DifficultyProfileWriterPort
+
+
+class DifficultyProfileReporter:
+    def __init__(self, writer: DifficultyProfileWriterPort) -> None:
+        self._writer = writer
+
+    def write(
+        self,
+        *,
+        output_path: Path,
+        artifacts_dir: Path,
+        envelope: dict[str, object],
+        artifacts: dict[str, object],
+    ) -> dict[str, object]:
+        return self._writer.write_profile(
+            output_path=output_path,
+            artifacts_dir=artifacts_dir,
+            envelope=envelope,
+            artifacts=artifacts,
+        )
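The reporter is a thin delegation layer over the writer port. A usage sketch with a hypothetical stub adapter follows; RecordingWriter is not part of the package, and a real adapter (such as the new filesystem difficulty_profile_writer) would persist the profile instead.

from pathlib import Path

from evalvault.domain.services.difficulty_profile_reporter import DifficultyProfileReporter

class RecordingWriter:
    """Hypothetical DifficultyProfileWriterPort stub; only write_profile is required."""

    def write_profile(self, *, output_path, artifacts_dir, envelope, artifacts):
        # A real adapter would write the profile to disk; this stub echoes a payload.
        return {"output_path": str(output_path), "artifact_count": len(artifacts)}

reporter = DifficultyProfileReporter(RecordingWriter())
result = reporter.write(
    output_path=Path("out/difficulty_profile.json"),
    artifacts_dir=Path("out/artifacts"),
    envelope={"run_id": "demo"},
    artifacts={"summary": {}},
)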