evalvault 1.64.0__py3-none-any.whl → 1.66.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. evalvault/adapters/inbound/api/adapter.py +14 -0
  2. evalvault/adapters/inbound/api/main.py +14 -4
  3. evalvault/adapters/inbound/api/routers/chat.py +543 -0
  4. evalvault/adapters/inbound/cli/commands/__init__.py +14 -7
  5. evalvault/adapters/inbound/cli/commands/artifacts.py +107 -0
  6. evalvault/adapters/inbound/cli/commands/calibrate_judge.py +283 -0
  7. evalvault/adapters/inbound/cli/commands/compare.py +290 -0
  8. evalvault/adapters/inbound/cli/commands/history.py +13 -85
  9. evalvault/adapters/inbound/cli/commands/ops.py +110 -0
  10. evalvault/adapters/inbound/cli/commands/profile_difficulty.py +160 -0
  11. evalvault/adapters/inbound/cli/commands/regress.py +251 -0
  12. evalvault/adapters/inbound/cli/commands/run.py +14 -0
  13. evalvault/adapters/inbound/cli/commands/run_helpers.py +21 -2
  14. evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py +49 -0
  15. evalvault/adapters/outbound/artifact_fs.py +16 -0
  16. evalvault/adapters/outbound/filesystem/__init__.py +3 -0
  17. evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py +50 -0
  18. evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py +13 -0
  19. evalvault/adapters/outbound/judge_calibration_adapter.py +36 -0
  20. evalvault/adapters/outbound/judge_calibration_reporter.py +57 -0
  21. evalvault/adapters/outbound/report/llm_report_generator.py +13 -1
  22. evalvault/adapters/outbound/storage/base_sql.py +41 -1
  23. evalvault/adapters/outbound/tracker/langfuse_adapter.py +13 -7
  24. evalvault/adapters/outbound/tracker/mlflow_adapter.py +5 -0
  25. evalvault/adapters/outbound/tracker/phoenix_adapter.py +68 -14
  26. evalvault/config/settings.py +21 -0
  27. evalvault/domain/entities/__init__.py +10 -0
  28. evalvault/domain/entities/judge_calibration.py +50 -0
  29. evalvault/domain/entities/prompt.py +1 -1
  30. evalvault/domain/entities/stage.py +11 -3
  31. evalvault/domain/metrics/__init__.py +8 -0
  32. evalvault/domain/metrics/registry.py +39 -3
  33. evalvault/domain/metrics/summary_accuracy.py +189 -0
  34. evalvault/domain/metrics/summary_needs_followup.py +45 -0
  35. evalvault/domain/metrics/summary_non_definitive.py +41 -0
  36. evalvault/domain/metrics/summary_risk_coverage.py +45 -0
  37. evalvault/domain/services/artifact_lint_service.py +268 -0
  38. evalvault/domain/services/benchmark_runner.py +1 -6
  39. evalvault/domain/services/custom_metric_snapshot.py +233 -0
  40. evalvault/domain/services/dataset_preprocessor.py +26 -0
  41. evalvault/domain/services/difficulty_profile_reporter.py +25 -0
  42. evalvault/domain/services/difficulty_profiling_service.py +304 -0
  43. evalvault/domain/services/evaluator.py +282 -27
  44. evalvault/domain/services/judge_calibration_service.py +495 -0
  45. evalvault/domain/services/ops_snapshot_service.py +159 -0
  46. evalvault/domain/services/prompt_registry.py +39 -10
  47. evalvault/domain/services/regression_gate_service.py +199 -0
  48. evalvault/domain/services/run_comparison_service.py +159 -0
  49. evalvault/domain/services/stage_event_builder.py +6 -1
  50. evalvault/domain/services/stage_metric_service.py +83 -18
  51. evalvault/domain/services/threshold_profiles.py +4 -0
  52. evalvault/domain/services/visual_space_service.py +79 -4
  53. evalvault/ports/outbound/__init__.py +4 -0
  54. evalvault/ports/outbound/artifact_fs_port.py +12 -0
  55. evalvault/ports/outbound/comparison_pipeline_port.py +22 -0
  56. evalvault/ports/outbound/difficulty_profile_port.py +15 -0
  57. evalvault/ports/outbound/judge_calibration_port.py +22 -0
  58. evalvault/ports/outbound/ops_snapshot_port.py +8 -0
  59. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/METADATA +25 -1
  60. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/RECORD +63 -31
  61. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/WHEEL +0 -0
  62. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/entry_points.txt +0 -0
  63. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -0,0 +1,268 @@
+ from __future__ import annotations
+
+ import json
+ import logging
+ from dataclasses import dataclass
+ from datetime import UTC, datetime
+ from pathlib import Path
+ from typing import Literal
+
+ from evalvault.ports.outbound.artifact_fs_port import ArtifactFileSystemPort
+
+ logger = logging.getLogger(__name__)
+
+
+ LintLevel = Literal["error", "warning"]
+ LintStatus = Literal["ok", "warning", "error"]
+
+
+ @dataclass(frozen=True)
+ class ArtifactLintIssue:
+     level: LintLevel
+     code: str
+     message: str
+     path: str | None = None
+
+
+ @dataclass(frozen=True)
+ class ArtifactLintSummary:
+     status: LintStatus
+     issues: list[ArtifactLintIssue]
+     artifacts_dir: Path
+     index_path: Path
+     started_at: datetime
+     finished_at: datetime
+     duration_ms: int
+     strict: bool
+
+
+ class ArtifactLintService:
+     def __init__(self, fs: ArtifactFileSystemPort) -> None:
+         self._fs = fs
+
+     def lint(self, artifacts_dir: Path, *, strict: bool = False) -> ArtifactLintSummary:
+         started_at = datetime.now(UTC)
+         issues: list[ArtifactLintIssue] = []
+         index_path = artifacts_dir / "index.json"
+         logger.info("Artifact lint started: %s", artifacts_dir)
+
+         try:
+             self._validate_dir(artifacts_dir, issues)
+             if not self._fs.exists(index_path):
+                 issues.append(
+                     ArtifactLintIssue(
+                         "error",
+                         "artifacts.index.missing",
+                         "index.json is missing.",
+                         path=str(index_path),
+                     )
+                 )
+             elif self._fs.exists(artifacts_dir) and self._fs.is_dir(artifacts_dir):
+                 index_payload = self._load_index(index_path, issues)
+                 if index_payload is not None:
+                     self._validate_index(
+                         index_payload,
+                         artifacts_dir,
+                         issues,
+                         strict=strict,
+                     )
+         except Exception as exc:
+             logger.exception("Artifact lint failed: %s", artifacts_dir)
+             issues.append(
+                 ArtifactLintIssue(
+                     "error",
+                     "artifacts.lint.exception",
+                     f"Unexpected error: {exc}",
+                 )
+             )
+
+         finished_at = datetime.now(UTC)
+         duration_ms = int((finished_at - started_at).total_seconds() * 1000)
+         status = _resolve_status(issues)
+         logger.info("Artifact lint finished: %s (%s)", artifacts_dir, status)
+         return ArtifactLintSummary(
+             status=status,
+             issues=issues,
+             artifacts_dir=artifacts_dir,
+             index_path=index_path,
+             started_at=started_at,
+             finished_at=finished_at,
+             duration_ms=duration_ms,
+             strict=strict,
+         )
+
+     def _validate_dir(self, artifacts_dir: Path, issues: list[ArtifactLintIssue]) -> None:
+         if not self._fs.exists(artifacts_dir):
+             issues.append(
+                 ArtifactLintIssue(
+                     "error",
+                     "artifacts.dir.missing",
+                     "Artifacts directory is missing.",
+                     path=str(artifacts_dir),
+                 )
+             )
+             return
+         if not self._fs.is_dir(artifacts_dir):
+             issues.append(
+                 ArtifactLintIssue(
+                     "error",
+                     "artifacts.dir.not_directory",
+                     "Artifacts path is not a directory.",
+                     path=str(artifacts_dir),
+                 )
+             )
+
+     def _load_index(
+         self,
+         index_path: Path,
+         issues: list[ArtifactLintIssue],
+     ) -> dict[str, object] | None:
+         try:
+             payload = json.loads(self._fs.read_text(index_path))
+         except json.JSONDecodeError as exc:
+             issues.append(
+                 ArtifactLintIssue(
+                     "error",
+                     "artifacts.index.invalid_json",
+                     f"index.json parse failed: {exc}",
+                     path=str(index_path),
+                 )
+             )
+             return None
+         except OSError as exc:
+             issues.append(
+                 ArtifactLintIssue(
+                     "error",
+                     "artifacts.index.read_failed",
+                     f"index.json read failed: {exc}",
+                     path=str(index_path),
+                 )
+             )
+             return None
+
+         if not isinstance(payload, dict):
+             issues.append(
+                 ArtifactLintIssue(
+                     "error",
+                     "artifacts.index.invalid_schema",
+                     "index.json root must be an object.",
+                     path=str(index_path),
+                 )
+             )
+             return None
+         return payload
+
+     def _validate_index(
+         self,
+         payload: dict[str, object],
+         artifacts_dir: Path,
+         issues: list[ArtifactLintIssue],
+         *,
+         strict: bool,
+     ) -> None:
+         pipeline_id = payload.get("pipeline_id")
+         if not isinstance(pipeline_id, str) or not pipeline_id.strip():
+             issues.append(
+                 ArtifactLintIssue(
+                     "error",
+                     "artifacts.index.pipeline_id.missing",
+                     "pipeline_id is missing.",
+                 )
+             )
+
+         nodes = payload.get("nodes")
+         if not isinstance(nodes, list):
+             issues.append(
+                 ArtifactLintIssue(
+                     "error",
+                     "artifacts.index.nodes.invalid",
+                     "nodes list is missing or invalid.",
+                 )
+             )
+             return
+
+         for idx, node in enumerate(nodes, start=1):
+             if not isinstance(node, dict):
+                 issues.append(
+                     ArtifactLintIssue(
+                         "error",
+                         "artifacts.index.node.invalid",
+                         f"nodes[{idx}] entry must be an object.",
+                     )
+                 )
+                 continue
+             node_id = node.get("node_id")
+             if not isinstance(node_id, str) or not node_id.strip():
+                 issues.append(
+                     ArtifactLintIssue(
+                         "error",
+                         "artifacts.index.node_id.missing",
+                         f"nodes[{idx}] node_id is missing.",
+                     )
+                 )
+             path_value = node.get("path")
+             self._validate_path(
+                 path_value,
+                 artifacts_dir,
+                 issues,
+                 strict=strict,
+                 code="artifacts.index.node.path.missing",
+                 message=f"nodes[{idx}] path is missing.",
+             )
+
+         final_output = payload.get("final_output_path")
+         if final_output:
+             self._validate_path(
+                 final_output,
+                 artifacts_dir,
+                 issues,
+                 strict=strict,
+                 code="artifacts.index.final_output.missing",
+                 message="final_output_path is missing.",
+             )
+
+     def _validate_path(
+         self,
+         path_value: object,
+         artifacts_dir: Path,
+         issues: list[ArtifactLintIssue],
+         *,
+         strict: bool,
+         code: str,
+         message: str,
+     ) -> None:
+         if not isinstance(path_value, str) or not path_value.strip():
+             issues.append(
+                 ArtifactLintIssue(
+                     "error",
+                     code,
+                     message,
+                 )
+             )
+             return
+
+         resolved = _resolve_artifact_path(artifacts_dir, Path(path_value))
+         if self._fs.exists(resolved):
+             return
+         issues.append(
+             ArtifactLintIssue(
+                 "error" if strict else "warning",
+                 code,
+                 "Artifact file is missing.",
+                 path=str(resolved),
+             )
+         )
+
+
+ def _resolve_artifact_path(base_dir: Path, candidate: Path) -> Path:
+     if candidate.is_absolute():
+         return candidate
+     return base_dir / candidate
+
+
+ def _resolve_status(issues: list[ArtifactLintIssue]) -> LintStatus:
+     if any(issue.level == "error" for issue in issues):
+         return "error"
+     if any(issue.level == "warning" for issue in issues):
+         return "warning"
+     return "ok"
@@ -414,12 +414,7 @@ class KoreanRAGBenchmarkRunner:
          try:
              # Morpheme-analysis-based retrieval
              if retriever:
-                 if self.use_hybrid_search and hasattr(retriever, "has_embeddings"):
-                     results = retriever.search(
-                         query, top_k=recall_k, use_dense=retriever.has_embeddings
-                     )
-                 else:
-                     results = retriever.search(query, top_k=recall_k)
+                 results = retriever.search(query, top_k=recall_k)
              retrieved_doc_ids = [
                  resolve_doc_id(getattr(res, "doc_id", None), doc_ids, idx)
                  for idx, res in enumerate(results, start=1)
@@ -0,0 +1,233 @@
+ from __future__ import annotations
+
+ import hashlib
+ import inspect
+ from collections.abc import Iterable
+ from pathlib import Path
+ from typing import Any
+
+ from evalvault.domain.metrics.registry import get_metric_spec_map
+
+ SCHEMA_VERSION = 1
+
+ _CUSTOM_METRIC_DETAILS: dict[str, dict[str, Any]] = {
+     "entity_preservation": {
+         "evaluation_method": "rule-based",
+         "inputs": ["answer", "contexts"],
+         "output": "0.0-1.0 (preserved_entities / context_entities)",
+         "evaluation_process": "Extract numeric/keyword entities from contexts and measure how many appear in the summary.",
+         "rules": {
+             "numeric_entities": ["percent", "currency", "duration", "date"],
+             "keywords_ko": [
+                 "면책",
+                 "제외",
+                 "단서",
+                 "다만",
+                 "조건",
+                 "자기부담",
+                 "한도",
+                 "감액",
+             ],
+             "keywords_en": [
+                 "exclusion",
+                 "deductible",
+                 "limit",
+                 "cap",
+                 "copay",
+                 "coinsurance",
+             ],
+         },
+         "notes": "Insurance-risk oriented entity coverage check.",
+     },
+     "insurance_term_accuracy": {
+         "evaluation_method": "rule-based",
+         "inputs": ["answer", "contexts"],
+         "output": "0.0-1.0 (verified_terms / answer_terms)",
+         "evaluation_process": "Detect insurance terms in the answer and verify their presence in contexts.",
+         "rules": {"terms_dictionary": "terms_dictionary.json"},
+         "notes": "Insurance glossary matching with canonical/variant terms.",
+     },
+     "summary_accuracy": {
+         "evaluation_method": "rule-based",
+         "inputs": ["answer", "contexts"],
+         "output": "0.0-1.0 (supported_summary_entities / summary_entities)",
+         "evaluation_process": "Extract numeric/keyword entities from summary and verify their presence in contexts.",
+         "rules": {
+             "numeric_entities": ["percent", "currency", "duration", "date"],
+             "keywords_ko": ["면책", "제외", "단서", "다만", "조건", "자기부담", "한도", "감액"],
+             "keywords_en": ["exclusion", "deductible", "limit", "cap", "waiting period"],
+         },
+         "notes": "Penalizes summary entities not grounded in contexts.",
+     },
+     "summary_risk_coverage": {
+         "evaluation_method": "rule-based",
+         "inputs": ["answer", "metadata.summary_tags"],
+         "output": "0.0-1.0 (covered_tags / expected_tags)",
+         "evaluation_process": "Check if summary mentions expected insurance risk tags.",
+         "rules": {
+             "exclusion": ["면책", "보장 제외", "지급 불가", "exclusion"],
+             "deductible": ["자기부담", "본인부담금", "deductible", "copay"],
+             "limit": ["한도", "상한", "최대", "limit", "cap"],
+             "waiting_period": ["면책기간", "대기기간", "waiting period"],
+             "condition": ["조건", "단서", "다만", "condition"],
+             "documents_required": ["서류", "진단서", "영수증", "documents"],
+             "needs_followup": ["확인 필요", "추가 확인", "담당자 확인", "재문의", "follow up"],
+         },
+         "notes": "Uses metadata summary_tags to define expected coverage.",
+     },
+     "summary_non_definitive": {
+         "evaluation_method": "rule-based",
+         "inputs": ["answer"],
+         "output": "1.0 if definitive claims absent else 0.0",
+         "evaluation_process": "Detect definitive expressions that increase liability risk.",
+         "rules": {
+             "patterns_ko": ["무조건", "반드시", "100%", "전액 지급", "확실히", "분명히", "절대"],
+             "patterns_en": [
+                 "always",
+                 "guaranteed",
+                 "definitely",
+                 "certainly",
+                 "absolutely",
+                 "100%",
+             ],
+         },
+         "notes": "Higher is safer; penalizes absolute guarantees.",
+     },
+     "summary_needs_followup": {
+         "evaluation_method": "rule-based",
+         "inputs": ["answer", "metadata.summary_tags"],
+         "output": "1.0 if follow-up guidance matches expected need",
+         "evaluation_process": "Check follow-up guidance when needs_followup tag exists.",
+         "rules": {
+             "followup_keywords": [
+                 "확인 필요",
+                 "추가 확인",
+                 "담당자 확인",
+                 "재문의",
+                 "추가 문의",
+                 "follow up",
+             ]
+         },
+         "notes": "Requires tags to avoid false penalties.",
+     },
+     "no_answer_accuracy": {
+         "evaluation_method": "rule-based",
+         "inputs": ["answer", "ground_truth"],
+         "output": "1.0 if abstention behavior matches, else 0.0",
+         "evaluation_process": "Detect abstention patterns in answer and ground_truth and compare behavior.",
+         "rules": {"patterns": "Korean/English regex patterns"},
+         "notes": "Hallucination/abstention behavior check.",
+     },
+     "exact_match": {
+         "evaluation_method": "string-match",
+         "inputs": ["answer", "ground_truth"],
+         "output": "1.0 exact match else 0.0",
+         "evaluation_process": "Normalize text and compare exact match with optional strict number matching.",
+         "rules": {"normalize": True, "number_strict": True},
+         "notes": "Token/number strict matching for factual answers.",
+     },
+     "f1_score": {
+         "evaluation_method": "token-overlap",
+         "inputs": ["answer", "ground_truth"],
+         "output": "0.0-1.0 (weighted F1)",
+         "evaluation_process": "Tokenize, compute weighted precision/recall/F1 with number emphasis.",
+         "rules": {"number_weight": 2.0},
+         "notes": "Token-level overlap with numeric weighting.",
+     },
+     "mrr": {
+         "evaluation_method": "retrieval-rank",
+         "inputs": ["ground_truth", "contexts"],
+         "output": "0.0-1.0 (1/rank of first relevant context)",
+         "evaluation_process": "Compute relevance by token overlap and take reciprocal rank of first hit.",
+         "rules": {"relevance_threshold": 0.3},
+         "notes": "Ranking quality of retrieved contexts.",
+     },
+     "ndcg": {
+         "evaluation_method": "retrieval-rank",
+         "inputs": ["ground_truth", "contexts"],
+         "output": "0.0-1.0 (NDCG@K)",
+         "evaluation_process": "Compute graded relevance per context and calculate NDCG.",
+         "rules": {"k": 10, "use_graded": True},
+         "notes": "Ranking quality across all relevant contexts.",
+     },
+     "hit_rate": {
+         "evaluation_method": "retrieval-rank",
+         "inputs": ["ground_truth", "contexts"],
+         "output": "1.0 if any relevant context in top K else 0.0",
+         "evaluation_process": "Check whether top-K contexts contain a relevant hit.",
+         "rules": {"k": 10, "relevance_threshold": 0.3},
+         "notes": "Recall@K style coverage check.",
+     },
+     "confidence_score": {
+         "evaluation_method": "rule-based",
+         "inputs": ["answer", "ground_truth", "contexts"],
+         "output": "0.0-1.0 (weighted confidence)",
+         "evaluation_process": "Combine context coverage, answer specificity, and consistency scores.",
+         "rules": {"coverage": 0.4, "specificity": 0.3, "consistency": 0.3},
+         "notes": "Heuristic confidence signal for human escalation.",
+     },
+     "contextual_relevancy": {
+         "evaluation_method": "token-overlap",
+         "inputs": ["question", "contexts"],
+         "output": "0.0-1.0 (avg relevancy)",
+         "evaluation_process": "Measure question-context token overlap and average across contexts.",
+         "rules": {"relevance_threshold": 0.35},
+         "notes": "Reference-free context relevance check.",
+     },
+ }
+
+
+ def _hash_file(path: str | Path | None) -> str | None:
+     if not path:
+         return None
+     file_path = Path(path)
+     if not file_path.exists():
+         return None
+     payload = file_path.read_bytes()
+     return hashlib.sha256(payload).hexdigest()
+
+
+ def _resolve_source_path(metric_class: type[Any]) -> str | None:
+     try:
+         source = inspect.getsourcefile(metric_class)
+     except TypeError:
+         return None
+     if not source:
+         return None
+     return str(Path(source).resolve())
+
+
+ def build_custom_metric_snapshot(
+     metric_classes: dict[str, type[Any]],
+     metrics: Iterable[str],
+ ) -> dict[str, Any] | None:
+     custom_names = [name for name in metrics if name in metric_classes]
+     if not custom_names:
+         return None
+
+     spec_map = get_metric_spec_map()
+     rows: list[dict[str, Any]] = []
+     for metric_name in custom_names:
+         metric_class = metric_classes.get(metric_name)
+         if metric_class is None:
+             continue
+         source_path = _resolve_source_path(metric_class)
+         details = _CUSTOM_METRIC_DETAILS.get(metric_name, {})
+         spec = spec_map.get(metric_name)
+         rows.append(
+             {
+                 "metric_name": metric_name,
+                 "source": "custom",
+                 "description": spec.description if spec else None,
+                 "evaluation_method": details.get("evaluation_method"),
+                 "inputs": details.get("inputs"),
+                 "output": details.get("output"),
+                 "evaluation_process": details.get("evaluation_process"),
+                 "rules": details.get("rules"),
+                 "notes": details.get("notes"),
+                 "implementation_path": source_path,
+                 "implementation_hash": _hash_file(source_path),
+             }
+         )
+
+     return {"schema_version": SCHEMA_VERSION, "metrics": rows}
@@ -17,9 +17,22 @@ REFERENCE_REQUIRED_METRICS = {
  }
  
  _WHITESPACE_RE = re.compile(r"\s+")
+ _PUNCT_ONLY_RE = re.compile(r"^[\W_]+$")
  _HANGUL_RE = re.compile(r"[\uac00-\ud7a3]")
  _LATIN_RE = re.compile(r"[A-Za-z]")
  
+ _PLACEHOLDER_TEXT = {
+     "n/a",
+     "na",
+     "none",
+     "null",
+     "nil",
+     "unknown",
+     "tbd",
+     "todo",
+     "undefined",
+ }
+
  
  @dataclass(frozen=True)
  class DatasetPreprocessConfig:
@@ -205,8 +218,18 @@ class DatasetPreprocessor:
          if self._config.trim_whitespace:
              text = text.replace("\u00a0", " ")
              text = _WHITESPACE_RE.sub(" ", text).strip()
+         if self._is_noise_text(text):
+             return ""
          return text
  
+     def _is_noise_text(self, text: str) -> bool:
+         if not text:
+             return True
+         if _PUNCT_ONLY_RE.fullmatch(text):
+             return True
+         lower_text = text.casefold()
+         return lower_text in _PLACEHOLDER_TEXT
+
      def _normalize_contexts(self, contexts: Any) -> tuple[list[str], dict[str, int]]:
          removed = 0
          deduped = 0
@@ -292,6 +315,9 @@ class DatasetPreprocessor:
          elif source == "context":
              filled_from_context = 1
  
+         if reference:
+             reference = self._normalize_text(reference)
+
          if reference and self._config.max_reference_chars > 0:
              reference, did_truncate = self._truncate_text(
                  reference, self._config.max_reference_chars
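
The net effect of the preprocessor changes: placeholder and punctuation-only strings now collapse to empty text, and references are normalized before truncation. Below is a standalone restatement of the new noise rule, mirroring the diff above rather than importing from evalvault.

import re

_PUNCT_ONLY_RE = re.compile(r"^[\W_]+$")
_PLACEHOLDER_TEXT = {"n/a", "na", "none", "null", "nil", "unknown", "tbd", "todo", "undefined"}


def is_noise_text(text: str) -> bool:
    # Empty, punctuation/underscore-only, or placeholder strings are treated as noise.
    if not text:
        return True
    if _PUNCT_ONLY_RE.fullmatch(text):
        return True
    return text.casefold() in _PLACEHOLDER_TEXT


assert is_noise_text("") and is_noise_text("---") and is_noise_text("N/A")
assert not is_noise_text("Deductible is 20%.")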
@@ -0,0 +1,25 @@
+ from __future__ import annotations
+
+ from pathlib import Path
+
+ from evalvault.ports.outbound.difficulty_profile_port import DifficultyProfileWriterPort
+
+
+ class DifficultyProfileReporter:
+     def __init__(self, writer: DifficultyProfileWriterPort) -> None:
+         self._writer = writer
+
+     def write(
+         self,
+         *,
+         output_path: Path,
+         artifacts_dir: Path,
+         envelope: dict[str, object],
+         artifacts: dict[str, object],
+     ) -> dict[str, object]:
+         return self._writer.write_profile(
+             output_path=output_path,
+             artifacts_dir=artifacts_dir,
+             envelope=envelope,
+             artifacts=artifacts,
+         )
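
Finally, DifficultyProfileReporter is a thin pass-through over DifficultyProfileWriterPort. A minimal sketch, assuming the port only needs a write_profile method with the keyword arguments shown above; the JsonProfileWriter here is hypothetical, while the package's own adapter is added in adapters/outbound/filesystem/difficulty_profile_writer.py.

import json
from pathlib import Path

from evalvault.domain.services.difficulty_profile_reporter import DifficultyProfileReporter


class JsonProfileWriter:
    """Hypothetical writer adapter used only for illustration."""

    def write_profile(
        self,
        *,
        output_path: Path,
        artifacts_dir: Path,
        envelope: dict[str, object],
        artifacts: dict[str, object],
    ) -> dict[str, object]:
        # Persist the profile next to the run's artifacts and report where it went.
        artifacts_dir.mkdir(parents=True, exist_ok=True)
        output_path.write_text(json.dumps({"envelope": envelope, "artifacts": artifacts}, indent=2))
        return {"path": str(output_path)}


reporter = DifficultyProfileReporter(JsonProfileWriter())
result = reporter.write(
    output_path=Path("out/difficulty_profile.json"),
    artifacts_dir=Path("out"),
    envelope={"run_id": "demo"},
    artifacts={"per_item": []},
)
print(result)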