evalvault-1.64.0-py3-none-any.whl → evalvault-1.65.0-py3-none-any.whl

This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (43)
  1. evalvault/adapters/inbound/cli/commands/__init__.py +14 -7
  2. evalvault/adapters/inbound/cli/commands/artifacts.py +107 -0
  3. evalvault/adapters/inbound/cli/commands/calibrate_judge.py +283 -0
  4. evalvault/adapters/inbound/cli/commands/compare.py +290 -0
  5. evalvault/adapters/inbound/cli/commands/history.py +13 -85
  6. evalvault/adapters/inbound/cli/commands/ops.py +110 -0
  7. evalvault/adapters/inbound/cli/commands/profile_difficulty.py +160 -0
  8. evalvault/adapters/inbound/cli/commands/regress.py +251 -0
  9. evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py +49 -0
  10. evalvault/adapters/outbound/artifact_fs.py +16 -0
  11. evalvault/adapters/outbound/filesystem/__init__.py +3 -0
  12. evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py +50 -0
  13. evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py +13 -0
  14. evalvault/adapters/outbound/judge_calibration_adapter.py +36 -0
  15. evalvault/adapters/outbound/judge_calibration_reporter.py +57 -0
  16. evalvault/adapters/outbound/tracker/langfuse_adapter.py +12 -7
  17. evalvault/adapters/outbound/tracker/phoenix_adapter.py +39 -12
  18. evalvault/domain/entities/__init__.py +10 -0
  19. evalvault/domain/entities/judge_calibration.py +50 -0
  20. evalvault/domain/entities/stage.py +11 -3
  21. evalvault/domain/services/artifact_lint_service.py +268 -0
  22. evalvault/domain/services/benchmark_runner.py +1 -6
  23. evalvault/domain/services/dataset_preprocessor.py +26 -0
  24. evalvault/domain/services/difficulty_profile_reporter.py +25 -0
  25. evalvault/domain/services/difficulty_profiling_service.py +304 -0
  26. evalvault/domain/services/evaluator.py +2 -0
  27. evalvault/domain/services/judge_calibration_service.py +495 -0
  28. evalvault/domain/services/ops_snapshot_service.py +159 -0
  29. evalvault/domain/services/regression_gate_service.py +199 -0
  30. evalvault/domain/services/run_comparison_service.py +159 -0
  31. evalvault/domain/services/stage_event_builder.py +6 -1
  32. evalvault/domain/services/stage_metric_service.py +83 -18
  33. evalvault/ports/outbound/__init__.py +4 -0
  34. evalvault/ports/outbound/artifact_fs_port.py +12 -0
  35. evalvault/ports/outbound/comparison_pipeline_port.py +22 -0
  36. evalvault/ports/outbound/difficulty_profile_port.py +15 -0
  37. evalvault/ports/outbound/judge_calibration_port.py +22 -0
  38. evalvault/ports/outbound/ops_snapshot_port.py +8 -0
  39. {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/METADATA +1 -1
  40. {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/RECORD +43 -17
  41. {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/WHEEL +0 -0
  42. {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/entry_points.txt +0 -0
  43. {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/outbound/tracker/phoenix_adapter.py
@@ -26,8 +26,7 @@ from evalvault.domain.entities import (
 from evalvault.ports.outbound.tracker_port import TrackerPort
 
 if TYPE_CHECKING:
-    from opentelemetry.sdk.trace import Span, TracerProvider
-    from opentelemetry.trace import Tracer
+    from opentelemetry.sdk.trace import TracerProvider
 
 
 class PhoenixAdapter(TrackerPort):
@@ -62,9 +61,10 @@ class PhoenixAdapter(TrackerPort):
         """
         self._endpoint = endpoint
         self._service_name = service_name
-        self._tracer: Tracer | None = None
+        self._tracer: Any | None = None
         self._tracer_provider: TracerProvider | None = None
-        self._active_spans: dict[str, Span] = {}
+        self._active_spans: dict[str, Any] = {}
+        self._tracer_any: Any | None = None
         self._initialized = False
 
     def _ensure_initialized(self) -> None:
@@ -90,7 +90,8 @@ class PhoenixAdapter(TrackerPort):
             provider = get_tracer_provider()
             if provider:
                 self._tracer_provider = provider
-                self._tracer = trace.get_tracer(__name__)
+                self._tracer_any = trace.get_tracer(__name__)
+                self._tracer = self._tracer_any
                 self._initialized = True
                 return
 
@@ -109,7 +110,8 @@ class PhoenixAdapter(TrackerPort):
                 trace.set_tracer_provider(self._tracer_provider)
 
             # Get tracer
-            self._tracer = trace.get_tracer(__name__)
+            self._tracer_any = trace.get_tracer(__name__)
+            self._tracer = self._tracer_any
             self._initialized = True
 
         except ImportError as e:
@@ -134,7 +136,12 @@ class PhoenixAdapter(TrackerPort):
         self._ensure_initialized()
 
         # Start a new span as root
-        span = self._tracer.start_span(name)
+        tracer = self._tracer_any
+        if tracer is None:
+            tracer = self._tracer
+        if tracer is None:
+            raise RuntimeError("Phoenix tracer is not initialized")
+        span = tracer.start_span(name)
         trace_id = str(uuid.uuid4())
 
         # Set metadata as span attributes
@@ -173,10 +180,15 @@ class PhoenixAdapter(TrackerPort):
 
         from opentelemetry import trace
 
+        tracer = self._tracer_any
+        if tracer is None:
+            tracer = self._tracer
+        if tracer is None:
+            raise RuntimeError("Phoenix tracer is not initialized")
         parent_span = self._active_spans[trace_id]
         context = trace.set_span_in_context(parent_span)
 
-        with self._tracer.start_span(name, context=context) as span:
+        with tracer.start_span(name, context=context) as span:
             if input_data is not None:
                 safe_input = sanitize_payload(input_data, max_chars=MAX_LOG_CHARS)
                 span.set_attribute("input", json.dumps(safe_input, default=str))
@@ -279,7 +291,7 @@ class PhoenixAdapter(TrackerPort):
             passed_count = sum(
                 1
                 for r in run.results
-                if r.get_metric(metric_name) and r.get_metric(metric_name).passed
+                if (metric := r.get_metric(metric_name)) and metric.passed is True
             )
             avg_score = run.get_avg_score(metric_name)
             threshold = run.thresholds.get(metric_name, 0.7)
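The rewritten condition fetches each metric once instead of twice, and `is True` stops a `passed` of None from counting as a pass. A toy check of that semantics (illustrative class, not from the package):

    class _Metric:
        def __init__(self, passed):
            self.passed = passed

    # Only an explicit True counts as a pass:
    print([m.passed is True for m in (_Metric(True), _Metric(False), _Metric(None))])
    # -> [True, False, False]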
@@ -369,10 +381,15 @@ class PhoenixAdapter(TrackerPort):
         """
        from opentelemetry import trace
 
+        tracer = self._tracer_any
+        if tracer is None:
+            tracer = self._tracer
+        if tracer is None:
+            raise RuntimeError("Phoenix tracer is not initialized")
         parent_span = self._active_spans[trace_id]
         context = trace.set_span_in_context(parent_span)
 
-        with self._tracer.start_span(
+        with tracer.start_span(
             f"test-case-{result.test_case_id}",
             context=context,
         ) as span:
@@ -478,7 +495,12 @@ class PhoenixAdapter(TrackerPort):
         parent_span = self._active_spans[trace_id]
         context = trace.set_span_in_context(parent_span)
 
-        with self._tracer.start_span("retrieval", context=context) as span:
+        tracer = self._tracer_any
+        if tracer is None:
+            tracer = self._tracer
+        if tracer is None:
+            raise RuntimeError("Phoenix tracer is not initialized")
+        with tracer.start_span("retrieval", context=context) as span:
             # Set retrieval attributes
             for key, value in data.to_span_attributes().items():
                 span.set_attribute(key, value)
@@ -560,7 +582,12 @@ class PhoenixAdapter(TrackerPort):
         parent_span = self._active_spans[trace_id]
         context = trace.set_span_in_context(parent_span)
 
-        with self._tracer.start_span("generation", context=context) as span:
+        tracer = self._tracer_any
+        if tracer is None:
+            tracer = self._tracer
+        if tracer is None:
+            raise RuntimeError("Phoenix tracer is not initialized")
+        with tracer.start_span("generation", context=context) as span:
             # Set generation attributes
             for key, value in data.to_span_attributes().items():
                 span.set_attribute(key, value)
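The tracer-fallback block above repeats verbatim in every span-producing method of this adapter. A hypothetical helper (not in this release) that would centralize the same logic:

    # Sketch only: prefer the dynamically-typed handle, fall back to the
    # typed attribute, and fail loudly if initialization never ran.
    def _require_tracer(self) -> Any:
        tracer = self._tracer_any if self._tracer_any is not None else self._tracer
        if tracer is None:
            raise RuntimeError("Phoenix tracer is not initialized")
        return tracer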
evalvault/domain/entities/__init__.py
@@ -34,6 +34,12 @@ from evalvault.domain.entities.improvement import (
     RAGComponent,
     RAGImprovementGuide,
 )
+from evalvault.domain.entities.judge_calibration import (
+    JudgeCalibrationCase,
+    JudgeCalibrationMetric,
+    JudgeCalibrationResult,
+    JudgeCalibrationSummary,
+)
 from evalvault.domain.entities.kg import EntityModel, RelationModel
 from evalvault.domain.entities.method import MethodInput, MethodInputDataset, MethodOutput
 from evalvault.domain.entities.prompt import Prompt, PromptSet, PromptSetBundle, PromptSetItem
@@ -104,6 +110,10 @@ __all__ = [
     "PatternType",
     "RAGComponent",
     "RAGImprovementGuide",
+    "JudgeCalibrationCase",
+    "JudgeCalibrationMetric",
+    "JudgeCalibrationResult",
+    "JudgeCalibrationSummary",
     # KG
     "EntityModel",
     "RelationModel",
evalvault/domain/entities/judge_calibration.py (new file)
@@ -0,0 +1,50 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+
+
+@dataclass
+class JudgeCalibrationCase:
+    test_case_id: str
+    raw_score: float
+    calibrated_score: float
+    label: float | None = None
+    label_source: str | None = None
+
+
+@dataclass
+class JudgeCalibrationMetric:
+    metric: str
+    method: str
+    sample_count: int
+    label_count: int
+    mae: float | None
+    pearson: float | None
+    spearman: float | None
+    temperature: float | None = None
+    parameters: dict[str, float | None] = field(default_factory=dict)
+    gate_passed: bool | None = None
+    warning: str | None = None
+
+
+@dataclass
+class JudgeCalibrationSummary:
+    run_id: str
+    labels_source: str
+    method: str
+    metrics: list[str]
+    holdout_ratio: float
+    seed: int
+    total_labels: int
+    total_samples: int
+    gate_passed: bool
+    gate_threshold: float | None = None
+    notes: list[str] = field(default_factory=list)
+
+
+@dataclass
+class JudgeCalibrationResult:
+    summary: JudgeCalibrationSummary
+    metrics: list[JudgeCalibrationMetric] = field(default_factory=list)
+    case_results: dict[str, list[JudgeCalibrationCase]] = field(default_factory=dict)
+    warnings: list[str] = field(default_factory=list)
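A construction sketch showing how the four entities nest; every value below is illustrative, not taken from the package:

    from evalvault.domain.entities.judge_calibration import (
        JudgeCalibrationCase,
        JudgeCalibrationMetric,
        JudgeCalibrationResult,
        JudgeCalibrationSummary,
    )

    summary = JudgeCalibrationSummary(
        run_id="run-42",
        labels_source="labels.jsonl",
        method="temperature",
        metrics=["faithfulness"],
        holdout_ratio=0.2,
        seed=13,
        total_labels=40,
        total_samples=120,
        gate_passed=True,
        gate_threshold=0.1,
    )
    result = JudgeCalibrationResult(
        summary=summary,
        metrics=[
            JudgeCalibrationMetric(
                metric="faithfulness",
                method="temperature",
                sample_count=120,
                label_count=40,
                mae=0.06,
                pearson=0.81,
                spearman=0.79,
                temperature=1.3,
            )
        ],
        case_results={
            "faithfulness": [
                JudgeCalibrationCase(
                    test_case_id="tc-001",
                    raw_score=0.82,
                    calibrated_score=0.74,
                    label=0.7,
                    label_source="human",
                )
            ]
        },
    )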
evalvault/domain/entities/stage.py
@@ -4,7 +4,7 @@ from __future__ import annotations
 
 from dataclasses import dataclass, field
 from datetime import datetime
-from typing import Any
+from typing import Any, Literal, overload
 from uuid import uuid4
 
 REQUIRED_STAGE_TYPES: tuple[str, ...] = ("system_prompt", "input", "retrieval", "output")
@@ -82,8 +82,8 @@ class StageEvent:
             duration_ms=_optional_float(payload.get("duration_ms")),
             input_ref=input_ref,
             output_ref=output_ref,
-            attributes=_ensure_dict(payload.get("attributes")),
-            metadata=_ensure_dict(payload.get("metadata")),
+            attributes=_ensure_dict(payload.get("attributes"), allow_none=False),
+            metadata=_ensure_dict(payload.get("metadata"), allow_none=False),
             trace_id=_optional_str(payload.get("trace_id") or trace_payload.get("trace_id")),
             span_id=_optional_str(payload.get("span_id") or trace_payload.get("span_id")),
         )
@@ -187,6 +187,14 @@ def _parse_datetime(value: Any) -> datetime | None:
     raise ValueError("Invalid datetime value")
 
 
+@overload
+def _ensure_dict(value: None, *, allow_none: Literal[True]) -> None: ...
+
+
+@overload
+def _ensure_dict(value: Any, *, allow_none: Literal[False] = False) -> dict[str, Any]: ...
+
+
 def _ensure_dict(value: Any, *, allow_none: bool = False) -> dict[str, Any] | None:
     if value is None:
         return None if allow_none else {}
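The overload pair exists so type checkers can drop the `| None` from the return type when `allow_none=False`, which is what the StageEvent change above relies on. A small illustration of the narrowing (hypothetical call sites):

    attrs = _ensure_dict(payload.get("attributes"), allow_none=False)
    # -> revealed type: dict[str, Any]; no Optional check needed before use
    attrs.setdefault("source", "cli")

    maybe = _ensure_dict(None, allow_none=True)
    # -> revealed type: None (the first overload matches)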
evalvault/domain/services/artifact_lint_service.py (new file)
@@ -0,0 +1,268 @@
+from __future__ import annotations
+
+import json
+import logging
+from dataclasses import dataclass
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Literal
+
+from evalvault.ports.outbound.artifact_fs_port import ArtifactFileSystemPort
+
+logger = logging.getLogger(__name__)
+
+
+LintLevel = Literal["error", "warning"]
+LintStatus = Literal["ok", "warning", "error"]
+
+
+@dataclass(frozen=True)
+class ArtifactLintIssue:
+    level: LintLevel
+    code: str
+    message: str
+    path: str | None = None
+
+
+@dataclass(frozen=True)
+class ArtifactLintSummary:
+    status: LintStatus
+    issues: list[ArtifactLintIssue]
+    artifacts_dir: Path
+    index_path: Path
+    started_at: datetime
+    finished_at: datetime
+    duration_ms: int
+    strict: bool
+
+
+class ArtifactLintService:
+    def __init__(self, fs: ArtifactFileSystemPort) -> None:
+        self._fs = fs
+
+    def lint(self, artifacts_dir: Path, *, strict: bool = False) -> ArtifactLintSummary:
+        started_at = datetime.now(UTC)
+        issues: list[ArtifactLintIssue] = []
+        index_path = artifacts_dir / "index.json"
+        logger.info("Artifact lint started: %s", artifacts_dir)
+
+        try:
+            self._validate_dir(artifacts_dir, issues)
+            if not self._fs.exists(index_path):
+                issues.append(
+                    ArtifactLintIssue(
+                        "error",
+                        "artifacts.index.missing",
+                        "index.json is missing.",
+                        path=str(index_path),
+                    )
+                )
+            elif self._fs.exists(artifacts_dir) and self._fs.is_dir(artifacts_dir):
+                index_payload = self._load_index(index_path, issues)
+                if index_payload is not None:
+                    self._validate_index(
+                        index_payload,
+                        artifacts_dir,
+                        issues,
+                        strict=strict,
+                    )
+        except Exception as exc:
+            logger.exception("Artifact lint failed: %s", artifacts_dir)
+            issues.append(
+                ArtifactLintIssue(
+                    "error",
+                    "artifacts.lint.exception",
+                    f"Unexpected error: {exc}",
+                )
+            )
+
+        finished_at = datetime.now(UTC)
+        duration_ms = int((finished_at - started_at).total_seconds() * 1000)
+        status = _resolve_status(issues)
+        logger.info("Artifact lint finished: %s (%s)", artifacts_dir, status)
+        return ArtifactLintSummary(
+            status=status,
+            issues=issues,
+            artifacts_dir=artifacts_dir,
+            index_path=index_path,
+            started_at=started_at,
+            finished_at=finished_at,
+            duration_ms=duration_ms,
+            strict=strict,
+        )
+
+    def _validate_dir(self, artifacts_dir: Path, issues: list[ArtifactLintIssue]) -> None:
+        if not self._fs.exists(artifacts_dir):
+            issues.append(
+                ArtifactLintIssue(
+                    "error",
+                    "artifacts.dir.missing",
+                    "Artifacts directory is missing.",
+                    path=str(artifacts_dir),
+                )
+            )
+            return
+        if not self._fs.is_dir(artifacts_dir):
+            issues.append(
+                ArtifactLintIssue(
+                    "error",
+                    "artifacts.dir.not_directory",
+                    "Artifacts path is not a directory.",
+                    path=str(artifacts_dir),
+                )
+            )
+
+    def _load_index(
+        self,
+        index_path: Path,
+        issues: list[ArtifactLintIssue],
+    ) -> dict[str, object] | None:
+        try:
+            payload = json.loads(self._fs.read_text(index_path))
+        except json.JSONDecodeError as exc:
+            issues.append(
+                ArtifactLintIssue(
+                    "error",
+                    "artifacts.index.invalid_json",
+                    f"index.json parse failed: {exc}",
+                    path=str(index_path),
+                )
+            )
+            return None
+        except OSError as exc:
+            issues.append(
+                ArtifactLintIssue(
+                    "error",
+                    "artifacts.index.read_failed",
+                    f"index.json read failed: {exc}",
+                    path=str(index_path),
+                )
+            )
+            return None
+
+        if not isinstance(payload, dict):
+            issues.append(
+                ArtifactLintIssue(
+                    "error",
+                    "artifacts.index.invalid_schema",
+                    "index.json root must be an object.",
+                    path=str(index_path),
+                )
+            )
+            return None
+        return payload
+
+    def _validate_index(
+        self,
+        payload: dict[str, object],
+        artifacts_dir: Path,
+        issues: list[ArtifactLintIssue],
+        *,
+        strict: bool,
+    ) -> None:
+        pipeline_id = payload.get("pipeline_id")
+        if not isinstance(pipeline_id, str) or not pipeline_id.strip():
+            issues.append(
+                ArtifactLintIssue(
+                    "error",
+                    "artifacts.index.pipeline_id.missing",
+                    "pipeline_id is missing.",
+                )
+            )
+
+        nodes = payload.get("nodes")
+        if not isinstance(nodes, list):
+            issues.append(
+                ArtifactLintIssue(
+                    "error",
+                    "artifacts.index.nodes.invalid",
+                    "nodes list is missing or invalid.",
+                )
+            )
+            return
+
+        for idx, node in enumerate(nodes, start=1):
+            if not isinstance(node, dict):
+                issues.append(
+                    ArtifactLintIssue(
+                        "error",
+                        "artifacts.index.node.invalid",
+                        f"nodes[{idx}] entry must be an object.",
+                    )
+                )
+                continue
+            node_id = node.get("node_id")
+            if not isinstance(node_id, str) or not node_id.strip():
+                issues.append(
+                    ArtifactLintIssue(
+                        "error",
+                        "artifacts.index.node_id.missing",
+                        f"nodes[{idx}] node_id is missing.",
+                    )
+                )
+            path_value = node.get("path")
+            self._validate_path(
+                path_value,
+                artifacts_dir,
+                issues,
+                strict=strict,
+                code="artifacts.index.node.path.missing",
+                message=f"nodes[{idx}] path is missing.",
+            )
+
+        final_output = payload.get("final_output_path")
+        if final_output:
+            self._validate_path(
+                final_output,
+                artifacts_dir,
+                issues,
+                strict=strict,
+                code="artifacts.index.final_output.missing",
+                message="final_output_path is missing.",
+            )
+
+    def _validate_path(
+        self,
+        path_value: object,
+        artifacts_dir: Path,
+        issues: list[ArtifactLintIssue],
+        *,
+        strict: bool,
+        code: str,
+        message: str,
+    ) -> None:
+        if not isinstance(path_value, str) or not path_value.strip():
+            issues.append(
+                ArtifactLintIssue(
+                    "error",
+                    code,
+                    message,
+                )
+            )
+            return
+
+        resolved = _resolve_artifact_path(artifacts_dir, Path(path_value))
+        if self._fs.exists(resolved):
+            return
+        issues.append(
+            ArtifactLintIssue(
+                "error" if strict else "warning",
+                code,
+                "Artifact file is missing.",
+                path=str(resolved),
+            )
+        )
+
+
+def _resolve_artifact_path(base_dir: Path, candidate: Path) -> Path:
+    if candidate.is_absolute():
+        return candidate
+    return base_dir / candidate
+
+
+def _resolve_status(issues: list[ArtifactLintIssue]) -> LintStatus:
+    if any(issue.level == "error" for issue in issues):
+        return "error"
+    if any(issue.level == "warning" for issue in issues):
+        return "warning"
+    return "ok"
evalvault/domain/services/benchmark_runner.py
@@ -414,12 +414,7 @@ class KoreanRAGBenchmarkRunner:
         try:
             # Morpheme-analysis-based retrieval
             if retriever:
-                if self.use_hybrid_search and hasattr(retriever, "has_embeddings"):
-                    results = retriever.search(
-                        query, top_k=recall_k, use_dense=retriever.has_embeddings
-                    )
-                else:
-                    results = retriever.search(query, top_k=recall_k)
+                results = retriever.search(query, top_k=recall_k)
                 retrieved_doc_ids = [
                     resolve_doc_id(getattr(res, "doc_id", None), doc_ids, idx)
                     for idx, res in enumerate(results, start=1)
evalvault/domain/services/dataset_preprocessor.py
@@ -17,9 +17,22 @@ REFERENCE_REQUIRED_METRICS = {
 }
 
 _WHITESPACE_RE = re.compile(r"\s+")
+_PUNCT_ONLY_RE = re.compile(r"^[\W_]+$")
 _HANGUL_RE = re.compile(r"[\uac00-\ud7a3]")
 _LATIN_RE = re.compile(r"[A-Za-z]")
 
+_PLACEHOLDER_TEXT = {
+    "n/a",
+    "na",
+    "none",
+    "null",
+    "nil",
+    "unknown",
+    "tbd",
+    "todo",
+    "undefined",
+}
+
 
 @dataclass(frozen=True)
 class DatasetPreprocessConfig:
@@ -205,8 +218,18 @@ class DatasetPreprocessor:
         if self._config.trim_whitespace:
             text = text.replace("\u00a0", " ")
             text = _WHITESPACE_RE.sub(" ", text).strip()
+        if self._is_noise_text(text):
+            return ""
         return text
 
+    def _is_noise_text(self, text: str) -> bool:
+        if not text:
+            return True
+        if _PUNCT_ONLY_RE.fullmatch(text):
+            return True
+        lower_text = text.casefold()
+        return lower_text in _PLACEHOLDER_TEXT
+
     def _normalize_contexts(self, contexts: Any) -> tuple[list[str], dict[str, int]]:
         removed = 0
         deduped = 0
@@ -292,6 +315,9 @@ class DatasetPreprocessor:
         elif source == "context":
             filled_from_context = 1
 
+        if reference:
+            reference = self._normalize_text(reference)
+
         if reference and self._config.max_reference_chars > 0:
             reference, did_truncate = self._truncate_text(
                 reference, self._config.max_reference_chars
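Taken together, _normalize_text now maps empty, punctuation-only, and placeholder strings to "". A standalone restatement of the filter, for illustration:

    import re

    _PUNCT_ONLY_RE = re.compile(r"^[\W_]+$")
    _PLACEHOLDER_TEXT = {"n/a", "na", "none", "null", "nil", "unknown", "tbd", "todo", "undefined"}

    def is_noise_text(text: str) -> bool:
        # Mirrors the diff's _is_noise_text as a free function.
        if not text:
            return True
        if _PUNCT_ONLY_RE.fullmatch(text):
            return True
        return text.casefold() in _PLACEHOLDER_TEXT

    print(is_noise_text("---"))   # True: punctuation-only
    print(is_noise_text("N/A"))   # True: placeholder, matched case-insensitively
    print(is_noise_text("서울"))  # False: real content is kept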
evalvault/domain/services/difficulty_profile_reporter.py (new file)
@@ -0,0 +1,25 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from evalvault.ports.outbound.difficulty_profile_port import DifficultyProfileWriterPort
+
+
+class DifficultyProfileReporter:
+    def __init__(self, writer: DifficultyProfileWriterPort) -> None:
+        self._writer = writer
+
+    def write(
+        self,
+        *,
+        output_path: Path,
+        artifacts_dir: Path,
+        envelope: dict[str, object],
+        artifacts: dict[str, object],
+    ) -> dict[str, object]:
+        return self._writer.write_profile(
+            output_path=output_path,
+            artifacts_dir=artifacts_dir,
+            envelope=envelope,
+            artifacts=artifacts,
+        )
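The reporter only forwards to the port, keeping serialization in the outbound adapter (adapters/outbound/filesystem/difficulty_profile_writer.py). A usage sketch with an illustrative stub writer, since the real adapter's constructor is not shown in this diff:

    from pathlib import Path

    class StubWriter:
        """Illustrative object satisfying the write_profile signature the reporter calls."""

        def write_profile(
            self,
            *,
            output_path: Path,
            artifacts_dir: Path,
            envelope: dict[str, object],
            artifacts: dict[str, object],
        ) -> dict[str, object]:
            return {"output_path": str(output_path), "artifact_count": len(artifacts)}

    reporter = DifficultyProfileReporter(StubWriter())
    receipt = reporter.write(
        output_path=Path("out/difficulty_profile.json"),
        artifacts_dir=Path("out/artifacts"),
        envelope={"run_id": "demo"},
        artifacts={},
    )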