evalvault 1.64.0__py3-none-any.whl → 1.65.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/cli/commands/__init__.py +14 -7
- evalvault/adapters/inbound/cli/commands/artifacts.py +107 -0
- evalvault/adapters/inbound/cli/commands/calibrate_judge.py +283 -0
- evalvault/adapters/inbound/cli/commands/compare.py +290 -0
- evalvault/adapters/inbound/cli/commands/history.py +13 -85
- evalvault/adapters/inbound/cli/commands/ops.py +110 -0
- evalvault/adapters/inbound/cli/commands/profile_difficulty.py +160 -0
- evalvault/adapters/inbound/cli/commands/regress.py +251 -0
- evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py +49 -0
- evalvault/adapters/outbound/artifact_fs.py +16 -0
- evalvault/adapters/outbound/filesystem/__init__.py +3 -0
- evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py +50 -0
- evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py +13 -0
- evalvault/adapters/outbound/judge_calibration_adapter.py +36 -0
- evalvault/adapters/outbound/judge_calibration_reporter.py +57 -0
- evalvault/adapters/outbound/tracker/langfuse_adapter.py +12 -7
- evalvault/adapters/outbound/tracker/phoenix_adapter.py +39 -12
- evalvault/domain/entities/__init__.py +10 -0
- evalvault/domain/entities/judge_calibration.py +50 -0
- evalvault/domain/entities/stage.py +11 -3
- evalvault/domain/services/artifact_lint_service.py +268 -0
- evalvault/domain/services/benchmark_runner.py +1 -6
- evalvault/domain/services/dataset_preprocessor.py +26 -0
- evalvault/domain/services/difficulty_profile_reporter.py +25 -0
- evalvault/domain/services/difficulty_profiling_service.py +304 -0
- evalvault/domain/services/evaluator.py +2 -0
- evalvault/domain/services/judge_calibration_service.py +495 -0
- evalvault/domain/services/ops_snapshot_service.py +159 -0
- evalvault/domain/services/regression_gate_service.py +199 -0
- evalvault/domain/services/run_comparison_service.py +159 -0
- evalvault/domain/services/stage_event_builder.py +6 -1
- evalvault/domain/services/stage_metric_service.py +83 -18
- evalvault/ports/outbound/__init__.py +4 -0
- evalvault/ports/outbound/artifact_fs_port.py +12 -0
- evalvault/ports/outbound/comparison_pipeline_port.py +22 -0
- evalvault/ports/outbound/difficulty_profile_port.py +15 -0
- evalvault/ports/outbound/judge_calibration_port.py +22 -0
- evalvault/ports/outbound/ops_snapshot_port.py +8 -0
- {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/METADATA +1 -1
- {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/RECORD +43 -17
- {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/WHEEL +0 -0
- {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -26,8 +26,7 @@ from evalvault.domain.entities import (
|
|
|
26
26
|
from evalvault.ports.outbound.tracker_port import TrackerPort
|
|
27
27
|
|
|
28
28
|
if TYPE_CHECKING:
|
|
29
|
-
from opentelemetry.sdk.trace import
|
|
30
|
-
from opentelemetry.trace import Tracer
|
|
29
|
+
from opentelemetry.sdk.trace import TracerProvider
|
|
31
30
|
|
|
32
31
|
|
|
33
32
|
class PhoenixAdapter(TrackerPort):
|
|
@@ -62,9 +61,10 @@ class PhoenixAdapter(TrackerPort):
|
|
|
62
61
|
"""
|
|
63
62
|
self._endpoint = endpoint
|
|
64
63
|
self._service_name = service_name
|
|
65
|
-
self._tracer:
|
|
64
|
+
self._tracer: Any | None = None
|
|
66
65
|
self._tracer_provider: TracerProvider | None = None
|
|
67
|
-
self._active_spans: dict[str,
|
|
66
|
+
self._active_spans: dict[str, Any] = {}
|
|
67
|
+
self._tracer_any: Any | None = None
|
|
68
68
|
self._initialized = False
|
|
69
69
|
|
|
70
70
|
def _ensure_initialized(self) -> None:
|
|
@@ -90,7 +90,8 @@ class PhoenixAdapter(TrackerPort):
|
|
|
90
90
|
provider = get_tracer_provider()
|
|
91
91
|
if provider:
|
|
92
92
|
self._tracer_provider = provider
|
|
93
|
-
self.
|
|
93
|
+
self._tracer_any = trace.get_tracer(__name__)
|
|
94
|
+
self._tracer = self._tracer_any
|
|
94
95
|
self._initialized = True
|
|
95
96
|
return
|
|
96
97
|
|
|
@@ -109,7 +110,8 @@ class PhoenixAdapter(TrackerPort):
|
|
|
109
110
|
trace.set_tracer_provider(self._tracer_provider)
|
|
110
111
|
|
|
111
112
|
# Get tracer
|
|
112
|
-
self.
|
|
113
|
+
self._tracer_any = trace.get_tracer(__name__)
|
|
114
|
+
self._tracer = self._tracer_any
|
|
113
115
|
self._initialized = True
|
|
114
116
|
|
|
115
117
|
except ImportError as e:
|
|
@@ -134,7 +136,12 @@ class PhoenixAdapter(TrackerPort):
|
|
|
134
136
|
self._ensure_initialized()
|
|
135
137
|
|
|
136
138
|
# Start a new span as root
|
|
137
|
-
|
|
139
|
+
tracer = self._tracer_any
|
|
140
|
+
if tracer is None:
|
|
141
|
+
tracer = self._tracer
|
|
142
|
+
if tracer is None:
|
|
143
|
+
raise RuntimeError("Phoenix tracer is not initialized")
|
|
144
|
+
span = tracer.start_span(name)
|
|
138
145
|
trace_id = str(uuid.uuid4())
|
|
139
146
|
|
|
140
147
|
# Set metadata as span attributes
|
|
@@ -173,10 +180,15 @@ class PhoenixAdapter(TrackerPort):
|
|
|
173
180
|
|
|
174
181
|
from opentelemetry import trace
|
|
175
182
|
|
|
183
|
+
tracer = self._tracer_any
|
|
184
|
+
if tracer is None:
|
|
185
|
+
tracer = self._tracer
|
|
186
|
+
if tracer is None:
|
|
187
|
+
raise RuntimeError("Phoenix tracer is not initialized")
|
|
176
188
|
parent_span = self._active_spans[trace_id]
|
|
177
189
|
context = trace.set_span_in_context(parent_span)
|
|
178
190
|
|
|
179
|
-
with
|
|
191
|
+
with tracer.start_span(name, context=context) as span:
|
|
180
192
|
if input_data is not None:
|
|
181
193
|
safe_input = sanitize_payload(input_data, max_chars=MAX_LOG_CHARS)
|
|
182
194
|
span.set_attribute("input", json.dumps(safe_input, default=str))
|
|
@@ -279,7 +291,7 @@ class PhoenixAdapter(TrackerPort):
|
|
|
279
291
|
passed_count = sum(
|
|
280
292
|
1
|
|
281
293
|
for r in run.results
|
|
282
|
-
if r.get_metric(metric_name) and
|
|
294
|
+
if (metric := r.get_metric(metric_name)) and metric.passed is True
|
|
283
295
|
)
|
|
284
296
|
avg_score = run.get_avg_score(metric_name)
|
|
285
297
|
threshold = run.thresholds.get(metric_name, 0.7)
|
|
@@ -369,10 +381,15 @@ class PhoenixAdapter(TrackerPort):
|
|
|
369
381
|
"""
|
|
370
382
|
from opentelemetry import trace
|
|
371
383
|
|
|
384
|
+
tracer = self._tracer_any
|
|
385
|
+
if tracer is None:
|
|
386
|
+
tracer = self._tracer
|
|
387
|
+
if tracer is None:
|
|
388
|
+
raise RuntimeError("Phoenix tracer is not initialized")
|
|
372
389
|
parent_span = self._active_spans[trace_id]
|
|
373
390
|
context = trace.set_span_in_context(parent_span)
|
|
374
391
|
|
|
375
|
-
with
|
|
392
|
+
with tracer.start_span(
|
|
376
393
|
f"test-case-{result.test_case_id}",
|
|
377
394
|
context=context,
|
|
378
395
|
) as span:
|
|
@@ -478,7 +495,12 @@ class PhoenixAdapter(TrackerPort):
|
|
|
478
495
|
parent_span = self._active_spans[trace_id]
|
|
479
496
|
context = trace.set_span_in_context(parent_span)
|
|
480
497
|
|
|
481
|
-
|
|
498
|
+
tracer = self._tracer_any
|
|
499
|
+
if tracer is None:
|
|
500
|
+
tracer = self._tracer
|
|
501
|
+
if tracer is None:
|
|
502
|
+
raise RuntimeError("Phoenix tracer is not initialized")
|
|
503
|
+
with tracer.start_span("retrieval", context=context) as span:
|
|
482
504
|
# Set retrieval attributes
|
|
483
505
|
for key, value in data.to_span_attributes().items():
|
|
484
506
|
span.set_attribute(key, value)
|
|
@@ -560,7 +582,12 @@ class PhoenixAdapter(TrackerPort):
|
|
|
560
582
|
parent_span = self._active_spans[trace_id]
|
|
561
583
|
context = trace.set_span_in_context(parent_span)
|
|
562
584
|
|
|
563
|
-
|
|
585
|
+
tracer = self._tracer_any
|
|
586
|
+
if tracer is None:
|
|
587
|
+
tracer = self._tracer
|
|
588
|
+
if tracer is None:
|
|
589
|
+
raise RuntimeError("Phoenix tracer is not initialized")
|
|
590
|
+
with tracer.start_span("generation", context=context) as span:
|
|
564
591
|
# Set generation attributes
|
|
565
592
|
for key, value in data.to_span_attributes().items():
|
|
566
593
|
span.set_attribute(key, value)
|
|
@@ -34,6 +34,12 @@ from evalvault.domain.entities.improvement import (
|
|
|
34
34
|
RAGComponent,
|
|
35
35
|
RAGImprovementGuide,
|
|
36
36
|
)
|
|
37
|
+
from evalvault.domain.entities.judge_calibration import (
|
|
38
|
+
JudgeCalibrationCase,
|
|
39
|
+
JudgeCalibrationMetric,
|
|
40
|
+
JudgeCalibrationResult,
|
|
41
|
+
JudgeCalibrationSummary,
|
|
42
|
+
)
|
|
37
43
|
from evalvault.domain.entities.kg import EntityModel, RelationModel
|
|
38
44
|
from evalvault.domain.entities.method import MethodInput, MethodInputDataset, MethodOutput
|
|
39
45
|
from evalvault.domain.entities.prompt import Prompt, PromptSet, PromptSetBundle, PromptSetItem
|
|
@@ -104,6 +110,10 @@ __all__ = [
|
|
|
104
110
|
"PatternType",
|
|
105
111
|
"RAGComponent",
|
|
106
112
|
"RAGImprovementGuide",
|
|
113
|
+
"JudgeCalibrationCase",
|
|
114
|
+
"JudgeCalibrationMetric",
|
|
115
|
+
"JudgeCalibrationResult",
|
|
116
|
+
"JudgeCalibrationSummary",
|
|
107
117
|
# KG
|
|
108
118
|
"EntityModel",
|
|
109
119
|
"RelationModel",
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
|
|
7
|
+
class JudgeCalibrationCase:
|
|
8
|
+
test_case_id: str
|
|
9
|
+
raw_score: float
|
|
10
|
+
calibrated_score: float
|
|
11
|
+
label: float | None = None
|
|
12
|
+
label_source: str | None = None
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
|
|
16
|
+
class JudgeCalibrationMetric:
|
|
17
|
+
metric: str
|
|
18
|
+
method: str
|
|
19
|
+
sample_count: int
|
|
20
|
+
label_count: int
|
|
21
|
+
mae: float | None
|
|
22
|
+
pearson: float | None
|
|
23
|
+
spearman: float | None
|
|
24
|
+
temperature: float | None = None
|
|
25
|
+
parameters: dict[str, float | None] = field(default_factory=dict)
|
|
26
|
+
gate_passed: bool | None = None
|
|
27
|
+
warning: str | None = None
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass
|
|
31
|
+
class JudgeCalibrationSummary:
|
|
32
|
+
run_id: str
|
|
33
|
+
labels_source: str
|
|
34
|
+
method: str
|
|
35
|
+
metrics: list[str]
|
|
36
|
+
holdout_ratio: float
|
|
37
|
+
seed: int
|
|
38
|
+
total_labels: int
|
|
39
|
+
total_samples: int
|
|
40
|
+
gate_passed: bool
|
|
41
|
+
gate_threshold: float | None = None
|
|
42
|
+
notes: list[str] = field(default_factory=list)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass
|
|
46
|
+
class JudgeCalibrationResult:
|
|
47
|
+
summary: JudgeCalibrationSummary
|
|
48
|
+
metrics: list[JudgeCalibrationMetric] = field(default_factory=list)
|
|
49
|
+
case_results: dict[str, list[JudgeCalibrationCase]] = field(default_factory=dict)
|
|
50
|
+
warnings: list[str] = field(default_factory=list)
|
|
@@ -4,7 +4,7 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
from dataclasses import dataclass, field
|
|
6
6
|
from datetime import datetime
|
|
7
|
-
from typing import Any
|
|
7
|
+
from typing import Any, Literal, overload
|
|
8
8
|
from uuid import uuid4
|
|
9
9
|
|
|
10
10
|
REQUIRED_STAGE_TYPES: tuple[str, ...] = ("system_prompt", "input", "retrieval", "output")
|
|
@@ -82,8 +82,8 @@ class StageEvent:
|
|
|
82
82
|
duration_ms=_optional_float(payload.get("duration_ms")),
|
|
83
83
|
input_ref=input_ref,
|
|
84
84
|
output_ref=output_ref,
|
|
85
|
-
attributes=_ensure_dict(payload.get("attributes")),
|
|
86
|
-
metadata=_ensure_dict(payload.get("metadata")),
|
|
85
|
+
attributes=_ensure_dict(payload.get("attributes"), allow_none=False),
|
|
86
|
+
metadata=_ensure_dict(payload.get("metadata"), allow_none=False),
|
|
87
87
|
trace_id=_optional_str(payload.get("trace_id") or trace_payload.get("trace_id")),
|
|
88
88
|
span_id=_optional_str(payload.get("span_id") or trace_payload.get("span_id")),
|
|
89
89
|
)
|
|
@@ -187,6 +187,14 @@ def _parse_datetime(value: Any) -> datetime | None:
|
|
|
187
187
|
raise ValueError("Invalid datetime value")
|
|
188
188
|
|
|
189
189
|
|
|
190
|
+
@overload
|
|
191
|
+
def _ensure_dict(value: None, *, allow_none: Literal[True]) -> None: ...
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
@overload
|
|
195
|
+
def _ensure_dict(value: Any, *, allow_none: Literal[False] = False) -> dict[str, Any]: ...
|
|
196
|
+
|
|
197
|
+
|
|
190
198
|
def _ensure_dict(value: Any, *, allow_none: bool = False) -> dict[str, Any] | None:
|
|
191
199
|
if value is None:
|
|
192
200
|
return None if allow_none else {}
|
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from datetime import UTC, datetime
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Literal
|
|
9
|
+
|
|
10
|
+
from evalvault.ports.outbound.artifact_fs_port import ArtifactFileSystemPort
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
LintLevel = Literal["error", "warning"]
|
|
16
|
+
LintStatus = Literal["ok", "warning", "error"]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass(frozen=True)
|
|
20
|
+
class ArtifactLintIssue:
|
|
21
|
+
level: LintLevel
|
|
22
|
+
code: str
|
|
23
|
+
message: str
|
|
24
|
+
path: str | None = None
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass(frozen=True)
|
|
28
|
+
class ArtifactLintSummary:
|
|
29
|
+
status: LintStatus
|
|
30
|
+
issues: list[ArtifactLintIssue]
|
|
31
|
+
artifacts_dir: Path
|
|
32
|
+
index_path: Path
|
|
33
|
+
started_at: datetime
|
|
34
|
+
finished_at: datetime
|
|
35
|
+
duration_ms: int
|
|
36
|
+
strict: bool
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class ArtifactLintService:
|
|
40
|
+
def __init__(self, fs: ArtifactFileSystemPort) -> None:
|
|
41
|
+
self._fs = fs
|
|
42
|
+
|
|
43
|
+
def lint(self, artifacts_dir: Path, *, strict: bool = False) -> ArtifactLintSummary:
|
|
44
|
+
started_at = datetime.now(UTC)
|
|
45
|
+
issues: list[ArtifactLintIssue] = []
|
|
46
|
+
index_path = artifacts_dir / "index.json"
|
|
47
|
+
logger.info("Artifact lint started: %s", artifacts_dir)
|
|
48
|
+
|
|
49
|
+
try:
|
|
50
|
+
self._validate_dir(artifacts_dir, issues)
|
|
51
|
+
if not self._fs.exists(index_path):
|
|
52
|
+
issues.append(
|
|
53
|
+
ArtifactLintIssue(
|
|
54
|
+
"error",
|
|
55
|
+
"artifacts.index.missing",
|
|
56
|
+
"index.json is missing.",
|
|
57
|
+
path=str(index_path),
|
|
58
|
+
)
|
|
59
|
+
)
|
|
60
|
+
elif self._fs.exists(artifacts_dir) and self._fs.is_dir(artifacts_dir):
|
|
61
|
+
index_payload = self._load_index(index_path, issues)
|
|
62
|
+
if index_payload is not None:
|
|
63
|
+
self._validate_index(
|
|
64
|
+
index_payload,
|
|
65
|
+
artifacts_dir,
|
|
66
|
+
issues,
|
|
67
|
+
strict=strict,
|
|
68
|
+
)
|
|
69
|
+
except Exception as exc:
|
|
70
|
+
logger.exception("Artifact lint failed: %s", artifacts_dir)
|
|
71
|
+
issues.append(
|
|
72
|
+
ArtifactLintIssue(
|
|
73
|
+
"error",
|
|
74
|
+
"artifacts.lint.exception",
|
|
75
|
+
f"Unexpected error: {exc}",
|
|
76
|
+
)
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
finished_at = datetime.now(UTC)
|
|
80
|
+
duration_ms = int((finished_at - started_at).total_seconds() * 1000)
|
|
81
|
+
status = _resolve_status(issues)
|
|
82
|
+
logger.info("Artifact lint finished: %s (%s)", artifacts_dir, status)
|
|
83
|
+
return ArtifactLintSummary(
|
|
84
|
+
status=status,
|
|
85
|
+
issues=issues,
|
|
86
|
+
artifacts_dir=artifacts_dir,
|
|
87
|
+
index_path=index_path,
|
|
88
|
+
started_at=started_at,
|
|
89
|
+
finished_at=finished_at,
|
|
90
|
+
duration_ms=duration_ms,
|
|
91
|
+
strict=strict,
|
|
92
|
+
)
|
|
93
|
+
|
|
94
|
+
def _validate_dir(self, artifacts_dir: Path, issues: list[ArtifactLintIssue]) -> None:
|
|
95
|
+
if not self._fs.exists(artifacts_dir):
|
|
96
|
+
issues.append(
|
|
97
|
+
ArtifactLintIssue(
|
|
98
|
+
"error",
|
|
99
|
+
"artifacts.dir.missing",
|
|
100
|
+
"Artifacts directory is missing.",
|
|
101
|
+
path=str(artifacts_dir),
|
|
102
|
+
)
|
|
103
|
+
)
|
|
104
|
+
return
|
|
105
|
+
if not self._fs.is_dir(artifacts_dir):
|
|
106
|
+
issues.append(
|
|
107
|
+
ArtifactLintIssue(
|
|
108
|
+
"error",
|
|
109
|
+
"artifacts.dir.not_directory",
|
|
110
|
+
"Artifacts path is not a directory.",
|
|
111
|
+
path=str(artifacts_dir),
|
|
112
|
+
)
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
def _load_index(
|
|
116
|
+
self,
|
|
117
|
+
index_path: Path,
|
|
118
|
+
issues: list[ArtifactLintIssue],
|
|
119
|
+
) -> dict[str, object] | None:
|
|
120
|
+
try:
|
|
121
|
+
payload = json.loads(self._fs.read_text(index_path))
|
|
122
|
+
except json.JSONDecodeError as exc:
|
|
123
|
+
issues.append(
|
|
124
|
+
ArtifactLintIssue(
|
|
125
|
+
"error",
|
|
126
|
+
"artifacts.index.invalid_json",
|
|
127
|
+
f"index.json parse failed: {exc}",
|
|
128
|
+
path=str(index_path),
|
|
129
|
+
)
|
|
130
|
+
)
|
|
131
|
+
return None
|
|
132
|
+
except OSError as exc:
|
|
133
|
+
issues.append(
|
|
134
|
+
ArtifactLintIssue(
|
|
135
|
+
"error",
|
|
136
|
+
"artifacts.index.read_failed",
|
|
137
|
+
f"index.json read failed: {exc}",
|
|
138
|
+
path=str(index_path),
|
|
139
|
+
)
|
|
140
|
+
)
|
|
141
|
+
return None
|
|
142
|
+
|
|
143
|
+
if not isinstance(payload, dict):
|
|
144
|
+
issues.append(
|
|
145
|
+
ArtifactLintIssue(
|
|
146
|
+
"error",
|
|
147
|
+
"artifacts.index.invalid_schema",
|
|
148
|
+
"index.json root must be an object.",
|
|
149
|
+
path=str(index_path),
|
|
150
|
+
)
|
|
151
|
+
)
|
|
152
|
+
return None
|
|
153
|
+
return payload
|
|
154
|
+
|
|
155
|
+
def _validate_index(
|
|
156
|
+
self,
|
|
157
|
+
payload: dict[str, object],
|
|
158
|
+
artifacts_dir: Path,
|
|
159
|
+
issues: list[ArtifactLintIssue],
|
|
160
|
+
*,
|
|
161
|
+
strict: bool,
|
|
162
|
+
) -> None:
|
|
163
|
+
pipeline_id = payload.get("pipeline_id")
|
|
164
|
+
if not isinstance(pipeline_id, str) or not pipeline_id.strip():
|
|
165
|
+
issues.append(
|
|
166
|
+
ArtifactLintIssue(
|
|
167
|
+
"error",
|
|
168
|
+
"artifacts.index.pipeline_id.missing",
|
|
169
|
+
"pipeline_id is missing.",
|
|
170
|
+
)
|
|
171
|
+
)
|
|
172
|
+
|
|
173
|
+
nodes = payload.get("nodes")
|
|
174
|
+
if not isinstance(nodes, list):
|
|
175
|
+
issues.append(
|
|
176
|
+
ArtifactLintIssue(
|
|
177
|
+
"error",
|
|
178
|
+
"artifacts.index.nodes.invalid",
|
|
179
|
+
"nodes list is missing or invalid.",
|
|
180
|
+
)
|
|
181
|
+
)
|
|
182
|
+
return
|
|
183
|
+
|
|
184
|
+
for idx, node in enumerate(nodes, start=1):
|
|
185
|
+
if not isinstance(node, dict):
|
|
186
|
+
issues.append(
|
|
187
|
+
ArtifactLintIssue(
|
|
188
|
+
"error",
|
|
189
|
+
"artifacts.index.node.invalid",
|
|
190
|
+
f"nodes[{idx}] entry must be an object.",
|
|
191
|
+
)
|
|
192
|
+
)
|
|
193
|
+
continue
|
|
194
|
+
node_id = node.get("node_id")
|
|
195
|
+
if not isinstance(node_id, str) or not node_id.strip():
|
|
196
|
+
issues.append(
|
|
197
|
+
ArtifactLintIssue(
|
|
198
|
+
"error",
|
|
199
|
+
"artifacts.index.node_id.missing",
|
|
200
|
+
f"nodes[{idx}] node_id is missing.",
|
|
201
|
+
)
|
|
202
|
+
)
|
|
203
|
+
path_value = node.get("path")
|
|
204
|
+
self._validate_path(
|
|
205
|
+
path_value,
|
|
206
|
+
artifacts_dir,
|
|
207
|
+
issues,
|
|
208
|
+
strict=strict,
|
|
209
|
+
code="artifacts.index.node.path.missing",
|
|
210
|
+
message=f"nodes[{idx}] path is missing.",
|
|
211
|
+
)
|
|
212
|
+
|
|
213
|
+
final_output = payload.get("final_output_path")
|
|
214
|
+
if final_output:
|
|
215
|
+
self._validate_path(
|
|
216
|
+
final_output,
|
|
217
|
+
artifacts_dir,
|
|
218
|
+
issues,
|
|
219
|
+
strict=strict,
|
|
220
|
+
code="artifacts.index.final_output.missing",
|
|
221
|
+
message="final_output_path is missing.",
|
|
222
|
+
)
|
|
223
|
+
|
|
224
|
+
def _validate_path(
|
|
225
|
+
self,
|
|
226
|
+
path_value: object,
|
|
227
|
+
artifacts_dir: Path,
|
|
228
|
+
issues: list[ArtifactLintIssue],
|
|
229
|
+
*,
|
|
230
|
+
strict: bool,
|
|
231
|
+
code: str,
|
|
232
|
+
message: str,
|
|
233
|
+
) -> None:
|
|
234
|
+
if not isinstance(path_value, str) or not path_value.strip():
|
|
235
|
+
issues.append(
|
|
236
|
+
ArtifactLintIssue(
|
|
237
|
+
"error",
|
|
238
|
+
code,
|
|
239
|
+
message,
|
|
240
|
+
)
|
|
241
|
+
)
|
|
242
|
+
return
|
|
243
|
+
|
|
244
|
+
resolved = _resolve_artifact_path(artifacts_dir, Path(path_value))
|
|
245
|
+
if self._fs.exists(resolved):
|
|
246
|
+
return
|
|
247
|
+
issues.append(
|
|
248
|
+
ArtifactLintIssue(
|
|
249
|
+
"error" if strict else "warning",
|
|
250
|
+
code,
|
|
251
|
+
"Artifact file is missing.",
|
|
252
|
+
path=str(resolved),
|
|
253
|
+
)
|
|
254
|
+
)
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def _resolve_artifact_path(base_dir: Path, candidate: Path) -> Path:
|
|
258
|
+
if candidate.is_absolute():
|
|
259
|
+
return candidate
|
|
260
|
+
return base_dir / candidate
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def _resolve_status(issues: list[ArtifactLintIssue]) -> LintStatus:
|
|
264
|
+
if any(issue.level == "error" for issue in issues):
|
|
265
|
+
return "error"
|
|
266
|
+
if any(issue.level == "warning" for issue in issues):
|
|
267
|
+
return "warning"
|
|
268
|
+
return "ok"
|
|
@@ -414,12 +414,7 @@ class KoreanRAGBenchmarkRunner:
|
|
|
414
414
|
try:
|
|
415
415
|
# 형태소 분석 기반 검색
|
|
416
416
|
if retriever:
|
|
417
|
-
|
|
418
|
-
results = retriever.search(
|
|
419
|
-
query, top_k=recall_k, use_dense=retriever.has_embeddings
|
|
420
|
-
)
|
|
421
|
-
else:
|
|
422
|
-
results = retriever.search(query, top_k=recall_k)
|
|
417
|
+
results = retriever.search(query, top_k=recall_k)
|
|
423
418
|
retrieved_doc_ids = [
|
|
424
419
|
resolve_doc_id(getattr(res, "doc_id", None), doc_ids, idx)
|
|
425
420
|
for idx, res in enumerate(results, start=1)
|
|
@@ -17,9 +17,22 @@ REFERENCE_REQUIRED_METRICS = {
|
|
|
17
17
|
}
|
|
18
18
|
|
|
19
19
|
_WHITESPACE_RE = re.compile(r"\s+")
|
|
20
|
+
_PUNCT_ONLY_RE = re.compile(r"^[\W_]+$")
|
|
20
21
|
_HANGUL_RE = re.compile(r"[\uac00-\ud7a3]")
|
|
21
22
|
_LATIN_RE = re.compile(r"[A-Za-z]")
|
|
22
23
|
|
|
24
|
+
_PLACEHOLDER_TEXT = {
|
|
25
|
+
"n/a",
|
|
26
|
+
"na",
|
|
27
|
+
"none",
|
|
28
|
+
"null",
|
|
29
|
+
"nil",
|
|
30
|
+
"unknown",
|
|
31
|
+
"tbd",
|
|
32
|
+
"todo",
|
|
33
|
+
"undefined",
|
|
34
|
+
}
|
|
35
|
+
|
|
23
36
|
|
|
24
37
|
@dataclass(frozen=True)
|
|
25
38
|
class DatasetPreprocessConfig:
|
|
@@ -205,8 +218,18 @@ class DatasetPreprocessor:
|
|
|
205
218
|
if self._config.trim_whitespace:
|
|
206
219
|
text = text.replace("\u00a0", " ")
|
|
207
220
|
text = _WHITESPACE_RE.sub(" ", text).strip()
|
|
221
|
+
if self._is_noise_text(text):
|
|
222
|
+
return ""
|
|
208
223
|
return text
|
|
209
224
|
|
|
225
|
+
def _is_noise_text(self, text: str) -> bool:
|
|
226
|
+
if not text:
|
|
227
|
+
return True
|
|
228
|
+
if _PUNCT_ONLY_RE.fullmatch(text):
|
|
229
|
+
return True
|
|
230
|
+
lower_text = text.casefold()
|
|
231
|
+
return lower_text in _PLACEHOLDER_TEXT
|
|
232
|
+
|
|
210
233
|
def _normalize_contexts(self, contexts: Any) -> tuple[list[str], dict[str, int]]:
|
|
211
234
|
removed = 0
|
|
212
235
|
deduped = 0
|
|
@@ -292,6 +315,9 @@ class DatasetPreprocessor:
|
|
|
292
315
|
elif source == "context":
|
|
293
316
|
filled_from_context = 1
|
|
294
317
|
|
|
318
|
+
if reference:
|
|
319
|
+
reference = self._normalize_text(reference)
|
|
320
|
+
|
|
295
321
|
if reference and self._config.max_reference_chars > 0:
|
|
296
322
|
reference, did_truncate = self._truncate_text(
|
|
297
323
|
reference, self._config.max_reference_chars
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from evalvault.ports.outbound.difficulty_profile_port import DifficultyProfileWriterPort
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class DifficultyProfileReporter:
|
|
9
|
+
def __init__(self, writer: DifficultyProfileWriterPort) -> None:
|
|
10
|
+
self._writer = writer
|
|
11
|
+
|
|
12
|
+
def write(
|
|
13
|
+
self,
|
|
14
|
+
*,
|
|
15
|
+
output_path: Path,
|
|
16
|
+
artifacts_dir: Path,
|
|
17
|
+
envelope: dict[str, object],
|
|
18
|
+
artifacts: dict[str, object],
|
|
19
|
+
) -> dict[str, object]:
|
|
20
|
+
return self._writer.write_profile(
|
|
21
|
+
output_path=output_path,
|
|
22
|
+
artifacts_dir=artifacts_dir,
|
|
23
|
+
envelope=envelope,
|
|
24
|
+
artifacts=artifacts,
|
|
25
|
+
)
|