evalvault 1.64.0__py3-none-any.whl → 1.65.0__py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- evalvault/adapters/inbound/cli/commands/__init__.py +14 -7
- evalvault/adapters/inbound/cli/commands/artifacts.py +107 -0
- evalvault/adapters/inbound/cli/commands/calibrate_judge.py +283 -0
- evalvault/adapters/inbound/cli/commands/compare.py +290 -0
- evalvault/adapters/inbound/cli/commands/history.py +13 -85
- evalvault/adapters/inbound/cli/commands/ops.py +110 -0
- evalvault/adapters/inbound/cli/commands/profile_difficulty.py +160 -0
- evalvault/adapters/inbound/cli/commands/regress.py +251 -0
- evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py +49 -0
- evalvault/adapters/outbound/artifact_fs.py +16 -0
- evalvault/adapters/outbound/filesystem/__init__.py +3 -0
- evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py +50 -0
- evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py +13 -0
- evalvault/adapters/outbound/judge_calibration_adapter.py +36 -0
- evalvault/adapters/outbound/judge_calibration_reporter.py +57 -0
- evalvault/adapters/outbound/tracker/langfuse_adapter.py +12 -7
- evalvault/adapters/outbound/tracker/phoenix_adapter.py +39 -12
- evalvault/domain/entities/__init__.py +10 -0
- evalvault/domain/entities/judge_calibration.py +50 -0
- evalvault/domain/entities/stage.py +11 -3
- evalvault/domain/services/artifact_lint_service.py +268 -0
- evalvault/domain/services/benchmark_runner.py +1 -6
- evalvault/domain/services/dataset_preprocessor.py +26 -0
- evalvault/domain/services/difficulty_profile_reporter.py +25 -0
- evalvault/domain/services/difficulty_profiling_service.py +304 -0
- evalvault/domain/services/evaluator.py +2 -0
- evalvault/domain/services/judge_calibration_service.py +495 -0
- evalvault/domain/services/ops_snapshot_service.py +159 -0
- evalvault/domain/services/regression_gate_service.py +199 -0
- evalvault/domain/services/run_comparison_service.py +159 -0
- evalvault/domain/services/stage_event_builder.py +6 -1
- evalvault/domain/services/stage_metric_service.py +83 -18
- evalvault/ports/outbound/__init__.py +4 -0
- evalvault/ports/outbound/artifact_fs_port.py +12 -0
- evalvault/ports/outbound/comparison_pipeline_port.py +22 -0
- evalvault/ports/outbound/difficulty_profile_port.py +15 -0
- evalvault/ports/outbound/judge_calibration_port.py +22 -0
- evalvault/ports/outbound/ops_snapshot_port.py +8 -0
- {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/METADATA +1 -1
- {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/RECORD +43 -17
- {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/WHEEL +0 -0
- {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/domain/services/regression_gate_service.py
@@ -0,0 +1,199 @@
+"""Regression gate service for CLI automation."""
+
+from __future__ import annotations
+
+import logging
+import time
+from dataclasses import dataclass
+from datetime import UTC, datetime
+
+from evalvault.domain.entities.analysis import ComparisonResult, EffectSizeLevel
+from evalvault.ports.outbound.analysis_port import AnalysisPort
+from evalvault.ports.outbound.storage_port import StoragePort
+
+logger = logging.getLogger(__name__)
+
+TestType = str
+
+
+@dataclass(frozen=True)
+class RegressionMetricResult:
+    metric: str
+
+    baseline_score: float
+    candidate_score: float
+    diff: float
+    diff_percent: float
+    p_value: float
+    effect_size: float
+    effect_level: EffectSizeLevel
+    is_significant: bool
+    regression: bool
+
+    @classmethod
+    def from_comparison(
+        cls,
+        comparison: ComparisonResult,
+        *,
+        fail_on_regression: float,
+    ) -> RegressionMetricResult:
+        regression = comparison.diff < -fail_on_regression
+        return cls(
+            metric=comparison.metric,
+            baseline_score=comparison.mean_a,
+            candidate_score=comparison.mean_b,
+            diff=comparison.diff,
+            diff_percent=comparison.diff_percent,
+            p_value=comparison.p_value,
+            effect_size=comparison.effect_size,
+            effect_level=comparison.effect_level,
+            is_significant=comparison.is_significant,
+            regression=regression,
+        )
+
+    def to_dict(self) -> dict[str, float | str | bool]:
+        return {
+            "metric": self.metric,
+            "baseline_score": self.baseline_score,
+            "candidate_score": self.candidate_score,
+            "diff": self.diff,
+            "diff_percent": self.diff_percent,
+            "p_value": self.p_value,
+            "effect_size": self.effect_size,
+            "effect_level": self.effect_level.value,
+            "is_significant": self.is_significant,
+            "regression": self.regression,
+        }
+
+
+@dataclass(frozen=True)
+class RegressionGateReport:
+    candidate_run_id: str
+    baseline_run_id: str
+    results: list[RegressionMetricResult]
+    regression_detected: bool
+    fail_on_regression: float
+    test_type: TestType
+    metrics: list[str]
+    started_at: datetime
+    finished_at: datetime
+    duration_ms: int
+    parallel: bool
+    concurrency: int | None
+
+    @property
+    def status(self) -> str:
+        return "failed" if self.regression_detected else "passed"
+
+    def to_dict(self) -> dict[str, object]:
+        return {
+            "candidate_run_id": self.candidate_run_id,
+            "baseline_run_id": self.baseline_run_id,
+            "status": self.status,
+            "regression_detected": self.regression_detected,
+            "fail_on_regression": self.fail_on_regression,
+            "test": self.test_type,
+            "metrics": list(self.metrics),
+            "results": [result.to_dict() for result in self.results],
+            "parallel": self.parallel,
+            "concurrency": self.concurrency,
+        }
+
+
+class RegressionGateService:
+    def __init__(self, storage: StoragePort, analysis_adapter: AnalysisPort) -> None:
+        self._storage = storage
+        self._analysis = analysis_adapter
+
+    def run_gate(
+        self,
+        candidate_run_id: str,
+        baseline_run_id: str,
+        *,
+        metrics: list[str] | None = None,
+        test_type: TestType = "t-test",
+        fail_on_regression: float = 0.05,
+        parallel: bool = True,
+        concurrency: int | None = None,
+    ) -> RegressionGateReport:
+        start_time = time.monotonic()
+        started_at = datetime.now(UTC)
+        logger.info(
+            "Regression gate start: candidate=%s baseline=%s",
+            candidate_run_id,
+            baseline_run_id,
+        )
+        try:
+            candidate = self._storage.get_run(candidate_run_id)
+            baseline = self._storage.get_run(baseline_run_id)
+
+            requested_metrics = [m for m in (metrics or []) if m]
+            if requested_metrics:
+                metric_list = requested_metrics
+            else:
+                metric_list = sorted(
+                    set(candidate.metrics_evaluated) & set(baseline.metrics_evaluated)
+                )
+
+            if not metric_list:
+                raise ValueError("No shared metrics available for regression gate.")
+
+            comparisons = self._analysis.compare_runs(
+                baseline,
+                candidate,
+                metrics=metric_list,
+                test_type=test_type,
+            )
+            if not comparisons:
+                raise ValueError("No comparable metrics found for regression gate.")
+
+            comparison_map = {result.metric: result for result in comparisons}
+            missing = [metric for metric in metric_list if metric not in comparison_map]
+            if missing:
+                raise ValueError("Missing comparison results for metrics: " + ", ".join(missing))
+
+            ordered = [comparison_map[metric] for metric in metric_list]
+            results = [
+                RegressionMetricResult.from_comparison(
+                    comparison,
+                    fail_on_regression=fail_on_regression,
+                )
+                for comparison in ordered
+            ]
+            regression_detected = any(result.regression for result in results)
+            finished_at = datetime.now(UTC)
+            duration_ms = int((time.monotonic() - start_time) * 1000)
+            logger.info(
+                "Regression gate complete: candidate=%s baseline=%s regressions=%s",
+                candidate_run_id,
+                baseline_run_id,
+                regression_detected,
+            )
+            return RegressionGateReport(
+                candidate_run_id=candidate_run_id,
+                baseline_run_id=baseline_run_id,
+                results=results,
+                regression_detected=regression_detected,
+                fail_on_regression=fail_on_regression,
+                test_type=test_type,
+                metrics=metric_list,
+                started_at=started_at,
+                finished_at=finished_at,
+                duration_ms=duration_ms,
+                parallel=parallel,
+                concurrency=concurrency,
+            )
+        except Exception:
+            logger.exception(
+                "Regression gate failed: candidate=%s baseline=%s",
+                candidate_run_id,
+                baseline_run_id,
+            )
+            raise
+
+
+__all__ = [
+    "RegressionGateReport",
+    "RegressionGateService",
+    "RegressionMetricResult",
+]
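The gate's contract is simple: a metric counts as a regression when comparison.diff falls below -fail_on_regression, and the report's status is "failed" as soon as any metric regresses. The following usage sketch is illustrative only; the helper name and run IDs are hypothetical, and the caller supplies whatever StoragePort and AnalysisPort adapters the application already wires up.

from evalvault.domain.services.regression_gate_service import RegressionGateService
from evalvault.ports.outbound.analysis_port import AnalysisPort
from evalvault.ports.outbound.storage_port import StoragePort


def run_ci_gate(storage: StoragePort, analysis: AnalysisPort) -> int:
    # Hypothetical CI helper: callers pass in already-configured adapters.
    gate = RegressionGateService(storage, analysis)
    report = gate.run_gate(
        "candidate-run-id",
        "baseline-run-id",
        metrics=None,             # None -> use the metrics shared by both runs
        fail_on_regression=0.05,  # regression if a metric's diff drops below -0.05
    )
    print(report.status)          # "passed" or "failed"
    payload = report.to_dict()    # JSON-friendly summary for CI logs
    return 1 if report.regression_detected else 0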
evalvault/domain/services/run_comparison_service.py
@@ -0,0 +1,159 @@
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass, field
+from datetime import UTC, datetime
+
+from evalvault.domain.entities.analysis import ComparisonResult
+from evalvault.domain.entities.analysis_pipeline import AnalysisIntent, PipelineResult
+from evalvault.ports.outbound.analysis_port import AnalysisPort
+from evalvault.ports.outbound.comparison_pipeline_port import ComparisonPipelinePort
+from evalvault.ports.outbound.storage_port import StoragePort
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True)
+class RunComparisonRequest:
+    run_id_a: str
+    run_id_b: str
+    metrics: list[str] | None = None
+    test_type: str = "t-test"
+    parallel: bool = False
+    concurrency: int | None = None
+    report_type: str = "comparison"
+    use_llm_report: bool = True
+
+
+@dataclass
+class RunComparisonOutcome:
+    run_ids: tuple[str, str]
+    comparisons: list[ComparisonResult]
+    pipeline_result: PipelineResult
+    report_text: str
+    status: str
+    started_at: datetime
+    finished_at: datetime
+    duration_ms: int
+    degraded_reasons: list[str] = field(default_factory=list)
+
+    @property
+    def is_degraded(self) -> bool:
+        return self.status != "ok"
+
+
+class RunComparisonError(Exception):
+    def __init__(self, message: str, *, exit_code: int = 1):
+        super().__init__(message)
+        self.exit_code = exit_code
+
+
+class RunComparisonService:
+    def __init__(
+        self,
+        *,
+        storage: StoragePort,
+        analysis_port: AnalysisPort,
+        pipeline_port: ComparisonPipelinePort,
+    ) -> None:
+        self._storage = storage
+        self._analysis = analysis_port
+        self._pipeline = pipeline_port
+
+    def compare_runs(self, request: RunComparisonRequest) -> RunComparisonOutcome:
+        started_at = datetime.now(UTC)
+        logger.info("Starting run comparison: %s vs %s", request.run_id_a, request.run_id_b)
+
+        try:
+            run_a = self._storage.get_run(request.run_id_a)
+            run_b = self._storage.get_run(request.run_id_b)
+        except KeyError as exc:
+            logger.error("Run not found during comparison: %s", exc)
+            raise RunComparisonError("Run을 찾을 수 없습니다.", exit_code=1) from exc
+
+        comparisons = self._analysis.compare_runs(
+            run_a,
+            run_b,
+            metrics=request.metrics,
+            test_type=request.test_type,
+        )
+        if not comparisons:
+            logger.warning("No common metrics to compare for %s vs %s", run_a.run_id, run_b.run_id)
+            raise RunComparisonError("공통 메트릭이 없습니다.", exit_code=1)
+
+        pipeline_error: Exception | None = None
+        try:
+            pipeline_result = self._pipeline.run_comparison(
+                run_ids=[run_a.run_id, run_b.run_id],
+                compare_metrics=request.metrics,
+                test_type=request.test_type,
+                parallel=request.parallel,
+                concurrency=request.concurrency,
+                report_type=request.report_type,
+                use_llm_report=request.use_llm_report,
+            )
+        except Exception as exc:
+            pipeline_error = exc
+            logger.exception("Comparison pipeline failed: %s", exc)
+            pipeline_result = PipelineResult(
+                pipeline_id=f"compare-{run_a.run_id[:8]}-{run_b.run_id[:8]}",
+                intent=AnalysisIntent.GENERATE_COMPARISON,
+            )
+            pipeline_result.mark_complete()
+
+        report_text, report_found = self._extract_markdown_report(pipeline_result)
+        degraded_reasons: list[str] = []
+        if pipeline_error is not None:
+            degraded_reasons.append("pipeline_error")
+        if not report_found:
+            degraded_reasons.append("report_missing")
+        if not pipeline_result.all_succeeded:
+            degraded_reasons.append("pipeline_failed")
+
+        status = "degraded" if degraded_reasons else "ok"
+        if status == "degraded":
+            logger.warning("Comparison report degraded: %s", degraded_reasons)
+        finished_at = datetime.now(UTC)
+        duration_ms = int((finished_at - started_at).total_seconds() * 1000)
+
+        logger.info("Completed run comparison: status=%s duration_ms=%s", status, duration_ms)
+
+        return RunComparisonOutcome(
+            run_ids=(run_a.run_id, run_b.run_id),
+            comparisons=comparisons,
+            pipeline_result=pipeline_result,
+            report_text=report_text,
+            status=status,
+            started_at=started_at,
+            finished_at=finished_at,
+            duration_ms=duration_ms,
+            degraded_reasons=degraded_reasons,
+        )
+
+    @staticmethod
+    def _extract_markdown_report(pipeline_result: PipelineResult) -> tuple[str, bool]:
+        final_output = pipeline_result.final_output
+        if isinstance(final_output, dict):
+            report = RunComparisonService._find_report(final_output)
+            if report:
+                return report, True
+        return "# 비교 분석 보고서\n\n보고서 본문을 찾지 못했습니다.\n", False
+
+    @staticmethod
+    def _find_report(output: dict) -> str | None:
+        if "report" in output and isinstance(output["report"], str):
+            return output["report"]
+        for value in output.values():
+            if isinstance(value, dict):
+                nested = RunComparisonService._find_report(value)
+                if nested:
+                    return nested
+        return None
+
+
+__all__ = [
+    "RunComparisonService",
+    "RunComparisonRequest",
+    "RunComparisonOutcome",
+    "RunComparisonError",
+]
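RunComparisonService degrades rather than failing when the reporting pipeline breaks: a pipeline exception or a missing markdown report still yields an outcome, just with status "degraded" and the reasons recorded. A hedged call sketch follows; the helper name and run IDs are illustrative, and the caller supplies the adapter instances.

from evalvault.domain.services.run_comparison_service import (
    RunComparisonRequest,
    RunComparisonService,
)
from evalvault.ports.outbound.analysis_port import AnalysisPort
from evalvault.ports.outbound.comparison_pipeline_port import ComparisonPipelinePort
from evalvault.ports.outbound.storage_port import StoragePort


def compare_two_runs(
    storage: StoragePort,
    analysis: AnalysisPort,
    pipeline: ComparisonPipelinePort,
) -> str:
    # Hypothetical helper: returns the markdown report and surfaces degradation reasons.
    service = RunComparisonService(
        storage=storage,
        analysis_port=analysis,
        pipeline_port=pipeline,
    )
    outcome = service.compare_runs(
        RunComparisonRequest(run_id_a="run-a", run_id_b="run-b")
    )
    if outcome.is_degraded:
        print("degraded:", outcome.degraded_reasons)  # e.g. ["report_missing"]
    return outcome.report_text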
@@ -255,7 +255,12 @@ def _normalize_scores(value: Any) -> list[float]:
     if value is None:
         return []
     if isinstance(value, list | tuple | set):
-
+        scores: list[float] = []
+        for item in value:
+            score = _coerce_float(item)
+            if score is not None:
+                scores.append(score)
+        return scores
     coerced = _coerce_float(value)
     return [coerced] if coerced is not None else []
 
evalvault/domain/services/stage_metric_service.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import math
 from collections.abc import Iterable, Mapping, Sequence
 from typing import Any
 

@@ -83,8 +84,16 @@ class StageMetricService:
         relevance_map: Mapping[str, set[str]],
     ) -> list[StageMetric]:
         metrics: list[StageMetric] = []
-
-
+        raw_doc_ids = event.attributes.get("doc_ids")
+        raw_scores = event.attributes.get("scores")
+        unordered_doc_ids = isinstance(raw_doc_ids, set | frozenset)
+        unordered_scores = isinstance(raw_scores, set | frozenset)
+        doc_ids = _to_str_list(raw_doc_ids)
+        scores = _to_float_list(raw_scores)
+        order_reconstructed = None
+        if unordered_doc_ids:
+            doc_ids = sorted(doc_ids)
+            order_reconstructed = "doc_id_asc"
 
         metrics.append(
             StageMetric(

@@ -92,19 +101,36 @@ class StageMetricService:
                 stage_id=event.stage_id,
                 metric_name="retrieval.result_count",
                 score=float(len(doc_ids)),
-                evidence={"count": len(doc_ids)},
+                evidence=_with_order_evidence({"count": len(doc_ids)}, unordered_doc_ids, None),
             )
         )
+        if unordered_doc_ids or unordered_scores:
+            metrics.append(
+                StageMetric(
+                    run_id=event.run_id,
+                    stage_id=event.stage_id,
+                    metric_name="retrieval.ordering_warning",
+                    score=1.0,
+                    evidence=_with_order_evidence(
+                        {
+                            "doc_ids_unordered": unordered_doc_ids,
+                            "scores_unordered": unordered_scores,
+                        },
+                        True,
+                        order_reconstructed,
+                    ),
+                )
+            )
 
         if scores:
-            avg_score =
+            avg_score = _safe_avg(scores)
             metrics.append(
                 StageMetric(
                     run_id=event.run_id,
                     stage_id=event.stage_id,
                     metric_name="retrieval.avg_score",
                     score=avg_score,
-                    evidence={"count": len(scores)},
+                    evidence=_with_order_evidence({"count": len(scores)}, unordered_scores, None),
                 )
             )
         if len(scores) > 1:

@@ -115,14 +141,22 @@ class StageMetricService:
                     stage_id=event.stage_id,
                     metric_name="retrieval.score_gap",
                     score=score_gap,
-                    evidence=
+                    evidence=_with_order_evidence(
+                        {"max": max(scores), "min": min(scores)}, unordered_scores, None
+                    ),
                 )
             )
 
         relevant_docs = _get_relevant_docs(event, relevance_map)
         if doc_ids and relevant_docs:
             top_k = _coerce_int(event.attributes.get("top_k"), default=len(doc_ids))
-            k =
+            k = len(doc_ids) if top_k is None or top_k <= 0 else min(top_k, len(doc_ids))
+            if unordered_scores and scores:
+                score_pairs = list(zip(doc_ids, scores, strict=False))
+                score_pairs.sort(key=lambda item: (-item[1], item[0]))
+                doc_ids = [doc_id for doc_id, _score in score_pairs]
+                scores = [score for _doc_id, score in score_pairs]
+                order_reconstructed = "score_desc_then_id"
             retrieved_top_k = doc_ids[:k]
             relevant_found = len(set(retrieved_top_k) & relevant_docs)
 

@@ -135,11 +169,15 @@ class StageMetricService:
                     stage_id=event.stage_id,
                     metric_name="retrieval.precision_at_k",
                     score=precision,
-                    evidence=
-
-
-
-
+                    evidence=_with_order_evidence(
+                        {
+                            "k": k,
+                            "relevant_found": relevant_found,
+                            "retrieved_count": k,
+                        },
+                        unordered_doc_ids or unordered_scores,
+                        order_reconstructed,
+                    ),
                 )
             )
             metrics.append(

@@ -148,11 +186,15 @@ class StageMetricService:
                     stage_id=event.stage_id,
                     metric_name="retrieval.recall_at_k",
                     score=recall,
-                    evidence=
-
-
-
-
+                    evidence=_with_order_evidence(
+                        {
+                            "k": k,
+                            "relevant_found": relevant_found,
+                            "relevant_total": len(relevant_docs),
+                        },
+                        unordered_doc_ids or unordered_scores,
+                        order_reconstructed,
+                    ),
                 )
             )
 

@@ -180,7 +222,7 @@ class StageMetricService:
 
         scores = _to_float_list(event.attributes.get("scores"))
         if scores:
-            avg_score =
+            avg_score = _safe_avg(scores)
             metrics.append(
                 StageMetric(
                     run_id=event.run_id,

@@ -358,6 +400,8 @@ def _to_str_list(value: Any) -> list[str]:
         return []
     if isinstance(value, str):
         return [value]
+    if isinstance(value, set | frozenset):
+        return [str(item) for item in value if not isinstance(item, bytes | bytearray)]
     if isinstance(value, Sequence):
         return [str(item) for item in value if not isinstance(item, bytes | bytearray)]
     return [str(value)]

@@ -370,6 +414,8 @@ def _to_str_set(value: Any) -> set[str]:
 def _to_float_list(value: Any) -> list[float]:
     if value is None:
         return []
+    if isinstance(value, set | frozenset):
+        return [float(item) for item in value]
     if isinstance(value, Sequence) and not isinstance(value, str | bytes | bytearray):
         return [float(item) for item in value]
     return [float(value)]

@@ -390,6 +436,25 @@ def _coerce_float(value: Any) -> float | None:
     return None
 
 
+def _safe_avg(values: Sequence[float]) -> float:
+    if not values:
+        return 0.0
+    total = math.fsum(values)
+    return total / len(values)
+
+
+def _with_order_evidence(
+    evidence: dict[str, Any], unordered: bool, order_reconstructed: str | None
+) -> dict[str, Any]:
+    if not unordered:
+        return evidence
+    enriched = dict(evidence)
+    enriched["unordered_input"] = True
+    if order_reconstructed:
+        enriched["order_reconstructed"] = order_reconstructed
+    return enriched
+
+
 def _extract_violation_count(attributes: Mapping[str, Any]) -> int | None:
     violations = attributes.get("violations")
     if isinstance(violations, list | tuple | set):
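Net effect of the stage-metric changes above: doc_ids and scores supplied as sets no longer depend on arbitrary iteration order. The service records that fact in the affected evidence payloads, emits a retrieval.ordering_warning metric when either input was a set, and rebuilds a deterministic order (doc IDs ascending, then score-descending with doc-ID tiebreak once scores can be paired in) before computing precision/recall at k. The evidence enrichment is small enough to restate verbatim, with a usage example:

from typing import Any


def _with_order_evidence(
    evidence: dict[str, Any], unordered: bool, order_reconstructed: str | None
) -> dict[str, Any]:
    # Same logic as the hunk above: annotate evidence only when the input was unordered.
    if not unordered:
        return evidence
    enriched = dict(evidence)
    enriched["unordered_input"] = True
    if order_reconstructed:
        enriched["order_reconstructed"] = order_reconstructed
    return enriched


print(_with_order_evidence({"count": 3}, False, None))
# {'count': 3}
print(_with_order_evidence({"count": 3}, True, "doc_id_asc"))
# {'count': 3, 'unordered_input': True, 'order_reconstructed': 'doc_id_asc'}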
evalvault/ports/outbound/__init__.py
@@ -11,6 +11,7 @@ from evalvault.ports.outbound.benchmark_port import (
     BenchmarkTaskResult,
 )
 from evalvault.ports.outbound.causal_analysis_port import CausalAnalysisPort
+from evalvault.ports.outbound.comparison_pipeline_port import ComparisonPipelinePort
 from evalvault.ports.outbound.dataset_port import DatasetPort
 from evalvault.ports.outbound.domain_memory_port import (
     BehaviorMemoryPort,

@@ -38,6 +39,7 @@ from evalvault.ports.outbound.improvement_port import (
     PlaybookPort,
 )
 from evalvault.ports.outbound.intent_classifier_port import IntentClassifierPort
+from evalvault.ports.outbound.judge_calibration_port import JudgeCalibrationPort
 from evalvault.ports.outbound.korean_nlp_port import (
     FaithfulnessResultProtocol,
     KoreanNLPToolkitPort,

@@ -58,6 +60,7 @@ from evalvault.ports.outbound.tracker_port import TrackerPort
 __all__ = [
     "AnalysisCachePort",
     "AnalysisPort",
+    "ComparisonPipelinePort",
     "CausalAnalysisPort",
     "DatasetPort",
     "DomainMemoryPort",

@@ -83,6 +86,7 @@ __all__ = [
     "PatternDefinitionProtocol",
     "MetricPlaybookProtocol",
     "ClaimImprovementProtocol",
+    "JudgeCalibrationPort",
     "LLMFactoryPort",
     "LLMPort",
     "MethodRuntime",
evalvault/ports/outbound/artifact_fs_port.py
@@ -0,0 +1,12 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Protocol
+
+
+class ArtifactFileSystemPort(Protocol):
+    def exists(self, path: Path) -> bool: ...
+
+    def is_dir(self, path: Path) -> bool: ...
+
+    def read_text(self, path: Path) -> str: ...
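The port is a minimal read-only view of the filesystem, presumably so the new artifact lint service can be exercised without touching disk. The concrete adapter ships in evalvault/adapters/outbound/artifact_fs.py (not shown in this diff); a pathlib-backed implementation could plausibly be as small as the following sketch:

from pathlib import Path


class LocalArtifactFileSystem:
    # Hypothetical adapter; structurally satisfies ArtifactFileSystemPort.
    def exists(self, path: Path) -> bool:
        return path.exists()

    def is_dir(self, path: Path) -> bool:
        return path.is_dir()

    def read_text(self, path: Path) -> str:
        return path.read_text(encoding="utf-8")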
evalvault/ports/outbound/comparison_pipeline_port.py
@@ -0,0 +1,22 @@
+from __future__ import annotations
+
+from typing import Protocol
+
+from evalvault.domain.entities.analysis_pipeline import PipelineResult
+
+
+class ComparisonPipelinePort(Protocol):
+    def run_comparison(
+        self,
+        *,
+        run_ids: list[str],
+        compare_metrics: list[str] | None,
+        test_type: str,
+        parallel: bool,
+        concurrency: int | None,
+        report_type: str,
+        use_llm_report: bool,
+    ) -> PipelineResult: ...
+
+
+__all__ = ["ComparisonPipelinePort"]
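Because ComparisonPipelinePort is a typing.Protocol, any object with a structurally matching run_comparison method satisfies it; the shipped implementation is the comparison_pipeline_adapter.py listed above. For tests, a no-op stub can reuse the same fallback construction RunComparisonService uses when the pipeline fails. This is a sketch under the assumption that PipelineResult needs nothing beyond the two arguments shown in that fallback path.

from evalvault.domain.entities.analysis_pipeline import AnalysisIntent, PipelineResult


class StubComparisonPipeline:
    # Hypothetical test double; structurally satisfies ComparisonPipelinePort.
    def run_comparison(
        self,
        *,
        run_ids: list[str],
        compare_metrics: list[str] | None,
        test_type: str,
        parallel: bool,
        concurrency: int | None,
        report_type: str,
        use_llm_report: bool,
    ) -> PipelineResult:
        result = PipelineResult(
            pipeline_id=f"stub-{run_ids[0][:8]}-{run_ids[1][:8]}",
            intent=AnalysisIntent.GENERATE_COMPARISON,
        )
        result.mark_complete()  # mirrors the degraded-path construction in RunComparisonService
        return result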
evalvault/ports/outbound/difficulty_profile_port.py
@@ -0,0 +1,15 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Protocol
+
+
+class DifficultyProfileWriterPort(Protocol):
+    def write_profile(
+        self,
+        *,
+        output_path: Path,
+        artifacts_dir: Path,
+        envelope: dict[str, object],
+        artifacts: dict[str, object],
+    ) -> dict[str, object]: ...
evalvault/ports/outbound/judge_calibration_port.py
@@ -0,0 +1,22 @@
+from __future__ import annotations
+
+from typing import Protocol
+
+from evalvault.domain.entities import EvaluationRun, SatisfactionFeedback
+from evalvault.domain.entities.judge_calibration import JudgeCalibrationResult
+
+
+class JudgeCalibrationPort(Protocol):
+    def calibrate(
+        self,
+        run: EvaluationRun,
+        feedbacks: list[SatisfactionFeedback],
+        *,
+        labels_source: str,
+        method: str,
+        metrics: list[str],
+        holdout_ratio: float,
+        seed: int,
+        parallel: bool = False,
+        concurrency: int = 8,
+    ) -> JudgeCalibrationResult: ...
{evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: evalvault
-Version: 1.64.0
+Version: 1.65.0
 Summary: RAG evaluation system using Ragas with Phoenix/Langfuse tracing
 Project-URL: Homepage, https://github.com/ntts9990/EvalVault
 Project-URL: Documentation, https://github.com/ntts9990/EvalVault#readme