evalvault-1.64.0-py3-none-any.whl → evalvault-1.65.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. evalvault/adapters/inbound/cli/commands/__init__.py +14 -7
  2. evalvault/adapters/inbound/cli/commands/artifacts.py +107 -0
  3. evalvault/adapters/inbound/cli/commands/calibrate_judge.py +283 -0
  4. evalvault/adapters/inbound/cli/commands/compare.py +290 -0
  5. evalvault/adapters/inbound/cli/commands/history.py +13 -85
  6. evalvault/adapters/inbound/cli/commands/ops.py +110 -0
  7. evalvault/adapters/inbound/cli/commands/profile_difficulty.py +160 -0
  8. evalvault/adapters/inbound/cli/commands/regress.py +251 -0
  9. evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py +49 -0
  10. evalvault/adapters/outbound/artifact_fs.py +16 -0
  11. evalvault/adapters/outbound/filesystem/__init__.py +3 -0
  12. evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py +50 -0
  13. evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py +13 -0
  14. evalvault/adapters/outbound/judge_calibration_adapter.py +36 -0
  15. evalvault/adapters/outbound/judge_calibration_reporter.py +57 -0
  16. evalvault/adapters/outbound/tracker/langfuse_adapter.py +12 -7
  17. evalvault/adapters/outbound/tracker/phoenix_adapter.py +39 -12
  18. evalvault/domain/entities/__init__.py +10 -0
  19. evalvault/domain/entities/judge_calibration.py +50 -0
  20. evalvault/domain/entities/stage.py +11 -3
  21. evalvault/domain/services/artifact_lint_service.py +268 -0
  22. evalvault/domain/services/benchmark_runner.py +1 -6
  23. evalvault/domain/services/dataset_preprocessor.py +26 -0
  24. evalvault/domain/services/difficulty_profile_reporter.py +25 -0
  25. evalvault/domain/services/difficulty_profiling_service.py +304 -0
  26. evalvault/domain/services/evaluator.py +2 -0
  27. evalvault/domain/services/judge_calibration_service.py +495 -0
  28. evalvault/domain/services/ops_snapshot_service.py +159 -0
  29. evalvault/domain/services/regression_gate_service.py +199 -0
  30. evalvault/domain/services/run_comparison_service.py +159 -0
  31. evalvault/domain/services/stage_event_builder.py +6 -1
  32. evalvault/domain/services/stage_metric_service.py +83 -18
  33. evalvault/ports/outbound/__init__.py +4 -0
  34. evalvault/ports/outbound/artifact_fs_port.py +12 -0
  35. evalvault/ports/outbound/comparison_pipeline_port.py +22 -0
  36. evalvault/ports/outbound/difficulty_profile_port.py +15 -0
  37. evalvault/ports/outbound/judge_calibration_port.py +22 -0
  38. evalvault/ports/outbound/ops_snapshot_port.py +8 -0
  39. {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/METADATA +1 -1
  40. {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/RECORD +43 -17
  41. {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/WHEEL +0 -0
  42. {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/entry_points.txt +0 -0
  43. {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -0,0 +1,199 @@
+ """Regression gate service for CLI automation."""
+
+ from __future__ import annotations
+
+ import logging
+ import time
+ from dataclasses import dataclass
+ from datetime import UTC, datetime
+
+ from evalvault.domain.entities.analysis import ComparisonResult, EffectSizeLevel
+ from evalvault.ports.outbound.analysis_port import AnalysisPort
+ from evalvault.ports.outbound.storage_port import StoragePort
+
+ logger = logging.getLogger(__name__)
+
+ TestType = str
+
+
+ @dataclass(frozen=True)
+ class RegressionMetricResult:
+     metric: str
+
+     baseline_score: float
+     candidate_score: float
+     diff: float
+     diff_percent: float
+     p_value: float
+     effect_size: float
+     effect_level: EffectSizeLevel
+     is_significant: bool
+     regression: bool
+
+     @classmethod
+     def from_comparison(
+         cls,
+         comparison: ComparisonResult,
+         *,
+         fail_on_regression: float,
+     ) -> RegressionMetricResult:
+         regression = comparison.diff < -fail_on_regression
+         return cls(
+             metric=comparison.metric,
+             baseline_score=comparison.mean_a,
+             candidate_score=comparison.mean_b,
+             diff=comparison.diff,
+             diff_percent=comparison.diff_percent,
+             p_value=comparison.p_value,
+             effect_size=comparison.effect_size,
+             effect_level=comparison.effect_level,
+             is_significant=comparison.is_significant,
+             regression=regression,
+         )
+
+     def to_dict(self) -> dict[str, float | str | bool]:
+         return {
+             "metric": self.metric,
+             "baseline_score": self.baseline_score,
+             "candidate_score": self.candidate_score,
+             "diff": self.diff,
+             "diff_percent": self.diff_percent,
+             "p_value": self.p_value,
+             "effect_size": self.effect_size,
+             "effect_level": self.effect_level.value,
+             "is_significant": self.is_significant,
+             "regression": self.regression,
+         }
+
+
+ @dataclass(frozen=True)
+ class RegressionGateReport:
+     candidate_run_id: str
+     baseline_run_id: str
+     results: list[RegressionMetricResult]
+     regression_detected: bool
+     fail_on_regression: float
+     test_type: TestType
+     metrics: list[str]
+     started_at: datetime
+     finished_at: datetime
+     duration_ms: int
+     parallel: bool
+     concurrency: int | None
+
+     @property
+     def status(self) -> str:
+         return "failed" if self.regression_detected else "passed"
+
+     def to_dict(self) -> dict[str, object]:
+         return {
+             "candidate_run_id": self.candidate_run_id,
+             "baseline_run_id": self.baseline_run_id,
+             "status": self.status,
+             "regression_detected": self.regression_detected,
+             "fail_on_regression": self.fail_on_regression,
+             "test": self.test_type,
+             "metrics": list(self.metrics),
+             "results": [result.to_dict() for result in self.results],
+             "parallel": self.parallel,
+             "concurrency": self.concurrency,
+         }
+
+
+ class RegressionGateService:
+     def __init__(self, storage: StoragePort, analysis_adapter: AnalysisPort) -> None:
+         self._storage = storage
+         self._analysis = analysis_adapter
+
+     def run_gate(
+         self,
+         candidate_run_id: str,
+         baseline_run_id: str,
+         *,
+         metrics: list[str] | None = None,
+         test_type: TestType = "t-test",
+         fail_on_regression: float = 0.05,
+         parallel: bool = True,
+         concurrency: int | None = None,
+     ) -> RegressionGateReport:
+         start_time = time.monotonic()
+         started_at = datetime.now(UTC)
+         logger.info(
+             "Regression gate start: candidate=%s baseline=%s",
+             candidate_run_id,
+             baseline_run_id,
+         )
+         try:
+             candidate = self._storage.get_run(candidate_run_id)
+             baseline = self._storage.get_run(baseline_run_id)
+
+             requested_metrics = [m for m in (metrics or []) if m]
+             if requested_metrics:
+                 metric_list = requested_metrics
+             else:
+                 metric_list = sorted(
+                     set(candidate.metrics_evaluated) & set(baseline.metrics_evaluated)
+                 )
+
+             if not metric_list:
+                 raise ValueError("No shared metrics available for regression gate.")
+
+             comparisons = self._analysis.compare_runs(
+                 baseline,
+                 candidate,
+                 metrics=metric_list,
+                 test_type=test_type,
+             )
+             if not comparisons:
+                 raise ValueError("No comparable metrics found for regression gate.")
+
+             comparison_map = {result.metric: result for result in comparisons}
+             missing = [metric for metric in metric_list if metric not in comparison_map]
+             if missing:
+                 raise ValueError("Missing comparison results for metrics: " + ", ".join(missing))
+
+             ordered = [comparison_map[metric] for metric in metric_list]
+             results = [
+                 RegressionMetricResult.from_comparison(
+                     comparison,
+                     fail_on_regression=fail_on_regression,
+                 )
+                 for comparison in ordered
+             ]
+             regression_detected = any(result.regression for result in results)
+             finished_at = datetime.now(UTC)
+             duration_ms = int((time.monotonic() - start_time) * 1000)
+             logger.info(
+                 "Regression gate complete: candidate=%s baseline=%s regressions=%s",
+                 candidate_run_id,
+                 baseline_run_id,
+                 regression_detected,
+             )
+             return RegressionGateReport(
+                 candidate_run_id=candidate_run_id,
+                 baseline_run_id=baseline_run_id,
+                 results=results,
+                 regression_detected=regression_detected,
+                 fail_on_regression=fail_on_regression,
+                 test_type=test_type,
+                 metrics=metric_list,
+                 started_at=started_at,
+                 finished_at=finished_at,
+                 duration_ms=duration_ms,
+                 parallel=parallel,
+                 concurrency=concurrency,
+             )
+         except Exception:
+             logger.exception(
+                 "Regression gate failed: candidate=%s baseline=%s",
+                 candidate_run_id,
+                 baseline_run_id,
+             )
+             raise
+
+
+ __all__ = [
+     "RegressionGateReport",
+     "RegressionGateService",
+     "RegressionMetricResult",
+ ]
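
For orientation, a hypothetical call-site sketch for the new RegressionGateService added above (listed as evalvault/domain/services/regression_gate_service.py). The storage_adapter and analysis_adapter arguments, and the check_candidate helper itself, are illustrative stand-ins for whatever StoragePort / AnalysisPort implementations the application wires elsewhere; only the constructor and run_gate() shapes come from the diff itself.

# Hypothetical CI-gate wiring sketch; only the API shape is taken from the diff above.
from evalvault.domain.services.regression_gate_service import RegressionGateService

def check_candidate(storage_adapter, analysis_adapter, candidate_id: str, baseline_id: str) -> int:
    service = RegressionGateService(storage_adapter, analysis_adapter)
    report = service.run_gate(
        candidate_id,
        baseline_id,
        metrics=None,              # None -> shared metrics of both runs, sorted
        test_type="t-test",
        fail_on_regression=0.05,   # flag any metric whose diff drops below -0.05
    )
    for result in report.results:
        print(result.metric, f"{result.diff:+.4f}", "REGRESSED" if result.regression else "ok")
    # report.status is "failed" whenever any metric regressed; map it to a CI exit code.
    return 1 if report.regression_detected else 0
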
@@ -0,0 +1,159 @@
+ from __future__ import annotations
+
+ import logging
+ from dataclasses import dataclass, field
+ from datetime import UTC, datetime
+
+ from evalvault.domain.entities.analysis import ComparisonResult
+ from evalvault.domain.entities.analysis_pipeline import AnalysisIntent, PipelineResult
+ from evalvault.ports.outbound.analysis_port import AnalysisPort
+ from evalvault.ports.outbound.comparison_pipeline_port import ComparisonPipelinePort
+ from evalvault.ports.outbound.storage_port import StoragePort
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass(frozen=True)
+ class RunComparisonRequest:
+     run_id_a: str
+     run_id_b: str
+     metrics: list[str] | None = None
+     test_type: str = "t-test"
+     parallel: bool = False
+     concurrency: int | None = None
+     report_type: str = "comparison"
+     use_llm_report: bool = True
+
+
+ @dataclass
+ class RunComparisonOutcome:
+     run_ids: tuple[str, str]
+     comparisons: list[ComparisonResult]
+     pipeline_result: PipelineResult
+     report_text: str
+     status: str
+     started_at: datetime
+     finished_at: datetime
+     duration_ms: int
+     degraded_reasons: list[str] = field(default_factory=list)
+
+     @property
+     def is_degraded(self) -> bool:
+         return self.status != "ok"
+
+
+ class RunComparisonError(Exception):
+     def __init__(self, message: str, *, exit_code: int = 1):
+         super().__init__(message)
+         self.exit_code = exit_code
+
+
+ class RunComparisonService:
+     def __init__(
+         self,
+         *,
+         storage: StoragePort,
+         analysis_port: AnalysisPort,
+         pipeline_port: ComparisonPipelinePort,
+     ) -> None:
+         self._storage = storage
+         self._analysis = analysis_port
+         self._pipeline = pipeline_port
+
+     def compare_runs(self, request: RunComparisonRequest) -> RunComparisonOutcome:
+         started_at = datetime.now(UTC)
+         logger.info("Starting run comparison: %s vs %s", request.run_id_a, request.run_id_b)
+
+         try:
+             run_a = self._storage.get_run(request.run_id_a)
+             run_b = self._storage.get_run(request.run_id_b)
+         except KeyError as exc:
+             logger.error("Run not found during comparison: %s", exc)
+             raise RunComparisonError("Run을 찾을 수 없습니다.", exit_code=1) from exc
+
+         comparisons = self._analysis.compare_runs(
+             run_a,
+             run_b,
+             metrics=request.metrics,
+             test_type=request.test_type,
+         )
+         if not comparisons:
+             logger.warning("No common metrics to compare for %s vs %s", run_a.run_id, run_b.run_id)
+             raise RunComparisonError("공통 메트릭이 없습니다.", exit_code=1)
+
+         pipeline_error: Exception | None = None
+         try:
+             pipeline_result = self._pipeline.run_comparison(
+                 run_ids=[run_a.run_id, run_b.run_id],
+                 compare_metrics=request.metrics,
+                 test_type=request.test_type,
+                 parallel=request.parallel,
+                 concurrency=request.concurrency,
+                 report_type=request.report_type,
+                 use_llm_report=request.use_llm_report,
+             )
+         except Exception as exc:
+             pipeline_error = exc
+             logger.exception("Comparison pipeline failed: %s", exc)
+             pipeline_result = PipelineResult(
+                 pipeline_id=f"compare-{run_a.run_id[:8]}-{run_b.run_id[:8]}",
+                 intent=AnalysisIntent.GENERATE_COMPARISON,
+             )
+             pipeline_result.mark_complete()
+
+         report_text, report_found = self._extract_markdown_report(pipeline_result)
+         degraded_reasons: list[str] = []
+         if pipeline_error is not None:
+             degraded_reasons.append("pipeline_error")
+         if not report_found:
+             degraded_reasons.append("report_missing")
+         if not pipeline_result.all_succeeded:
+             degraded_reasons.append("pipeline_failed")
+
+         status = "degraded" if degraded_reasons else "ok"
+         if status == "degraded":
+             logger.warning("Comparison report degraded: %s", degraded_reasons)
+         finished_at = datetime.now(UTC)
+         duration_ms = int((finished_at - started_at).total_seconds() * 1000)
+
+         logger.info("Completed run comparison: status=%s duration_ms=%s", status, duration_ms)
+
+         return RunComparisonOutcome(
+             run_ids=(run_a.run_id, run_b.run_id),
+             comparisons=comparisons,
+             pipeline_result=pipeline_result,
+             report_text=report_text,
+             status=status,
+             started_at=started_at,
+             finished_at=finished_at,
+             duration_ms=duration_ms,
+             degraded_reasons=degraded_reasons,
+         )
+
+     @staticmethod
+     def _extract_markdown_report(pipeline_result: PipelineResult) -> tuple[str, bool]:
+         final_output = pipeline_result.final_output
+         if isinstance(final_output, dict):
+             report = RunComparisonService._find_report(final_output)
+             if report:
+                 return report, True
+         return "# 비교 분석 보고서\n\n보고서 본문을 찾지 못했습니다.\n", False
+
+     @staticmethod
+     def _find_report(output: dict) -> str | None:
+         if "report" in output and isinstance(output["report"], str):
+             return output["report"]
+         for value in output.values():
+             if isinstance(value, dict):
+                 nested = RunComparisonService._find_report(value)
+                 if nested:
+                     return nested
+         return None
+
+
+ __all__ = [
+     "RunComparisonService",
+     "RunComparisonRequest",
+     "RunComparisonOutcome",
+     "RunComparisonError",
+ ]
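
Similarly, a hypothetical call-site sketch for RunComparisonService (listed as evalvault/domain/services/run_comparison_service.py). The storage, analysis, and pipeline adapters are assumed to be constructed elsewhere, and the compare helper is illustrative; only the request and outcome shapes come from the diff.

# Hypothetical call-site sketch; only the request/outcome API is taken from the diff above.
from evalvault.domain.services.run_comparison_service import (
    RunComparisonError,
    RunComparisonRequest,
    RunComparisonService,
)

def compare(storage, analysis, pipeline, run_a: str, run_b: str) -> str:
    service = RunComparisonService(storage=storage, analysis_port=analysis, pipeline_port=pipeline)
    request = RunComparisonRequest(run_id_a=run_a, run_id_b=run_b, use_llm_report=False)
    try:
        outcome = service.compare_runs(request)
    except RunComparisonError as exc:
        # exit_code is carried on the exception for CLI adapters to reuse.
        raise SystemExit(exc.exit_code) from exc
    if outcome.is_degraded:
        print("degraded:", ", ".join(outcome.degraded_reasons))
    return outcome.report_text
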
@@ -255,7 +255,12 @@ def _normalize_scores(value: Any) -> list[float]:
      if value is None:
          return []
      if isinstance(value, list | tuple | set):
-         return [_coerce_float(item) for item in value if _coerce_float(item) is not None]
+         scores: list[float] = []
+         for item in value:
+             score = _coerce_float(item)
+             if score is not None:
+                 scores.append(score)
+         return scores
      coerced = _coerce_float(value)
      return [coerced] if coerced is not None else []

@@ -2,6 +2,7 @@

  from __future__ import annotations

+ import math
  from collections.abc import Iterable, Mapping, Sequence
  from typing import Any

@@ -83,8 +84,16 @@ class StageMetricService:
          relevance_map: Mapping[str, set[str]],
      ) -> list[StageMetric]:
          metrics: list[StageMetric] = []
-         doc_ids = _to_str_list(event.attributes.get("doc_ids"))
-         scores = _to_float_list(event.attributes.get("scores"))
+         raw_doc_ids = event.attributes.get("doc_ids")
+         raw_scores = event.attributes.get("scores")
+         unordered_doc_ids = isinstance(raw_doc_ids, set | frozenset)
+         unordered_scores = isinstance(raw_scores, set | frozenset)
+         doc_ids = _to_str_list(raw_doc_ids)
+         scores = _to_float_list(raw_scores)
+         order_reconstructed = None
+         if unordered_doc_ids:
+             doc_ids = sorted(doc_ids)
+             order_reconstructed = "doc_id_asc"

          metrics.append(
              StageMetric(
@@ -92,19 +101,36 @@ class StageMetricService:
                  stage_id=event.stage_id,
                  metric_name="retrieval.result_count",
                  score=float(len(doc_ids)),
-                 evidence={"count": len(doc_ids)},
+                 evidence=_with_order_evidence({"count": len(doc_ids)}, unordered_doc_ids, None),
              )
          )
+         if unordered_doc_ids or unordered_scores:
+             metrics.append(
+                 StageMetric(
+                     run_id=event.run_id,
+                     stage_id=event.stage_id,
+                     metric_name="retrieval.ordering_warning",
+                     score=1.0,
+                     evidence=_with_order_evidence(
+                         {
+                             "doc_ids_unordered": unordered_doc_ids,
+                             "scores_unordered": unordered_scores,
+                         },
+                         True,
+                         order_reconstructed,
+                     ),
+                 )
+             )

          if scores:
-             avg_score = sum(scores) / len(scores)
+             avg_score = _safe_avg(scores)
              metrics.append(
                  StageMetric(
                      run_id=event.run_id,
                      stage_id=event.stage_id,
                      metric_name="retrieval.avg_score",
                      score=avg_score,
-                     evidence={"count": len(scores)},
+                     evidence=_with_order_evidence({"count": len(scores)}, unordered_scores, None),
                  )
              )
              if len(scores) > 1:
@@ -115,14 +141,22 @@ class StageMetricService:
                          stage_id=event.stage_id,
                          metric_name="retrieval.score_gap",
                          score=score_gap,
-                         evidence={"max": max(scores), "min": min(scores)},
+                         evidence=_with_order_evidence(
+                             {"max": max(scores), "min": min(scores)}, unordered_scores, None
+                         ),
                      )
                  )

          relevant_docs = _get_relevant_docs(event, relevance_map)
          if doc_ids and relevant_docs:
              top_k = _coerce_int(event.attributes.get("top_k"), default=len(doc_ids))
-             k = min(top_k, len(doc_ids)) if top_k > 0 else len(doc_ids)
+             k = len(doc_ids) if top_k is None or top_k <= 0 else min(top_k, len(doc_ids))
+             if unordered_scores and scores:
+                 score_pairs = list(zip(doc_ids, scores, strict=False))
+                 score_pairs.sort(key=lambda item: (-item[1], item[0]))
+                 doc_ids = [doc_id for doc_id, _score in score_pairs]
+                 scores = [score for _doc_id, score in score_pairs]
+                 order_reconstructed = "score_desc_then_id"
              retrieved_top_k = doc_ids[:k]
              relevant_found = len(set(retrieved_top_k) & relevant_docs)
@@ -135,11 +169,15 @@ class StageMetricService:
                      stage_id=event.stage_id,
                      metric_name="retrieval.precision_at_k",
                      score=precision,
-                     evidence={
-                         "k": k,
-                         "relevant_found": relevant_found,
-                         "retrieved_count": k,
-                     },
+                     evidence=_with_order_evidence(
+                         {
+                             "k": k,
+                             "relevant_found": relevant_found,
+                             "retrieved_count": k,
+                         },
+                         unordered_doc_ids or unordered_scores,
+                         order_reconstructed,
+                     ),
                  )
              )
              metrics.append(
@@ -148,11 +186,15 @@ class StageMetricService:
                      stage_id=event.stage_id,
                      metric_name="retrieval.recall_at_k",
                      score=recall,
-                     evidence={
-                         "k": k,
-                         "relevant_found": relevant_found,
-                         "relevant_total": len(relevant_docs),
-                     },
+                     evidence=_with_order_evidence(
+                         {
+                             "k": k,
+                             "relevant_found": relevant_found,
+                             "relevant_total": len(relevant_docs),
+                         },
+                         unordered_doc_ids or unordered_scores,
+                         order_reconstructed,
+                     ),
                  )
              )

@@ -180,7 +222,7 @@ class StageMetricService:

          scores = _to_float_list(event.attributes.get("scores"))
          if scores:
-             avg_score = sum(scores) / len(scores)
+             avg_score = _safe_avg(scores)
              metrics.append(
                  StageMetric(
                      run_id=event.run_id,
@@ -358,6 +400,8 @@ def _to_str_list(value: Any) -> list[str]:
          return []
      if isinstance(value, str):
          return [value]
+     if isinstance(value, set | frozenset):
+         return [str(item) for item in value if not isinstance(item, bytes | bytearray)]
      if isinstance(value, Sequence):
          return [str(item) for item in value if not isinstance(item, bytes | bytearray)]
      return [str(value)]
@@ -370,6 +414,8 @@ def _to_str_set(value: Any) -> set[str]:
  def _to_float_list(value: Any) -> list[float]:
      if value is None:
          return []
+     if isinstance(value, set | frozenset):
+         return [float(item) for item in value]
      if isinstance(value, Sequence) and not isinstance(value, str | bytes | bytearray):
          return [float(item) for item in value]
      return [float(value)]
@@ -390,6 +436,25 @@ def _coerce_float(value: Any) -> float | None:
      return None


+ def _safe_avg(values: Sequence[float]) -> float:
+     if not values:
+         return 0.0
+     total = math.fsum(values)
+     return total / len(values)
+
+
+ def _with_order_evidence(
+     evidence: dict[str, Any], unordered: bool, order_reconstructed: str | None
+ ) -> dict[str, Any]:
+     if not unordered:
+         return evidence
+     enriched = dict(evidence)
+     enriched["unordered_input"] = True
+     if order_reconstructed:
+         enriched["order_reconstructed"] = order_reconstructed
+     return enriched
+
+
  def _extract_violation_count(attributes: Mapping[str, Any]) -> int | None:
      violations = attributes.get("violations")
      if isinstance(violations, list | tuple | set):
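
To make the new ordering behaviour concrete, here is a small standalone recap (illustrative values, not EvalVault code) of the reconstruction rule the retrieval metrics above apply when scores arrive as an unordered set: doc_ids and scores are re-paired, sorted by score descending with doc_id as tie-breaker, and the evidence records that the order was reconstructed.

# Standalone illustration of the "score_desc_then_id" reconstruction rule; values are made up.
doc_ids = ["doc-b", "doc-a", "doc-c"]
scores = [0.72, 0.91, 0.65]          # arrived as a set in the event attributes

score_pairs = list(zip(doc_ids, scores, strict=False))
score_pairs.sort(key=lambda item: (-item[1], item[0]))         # highest score first, ties by doc_id
doc_ids = [doc_id for doc_id, _score in score_pairs]           # ['doc-a', 'doc-b', 'doc-c']
scores = [score for _doc_id, score in score_pairs]             # [0.91, 0.72, 0.65]

# Evidence attached to precision/recall then carries the provenance flags:
evidence = {"k": 2, "unordered_input": True, "order_reconstructed": "score_desc_then_id"}
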
@@ -11,6 +11,7 @@ from evalvault.ports.outbound.benchmark_port import (
      BenchmarkTaskResult,
  )
  from evalvault.ports.outbound.causal_analysis_port import CausalAnalysisPort
+ from evalvault.ports.outbound.comparison_pipeline_port import ComparisonPipelinePort
  from evalvault.ports.outbound.dataset_port import DatasetPort
  from evalvault.ports.outbound.domain_memory_port import (
      BehaviorMemoryPort,
@@ -38,6 +39,7 @@ from evalvault.ports.outbound.improvement_port import (
      PlaybookPort,
  )
  from evalvault.ports.outbound.intent_classifier_port import IntentClassifierPort
+ from evalvault.ports.outbound.judge_calibration_port import JudgeCalibrationPort
  from evalvault.ports.outbound.korean_nlp_port import (
      FaithfulnessResultProtocol,
      KoreanNLPToolkitPort,
@@ -58,6 +60,7 @@ from evalvault.ports.outbound.tracker_port import TrackerPort
  __all__ = [
      "AnalysisCachePort",
      "AnalysisPort",
+     "ComparisonPipelinePort",
      "CausalAnalysisPort",
      "DatasetPort",
      "DomainMemoryPort",
@@ -83,6 +86,7 @@ __all__ = [
      "PatternDefinitionProtocol",
      "MetricPlaybookProtocol",
      "ClaimImprovementProtocol",
+     "JudgeCalibrationPort",
      "LLMFactoryPort",
      "LLMPort",
      "MethodRuntime",
@@ -0,0 +1,12 @@
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Protocol
+
+
+ class ArtifactFileSystemPort(Protocol):
+     def exists(self, path: Path) -> bool: ...
+
+     def is_dir(self, path: Path) -> bool: ...
+
+     def read_text(self, path: Path) -> str: ...
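
A minimal sketch of an adapter satisfying ArtifactFileSystemPort, assuming a plain pathlib-backed implementation; the class name here is hypothetical and the shipped adapter (evalvault/adapters/outbound/artifact_fs.py in the file list) may differ.

# Hypothetical pathlib-backed implementation of the Protocol above.
from pathlib import Path

class LocalArtifactFileSystem:
    def exists(self, path: Path) -> bool:
        return path.exists()

    def is_dir(self, path: Path) -> bool:
        return path.is_dir()

    def read_text(self, path: Path) -> str:
        return path.read_text(encoding="utf-8")
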
@@ -0,0 +1,22 @@
+ from __future__ import annotations
+
+ from typing import Protocol
+
+ from evalvault.domain.entities.analysis_pipeline import PipelineResult
+
+
+ class ComparisonPipelinePort(Protocol):
+     def run_comparison(
+         self,
+         *,
+         run_ids: list[str],
+         compare_metrics: list[str] | None,
+         test_type: str,
+         parallel: bool,
+         concurrency: int | None,
+         report_type: str,
+         use_llm_report: bool,
+     ) -> PipelineResult: ...
+
+
+ __all__ = ["ComparisonPipelinePort"]
@@ -0,0 +1,15 @@
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Protocol
+
+
+ class DifficultyProfileWriterPort(Protocol):
+     def write_profile(
+         self,
+         *,
+         output_path: Path,
+         artifacts_dir: Path,
+         envelope: dict[str, object],
+         artifacts: dict[str, object],
+     ) -> dict[str, object]: ...
@@ -0,0 +1,22 @@
+ from __future__ import annotations
+
+ from typing import Protocol
+
+ from evalvault.domain.entities import EvaluationRun, SatisfactionFeedback
+ from evalvault.domain.entities.judge_calibration import JudgeCalibrationResult
+
+
+ class JudgeCalibrationPort(Protocol):
+     def calibrate(
+         self,
+         run: EvaluationRun,
+         feedbacks: list[SatisfactionFeedback],
+         *,
+         labels_source: str,
+         method: str,
+         metrics: list[str],
+         holdout_ratio: float,
+         seed: int,
+         parallel: bool = False,
+         concurrency: int = 8,
+     ) -> JudgeCalibrationResult: ...
@@ -0,0 +1,8 @@
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Any, Protocol
+
+
+ class OpsSnapshotWriterPort(Protocol):
+     def write_snapshot(self, path: Path, payload: dict[str, Any]) -> None: ...
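
A minimal sketch of an OpsSnapshotWriterPort implementation that serialises the payload as JSON; the class name is hypothetical and the shipped writer (evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py in the file list) may differ.

# Hypothetical JSON-writing implementation of the Protocol above.
import json
from pathlib import Path
from typing import Any

class JsonOpsSnapshotWriter:
    def write_snapshot(self, path: Path, payload: dict[str, Any]) -> None:
        # Create the parent directory if needed, then write the snapshot as pretty-printed JSON.
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
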
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: evalvault
- Version: 1.64.0
+ Version: 1.65.0
  Summary: RAG evaluation system using Ragas with Phoenix/Langfuse tracing
  Project-URL: Homepage, https://github.com/ntts9990/EvalVault
  Project-URL: Documentation, https://github.com/ntts9990/EvalVault#readme