evalvault 1.64.0-py3-none-any.whl → 1.66.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. evalvault/adapters/inbound/api/adapter.py +14 -0
  2. evalvault/adapters/inbound/api/main.py +14 -4
  3. evalvault/adapters/inbound/api/routers/chat.py +543 -0
  4. evalvault/adapters/inbound/cli/commands/__init__.py +14 -7
  5. evalvault/adapters/inbound/cli/commands/artifacts.py +107 -0
  6. evalvault/adapters/inbound/cli/commands/calibrate_judge.py +283 -0
  7. evalvault/adapters/inbound/cli/commands/compare.py +290 -0
  8. evalvault/adapters/inbound/cli/commands/history.py +13 -85
  9. evalvault/adapters/inbound/cli/commands/ops.py +110 -0
  10. evalvault/adapters/inbound/cli/commands/profile_difficulty.py +160 -0
  11. evalvault/adapters/inbound/cli/commands/regress.py +251 -0
  12. evalvault/adapters/inbound/cli/commands/run.py +14 -0
  13. evalvault/adapters/inbound/cli/commands/run_helpers.py +21 -2
  14. evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py +49 -0
  15. evalvault/adapters/outbound/artifact_fs.py +16 -0
  16. evalvault/adapters/outbound/filesystem/__init__.py +3 -0
  17. evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py +50 -0
  18. evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py +13 -0
  19. evalvault/adapters/outbound/judge_calibration_adapter.py +36 -0
  20. evalvault/adapters/outbound/judge_calibration_reporter.py +57 -0
  21. evalvault/adapters/outbound/report/llm_report_generator.py +13 -1
  22. evalvault/adapters/outbound/storage/base_sql.py +41 -1
  23. evalvault/adapters/outbound/tracker/langfuse_adapter.py +13 -7
  24. evalvault/adapters/outbound/tracker/mlflow_adapter.py +5 -0
  25. evalvault/adapters/outbound/tracker/phoenix_adapter.py +68 -14
  26. evalvault/config/settings.py +21 -0
  27. evalvault/domain/entities/__init__.py +10 -0
  28. evalvault/domain/entities/judge_calibration.py +50 -0
  29. evalvault/domain/entities/prompt.py +1 -1
  30. evalvault/domain/entities/stage.py +11 -3
  31. evalvault/domain/metrics/__init__.py +8 -0
  32. evalvault/domain/metrics/registry.py +39 -3
  33. evalvault/domain/metrics/summary_accuracy.py +189 -0
  34. evalvault/domain/metrics/summary_needs_followup.py +45 -0
  35. evalvault/domain/metrics/summary_non_definitive.py +41 -0
  36. evalvault/domain/metrics/summary_risk_coverage.py +45 -0
  37. evalvault/domain/services/artifact_lint_service.py +268 -0
  38. evalvault/domain/services/benchmark_runner.py +1 -6
  39. evalvault/domain/services/custom_metric_snapshot.py +233 -0
  40. evalvault/domain/services/dataset_preprocessor.py +26 -0
  41. evalvault/domain/services/difficulty_profile_reporter.py +25 -0
  42. evalvault/domain/services/difficulty_profiling_service.py +304 -0
  43. evalvault/domain/services/evaluator.py +282 -27
  44. evalvault/domain/services/judge_calibration_service.py +495 -0
  45. evalvault/domain/services/ops_snapshot_service.py +159 -0
  46. evalvault/domain/services/prompt_registry.py +39 -10
  47. evalvault/domain/services/regression_gate_service.py +199 -0
  48. evalvault/domain/services/run_comparison_service.py +159 -0
  49. evalvault/domain/services/stage_event_builder.py +6 -1
  50. evalvault/domain/services/stage_metric_service.py +83 -18
  51. evalvault/domain/services/threshold_profiles.py +4 -0
  52. evalvault/domain/services/visual_space_service.py +79 -4
  53. evalvault/ports/outbound/__init__.py +4 -0
  54. evalvault/ports/outbound/artifact_fs_port.py +12 -0
  55. evalvault/ports/outbound/comparison_pipeline_port.py +22 -0
  56. evalvault/ports/outbound/difficulty_profile_port.py +15 -0
  57. evalvault/ports/outbound/judge_calibration_port.py +22 -0
  58. evalvault/ports/outbound/ops_snapshot_port.py +8 -0
  59. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/METADATA +25 -1
  60. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/RECORD +63 -31
  61. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/WHEEL +0 -0
  62. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/entry_points.txt +0 -0
  63. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -109,30 +109,59 @@ def build_prompt_summary(bundle: PromptSetBundle) -> dict[str, Any]:
 
 def build_prompt_inputs_from_snapshots(
     snapshots: dict[str, dict[str, Any]] | None,
+    *,
+    kind: PromptKind = "ragas",
+    source: str | None = None,
 ) -> list[PromptInput]:
     if not snapshots:
         return []
     prompt_inputs: list[PromptInput] = []
     for metric_name, entry in snapshots.items():
-        prompt_text = entry.get("prompt") if isinstance(entry, dict) else None
+        if not isinstance(entry, dict):
+            continue
+        entry_source = entry.get("source")
+        resolved_source = source if source else entry_source
+        metadata = {key: value for key, value in entry.items() if key != "prompt"}
+
+        prompts_map = entry.get("prompts")
+        if isinstance(prompts_map, dict) and prompts_map:
+            for prompt_key, prompt_text in prompts_map.items():
+                if not isinstance(prompt_text, str):
+                    continue
+                normalized = prompt_text.strip()
+                if not normalized:
+                    continue
+                prompt_inputs.append(
+                    PromptInput(
+                        content=normalized,
+                        name=f"{kind}.{metric_name}.{prompt_key}",
+                        kind=kind,
+                        role=f"{metric_name}.{prompt_key}",
+                        source=(
+                            resolved_source
+                            if isinstance(resolved_source, str) and resolved_source
+                            else kind
+                        ),
+                        metadata=metadata or None,
+                    )
+                )
+            continue
+
+        prompt_text = entry.get("prompt")
         if not isinstance(prompt_text, str):
             continue
         prompt_text = prompt_text.strip()
         if not prompt_text:
             continue
-        source = entry.get("source") if isinstance(entry, dict) else None
-        metadata = {
-            key: value
-            for key, value in entry.items()
-            if key != "prompt" and isinstance(entry, dict)
-        }
         prompt_inputs.append(
             PromptInput(
                 content=prompt_text,
-                name=f"ragas.{metric_name}",
-                kind="ragas",
+                name=f"{kind}.{metric_name}",
+                kind=kind,
                 role=str(metric_name),
-                source=source if isinstance(source, str) and source else "ragas",
+                source=resolved_source
+                if isinstance(resolved_source, str) and resolved_source
+                else kind,
                 metadata=metadata or None,
             )
         )
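The reworked builder now accepts either a single "prompt" string or a "prompts" map per metric snapshot. A minimal sketch of the two shapes, with invented snapshot values (only the keys "prompt", "prompts", and "source" come from the diff above):

    # Illustrative input only; values are made up, keys follow the diff above.
    snapshots = {
        "faithfulness": {
            "prompt": "Judge whether the answer is grounded in the context.",
            "source": "ragas",
        },
        "summary_accuracy": {
            # A "prompts" map fans out into one PromptInput per key,
            # named f"{kind}.{metric_name}.{prompt_key}".
            "prompts": {
                "system": "You are a strict summary grader.",
                "user": "Score the summary from 0 to 1.",
            },
        },
    }

    inputs = build_prompt_inputs_from_snapshots(snapshots)
    # With the default kind="ragas", the expected names are:
    #   ragas.faithfulness
    #   ragas.summary_accuracy.system
    #   ragas.summary_accuracy.user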
@@ -0,0 +1,199 @@
+"""Regression gate service for CLI automation."""
+
+from __future__ import annotations
+
+import logging
+import time
+from dataclasses import dataclass
+from datetime import UTC, datetime
+
+from evalvault.domain.entities.analysis import ComparisonResult, EffectSizeLevel
+from evalvault.ports.outbound.analysis_port import AnalysisPort
+from evalvault.ports.outbound.storage_port import StoragePort
+
+logger = logging.getLogger(__name__)
+
+TestType = str
+
+
+@dataclass(frozen=True)
+class RegressionMetricResult:
+    metric: str
+
+    baseline_score: float
+    candidate_score: float
+    diff: float
+    diff_percent: float
+    p_value: float
+    effect_size: float
+    effect_level: EffectSizeLevel
+    is_significant: bool
+    regression: bool
+
+    @classmethod
+    def from_comparison(
+        cls,
+        comparison: ComparisonResult,
+        *,
+        fail_on_regression: float,
+    ) -> RegressionMetricResult:
+        regression = comparison.diff < -fail_on_regression
+        return cls(
+            metric=comparison.metric,
+            baseline_score=comparison.mean_a,
+            candidate_score=comparison.mean_b,
+            diff=comparison.diff,
+            diff_percent=comparison.diff_percent,
+            p_value=comparison.p_value,
+            effect_size=comparison.effect_size,
+            effect_level=comparison.effect_level,
+            is_significant=comparison.is_significant,
+            regression=regression,
+        )
+
+    def to_dict(self) -> dict[str, float | str | bool]:
+        return {
+            "metric": self.metric,
+            "baseline_score": self.baseline_score,
+            "candidate_score": self.candidate_score,
+            "diff": self.diff,
+            "diff_percent": self.diff_percent,
+            "p_value": self.p_value,
+            "effect_size": self.effect_size,
+            "effect_level": self.effect_level.value,
+            "is_significant": self.is_significant,
+            "regression": self.regression,
+        }
+
+
+@dataclass(frozen=True)
+class RegressionGateReport:
+    candidate_run_id: str
+    baseline_run_id: str
+    results: list[RegressionMetricResult]
+    regression_detected: bool
+    fail_on_regression: float
+    test_type: TestType
+    metrics: list[str]
+    started_at: datetime
+    finished_at: datetime
+    duration_ms: int
+    parallel: bool
+    concurrency: int | None
+
+    @property
+    def status(self) -> str:
+        return "failed" if self.regression_detected else "passed"
+
+    def to_dict(self) -> dict[str, object]:
+        return {
+            "candidate_run_id": self.candidate_run_id,
+            "baseline_run_id": self.baseline_run_id,
+            "status": self.status,
+            "regression_detected": self.regression_detected,
+            "fail_on_regression": self.fail_on_regression,
+            "test": self.test_type,
+            "metrics": list(self.metrics),
+            "results": [result.to_dict() for result in self.results],
+            "parallel": self.parallel,
+            "concurrency": self.concurrency,
+        }
+
+
+class RegressionGateService:
+    def __init__(self, storage: StoragePort, analysis_adapter: AnalysisPort) -> None:
+        self._storage = storage
+        self._analysis = analysis_adapter
+
+    def run_gate(
+        self,
+        candidate_run_id: str,
+        baseline_run_id: str,
+        *,
+        metrics: list[str] | None = None,
+        test_type: TestType = "t-test",
+        fail_on_regression: float = 0.05,
+        parallel: bool = True,
+        concurrency: int | None = None,
+    ) -> RegressionGateReport:
+        start_time = time.monotonic()
+        started_at = datetime.now(UTC)
+        logger.info(
+            "Regression gate start: candidate=%s baseline=%s",
+            candidate_run_id,
+            baseline_run_id,
+        )
+        try:
+            candidate = self._storage.get_run(candidate_run_id)
+            baseline = self._storage.get_run(baseline_run_id)
+
+            requested_metrics = [m for m in (metrics or []) if m]
+            if requested_metrics:
+                metric_list = requested_metrics
+            else:
+                metric_list = sorted(
+                    set(candidate.metrics_evaluated) & set(baseline.metrics_evaluated)
+                )
+
+            if not metric_list:
+                raise ValueError("No shared metrics available for regression gate.")
+
+            comparisons = self._analysis.compare_runs(
+                baseline,
+                candidate,
+                metrics=metric_list,
+                test_type=test_type,
+            )
+            if not comparisons:
+                raise ValueError("No comparable metrics found for regression gate.")
+
+            comparison_map = {result.metric: result for result in comparisons}
+            missing = [metric for metric in metric_list if metric not in comparison_map]
+            if missing:
+                raise ValueError("Missing comparison results for metrics: " + ", ".join(missing))
+
+            ordered = [comparison_map[metric] for metric in metric_list]
+            results = [
+                RegressionMetricResult.from_comparison(
+                    comparison,
+                    fail_on_regression=fail_on_regression,
+                )
+                for comparison in ordered
+            ]
+            regression_detected = any(result.regression for result in results)
+            finished_at = datetime.now(UTC)
+            duration_ms = int((time.monotonic() - start_time) * 1000)
+            logger.info(
+                "Regression gate complete: candidate=%s baseline=%s regressions=%s",
+                candidate_run_id,
+                baseline_run_id,
+                regression_detected,
+            )
+            return RegressionGateReport(
+                candidate_run_id=candidate_run_id,
+                baseline_run_id=baseline_run_id,
+                results=results,
+                regression_detected=regression_detected,
+                fail_on_regression=fail_on_regression,
+                test_type=test_type,
+                metrics=metric_list,
+                started_at=started_at,
+                finished_at=finished_at,
+                duration_ms=duration_ms,
+                parallel=parallel,
+                concurrency=concurrency,
+            )
+        except Exception:
+            logger.exception(
+                "Regression gate failed: candidate=%s baseline=%s",
+                candidate_run_id,
+                baseline_run_id,
+            )
+            raise
+
+
+__all__ = [
+    "RegressionGateReport",
+    "RegressionGateService",
+    "RegressionMetricResult",
+]
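A rough usage sketch for the new gate; `storage` and `analysis` stand in for concrete StoragePort and AnalysisPort implementations and are not names defined in this diff:

    from evalvault.domain.services.regression_gate_service import RegressionGateService

    service = RegressionGateService(storage, analysis)  # placeholder port implementations
    report = service.run_gate(
        "candidate-run-id",
        "baseline-run-id",
        metrics=["faithfulness"],      # omit to use the metrics shared by both runs
        fail_on_regression=0.05,       # flag any metric whose mean drops by more than 0.05
    )
    print(report.status)               # "passed" or "failed"
    if report.regression_detected:
        raise SystemExit(1)            # e.g. fail a CI job on a detected regression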
@@ -0,0 +1,159 @@
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass, field
+from datetime import UTC, datetime
+
+from evalvault.domain.entities.analysis import ComparisonResult
+from evalvault.domain.entities.analysis_pipeline import AnalysisIntent, PipelineResult
+from evalvault.ports.outbound.analysis_port import AnalysisPort
+from evalvault.ports.outbound.comparison_pipeline_port import ComparisonPipelinePort
+from evalvault.ports.outbound.storage_port import StoragePort
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True)
+class RunComparisonRequest:
+    run_id_a: str
+    run_id_b: str
+    metrics: list[str] | None = None
+    test_type: str = "t-test"
+    parallel: bool = False
+    concurrency: int | None = None
+    report_type: str = "comparison"
+    use_llm_report: bool = True
+
+
+@dataclass
+class RunComparisonOutcome:
+    run_ids: tuple[str, str]
+    comparisons: list[ComparisonResult]
+    pipeline_result: PipelineResult
+    report_text: str
+    status: str
+    started_at: datetime
+    finished_at: datetime
+    duration_ms: int
+    degraded_reasons: list[str] = field(default_factory=list)
+
+    @property
+    def is_degraded(self) -> bool:
+        return self.status != "ok"
+
+
+class RunComparisonError(Exception):
+    def __init__(self, message: str, *, exit_code: int = 1):
+        super().__init__(message)
+        self.exit_code = exit_code
+
+
+class RunComparisonService:
+    def __init__(
+        self,
+        *,
+        storage: StoragePort,
+        analysis_port: AnalysisPort,
+        pipeline_port: ComparisonPipelinePort,
+    ) -> None:
+        self._storage = storage
+        self._analysis = analysis_port
+        self._pipeline = pipeline_port
+
+    def compare_runs(self, request: RunComparisonRequest) -> RunComparisonOutcome:
+        started_at = datetime.now(UTC)
+        logger.info("Starting run comparison: %s vs %s", request.run_id_a, request.run_id_b)
+
+        try:
+            run_a = self._storage.get_run(request.run_id_a)
+            run_b = self._storage.get_run(request.run_id_b)
+        except KeyError as exc:
+            logger.error("Run not found during comparison: %s", exc)
+            raise RunComparisonError("Run을 찾을 수 없습니다.", exit_code=1) from exc
+
+        comparisons = self._analysis.compare_runs(
+            run_a,
+            run_b,
+            metrics=request.metrics,
+            test_type=request.test_type,
+        )
+        if not comparisons:
+            logger.warning("No common metrics to compare for %s vs %s", run_a.run_id, run_b.run_id)
+            raise RunComparisonError("공통 메트릭이 없습니다.", exit_code=1)
+
+        pipeline_error: Exception | None = None
+        try:
+            pipeline_result = self._pipeline.run_comparison(
+                run_ids=[run_a.run_id, run_b.run_id],
+                compare_metrics=request.metrics,
+                test_type=request.test_type,
+                parallel=request.parallel,
+                concurrency=request.concurrency,
+                report_type=request.report_type,
+                use_llm_report=request.use_llm_report,
+            )
+        except Exception as exc:
+            pipeline_error = exc
+            logger.exception("Comparison pipeline failed: %s", exc)
+            pipeline_result = PipelineResult(
+                pipeline_id=f"compare-{run_a.run_id[:8]}-{run_b.run_id[:8]}",
+                intent=AnalysisIntent.GENERATE_COMPARISON,
+            )
+            pipeline_result.mark_complete()
+
+        report_text, report_found = self._extract_markdown_report(pipeline_result)
+        degraded_reasons: list[str] = []
+        if pipeline_error is not None:
+            degraded_reasons.append("pipeline_error")
+        if not report_found:
+            degraded_reasons.append("report_missing")
+        if not pipeline_result.all_succeeded:
+            degraded_reasons.append("pipeline_failed")
+
+        status = "degraded" if degraded_reasons else "ok"
+        if status == "degraded":
+            logger.warning("Comparison report degraded: %s", degraded_reasons)
+        finished_at = datetime.now(UTC)
+        duration_ms = int((finished_at - started_at).total_seconds() * 1000)
+
+        logger.info("Completed run comparison: status=%s duration_ms=%s", status, duration_ms)
+
+        return RunComparisonOutcome(
+            run_ids=(run_a.run_id, run_b.run_id),
+            comparisons=comparisons,
+            pipeline_result=pipeline_result,
+            report_text=report_text,
+            status=status,
+            started_at=started_at,
+            finished_at=finished_at,
+            duration_ms=duration_ms,
+            degraded_reasons=degraded_reasons,
+        )
+
+    @staticmethod
+    def _extract_markdown_report(pipeline_result: PipelineResult) -> tuple[str, bool]:
+        final_output = pipeline_result.final_output
+        if isinstance(final_output, dict):
+            report = RunComparisonService._find_report(final_output)
+            if report:
+                return report, True
+        return "# 비교 분석 보고서\n\n보고서 본문을 찾지 못했습니다.\n", False
+
+    @staticmethod
+    def _find_report(output: dict) -> str | None:
+        if "report" in output and isinstance(output["report"], str):
+            return output["report"]
+        for value in output.values():
+            if isinstance(value, dict):
+                nested = RunComparisonService._find_report(value)
+                if nested:
+                    return nested
+        return None
+
+
+__all__ = [
+    "RunComparisonService",
+    "RunComparisonRequest",
+    "RunComparisonOutcome",
+    "RunComparisonError",
+]
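A usage sketch under the same caveat; `storage`, `analysis`, and `pipeline` are placeholders for StoragePort, AnalysisPort, and ComparisonPipelinePort implementations:

    from evalvault.domain.services.run_comparison_service import (
        RunComparisonRequest,
        RunComparisonService,
    )

    service = RunComparisonService(
        storage=storage,
        analysis_port=analysis,
        pipeline_port=pipeline,
    )
    outcome = service.compare_runs(
        RunComparisonRequest(run_id_a="run-a", run_id_b="run-b", metrics=["faithfulness"])
    )
    print(outcome.status)          # "ok", or "degraded" with degraded_reasons populated
    print(outcome.report_text)     # markdown report extracted from the pipeline output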
@@ -255,7 +255,12 @@ def _normalize_scores(value: Any) -> list[float]:
     if value is None:
         return []
     if isinstance(value, list | tuple | set):
-        return [_coerce_float(item) for item in value if _coerce_float(item) is not None]
+        scores: list[float] = []
+        for item in value:
+            score = _coerce_float(item)
+            if score is not None:
+                scores.append(score)
+        return scores
     coerced = _coerce_float(value)
     return [coerced] if coerced is not None else []
 
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import math
 from collections.abc import Iterable, Mapping, Sequence
 from typing import Any
 
@@ -83,8 +84,16 @@ class StageMetricService:
         relevance_map: Mapping[str, set[str]],
     ) -> list[StageMetric]:
         metrics: list[StageMetric] = []
-        doc_ids = _to_str_list(event.attributes.get("doc_ids"))
-        scores = _to_float_list(event.attributes.get("scores"))
+        raw_doc_ids = event.attributes.get("doc_ids")
+        raw_scores = event.attributes.get("scores")
+        unordered_doc_ids = isinstance(raw_doc_ids, set | frozenset)
+        unordered_scores = isinstance(raw_scores, set | frozenset)
+        doc_ids = _to_str_list(raw_doc_ids)
+        scores = _to_float_list(raw_scores)
+        order_reconstructed = None
+        if unordered_doc_ids:
+            doc_ids = sorted(doc_ids)
+            order_reconstructed = "doc_id_asc"
 
         metrics.append(
             StageMetric(
@@ -92,19 +101,36 @@ class StageMetricService:
                 stage_id=event.stage_id,
                 metric_name="retrieval.result_count",
                 score=float(len(doc_ids)),
-                evidence={"count": len(doc_ids)},
+                evidence=_with_order_evidence({"count": len(doc_ids)}, unordered_doc_ids, None),
             )
         )
+        if unordered_doc_ids or unordered_scores:
+            metrics.append(
+                StageMetric(
+                    run_id=event.run_id,
+                    stage_id=event.stage_id,
+                    metric_name="retrieval.ordering_warning",
+                    score=1.0,
+                    evidence=_with_order_evidence(
+                        {
+                            "doc_ids_unordered": unordered_doc_ids,
+                            "scores_unordered": unordered_scores,
+                        },
+                        True,
+                        order_reconstructed,
+                    ),
+                )
+            )
 
         if scores:
-            avg_score = sum(scores) / len(scores)
+            avg_score = _safe_avg(scores)
             metrics.append(
                 StageMetric(
                     run_id=event.run_id,
                     stage_id=event.stage_id,
                     metric_name="retrieval.avg_score",
                     score=avg_score,
-                    evidence={"count": len(scores)},
+                    evidence=_with_order_evidence({"count": len(scores)}, unordered_scores, None),
                 )
             )
         if len(scores) > 1:
@@ -115,14 +141,22 @@ class StageMetricService:
                     stage_id=event.stage_id,
                     metric_name="retrieval.score_gap",
                     score=score_gap,
-                    evidence={"max": max(scores), "min": min(scores)},
+                    evidence=_with_order_evidence(
+                        {"max": max(scores), "min": min(scores)}, unordered_scores, None
+                    ),
                 )
             )
 
         relevant_docs = _get_relevant_docs(event, relevance_map)
         if doc_ids and relevant_docs:
             top_k = _coerce_int(event.attributes.get("top_k"), default=len(doc_ids))
-            k = min(top_k, len(doc_ids)) if top_k > 0 else len(doc_ids)
+            k = len(doc_ids) if top_k is None or top_k <= 0 else min(top_k, len(doc_ids))
+            if unordered_scores and scores:
+                score_pairs = list(zip(doc_ids, scores, strict=False))
+                score_pairs.sort(key=lambda item: (-item[1], item[0]))
+                doc_ids = [doc_id for doc_id, _score in score_pairs]
+                scores = [score for _doc_id, score in score_pairs]
+                order_reconstructed = "score_desc_then_id"
             retrieved_top_k = doc_ids[:k]
             relevant_found = len(set(retrieved_top_k) & relevant_docs)
@@ -135,11 +169,15 @@ class StageMetricService:
                     stage_id=event.stage_id,
                     metric_name="retrieval.precision_at_k",
                     score=precision,
-                    evidence={
-                        "k": k,
-                        "relevant_found": relevant_found,
-                        "retrieved_count": k,
-                    },
+                    evidence=_with_order_evidence(
+                        {
+                            "k": k,
+                            "relevant_found": relevant_found,
+                            "retrieved_count": k,
+                        },
+                        unordered_doc_ids or unordered_scores,
+                        order_reconstructed,
+                    ),
                 )
             )
             metrics.append(
@@ -148,11 +186,15 @@ class StageMetricService:
                     stage_id=event.stage_id,
                     metric_name="retrieval.recall_at_k",
                     score=recall,
-                    evidence={
-                        "k": k,
-                        "relevant_found": relevant_found,
-                        "relevant_total": len(relevant_docs),
-                    },
+                    evidence=_with_order_evidence(
+                        {
+                            "k": k,
+                            "relevant_found": relevant_found,
+                            "relevant_total": len(relevant_docs),
+                        },
+                        unordered_doc_ids or unordered_scores,
+                        order_reconstructed,
+                    ),
                 )
             )
 
@@ -180,7 +222,7 @@ class StageMetricService:
 
         scores = _to_float_list(event.attributes.get("scores"))
         if scores:
-            avg_score = sum(scores) / len(scores)
+            avg_score = _safe_avg(scores)
             metrics.append(
                 StageMetric(
                     run_id=event.run_id,
@@ -358,6 +400,8 @@ def _to_str_list(value: Any) -> list[str]:
         return []
     if isinstance(value, str):
         return [value]
+    if isinstance(value, set | frozenset):
+        return [str(item) for item in value if not isinstance(item, bytes | bytearray)]
     if isinstance(value, Sequence):
         return [str(item) for item in value if not isinstance(item, bytes | bytearray)]
     return [str(value)]
@@ -370,6 +414,8 @@ def _to_str_set(value: Any) -> set[str]:
 def _to_float_list(value: Any) -> list[float]:
     if value is None:
         return []
+    if isinstance(value, set | frozenset):
+        return [float(item) for item in value]
     if isinstance(value, Sequence) and not isinstance(value, str | bytes | bytearray):
         return [float(item) for item in value]
     return [float(value)]
@@ -390,6 +436,25 @@ def _coerce_float(value: Any) -> float | None:
     return None
 
 
+def _safe_avg(values: Sequence[float]) -> float:
+    if not values:
+        return 0.0
+    total = math.fsum(values)
+    return total / len(values)
+
+
+def _with_order_evidence(
+    evidence: dict[str, Any], unordered: bool, order_reconstructed: str | None
+) -> dict[str, Any]:
+    if not unordered:
+        return evidence
+    enriched = dict(evidence)
+    enriched["unordered_input"] = True
+    if order_reconstructed:
+        enriched["order_reconstructed"] = order_reconstructed
+    return enriched
+
+
 def _extract_violation_count(attributes: Mapping[str, Any]) -> int | None:
     violations = attributes.get("violations")
     if isinstance(violations, list | tuple | set):
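For illustration, how the new helpers behave when retrieval attributes arrive as sets (they are module-private, so this is a sketch rather than a public API):

    scores = _to_float_list({0.2, 0.5, 0.9})   # set input -> list in arbitrary order
    _safe_avg(scores)                           # 0.5333..., independent of order
    _safe_avg([])                               # 0.0 instead of a ZeroDivisionError

    _with_order_evidence({"count": 3}, True, "score_desc_then_id")
    # -> {"count": 3, "unordered_input": True, "order_reconstructed": "score_desc_then_id"}
    _with_order_evidence({"count": 3}, False, None)
    # -> {"count": 3} unchanged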
@@ -8,6 +8,10 @@ SUMMARY_RECOMMENDED_THRESHOLDS = {
     "summary_faithfulness": 0.90,
     "summary_score": 0.85,
     "entity_preservation": 0.90,
+    "summary_accuracy": 0.90,
+    "summary_risk_coverage": 0.90,
+    "summary_non_definitive": 0.80,
+    "summary_needs_followup": 0.80,
 }
 QA_RECOMMENDED_THRESHOLDS = {
     "faithfulness": 0.70,