evalvault-1.64.0-py3-none-any.whl → evalvault-1.66.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. evalvault/adapters/inbound/api/adapter.py +14 -0
  2. evalvault/adapters/inbound/api/main.py +14 -4
  3. evalvault/adapters/inbound/api/routers/chat.py +543 -0
  4. evalvault/adapters/inbound/cli/commands/__init__.py +14 -7
  5. evalvault/adapters/inbound/cli/commands/artifacts.py +107 -0
  6. evalvault/adapters/inbound/cli/commands/calibrate_judge.py +283 -0
  7. evalvault/adapters/inbound/cli/commands/compare.py +290 -0
  8. evalvault/adapters/inbound/cli/commands/history.py +13 -85
  9. evalvault/adapters/inbound/cli/commands/ops.py +110 -0
  10. evalvault/adapters/inbound/cli/commands/profile_difficulty.py +160 -0
  11. evalvault/adapters/inbound/cli/commands/regress.py +251 -0
  12. evalvault/adapters/inbound/cli/commands/run.py +14 -0
  13. evalvault/adapters/inbound/cli/commands/run_helpers.py +21 -2
  14. evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py +49 -0
  15. evalvault/adapters/outbound/artifact_fs.py +16 -0
  16. evalvault/adapters/outbound/filesystem/__init__.py +3 -0
  17. evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py +50 -0
  18. evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py +13 -0
  19. evalvault/adapters/outbound/judge_calibration_adapter.py +36 -0
  20. evalvault/adapters/outbound/judge_calibration_reporter.py +57 -0
  21. evalvault/adapters/outbound/report/llm_report_generator.py +13 -1
  22. evalvault/adapters/outbound/storage/base_sql.py +41 -1
  23. evalvault/adapters/outbound/tracker/langfuse_adapter.py +13 -7
  24. evalvault/adapters/outbound/tracker/mlflow_adapter.py +5 -0
  25. evalvault/adapters/outbound/tracker/phoenix_adapter.py +68 -14
  26. evalvault/config/settings.py +21 -0
  27. evalvault/domain/entities/__init__.py +10 -0
  28. evalvault/domain/entities/judge_calibration.py +50 -0
  29. evalvault/domain/entities/prompt.py +1 -1
  30. evalvault/domain/entities/stage.py +11 -3
  31. evalvault/domain/metrics/__init__.py +8 -0
  32. evalvault/domain/metrics/registry.py +39 -3
  33. evalvault/domain/metrics/summary_accuracy.py +189 -0
  34. evalvault/domain/metrics/summary_needs_followup.py +45 -0
  35. evalvault/domain/metrics/summary_non_definitive.py +41 -0
  36. evalvault/domain/metrics/summary_risk_coverage.py +45 -0
  37. evalvault/domain/services/artifact_lint_service.py +268 -0
  38. evalvault/domain/services/benchmark_runner.py +1 -6
  39. evalvault/domain/services/custom_metric_snapshot.py +233 -0
  40. evalvault/domain/services/dataset_preprocessor.py +26 -0
  41. evalvault/domain/services/difficulty_profile_reporter.py +25 -0
  42. evalvault/domain/services/difficulty_profiling_service.py +304 -0
  43. evalvault/domain/services/evaluator.py +282 -27
  44. evalvault/domain/services/judge_calibration_service.py +495 -0
  45. evalvault/domain/services/ops_snapshot_service.py +159 -0
  46. evalvault/domain/services/prompt_registry.py +39 -10
  47. evalvault/domain/services/regression_gate_service.py +199 -0
  48. evalvault/domain/services/run_comparison_service.py +159 -0
  49. evalvault/domain/services/stage_event_builder.py +6 -1
  50. evalvault/domain/services/stage_metric_service.py +83 -18
  51. evalvault/domain/services/threshold_profiles.py +4 -0
  52. evalvault/domain/services/visual_space_service.py +79 -4
  53. evalvault/ports/outbound/__init__.py +4 -0
  54. evalvault/ports/outbound/artifact_fs_port.py +12 -0
  55. evalvault/ports/outbound/comparison_pipeline_port.py +22 -0
  56. evalvault/ports/outbound/difficulty_profile_port.py +15 -0
  57. evalvault/ports/outbound/judge_calibration_port.py +22 -0
  58. evalvault/ports/outbound/ops_snapshot_port.py +8 -0
  59. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/METADATA +25 -1
  60. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/RECORD +63 -31
  61. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/WHEEL +0 -0
  62. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/entry_points.txt +0 -0
  63. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -0,0 +1,50 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+
5
+
6
@dataclass
class JudgeCalibrationCase:
    """Record holding one test case's raw score, calibrated score and optional label."""

    # Identifier of the test case this record belongs to.
    test_case_id: str
    # Score before calibration.
    raw_score: float
    # Score after calibration.
    calibrated_score: float
    # Optional label; presumably the reference value the score is compared to — TODO confirm.
    label: float | None = None
    # Optional; presumably names where the label came from — TODO confirm.
    label_source: str | None = None
13
+
14
+
15
@dataclass
class JudgeCalibrationMetric:
    """Per-metric calibration record: method, counts, fit statistics and gate outcome."""

    # Name of the metric this record describes.
    metric: str
    # Name of the calibration method used for this metric.
    method: str
    # Number of samples / number of labels (exact counting rules are defined by the producer).
    sample_count: int
    label_count: int
    # Fit statistics; None is allowed. Presumably mean absolute error and
    # Pearson/Spearman correlation — TODO confirm against the calibration service.
    mae: float | None
    pearson: float | None
    spearman: float | None
    # Optional temperature value; semantics depend on the calibration method.
    temperature: float | None = None
    # Free-form named parameters; default_factory gives each instance its own dict.
    parameters: dict[str, float | None] = field(default_factory=dict)
    # Tri-state gate result: None means no gate outcome was recorded.
    gate_passed: bool | None = None
    # Optional warning message attached to this metric.
    warning: str | None = None
28
+
29
+
30
@dataclass
class JudgeCalibrationSummary:
    """Run-level summary of a judge calibration: inputs, totals and overall gate result."""

    # Identifier of the run the calibration refers to.
    run_id: str
    # Where the labels came from (free-form string).
    labels_source: str
    # Name of the calibration method.
    method: str
    # Names of the metrics included in the calibration.
    metrics: list[str]
    # Presumably the fraction of data held out for evaluation — TODO confirm range.
    holdout_ratio: float
    # Seed value; presumably for the holdout split — TODO confirm.
    seed: int
    # Total label and sample counts.
    total_labels: int
    total_samples: int
    # Overall gate outcome (required, unlike the per-metric tri-state flag).
    gate_passed: bool
    # Optional threshold used by the gate.
    gate_threshold: float | None = None
    # Free-form notes; default_factory gives each instance its own list.
    notes: list[str] = field(default_factory=list)
43
+
44
+
45
@dataclass
class JudgeCalibrationResult:
    """Complete calibration result: summary, per-metric records, per-case records, warnings."""

    # Run-level summary (required).
    summary: JudgeCalibrationSummary
    # Per-metric calibration records.
    metrics: list[JudgeCalibrationMetric] = field(default_factory=list)
    # Per-case records grouped under string keys; presumably keyed by metric name — TODO confirm.
    case_results: dict[str, list[JudgeCalibrationCase]] = field(default_factory=dict)
    # Free-form warning messages.
    warnings: list[str] = field(default_factory=list)
@@ -7,7 +7,7 @@ from datetime import datetime
7
7
  from typing import Any, Literal
8
8
  from uuid import uuid4
9
9
 
10
- PromptKind = Literal["system", "ragas"]
10
+ PromptKind = Literal["system", "ragas", "custom"]
11
11
 
12
12
 
13
13
  @dataclass
@@ -4,7 +4,7 @@ from __future__ import annotations
4
4
 
5
5
  from dataclasses import dataclass, field
6
6
  from datetime import datetime
7
- from typing import Any
7
+ from typing import Any, Literal, overload
8
8
  from uuid import uuid4
9
9
 
10
10
  REQUIRED_STAGE_TYPES: tuple[str, ...] = ("system_prompt", "input", "retrieval", "output")
@@ -82,8 +82,8 @@ class StageEvent:
82
82
  duration_ms=_optional_float(payload.get("duration_ms")),
83
83
  input_ref=input_ref,
84
84
  output_ref=output_ref,
85
- attributes=_ensure_dict(payload.get("attributes")),
86
- metadata=_ensure_dict(payload.get("metadata")),
85
+ attributes=_ensure_dict(payload.get("attributes"), allow_none=False),
86
+ metadata=_ensure_dict(payload.get("metadata"), allow_none=False),
87
87
  trace_id=_optional_str(payload.get("trace_id") or trace_payload.get("trace_id")),
88
88
  span_id=_optional_str(payload.get("span_id") or trace_payload.get("span_id")),
89
89
  )
@@ -187,6 +187,14 @@ def _parse_datetime(value: Any) -> datetime | None:
187
187
  raise ValueError("Invalid datetime value")
188
188
 
189
189
 
190
+ @overload
191
+ def _ensure_dict(value: None, *, allow_none: Literal[True]) -> None: ...
192
+
193
+
194
+ @overload
195
+ def _ensure_dict(value: Any, *, allow_none: Literal[False] = False) -> dict[str, Any]: ...
196
+
197
+
190
198
  def _ensure_dict(value: Any, *, allow_none: bool = False) -> dict[str, Any] | None:
191
199
  if value is None:
192
200
  return None if allow_none else {}
@@ -6,6 +6,10 @@ from evalvault.domain.metrics.entity_preservation import EntityPreservation
6
6
  from evalvault.domain.metrics.insurance import InsuranceTermAccuracy
7
7
  from evalvault.domain.metrics.no_answer import NoAnswerAccuracy, is_no_answer
8
8
  from evalvault.domain.metrics.retrieval_rank import MRR, NDCG, HitRate
9
+ from evalvault.domain.metrics.summary_accuracy import SummaryAccuracy
10
+ from evalvault.domain.metrics.summary_needs_followup import SummaryNeedsFollowup
11
+ from evalvault.domain.metrics.summary_non_definitive import SummaryNonDefinitive
12
+ from evalvault.domain.metrics.summary_risk_coverage import SummaryRiskCoverage
9
13
  from evalvault.domain.metrics.text_match import ExactMatch, F1Score
10
14
 
11
15
  __all__ = [
@@ -19,5 +23,9 @@ __all__ = [
19
23
  "MRR",
20
24
  "NDCG",
21
25
  "NoAnswerAccuracy",
26
+ "SummaryAccuracy",
27
+ "SummaryNeedsFollowup",
28
+ "SummaryNonDefinitive",
29
+ "SummaryRiskCoverage",
22
30
  "is_no_answer",
23
31
  ]
@@ -123,7 +123,7 @@ _METRIC_SPECS: tuple[MetricSpec, ...] = (
123
123
  ),
124
124
  MetricSpec(
125
125
  name="summary_score",
126
- description="Measures summary coverage and conciseness against contexts",
126
+ description="(LLM) Measures summary coverage and conciseness against contexts",
127
127
  requires_ground_truth=False,
128
128
  requires_embeddings=False,
129
129
  source="ragas",
@@ -132,7 +132,7 @@ _METRIC_SPECS: tuple[MetricSpec, ...] = (
132
132
  ),
133
133
  MetricSpec(
134
134
  name="summary_faithfulness",
135
- description="Measures whether summary statements are grounded in contexts",
135
+ description="(LLM) Measures whether summary statements are grounded in contexts",
136
136
  requires_ground_truth=False,
137
137
  requires_embeddings=False,
138
138
  source="ragas",
@@ -141,7 +141,43 @@ _METRIC_SPECS: tuple[MetricSpec, ...] = (
141
141
  ),
142
142
  MetricSpec(
143
143
  name="entity_preservation",
144
- description="Measures preservation of key insurance entities in summaries",
144
+ description="(Rule) Measures preservation of key insurance entities in summaries",
145
+ requires_ground_truth=False,
146
+ requires_embeddings=False,
147
+ source="custom",
148
+ category="summary",
149
+ signal_group="summary_fidelity",
150
+ ),
151
+ MetricSpec(
152
+ name="summary_accuracy",
153
+ description="(Rule) Measures whether summary entities are grounded in contexts",
154
+ requires_ground_truth=False,
155
+ requires_embeddings=False,
156
+ source="custom",
157
+ category="summary",
158
+ signal_group="summary_fidelity",
159
+ ),
160
+ MetricSpec(
161
+ name="summary_risk_coverage",
162
+ description="(Rule) Measures coverage of expected insurance risk tags in summaries",
163
+ requires_ground_truth=False,
164
+ requires_embeddings=False,
165
+ source="custom",
166
+ category="summary",
167
+ signal_group="summary_fidelity",
168
+ ),
169
+ MetricSpec(
170
+ name="summary_non_definitive",
171
+ description="(Rule) Measures avoidance of definitive claims in summaries",
172
+ requires_ground_truth=False,
173
+ requires_embeddings=False,
174
+ source="custom",
175
+ category="summary",
176
+ signal_group="summary_fidelity",
177
+ ),
178
+ MetricSpec(
179
+ name="summary_needs_followup",
180
+ description="(Rule) Measures follow-up guidance when required",
145
181
  requires_ground_truth=False,
146
182
  requires_embeddings=False,
147
183
  source="custom",
@@ -0,0 +1,189 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from decimal import Decimal, InvalidOperation
5
+
6
+
7
class SummaryAccuracy:
    """Measure whether summary entities are supported by contexts.

    Entities are extracted from both the summary and the joined contexts with
    rule-based patterns (percentages, currency amounts, durations, dates and a
    fixed list of Korean/English keywords). The score is the fraction of
    summary entities that also occur among the context entities.
    """

    name = "summary_accuracy"

    # Number token. The first alternative accepts properly grouped thousands
    # ("1,000,000", optionally with a decimal part) so that such amounts are
    # captured whole; without it only the tail "000,000" would be matched and
    # the amount would normalise to 0. The second alternative is the legacy
    # "digits plus at most one separator group" form.
    _NUMBER = r"(?P<number>\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:[.,]\d+)?)"

    _PERCENT_RE = re.compile(_NUMBER + r"\s*(?P<unit>%|퍼센트|percent)", re.I)
    # ASCII units carry a trailing word boundary so that "5 wonderful" is not
    # read as "5 won" and "3 daylight" is not read as "3 day".
    _CURRENCY_RE = re.compile(
        _NUMBER + r"\s*(?P<unit>원|만원|억원|달러|(?:usd|krw|won)\b)",
        re.I,
    )
    _CURRENCY_PREFIX_RE = re.compile(r"(?P<unit>[$₩])\s*" + _NUMBER)
    _DURATION_RE = re.compile(
        _NUMBER + r"\s*(?P<unit>년|개월|월|일|(?:years?|months?|days?)\b)",
        re.I,
    )
    _DATE_RE = re.compile(r"\b\d{4}[./-]\d{1,2}[./-]\d{1,2}\b")

    _CURRENCY_MULTIPLIERS = {"만원": Decimal("10000"), "억원": Decimal("100000000")}
    _KRW_UNITS = {"원", "krw", "won", "₩", "만원", "억원"}
    _USD_UNITS = {"달러", "usd", "$"}
    _DURATION_UNITS = {
        "년": "year",
        "year": "year",
        "years": "year",
        "개월": "month",
        "월": "month",
        "month": "month",
        "months": "month",
        "일": "day",
        "day": "day",
        "days": "day",
    }

    _KEYWORDS_KO = (
        "면책",
        "제외",
        "단서",
        "다만",
        "조건",
        "자기부담",
        "한도",
        "감액",
    )
    _KEYWORDS_EN = (
        "exclusion",
        "excluded",
        "exception",
        "except",
        "condition",
        "deductible",
        "limit",
        "cap",
        "waiting period",
        "co-pay",
        "copay",
        "co-insurance",
        "coinsurance",
    )
    # English keywords are matched as whole words (with an optional s/es/ed
    # suffix) so that "cap" does not fire on "capital" or "capacity".
    _KEYWORD_EN_PATTERNS = tuple(
        (keyword, re.compile(rf"\b{re.escape(keyword)}(?:s|es|ed)?\b"))
        for keyword in _KEYWORDS_EN
    )

    def score(self, answer: str, contexts: list[str]) -> float:
        """Return the share of summary entities that also appear in the contexts.

        Args:
            answer: Summary text; ``None`` or empty is treated as an empty string.
            contexts: Context passages; falsy items are skipped.

        Returns:
            ``0.0`` when there are no contexts or no context entities, ``0.5``
            when the summary yields no entities but the contexts do, otherwise
            ``len(supported) / len(summary_entities)``.
        """
        if not contexts:
            return 0.0

        context_text = " ".join(ctx for ctx in contexts if ctx)
        context_entities = self._extract_entities(context_text)
        summary_entities = self._extract_entities(answer or "")

        if not summary_entities:
            return 0.5 if context_entities else 0.0
        if not context_entities:
            return 0.0

        supported = summary_entities & context_entities
        return len(supported) / len(summary_entities)

    def _extract_entities(self, text: str) -> set[str]:
        """Return the union of numeric and keyword entities found in ``text``."""
        return self._extract_numeric_entities(text) | self._extract_keyword_entities(text)

    def _extract_numeric_entities(self, text: str) -> set[str]:
        """Extract normalised percent, currency, duration and date entities."""
        entities: set[str] = set()

        for match in self._PERCENT_RE.finditer(text):
            number = self._normalize_number(match.group("number"))
            if number:
                entities.add(f"percent:{number}")

        # Suffix-style ("100원") and prefix-style ("$100") amounts share the
        # same normalisation path.
        for pattern in (self._CURRENCY_RE, self._CURRENCY_PREFIX_RE):
            for match in pattern.finditer(text):
                number = self._normalize_number(match.group("number"))
                normalized = self._normalize_currency(number, match.group("unit"))
                if normalized:
                    entities.add(f"currency:{normalized}")

        for match in self._DURATION_RE.finditer(text):
            number = self._normalize_number(match.group("number"))
            normalized = self._normalize_duration(number, match.group("unit"))
            if normalized:
                entities.add(f"duration:{normalized}")

        for match in self._DATE_RE.finditer(text):
            entities.add(f"date:{self._normalize_date(match.group(0))}")

        return entities

    def _extract_keyword_entities(self, text: str) -> set[str]:
        """Extract ``kw:<keyword>`` entities for the fixed Korean/English keyword lists."""
        # Korean keywords stay substring matches: particles attach directly to
        # the word, so word boundaries are not meaningful there.
        entities = {f"kw:{keyword}" for keyword in self._KEYWORDS_KO if keyword in text}

        lowered = text.lower()
        entities.update(
            f"kw:{keyword}"
            for keyword, pattern in self._KEYWORD_EN_PATTERNS
            if pattern.search(lowered)
        )
        return entities

    def _normalize_currency(self, number: str | None, unit: str) -> str | None:
        """Return ``"<currency>:<amount>"`` or ``None`` if ``number`` is unusable.

        "만원"/"억원" amounts are multiplied out so that "100만원" and
        "1,000,000원" normalise to the same entity.
        """
        if number is None:
            return None
        try:
            value = Decimal(number)
        except InvalidOperation:
            return None

        unit_key = unit.lower()
        multiplier = self._CURRENCY_MULTIPLIERS.get(unit_key)
        if multiplier:
            value *= multiplier

        if unit_key in self._KRW_UNITS:
            currency = "krw"
        elif unit_key in self._USD_UNITS:
            currency = "usd"
        else:
            currency = unit_key

        return f"{currency}:{self._format_decimal(value)}"

    def _normalize_duration(self, number: str | None, unit: str) -> str | None:
        """Return ``"<amount><base unit>"`` (e.g. ``"3month"``) or ``None``."""
        if number is None:
            return None
        try:
            value = Decimal(number)
        except InvalidOperation:
            return None
        unit_key = unit.lower()
        base_unit = self._DURATION_UNITS.get(unit_key, unit_key)
        return f"{self._format_decimal(value)}{base_unit}"

    @staticmethod
    def _normalize_date(raw: str) -> str:
        """Return the date as ``YYYYMMDD`` with zero-padded month and day.

        Padding keeps "2024.1.5" equal to "2024-01-05" and stops "2024.1.15"
        from colliding with "2024.11.5". Input that is not three numeric parts
        falls back to simply removing the separators.
        """
        parts = re.split(r"[./-]", raw)
        if len(parts) == 3 and all(part.isdigit() for part in parts):
            year, month, day = parts
            return f"{year}{int(month):02d}{int(day):02d}"
        return re.sub(r"[./-]", "", raw)

    @staticmethod
    def _normalize_number(raw: str | None) -> str | None:
        """Strip commas and return the canonical decimal string, or ``None``."""
        if raw is None:
            return None
        cleaned = raw.replace(",", "").strip()
        if not cleaned:
            return None
        try:
            value = Decimal(cleaned)
        except InvalidOperation:
            return None
        return SummaryAccuracy._format_decimal(value)

    @staticmethod
    def _format_decimal(value: Decimal) -> str:
        """Format ``value`` in plain notation without trailing zeros."""
        integral = value.to_integral_value()
        if value == integral:
            # "f" formatting avoids exponent notation such as "1E+8".
            return format(integral, "f")
        return format(value.normalize(), "f").rstrip("0").rstrip(".")
@@ -0,0 +1,45 @@
1
+ from __future__ import annotations
2
+
3
+
4
class SummaryNeedsFollowup:
    """Check if follow-up guidance appears when required.

    A summary scores 1.0 when the presence of follow-up wording agrees with
    whether the ``needs_followup`` tag is listed in ``metadata["summary_tags"]``,
    and 0.0 otherwise.
    """

    name = "summary_needs_followup"

    _FOLLOWUP_KEYWORDS = [
        "확인 필요",
        "추가 확인",
        "담당자 확인",
        "재문의",
        "추가 문의",
        "서류 확인",
        "follow up",
        "follow-up",
    ]

    def score(self, answer: str, contexts: list[str], metadata: dict | None = None) -> float:
        """Return 1.0 when follow-up wording matches the expectation, else 0.0.

        Args:
            answer: Summary text; ``None`` or empty is treated as an empty string.
            contexts: Unused by this metric; accepted for a uniform signature.
            metadata: Optional mapping; only ``"summary_tags"`` is read.
        """
        has_followup = self._has_followup(answer or "")
        expected = self._expects_followup(metadata)
        return 1.0 if has_followup == expected else 0.0

    def _expects_followup(self, metadata: dict | None) -> bool:
        """Return True when ``summary_tags`` contains ``needs_followup``.

        Tags may be a single value or a list/tuple/set of values; each is
        stringified, stripped and lower-cased before comparison. Tuples and
        sets are iterated like lists rather than being stringified as a whole,
        which would hide the tag.
        """
        if not metadata:
            return False
        raw = metadata.get("summary_tags")
        if not raw:
            return False
        items = raw if isinstance(raw, (list, tuple, set, frozenset)) else [raw]
        return any(str(item).strip().lower() == "needs_followup" for item in items)

    def _has_followup(self, text: str) -> bool:
        """Return True when any follow-up keyword occurs in ``text`` (case-insensitive)."""
        lowered = text.lower()
        return any(keyword.lower() in lowered for keyword in self._FOLLOWUP_KEYWORDS)
@@ -0,0 +1,41 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+
5
+
6
class SummaryNonDefinitive:
    """Penalize definitive statements in summaries.

    The score is 0.0 when any definitive pattern is found and 1.0 otherwise.
    """

    name = "summary_non_definitive"

    # "100%" must not be the tail of a larger number such as "1100%" or "1,100%".
    _ABSOLUTE_PERCENT = r"(?<!\d)(?<!\d[.,])100%"

    _DEFINITIVE_PATTERNS_KO = [
        r"무조건",
        r"반드시",
        _ABSOLUTE_PERCENT,
        r"전액\s*지급",
        r"확실히",
        r"분명히",
        r"절대",
        r"항상",
    ]
    # English words are anchored with word boundaries so that "indefinitely"
    # and "uncertainly" (which mean the opposite) are not flagged.
    _DEFINITIVE_PATTERNS_EN = [
        r"\balways\b",
        r"\bguaranteed\b",
        r"\bdefinitely\b",
        r"\bcertainly\b",
        r"\babsolutely\b",
        _ABSOLUTE_PERCENT,
    ]

    def score(self, answer: str, contexts: list[str]) -> float:
        """Return 0.0 if ``answer`` contains a definitive pattern, else 1.0.

        Args:
            answer: Summary text; ``None`` or empty is treated as an empty string.
            contexts: Unused by this metric; accepted for a uniform signature.
        """
        return 0.0 if self._has_definitive_pattern(answer or "") else 1.0

    def _has_definitive_pattern(self, text: str) -> bool:
        """Return True when any Korean or English definitive pattern matches ``text``."""
        patterns = (*self._DEFINITIVE_PATTERNS_KO, *self._DEFINITIVE_PATTERNS_EN)
        # IGNORECASE replaces the previous lower-casing of the text for the
        # English patterns; the Korean patterns are unaffected by case.
        return any(re.search(pattern, text, re.IGNORECASE) for pattern in patterns)
@@ -0,0 +1,45 @@
1
+ from __future__ import annotations
2
+
3
+
4
class SummaryRiskCoverage:
    """Measure coverage of expected insurance risk tags in summary.

    Expected tags are read from ``metadata["summary_tags"]``; a tag counts as
    covered when any of its keywords occurs in the summary. The score is the
    fraction of distinct expected tags that are covered.
    """

    name = "summary_risk_coverage"

    _TAG_KEYWORDS = {
        "exclusion": ["면책", "보장 제외", "지급 불가", "exclusion"],
        "deductible": ["자기부담", "본인부담금", "deductible", "copay"],
        "limit": ["한도", "상한", "최대", "limit", "cap"],
        "waiting_period": ["면책기간", "대기기간", "waiting period"],
        "condition": ["조건", "단서", "다만", "condition"],
        "documents_required": ["서류", "진단서", "영수증", "documents"],
        # "follow-up" is listed next to "follow up" so the hyphenated spelling
        # is recognised as well.
        "needs_followup": [
            "확인 필요",
            "추가 확인",
            "담당자 확인",
            "재문의",
            "follow up",
            "follow-up",
        ],
    }

    def score(self, answer: str, contexts: list[str], metadata: dict | None = None) -> float:
        """Return the fraction of expected tags whose keywords appear in ``answer``.

        Args:
            answer: Summary text; ``None`` or empty is treated as an empty string.
            contexts: Unused by this metric; accepted for a uniform signature.
            metadata: Optional mapping; only ``"summary_tags"`` is read.

        Returns:
            ``1.0`` when no tags are expected, otherwise ``covered / expected``.
        """
        expected_tags = self._extract_expected_tags(metadata)
        if not expected_tags:
            return 1.0

        text = answer or ""
        covered = sum(1 for tag in expected_tags if self._has_tag_keyword(text, tag))
        return covered / len(expected_tags)

    def _extract_expected_tags(self, metadata: dict | None) -> list[str]:
        """Return the distinct, normalised tags from ``metadata["summary_tags"]``.

        Accepts a single value or a list/tuple/set of values. Each value is
        stringified, stripped and lower-cased; blanks are dropped and
        duplicates removed so a repeated tag is not counted twice.
        """
        if not metadata:
            return []
        raw = metadata.get("summary_tags")
        if not raw:
            return []
        items = raw if isinstance(raw, (list, tuple, set, frozenset)) else [raw]
        normalized = (str(item).strip().lower() for item in items)
        # dict.fromkeys de-duplicates while keeping first-seen order.
        return list(dict.fromkeys(tag for tag in normalized if tag))

    def _has_tag_keyword(self, text: str, tag: str) -> bool:
        """Return True when any keyword registered for ``tag`` occurs in ``text``.

        ASCII keywords are matched case-insensitively as whole words (with an
        optional s/es/ed suffix) so that "cap" does not fire on "capital".
        Non-ASCII (Korean) keywords stay substring matches. Unknown tags have
        no keywords and are never covered.
        """
        import re  # local import: this module has no top-level ``re`` import

        lowered = text.lower()
        for keyword in self._TAG_KEYWORDS.get(tag, []):
            needle = keyword.lower()
            if needle.isascii():
                if re.search(rf"\b{re.escape(needle)}(?:s|es|ed)?\b", lowered):
                    return True
            elif needle in lowered:
                return True
        return False