evalvault 1.64.0__py3-none-any.whl → 1.66.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/adapter.py +14 -0
- evalvault/adapters/inbound/api/main.py +14 -4
- evalvault/adapters/inbound/api/routers/chat.py +543 -0
- evalvault/adapters/inbound/cli/commands/__init__.py +14 -7
- evalvault/adapters/inbound/cli/commands/artifacts.py +107 -0
- evalvault/adapters/inbound/cli/commands/calibrate_judge.py +283 -0
- evalvault/adapters/inbound/cli/commands/compare.py +290 -0
- evalvault/adapters/inbound/cli/commands/history.py +13 -85
- evalvault/adapters/inbound/cli/commands/ops.py +110 -0
- evalvault/adapters/inbound/cli/commands/profile_difficulty.py +160 -0
- evalvault/adapters/inbound/cli/commands/regress.py +251 -0
- evalvault/adapters/inbound/cli/commands/run.py +14 -0
- evalvault/adapters/inbound/cli/commands/run_helpers.py +21 -2
- evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py +49 -0
- evalvault/adapters/outbound/artifact_fs.py +16 -0
- evalvault/adapters/outbound/filesystem/__init__.py +3 -0
- evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py +50 -0
- evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py +13 -0
- evalvault/adapters/outbound/judge_calibration_adapter.py +36 -0
- evalvault/adapters/outbound/judge_calibration_reporter.py +57 -0
- evalvault/adapters/outbound/report/llm_report_generator.py +13 -1
- evalvault/adapters/outbound/storage/base_sql.py +41 -1
- evalvault/adapters/outbound/tracker/langfuse_adapter.py +13 -7
- evalvault/adapters/outbound/tracker/mlflow_adapter.py +5 -0
- evalvault/adapters/outbound/tracker/phoenix_adapter.py +68 -14
- evalvault/config/settings.py +21 -0
- evalvault/domain/entities/__init__.py +10 -0
- evalvault/domain/entities/judge_calibration.py +50 -0
- evalvault/domain/entities/prompt.py +1 -1
- evalvault/domain/entities/stage.py +11 -3
- evalvault/domain/metrics/__init__.py +8 -0
- evalvault/domain/metrics/registry.py +39 -3
- evalvault/domain/metrics/summary_accuracy.py +189 -0
- evalvault/domain/metrics/summary_needs_followup.py +45 -0
- evalvault/domain/metrics/summary_non_definitive.py +41 -0
- evalvault/domain/metrics/summary_risk_coverage.py +45 -0
- evalvault/domain/services/artifact_lint_service.py +268 -0
- evalvault/domain/services/benchmark_runner.py +1 -6
- evalvault/domain/services/custom_metric_snapshot.py +233 -0
- evalvault/domain/services/dataset_preprocessor.py +26 -0
- evalvault/domain/services/difficulty_profile_reporter.py +25 -0
- evalvault/domain/services/difficulty_profiling_service.py +304 -0
- evalvault/domain/services/evaluator.py +282 -27
- evalvault/domain/services/judge_calibration_service.py +495 -0
- evalvault/domain/services/ops_snapshot_service.py +159 -0
- evalvault/domain/services/prompt_registry.py +39 -10
- evalvault/domain/services/regression_gate_service.py +199 -0
- evalvault/domain/services/run_comparison_service.py +159 -0
- evalvault/domain/services/stage_event_builder.py +6 -1
- evalvault/domain/services/stage_metric_service.py +83 -18
- evalvault/domain/services/threshold_profiles.py +4 -0
- evalvault/domain/services/visual_space_service.py +79 -4
- evalvault/ports/outbound/__init__.py +4 -0
- evalvault/ports/outbound/artifact_fs_port.py +12 -0
- evalvault/ports/outbound/comparison_pipeline_port.py +22 -0
- evalvault/ports/outbound/difficulty_profile_port.py +15 -0
- evalvault/ports/outbound/judge_calibration_port.py +22 -0
- evalvault/ports/outbound/ops_snapshot_port.py +8 -0
- {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/METADATA +25 -1
- {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/RECORD +63 -31
- {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/WHEEL +0 -0
- {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
class JudgeCalibrationCase:
    """Calibration record for a single test case.

    Holds the score before and after calibration, plus an optional
    reference label and a note on where that label came from.
    """

    # Identifier of the test case this record belongs to.
    test_case_id: str
    # Score before calibration was applied.
    raw_score: float
    # Score after calibration was applied.
    calibrated_score: float
    # Reference label, when one is available for this case.
    label: float | None = field(default=None)
    # Origin of ``label`` (free-form string), when known.
    label_source: str | None = field(default=None)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class JudgeCalibrationMetric:
    """Calibration outcome for one metric.

    The three quality figures (``mae``, ``pearson``, ``spearman``) are
    required positional fields but may each be ``None``.
    """

    # Name of the metric that was calibrated.
    metric: str
    # Name of the calibration method used.
    method: str
    # Number of samples considered for this metric.
    sample_count: int
    # Number of labels considered for this metric.
    label_count: int
    # Quality figures; presumably mean absolute error and the Pearson /
    # Spearman correlations against labels -- confirm against the producer.
    mae: float | None
    pearson: float | None
    spearman: float | None
    # Optional temperature value associated with the calibration.
    temperature: float | None = field(default=None)
    # Additional named numeric parameters; a fresh dict per instance.
    parameters: dict[str, float | None] = field(default_factory=dict)
    # Gate outcome for this metric; ``None`` when not set.
    gate_passed: bool | None = field(default=None)
    # Optional warning message attached to this metric.
    warning: str | None = field(default=None)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass
class JudgeCalibrationSummary:
    """Run-level summary of a judge calibration.

    Records the configuration used (method, metrics, holdout ratio, seed),
    overall label and sample counts, and the overall gate outcome.
    """

    # Identifier of the run that was calibrated.
    run_id: str
    # Description of where the labels came from.
    labels_source: str
    # Name of the calibration method.
    method: str
    # Names of the metrics included in the calibration.
    metrics: list[str]
    # Holdout ratio used for the calibration.
    holdout_ratio: float
    # Seed value recorded for the calibration.
    seed: int
    # Total number of labels.
    total_labels: int
    # Total number of samples.
    total_samples: int
    # Overall gate outcome.
    gate_passed: bool
    # Threshold associated with the gate, when one was configured.
    gate_threshold: float | None = field(default=None)
    # Free-form notes; a fresh list per instance.
    notes: list[str] = field(default_factory=list)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass
class JudgeCalibrationResult:
    """Complete output of a judge calibration.

    Bundles the run-level summary with per-metric results, per-case
    results and any warnings. Every collection defaults to a fresh empty
    container per instance.
    """

    # Run-level summary (required).
    summary: JudgeCalibrationSummary
    # One entry per calibrated metric.
    metrics: list[JudgeCalibrationMetric] = field(default_factory=list)
    # Case records grouped under a string key (presumably the metric
    # name -- confirm against the producer).
    case_results: dict[str, list[JudgeCalibrationCase]] = field(default_factory=dict)
    # Warning messages collected for the result.
    warnings: list[str] = field(default_factory=list)
|
|
@@ -4,7 +4,7 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
from dataclasses import dataclass, field
|
|
6
6
|
from datetime import datetime
|
|
7
|
-
from typing import Any
|
|
7
|
+
from typing import Any, Literal, overload
|
|
8
8
|
from uuid import uuid4
|
|
9
9
|
|
|
10
10
|
REQUIRED_STAGE_TYPES: tuple[str, ...] = ("system_prompt", "input", "retrieval", "output")
|
|
@@ -82,8 +82,8 @@ class StageEvent:
|
|
|
82
82
|
duration_ms=_optional_float(payload.get("duration_ms")),
|
|
83
83
|
input_ref=input_ref,
|
|
84
84
|
output_ref=output_ref,
|
|
85
|
-
attributes=_ensure_dict(payload.get("attributes")),
|
|
86
|
-
metadata=_ensure_dict(payload.get("metadata")),
|
|
85
|
+
attributes=_ensure_dict(payload.get("attributes"), allow_none=False),
|
|
86
|
+
metadata=_ensure_dict(payload.get("metadata"), allow_none=False),
|
|
87
87
|
trace_id=_optional_str(payload.get("trace_id") or trace_payload.get("trace_id")),
|
|
88
88
|
span_id=_optional_str(payload.get("span_id") or trace_payload.get("span_id")),
|
|
89
89
|
)
|
|
@@ -187,6 +187,14 @@ def _parse_datetime(value: Any) -> datetime | None:
|
|
|
187
187
|
raise ValueError("Invalid datetime value")
|
|
188
188
|
|
|
189
189
|
|
|
190
|
+
@overload
|
|
191
|
+
def _ensure_dict(value: None, *, allow_none: Literal[True]) -> None: ...
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
@overload
|
|
195
|
+
def _ensure_dict(value: Any, *, allow_none: Literal[False] = False) -> dict[str, Any]: ...
|
|
196
|
+
|
|
197
|
+
|
|
190
198
|
def _ensure_dict(value: Any, *, allow_none: bool = False) -> dict[str, Any] | None:
|
|
191
199
|
if value is None:
|
|
192
200
|
return None if allow_none else {}
|
|
@@ -6,6 +6,10 @@ from evalvault.domain.metrics.entity_preservation import EntityPreservation
|
|
|
6
6
|
from evalvault.domain.metrics.insurance import InsuranceTermAccuracy
|
|
7
7
|
from evalvault.domain.metrics.no_answer import NoAnswerAccuracy, is_no_answer
|
|
8
8
|
from evalvault.domain.metrics.retrieval_rank import MRR, NDCG, HitRate
|
|
9
|
+
from evalvault.domain.metrics.summary_accuracy import SummaryAccuracy
|
|
10
|
+
from evalvault.domain.metrics.summary_needs_followup import SummaryNeedsFollowup
|
|
11
|
+
from evalvault.domain.metrics.summary_non_definitive import SummaryNonDefinitive
|
|
12
|
+
from evalvault.domain.metrics.summary_risk_coverage import SummaryRiskCoverage
|
|
9
13
|
from evalvault.domain.metrics.text_match import ExactMatch, F1Score
|
|
10
14
|
|
|
11
15
|
__all__ = [
|
|
@@ -19,5 +23,9 @@ __all__ = [
|
|
|
19
23
|
"MRR",
|
|
20
24
|
"NDCG",
|
|
21
25
|
"NoAnswerAccuracy",
|
|
26
|
+
"SummaryAccuracy",
|
|
27
|
+
"SummaryNeedsFollowup",
|
|
28
|
+
"SummaryNonDefinitive",
|
|
29
|
+
"SummaryRiskCoverage",
|
|
22
30
|
"is_no_answer",
|
|
23
31
|
]
|
|
@@ -123,7 +123,7 @@ _METRIC_SPECS: tuple[MetricSpec, ...] = (
|
|
|
123
123
|
),
|
|
124
124
|
MetricSpec(
|
|
125
125
|
name="summary_score",
|
|
126
|
-
description="Measures summary coverage and conciseness against contexts",
|
|
126
|
+
description="(LLM) Measures summary coverage and conciseness against contexts",
|
|
127
127
|
requires_ground_truth=False,
|
|
128
128
|
requires_embeddings=False,
|
|
129
129
|
source="ragas",
|
|
@@ -132,7 +132,7 @@ _METRIC_SPECS: tuple[MetricSpec, ...] = (
|
|
|
132
132
|
),
|
|
133
133
|
MetricSpec(
|
|
134
134
|
name="summary_faithfulness",
|
|
135
|
-
description="Measures whether summary statements are grounded in contexts",
|
|
135
|
+
description="(LLM) Measures whether summary statements are grounded in contexts",
|
|
136
136
|
requires_ground_truth=False,
|
|
137
137
|
requires_embeddings=False,
|
|
138
138
|
source="ragas",
|
|
@@ -141,7 +141,43 @@ _METRIC_SPECS: tuple[MetricSpec, ...] = (
|
|
|
141
141
|
),
|
|
142
142
|
MetricSpec(
|
|
143
143
|
name="entity_preservation",
|
|
144
|
-
description="Measures preservation of key insurance entities in summaries",
|
|
144
|
+
description="(Rule) Measures preservation of key insurance entities in summaries",
|
|
145
|
+
requires_ground_truth=False,
|
|
146
|
+
requires_embeddings=False,
|
|
147
|
+
source="custom",
|
|
148
|
+
category="summary",
|
|
149
|
+
signal_group="summary_fidelity",
|
|
150
|
+
),
|
|
151
|
+
MetricSpec(
|
|
152
|
+
name="summary_accuracy",
|
|
153
|
+
description="(Rule) Measures whether summary entities are grounded in contexts",
|
|
154
|
+
requires_ground_truth=False,
|
|
155
|
+
requires_embeddings=False,
|
|
156
|
+
source="custom",
|
|
157
|
+
category="summary",
|
|
158
|
+
signal_group="summary_fidelity",
|
|
159
|
+
),
|
|
160
|
+
MetricSpec(
|
|
161
|
+
name="summary_risk_coverage",
|
|
162
|
+
description="(Rule) Measures coverage of expected insurance risk tags in summaries",
|
|
163
|
+
requires_ground_truth=False,
|
|
164
|
+
requires_embeddings=False,
|
|
165
|
+
source="custom",
|
|
166
|
+
category="summary",
|
|
167
|
+
signal_group="summary_fidelity",
|
|
168
|
+
),
|
|
169
|
+
MetricSpec(
|
|
170
|
+
name="summary_non_definitive",
|
|
171
|
+
description="(Rule) Measures avoidance of definitive claims in summaries",
|
|
172
|
+
requires_ground_truth=False,
|
|
173
|
+
requires_embeddings=False,
|
|
174
|
+
source="custom",
|
|
175
|
+
category="summary",
|
|
176
|
+
signal_group="summary_fidelity",
|
|
177
|
+
),
|
|
178
|
+
MetricSpec(
|
|
179
|
+
name="summary_needs_followup",
|
|
180
|
+
description="(Rule) Measures follow-up guidance when required",
|
|
145
181
|
requires_ground_truth=False,
|
|
146
182
|
requires_embeddings=False,
|
|
147
183
|
source="custom",
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from decimal import Decimal, InvalidOperation
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class SummaryAccuracy:
    """Measure whether summary entities are supported by contexts.

    Entities are extracted with rules only (no model calls): percentages,
    currency amounts, durations, dates and a fixed list of Korean/English
    insurance keywords. The score is the share of entities found in the
    summary that are also found in the joined contexts.
    """

    name = "summary_accuracy"

    # A number may contain any count of grouping/decimal separators so that
    # "1,000,000" is captured whole instead of being cut to its tail "000,000".
    _NUMBER = r"(?P<number>\d+(?:[.,]\d+)*)"

    _PERCENT_RE = re.compile(_NUMBER + r"\s*(?P<unit>%|퍼센트|percent)", re.I)
    # English units must not be followed by another ASCII letter, otherwise
    # "100 wonderful" would be read as 100 won.
    _CURRENCY_RE = re.compile(
        _NUMBER + r"\s*(?P<unit>억원|만원|원|달러|(?:usd|krw|won)(?![a-z]))",
        re.I,
    )
    _CURRENCY_PREFIX_RE = re.compile(r"(?P<unit>[$₩])\s*" + _NUMBER)
    _DURATION_RE = re.compile(
        _NUMBER + r"\s*(?P<unit>년|개월|월|일|(?:years?|months?|days?)(?![a-z]))",
        re.I,
    )
    _DATE_RE = re.compile(r"\b\d{4}[./-]\d{1,2}[./-]\d{1,2}\b")

    _CURRENCY_MULTIPLIERS = {"만원": Decimal("10000"), "억원": Decimal("100000000")}
    _KRW_UNITS = {"원", "krw", "won", "₩", "만원", "억원"}
    _USD_UNITS = {"달러", "usd", "$"}
    _DURATION_UNITS = {
        "년": "year",
        "year": "year",
        "years": "year",
        "개월": "month",
        "월": "month",
        "month": "month",
        "months": "month",
        "일": "day",
        "day": "day",
        "days": "day",
    }

    _KEYWORDS_KO = (
        "면책",
        "제외",
        "단서",
        "다만",
        "조건",
        "자기부담",
        "한도",
        "감액",
    )
    _KEYWORDS_EN = (
        "exclusion",
        "excluded",
        "exception",
        "except",
        "condition",
        "deductible",
        "limit",
        "cap",
        "waiting period",
        "co-pay",
        "copay",
        "co-insurance",
        "coinsurance",
    )
    # Whole-word matcher for the English keywords (optional plural suffix).
    # Longest keywords come first so "exception" wins over "except"; the
    # letter guards stop "cap" from firing inside "capital" or "escape".
    _KEYWORD_EN_RE = re.compile(
        r"(?<![a-z])(?P<kw>"
        + "|".join(re.escape(kw) for kw in sorted(_KEYWORDS_EN, key=len, reverse=True))
        + r")(?:s|es)?(?![a-z])"
    )

    def score(self, answer: str, contexts: list[str]) -> float:
        """Return the fraction of summary entities that also occur in contexts.

        Args:
            answer: Summary text; ``None`` or empty is treated as "".
            contexts: Context passages; falsy items are ignored.

        Returns:
            0.0 when ``contexts`` is empty or yields no entities while the
            summary has some; 0.5 when the summary has no entities but the
            contexts do; otherwise ``supported / total`` summary entities.
        """
        if not contexts:
            return 0.0

        context_entities = self._extract_entities(" ".join(ctx for ctx in contexts if ctx))
        summary_entities = self._extract_entities(answer or "")

        if not summary_entities:
            return 0.5 if context_entities else 0.0
        if not context_entities:
            return 0.0

        supported = summary_entities & context_entities
        return len(supported) / len(summary_entities)

    def _extract_entities(self, text: str) -> set[str]:
        """Return the union of numeric and keyword entities found in ``text``."""
        return self._extract_numeric_entities(text) | self._extract_keyword_entities(text)

    def _extract_numeric_entities(self, text: str) -> set[str]:
        """Extract normalized percent, currency, duration and date entities."""
        entities: set[str] = set()

        for match in self._PERCENT_RE.finditer(text):
            number = self._normalize_number(match.group("number"))
            if number:
                entities.add(f"percent:{number}")

        # Suffix units ("100원", "5 usd") and prefix symbols ("$5") share the
        # same normalization; `_normalize_currency` lowercases the unit.
        for pattern in (self._CURRENCY_RE, self._CURRENCY_PREFIX_RE):
            for match in pattern.finditer(text):
                number = self._normalize_number(match.group("number"))
                normalized = self._normalize_currency(number, match.group("unit"))
                if normalized:
                    entities.add(f"currency:{normalized}")

        for match in self._DURATION_RE.finditer(text):
            number = self._normalize_number(match.group("number"))
            normalized = self._normalize_duration(number, match.group("unit").lower())
            if normalized:
                entities.add(f"duration:{normalized}")

        for match in self._DATE_RE.finditer(text):
            entities.add(f"date:{self._normalize_date(match.group(0))}")

        return entities

    def _extract_keyword_entities(self, text: str) -> set[str]:
        """Extract ``kw:<keyword>`` entities for the known insurance keywords.

        Korean keywords are matched as substrings; English keywords are
        matched case-insensitively as whole words (plural forms included).
        """
        entities = {f"kw:{keyword}" for keyword in self._KEYWORDS_KO if keyword in text}
        for match in self._KEYWORD_EN_RE.finditer(text.lower()):
            entities.add(f"kw:{match.group('kw')}")
        return entities

    def _normalize_currency(self, number: str | None, unit: str) -> str | None:
        """Return ``"<currency>:<amount>"`` or ``None`` if ``number`` is unusable.

        Amounts in 만원/억원 are expanded to plain won so that "100만원" and
        "1,000,000원" normalize to the same entity.
        """
        if number is None:
            return None
        try:
            value = Decimal(number)
        except InvalidOperation:
            return None

        unit_key = unit.lower()
        multiplier = self._CURRENCY_MULTIPLIERS.get(unit_key)
        if multiplier:
            value *= multiplier

        if unit_key in self._KRW_UNITS:
            currency = "krw"
        elif unit_key in self._USD_UNITS:
            currency = "usd"
        else:
            currency = unit_key

        return f"{currency}:{self._format_decimal(value)}"

    def _normalize_duration(self, number: str | None, unit: str) -> str | None:
        """Return ``"<amount><year|month|day>"`` or ``None`` if ``number`` is unusable."""
        if number is None:
            return None
        try:
            value = Decimal(number)
        except InvalidOperation:
            return None
        base_unit = self._DURATION_UNITS.get(unit, unit)
        return f"{self._format_decimal(value)}{base_unit}"

    @staticmethod
    def _normalize_date(raw: str) -> str:
        """Return ``raw`` as ``YYYYMMDD`` with zero-padded month and day.

        Padding keeps "2024-1-15" and "2024-11-5" distinct and makes
        "2024-1-5" equal to "2024-01-05". Input that is not three numeric
        parts falls back to simply dropping the separators.
        """
        parts = re.split(r"[./-]", raw)
        if len(parts) == 3 and all(part.isdecimal() for part in parts):
            year, month, day = parts
            return f"{year}{int(month):02d}{int(day):02d}"
        return re.sub(r"[./-]", "", raw)

    @staticmethod
    def _normalize_number(raw: str | None) -> str | None:
        """Return a canonical decimal string for ``raw``, or ``None`` if invalid.

        Commas are treated as grouping separators and removed.
        """
        if raw is None:
            return None
        cleaned = raw.replace(",", "").strip()
        if not cleaned:
            return None
        try:
            value = Decimal(cleaned)
        except InvalidOperation:
            return None
        return SummaryAccuracy._format_decimal(value)

    @staticmethod
    def _format_decimal(value: Decimal) -> str:
        """Format ``value`` without exponent or trailing fractional zeros."""
        if value == value.to_integral_value():
            return str(value.to_integral_value())
        return format(value.normalize(), "f").rstrip("0").rstrip(".")
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class SummaryNeedsFollowup:
    """Check if follow-up guidance appears when required.

    A summary scores 1.0 when the presence of follow-up wording matches the
    expectation declared in ``metadata["summary_tags"]`` (tag
    ``needs_followup``), and 0.0 otherwise.
    """

    name = "summary_needs_followup"

    _FOLLOWUP_KEYWORDS = [
        "확인 필요",
        "추가 확인",
        "담당자 확인",
        "재문의",
        "추가 문의",
        "서류 확인",
        "follow up",
        "follow-up",
    ]

    def score(self, answer: str, contexts: list[str], metadata: dict | None = None) -> float:
        """Return 1.0 when follow-up wording is present exactly when expected.

        Args:
            answer: Summary text; ``None`` or empty is treated as "".
            contexts: Unused by this metric.
            metadata: Optional mapping; its ``summary_tags`` entry decides
                whether follow-up guidance is expected.

        Returns:
            1.0 if (expected and present) or (not expected and absent),
            else 0.0.
        """
        has_followup = self._has_followup(answer or "")
        expected = self._expects_followup(metadata)
        return 1.0 if has_followup == expected else 0.0

    def _expects_followup(self, metadata: dict | None) -> bool:
        """Return True when ``summary_tags`` contains ``needs_followup``.

        Tags may be a list/tuple/set of values or a single string; a string
        may hold several comma-separated tags. Comparison ignores case and
        surrounding whitespace.
        """
        if not metadata:
            return False
        raw = metadata.get("summary_tags")
        if not raw:
            return False

        if isinstance(raw, str):
            # Datasets loaded from flat files commonly store tags as "a, b".
            items = raw.split(",")
        elif isinstance(raw, (list, tuple, set, frozenset)):
            items = raw
        else:
            items = [raw]
        return any(str(item).strip().lower() == "needs_followup" for item in items)

    def _has_followup(self, text: str) -> bool:
        """Return True when ``text`` contains any follow-up keyword (case-insensitive)."""
        lowered = text.lower()
        return any(keyword.lower() in lowered for keyword in self._FOLLOWUP_KEYWORDS)
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class SummaryNonDefinitive:
    """Penalize definitive statements in summaries.

    The score is 0.0 when the summary contains any definitive wording from
    the Korean/English pattern lists (or a literal "100%"), else 1.0.
    """

    name = "summary_non_definitive"

    _DEFINITIVE_PATTERNS_KO = [
        r"무조건",
        r"반드시",
        r"전액\s*지급",
        r"확실히",
        r"분명히",
        r"절대",
        r"항상",
    ]
    _DEFINITIVE_PATTERNS_EN = [
        r"always",
        r"guaranteed",
        r"definitely",
        r"certainly",
        r"absolutely",
    ]
    # "100%" must not be the tail of a larger number such as "1100%".
    _FULL_PERCENT_PATTERN = r"(?<![\d.,])100\s*%"

    # Single pre-compiled matcher. English words are guarded against adjacent
    # ASCII letters so that "indefinitely" / "uncertainly" (which mean the
    # opposite) are not flagged via their "definitely" / "certainly" tails.
    _DEFINITIVE_RE = re.compile(
        "|".join(
            [
                *_DEFINITIVE_PATTERNS_KO,
                r"(?<![a-z])(?:" + "|".join(_DEFINITIVE_PATTERNS_EN) + r")(?![a-z])",
                _FULL_PERCENT_PATTERN,
            ]
        ),
        re.IGNORECASE,
    )

    def score(self, answer: str, contexts: list[str]) -> float:
        """Return 0.0 if ``answer`` contains definitive wording, else 1.0.

        Args:
            answer: Summary text; ``None`` or empty is treated as "".
            contexts: Unused by this metric.
        """
        return 0.0 if self._has_definitive_pattern(answer or "") else 1.0

    def _has_definitive_pattern(self, text: str) -> bool:
        """Return True when any definitive pattern occurs in ``text``."""
        return self._DEFINITIVE_RE.search(text) is not None
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class SummaryRiskCoverage:
    """Measure coverage of expected insurance risk tags in summary.

    Expected tags come from ``metadata["summary_tags"]``. A tag counts as
    covered when any of its keywords appears in the summary; the score is
    the share of distinct expected tags that are covered.
    """

    name = "summary_risk_coverage"

    _TAG_KEYWORDS = {
        "exclusion": ["면책", "보장 제외", "지급 불가", "exclusion"],
        "deductible": ["자기부담", "본인부담금", "deductible", "copay"],
        "limit": ["한도", "상한", "최대", "limit", "cap"],
        "waiting_period": ["면책기간", "대기기간", "waiting period"],
        "condition": ["조건", "단서", "다만", "condition"],
        "documents_required": ["서류", "진단서", "영수증", "documents"],
        # Kept in sync with SummaryNeedsFollowup's follow-up keyword list.
        "needs_followup": [
            "확인 필요",
            "추가 확인",
            "담당자 확인",
            "재문의",
            "추가 문의",
            "서류 확인",
            "follow up",
            "follow-up",
        ],
    }

    def score(self, answer: str, contexts: list[str], metadata: dict | None = None) -> float:
        """Return the fraction of expected tags whose keywords occur in ``answer``.

        Args:
            answer: Summary text; ``None`` or empty is treated as "".
            contexts: Unused by this metric.
            metadata: Optional mapping carrying ``summary_tags``.

        Returns:
            1.0 when no tags are expected, otherwise covered / expected.
            Tags without an entry in ``_TAG_KEYWORDS`` are never covered.
        """
        expected_tags = self._extract_expected_tags(metadata)
        if not expected_tags:
            return 1.0

        text = answer or ""
        covered = sum(1 for tag in expected_tags if self._has_tag_keyword(text, tag))
        return covered / len(expected_tags)

    def _extract_expected_tags(self, metadata: dict | None) -> list[str]:
        """Return the distinct, lower-cased expected tags in first-seen order.

        Tags may be a list/tuple/set of values or a single string; a string
        may hold several comma-separated tags. Blank entries are dropped and
        duplicates are removed so a repeated tag cannot skew the ratio.
        """
        if not metadata:
            return []
        raw = metadata.get("summary_tags")
        if not raw:
            return []

        if isinstance(raw, str):
            items = raw.split(",")
        elif isinstance(raw, (list, tuple, set, frozenset)):
            items = raw
        else:
            items = [raw]

        tags = (str(item).strip().lower() for item in items)
        return list(dict.fromkeys(tag for tag in tags if tag))

    def _has_tag_keyword(self, text: str, tag: str) -> bool:
        """Return True when any keyword registered for ``tag`` occurs in ``text``.

        Korean keywords are matched as substrings. ASCII keywords are matched
        case-insensitively as whole words (optional plural suffix), so "cap"
        does not fire inside "capital".
        """
        import re  # local import: this module has no top-level `re` import

        lowered = text.lower()
        for keyword in self._TAG_KEYWORDS.get(tag, []):
            needle = keyword.lower()
            if needle.isascii():
                pattern = r"(?<![a-z])" + re.escape(needle) + r"(?:s|es)?(?![a-z])"
                if re.search(pattern, lowered):
                    return True
            elif needle in lowered:
                return True
        return False
|