evalvault 1.65.0__py3-none-any.whl → 1.66.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/adapter.py +14 -0
- evalvault/adapters/inbound/api/main.py +14 -4
- evalvault/adapters/inbound/api/routers/chat.py +543 -0
- evalvault/adapters/inbound/cli/commands/run.py +14 -0
- evalvault/adapters/inbound/cli/commands/run_helpers.py +21 -2
- evalvault/adapters/outbound/report/llm_report_generator.py +13 -1
- evalvault/adapters/outbound/storage/base_sql.py +41 -1
- evalvault/adapters/outbound/tracker/langfuse_adapter.py +1 -0
- evalvault/adapters/outbound/tracker/mlflow_adapter.py +5 -0
- evalvault/adapters/outbound/tracker/phoenix_adapter.py +29 -2
- evalvault/config/settings.py +21 -0
- evalvault/domain/entities/prompt.py +1 -1
- evalvault/domain/metrics/__init__.py +8 -0
- evalvault/domain/metrics/registry.py +39 -3
- evalvault/domain/metrics/summary_accuracy.py +189 -0
- evalvault/domain/metrics/summary_needs_followup.py +45 -0
- evalvault/domain/metrics/summary_non_definitive.py +41 -0
- evalvault/domain/metrics/summary_risk_coverage.py +45 -0
- evalvault/domain/services/custom_metric_snapshot.py +233 -0
- evalvault/domain/services/evaluator.py +280 -27
- evalvault/domain/services/prompt_registry.py +39 -10
- evalvault/domain/services/threshold_profiles.py +4 -0
- evalvault/domain/services/visual_space_service.py +79 -4
- {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/METADATA +25 -1
- {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/RECORD +28 -22
- {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/WHEEL +0 -0
- {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -499,8 +499,20 @@ SUMMARY_RECOMMENDED_THRESHOLDS = {
     "summary_faithfulness": 0.90,
     "summary_score": 0.85,
     "entity_preservation": 0.90,
+    "summary_accuracy": 0.90,
+    "summary_risk_coverage": 0.90,
+    "summary_non_definitive": 0.80,
+    "summary_needs_followup": 0.80,
 }
-SUMMARY_METRIC_ORDER = (
+SUMMARY_METRIC_ORDER = (
+    "summary_faithfulness",
+    "summary_score",
+    "entity_preservation",
+    "summary_accuracy",
+    "summary_risk_coverage",
+    "summary_non_definitive",
+    "summary_needs_followup",
+)


 @dataclass
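A note on how the two constants above relate: every name in SUMMARY_METRIC_ORDER has an entry in SUMMARY_RECOMMENDED_THRESHOLDS, so the order can drive threshold-aware rendering. A minimal sketch, assuming both constants are imported from the patched module (not named in this excerpt); the `scores` dict is illustrative:

```python
# Illustrative only: pair each ordered summary metric with its recommended threshold.
scores = {"summary_accuracy": 0.93, "summary_non_definitive": 0.70}  # hypothetical run output

for name in SUMMARY_METRIC_ORDER:
    threshold = SUMMARY_RECOMMENDED_THRESHOLDS[name]  # every ordered name has a threshold above
    if name in scores:
        verdict = "pass" if scores[name] >= threshold else "fail"
        print(f"{name}: {scores[name]:.2f} ({verdict} @ {threshold})")
```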
evalvault/adapters/outbound/storage/base_sql.py
CHANGED
@@ -664,6 +664,8 @@ class BaseSQLStorageAdapter(ABC):
     def export_run_to_excel(self, run_id: str, output_path) -> Path:
         from openpyxl import Workbook

+        from evalvault.domain.metrics.registry import get_metric_spec_map
+
         output = Path(output_path)
         output.parent.mkdir(parents=True, exist_ok=True)

@@ -837,6 +839,23 @@ class BaseSQLStorageAdapter(ABC):

         summary_rows: list[dict[str, Any]] = []
         run_payload = run_rows[0] if run_rows else {}
+        custom_metric_rows: list[dict[str, Any]] = []
+        run_metadata = self._deserialize_json(run_payload.get("metadata")) if run_payload else None
+        if isinstance(run_metadata, dict):
+            custom_snapshot = run_metadata.get("custom_metric_snapshot")
+            if isinstance(custom_snapshot, dict):
+                entries = custom_snapshot.get("metrics")
+                if isinstance(entries, list):
+                    for entry in entries:
+                        if isinstance(entry, dict):
+                            row = dict(entry)
+                            row["schema_version"] = custom_snapshot.get("schema_version")
+                            custom_metric_rows.append(row)
+        if custom_metric_rows:
+            custom_metric_rows = self._normalize_rows(
+                custom_metric_rows,
+                json_columns={"inputs", "rules"},
+            )
         prompt_set_id = None
         prompt_set_name = None
         if run_prompt_payloads:
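The reads above imply the snapshot layout stored in the run's metadata. A sketch of the assumed shape (inferred from this export logic and the CustomMetrics sheet columns later in this diff; values are illustrative, not an authoritative schema):

```python
# Inferred from the export logic above; only "schema_version" and "metrics" are read here.
run_metadata = {
    "custom_metric_snapshot": {
        "schema_version": 1,
        "metrics": [
            {
                "metric_name": "summary_accuracy",
                "source": "custom",
                "inputs": ["answer", "contexts"],  # serialized to JSON by _normalize_rows
                "rules": ["entity grounding"],     # serialized to JSON by _normalize_rows
            },
        ],
    },
}
```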
@@ -878,14 +897,17 @@
                 if isinstance(threshold, (int, float)) and score >= threshold:
                     entry["pass_count"] += 1

+        metric_spec_map = get_metric_spec_map()
         for entry in metrics_index.values():
             count = entry["count"] or 0
+            spec = metric_spec_map.get(entry["metric_name"])
             metric_summary_rows.append(
                 {
                     "metric_name": entry["metric_name"],
                     "avg_score": (entry["score_sum"] / count) if count else None,
                     "pass_rate": (entry["pass_count"] / count) if count else None,
                     "samples": count,
+                    "source": spec.source if spec else None,
                 }
             )

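`get_metric_spec_map` is used purely as a name-to-spec lookup so each summary row can carry its metric's source. A minimal usage sketch, assuming it returns a plain `dict[str, MetricSpec]` built from the registry entries shown later in this diff:

```python
from evalvault.domain.metrics.registry import get_metric_spec_map

spec_map = get_metric_spec_map()
spec = spec_map.get("summary_accuracy")
print(spec.source if spec else None)  # "custom", per the registry hunk below
```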
@@ -956,7 +978,25 @@
             (
                 "MetricsSummary",
                 metric_summary_rows,
-                ["metric_name", "avg_score", "pass_rate", "samples"],
+                ["metric_name", "avg_score", "pass_rate", "samples", "source"],
+            ),
+            (
+                "CustomMetrics",
+                custom_metric_rows,
+                [
+                    "schema_version",
+                    "metric_name",
+                    "source",
+                    "description",
+                    "evaluation_method",
+                    "inputs",
+                    "output",
+                    "evaluation_process",
+                    "rules",
+                    "notes",
+                    "implementation_path",
+                    "implementation_hash",
+                ],
             ),
             (
                 "RunPromptSets",
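Each `(title, rows, columns)` tuple above presumably becomes one worksheet in the exported workbook. The writer itself is outside this diff; a hypothetical sketch of that step with openpyxl:

```python
from openpyxl import Workbook


def write_sheet(wb: Workbook, title: str, rows: list[dict], columns: list[str]) -> None:
    """Hypothetical helper: one header row, then one row per dict, in column order."""
    ws = wb.create_sheet(title)
    ws.append(columns)
    for row in rows:
        ws.append([row.get(col) for col in columns])
```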
evalvault/adapters/outbound/tracker/langfuse_adapter.py
CHANGED
@@ -360,6 +360,7 @@ class LangfuseAdapter(TrackerPort):
             "summary": trace_output["summary"],
             "metrics": metric_summary,
             "phoenix_links": phoenix_links or {},
+            "custom_metrics": (run.tracker_metadata or {}).get("custom_metric_snapshot"),
             "test_cases": [
                 {
                     "test_case_id": result.test_case_id,
evalvault/adapters/outbound/tracker/mlflow_adapter.py
CHANGED
@@ -220,6 +220,11 @@ class MLflowAdapter(TrackerPort):
             results_data.append(result_dict)

         self.save_artifact(trace_id, "test_results", results_data)
+        self.save_artifact(
+            trace_id,
+            "custom_metric_snapshot",
+            (run.tracker_metadata or {}).get("custom_metric_snapshot"),
+        )

         # 6. End MLflow run
         self.end_trace(trace_id)
evalvault/adapters/outbound/tracker/phoenix_adapter.py
CHANGED
@@ -352,13 +352,40 @@ class PhoenixAdapter(TrackerPort):
                 "version": run.dataset_version,
                 "total_test_cases": run.total_test_cases,
             },
+            "evaluation_config": {
+                "model": run.model_name,
+                "metrics": run.metrics_evaluated,
+                "thresholds": run.thresholds,
+            },
             "summary": {
-                "
+                "total_test_cases": run.total_test_cases,
+                "passed": run.passed_test_cases,
+                "failed": run.total_test_cases - run.passed_test_cases,
+                "pass_rate": round(run.pass_rate, 4),
+                "duration_seconds": round(run.duration_seconds, 2)
+                if run.duration_seconds
+                else None,
                 "total_tokens": run.total_tokens,
-                "duration_seconds": run.duration_seconds,
             },
             "metrics": metric_summary,
+            "custom_metrics": (run.tracker_metadata or {}).get("custom_metric_snapshot"),
+            "test_cases": [
+                {
+                    "test_case_id": result.test_case_id,
+                    "all_passed": result.all_passed,
+                    "metrics": {
+                        metric.name: {
+                            "score": metric.score,
+                            "threshold": metric.threshold,
+                            "passed": metric.passed,
+                        }
+                        for metric in result.metrics
+                    },
+                }
+                for result in run.results
+            ],
         }
+
         self.save_artifact(trace_id, "ragas_evaluation", structured_artifact)

         # End the trace
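Assembled, the `structured_artifact` saved above looks roughly like this (all values are illustrative, and keys unchanged by this hunk are omitted):

```python
structured_artifact = {
    "evaluation_config": {
        "model": "gpt-4o-mini",  # illustrative
        "metrics": ["summary_accuracy"],
        "thresholds": {"summary_accuracy": 0.90},
    },
    "summary": {
        "total_test_cases": 10,
        "passed": 9,
        "failed": 1,
        "pass_rate": 0.9,
        "duration_seconds": 12.34,
        "total_tokens": 5432,
    },
    "metrics": {},           # metric_summary (shape not shown in this excerpt)
    "custom_metrics": None,  # or the custom_metric_snapshot dict, when present
    "test_cases": [
        {
            "test_case_id": "tc-1",
            "all_passed": True,
            "metrics": {"summary_accuracy": {"score": 0.95, "threshold": 0.90, "passed": True}},
        },
    ],
}
```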
evalvault/config/settings.py
CHANGED
@@ -321,6 +321,27 @@ class Settings(BaseSettings):
         default="https://cloud.langfuse.com", description="Langfuse host URL"
     )

+    mcp_enabled: bool = Field(
+        default=False,
+        description="Enable MCP JSON-RPC endpoint over HTTP.",
+    )
+    mcp_protocol_version: str = Field(
+        default="2025-11-25",
+        description="MCP protocol version to advertise.",
+    )
+    mcp_server_version: str = Field(
+        default="0.1.0",
+        description="EvalVault MCP server version.",
+    )
+    mcp_auth_tokens: str | None = Field(
+        default=None,
+        description="Comma-separated bearer tokens for MCP endpoint (required).",
+    )
+    mcp_allowed_tools: str | None = Field(
+        default=None,
+        description="Comma-separated allowlist of MCP tool names.",
+    )
+
     # MLflow Configuration (optional)
     mlflow_tracking_uri: str | None = Field(default=None, description="MLflow tracking server URI")
     mlflow_experiment_name: str = Field(default="evalvault", description="MLflow experiment name")
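Because `Settings` extends pydantic's `BaseSettings`, these fields can presumably be supplied from the environment. A sketch assuming the default field-name-to-variable mapping (no custom `env_prefix`, which this excerpt does not show); the tool names are hypothetical:

```python
import os

os.environ["MCP_ENABLED"] = "true"
os.environ["MCP_AUTH_TOKENS"] = "token-one,token-two"  # comma-separated, required for access
os.environ["MCP_ALLOWED_TOOLS"] = "list_runs,get_run"  # hypothetical tool names
```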
evalvault/domain/metrics/__init__.py
CHANGED
@@ -6,6 +6,10 @@ from evalvault.domain.metrics.entity_preservation import EntityPreservation
 from evalvault.domain.metrics.insurance import InsuranceTermAccuracy
 from evalvault.domain.metrics.no_answer import NoAnswerAccuracy, is_no_answer
 from evalvault.domain.metrics.retrieval_rank import MRR, NDCG, HitRate
+from evalvault.domain.metrics.summary_accuracy import SummaryAccuracy
+from evalvault.domain.metrics.summary_needs_followup import SummaryNeedsFollowup
+from evalvault.domain.metrics.summary_non_definitive import SummaryNonDefinitive
+from evalvault.domain.metrics.summary_risk_coverage import SummaryRiskCoverage
 from evalvault.domain.metrics.text_match import ExactMatch, F1Score

 __all__ = [
@@ -19,5 +23,9 @@ __all__ = [
     "MRR",
     "NDCG",
     "NoAnswerAccuracy",
+    "SummaryAccuracy",
+    "SummaryNeedsFollowup",
+    "SummaryNonDefinitive",
+    "SummaryRiskCoverage",
     "is_no_answer",
 ]
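With these exports in place, the new metrics are importable from the package's metrics namespace:

```python
from evalvault.domain.metrics import (
    SummaryAccuracy,
    SummaryNeedsFollowup,
    SummaryNonDefinitive,
    SummaryRiskCoverage,
)
```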
evalvault/domain/metrics/registry.py
CHANGED
@@ -123,7 +123,7 @@ _METRIC_SPECS: tuple[MetricSpec, ...] = (
     ),
     MetricSpec(
         name="summary_score",
-        description="Measures summary coverage and conciseness against contexts",
+        description="(LLM) Measures summary coverage and conciseness against contexts",
         requires_ground_truth=False,
         requires_embeddings=False,
         source="ragas",
@@ -132,7 +132,7 @@ _METRIC_SPECS: tuple[MetricSpec, ...] = (
     ),
     MetricSpec(
         name="summary_faithfulness",
-        description="Measures whether summary statements are grounded in contexts",
+        description="(LLM) Measures whether summary statements are grounded in contexts",
         requires_ground_truth=False,
         requires_embeddings=False,
         source="ragas",
@@ -141,7 +141,43 @@ _METRIC_SPECS: tuple[MetricSpec, ...] = (
     ),
     MetricSpec(
         name="entity_preservation",
-        description="Measures preservation of key insurance entities in summaries",
+        description="(Rule) Measures preservation of key insurance entities in summaries",
+        requires_ground_truth=False,
+        requires_embeddings=False,
+        source="custom",
+        category="summary",
+        signal_group="summary_fidelity",
+    ),
+    MetricSpec(
+        name="summary_accuracy",
+        description="(Rule) Measures whether summary entities are grounded in contexts",
+        requires_ground_truth=False,
+        requires_embeddings=False,
+        source="custom",
+        category="summary",
+        signal_group="summary_fidelity",
+    ),
+    MetricSpec(
+        name="summary_risk_coverage",
+        description="(Rule) Measures coverage of expected insurance risk tags in summaries",
+        requires_ground_truth=False,
+        requires_embeddings=False,
+        source="custom",
+        category="summary",
+        signal_group="summary_fidelity",
+    ),
+    MetricSpec(
+        name="summary_non_definitive",
+        description="(Rule) Measures avoidance of definitive claims in summaries",
+        requires_ground_truth=False,
+        requires_embeddings=False,
+        source="custom",
+        category="summary",
+        signal_group="summary_fidelity",
+    ),
+    MetricSpec(
+        name="summary_needs_followup",
+        description="(Rule) Measures follow-up guidance when required",
         requires_ground_truth=False,
         requires_embeddings=False,
         source="custom",
evalvault/domain/metrics/summary_accuracy.py
ADDED
@@ -0,0 +1,189 @@
+from __future__ import annotations
+
+import re
+from decimal import Decimal, InvalidOperation
+
+
+class SummaryAccuracy:
+    """Measure whether summary entities are supported by contexts."""
+
+    name = "summary_accuracy"
+
+    _PERCENT_RE = re.compile(r"(?P<number>\d+(?:[.,]\d+)?)\s*(?P<unit>%|퍼센트|percent)", re.I)
+    _CURRENCY_RE = re.compile(
+        r"(?P<number>\d+(?:[.,]\d+)?)\s*(?P<unit>원|만원|억원|달러|usd|krw|won)",
+        re.I,
+    )
+    _CURRENCY_PREFIX_RE = re.compile(r"(?P<unit>[$₩])\s*(?P<number>\d+(?:[.,]\d+)?)")
+    _DURATION_RE = re.compile(
+        r"(?P<number>\d+(?:[.,]\d+)?)\s*(?P<unit>년|개월|월|일|years?|months?|days?)",
+        re.I,
+    )
+    _DATE_RE = re.compile(r"\b\d{4}[./-]\d{1,2}[./-]\d{1,2}\b")
+
+    _CURRENCY_MULTIPLIERS = {"만원": Decimal("10000"), "억원": Decimal("100000000")}
+    _KRW_UNITS = {"원", "krw", "won", "₩", "만원", "억원"}
+    _USD_UNITS = {"달러", "usd", "$"}
+    _DURATION_UNITS = {
+        "년": "year",
+        "year": "year",
+        "years": "year",
+        "개월": "month",
+        "월": "month",
+        "month": "month",
+        "months": "month",
+        "일": "day",
+        "day": "day",
+        "days": "day",
+    }
+
+    _KEYWORDS_KO = (
+        "면책",
+        "제외",
+        "단서",
+        "다만",
+        "조건",
+        "자기부담",
+        "한도",
+        "감액",
+    )
+    _KEYWORDS_EN = (
+        "exclusion",
+        "excluded",
+        "exception",
+        "except",
+        "condition",
+        "deductible",
+        "limit",
+        "cap",
+        "waiting period",
+        "co-pay",
+        "copay",
+        "co-insurance",
+        "coinsurance",
+    )
+
+    def score(self, answer: str, contexts: list[str]) -> float:
+        if not contexts:
+            return 0.0
+
+        context_text = " ".join([ctx for ctx in contexts if ctx])
+        context_entities = self._extract_entities(context_text)
+        summary_entities = self._extract_entities(answer or "")
+
+        if not summary_entities:
+            return 0.5 if context_entities else 0.0
+        if not context_entities:
+            return 0.0
+
+        supported = summary_entities.intersection(context_entities)
+        return len(supported) / len(summary_entities)
+
+    def _extract_entities(self, text: str) -> set[str]:
+        entities = set()
+        entities.update(self._extract_numeric_entities(text))
+        entities.update(self._extract_keyword_entities(text))
+        return entities
+
+    def _extract_numeric_entities(self, text: str) -> set[str]:
+        entities: set[str] = set()
+
+        for match in self._PERCENT_RE.finditer(text):
+            number = self._normalize_number(match.group("number"))
+            if number:
+                entities.add(f"percent:{number}")
+
+        for match in self._CURRENCY_RE.finditer(text):
+            number = self._normalize_number(match.group("number"))
+            unit = match.group("unit").lower()
+            normalized = self._normalize_currency(number, unit)
+            if normalized:
+                entities.add(f"currency:{normalized}")
+
+        for match in self._CURRENCY_PREFIX_RE.finditer(text):
+            number = self._normalize_number(match.group("number"))
+            unit = match.group("unit")
+            normalized = self._normalize_currency(number, unit)
+            if normalized:
+                entities.add(f"currency:{normalized}")
+
+        for match in self._DURATION_RE.finditer(text):
+            number = self._normalize_number(match.group("number"))
+            unit = match.group("unit").lower()
+            normalized = self._normalize_duration(number, unit)
+            if normalized:
+                entities.add(f"duration:{normalized}")
+
+        for match in self._DATE_RE.finditer(text):
+            entities.add(f"date:{self._normalize_date(match.group(0))}")
+
+        return entities
+
+    def _extract_keyword_entities(self, text: str) -> set[str]:
+        entities: set[str] = set()
+        lower = text.lower()
+
+        for keyword in self._KEYWORDS_KO:
+            if keyword in text:
+                entities.add(f"kw:{keyword}")
+
+        for keyword in self._KEYWORDS_EN:
+            if keyword in lower:
+                entities.add(f"kw:{keyword}")
+
+        return entities
+
+    def _normalize_currency(self, number: str | None, unit: str) -> str | None:
+        if number is None:
+            return None
+        try:
+            value = Decimal(number)
+        except InvalidOperation:
+            return None
+
+        unit_key = unit.lower()
+        multiplier = self._CURRENCY_MULTIPLIERS.get(unit_key)
+        if multiplier:
+            value *= multiplier
+
+        if unit_key in self._KRW_UNITS:
+            currency = "krw"
+        elif unit_key in self._USD_UNITS:
+            currency = "usd"
+        else:
+            currency = unit_key
+
+        return f"{currency}:{self._format_decimal(value)}"
+
+    def _normalize_duration(self, number: str | None, unit: str) -> str | None:
+        if number is None:
+            return None
+        try:
+            value = Decimal(number)
+        except InvalidOperation:
+            return None
+        base_unit = self._DURATION_UNITS.get(unit, unit)
+        return f"{self._format_decimal(value)}{base_unit}"
+
+    @staticmethod
+    def _normalize_date(raw: str) -> str:
+        return re.sub(r"[./-]", "", raw)
+
+    @staticmethod
+    def _normalize_number(raw: str | None) -> str | None:
+        if raw is None:
+            return None
+        cleaned = raw.replace(",", "").strip()
+        if not cleaned:
+            return None
+        try:
+            value = Decimal(cleaned)
+        except InvalidOperation:
+            return None
+        return SummaryAccuracy._format_decimal(value)
+
+    @staticmethod
+    def _format_decimal(value: Decimal) -> str:
+        if value == value.to_integral_value():
+            return str(value.to_integral_value())
+        return format(value.normalize(), "f").rstrip("0").rstrip(".")
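A worked example of the new metric. Per the extraction rules above, "1,000만원" normalizes to `currency:krw:10000000`, "20%" to `percent:20`, and "한도" / "자기부담" are keyword entities:

```python
metric = SummaryAccuracy()
contexts = ["이 특약의 보장 한도는 1,000만원이고 자기부담 비율은 20%입니다."]

# All four summary entities appear in the contexts -> 1.0
print(metric.score("보장 한도는 1,000만원, 자기부담은 20%입니다.", contexts))  # 1.0

# "2,000만원" is not grounded -> 3 of 4 entities supported -> 0.75
print(metric.score("보장 한도는 2,000만원, 자기부담은 20%입니다.", contexts))  # 0.75
```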
evalvault/domain/metrics/summary_needs_followup.py
ADDED
@@ -0,0 +1,45 @@
+from __future__ import annotations
+
+
+class SummaryNeedsFollowup:
+    """Check if follow-up guidance appears when required."""
+
+    name = "summary_needs_followup"
+
+    _FOLLOWUP_KEYWORDS = [
+        "확인 필요",
+        "추가 확인",
+        "담당자 확인",
+        "재문의",
+        "추가 문의",
+        "서류 확인",
+        "follow up",
+        "follow-up",
+    ]
+
+    def score(self, answer: str, contexts: list[str], metadata: dict | None = None) -> float:
+        text = answer or ""
+        has_followup = self._has_followup(text)
+        expected = self._expects_followup(metadata)
+
+        if expected:
+            return 1.0 if has_followup else 0.0
+        return 1.0 if not has_followup else 0.0
+
+    def _expects_followup(self, metadata: dict | None) -> bool:
+        if not metadata:
+            return False
+        raw = metadata.get("summary_tags")
+        if not raw:
+            return False
+        if isinstance(raw, list):
+            tags = [str(item).strip().lower() for item in raw if str(item).strip()]
+        else:
+            tags = [str(raw).strip().lower()]
+        return "needs_followup" in tags
+
+    def _has_followup(self, text: str) -> bool:
+        lowered = text.lower()
+        return any(
+            keyword in text or keyword.lower() in lowered for keyword in self._FOLLOWUP_KEYWORDS
+        )
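Usage sketch for the symmetric pass/fail logic above; the `summary_tags` metadata drives whether follow-up guidance is expected:

```python
metric = SummaryNeedsFollowup()
tagged = {"summary_tags": ["needs_followup"]}

print(metric.score("정확한 보장 여부는 담당자 확인이 필요합니다.", [], tagged))  # 1.0 ("담당자 확인" present)
print(metric.score("해당 치료비는 보장 대상입니다.", [], tagged))               # 0.0 (guidance missing)
print(metric.score("해당 치료비는 보장 대상입니다.", [], {}))                   # 1.0 (no tag, none expected)
```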
evalvault/domain/metrics/summary_non_definitive.py
ADDED
@@ -0,0 +1,41 @@
+from __future__ import annotations
+
+import re
+
+
+class SummaryNonDefinitive:
+    """Penalize definitive statements in summaries."""
+
+    name = "summary_non_definitive"
+
+    _DEFINITIVE_PATTERNS_KO = [
+        r"무조건",
+        r"반드시",
+        r"100%",
+        r"전액\s*지급",
+        r"확실히",
+        r"분명히",
+        r"절대",
+        r"항상",
+    ]
+    _DEFINITIVE_PATTERNS_EN = [
+        r"always",
+        r"guaranteed",
+        r"definitely",
+        r"certainly",
+        r"absolutely",
+        r"100%",
+    ]
+
+    def score(self, answer: str, contexts: list[str]) -> float:
+        text = answer or ""
+        if self._has_definitive_pattern(text):
+            return 0.0
+        return 1.0
+
+    def _has_definitive_pattern(self, text: str) -> bool:
+        for pattern in self._DEFINITIVE_PATTERNS_KO:
+            if re.search(pattern, text):
+                return True
+        lowered = text.lower()
+        return any(re.search(pattern, lowered) for pattern in self._DEFINITIVE_PATTERNS_EN)
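Usage sketch; any single definitive pattern zeroes the score:

```python
metric = SummaryNonDefinitive()

print(metric.score("이 특약은 무조건 전액 지급됩니다.", []))           # 0.0 ("무조건", "전액 지급")
print(metric.score("It is always covered at 100%.", []))              # 0.0 ("always", "100%")
print(metric.score("약관상 조건을 충족하면 지급될 수 있습니다.", []))   # 1.0 (hedged phrasing)
```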
evalvault/domain/metrics/summary_risk_coverage.py
ADDED
@@ -0,0 +1,45 @@
+from __future__ import annotations
+
+
+class SummaryRiskCoverage:
+    """Measure coverage of expected insurance risk tags in summary."""
+
+    name = "summary_risk_coverage"
+
+    _TAG_KEYWORDS = {
+        "exclusion": ["면책", "보장 제외", "지급 불가", "exclusion"],
+        "deductible": ["자기부담", "본인부담금", "deductible", "copay"],
+        "limit": ["한도", "상한", "최대", "limit", "cap"],
+        "waiting_period": ["면책기간", "대기기간", "waiting period"],
+        "condition": ["조건", "단서", "다만", "condition"],
+        "documents_required": ["서류", "진단서", "영수증", "documents"],
+        "needs_followup": ["확인 필요", "추가 확인", "담당자 확인", "재문의", "follow up"],
+    }
+
+    def score(self, answer: str, contexts: list[str], metadata: dict | None = None) -> float:
+        expected_tags = self._extract_expected_tags(metadata)
+        if not expected_tags:
+            return 1.0
+
+        text = answer or ""
+        covered = 0
+        for tag in expected_tags:
+            if self._has_tag_keyword(text, tag):
+                covered += 1
+
+        return covered / len(expected_tags)
+
+    def _extract_expected_tags(self, metadata: dict | None) -> list[str]:
+        if not metadata:
+            return []
+        raw = metadata.get("summary_tags")
+        if not raw:
+            return []
+        if isinstance(raw, list):
+            return [str(item).strip().lower() for item in raw if str(item).strip()]
+        return [str(raw).strip().lower()]
+
+    def _has_tag_keyword(self, text: str, tag: str) -> bool:
+        keywords = self._TAG_KEYWORDS.get(tag, [])
+        lowered = text.lower()
+        return any(keyword in text or keyword.lower() in lowered for keyword in keywords)
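Usage sketch; the score is the fraction of expected tags whose keywords appear in the summary:

```python
metric = SummaryRiskCoverage()
metadata = {"summary_tags": ["exclusion", "limit", "deductible"]}

# "면책" covers exclusion and "한도" covers limit; deductible is never mentioned -> 2/3
score = metric.score("면책 사유에 해당하면 지급되지 않으며, 보장 한도가 적용됩니다.", [], metadata)
print(round(score, 3))  # 0.667

print(metric.score("요약 내용", [], {}))  # 1.0 (no expected tags -> vacuously covered)
```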