evalvault-1.65.0-py3-none-any.whl → evalvault-1.66.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. evalvault/adapters/inbound/api/adapter.py +14 -0
  2. evalvault/adapters/inbound/api/main.py +14 -4
  3. evalvault/adapters/inbound/api/routers/chat.py +543 -0
  4. evalvault/adapters/inbound/cli/commands/run.py +14 -0
  5. evalvault/adapters/inbound/cli/commands/run_helpers.py +21 -2
  6. evalvault/adapters/outbound/report/llm_report_generator.py +13 -1
  7. evalvault/adapters/outbound/storage/base_sql.py +41 -1
  8. evalvault/adapters/outbound/tracker/langfuse_adapter.py +1 -0
  9. evalvault/adapters/outbound/tracker/mlflow_adapter.py +5 -0
  10. evalvault/adapters/outbound/tracker/phoenix_adapter.py +29 -2
  11. evalvault/config/settings.py +21 -0
  12. evalvault/domain/entities/prompt.py +1 -1
  13. evalvault/domain/metrics/__init__.py +8 -0
  14. evalvault/domain/metrics/registry.py +39 -3
  15. evalvault/domain/metrics/summary_accuracy.py +189 -0
  16. evalvault/domain/metrics/summary_needs_followup.py +45 -0
  17. evalvault/domain/metrics/summary_non_definitive.py +41 -0
  18. evalvault/domain/metrics/summary_risk_coverage.py +45 -0
  19. evalvault/domain/services/custom_metric_snapshot.py +233 -0
  20. evalvault/domain/services/evaluator.py +280 -27
  21. evalvault/domain/services/prompt_registry.py +39 -10
  22. evalvault/domain/services/threshold_profiles.py +4 -0
  23. evalvault/domain/services/visual_space_service.py +79 -4
  24. {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/METADATA +25 -1
  25. {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/RECORD +28 -22
  26. {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/WHEEL +0 -0
  27. {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/entry_points.txt +0 -0
  28. {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -499,8 +499,20 @@ SUMMARY_RECOMMENDED_THRESHOLDS = {
     "summary_faithfulness": 0.90,
     "summary_score": 0.85,
     "entity_preservation": 0.90,
+    "summary_accuracy": 0.90,
+    "summary_risk_coverage": 0.90,
+    "summary_non_definitive": 0.80,
+    "summary_needs_followup": 0.80,
 }
-SUMMARY_METRIC_ORDER = ("summary_faithfulness", "summary_score", "entity_preservation")
+SUMMARY_METRIC_ORDER = (
+    "summary_faithfulness",
+    "summary_score",
+    "entity_preservation",
+    "summary_accuracy",
+    "summary_risk_coverage",
+    "summary_non_definitive",
+    "summary_needs_followup",
+)
 
 
 @dataclass
@@ -664,6 +664,8 @@ class BaseSQLStorageAdapter(ABC):
     def export_run_to_excel(self, run_id: str, output_path) -> Path:
         from openpyxl import Workbook
 
+        from evalvault.domain.metrics.registry import get_metric_spec_map
+
         output = Path(output_path)
         output.parent.mkdir(parents=True, exist_ok=True)
 
@@ -837,6 +839,23 @@ class BaseSQLStorageAdapter(ABC):
 
         summary_rows: list[dict[str, Any]] = []
         run_payload = run_rows[0] if run_rows else {}
+        custom_metric_rows: list[dict[str, Any]] = []
+        run_metadata = self._deserialize_json(run_payload.get("metadata")) if run_payload else None
+        if isinstance(run_metadata, dict):
+            custom_snapshot = run_metadata.get("custom_metric_snapshot")
+            if isinstance(custom_snapshot, dict):
+                entries = custom_snapshot.get("metrics")
+                if isinstance(entries, list):
+                    for entry in entries:
+                        if isinstance(entry, dict):
+                            row = dict(entry)
+                            row["schema_version"] = custom_snapshot.get("schema_version")
+                            custom_metric_rows.append(row)
+        if custom_metric_rows:
+            custom_metric_rows = self._normalize_rows(
+                custom_metric_rows,
+                json_columns={"inputs", "rules"},
+            )
         prompt_set_id = None
         prompt_set_name = None
         if run_prompt_payloads:
@@ -878,14 +897,17 @@ class BaseSQLStorageAdapter(ABC):
                 if isinstance(threshold, (int, float)) and score >= threshold:
                     entry["pass_count"] += 1
 
+        metric_spec_map = get_metric_spec_map()
         for entry in metrics_index.values():
             count = entry["count"] or 0
+            spec = metric_spec_map.get(entry["metric_name"])
             metric_summary_rows.append(
                 {
                     "metric_name": entry["metric_name"],
                     "avg_score": (entry["score_sum"] / count) if count else None,
                     "pass_rate": (entry["pass_count"] / count) if count else None,
                     "samples": count,
+                    "source": spec.source if spec else None,
                 }
             )
 
@@ -956,7 +978,25 @@ class BaseSQLStorageAdapter(ABC):
             (
                 "MetricsSummary",
                 metric_summary_rows,
-                ["metric_name", "avg_score", "pass_rate", "samples"],
+                ["metric_name", "avg_score", "pass_rate", "samples", "source"],
+            ),
+            (
+                "CustomMetrics",
+                custom_metric_rows,
+                [
+                    "schema_version",
+                    "metric_name",
+                    "source",
+                    "description",
+                    "evaluation_method",
+                    "inputs",
+                    "output",
+                    "evaluation_process",
+                    "rules",
+                    "notes",
+                    "implementation_path",
+                    "implementation_hash",
+                ],
             ),
             (
                 "RunPromptSets",
@@ -360,6 +360,7 @@ class LangfuseAdapter(TrackerPort):
             "summary": trace_output["summary"],
             "metrics": metric_summary,
             "phoenix_links": phoenix_links or {},
+            "custom_metrics": (run.tracker_metadata or {}).get("custom_metric_snapshot"),
             "test_cases": [
                 {
                     "test_case_id": result.test_case_id,
@@ -220,6 +220,11 @@ class MLflowAdapter(TrackerPort):
             results_data.append(result_dict)
 
         self.save_artifact(trace_id, "test_results", results_data)
+        self.save_artifact(
+            trace_id,
+            "custom_metric_snapshot",
+            (run.tracker_metadata or {}).get("custom_metric_snapshot"),
+        )
 
         # 6. End MLflow run
         self.end_trace(trace_id)
@@ -352,13 +352,40 @@ class PhoenixAdapter(TrackerPort):
                 "version": run.dataset_version,
                 "total_test_cases": run.total_test_cases,
             },
+            "evaluation_config": {
+                "model": run.model_name,
+                "metrics": run.metrics_evaluated,
+                "thresholds": run.thresholds,
+            },
             "summary": {
-                "pass_rate": run.pass_rate,
+                "total_test_cases": run.total_test_cases,
+                "passed": run.passed_test_cases,
+                "failed": run.total_test_cases - run.passed_test_cases,
+                "pass_rate": round(run.pass_rate, 4),
+                "duration_seconds": round(run.duration_seconds, 2)
+                if run.duration_seconds
+                else None,
                 "total_tokens": run.total_tokens,
-                "duration_seconds": run.duration_seconds,
             },
             "metrics": metric_summary,
+            "custom_metrics": (run.tracker_metadata or {}).get("custom_metric_snapshot"),
+            "test_cases": [
+                {
+                    "test_case_id": result.test_case_id,
+                    "all_passed": result.all_passed,
+                    "metrics": {
+                        metric.name: {
+                            "score": metric.score,
+                            "threshold": metric.threshold,
+                            "passed": metric.passed,
+                        }
+                        for metric in result.metrics
+                    },
+                }
+                for result in run.results
+            ],
         }
+
         self.save_artifact(trace_id, "ragas_evaluation", structured_artifact)
 
         # End the trace
@@ -321,6 +321,27 @@ class Settings(BaseSettings):
         default="https://cloud.langfuse.com", description="Langfuse host URL"
     )
 
+    mcp_enabled: bool = Field(
+        default=False,
+        description="Enable MCP JSON-RPC endpoint over HTTP.",
+    )
+    mcp_protocol_version: str = Field(
+        default="2025-11-25",
+        description="MCP protocol version to advertise.",
+    )
+    mcp_server_version: str = Field(
+        default="0.1.0",
+        description="EvalVault MCP server version.",
+    )
+    mcp_auth_tokens: str | None = Field(
+        default=None,
+        description="Comma-separated bearer tokens for MCP endpoint (required).",
+    )
+    mcp_allowed_tools: str | None = Field(
+        default=None,
+        description="Comma-separated allowlist of MCP tool names.",
+    )
+
     # MLflow Configuration (optional)
     mlflow_tracking_uri: str | None = Field(default=None, description="MLflow tracking server URI")
     mlflow_experiment_name: str = Field(default="evalvault", description="MLflow experiment name")
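
Note: a minimal sketch of exercising the new MCP settings fields, assuming the remaining Settings fields have defaults. The tool names passed to mcp_allowed_tools are illustrative only; how the application maps environment variables onto these fields is not shown in this diff.

# Hedged sketch: the keyword arguments mirror the Field definitions added above.
from evalvault.config.settings import Settings

settings = Settings(
    mcp_enabled=True,
    mcp_auth_tokens="token-a,token-b",      # comma-separated bearer tokens
    mcp_allowed_tools="list_runs,get_run",  # illustrative tool names only
)
# Parse the comma-separated token list the way the description implies.
tokens = [t.strip() for t in (settings.mcp_auth_tokens or "").split(",") if t.strip()]
assert settings.mcp_protocol_version == "2025-11-25"  # default advertised version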
@@ -7,7 +7,7 @@ from datetime import datetime
 from typing import Any, Literal
 from uuid import uuid4
 
-PromptKind = Literal["system", "ragas"]
+PromptKind = Literal["system", "ragas", "custom"]
 
 
 @dataclass
@@ -6,6 +6,10 @@ from evalvault.domain.metrics.entity_preservation import EntityPreservation
 from evalvault.domain.metrics.insurance import InsuranceTermAccuracy
 from evalvault.domain.metrics.no_answer import NoAnswerAccuracy, is_no_answer
 from evalvault.domain.metrics.retrieval_rank import MRR, NDCG, HitRate
+from evalvault.domain.metrics.summary_accuracy import SummaryAccuracy
+from evalvault.domain.metrics.summary_needs_followup import SummaryNeedsFollowup
+from evalvault.domain.metrics.summary_non_definitive import SummaryNonDefinitive
+from evalvault.domain.metrics.summary_risk_coverage import SummaryRiskCoverage
 from evalvault.domain.metrics.text_match import ExactMatch, F1Score
 
 __all__ = [
@@ -19,5 +23,9 @@ __all__ = [
     "MRR",
     "NDCG",
     "NoAnswerAccuracy",
+    "SummaryAccuracy",
+    "SummaryNeedsFollowup",
+    "SummaryNonDefinitive",
+    "SummaryRiskCoverage",
     "is_no_answer",
 ]
@@ -123,7 +123,7 @@ _METRIC_SPECS: tuple[MetricSpec, ...] = (
     ),
     MetricSpec(
        name="summary_score",
-       description="Measures summary coverage and conciseness against contexts",
+       description="(LLM) Measures summary coverage and conciseness against contexts",
        requires_ground_truth=False,
        requires_embeddings=False,
        source="ragas",
@@ -132,7 +132,7 @@ _METRIC_SPECS: tuple[MetricSpec, ...] = (
     ),
     MetricSpec(
        name="summary_faithfulness",
-       description="Measures whether summary statements are grounded in contexts",
+       description="(LLM) Measures whether summary statements are grounded in contexts",
        requires_ground_truth=False,
        requires_embeddings=False,
        source="ragas",
@@ -141,7 +141,43 @@ _METRIC_SPECS: tuple[MetricSpec, ...] = (
     ),
     MetricSpec(
        name="entity_preservation",
-       description="Measures preservation of key insurance entities in summaries",
+       description="(Rule) Measures preservation of key insurance entities in summaries",
+       requires_ground_truth=False,
+       requires_embeddings=False,
+       source="custom",
+       category="summary",
+       signal_group="summary_fidelity",
+    ),
+    MetricSpec(
+       name="summary_accuracy",
+       description="(Rule) Measures whether summary entities are grounded in contexts",
+       requires_ground_truth=False,
+       requires_embeddings=False,
+       source="custom",
+       category="summary",
+       signal_group="summary_fidelity",
+    ),
+    MetricSpec(
+       name="summary_risk_coverage",
+       description="(Rule) Measures coverage of expected insurance risk tags in summaries",
+       requires_ground_truth=False,
+       requires_embeddings=False,
+       source="custom",
+       category="summary",
+       signal_group="summary_fidelity",
+    ),
+    MetricSpec(
+       name="summary_non_definitive",
+       description="(Rule) Measures avoidance of definitive claims in summaries",
+       requires_ground_truth=False,
+       requires_embeddings=False,
+       source="custom",
+       category="summary",
+       signal_group="summary_fidelity",
+    ),
+    MetricSpec(
+       name="summary_needs_followup",
+       description="(Rule) Measures follow-up guidance when required",
        requires_ground_truth=False,
        requires_embeddings=False,
        source="custom",
@@ -0,0 +1,189 @@
+from __future__ import annotations
+
+import re
+from decimal import Decimal, InvalidOperation
+
+
+class SummaryAccuracy:
+    """Measure whether summary entities are supported by contexts."""
+
+    name = "summary_accuracy"
+
+    _PERCENT_RE = re.compile(r"(?P<number>\d+(?:[.,]\d+)?)\s*(?P<unit>%|퍼센트|percent)", re.I)
+    _CURRENCY_RE = re.compile(
+        r"(?P<number>\d+(?:[.,]\d+)?)\s*(?P<unit>원|만원|억원|달러|usd|krw|won)",
+        re.I,
+    )
+    _CURRENCY_PREFIX_RE = re.compile(r"(?P<unit>[$₩])\s*(?P<number>\d+(?:[.,]\d+)?)")
+    _DURATION_RE = re.compile(
+        r"(?P<number>\d+(?:[.,]\d+)?)\s*(?P<unit>년|개월|월|일|years?|months?|days?)",
+        re.I,
+    )
+    _DATE_RE = re.compile(r"\b\d{4}[./-]\d{1,2}[./-]\d{1,2}\b")
+
+    _CURRENCY_MULTIPLIERS = {"만원": Decimal("10000"), "억원": Decimal("100000000")}
+    _KRW_UNITS = {"원", "krw", "won", "₩", "만원", "억원"}
+    _USD_UNITS = {"달러", "usd", "$"}
+    _DURATION_UNITS = {
+        "년": "year",
+        "year": "year",
+        "years": "year",
+        "개월": "month",
+        "월": "month",
+        "month": "month",
+        "months": "month",
+        "일": "day",
+        "day": "day",
+        "days": "day",
+    }
+
+    _KEYWORDS_KO = (
+        "면책",
+        "제외",
+        "단서",
+        "다만",
+        "조건",
+        "자기부담",
+        "한도",
+        "감액",
+    )
+    _KEYWORDS_EN = (
+        "exclusion",
+        "excluded",
+        "exception",
+        "except",
+        "condition",
+        "deductible",
+        "limit",
+        "cap",
+        "waiting period",
+        "co-pay",
+        "copay",
+        "co-insurance",
+        "coinsurance",
+    )
+
+    def score(self, answer: str, contexts: list[str]) -> float:
+        if not contexts:
+            return 0.0
+
+        context_text = " ".join([ctx for ctx in contexts if ctx])
+        context_entities = self._extract_entities(context_text)
+        summary_entities = self._extract_entities(answer or "")
+
+        if not summary_entities:
+            return 0.5 if context_entities else 0.0
+        if not context_entities:
+            return 0.0
+
+        supported = summary_entities.intersection(context_entities)
+        return len(supported) / len(summary_entities)
+
+    def _extract_entities(self, text: str) -> set[str]:
+        entities = set()
+        entities.update(self._extract_numeric_entities(text))
+        entities.update(self._extract_keyword_entities(text))
+        return entities
+
+    def _extract_numeric_entities(self, text: str) -> set[str]:
+        entities: set[str] = set()
+
+        for match in self._PERCENT_RE.finditer(text):
+            number = self._normalize_number(match.group("number"))
+            if number:
+                entities.add(f"percent:{number}")
+
+        for match in self._CURRENCY_RE.finditer(text):
+            number = self._normalize_number(match.group("number"))
+            unit = match.group("unit").lower()
+            normalized = self._normalize_currency(number, unit)
+            if normalized:
+                entities.add(f"currency:{normalized}")
+
+        for match in self._CURRENCY_PREFIX_RE.finditer(text):
+            number = self._normalize_number(match.group("number"))
+            unit = match.group("unit")
+            normalized = self._normalize_currency(number, unit)
+            if normalized:
+                entities.add(f"currency:{normalized}")
+
+        for match in self._DURATION_RE.finditer(text):
+            number = self._normalize_number(match.group("number"))
+            unit = match.group("unit").lower()
+            normalized = self._normalize_duration(number, unit)
+            if normalized:
+                entities.add(f"duration:{normalized}")
+
+        for match in self._DATE_RE.finditer(text):
+            entities.add(f"date:{self._normalize_date(match.group(0))}")
+
+        return entities
+
+    def _extract_keyword_entities(self, text: str) -> set[str]:
+        entities: set[str] = set()
+        lower = text.lower()
+
+        for keyword in self._KEYWORDS_KO:
+            if keyword in text:
+                entities.add(f"kw:{keyword}")
+
+        for keyword in self._KEYWORDS_EN:
+            if keyword in lower:
+                entities.add(f"kw:{keyword}")
+
+        return entities
+
+    def _normalize_currency(self, number: str | None, unit: str) -> str | None:
+        if number is None:
+            return None
+        try:
+            value = Decimal(number)
+        except InvalidOperation:
+            return None
+
+        unit_key = unit.lower()
+        multiplier = self._CURRENCY_MULTIPLIERS.get(unit_key)
+        if multiplier:
+            value *= multiplier
+
+        if unit_key in self._KRW_UNITS:
+            currency = "krw"
+        elif unit_key in self._USD_UNITS:
+            currency = "usd"
+        else:
+            currency = unit_key
+
+        return f"{currency}:{self._format_decimal(value)}"
+
+    def _normalize_duration(self, number: str | None, unit: str) -> str | None:
+        if number is None:
+            return None
+        try:
+            value = Decimal(number)
+        except InvalidOperation:
+            return None
+        base_unit = self._DURATION_UNITS.get(unit, unit)
+        return f"{self._format_decimal(value)}{base_unit}"
+
+    @staticmethod
+    def _normalize_date(raw: str) -> str:
+        return re.sub(r"[./-]", "", raw)
+
+    @staticmethod
+    def _normalize_number(raw: str | None) -> str | None:
+        if raw is None:
+            return None
+        cleaned = raw.replace(",", "").strip()
+        if not cleaned:
+            return None
+        try:
+            value = Decimal(cleaned)
+        except InvalidOperation:
+            return None
+        return SummaryAccuracy._format_decimal(value)
+
+    @staticmethod
+    def _format_decimal(value: Decimal) -> str:
+        if value == value.to_integral_value():
+            return str(value.to_integral_value())
+        return format(value.normalize(), "f").rstrip("0").rstrip(".")
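
Note: a small usage sketch of the new rule-based metric with invented sample data. The scoring path follows the class above: percent, currency, duration, date, and keyword entities are extracted from both sides, and the score is the fraction of summary entities also found in the contexts.

# Hedged sketch with made-up data.
from evalvault.domain.metrics import SummaryAccuracy

metric = SummaryAccuracy()
contexts = ["자기부담금은 20%이며 보장 한도는 5,000만원입니다."]
answer = "자기부담 20%가 적용되고 한도는 5000만원입니다."
# All extracted entities (percent:20, currency:krw:50000000, and the
# 자기부담 / 한도 keywords) are supported by the contexts, so the score is 1.0.
print(metric.score(answer, contexts))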
@@ -0,0 +1,45 @@
+from __future__ import annotations
+
+
+class SummaryNeedsFollowup:
+    """Check if follow-up guidance appears when required."""
+
+    name = "summary_needs_followup"
+
+    _FOLLOWUP_KEYWORDS = [
+        "확인 필요",
+        "추가 확인",
+        "담당자 확인",
+        "재문의",
+        "추가 문의",
+        "서류 확인",
+        "follow up",
+        "follow-up",
+    ]
+
+    def score(self, answer: str, contexts: list[str], metadata: dict | None = None) -> float:
+        text = answer or ""
+        has_followup = self._has_followup(text)
+        expected = self._expects_followup(metadata)
+
+        if expected:
+            return 1.0 if has_followup else 0.0
+        return 1.0 if not has_followup else 0.0
+
+    def _expects_followup(self, metadata: dict | None) -> bool:
+        if not metadata:
+            return False
+        raw = metadata.get("summary_tags")
+        if not raw:
+            return False
+        if isinstance(raw, list):
+            tags = [str(item).strip().lower() for item in raw if str(item).strip()]
+        else:
+            tags = [str(raw).strip().lower()]
+        return "needs_followup" in tags
+
+    def _has_followup(self, text: str) -> bool:
+        lowered = text.lower()
+        return any(
+            keyword in text or keyword.lower() in lowered for keyword in self._FOLLOWUP_KEYWORDS
+        )
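
Note: a short sketch of the intended behaviour with invented data. The metric returns 1.0 when the needs_followup tag in metadata["summary_tags"] and the presence of a follow-up phrase in the answer agree, and 0.0 when they disagree.

# Hedged sketch with made-up data.
from evalvault.domain.metrics import SummaryNeedsFollowup

metric = SummaryNeedsFollowup()
metadata = {"summary_tags": ["needs_followup"]}
print(metric.score("보장 범위는 담당자 확인 필요합니다.", [], metadata))  # 1.0 (follow-up phrase present)
print(metric.score("보장 범위는 전액 지급됩니다.", [], metadata))  # 0.0 (expected follow-up missing)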
@@ -0,0 +1,41 @@
+from __future__ import annotations
+
+import re
+
+
+class SummaryNonDefinitive:
+    """Penalize definitive statements in summaries."""
+
+    name = "summary_non_definitive"
+
+    _DEFINITIVE_PATTERNS_KO = [
+        r"무조건",
+        r"반드시",
+        r"100%",
+        r"전액\s*지급",
+        r"확실히",
+        r"분명히",
+        r"절대",
+        r"항상",
+    ]
+    _DEFINITIVE_PATTERNS_EN = [
+        r"always",
+        r"guaranteed",
+        r"definitely",
+        r"certainly",
+        r"absolutely",
+        r"100%",
+    ]
+
+    def score(self, answer: str, contexts: list[str]) -> float:
+        text = answer or ""
+        if self._has_definitive_pattern(text):
+            return 0.0
+        return 1.0
+
+    def _has_definitive_pattern(self, text: str) -> bool:
+        for pattern in self._DEFINITIVE_PATTERNS_KO:
+            if re.search(pattern, text):
+                return True
+        lowered = text.lower()
+        return any(re.search(pattern, lowered) for pattern in self._DEFINITIVE_PATTERNS_EN)
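
Note: a minimal sketch with invented strings. Any Korean or English definitive pattern drops the score to 0.0; otherwise the score is 1.0.

# Hedged sketch with made-up data.
from evalvault.domain.metrics import SummaryNonDefinitive

metric = SummaryNonDefinitive()
print(metric.score("해당 특약은 무조건 전액 지급됩니다.", []))  # 0.0 ("무조건" matches)
print(metric.score("조건 충족 시 보험금이 지급될 수 있습니다.", []))  # 1.0 (no definitive pattern)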
@@ -0,0 +1,45 @@
+from __future__ import annotations
+
+
+class SummaryRiskCoverage:
+    """Measure coverage of expected insurance risk tags in summary."""
+
+    name = "summary_risk_coverage"
+
+    _TAG_KEYWORDS = {
+        "exclusion": ["면책", "보장 제외", "지급 불가", "exclusion"],
+        "deductible": ["자기부담", "본인부담금", "deductible", "copay"],
+        "limit": ["한도", "상한", "최대", "limit", "cap"],
+        "waiting_period": ["면책기간", "대기기간", "waiting period"],
+        "condition": ["조건", "단서", "다만", "condition"],
+        "documents_required": ["서류", "진단서", "영수증", "documents"],
+        "needs_followup": ["확인 필요", "추가 확인", "담당자 확인", "재문의", "follow up"],
+    }
+
+    def score(self, answer: str, contexts: list[str], metadata: dict | None = None) -> float:
+        expected_tags = self._extract_expected_tags(metadata)
+        if not expected_tags:
+            return 1.0
+
+        text = answer or ""
+        covered = 0
+        for tag in expected_tags:
+            if self._has_tag_keyword(text, tag):
+                covered += 1
+
+        return covered / len(expected_tags)
+
+    def _extract_expected_tags(self, metadata: dict | None) -> list[str]:
+        if not metadata:
+            return []
+        raw = metadata.get("summary_tags")
+        if not raw:
+            return []
+        if isinstance(raw, list):
+            return [str(item).strip().lower() for item in raw if str(item).strip()]
+        return [str(raw).strip().lower()]
+
+    def _has_tag_keyword(self, text: str, tag: str) -> bool:
+        keywords = self._TAG_KEYWORDS.get(tag, [])
+        lowered = text.lower()
+        return any(keyword in text or keyword.lower() in lowered for keyword in keywords)
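
Note: a minimal sketch with invented data. The score is the fraction of expected summary_tags whose keyword list hits the answer, and 1.0 when no tags are expected.

# Hedged sketch with made-up data.
from evalvault.domain.metrics import SummaryRiskCoverage

metric = SummaryRiskCoverage()
metadata = {"summary_tags": ["exclusion", "deductible"]}
answer = "음주운전 사고는 면책이며 자기부담금이 적용됩니다."
print(metric.score(answer, [], metadata))  # 1.0 (both expected tags covered)
print(metric.score("보험금이 지급됩니다.", [], metadata))  # 0.0 (neither tag covered)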