evalvault-1.65.0-py3-none-any.whl → evalvault-1.66.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. evalvault/adapters/inbound/api/adapter.py +14 -0
  2. evalvault/adapters/inbound/api/main.py +14 -4
  3. evalvault/adapters/inbound/api/routers/chat.py +543 -0
  4. evalvault/adapters/inbound/cli/commands/run.py +14 -0
  5. evalvault/adapters/inbound/cli/commands/run_helpers.py +21 -2
  6. evalvault/adapters/outbound/report/llm_report_generator.py +13 -1
  7. evalvault/adapters/outbound/storage/base_sql.py +41 -1
  8. evalvault/adapters/outbound/tracker/langfuse_adapter.py +1 -0
  9. evalvault/adapters/outbound/tracker/mlflow_adapter.py +5 -0
  10. evalvault/adapters/outbound/tracker/phoenix_adapter.py +29 -2
  11. evalvault/config/settings.py +21 -0
  12. evalvault/domain/entities/prompt.py +1 -1
  13. evalvault/domain/metrics/__init__.py +8 -0
  14. evalvault/domain/metrics/registry.py +39 -3
  15. evalvault/domain/metrics/summary_accuracy.py +189 -0
  16. evalvault/domain/metrics/summary_needs_followup.py +45 -0
  17. evalvault/domain/metrics/summary_non_definitive.py +41 -0
  18. evalvault/domain/metrics/summary_risk_coverage.py +45 -0
  19. evalvault/domain/services/custom_metric_snapshot.py +233 -0
  20. evalvault/domain/services/evaluator.py +280 -27
  21. evalvault/domain/services/prompt_registry.py +39 -10
  22. evalvault/domain/services/threshold_profiles.py +4 -0
  23. evalvault/domain/services/visual_space_service.py +79 -4
  24. {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/METADATA +25 -1
  25. {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/RECORD +28 -22
  26. {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/WHEEL +0 -0
  27. {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/entry_points.txt +0 -0
  28. {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/domain/services/custom_metric_snapshot.py (new file)
@@ -0,0 +1,233 @@
+from __future__ import annotations
+
+import hashlib
+import inspect
+from collections.abc import Iterable
+from pathlib import Path
+from typing import Any
+
+from evalvault.domain.metrics.registry import get_metric_spec_map
+
+SCHEMA_VERSION = 1
+
+_CUSTOM_METRIC_DETAILS: dict[str, dict[str, Any]] = {
+    "entity_preservation": {
+        "evaluation_method": "rule-based",
+        "inputs": ["answer", "contexts"],
+        "output": "0.0-1.0 (preserved_entities / context_entities)",
+        "evaluation_process": "Extract numeric/keyword entities from contexts and measure how many appear in the summary.",
+        "rules": {
+            "numeric_entities": ["percent", "currency", "duration", "date"],
+            "keywords_ko": [
+                "면책",
+                "제외",
+                "단서",
+                "다만",
+                "조건",
+                "자기부담",
+                "한도",
+                "감액",
+            ],
+            "keywords_en": [
+                "exclusion",
+                "deductible",
+                "limit",
+                "cap",
+                "copay",
+                "coinsurance",
+            ],
+        },
+        "notes": "Insurance-risk oriented entity coverage check.",
+    },
+    "insurance_term_accuracy": {
+        "evaluation_method": "rule-based",
+        "inputs": ["answer", "contexts"],
+        "output": "0.0-1.0 (verified_terms / answer_terms)",
+        "evaluation_process": "Detect insurance terms in the answer and verify their presence in contexts.",
+        "rules": {"terms_dictionary": "terms_dictionary.json"},
+        "notes": "Insurance glossary matching with canonical/variant terms.",
+    },
+    "summary_accuracy": {
+        "evaluation_method": "rule-based",
+        "inputs": ["answer", "contexts"],
+        "output": "0.0-1.0 (supported_summary_entities / summary_entities)",
+        "evaluation_process": "Extract numeric/keyword entities from summary and verify their presence in contexts.",
+        "rules": {
+            "numeric_entities": ["percent", "currency", "duration", "date"],
+            "keywords_ko": ["면책", "제외", "단서", "다만", "조건", "자기부담", "한도", "감액"],
+            "keywords_en": ["exclusion", "deductible", "limit", "cap", "waiting period"],
+        },
+        "notes": "Penalizes summary entities not grounded in contexts.",
+    },
+    "summary_risk_coverage": {
+        "evaluation_method": "rule-based",
+        "inputs": ["answer", "metadata.summary_tags"],
+        "output": "0.0-1.0 (covered_tags / expected_tags)",
+        "evaluation_process": "Check if summary mentions expected insurance risk tags.",
+        "rules": {
+            "exclusion": ["면책", "보장 제외", "지급 불가", "exclusion"],
+            "deductible": ["자기부담", "본인부담금", "deductible", "copay"],
+            "limit": ["한도", "상한", "최대", "limit", "cap"],
+            "waiting_period": ["면책기간", "대기기간", "waiting period"],
+            "condition": ["조건", "단서", "다만", "condition"],
+            "documents_required": ["서류", "진단서", "영수증", "documents"],
+            "needs_followup": ["확인 필요", "추가 확인", "담당자 확인", "재문의", "follow up"],
+        },
+        "notes": "Uses metadata summary_tags to define expected coverage.",
+    },
+    "summary_non_definitive": {
+        "evaluation_method": "rule-based",
+        "inputs": ["answer"],
+        "output": "1.0 if definitive claims absent else 0.0",
+        "evaluation_process": "Detect definitive expressions that increase liability risk.",
+        "rules": {
+            "patterns_ko": ["무조건", "반드시", "100%", "전액 지급", "확실히", "분명히", "절대"],
+            "patterns_en": [
+                "always",
+                "guaranteed",
+                "definitely",
+                "certainly",
+                "absolutely",
+                "100%",
+            ],
+        },
+        "notes": "Higher is safer; penalizes absolute guarantees.",
+    },
+    "summary_needs_followup": {
+        "evaluation_method": "rule-based",
+        "inputs": ["answer", "metadata.summary_tags"],
+        "output": "1.0 if follow-up guidance matches expected need",
+        "evaluation_process": "Check follow-up guidance when needs_followup tag exists.",
+        "rules": {
+            "followup_keywords": [
+                "확인 필요",
+                "추가 확인",
+                "담당자 확인",
+                "재문의",
+                "추가 문의",
+                "follow up",
+            ]
+        },
+        "notes": "Requires tags to avoid false penalties.",
+    },
+    "no_answer_accuracy": {
+        "evaluation_method": "rule-based",
+        "inputs": ["answer", "ground_truth"],
+        "output": "1.0 if abstention behavior matches, else 0.0",
+        "evaluation_process": "Detect abstention patterns in answer and ground_truth and compare behavior.",
+        "rules": {"patterns": "Korean/English regex patterns"},
+        "notes": "Hallucination/abstention behavior check.",
+    },
+    "exact_match": {
+        "evaluation_method": "string-match",
+        "inputs": ["answer", "ground_truth"],
+        "output": "1.0 exact match else 0.0",
+        "evaluation_process": "Normalize text and compare exact match with optional strict number matching.",
+        "rules": {"normalize": True, "number_strict": True},
+        "notes": "Token/number strict matching for factual answers.",
+    },
+    "f1_score": {
+        "evaluation_method": "token-overlap",
+        "inputs": ["answer", "ground_truth"],
+        "output": "0.0-1.0 (weighted F1)",
+        "evaluation_process": "Tokenize, compute weighted precision/recall/F1 with number emphasis.",
+        "rules": {"number_weight": 2.0},
+        "notes": "Token-level overlap with numeric weighting.",
+    },
+    "mrr": {
+        "evaluation_method": "retrieval-rank",
+        "inputs": ["ground_truth", "contexts"],
+        "output": "0.0-1.0 (1/rank of first relevant context)",
+        "evaluation_process": "Compute relevance by token overlap and take reciprocal rank of first hit.",
+        "rules": {"relevance_threshold": 0.3},
+        "notes": "Ranking quality of retrieved contexts.",
+    },
+    "ndcg": {
+        "evaluation_method": "retrieval-rank",
+        "inputs": ["ground_truth", "contexts"],
+        "output": "0.0-1.0 (NDCG@K)",
+        "evaluation_process": "Compute graded relevance per context and calculate NDCG.",
+        "rules": {"k": 10, "use_graded": True},
+        "notes": "Ranking quality across all relevant contexts.",
+    },
+    "hit_rate": {
+        "evaluation_method": "retrieval-rank",
+        "inputs": ["ground_truth", "contexts"],
+        "output": "1.0 if any relevant context in top K else 0.0",
+        "evaluation_process": "Check whether top-K contexts contain a relevant hit.",
+        "rules": {"k": 10, "relevance_threshold": 0.3},
+        "notes": "Recall@K style coverage check.",
+    },
+    "confidence_score": {
+        "evaluation_method": "rule-based",
+        "inputs": ["answer", "ground_truth", "contexts"],
+        "output": "0.0-1.0 (weighted confidence)",
+        "evaluation_process": "Combine context coverage, answer specificity, and consistency scores.",
+        "rules": {"coverage": 0.4, "specificity": 0.3, "consistency": 0.3},
+        "notes": "Heuristic confidence signal for human escalation.",
+    },
+    "contextual_relevancy": {
+        "evaluation_method": "token-overlap",
+        "inputs": ["question", "contexts"],
+        "output": "0.0-1.0 (avg relevancy)",
+        "evaluation_process": "Measure question-context token overlap and average across contexts.",
+        "rules": {"relevance_threshold": 0.35},
+        "notes": "Reference-free context relevance check.",
+    },
+}
+
+
+def _hash_file(path: str | Path | None) -> str | None:
+    if not path:
+        return None
+    file_path = Path(path)
+    if not file_path.exists():
+        return None
+    payload = file_path.read_bytes()
+    return hashlib.sha256(payload).hexdigest()
+
+
+def _resolve_source_path(metric_class: type[Any]) -> str | None:
+    try:
+        source = inspect.getsourcefile(metric_class)
+    except TypeError:
+        return None
+    if not source:
+        return None
+    return str(Path(source).resolve())
+
+
+def build_custom_metric_snapshot(
+    metric_classes: dict[str, type[Any]],
+    metrics: Iterable[str],
+) -> dict[str, Any] | None:
+    custom_names = [name for name in metrics if name in metric_classes]
+    if not custom_names:
+        return None
+
+    spec_map = get_metric_spec_map()
+    rows: list[dict[str, Any]] = []
+    for metric_name in custom_names:
+        metric_class = metric_classes.get(metric_name)
+        if metric_class is None:
+            continue
+        source_path = _resolve_source_path(metric_class)
+        details = _CUSTOM_METRIC_DETAILS.get(metric_name, {})
+        spec = spec_map.get(metric_name)
+        rows.append(
+            {
+                "metric_name": metric_name,
+                "source": "custom",
+                "description": spec.description if spec else None,
+                "evaluation_method": details.get("evaluation_method"),
+                "inputs": details.get("inputs"),
+                "output": details.get("output"),
+                "evaluation_process": details.get("evaluation_process"),
+                "rules": details.get("rules"),
+                "notes": details.get("notes"),
+                "implementation_path": source_path,
+                "implementation_hash": _hash_file(source_path),
+            }
+        )
+
+    return {"schema_version": SCHEMA_VERSION, "metrics": rows}
evalvault/domain/services/evaluator.py
@@ -11,8 +11,9 @@ from collections.abc import Callable, Sequence
 from contextlib import suppress
 from dataclasses import dataclass
 from datetime import datetime
-from typing import Any, Literal, cast, overload
+from typing import Any, Literal, overload

+from pydantic import BaseModel, Field, field_validator
 from ragas import SingleTurnSample

 from evalvault.domain.entities import (
@@ -30,8 +31,13 @@ from evalvault.domain.metrics.entity_preservation import EntityPreservation
 from evalvault.domain.metrics.insurance import InsuranceTermAccuracy
 from evalvault.domain.metrics.no_answer import NoAnswerAccuracy
 from evalvault.domain.metrics.retrieval_rank import MRR, NDCG, HitRate
+from evalvault.domain.metrics.summary_accuracy import SummaryAccuracy
+from evalvault.domain.metrics.summary_needs_followup import SummaryNeedsFollowup
+from evalvault.domain.metrics.summary_non_definitive import SummaryNonDefinitive
+from evalvault.domain.metrics.summary_risk_coverage import SummaryRiskCoverage
 from evalvault.domain.metrics.text_match import ExactMatch, F1Score
 from evalvault.domain.services.batch_executor import run_in_batches
+from evalvault.domain.services.custom_metric_snapshot import build_custom_metric_snapshot
 from evalvault.domain.services.dataset_preprocessor import DatasetPreprocessor
 from evalvault.domain.services.retriever_context import apply_retriever_to_dataset
 from evalvault.ports.outbound.korean_nlp_port import KoreanNLPToolkitPort, RetrieverPort
@@ -55,14 +61,53 @@ _SUMMARY_FAITHFULNESS_PROMPT_EN = (
 )


+def _patch_ragas_faithfulness_output() -> None:
+    try:
+        from ragas.metrics import Faithfulness
+    except Exception:
+        return
+
+    prompt = getattr(Faithfulness, "nli_statements_prompt", None)
+    if prompt is None:
+        return
+
+    output_model = getattr(prompt, "output_model", None)
+    if output_model is None:
+        return
+
+    class _StatementFaithfulnessAnswer(BaseModel):
+        statement: str = Field(..., description="the original statement, word-by-word")
+        reason: str = Field(..., description="the reason of the verdict")
+        verdict: int = Field(..., description="the verdict(0/1) of the faithfulness.")
+
+        @field_validator("verdict", mode="before")
+        @classmethod
+        def _coerce_verdict(cls, value):
+            if isinstance(value, str):
+                normalized = value.strip()
+                if normalized.isdigit():
+                    return int(normalized)
+            return value
+
+    class _NLIStatementOutput(BaseModel):
+        statements: list[_StatementFaithfulnessAnswer]
+
+    try:
+        prompt.output_model = _NLIStatementOutput
+    except Exception:
+        return
+
+
 def _import_metric(name: str) -> type[Any]:
     for module_name in ("ragas.metrics.collections", "ragas.metrics"):
         try:
             module = importlib.import_module(module_name)
-        except Exception:
+            if hasattr(module, name):
+                if name == "Faithfulness":
+                    _patch_ragas_faithfulness_output()
+                return getattr(module, name)
+        except ImportError:
             continue
-        if hasattr(module, name):
-            return cast(type[Any], getattr(module, name))
     raise ImportError(f"Missing ragas metric: {name}")


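For reference, a standalone sketch, illustrative only, of the verdict coercion that _patch_ragas_faithfulness_output installs on the ragas Faithfulness NLI output model: digit strings returned by the judge LLM are converted to ints before field validation.

# Illustrative only: replicates the "before" validator from the patch above.
from pydantic import BaseModel, Field, field_validator


class StatementAnswer(BaseModel):
    statement: str
    reason: str
    verdict: int = Field(..., description="0/1 faithfulness verdict")

    @field_validator("verdict", mode="before")
    @classmethod
    def _coerce_verdict(cls, value):
        # Digit strings like "1" are turned into ints before int validation runs.
        if isinstance(value, str):
            normalized = value.strip()
            if normalized.isdigit():
                return int(normalized)
        return value


print(StatementAnswer(statement="s", reason="r", verdict="1").verdict)  # -> 1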
@@ -147,6 +192,10 @@ class RagasEvaluator:
     CUSTOM_METRIC_MAP = {
         "insurance_term_accuracy": InsuranceTermAccuracy,
         "entity_preservation": EntityPreservation,
+        "summary_accuracy": SummaryAccuracy,
+        "summary_risk_coverage": SummaryRiskCoverage,
+        "summary_non_definitive": SummaryNonDefinitive,
+        "summary_needs_followup": SummaryNeedsFollowup,
         "exact_match": ExactMatch,
         "f1_score": F1Score,
         "no_answer_accuracy": NoAnswerAccuracy,
@@ -198,6 +247,10 @@ class RagasEvaluator:
         "summary_faithfulness": 0.9,
         "summary_score": 0.85,
         "entity_preservation": 0.9,
+        "summary_accuracy": 0.9,
+        "summary_risk_coverage": 0.9,
+        "summary_non_definitive": 0.8,
+        "summary_needs_followup": 0.8,
         "contextual_relevancy": 0.35,
     }
     LANGUAGE_SAMPLE_LIMIT = 5
@@ -225,10 +278,28 @@ class RagasEvaluator:
         "예시의 원자성 수준을 따르세요."
     )
     FACTUAL_CORRECTNESS_NLI_INSTRUCTION = (
-        "다음 CONTEXT를 바탕으로STATEMENT가 직접적으로 "
-        "추론 가능한지 판단하세요. "
-        "가능하면 verdict=1, 불가능하면 verdict=0으로 표시하고, "
-        "간단한 이유를 한국어로 적으세요."
+        "주어진 컨텍스트를 보고진술이 직접적으로 도출 가능한지 판단하세요. "
+        "가능하면 verdict=1, 불가능하면 verdict=0을 JSON으로 반환하세요."
+    )
+    SUMMARY_SCORE_QUESTION_INSTRUCTION = (
+        "다음 텍스트와 핵심 키워드를 기반으로, "
+        "텍스트에 근거해 반드시 1로 답할 수 있는 폐쇄형 질문을 생성하세요. "
+        "질문은 한국어로 작성하세요."
+    )
+    SUMMARY_SCORE_ANSWER_INSTRUCTION = (
+        "다음 질문 목록에 대해, 제공된 요약이 각 질문에 답할 수 있으면 '1', "
+        "그렇지 않으면 '0'을 JSON 배열로 반환하세요."
+    )
+    SUMMARY_SCORE_KEYPHRASE_INSTRUCTION = (
+        "다음 텍스트에서 인물, 기관, 위치, 날짜/시간, 금액, 비율과 같은 핵심 키워드를 추출하세요."
+    )
+    SUMMARY_FAITHFULNESS_STATEMENT_INSTRUCTION = (
+        "질문과 답변을 보고 각 문장을 이해 가능한 주장으로 분해하세요. "
+        "각 주장은 대명사 없이 독립적으로 이해 가능해야 합니다."
+    )
+    SUMMARY_FAITHFULNESS_NLI_INSTRUCTION = (
+        "주어진 컨텍스트를 보고 각 진술이 직접적으로 도출 가능한지 판단하세요. "
+        "가능하면 verdict=1, 불가능하면 verdict=0을 JSON으로 반환하세요."
     )
     FACTUAL_CORRECTNESS_CLAIM_EXAMPLES = [
         {
@@ -390,6 +461,7 @@ class RagasEvaluator:

         # Evaluate with Ragas (if any Ragas metrics)
         eval_results_by_test_case = {}
+        prompt_snapshots = {}
         if ragas_metrics:
             run.tracker_metadata["ragas_config"] = self._build_ragas_config(llm)
             (
@@ -412,6 +484,13 @@ class RagasEvaluator:
         elif prompt_overrides:
             logger.warning("Ragas prompt overrides provided but no Ragas metrics requested.")

+        custom_snapshot = build_custom_metric_snapshot(self.CUSTOM_METRIC_MAP, metrics)
+        if custom_snapshot:
+            run.tracker_metadata["custom_metric_snapshot"] = custom_snapshot
+            custom_prompt_snapshots = self._build_custom_prompt_snapshots(custom_snapshot)
+            if custom_prompt_snapshots:
+                run.tracker_metadata["custom_prompt_snapshots"] = custom_prompt_snapshots
+
         # Evaluate with custom metrics (if any custom metrics)
         if custom_metrics:
             custom_results = await self._evaluate_with_custom_metrics(
@@ -583,6 +662,11 @@ class RagasEvaluator:
             ragas_metrics=ragas_metrics,
             prompt_overrides=prompt_overrides,
         )
+        self._apply_summary_prompt_defaults(
+            dataset=dataset,
+            ragas_metrics=ragas_metrics,
+            prompt_overrides=prompt_overrides,
+        )
         self._apply_factual_correctness_prompt_defaults(
             dataset=dataset,
             ragas_metrics=ragas_metrics,
@@ -645,6 +729,30 @@ class RagasEvaluator:
                 continue
             self._apply_korean_answer_relevancy_prompt(metric)

+    def _apply_summary_prompt_defaults(
+        self,
+        *,
+        dataset: Dataset,
+        ragas_metrics: list[Any],
+        prompt_overrides: dict[str, str] | None,
+    ) -> None:
+        if not ragas_metrics:
+            return
+        if prompt_overrides and any(
+            metric in prompt_overrides for metric in ("summary_score", "summary_faithfulness")
+        ):
+            return
+        resolved_language = self._resolve_dataset_language(dataset)
+        if resolved_language == "en":
+            return
+
+        for metric in ragas_metrics:
+            metric_name = getattr(metric, "name", None)
+            if metric_name == "summary_score":
+                self._apply_korean_summary_score_prompts(metric)
+            elif metric_name == "summary_faithfulness":
+                self._apply_korean_summary_faithfulness_prompts(metric)
+
     def _apply_factual_correctness_prompt_defaults(
         self,
         *,
@@ -745,6 +853,56 @@ class RagasEvaluator:
                 prompt.language = "ko"
         return True

+    def _apply_korean_summary_score_prompts(self, metric: Any) -> bool:
+        question_prompt = getattr(metric, "question_generation_prompt", None)
+        answer_prompt = getattr(metric, "answer_generation_prompt", None)
+        keyphrase_prompt = getattr(metric, "extract_keyphrases_prompt", None)
+        applied = False
+
+        if question_prompt and hasattr(question_prompt, "instruction"):
+            question_prompt.instruction = self.SUMMARY_SCORE_QUESTION_INSTRUCTION
+            if hasattr(question_prompt, "language"):
+                with suppress(Exception):
+                    question_prompt.language = "ko"
+            applied = True
+
+        if answer_prompt and hasattr(answer_prompt, "instruction"):
+            answer_prompt.instruction = self.SUMMARY_SCORE_ANSWER_INSTRUCTION
+            if hasattr(answer_prompt, "language"):
+                with suppress(Exception):
+                    answer_prompt.language = "ko"
+            applied = True
+
+        if keyphrase_prompt and hasattr(keyphrase_prompt, "instruction"):
+            keyphrase_prompt.instruction = self.SUMMARY_SCORE_KEYPHRASE_INSTRUCTION
+            if hasattr(keyphrase_prompt, "language"):
+                with suppress(Exception):
+                    keyphrase_prompt.language = "ko"
+            applied = True
+
+        return applied
+
+    def _apply_korean_summary_faithfulness_prompts(self, metric: Any) -> bool:
+        statement_prompt = getattr(metric, "statement_generator_prompt", None)
+        nli_prompt = getattr(metric, "nli_statements_prompt", None)
+        applied = False
+
+        if statement_prompt and hasattr(statement_prompt, "instruction"):
+            statement_prompt.instruction = self.SUMMARY_FAITHFULNESS_STATEMENT_INSTRUCTION
+            if hasattr(statement_prompt, "language"):
+                with suppress(Exception):
+                    statement_prompt.language = "ko"
+            applied = True
+
+        if nli_prompt and hasattr(nli_prompt, "instruction"):
+            nli_prompt.instruction = self.SUMMARY_FAITHFULNESS_NLI_INSTRUCTION
+            if hasattr(nli_prompt, "language"):
+                with suppress(Exception):
+                    nli_prompt.language = "ko"
+            applied = True
+
+        return applied
+
     def _apply_korean_factual_correctness_prompts(self, metric: Any) -> bool:
         claim_prompt = getattr(metric, "claim_decomposition_prompt", None)
         nli_prompt = getattr(metric, "nli_prompt", None)
@@ -819,6 +977,8 @@ class RagasEvaluator:
                 continue
             prompt_text = prompt_overrides[metric_name]
             applied = self._override_metric_prompt(metric, prompt_text)
+            if not applied and metric_name == "faithfulness":
+                applied = self._override_faithfulness_prompt(metric, prompt_text)
             statuses[metric_name] = "applied" if applied else "unsupported"
             if not applied:
                 logger.warning("Prompt override for metric '%s' could not be applied.", metric_name)
@@ -878,6 +1038,16 @@ class RagasEvaluator:

         return False

+    @staticmethod
+    def _override_faithfulness_prompt(metric: Any, prompt_text: str) -> bool:
+        target = getattr(metric, "nli_statements_prompt", None)
+        if target is None:
+            return False
+        if hasattr(target, "instruction"):
+            target.instruction = prompt_text
+            return True
+        return False
+
     @staticmethod
     def _extract_prompt_text(value: Any) -> str | None:
         if value is None:
@@ -926,18 +1096,50 @@ class RagasEvaluator:
             metric_name = getattr(metric, "name", None)
             if not metric_name:
                 continue
-            prompt_text = self._collect_metric_prompt_text(metric)
-            if not prompt_text:
-                continue
             requested = bool(prompt_overrides and metric_name in prompt_overrides)
             status = override_status.get(metric_name)
             source = "override" if status == "applied" else "default"
-            snapshots[str(metric_name)] = {
-                "prompt": prompt_text,
-                "source": source,
-                "override_requested": requested,
-                "override_status": status,
-            }
+
+            prompts: dict[str, str] = {}
+            if metric_name == "summary_score":
+                prompts["question_generation"] = (
+                    self._extract_prompt_text(getattr(metric, "question_generation_prompt", None))
+                    or ""
+                )
+                prompts["answer_generation"] = (
+                    self._extract_prompt_text(getattr(metric, "answer_generation_prompt", None))
+                    or ""
+                )
+                prompts["extract_keyphrases"] = (
+                    self._extract_prompt_text(getattr(metric, "extract_keyphrases_prompt", None))
+                    or ""
+                )
+                prompts = {k: v for k, v in prompts.items() if v}
+            elif metric_name == "summary_faithfulness":
+                prompts["statement_generation"] = (
+                    self._extract_prompt_text(getattr(metric, "statement_generator_prompt", None))
+                    or ""
+                )
+                prompts["nli_statements"] = (
+                    self._extract_prompt_text(getattr(metric, "nli_statements_prompt", None)) or ""
+                )
+                prompts = {k: v for k, v in prompts.items() if v}
+
+            prompt_text = self._collect_metric_prompt_text(metric)
+            if prompts:
+                snapshots[str(metric_name)] = {
+                    "prompts": prompts,
+                    "source": source,
+                    "override_requested": requested,
+                    "override_status": status,
+                }
+            elif prompt_text:
+                snapshots[str(metric_name)] = {
+                    "prompt": prompt_text,
+                    "source": source,
+                    "override_requested": requested,
+                    "override_status": status,
+                }
         return snapshots

     async def _evaluate_sequential(
@@ -1135,16 +1337,26 @@ class RagasEvaluator:
         claim_details: dict[str, ClaimLevelResult] = {}

         for metric in ragas_metrics:
-            if metric.name in self.FAITHFULNESS_METRICS and self._faithfulness_ragas_failed:
-                if metric.name == "summary_faithfulness":
-                    judge_score = await self._score_summary_faithfulness_judge(sample)
-                    if judge_score is not None:
-                        scores[metric.name] = judge_score
+            if metric.name in self.FAITHFULNESS_METRICS:
+                if self._active_llm_provider == "ollama":
+                    fallback_score = self._fallback_korean_faithfulness(
+                        sample, return_details=False
+                    )
+                    if fallback_score is None:
+                        fallback_score = await self._score_faithfulness_with_fallback(sample)
+                    if fallback_score is not None:
+                        scores[metric.name] = fallback_score
+                    continue
+                if self._faithfulness_ragas_failed:
+                    if metric.name == "summary_faithfulness":
+                        judge_score = await self._score_summary_faithfulness_judge(sample)
+                        if judge_score is not None:
+                            scores[metric.name] = judge_score
+                        continue
+                    fallback_score = await self._score_faithfulness_with_fallback(sample)
+                    if fallback_score is not None:
+                        scores[metric.name] = fallback_score
                     continue
-                fallback_score = await self._score_faithfulness_with_fallback(sample)
-                if fallback_score is not None:
-                    scores[metric.name] = fallback_score
-                continue
             try:
                 # Ragas >=0.4 uses ascore() with kwargs
                 if hasattr(metric, "ascore"):
@@ -1272,6 +1484,32 @@ class RagasEvaluator:
         normalized = str(domain).strip().lower()
         return cls.SUMMARY_SCORE_COEFF_BY_DOMAIN.get(normalized, cls.SUMMARY_SCORE_COEFF)

+    def _build_custom_prompt_snapshots(self, snapshot: dict[str, Any]) -> dict[str, dict[str, Any]]:
+        entries = snapshot.get("metrics") if isinstance(snapshot, dict) else None
+        if not isinstance(entries, list):
+            return {}
+        prompt_snapshot: dict[str, dict[str, Any]] = {}
+        for entry in entries:
+            if not isinstance(entry, dict):
+                continue
+            name = entry.get("metric_name")
+            if not isinstance(name, str) or not name:
+                continue
+            evaluation_process = entry.get("evaluation_process")
+            if not isinstance(evaluation_process, str) or not evaluation_process:
+                continue
+            rules = entry.get("rules") if isinstance(entry.get("rules"), dict) else None
+            prompts: dict[str, str] = {"rule": evaluation_process}
+            if rules:
+                prompts["rules"] = json.dumps(rules, ensure_ascii=False, indent=2)
+            prompt_snapshot[name] = {
+                "prompts": prompts,
+                "source": "custom_rules",
+                "rules": rules,
+                "inputs": entry.get("inputs"),
+            }
+        return prompt_snapshot
+
     def _build_summary_score_metric(self, metric_class, ragas_llm, coeff: float | None = None):
         if coeff is None:
             coeff = self.SUMMARY_SCORE_COEFF
@@ -1653,9 +1891,11 @@ class RagasEvaluator:
                     contexts=test_case.contexts,
                 )
             else:
-                score = metric_instance.score(
+                score = self._score_custom_metric_with_metadata(
+                    metric_instance,
                     answer=test_case.answer,
                     contexts=test_case.contexts,
+                    metadata=test_case.metadata,
                 )
             scores[metric_name] = score

@@ -1676,6 +1916,19 @@ class RagasEvaluator:

         return results

+    def _score_custom_metric_with_metadata(
+        self,
+        metric_instance: Any,
+        *,
+        answer: str,
+        contexts: list[str],
+        metadata: dict[str, Any],
+    ) -> float:
+        try:
+            return float(metric_instance.score(answer=answer, contexts=contexts, metadata=metadata))
+        except TypeError:
+            return float(metric_instance.score(answer=answer, contexts=contexts))
+
     def _calculate_cost(self, model_name: str, prompt_tokens: int, completion_tokens: int) -> float:
         """Calculate estimated cost in USD based on model pricing."""
         # Find matching model key (exact or substring match)
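On the TypeError fallback in _score_custom_metric_with_metadata: the newer summary metrics take a metadata keyword (summary_risk_coverage and summary_needs_followup read metadata.summary_tags), while older custom metrics only accept answer and contexts. A hypothetical sketch of the two score() shapes, illustrative only and not code from the package:

# Illustrative only: two toy metrics showing why the evaluator tries the
# metadata-aware call first and falls back on TypeError.
from typing import Any


class TagCoverageMetric:
    """Metadata-aware toy metric, similar in spirit to summary_risk_coverage."""

    def score(self, *, answer: str, contexts: list[str], metadata: dict[str, Any]) -> float:
        expected = metadata.get("summary_tags") or []
        if not expected:
            return 1.0
        covered = sum(1 for tag in expected if tag in answer)
        return covered / len(expected)


class LegacyMetric:
    """Older signature: passing metadata= raises TypeError, which triggers the fallback."""

    def score(self, *, answer: str, contexts: list[str]) -> float:
        return 1.0 if answer else 0.0


print(TagCoverageMetric().score(answer="면책 및 한도 안내", contexts=[], metadata={"summary_tags": ["면책", "한도"]}))  # 1.0

With this shape, _score_custom_metric_with_metadata can call every custom metric the same way: metadata-aware metrics receive the test case metadata, and legacy metrics keep working unchanged.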