evalvault 1.65.0__py3-none-any.whl → 1.66.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/adapter.py +14 -0
- evalvault/adapters/inbound/api/main.py +14 -4
- evalvault/adapters/inbound/api/routers/chat.py +543 -0
- evalvault/adapters/inbound/cli/commands/run.py +14 -0
- evalvault/adapters/inbound/cli/commands/run_helpers.py +21 -2
- evalvault/adapters/outbound/report/llm_report_generator.py +13 -1
- evalvault/adapters/outbound/storage/base_sql.py +41 -1
- evalvault/adapters/outbound/tracker/langfuse_adapter.py +1 -0
- evalvault/adapters/outbound/tracker/mlflow_adapter.py +5 -0
- evalvault/adapters/outbound/tracker/phoenix_adapter.py +29 -2
- evalvault/config/settings.py +21 -0
- evalvault/domain/entities/prompt.py +1 -1
- evalvault/domain/metrics/__init__.py +8 -0
- evalvault/domain/metrics/registry.py +39 -3
- evalvault/domain/metrics/summary_accuracy.py +189 -0
- evalvault/domain/metrics/summary_needs_followup.py +45 -0
- evalvault/domain/metrics/summary_non_definitive.py +41 -0
- evalvault/domain/metrics/summary_risk_coverage.py +45 -0
- evalvault/domain/services/custom_metric_snapshot.py +233 -0
- evalvault/domain/services/evaluator.py +280 -27
- evalvault/domain/services/prompt_registry.py +39 -10
- evalvault/domain/services/threshold_profiles.py +4 -0
- evalvault/domain/services/visual_space_service.py +79 -4
- {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/METADATA +25 -1
- {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/RECORD +28 -22
- {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/WHEEL +0 -0
- {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/domain/services/custom_metric_snapshot.py
@@ -0,0 +1,233 @@
+from __future__ import annotations
+
+import hashlib
+import inspect
+from collections.abc import Iterable
+from pathlib import Path
+from typing import Any
+
+from evalvault.domain.metrics.registry import get_metric_spec_map
+
+SCHEMA_VERSION = 1
+
+_CUSTOM_METRIC_DETAILS: dict[str, dict[str, Any]] = {
+    "entity_preservation": {
+        "evaluation_method": "rule-based",
+        "inputs": ["answer", "contexts"],
+        "output": "0.0-1.0 (preserved_entities / context_entities)",
+        "evaluation_process": "Extract numeric/keyword entities from contexts and measure how many appear in the summary.",
+        "rules": {
+            "numeric_entities": ["percent", "currency", "duration", "date"],
+            "keywords_ko": [
+                "면책",
+                "제외",
+                "단서",
+                "다만",
+                "조건",
+                "자기부담",
+                "한도",
+                "감액",
+            ],
+            "keywords_en": [
+                "exclusion",
+                "deductible",
+                "limit",
+                "cap",
+                "copay",
+                "coinsurance",
+            ],
+        },
+        "notes": "Insurance-risk oriented entity coverage check.",
+    },
+    "insurance_term_accuracy": {
+        "evaluation_method": "rule-based",
+        "inputs": ["answer", "contexts"],
+        "output": "0.0-1.0 (verified_terms / answer_terms)",
+        "evaluation_process": "Detect insurance terms in the answer and verify their presence in contexts.",
+        "rules": {"terms_dictionary": "terms_dictionary.json"},
+        "notes": "Insurance glossary matching with canonical/variant terms.",
+    },
+    "summary_accuracy": {
+        "evaluation_method": "rule-based",
+        "inputs": ["answer", "contexts"],
+        "output": "0.0-1.0 (supported_summary_entities / summary_entities)",
+        "evaluation_process": "Extract numeric/keyword entities from summary and verify their presence in contexts.",
+        "rules": {
+            "numeric_entities": ["percent", "currency", "duration", "date"],
+            "keywords_ko": ["면책", "제외", "단서", "다만", "조건", "자기부담", "한도", "감액"],
+            "keywords_en": ["exclusion", "deductible", "limit", "cap", "waiting period"],
+        },
+        "notes": "Penalizes summary entities not grounded in contexts.",
+    },
+    "summary_risk_coverage": {
+        "evaluation_method": "rule-based",
+        "inputs": ["answer", "metadata.summary_tags"],
+        "output": "0.0-1.0 (covered_tags / expected_tags)",
+        "evaluation_process": "Check if summary mentions expected insurance risk tags.",
+        "rules": {
+            "exclusion": ["면책", "보장 제외", "지급 불가", "exclusion"],
+            "deductible": ["자기부담", "본인부담금", "deductible", "copay"],
+            "limit": ["한도", "상한", "최대", "limit", "cap"],
+            "waiting_period": ["면책기간", "대기기간", "waiting period"],
+            "condition": ["조건", "단서", "다만", "condition"],
+            "documents_required": ["서류", "진단서", "영수증", "documents"],
+            "needs_followup": ["확인 필요", "추가 확인", "담당자 확인", "재문의", "follow up"],
+        },
+        "notes": "Uses metadata summary_tags to define expected coverage.",
+    },
+    "summary_non_definitive": {
+        "evaluation_method": "rule-based",
+        "inputs": ["answer"],
+        "output": "1.0 if definitive claims absent else 0.0",
+        "evaluation_process": "Detect definitive expressions that increase liability risk.",
+        "rules": {
+            "patterns_ko": ["무조건", "반드시", "100%", "전액 지급", "확실히", "분명히", "절대"],
+            "patterns_en": [
+                "always",
+                "guaranteed",
+                "definitely",
+                "certainly",
+                "absolutely",
+                "100%",
+            ],
+        },
+        "notes": "Higher is safer; penalizes absolute guarantees.",
+    },
+    "summary_needs_followup": {
+        "evaluation_method": "rule-based",
+        "inputs": ["answer", "metadata.summary_tags"],
+        "output": "1.0 if follow-up guidance matches expected need",
+        "evaluation_process": "Check follow-up guidance when needs_followup tag exists.",
+        "rules": {
+            "followup_keywords": [
+                "확인 필요",
+                "추가 확인",
+                "담당자 확인",
+                "재문의",
+                "추가 문의",
+                "follow up",
+            ]
+        },
+        "notes": "Requires tags to avoid false penalties.",
+    },
+    "no_answer_accuracy": {
+        "evaluation_method": "rule-based",
+        "inputs": ["answer", "ground_truth"],
+        "output": "1.0 if abstention behavior matches, else 0.0",
+        "evaluation_process": "Detect abstention patterns in answer and ground_truth and compare behavior.",
+        "rules": {"patterns": "Korean/English regex patterns"},
+        "notes": "Hallucination/abstention behavior check.",
+    },
+    "exact_match": {
+        "evaluation_method": "string-match",
+        "inputs": ["answer", "ground_truth"],
+        "output": "1.0 exact match else 0.0",
+        "evaluation_process": "Normalize text and compare exact match with optional strict number matching.",
+        "rules": {"normalize": True, "number_strict": True},
+        "notes": "Token/number strict matching for factual answers.",
+    },
+    "f1_score": {
+        "evaluation_method": "token-overlap",
+        "inputs": ["answer", "ground_truth"],
+        "output": "0.0-1.0 (weighted F1)",
+        "evaluation_process": "Tokenize, compute weighted precision/recall/F1 with number emphasis.",
+        "rules": {"number_weight": 2.0},
+        "notes": "Token-level overlap with numeric weighting.",
+    },
+    "mrr": {
+        "evaluation_method": "retrieval-rank",
+        "inputs": ["ground_truth", "contexts"],
+        "output": "0.0-1.0 (1/rank of first relevant context)",
+        "evaluation_process": "Compute relevance by token overlap and take reciprocal rank of first hit.",
+        "rules": {"relevance_threshold": 0.3},
+        "notes": "Ranking quality of retrieved contexts.",
+    },
+    "ndcg": {
+        "evaluation_method": "retrieval-rank",
+        "inputs": ["ground_truth", "contexts"],
+        "output": "0.0-1.0 (NDCG@K)",
+        "evaluation_process": "Compute graded relevance per context and calculate NDCG.",
+        "rules": {"k": 10, "use_graded": True},
+        "notes": "Ranking quality across all relevant contexts.",
+    },
+    "hit_rate": {
+        "evaluation_method": "retrieval-rank",
+        "inputs": ["ground_truth", "contexts"],
+        "output": "1.0 if any relevant context in top K else 0.0",
+        "evaluation_process": "Check whether top-K contexts contain a relevant hit.",
+        "rules": {"k": 10, "relevance_threshold": 0.3},
+        "notes": "Recall@K style coverage check.",
+    },
+    "confidence_score": {
+        "evaluation_method": "rule-based",
+        "inputs": ["answer", "ground_truth", "contexts"],
+        "output": "0.0-1.0 (weighted confidence)",
+        "evaluation_process": "Combine context coverage, answer specificity, and consistency scores.",
+        "rules": {"coverage": 0.4, "specificity": 0.3, "consistency": 0.3},
+        "notes": "Heuristic confidence signal for human escalation.",
+    },
+    "contextual_relevancy": {
+        "evaluation_method": "token-overlap",
+        "inputs": ["question", "contexts"],
+        "output": "0.0-1.0 (avg relevancy)",
+        "evaluation_process": "Measure question-context token overlap and average across contexts.",
+        "rules": {"relevance_threshold": 0.35},
+        "notes": "Reference-free context relevance check.",
+    },
+}
+
+
+def _hash_file(path: str | Path | None) -> str | None:
+    if not path:
+        return None
+    file_path = Path(path)
+    if not file_path.exists():
+        return None
+    payload = file_path.read_bytes()
+    return hashlib.sha256(payload).hexdigest()
+
+
+def _resolve_source_path(metric_class: type[Any]) -> str | None:
+    try:
+        source = inspect.getsourcefile(metric_class)
+    except TypeError:
+        return None
+    if not source:
+        return None
+    return str(Path(source).resolve())
+
+
+def build_custom_metric_snapshot(
+    metric_classes: dict[str, type[Any]],
+    metrics: Iterable[str],
+) -> dict[str, Any] | None:
+    custom_names = [name for name in metrics if name in metric_classes]
+    if not custom_names:
+        return None
+
+    spec_map = get_metric_spec_map()
+    rows: list[dict[str, Any]] = []
+    for metric_name in custom_names:
+        metric_class = metric_classes.get(metric_name)
+        if metric_class is None:
+            continue
+        source_path = _resolve_source_path(metric_class)
+        details = _CUSTOM_METRIC_DETAILS.get(metric_name, {})
+        spec = spec_map.get(metric_name)
+        rows.append(
+            {
+                "metric_name": metric_name,
+                "source": "custom",
+                "description": spec.description if spec else None,
+                "evaluation_method": details.get("evaluation_method"),
+                "inputs": details.get("inputs"),
+                "output": details.get("output"),
+                "evaluation_process": details.get("evaluation_process"),
+                "rules": details.get("rules"),
+                "notes": details.get("notes"),
+                "implementation_path": source_path,
+                "implementation_hash": _hash_file(source_path),
+            }
+        )
+
+    return {"schema_version": SCHEMA_VERSION, "metrics": rows}
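
The new module is a registry of human-readable "rule cards" for EvalVault's non-LLM metrics, plus helpers that pin down which implementation produced a score (resolved source path and a SHA-256 of that file). A minimal sketch of calling the builder directly, using two metric classes added in this release; the standalone invocation itself is illustrative, inside the package the evaluator performs this step:

```python
# Illustrative sketch only: exercises build_custom_metric_snapshot() from 1.66.0 directly.
from evalvault.domain.metrics.summary_accuracy import SummaryAccuracy
from evalvault.domain.metrics.summary_risk_coverage import SummaryRiskCoverage
from evalvault.domain.services.custom_metric_snapshot import build_custom_metric_snapshot

metric_classes = {
    "summary_accuracy": SummaryAccuracy,
    "summary_risk_coverage": SummaryRiskCoverage,
}

# Only names present in metric_classes count as custom; "faithfulness" is ignored here.
snapshot = build_custom_metric_snapshot(metric_classes, ["summary_accuracy", "faithfulness"])
assert snapshot is not None and snapshot["schema_version"] == 1
row = snapshot["metrics"][0]
print(row["metric_name"], row["evaluation_method"])  # summary_accuracy rule-based
print(row["implementation_hash"])                    # sha256 of the metric's source file
```

Returning `None` when no requested metric is custom keeps the tracker metadata free of empty snapshot blocks.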
evalvault/domain/services/evaluator.py
@@ -11,8 +11,9 @@ from collections.abc import Callable, Sequence
 from contextlib import suppress
 from dataclasses import dataclass
 from datetime import datetime
-from typing import Any, Literal, cast
+from typing import Any, Literal, overload
 
+from pydantic import BaseModel, Field, field_validator
 from ragas import SingleTurnSample
 
 from evalvault.domain.entities import (
@@ -30,8 +31,13 @@ from evalvault.domain.metrics.entity_preservation import EntityPreservation
 from evalvault.domain.metrics.insurance import InsuranceTermAccuracy
 from evalvault.domain.metrics.no_answer import NoAnswerAccuracy
 from evalvault.domain.metrics.retrieval_rank import MRR, NDCG, HitRate
+from evalvault.domain.metrics.summary_accuracy import SummaryAccuracy
+from evalvault.domain.metrics.summary_needs_followup import SummaryNeedsFollowup
+from evalvault.domain.metrics.summary_non_definitive import SummaryNonDefinitive
+from evalvault.domain.metrics.summary_risk_coverage import SummaryRiskCoverage
 from evalvault.domain.metrics.text_match import ExactMatch, F1Score
 from evalvault.domain.services.batch_executor import run_in_batches
+from evalvault.domain.services.custom_metric_snapshot import build_custom_metric_snapshot
 from evalvault.domain.services.dataset_preprocessor import DatasetPreprocessor
 from evalvault.domain.services.retriever_context import apply_retriever_to_dataset
 from evalvault.ports.outbound.korean_nlp_port import KoreanNLPToolkitPort, RetrieverPort
@@ -55,14 +61,53 @@ _SUMMARY_FAITHFULNESS_PROMPT_EN = (
 )
 
 
+def _patch_ragas_faithfulness_output() -> None:
+    try:
+        from ragas.metrics import Faithfulness
+    except Exception:
+        return
+
+    prompt = getattr(Faithfulness, "nli_statements_prompt", None)
+    if prompt is None:
+        return
+
+    output_model = getattr(prompt, "output_model", None)
+    if output_model is None:
+        return
+
+    class _StatementFaithfulnessAnswer(BaseModel):
+        statement: str = Field(..., description="the original statement, word-by-word")
+        reason: str = Field(..., description="the reason of the verdict")
+        verdict: int = Field(..., description="the verdict(0/1) of the faithfulness.")
+
+        @field_validator("verdict", mode="before")
+        @classmethod
+        def _coerce_verdict(cls, value):
+            if isinstance(value, str):
+                normalized = value.strip()
+                if normalized.isdigit():
+                    return int(normalized)
+            return value
+
+    class _NLIStatementOutput(BaseModel):
+        statements: list[_StatementFaithfulnessAnswer]
+
+    try:
+        prompt.output_model = _NLIStatementOutput
+    except Exception:
+        return
+
+
 def _import_metric(name: str) -> type[Any]:
     for module_name in ("ragas.metrics.collections", "ragas.metrics"):
         try:
             module = importlib.import_module(module_name)
-
+            if hasattr(module, name):
+                if name == "Faithfulness":
+                    _patch_ragas_faithfulness_output()
+                return getattr(module, name)
+        except ImportError:
             continue
-        if hasattr(module, name):
-            return cast(type[Any], getattr(module, name))
     raise ImportError(f"Missing ragas metric: {name}")
 
 
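
The patch above swaps the output model on Ragas' faithfulness NLI prompt for one whose `verdict` field tolerates digit strings, which smaller judge models frequently emit. A self-contained sketch of that validator behavior (the class name here is illustrative, not the package's):

```python
# Illustrative sketch of the verdict coercion the patch installs (pydantic v2 API).
from pydantic import BaseModel, Field, field_validator


class StatementVerdict(BaseModel):
    statement: str
    reason: str
    verdict: int = Field(..., description="0/1 faithfulness verdict")

    @field_validator("verdict", mode="before")
    @classmethod
    def _coerce_verdict(cls, value):
        # Normalize "1" / " 0 " style strings before the int check runs.
        if isinstance(value, str) and value.strip().isdigit():
            return int(value.strip())
        return value


print(StatementVerdict(statement="s", reason="r", verdict="1").verdict)  # -> 1
```

The fallback scoring paths further down in this diff remain in place for cases where the Ragas call still fails.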
@@ -147,6 +192,10 @@ class RagasEvaluator:
     CUSTOM_METRIC_MAP = {
         "insurance_term_accuracy": InsuranceTermAccuracy,
         "entity_preservation": EntityPreservation,
+        "summary_accuracy": SummaryAccuracy,
+        "summary_risk_coverage": SummaryRiskCoverage,
+        "summary_non_definitive": SummaryNonDefinitive,
+        "summary_needs_followup": SummaryNeedsFollowup,
         "exact_match": ExactMatch,
         "f1_score": F1Score,
         "no_answer_accuracy": NoAnswerAccuracy,
@@ -198,6 +247,10 @@ class RagasEvaluator:
         "summary_faithfulness": 0.9,
         "summary_score": 0.85,
         "entity_preservation": 0.9,
+        "summary_accuracy": 0.9,
+        "summary_risk_coverage": 0.9,
+        "summary_non_definitive": 0.8,
+        "summary_needs_followup": 0.8,
         "contextual_relevancy": 0.35,
     }
     LANGUAGE_SAMPLE_LIMIT = 5
@@ -225,10 +278,28 @@ class RagasEvaluator:
         "예시의 원자성 수준을 따르세요."
     )
     FACTUAL_CORRECTNESS_NLI_INSTRUCTION = (
-        "
-        "
-
-
+        "주어진 컨텍스트를 보고 각 진술이 직접적으로 도출 가능한지 판단하세요. "
+        "가능하면 verdict=1, 불가능하면 verdict=0을 JSON으로 반환하세요."
+    )
+    SUMMARY_SCORE_QUESTION_INSTRUCTION = (
+        "다음 텍스트와 핵심 키워드를 기반으로, "
+        "텍스트에 근거해 반드시 1로 답할 수 있는 폐쇄형 질문을 생성하세요. "
+        "질문은 한국어로 작성하세요."
+    )
+    SUMMARY_SCORE_ANSWER_INSTRUCTION = (
+        "다음 질문 목록에 대해, 제공된 요약이 각 질문에 답할 수 있으면 '1', "
+        "그렇지 않으면 '0'을 JSON 배열로 반환하세요."
+    )
+    SUMMARY_SCORE_KEYPHRASE_INSTRUCTION = (
+        "다음 텍스트에서 인물, 기관, 위치, 날짜/시간, 금액, 비율과 같은 핵심 키워드를 추출하세요."
+    )
+    SUMMARY_FAITHFULNESS_STATEMENT_INSTRUCTION = (
+        "질문과 답변을 보고 각 문장을 이해 가능한 주장으로 분해하세요. "
+        "각 주장은 대명사 없이 독립적으로 이해 가능해야 합니다."
+    )
+    SUMMARY_FAITHFULNESS_NLI_INSTRUCTION = (
+        "주어진 컨텍스트를 보고 각 진술이 직접적으로 도출 가능한지 판단하세요. "
+        "가능하면 verdict=1, 불가능하면 verdict=0을 JSON으로 반환하세요."
     )
     FACTUAL_CORRECTNESS_CLAIM_EXAMPLES = [
         {
@@ -390,6 +461,7 @@ class RagasEvaluator:
 
         # Evaluate with Ragas (if any Ragas metrics)
         eval_results_by_test_case = {}
+        prompt_snapshots = {}
         if ragas_metrics:
             run.tracker_metadata["ragas_config"] = self._build_ragas_config(llm)
             (
@@ -412,6 +484,13 @@ class RagasEvaluator:
         elif prompt_overrides:
             logger.warning("Ragas prompt overrides provided but no Ragas metrics requested.")
 
+        custom_snapshot = build_custom_metric_snapshot(self.CUSTOM_METRIC_MAP, metrics)
+        if custom_snapshot:
+            run.tracker_metadata["custom_metric_snapshot"] = custom_snapshot
+            custom_prompt_snapshots = self._build_custom_prompt_snapshots(custom_snapshot)
+            if custom_prompt_snapshots:
+                run.tracker_metadata["custom_prompt_snapshots"] = custom_prompt_snapshots
+
         # Evaluate with custom metrics (if any custom metrics)
         if custom_metrics:
             custom_results = await self._evaluate_with_custom_metrics(
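
Net effect of this hunk: whenever any requested metric is custom (rule-based), the run's tracker metadata carries both the metric snapshot and a prompt-style rendering of its rules, so the configured tracker backend can record what those scores mean. Abridged, the added keys look roughly like this (values illustrative, shapes per the builders shown in this diff):

```python
# Illustrative only: abridged shape of the metadata attached when custom metrics run.
tracker_metadata = {
    "custom_metric_snapshot": {
        "schema_version": 1,
        "metrics": [
            {"metric_name": "summary_risk_coverage", "evaluation_method": "rule-based",
             "inputs": ["answer", "metadata.summary_tags"], "implementation_hash": "<sha256>"},
        ],
    },
    "custom_prompt_snapshots": {
        "summary_risk_coverage": {
            "prompts": {"rule": "Check if summary mentions expected insurance risk tags."},
            "source": "custom_rules",
        },
    },
}
```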
@@ -583,6 +662,11 @@ class RagasEvaluator:
             ragas_metrics=ragas_metrics,
             prompt_overrides=prompt_overrides,
         )
+        self._apply_summary_prompt_defaults(
+            dataset=dataset,
+            ragas_metrics=ragas_metrics,
+            prompt_overrides=prompt_overrides,
+        )
         self._apply_factual_correctness_prompt_defaults(
             dataset=dataset,
             ragas_metrics=ragas_metrics,
@@ -645,6 +729,30 @@ class RagasEvaluator:
                 continue
             self._apply_korean_answer_relevancy_prompt(metric)
 
+    def _apply_summary_prompt_defaults(
+        self,
+        *,
+        dataset: Dataset,
+        ragas_metrics: list[Any],
+        prompt_overrides: dict[str, str] | None,
+    ) -> None:
+        if not ragas_metrics:
+            return
+        if prompt_overrides and any(
+            metric in prompt_overrides for metric in ("summary_score", "summary_faithfulness")
+        ):
+            return
+        resolved_language = self._resolve_dataset_language(dataset)
+        if resolved_language == "en":
+            return
+
+        for metric in ragas_metrics:
+            metric_name = getattr(metric, "name", None)
+            if metric_name == "summary_score":
+                self._apply_korean_summary_score_prompts(metric)
+            elif metric_name == "summary_faithfulness":
+                self._apply_korean_summary_faithfulness_prompts(metric)
+
     def _apply_factual_correctness_prompt_defaults(
         self,
         *,
@@ -745,6 +853,56 @@ class RagasEvaluator:
             prompt.language = "ko"
         return True
 
+    def _apply_korean_summary_score_prompts(self, metric: Any) -> bool:
+        question_prompt = getattr(metric, "question_generation_prompt", None)
+        answer_prompt = getattr(metric, "answer_generation_prompt", None)
+        keyphrase_prompt = getattr(metric, "extract_keyphrases_prompt", None)
+        applied = False
+
+        if question_prompt and hasattr(question_prompt, "instruction"):
+            question_prompt.instruction = self.SUMMARY_SCORE_QUESTION_INSTRUCTION
+            if hasattr(question_prompt, "language"):
+                with suppress(Exception):
+                    question_prompt.language = "ko"
+            applied = True
+
+        if answer_prompt and hasattr(answer_prompt, "instruction"):
+            answer_prompt.instruction = self.SUMMARY_SCORE_ANSWER_INSTRUCTION
+            if hasattr(answer_prompt, "language"):
+                with suppress(Exception):
+                    answer_prompt.language = "ko"
+            applied = True
+
+        if keyphrase_prompt and hasattr(keyphrase_prompt, "instruction"):
+            keyphrase_prompt.instruction = self.SUMMARY_SCORE_KEYPHRASE_INSTRUCTION
+            if hasattr(keyphrase_prompt, "language"):
+                with suppress(Exception):
+                    keyphrase_prompt.language = "ko"
+            applied = True
+
+        return applied
+
+    def _apply_korean_summary_faithfulness_prompts(self, metric: Any) -> bool:
+        statement_prompt = getattr(metric, "statement_generator_prompt", None)
+        nli_prompt = getattr(metric, "nli_statements_prompt", None)
+        applied = False
+
+        if statement_prompt and hasattr(statement_prompt, "instruction"):
+            statement_prompt.instruction = self.SUMMARY_FAITHFULNESS_STATEMENT_INSTRUCTION
+            if hasattr(statement_prompt, "language"):
+                with suppress(Exception):
+                    statement_prompt.language = "ko"
+            applied = True
+
+        if nli_prompt and hasattr(nli_prompt, "instruction"):
+            nli_prompt.instruction = self.SUMMARY_FAITHFULNESS_NLI_INSTRUCTION
+            if hasattr(nli_prompt, "language"):
+                with suppress(Exception):
+                    nli_prompt.language = "ko"
+            applied = True
+
+        return applied
+
     def _apply_korean_factual_correctness_prompts(self, metric: Any) -> bool:
         claim_prompt = getattr(metric, "claim_decomposition_prompt", None)
         nli_prompt = getattr(metric, "nli_prompt", None)
@@ -819,6 +977,8 @@ class RagasEvaluator:
                 continue
             prompt_text = prompt_overrides[metric_name]
             applied = self._override_metric_prompt(metric, prompt_text)
+            if not applied and metric_name == "faithfulness":
+                applied = self._override_faithfulness_prompt(metric, prompt_text)
             statuses[metric_name] = "applied" if applied else "unsupported"
             if not applied:
                 logger.warning("Prompt override for metric '%s' could not be applied.", metric_name)
@@ -878,6 +1038,16 @@ class RagasEvaluator:
 
         return False
 
+    @staticmethod
+    def _override_faithfulness_prompt(metric: Any, prompt_text: str) -> bool:
+        target = getattr(metric, "nli_statements_prompt", None)
+        if target is None:
+            return False
+        if hasattr(target, "instruction"):
+            target.instruction = prompt_text
+            return True
+        return False
+
     @staticmethod
     def _extract_prompt_text(value: Any) -> str | None:
         if value is None:
@@ -926,18 +1096,50 @@ class RagasEvaluator:
             metric_name = getattr(metric, "name", None)
             if not metric_name:
                 continue
-            prompt_text = self._collect_metric_prompt_text(metric)
-            if not prompt_text:
-                continue
             requested = bool(prompt_overrides and metric_name in prompt_overrides)
             status = override_status.get(metric_name)
             source = "override" if status == "applied" else "default"
-            snapshots[str(metric_name)] = {
-                "prompt": prompt_text,
-                "source": source,
-                "override_requested": requested,
-                "override_status": status,
-            }
+
+            prompts: dict[str, str] = {}
+            if metric_name == "summary_score":
+                prompts["question_generation"] = (
+                    self._extract_prompt_text(getattr(metric, "question_generation_prompt", None))
+                    or ""
+                )
+                prompts["answer_generation"] = (
+                    self._extract_prompt_text(getattr(metric, "answer_generation_prompt", None))
+                    or ""
+                )
+                prompts["extract_keyphrases"] = (
+                    self._extract_prompt_text(getattr(metric, "extract_keyphrases_prompt", None))
+                    or ""
+                )
+                prompts = {k: v for k, v in prompts.items() if v}
+            elif metric_name == "summary_faithfulness":
+                prompts["statement_generation"] = (
+                    self._extract_prompt_text(getattr(metric, "statement_generator_prompt", None))
+                    or ""
+                )
+                prompts["nli_statements"] = (
+                    self._extract_prompt_text(getattr(metric, "nli_statements_prompt", None)) or ""
+                )
+                prompts = {k: v for k, v in prompts.items() if v}
+
+            prompt_text = self._collect_metric_prompt_text(metric)
+            if prompts:
+                snapshots[str(metric_name)] = {
+                    "prompts": prompts,
+                    "source": source,
+                    "override_requested": requested,
+                    "override_status": status,
+                }
+            elif prompt_text:
+                snapshots[str(metric_name)] = {
+                    "prompt": prompt_text,
+                    "source": source,
+                    "override_requested": requested,
+                    "override_status": status,
+                }
         return snapshots
 
     async def _evaluate_sequential(
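
For `summary_score` and `summary_faithfulness`, the prompt snapshot recorded per run now keeps each sub-prompt separately instead of one collapsed string; other metrics keep the legacy single-`prompt` shape. Roughly (field names from the hunk above, values abridged and illustrative):

```python
# Illustrative only: approximate shape of the snapshot entries after this change.
summary_score_entry = {
    "prompts": {
        "question_generation": "다음 텍스트와 핵심 키워드를 기반으로, ...",
        "answer_generation": "다음 질문 목록에 대해, ...",
        "extract_keyphrases": "다음 텍스트에서 ... 핵심 키워드를 추출하세요.",
    },
    "source": "default",  # "override" when a prompt override was applied
    "override_requested": False,
    "override_status": None,
}
legacy_entry = {
    "prompt": "<full prompt text>",
    "source": "default",
    "override_requested": False,
    "override_status": None,
}
```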
@@ -1135,16 +1337,26 @@ class RagasEvaluator:
         claim_details: dict[str, ClaimLevelResult] = {}
 
         for metric in ragas_metrics:
-            if metric.name in self.FAITHFULNESS_METRICS and self._faithfulness_ragas_failed:
-                if metric.name == "summary_faithfulness":
-                    judge_score = await self._score_summary_faithfulness_judge(sample)
-                    if judge_score is not None:
-                        scores[metric.name] = judge_score
+            if metric.name in self.FAITHFULNESS_METRICS:
+                if self._active_llm_provider == "ollama":
+                    fallback_score = self._fallback_korean_faithfulness(
+                        sample, return_details=False
+                    )
+                    if fallback_score is None:
+                        fallback_score = await self._score_faithfulness_with_fallback(sample)
+                    if fallback_score is not None:
+                        scores[metric.name] = fallback_score
+                    continue
+                if self._faithfulness_ragas_failed:
+                    if metric.name == "summary_faithfulness":
+                        judge_score = await self._score_summary_faithfulness_judge(sample)
+                        if judge_score is not None:
+                            scores[metric.name] = judge_score
+                            continue
+                    fallback_score = await self._score_faithfulness_with_fallback(sample)
+                    if fallback_score is not None:
+                        scores[metric.name] = fallback_score
                         continue
-                fallback_score = await self._score_faithfulness_with_fallback(sample)
-                if fallback_score is not None:
-                    scores[metric.name] = fallback_score
-                    continue
             try:
                 # Ragas >=0.4 uses ascore() with kwargs
                 if hasattr(metric, "ascore"):
@@ -1272,6 +1484,32 @@ class RagasEvaluator:
         normalized = str(domain).strip().lower()
         return cls.SUMMARY_SCORE_COEFF_BY_DOMAIN.get(normalized, cls.SUMMARY_SCORE_COEFF)
 
+    def _build_custom_prompt_snapshots(self, snapshot: dict[str, Any]) -> dict[str, dict[str, Any]]:
+        entries = snapshot.get("metrics") if isinstance(snapshot, dict) else None
+        if not isinstance(entries, list):
+            return {}
+        prompt_snapshot: dict[str, dict[str, Any]] = {}
+        for entry in entries:
+            if not isinstance(entry, dict):
+                continue
+            name = entry.get("metric_name")
+            if not isinstance(name, str) or not name:
+                continue
+            evaluation_process = entry.get("evaluation_process")
+            if not isinstance(evaluation_process, str) or not evaluation_process:
+                continue
+            rules = entry.get("rules") if isinstance(entry.get("rules"), dict) else None
+            prompts: dict[str, str] = {"rule": evaluation_process}
+            if rules:
+                prompts["rules"] = json.dumps(rules, ensure_ascii=False, indent=2)
+            prompt_snapshot[name] = {
+                "prompts": prompts,
+                "source": "custom_rules",
+                "rules": rules,
+                "inputs": entry.get("inputs"),
+            }
+        return prompt_snapshot
+
     def _build_summary_score_metric(self, metric_class, ragas_llm, coeff: float | None = None):
         if coeff is None:
             coeff = self.SUMMARY_SCORE_COEFF
@@ -1653,9 +1891,11 @@ class RagasEvaluator:
                     contexts=test_case.contexts,
                 )
             else:
-                score = metric_instance.score(
+                score = self._score_custom_metric_with_metadata(
+                    metric_instance,
                     answer=test_case.answer,
                     contexts=test_case.contexts,
+                    metadata=test_case.metadata,
                 )
             scores[metric_name] = score
 
@@ -1676,6 +1916,19 @@ class RagasEvaluator:
 
         return results
 
+    def _score_custom_metric_with_metadata(
+        self,
+        metric_instance: Any,
+        *,
+        answer: str,
+        contexts: list[str],
+        metadata: dict[str, Any],
+    ) -> float:
+        try:
+            return float(metric_instance.score(answer=answer, contexts=contexts, metadata=metadata))
+        except TypeError:
+            return float(metric_instance.score(answer=answer, contexts=contexts))
+
     def _calculate_cost(self, model_name: str, prompt_tokens: int, completion_tokens: int) -> float:
         """Calculate estimated cost in USD based on model pricing."""
         # Find matching model key (exact or substring match)