evalvault 1.64.0__py3-none-any.whl → 1.66.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/adapter.py +14 -0
- evalvault/adapters/inbound/api/main.py +14 -4
- evalvault/adapters/inbound/api/routers/chat.py +543 -0
- evalvault/adapters/inbound/cli/commands/__init__.py +14 -7
- evalvault/adapters/inbound/cli/commands/artifacts.py +107 -0
- evalvault/adapters/inbound/cli/commands/calibrate_judge.py +283 -0
- evalvault/adapters/inbound/cli/commands/compare.py +290 -0
- evalvault/adapters/inbound/cli/commands/history.py +13 -85
- evalvault/adapters/inbound/cli/commands/ops.py +110 -0
- evalvault/adapters/inbound/cli/commands/profile_difficulty.py +160 -0
- evalvault/adapters/inbound/cli/commands/regress.py +251 -0
- evalvault/adapters/inbound/cli/commands/run.py +14 -0
- evalvault/adapters/inbound/cli/commands/run_helpers.py +21 -2
- evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py +49 -0
- evalvault/adapters/outbound/artifact_fs.py +16 -0
- evalvault/adapters/outbound/filesystem/__init__.py +3 -0
- evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py +50 -0
- evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py +13 -0
- evalvault/adapters/outbound/judge_calibration_adapter.py +36 -0
- evalvault/adapters/outbound/judge_calibration_reporter.py +57 -0
- evalvault/adapters/outbound/report/llm_report_generator.py +13 -1
- evalvault/adapters/outbound/storage/base_sql.py +41 -1
- evalvault/adapters/outbound/tracker/langfuse_adapter.py +13 -7
- evalvault/adapters/outbound/tracker/mlflow_adapter.py +5 -0
- evalvault/adapters/outbound/tracker/phoenix_adapter.py +68 -14
- evalvault/config/settings.py +21 -0
- evalvault/domain/entities/__init__.py +10 -0
- evalvault/domain/entities/judge_calibration.py +50 -0
- evalvault/domain/entities/prompt.py +1 -1
- evalvault/domain/entities/stage.py +11 -3
- evalvault/domain/metrics/__init__.py +8 -0
- evalvault/domain/metrics/registry.py +39 -3
- evalvault/domain/metrics/summary_accuracy.py +189 -0
- evalvault/domain/metrics/summary_needs_followup.py +45 -0
- evalvault/domain/metrics/summary_non_definitive.py +41 -0
- evalvault/domain/metrics/summary_risk_coverage.py +45 -0
- evalvault/domain/services/artifact_lint_service.py +268 -0
- evalvault/domain/services/benchmark_runner.py +1 -6
- evalvault/domain/services/custom_metric_snapshot.py +233 -0
- evalvault/domain/services/dataset_preprocessor.py +26 -0
- evalvault/domain/services/difficulty_profile_reporter.py +25 -0
- evalvault/domain/services/difficulty_profiling_service.py +304 -0
- evalvault/domain/services/evaluator.py +282 -27
- evalvault/domain/services/judge_calibration_service.py +495 -0
- evalvault/domain/services/ops_snapshot_service.py +159 -0
- evalvault/domain/services/prompt_registry.py +39 -10
- evalvault/domain/services/regression_gate_service.py +199 -0
- evalvault/domain/services/run_comparison_service.py +159 -0
- evalvault/domain/services/stage_event_builder.py +6 -1
- evalvault/domain/services/stage_metric_service.py +83 -18
- evalvault/domain/services/threshold_profiles.py +4 -0
- evalvault/domain/services/visual_space_service.py +79 -4
- evalvault/ports/outbound/__init__.py +4 -0
- evalvault/ports/outbound/artifact_fs_port.py +12 -0
- evalvault/ports/outbound/comparison_pipeline_port.py +22 -0
- evalvault/ports/outbound/difficulty_profile_port.py +15 -0
- evalvault/ports/outbound/judge_calibration_port.py +22 -0
- evalvault/ports/outbound/ops_snapshot_port.py +8 -0
- {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/METADATA +25 -1
- {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/RECORD +63 -31
- {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/WHEEL +0 -0
- {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/domain/services/evaluator.py

@@ -11,8 +11,9 @@ from collections.abc import Callable, Sequence
 from contextlib import suppress
 from dataclasses import dataclass
 from datetime import datetime
-from typing import Any, Literal, cast
+from typing import Any, Literal, overload
 
+from pydantic import BaseModel, Field, field_validator
 from ragas import SingleTurnSample
 
 from evalvault.domain.entities import (
@@ -30,8 +31,13 @@ from evalvault.domain.metrics.entity_preservation import EntityPreservation
 from evalvault.domain.metrics.insurance import InsuranceTermAccuracy
 from evalvault.domain.metrics.no_answer import NoAnswerAccuracy
 from evalvault.domain.metrics.retrieval_rank import MRR, NDCG, HitRate
+from evalvault.domain.metrics.summary_accuracy import SummaryAccuracy
+from evalvault.domain.metrics.summary_needs_followup import SummaryNeedsFollowup
+from evalvault.domain.metrics.summary_non_definitive import SummaryNonDefinitive
+from evalvault.domain.metrics.summary_risk_coverage import SummaryRiskCoverage
 from evalvault.domain.metrics.text_match import ExactMatch, F1Score
 from evalvault.domain.services.batch_executor import run_in_batches
+from evalvault.domain.services.custom_metric_snapshot import build_custom_metric_snapshot
 from evalvault.domain.services.dataset_preprocessor import DatasetPreprocessor
 from evalvault.domain.services.retriever_context import apply_retriever_to_dataset
 from evalvault.ports.outbound.korean_nlp_port import KoreanNLPToolkitPort, RetrieverPort
@@ -55,14 +61,53 @@ _SUMMARY_FAITHFULNESS_PROMPT_EN = (
 )
 
 
+def _patch_ragas_faithfulness_output() -> None:
+    try:
+        from ragas.metrics import Faithfulness
+    except Exception:
+        return
+
+    prompt = getattr(Faithfulness, "nli_statements_prompt", None)
+    if prompt is None:
+        return
+
+    output_model = getattr(prompt, "output_model", None)
+    if output_model is None:
+        return
+
+    class _StatementFaithfulnessAnswer(BaseModel):
+        statement: str = Field(..., description="the original statement, word-by-word")
+        reason: str = Field(..., description="the reason of the verdict")
+        verdict: int = Field(..., description="the verdict(0/1) of the faithfulness.")
+
+        @field_validator("verdict", mode="before")
+        @classmethod
+        def _coerce_verdict(cls, value):
+            if isinstance(value, str):
+                normalized = value.strip()
+                if normalized.isdigit():
+                    return int(normalized)
+            return value
+
+    class _NLIStatementOutput(BaseModel):
+        statements: list[_StatementFaithfulnessAnswer]
+
+    try:
+        prompt.output_model = _NLIStatementOutput
+    except Exception:
+        return
+
+
 def _import_metric(name: str) -> type[Any]:
     for module_name in ("ragas.metrics.collections", "ragas.metrics"):
         try:
             module = importlib.import_module(module_name)
-        except ImportError:
+            if hasattr(module, name):
+                if name == "Faithfulness":
+                    _patch_ragas_faithfulness_output()
+                return getattr(module, name)
+        except ImportError:
             continue
-        if hasattr(module, name):
-            return cast(type[Any], getattr(module, name))
     raise ImportError(f"Missing ragas metric: {name}")
 
 
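Note: the _patch_ragas_faithfulness_output hook above swaps in an output model whose verdict field tolerates string values. The pattern it relies on is pydantic v2's field_validator(..., mode="before"), which coerces judge output such as "1" to an int before validation. A minimal, self-contained sketch of that coercion (the NLIVerdict model and the sample statements are illustrative, not part of evalvault):

from pydantic import BaseModel, Field, field_validator


class NLIVerdict(BaseModel):
    """Illustrative stand-in for the patched statement model."""

    statement: str
    verdict: int = Field(..., description="0 or 1")

    @field_validator("verdict", mode="before")
    @classmethod
    def _coerce_verdict(cls, value):
        # Accept "1" / " 0 " from an LLM and coerce to int before validation.
        if isinstance(value, str) and value.strip().isdigit():
            return int(value.strip())
        return value


# "1" (string) validates as 1 (int); a bare int passes through unchanged.
assert NLIVerdict(statement="The policy covers fire damage.", verdict="1").verdict == 1
assert NLIVerdict(statement="The policy covers flood damage.", verdict=0).verdict == 0

The patch only replaces output_model when the installed ragas exposes nli_statements_prompt, so other prompt layouts are left untouched.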
@@ -147,6 +192,10 @@ class RagasEvaluator:
     CUSTOM_METRIC_MAP = {
         "insurance_term_accuracy": InsuranceTermAccuracy,
         "entity_preservation": EntityPreservation,
+        "summary_accuracy": SummaryAccuracy,
+        "summary_risk_coverage": SummaryRiskCoverage,
+        "summary_non_definitive": SummaryNonDefinitive,
+        "summary_needs_followup": SummaryNeedsFollowup,
         "exact_match": ExactMatch,
         "f1_score": F1Score,
         "no_answer_accuracy": NoAnswerAccuracy,
@@ -198,6 +247,10 @@ class RagasEvaluator:
         "summary_faithfulness": 0.9,
         "summary_score": 0.85,
         "entity_preservation": 0.9,
+        "summary_accuracy": 0.9,
+        "summary_risk_coverage": 0.9,
+        "summary_non_definitive": 0.8,
+        "summary_needs_followup": 0.8,
         "contextual_relevancy": 0.35,
     }
     LANGUAGE_SAMPLE_LIMIT = 5
@@ -225,10 +278,28 @@ class RagasEvaluator:
         "예시의 원자성 수준을 따르세요."
     )
     FACTUAL_CORRECTNESS_NLI_INSTRUCTION = (
-        "
-        "
-
-
+        "주어진 컨텍스트를 보고 각 진술이 직접적으로 도출 가능한지 판단하세요. "
+        "가능하면 verdict=1, 불가능하면 verdict=0을 JSON으로 반환하세요."
+    )
+    SUMMARY_SCORE_QUESTION_INSTRUCTION = (
+        "다음 텍스트와 핵심 키워드를 기반으로, "
+        "텍스트에 근거해 반드시 1로 답할 수 있는 폐쇄형 질문을 생성하세요. "
+        "질문은 한국어로 작성하세요."
+    )
+    SUMMARY_SCORE_ANSWER_INSTRUCTION = (
+        "다음 질문 목록에 대해, 제공된 요약이 각 질문에 답할 수 있으면 '1', "
+        "그렇지 않으면 '0'을 JSON 배열로 반환하세요."
+    )
+    SUMMARY_SCORE_KEYPHRASE_INSTRUCTION = (
+        "다음 텍스트에서 인물, 기관, 위치, 날짜/시간, 금액, 비율과 같은 핵심 키워드를 추출하세요."
+    )
+    SUMMARY_FAITHFULNESS_STATEMENT_INSTRUCTION = (
+        "질문과 답변을 보고 각 문장을 이해 가능한 주장으로 분해하세요. "
+        "각 주장은 대명사 없이 독립적으로 이해 가능해야 합니다."
+    )
+    SUMMARY_FAITHFULNESS_NLI_INSTRUCTION = (
+        "주어진 컨텍스트를 보고 각 진술이 직접적으로 도출 가능한지 판단하세요. "
+        "가능하면 verdict=1, 불가능하면 verdict=0을 JSON으로 반환하세요."
     )
     FACTUAL_CORRECTNESS_CLAIM_EXAMPLES = [
         {
@@ -330,6 +401,8 @@ class RagasEvaluator:
         self._active_llm_model = llm.get_model_name()
         self._active_llm = llm
         self._prompt_language = self._normalize_language_hint(language) if language else None
+        if self._prompt_language is None:
+            self._prompt_language = self._resolve_dataset_language(dataset)
         # Resolve thresholds: CLI > dataset > default(0.7)
         resolved_thresholds = {}
         for metric in metrics:
@@ -388,6 +461,7 @@ class RagasEvaluator:
 
         # Evaluate with Ragas (if any Ragas metrics)
         eval_results_by_test_case = {}
+        prompt_snapshots = {}
         if ragas_metrics:
             run.tracker_metadata["ragas_config"] = self._build_ragas_config(llm)
             (
@@ -410,6 +484,13 @@ class RagasEvaluator:
         elif prompt_overrides:
             logger.warning("Ragas prompt overrides provided but no Ragas metrics requested.")
 
+        custom_snapshot = build_custom_metric_snapshot(self.CUSTOM_METRIC_MAP, metrics)
+        if custom_snapshot:
+            run.tracker_metadata["custom_metric_snapshot"] = custom_snapshot
+            custom_prompt_snapshots = self._build_custom_prompt_snapshots(custom_snapshot)
+            if custom_prompt_snapshots:
+                run.tracker_metadata["custom_prompt_snapshots"] = custom_prompt_snapshots
+
         # Evaluate with custom metrics (if any custom metrics)
         if custom_metrics:
             custom_results = await self._evaluate_with_custom_metrics(
@@ -581,6 +662,11 @@ class RagasEvaluator:
             ragas_metrics=ragas_metrics,
             prompt_overrides=prompt_overrides,
         )
+        self._apply_summary_prompt_defaults(
+            dataset=dataset,
+            ragas_metrics=ragas_metrics,
+            prompt_overrides=prompt_overrides,
+        )
         self._apply_factual_correctness_prompt_defaults(
             dataset=dataset,
             ragas_metrics=ragas_metrics,
@@ -643,6 +729,30 @@ class RagasEvaluator:
                 continue
             self._apply_korean_answer_relevancy_prompt(metric)
 
+    def _apply_summary_prompt_defaults(
+        self,
+        *,
+        dataset: Dataset,
+        ragas_metrics: list[Any],
+        prompt_overrides: dict[str, str] | None,
+    ) -> None:
+        if not ragas_metrics:
+            return
+        if prompt_overrides and any(
+            metric in prompt_overrides for metric in ("summary_score", "summary_faithfulness")
+        ):
+            return
+        resolved_language = self._resolve_dataset_language(dataset)
+        if resolved_language == "en":
+            return
+
+        for metric in ragas_metrics:
+            metric_name = getattr(metric, "name", None)
+            if metric_name == "summary_score":
+                self._apply_korean_summary_score_prompts(metric)
+            elif metric_name == "summary_faithfulness":
+                self._apply_korean_summary_faithfulness_prompts(metric)
+
     def _apply_factual_correctness_prompt_defaults(
         self,
         *,
@@ -743,6 +853,56 @@ class RagasEvaluator:
             prompt.language = "ko"
         return True
 
+    def _apply_korean_summary_score_prompts(self, metric: Any) -> bool:
+        question_prompt = getattr(metric, "question_generation_prompt", None)
+        answer_prompt = getattr(metric, "answer_generation_prompt", None)
+        keyphrase_prompt = getattr(metric, "extract_keyphrases_prompt", None)
+        applied = False
+
+        if question_prompt and hasattr(question_prompt, "instruction"):
+            question_prompt.instruction = self.SUMMARY_SCORE_QUESTION_INSTRUCTION
+            if hasattr(question_prompt, "language"):
+                with suppress(Exception):
+                    question_prompt.language = "ko"
+            applied = True
+
+        if answer_prompt and hasattr(answer_prompt, "instruction"):
+            answer_prompt.instruction = self.SUMMARY_SCORE_ANSWER_INSTRUCTION
+            if hasattr(answer_prompt, "language"):
+                with suppress(Exception):
+                    answer_prompt.language = "ko"
+            applied = True
+
+        if keyphrase_prompt and hasattr(keyphrase_prompt, "instruction"):
+            keyphrase_prompt.instruction = self.SUMMARY_SCORE_KEYPHRASE_INSTRUCTION
+            if hasattr(keyphrase_prompt, "language"):
+                with suppress(Exception):
+                    keyphrase_prompt.language = "ko"
+            applied = True
+
+        return applied
+
+    def _apply_korean_summary_faithfulness_prompts(self, metric: Any) -> bool:
+        statement_prompt = getattr(metric, "statement_generator_prompt", None)
+        nli_prompt = getattr(metric, "nli_statements_prompt", None)
+        applied = False
+
+        if statement_prompt and hasattr(statement_prompt, "instruction"):
+            statement_prompt.instruction = self.SUMMARY_FAITHFULNESS_STATEMENT_INSTRUCTION
+            if hasattr(statement_prompt, "language"):
+                with suppress(Exception):
+                    statement_prompt.language = "ko"
+            applied = True
+
+        if nli_prompt and hasattr(nli_prompt, "instruction"):
+            nli_prompt.instruction = self.SUMMARY_FAITHFULNESS_NLI_INSTRUCTION
+            if hasattr(nli_prompt, "language"):
+                with suppress(Exception):
+                    nli_prompt.language = "ko"
+            applied = True
+
+        return applied
+
     def _apply_korean_factual_correctness_prompts(self, metric: Any) -> bool:
         claim_prompt = getattr(metric, "claim_decomposition_prompt", None)
         nli_prompt = getattr(metric, "nli_prompt", None)
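Note: _apply_korean_summary_score_prompts and _apply_korean_summary_faithfulness_prompts never import concrete ragas prompt classes; they patch whatever the installed version exposes by duck typing, setting instruction only when the attribute exists and setting language under contextlib.suppress(Exception). A minimal sketch of that pattern against a stand-in prompt object (FakePrompt, FakeMetric, and apply_instruction are hypothetical illustrations, not evalvault APIs):

from contextlib import suppress
from typing import Any


class FakePrompt:
    """Hypothetical stand-in for a ragas prompt object."""

    def __init__(self) -> None:
        self.instruction = "original instruction"
        self.language = "en"


class FakeMetric:
    def __init__(self) -> None:
        self.question_generation_prompt = FakePrompt()


def apply_instruction(metric: Any, attr: str, instruction: str) -> bool:
    """Duck-typed patch: only touch attributes that actually exist."""
    prompt = getattr(metric, attr, None)
    if not (prompt and hasattr(prompt, "instruction")):
        return False
    prompt.instruction = instruction
    if hasattr(prompt, "language"):
        with suppress(Exception):  # tolerate read-only or validated fields
            prompt.language = "ko"
    return True


metric = FakeMetric()
assert apply_instruction(metric, "question_generation_prompt", "새 지시문")
assert metric.question_generation_prompt.language == "ko"
assert not apply_instruction(metric, "missing_prompt", "ignored")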
@@ -817,6 +977,8 @@ class RagasEvaluator:
                 continue
             prompt_text = prompt_overrides[metric_name]
             applied = self._override_metric_prompt(metric, prompt_text)
+            if not applied and metric_name == "faithfulness":
+                applied = self._override_faithfulness_prompt(metric, prompt_text)
             statuses[metric_name] = "applied" if applied else "unsupported"
             if not applied:
                 logger.warning("Prompt override for metric '%s' could not be applied.", metric_name)
@@ -876,6 +1038,16 @@ class RagasEvaluator:
 
         return False
 
+    @staticmethod
+    def _override_faithfulness_prompt(metric: Any, prompt_text: str) -> bool:
+        target = getattr(metric, "nli_statements_prompt", None)
+        if target is None:
+            return False
+        if hasattr(target, "instruction"):
+            target.instruction = prompt_text
+            return True
+        return False
+
     @staticmethod
     def _extract_prompt_text(value: Any) -> str | None:
         if value is None:
@@ -924,18 +1096,50 @@ class RagasEvaluator:
             metric_name = getattr(metric, "name", None)
             if not metric_name:
                 continue
-            prompt_text = self._collect_metric_prompt_text(metric)
-            if not prompt_text:
-                continue
             requested = bool(prompt_overrides and metric_name in prompt_overrides)
             status = override_status.get(metric_name)
             source = "override" if status == "applied" else "default"
-            snapshots[str(metric_name)] = {
-                "prompt": prompt_text,
-                "source": source,
-                "override_requested": requested,
-                "override_status": status,
-            }
+
+            prompts: dict[str, str] = {}
+            if metric_name == "summary_score":
+                prompts["question_generation"] = (
+                    self._extract_prompt_text(getattr(metric, "question_generation_prompt", None))
+                    or ""
+                )
+                prompts["answer_generation"] = (
+                    self._extract_prompt_text(getattr(metric, "answer_generation_prompt", None))
+                    or ""
+                )
+                prompts["extract_keyphrases"] = (
+                    self._extract_prompt_text(getattr(metric, "extract_keyphrases_prompt", None))
+                    or ""
+                )
+                prompts = {k: v for k, v in prompts.items() if v}
+            elif metric_name == "summary_faithfulness":
+                prompts["statement_generation"] = (
+                    self._extract_prompt_text(getattr(metric, "statement_generator_prompt", None))
+                    or ""
+                )
+                prompts["nli_statements"] = (
+                    self._extract_prompt_text(getattr(metric, "nli_statements_prompt", None)) or ""
+                )
+                prompts = {k: v for k, v in prompts.items() if v}
+
+            prompt_text = self._collect_metric_prompt_text(metric)
+            if prompts:
+                snapshots[str(metric_name)] = {
+                    "prompts": prompts,
+                    "source": source,
+                    "override_requested": requested,
+                    "override_status": status,
+                }
+            elif prompt_text:
+                snapshots[str(metric_name)] = {
+                    "prompt": prompt_text,
+                    "source": source,
+                    "override_requested": requested,
+                    "override_status": status,
+                }
         return snapshots
 
     async def _evaluate_sequential(
@@ -1133,16 +1337,26 @@ class RagasEvaluator:
         claim_details: dict[str, ClaimLevelResult] = {}
 
         for metric in ragas_metrics:
-            if metric.name in self.FAITHFULNESS_METRICS
-                if
-
-
-
+            if metric.name in self.FAITHFULNESS_METRICS:
+                if self._active_llm_provider == "ollama":
+                    fallback_score = self._fallback_korean_faithfulness(
+                        sample, return_details=False
+                    )
+                    if fallback_score is None:
+                        fallback_score = await self._score_faithfulness_with_fallback(sample)
+                    if fallback_score is not None:
+                        scores[metric.name] = fallback_score
+                    continue
+                if self._faithfulness_ragas_failed:
+                    if metric.name == "summary_faithfulness":
+                        judge_score = await self._score_summary_faithfulness_judge(sample)
+                        if judge_score is not None:
+                            scores[metric.name] = judge_score
+                            continue
+                    fallback_score = await self._score_faithfulness_with_fallback(sample)
+                    if fallback_score is not None:
+                        scores[metric.name] = fallback_score
                     continue
-                fallback_score = await self._score_faithfulness_with_fallback(sample)
-                if fallback_score is not None:
-                    scores[metric.name] = fallback_score
-                continue
             try:
                 # Ragas >=0.4 uses ascore() with kwargs
                 if hasattr(metric, "ascore"):
@@ -1270,6 +1484,32 @@ class RagasEvaluator:
         normalized = str(domain).strip().lower()
         return cls.SUMMARY_SCORE_COEFF_BY_DOMAIN.get(normalized, cls.SUMMARY_SCORE_COEFF)
 
+    def _build_custom_prompt_snapshots(self, snapshot: dict[str, Any]) -> dict[str, dict[str, Any]]:
+        entries = snapshot.get("metrics") if isinstance(snapshot, dict) else None
+        if not isinstance(entries, list):
+            return {}
+        prompt_snapshot: dict[str, dict[str, Any]] = {}
+        for entry in entries:
+            if not isinstance(entry, dict):
+                continue
+            name = entry.get("metric_name")
+            if not isinstance(name, str) or not name:
+                continue
+            evaluation_process = entry.get("evaluation_process")
+            if not isinstance(evaluation_process, str) or not evaluation_process:
+                continue
+            rules = entry.get("rules") if isinstance(entry.get("rules"), dict) else None
+            prompts: dict[str, str] = {"rule": evaluation_process}
+            if rules:
+                prompts["rules"] = json.dumps(rules, ensure_ascii=False, indent=2)
+            prompt_snapshot[name] = {
+                "prompts": prompts,
+                "source": "custom_rules",
+                "rules": rules,
+                "inputs": entry.get("inputs"),
+            }
+        return prompt_snapshot
+
     def _build_summary_score_metric(self, metric_class, ragas_llm, coeff: float | None = None):
         if coeff is None:
             coeff = self.SUMMARY_SCORE_COEFF
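Note: _build_custom_prompt_snapshots accepts the snapshot produced by build_custom_metric_snapshot, keeps only entries with a non-empty metric_name and evaluation_process, and serializes any rules dict as pretty-printed JSON under the prompts key. A worked example of the input and output shape, using a single hypothetical entry (the rule text and threshold are illustrative):

import json

# Hypothetical snapshot entry, shaped like the dicts the method iterates over.
snapshot = {
    "metrics": [
        {
            "metric_name": "summary_accuracy",
            "evaluation_process": "Compare each claim in the summary against the source contexts.",
            "rules": {"threshold": 0.9},
            "inputs": ["answer", "contexts"],
        }
    ]
}

entry = snapshot["metrics"][0]
expected = {
    "summary_accuracy": {
        "prompts": {
            "rule": entry["evaluation_process"],
            "rules": json.dumps(entry["rules"], ensure_ascii=False, indent=2),
        },
        "source": "custom_rules",
        "rules": entry["rules"],
        "inputs": entry["inputs"],
    }
}
# _build_custom_prompt_snapshots(snapshot) would return a mapping of this shape,
# which then lands in run.tracker_metadata["custom_prompt_snapshots"].
print(json.dumps(expected, ensure_ascii=False, indent=2))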
@@ -1651,9 +1891,11 @@ class RagasEvaluator:
                     contexts=test_case.contexts,
                 )
             else:
-                score = metric_instance.score(
+                score = self._score_custom_metric_with_metadata(
+                    metric_instance,
                     answer=test_case.answer,
                     contexts=test_case.contexts,
+                    metadata=test_case.metadata,
                 )
             scores[metric_name] = score
 
@@ -1674,6 +1916,19 @@ class RagasEvaluator:
 
         return results
 
+    def _score_custom_metric_with_metadata(
+        self,
+        metric_instance: Any,
+        *,
+        answer: str,
+        contexts: list[str],
+        metadata: dict[str, Any],
+    ) -> float:
+        try:
+            return float(metric_instance.score(answer=answer, contexts=contexts, metadata=metadata))
+        except TypeError:
+            return float(metric_instance.score(answer=answer, contexts=contexts))
+
     def _calculate_cost(self, model_name: str, prompt_tokens: int, completion_tokens: int) -> float:
         """Calculate estimated cost in USD based on model pricing."""
         # Find matching model key (exact or substring match)