evalvault 1.62.1__py3-none-any.whl → 1.63.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/adapter.py +190 -19
- evalvault/adapters/inbound/api/routers/runs.py +66 -2
- evalvault/adapters/inbound/cli/commands/method.py +5 -2
- evalvault/adapters/inbound/cli/commands/prompts.py +613 -5
- evalvault/adapters/inbound/cli/commands/run.py +43 -2
- evalvault/adapters/inbound/cli/commands/run_helpers.py +10 -0
- evalvault/adapters/inbound/mcp/tools.py +5 -2
- evalvault/adapters/outbound/analysis/ragas_evaluator_module.py +13 -9
- evalvault/adapters/outbound/llm/__init__.py +5 -43
- evalvault/adapters/outbound/llm/anthropic_adapter.py +27 -7
- evalvault/adapters/outbound/llm/factory.py +103 -0
- evalvault/adapters/outbound/llm/llm_relation_augmenter.py +39 -14
- evalvault/adapters/outbound/llm/ollama_adapter.py +34 -10
- evalvault/adapters/outbound/llm/openai_adapter.py +41 -8
- evalvault/adapters/outbound/llm/token_aware_chat.py +21 -2
- evalvault/adapters/outbound/llm/vllm_adapter.py +39 -8
- evalvault/adapters/outbound/nlp/korean/toolkit_factory.py +20 -0
- evalvault/adapters/outbound/report/llm_report_generator.py +90 -6
- evalvault/adapters/outbound/storage/base_sql.py +527 -21
- evalvault/adapters/outbound/storage/postgres_adapter.py +209 -0
- evalvault/adapters/outbound/storage/postgres_schema.sql +38 -0
- evalvault/adapters/outbound/storage/sqlite_adapter.py +86 -5
- evalvault/debug_ragas.py +7 -1
- evalvault/debug_ragas_real.py +5 -1
- evalvault/domain/entities/__init__.py +10 -0
- evalvault/domain/entities/prompt_suggestion.py +50 -0
- evalvault/domain/services/__init__.py +6 -0
- evalvault/domain/services/evaluator.py +191 -103
- evalvault/domain/services/holdout_splitter.py +67 -0
- evalvault/domain/services/intent_classifier.py +73 -0
- evalvault/domain/services/pipeline_template_registry.py +3 -0
- evalvault/domain/services/prompt_candidate_service.py +117 -0
- evalvault/domain/services/prompt_registry.py +40 -2
- evalvault/domain/services/prompt_scoring_service.py +286 -0
- evalvault/domain/services/prompt_suggestion_reporter.py +277 -0
- evalvault/domain/services/synthetic_qa_generator.py +4 -3
- evalvault/ports/inbound/learning_hook_port.py +4 -1
- evalvault/ports/outbound/__init__.py +2 -0
- evalvault/ports/outbound/llm_factory_port.py +13 -0
- evalvault/ports/outbound/llm_port.py +34 -2
- evalvault/ports/outbound/storage_port.py +38 -0
- {evalvault-1.62.1.dist-info → evalvault-1.63.0.dist-info}/METADATA +228 -4
- {evalvault-1.62.1.dist-info → evalvault-1.63.0.dist-info}/RECORD +46 -38
- {evalvault-1.62.1.dist-info → evalvault-1.63.0.dist-info}/WHEEL +0 -0
- {evalvault-1.62.1.dist-info → evalvault-1.63.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.62.1.dist-info → evalvault-1.63.0.dist-info}/licenses/LICENSE.md +0 -0

evalvault/domain/entities/prompt_suggestion.py (new file)
@@ -0,0 +1,50 @@
+"""Prompt suggestion entities."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+
+@dataclass(frozen=True)
+class PromptCandidate:
+    """Single prompt candidate for suggestion workflow."""
+
+    candidate_id: str
+    source: str
+    content: str
+    metadata: dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass(frozen=True)
+class PromptCandidateSampleScore:
+    sample_index: int
+    scores: dict[str, float]
+    weighted_score: float
+    responses: list[dict[str, Any]] = field(default_factory=list)
+
+
+@dataclass(frozen=True)
+class PromptCandidateScore:
+    """Evaluation score for a prompt candidate."""
+
+    candidate_id: str
+    scores: dict[str, float]
+    weighted_score: float
+    sample_scores: list[PromptCandidateSampleScore] = field(default_factory=list)
+    selected_sample_index: int | None = None
+
+
+@dataclass(frozen=True)
+class PromptSuggestionResult:
+    """Aggregated prompt suggestion results."""
+
+    run_id: str
+    role: str
+    metrics: list[str]
+    weights: dict[str, float]
+    candidates: list[PromptCandidate]
+    scores: list[PromptCandidateScore]
+    ranking: list[str]
+    holdout_ratio: float
+    metadata: dict[str, Any] = field(default_factory=dict)
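The new entities are plain frozen dataclasses, so a prompt-suggestion result can be assembled directly from them. A minimal sketch; all field values below are illustrative, not taken from the package:

```python
from evalvault.domain.entities.prompt_suggestion import (
    PromptCandidate,
    PromptCandidateScore,
    PromptSuggestionResult,
)

# Illustrative values only; the weighting scheme shown here is an assumption.
candidate = PromptCandidate(
    candidate_id="cand-001",
    source="baseline",
    content="You answer strictly from the provided context.",
)
score = PromptCandidateScore(
    candidate_id=candidate.candidate_id,
    scores={"faithfulness": 0.82, "answer_relevancy": 0.76},
    weighted_score=0.6 * 0.82 + 0.4 * 0.76,
)
result = PromptSuggestionResult(
    run_id="run-123",
    role="system",
    metrics=["faithfulness", "answer_relevancy"],
    weights={"faithfulness": 0.6, "answer_relevancy": 0.4},
    candidates=[candidate],
    scores=[score],
    ranking=[candidate.candidate_id],
    holdout_ratio=0.2,
)
```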

evalvault/domain/services/__init__.py
@@ -4,8 +4,11 @@ from evalvault.domain.services.analysis_service import AnalysisService
 from evalvault.domain.services.dataset_preprocessor import DatasetPreprocessor
 from evalvault.domain.services.domain_learning_hook import DomainLearningHook
 from evalvault.domain.services.evaluator import RagasEvaluator
+from evalvault.domain.services.holdout_splitter import split_dataset_holdout
 from evalvault.domain.services.improvement_guide_service import ImprovementGuideService
 from evalvault.domain.services.method_runner import MethodRunnerService, MethodRunResult
+from evalvault.domain.services.prompt_scoring_service import PromptScoringService
+from evalvault.domain.services.prompt_suggestion_reporter import PromptSuggestionReporter
 
 __all__ = [
     "AnalysisService",
@@ -14,5 +17,8 @@ __all__ = [
     "ImprovementGuideService",
     "MethodRunnerService",
     "MethodRunResult",
+    "PromptScoringService",
+    "PromptSuggestionReporter",
     "RagasEvaluator",
+    "split_dataset_holdout",
 ]

evalvault/domain/services/evaluator.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import asyncio
+import importlib
 import json
 import logging
 import math
@@ -10,36 +11,10 @@ from collections.abc import Callable, Sequence
 from contextlib import suppress
 from dataclasses import dataclass
 from datetime import datetime
-from typing import Any
+from typing import Any, Literal, cast, overload
 
 from ragas import SingleTurnSample
 
-try: # Ragas >=0.2.0
-    from ragas.metrics.collections import (
-        AnswerRelevancy,
-        ContextPrecision,
-        ContextRecall,
-        FactualCorrectness,
-        Faithfulness,
-        SemanticSimilarity,
-    )
-except ImportError: # pragma: no cover - fallback for older Ragas versions
-    from ragas.metrics import (
-        AnswerRelevancy,
-        ContextPrecision,
-        ContextRecall,
-        FactualCorrectness,
-        Faithfulness,
-        SemanticSimilarity,
-    )
-try: # SummaryScore lives in different modules depending on Ragas version
-    from ragas.metrics.collections import SummaryScore as RagasSummaryScore
-except ImportError: # pragma: no cover - fallback for older Ragas versions
-    try:
-        from ragas.metrics import SummarizationScore as RagasSummaryScore
-    except ImportError: # pragma: no cover - no summary support available
-        RagasSummaryScore = None
-
 from evalvault.domain.entities import (
     ClaimLevelResult,
     ClaimVerdict,
@@ -59,9 +34,55 @@ from evalvault.domain.metrics.text_match import ExactMatch, F1Score
 from evalvault.domain.services.batch_executor import run_in_batches
 from evalvault.domain.services.dataset_preprocessor import DatasetPreprocessor
 from evalvault.domain.services.retriever_context import apply_retriever_to_dataset
-from evalvault.ports.outbound.korean_nlp_port import RetrieverPort
+from evalvault.ports.outbound.korean_nlp_port import KoreanNLPToolkitPort, RetrieverPort
+from evalvault.ports.outbound.llm_factory_port import LLMFactoryPort
 from evalvault.ports.outbound.llm_port import LLMPort
 
+_SUMMARY_FAITHFULNESS_PROMPT_KO = (
+    "당신은 요약 충실도 판정자입니다.\n"
+    "컨텍스트와 요약을 보고 요약의 모든 주장이 컨텍스트에 의해 뒷받침되는지 판단하세요.\n"
+    "숫자, 조건, 면책, 기간, 자격 등이 누락되거나 추가되거나 모순되면 verdict는 unsupported입니다.\n"
+    'JSON만 반환: {"verdict": "supported|unsupported", "reason": "..."}\n\n'
+    "컨텍스트:\n{context}\n\n요약:\n{summary}\n"
+)
+_SUMMARY_FAITHFULNESS_PROMPT_EN = (
+    "You are a strict summarization faithfulness judge.\n"
+    "Given the CONTEXT and SUMMARY, determine whether every claim in SUMMARY is supported by CONTEXT.\n"
+    "If any numbers, conditions, exclusions, durations, or eligibility are missing, added, or "
+    "contradicted, verdict is unsupported.\n"
+    'Return JSON only: {"verdict": "supported|unsupported", "reason": "..."}\n\n'
+    "CONTEXT:\n{context}\n\nSUMMARY:\n{summary}\n"
+)
+
+
+def _import_metric(name: str) -> type[Any]:
+    for module_name in ("ragas.metrics.collections", "ragas.metrics"):
+        try:
+            module = importlib.import_module(module_name)
+        except Exception:
+            continue
+        if hasattr(module, name):
+            return cast(type[Any], getattr(module, name))
+    raise ImportError(f"Missing ragas metric: {name}")
+
+
+def _import_optional_metric(names: list[str]) -> type[Any] | None:
+    for name in names:
+        try:
+            return _import_metric(name)
+        except Exception:
+            continue
+    return None
+
+
+AnswerRelevancy = _import_metric("AnswerRelevancy")
+ContextPrecision = _import_metric("ContextPrecision")
+ContextRecall = _import_metric("ContextRecall")
+FactualCorrectness = _import_metric("FactualCorrectness")
+Faithfulness = _import_metric("Faithfulness")
+SemanticSimilarity = _import_metric("SemanticSimilarity")
+RagasSummaryScore = _import_optional_metric(["SummaryScore", "SummarizationScore"])
+
 logger = logging.getLogger(__name__)
 
 
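The former try/except import ladder is replaced by `_import_metric`, which probes `ragas.metrics.collections` first and then `ragas.metrics`, so the module works across Ragas releases that moved the metric classes. A standalone sketch of the same lookup order (metric name chosen for illustration):

```python
import importlib

# Probe the newer location first, then the legacy one, mirroring _import_metric.
for module_name in ("ragas.metrics.collections", "ragas.metrics"):
    try:
        module = importlib.import_module(module_name)
    except ImportError:
        continue
    if hasattr(module, "Faithfulness"):
        print(f"Faithfulness resolved from {module_name}")
        break
else:
    print("Faithfulness not available in this Ragas installation")
```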
@@ -247,9 +268,16 @@ class RagasEvaluator:
         "openai/gpt-5-nano": (5.00, 15.00),
     }
 
-    def __init__(
+    def __init__(
+        self,
+        *,
+        preprocessor: DatasetPreprocessor | None = None,
+        korean_toolkit: KoreanNLPToolkitPort | None = None,
+        llm_factory: LLMFactoryPort | None = None,
+    ) -> None:
         self._preprocessor = preprocessor or DatasetPreprocessor()
-        self._korean_toolkit =
+        self._korean_toolkit = korean_toolkit
+        self._llm_factory = llm_factory
         self._faithfulness_ragas_failed = False
         self._faithfulness_fallback_llm = None
         self._faithfulness_fallback_metric = None
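With the keyword-only constructor, the Korean toolkit and the fallback-LLM factory become injected ports instead of ad-hoc imports inside the evaluator. A minimal construction sketch; every argument is optional:

```python
from evalvault.domain.services.evaluator import RagasEvaluator

evaluator = RagasEvaluator(
    preprocessor=None,    # defaults to DatasetPreprocessor()
    korean_toolkit=None,  # pass a KoreanNLPToolkitPort implementation to enable the Korean faithfulness fallback
    llm_factory=None,     # pass an LLMFactoryPort implementation to enable the fallback judge LLM
)
```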
@@ -258,6 +286,7 @@ class RagasEvaluator:
         self._active_llm_provider = None
         self._active_llm_model = None
         self._active_llm = None
+        self._prompt_language = None
 
     async def evaluate(
         self,
@@ -273,6 +302,7 @@ class RagasEvaluator:
         on_progress: Callable[[int, int, str], None] | None = None,
         prompt_overrides: dict[str, str] | None = None,
         claim_level: bool = False,
+        language: str | None = None,
     ) -> EvaluationRun:
         """데이터셋을 Ragas로 평가.
 
@@ -299,6 +329,7 @@ class RagasEvaluator:
         self._active_llm_provider = getattr(llm, "provider_name", None)
         self._active_llm_model = llm.get_model_name()
         self._active_llm = llm
+        self._prompt_language = self._normalize_language_hint(language) if language else None
         # Resolve thresholds: CLI > dataset > default(0.7)
         resolved_thresholds = {}
         for metric in metrics:
@@ -359,7 +390,11 @@ class RagasEvaluator:
         eval_results_by_test_case = {}
         if ragas_metrics:
             run.tracker_metadata["ragas_config"] = self._build_ragas_config(llm)
-            eval_results_by_test_case, override_status = await self._evaluate_with_ragas(
+            (
+                eval_results_by_test_case,
+                override_status,
+                prompt_snapshots,
+            ) = await self._evaluate_with_ragas(
                 dataset=dataset,
                 metrics=ragas_metrics,
                 llm=llm,
@@ -370,6 +405,8 @@ class RagasEvaluator:
             )
             if override_status:
                 run.tracker_metadata["ragas_prompt_overrides"] = override_status
+            if prompt_snapshots:
+                run.tracker_metadata["ragas_prompt_snapshots"] = prompt_snapshots
         elif prompt_overrides:
             logger.warning("Ragas prompt overrides provided but no Ragas metrics requested.")
 
@@ -485,7 +522,7 @@ class RagasEvaluator:
         batch_size: int = 5,
         on_progress: Callable[[int, int, str], None] | None = None,
         prompt_overrides: dict[str, str] | None = None,
-    ) -> tuple[dict[str, TestCaseEvalResult], dict[str, str]]:
+    ) -> tuple[dict[str, TestCaseEvalResult], dict[str, str], dict[str, dict[str, Any]]]:
         """Ragas로 실제 평가 수행.
 
         Args:
@@ -496,7 +533,7 @@ class RagasEvaluator:
             batch_size: 병렬 처리 시 배치 크기
 
         Returns:
-            (테스트 케이스 ID별 평가 결과, 프롬프트 오버라이드 적용
+            (테스트 케이스 ID별 평가 결과, 프롬프트 오버라이드 적용 상태, 프롬프트 스냅샷)
             예: {"tc-001": TestCaseEvalResult(...)}
         """
 
@@ -554,6 +591,12 @@ class RagasEvaluator:
         if prompt_overrides:
             override_status = self._apply_prompt_overrides(ragas_metrics, prompt_overrides)
 
+        prompt_snapshots = self._collect_ragas_prompt_snapshots(
+            ragas_metrics,
+            prompt_overrides,
+            override_status,
+        )
+
         # 병렬 처리 vs 순차 처리
         if parallel and len(ragas_samples) > 1:
             return (
@@ -566,6 +609,7 @@ class RagasEvaluator:
                     on_progress=on_progress,
                 ),
                 override_status,
+                prompt_snapshots,
             )
         return (
             await self._evaluate_sequential(
@@ -576,6 +620,7 @@ class RagasEvaluator:
                 on_progress=on_progress,
             ),
             override_status,
+            prompt_snapshots,
         )
 
     def _apply_answer_relevancy_prompt_defaults(
@@ -619,6 +664,8 @@ class RagasEvaluator:
         self._apply_korean_factual_correctness_prompts(metric)
 
     def _resolve_dataset_language(self, dataset: Dataset) -> str | None:
+        if self._prompt_language:
+            return self._prompt_language
         metadata = dataset.metadata if isinstance(dataset.metadata, dict) else {}
         for key in ("language", "lang", "locale"):
             normalized = self._normalize_language_hint(metadata.get(key))
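An explicit `language` argument now wins over any language hint stored in the dataset metadata. A self-contained sketch of that precedence (the normalization details below are an assumption; the evaluator delegates to its own `_normalize_language_hint`, which is not shown in this diff):

```python
def resolve_prompt_language(explicit: str | None, metadata: dict) -> str | None:
    """Explicit hint first, then dataset metadata keys, mirroring _resolve_dataset_language."""

    def normalize(value) -> str | None:
        # Assumed normalization; the real helper may differ.
        if isinstance(value, str) and value.strip():
            return value.strip().lower()[:2]
        return None

    if explicit and normalize(explicit):
        return normalize(explicit)
    for key in ("language", "lang", "locale"):
        hint = normalize(metadata.get(key))
        if hint:
            return hint
    return None


print(resolve_prompt_language("ko", {"language": "en"}))   # "ko": the explicit argument wins
print(resolve_prompt_language(None, {"locale": "en-US"}))  # "en": falls back to metadata
```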
@@ -784,10 +831,10 @@ class RagasEvaluator:
         if isinstance(target, str):
             metric.prompt = prompt_text
             return True
-        if hasattr(target, "template"):
+        if target is not None and hasattr(target, "template"):
             target.template = prompt_text
             return True
-        if hasattr(target, "instruction"):
+        if target is not None and hasattr(target, "instruction"):
             target.instruction = prompt_text
             return True
 
@@ -796,10 +843,10 @@ class RagasEvaluator:
         if isinstance(target, str):
             metric.question_generation = prompt_text
             return True
-        if hasattr(target, "template"):
+        if target is not None and hasattr(target, "template"):
             target.template = prompt_text
             return True
-        if hasattr(target, "instruction"):
+        if target is not None and hasattr(target, "instruction"):
             target.instruction = prompt_text
             return True
 
@@ -829,6 +876,68 @@ class RagasEvaluator:
 
         return False
 
+    @staticmethod
+    def _extract_prompt_text(value: Any) -> str | None:
+        if value is None:
+            return None
+        if isinstance(value, str):
+            return value
+        for attr in ("template", "instruction", "prompt", "text"):
+            try:
+                candidate = getattr(value, attr)
+            except Exception:
+                continue
+            if isinstance(candidate, str) and candidate.strip():
+                return candidate
+        return None
+
+    def _collect_metric_prompt_text(self, metric: Any) -> str | None:
+        for attr in ("prompt", "question_generation"):
+            if hasattr(metric, attr):
+                try:
+                    value = getattr(metric, attr)
+                except Exception:
+                    continue
+                text = self._extract_prompt_text(value)
+                if text:
+                    return text
+        for attr in dir(metric):
+            if not attr.endswith("_prompt") or attr == "prompt":
+                continue
+            try:
+                value = getattr(metric, attr)
+            except Exception:
+                continue
+            text = self._extract_prompt_text(value)
+            if text:
+                return text
+        return None
+
+    def _collect_ragas_prompt_snapshots(
+        self,
+        ragas_metrics: list[Any],
+        prompt_overrides: dict[str, str] | None,
+        override_status: dict[str, str],
+    ) -> dict[str, dict[str, Any]]:
+        snapshots: dict[str, dict[str, Any]] = {}
+        for metric in ragas_metrics:
+            metric_name = getattr(metric, "name", None)
+            if not metric_name:
+                continue
+            prompt_text = self._collect_metric_prompt_text(metric)
+            if not prompt_text:
+                continue
+            requested = bool(prompt_overrides and metric_name in prompt_overrides)
+            status = override_status.get(metric_name)
+            source = "override" if status == "applied" else "default"
+            snapshots[str(metric_name)] = {
+                "prompt": prompt_text,
+                "source": source,
+                "override_requested": requested,
+                "override_status": status,
+            }
+        return snapshots
+
     async def _evaluate_sequential(
         self,
         dataset: Dataset,
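Each Ragas metric that exposes a prompt is now recorded in `run.tracker_metadata["ragas_prompt_snapshots"]`, keyed by metric name. An illustrative entry (metric name and prompt text are placeholders):

```python
# Shape of run.tracker_metadata["ragas_prompt_snapshots"] after a run
snapshots = {
    "faithfulness": {
        "prompt": "<full prompt text captured from the metric>",
        "source": "override",          # "override" if a requested override was applied, else "default"
        "override_requested": True,    # an override for this metric name was passed in
        "override_status": "applied",  # raw status reported by _apply_prompt_overrides, or None
    },
}
```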
@@ -1173,6 +1282,22 @@ class RagasEvaluator:
     def default_threshold_for(cls, metric_name: str) -> float:
         return cls.DEFAULT_METRIC_THRESHOLDS.get(metric_name, cls.DEFAULT_THRESHOLD_FALLBACK)
 
+    @overload
+    def _fallback_korean_faithfulness(
+        self,
+        sample: SingleTurnSample,
+        *,
+        return_details: Literal[True],
+    ) -> ClaimLevelResult | None: ...
+
+    @overload
+    def _fallback_korean_faithfulness(
+        self,
+        sample: SingleTurnSample,
+        *,
+        return_details: Literal[False] = False,
+    ) -> float | None: ...
+
     def _fallback_korean_faithfulness(
         self, sample: SingleTurnSample, *, return_details: bool = False
     ) -> float | ClaimLevelResult | None:
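The two `@overload` stubs exist so static type checkers can narrow the return type on the `return_details` flag: `Literal[True]` yields `ClaimLevelResult | None`, the default yields `float | None`. The same pattern in a self-contained form:

```python
from typing import Literal, overload


@overload
def measure(text: str, *, return_details: Literal[True]) -> dict[str, float]: ...
@overload
def measure(text: str, *, return_details: Literal[False] = False) -> float: ...
def measure(text: str, *, return_details: bool = False) -> float | dict[str, float]:
    score = min(len(text) / 100, 1.0)
    return {"score": score, "length": float(len(text))} if return_details else score


plain: float = measure("hello")                                    # checker infers float
detail: dict[str, float] = measure("hello", return_details=True)   # checker infers dict
```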
@@ -1194,11 +1319,7 @@ class RagasEvaluator:
             return None
 
         if self._korean_toolkit is None:
-            try:
-                from evalvault.adapters.outbound.nlp.korean.toolkit import KoreanNLPToolkit
-            except Exception: # pragma: no cover - optional dependency
-                return None
-            self._korean_toolkit = KoreanNLPToolkit()
+            return None
 
         try:
             result = self._korean_toolkit.check_faithfulness(
@@ -1212,6 +1333,8 @@ class RagasEvaluator:
             return self._convert_to_claim_level_result(result, test_case_id="")
 
         score = getattr(result, "score", None)
+        if score is None:
+            return None
         try:
             return float(score)
         except (TypeError, ValueError):
@@ -1291,14 +1414,11 @@ class RagasEvaluator:
             return None
 
         context = "\n\n".join(sample.retrieved_contexts)
-
-
-
-            "If any numbers, conditions, exclusions, durations, or eligibility are missing, added, or "
-            "contradicted, verdict is unsupported.\n"
-            'Return JSON only: {"verdict": "supported|unsupported", "reason": "..."}\n\n'
-            f"CONTEXT:\n{context}\n\nSUMMARY:\n{sample.response}\n"
+        language = self._prompt_language or "ko"
+        template = (
+            _SUMMARY_FAITHFULNESS_PROMPT_EN if language == "en" else _SUMMARY_FAITHFULNESS_PROMPT_KO
         )
+        prompt = template.format(context=context, summary=sample.response)
 
         try:
             response_text = await asyncio.to_thread(llm.generate_text, prompt, json_mode=True)
@@ -1340,7 +1460,7 @@ class RagasEvaluator:
     ) -> float | None:
         metric = self._get_faithfulness_fallback_metric()
         if metric is None:
-            return self._fallback_korean_faithfulness(sample)
+            return self._fallback_korean_faithfulness(sample, return_details=False)
 
         try:
             if hasattr(metric, "ascore"):
@@ -1368,6 +1488,8 @@ class RagasEvaluator:
             else:
                 score_value = result
 
+            if score_value is None:
+                raise ValueError("Metric returned None")
             score_value = float(score_value)
             if math.isnan(score_value):
                 raise ValueError("Metric returned NaN")
@@ -1379,7 +1501,7 @@ class RagasEvaluator:
                 self._summarize_ragas_error(exc),
             )
             self._faithfulness_fallback_failed = True
-            return self._fallback_korean_faithfulness(sample)
+            return self._fallback_korean_faithfulness(sample, return_details=False)
 
     def _get_faithfulness_fallback_metric(self):
         if self._faithfulness_fallback_failed:
@@ -1411,29 +1533,14 @@ class RagasEvaluator:
             return None
         if self._faithfulness_fallback_llm is not None:
             return self._faithfulness_fallback_llm
-
-        try:
-            from evalvault.adapters.outbound.llm import create_llm_adapter_for_model
-            from evalvault.config.settings import Settings
-        except Exception:
-            return None
-
-        settings = Settings()
-        provider, model = self._resolve_faithfulness_fallback_config(settings)
-        if not provider or not model:
+        if self._llm_factory is None:
             return None
 
         try:
-            llm =
-
-
-
-                "Faithfulness fallback LLM enabled: %s/%s",
-                provider,
-                model,
-            )
-            self._faithfulness_fallback_logged = True
-            return llm
+            llm = self._llm_factory.create_faithfulness_fallback(
+                self._active_llm_provider,
+                self._active_llm_model,
+            )
         except Exception as exc:
             if not self._faithfulness_fallback_failed:
                 logger.warning(
@@ -1443,39 +1550,20 @@ class RagasEvaluator:
             self._faithfulness_fallback_failed = True
             return None
 
-
-
-            settings.faithfulness_fallback_provider.strip().lower()
-            if settings.faithfulness_fallback_provider
-            else None
-        )
-        model = settings.faithfulness_fallback_model
-        active_provider = (
-            self._active_llm_provider.strip().lower()
-            if isinstance(self._active_llm_provider, str) and self._active_llm_provider.strip()
-            else None
-        )
-        default_provider = active_provider or settings.llm_provider.lower()
-
-        if not provider and model:
-            provider = default_provider
-        if provider and not model:
-            model = self._default_faithfulness_fallback_model(provider)
-        if not provider and not model:
-            provider = default_provider
-            model = self._default_faithfulness_fallback_model(default_provider)
-
-        if not provider or not model:
-            return None, None
-        return provider, model
+        if llm is None:
+            return None
 
-
-
-
-
-
-
-
+        self._faithfulness_fallback_llm = llm
+        if not self._faithfulness_fallback_logged:
+            provider = getattr(llm, "provider_name", None)
+            model = llm.get_model_name()
+            logger.warning(
+                "Faithfulness fallback LLM enabled: %s/%s",
+                provider,
+                model,
+            )
+            self._faithfulness_fallback_logged = True
+        return llm
 
     @staticmethod
     def _contains_korean(text: str) -> bool:
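The evaluator now asks an injected `LLMFactoryPort` for the fallback judge instead of importing `Settings` and `create_llm_adapter_for_model` itself. The actual port definition lives in the new `evalvault/ports/outbound/llm_factory_port.py` and is not shown in this diff; inferred only from the call site above, a compatible shape could look like this (treat the names and signature as assumptions):

```python
from typing import Protocol

from evalvault.ports.outbound.llm_port import LLMPort


class FaithfulnessFallbackFactory(Protocol):
    """Sketch of what RagasEvaluator expects from its llm_factory; the real LLMFactoryPort may differ."""

    def create_faithfulness_fallback(
        self,
        active_provider: str | None,
        active_model: str | None,
    ) -> LLMPort | None:
        """Return a judge LLM, or None to disable the fallback path."""
        ...
```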

evalvault/domain/services/holdout_splitter.py (new file)
@@ -0,0 +1,67 @@
+from __future__ import annotations
+
+import random
+
+from evalvault.domain.entities import Dataset, TestCase
+
+
+def split_dataset_holdout(
+    *,
+    dataset: Dataset,
+    holdout_ratio: float,
+    seed: int | None,
+) -> tuple[Dataset, Dataset]:
+    if holdout_ratio < 0 or holdout_ratio >= 1:
+        raise ValueError("holdout_ratio must be in [0, 1).")
+
+    total = len(dataset.test_cases)
+    if total == 0:
+        return _clone_dataset(dataset, "dev", []), _clone_dataset(dataset, "holdout", [])
+
+    holdout_size = int(total * holdout_ratio)
+    if holdout_ratio > 0 and holdout_size == 0:
+        holdout_size = 1
+    if holdout_size >= total:
+        holdout_size = total - 1
+
+    rng = random.Random(seed)
+    indices = list(range(total))
+    rng.shuffle(indices)
+
+    holdout_indices = set(indices[:holdout_size])
+    dev_cases: list[TestCase] = []
+    holdout_cases: list[TestCase] = []
+
+    for idx, test_case in enumerate(dataset.test_cases):
+        if idx in holdout_indices:
+            holdout_cases.append(test_case)
+        else:
+            dev_cases.append(test_case)
+
+    return (
+        _clone_dataset(dataset, "dev", dev_cases, holdout_ratio, seed),
+        _clone_dataset(dataset, "holdout", holdout_cases, holdout_ratio, seed),
+    )
+
+
+def _clone_dataset(
+    dataset: Dataset,
+    split: str,
+    test_cases: list[TestCase],
+    holdout_ratio: float | None = None,
+    seed: int | None = None,
+) -> Dataset:
+    metadata = dict(dataset.metadata or {})
+    metadata["split"] = split
+    if holdout_ratio is not None:
+        metadata.setdefault("holdout_ratio", holdout_ratio)
+    if seed is not None:
+        metadata.setdefault("split_seed", seed)
+    return Dataset(
+        name=dataset.name,
+        version=dataset.version,
+        test_cases=list(test_cases),
+        metadata=metadata,
+        source_file=dataset.source_file,
+        thresholds=dict(dataset.thresholds),
+    )
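Typical use of the new splitter, assuming `dataset` is an already-loaded evalvault `Dataset`: roughly `holdout_ratio` of the test cases (at least one, never all) go to the holdout split and the rest stay in the dev split, with the split recorded in each copy's metadata.

```python
from evalvault.domain.services import split_dataset_holdout

dev_set, holdout_set = split_dataset_holdout(
    dataset=dataset,     # an already-loaded Dataset (not constructed here)
    holdout_ratio=0.2,   # reserve ~20% of test cases for holdout scoring
    seed=42,             # fixes the shuffle so the split is reproducible
)
assert dev_set.metadata["split"] == "dev"
assert holdout_set.metadata["split"] == "holdout"
```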
|