evalvault 1.62.1__py3-none-any.whl → 1.63.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. evalvault/adapters/inbound/api/adapter.py +190 -19
  2. evalvault/adapters/inbound/api/routers/runs.py +66 -2
  3. evalvault/adapters/inbound/cli/commands/method.py +5 -2
  4. evalvault/adapters/inbound/cli/commands/prompts.py +613 -5
  5. evalvault/adapters/inbound/cli/commands/run.py +88 -5
  6. evalvault/adapters/inbound/cli/commands/run_helpers.py +12 -0
  7. evalvault/adapters/inbound/mcp/tools.py +5 -2
  8. evalvault/adapters/outbound/analysis/ragas_evaluator_module.py +13 -9
  9. evalvault/adapters/outbound/improvement/pattern_detector.py +1 -1
  10. evalvault/adapters/outbound/improvement/playbook_loader.py +1 -1
  11. evalvault/adapters/outbound/llm/__init__.py +5 -43
  12. evalvault/adapters/outbound/llm/anthropic_adapter.py +27 -7
  13. evalvault/adapters/outbound/llm/factory.py +103 -0
  14. evalvault/adapters/outbound/llm/llm_relation_augmenter.py +39 -14
  15. evalvault/adapters/outbound/llm/ollama_adapter.py +34 -10
  16. evalvault/adapters/outbound/llm/openai_adapter.py +41 -8
  17. evalvault/adapters/outbound/llm/token_aware_chat.py +21 -2
  18. evalvault/adapters/outbound/llm/vllm_adapter.py +39 -8
  19. evalvault/adapters/outbound/nlp/korean/toolkit_factory.py +20 -0
  20. evalvault/adapters/outbound/report/llm_report_generator.py +90 -6
  21. evalvault/adapters/outbound/storage/base_sql.py +528 -21
  22. evalvault/adapters/outbound/storage/postgres_adapter.py +209 -0
  23. evalvault/adapters/outbound/storage/postgres_schema.sql +38 -0
  24. evalvault/adapters/outbound/storage/sqlite_adapter.py +86 -5
  25. evalvault/debug_ragas.py +7 -1
  26. evalvault/debug_ragas_real.py +5 -1
  27. evalvault/domain/entities/__init__.py +10 -0
  28. evalvault/domain/entities/prompt_suggestion.py +50 -0
  29. evalvault/domain/services/__init__.py +6 -0
  30. evalvault/domain/services/evaluator.py +191 -103
  31. evalvault/domain/services/holdout_splitter.py +67 -0
  32. evalvault/domain/services/intent_classifier.py +73 -0
  33. evalvault/domain/services/pipeline_template_registry.py +3 -0
  34. evalvault/domain/services/prompt_candidate_service.py +117 -0
  35. evalvault/domain/services/prompt_registry.py +40 -2
  36. evalvault/domain/services/prompt_scoring_service.py +286 -0
  37. evalvault/domain/services/prompt_suggestion_reporter.py +277 -0
  38. evalvault/domain/services/synthetic_qa_generator.py +4 -3
  39. evalvault/ports/inbound/learning_hook_port.py +4 -1
  40. evalvault/ports/outbound/__init__.py +2 -0
  41. evalvault/ports/outbound/llm_factory_port.py +13 -0
  42. evalvault/ports/outbound/llm_port.py +34 -2
  43. evalvault/ports/outbound/storage_port.py +38 -0
  44. {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/METADATA +228 -4
  45. {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/RECORD +48 -40
  46. {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/WHEEL +0 -0
  47. {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/entry_points.txt +0 -0
  48. {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/licenses/LICENSE.md +0 -0
evalvault/domain/entities/prompt_suggestion.py (new file)

@@ -0,0 +1,50 @@
+ """Prompt suggestion entities."""
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from typing import Any
+
+
+ @dataclass(frozen=True)
+ class PromptCandidate:
+     """Single prompt candidate for suggestion workflow."""
+
+     candidate_id: str
+     source: str
+     content: str
+     metadata: dict[str, Any] = field(default_factory=dict)
+
+
+ @dataclass(frozen=True)
+ class PromptCandidateSampleScore:
+     sample_index: int
+     scores: dict[str, float]
+     weighted_score: float
+     responses: list[dict[str, Any]] = field(default_factory=list)
+
+
+ @dataclass(frozen=True)
+ class PromptCandidateScore:
+     """Evaluation score for a prompt candidate."""
+
+     candidate_id: str
+     scores: dict[str, float]
+     weighted_score: float
+     sample_scores: list[PromptCandidateSampleScore] = field(default_factory=list)
+     selected_sample_index: int | None = None
+
+
+ @dataclass(frozen=True)
+ class PromptSuggestionResult:
+     """Aggregated prompt suggestion results."""
+
+     run_id: str
+     role: str
+     metrics: list[str]
+     weights: dict[str, float]
+     candidates: list[PromptCandidate]
+     scores: list[PromptCandidateScore]
+     ranking: list[str]
+     holdout_ratio: float
+     metadata: dict[str, Any] = field(default_factory=dict)
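As a quick orientation (not part of the diff), a minimal sketch of how these frozen dataclasses compose; every identifier, score, and weight below is an illustrative placeholder, only the field names come from the file above.

# Illustrative only: values are hypothetical; field names follow the dataclasses above.
from evalvault.domain.entities.prompt_suggestion import (
    PromptCandidate,
    PromptCandidateScore,
    PromptSuggestionResult,
)

candidate = PromptCandidate(
    candidate_id="cand-001",
    source="baseline",  # free-form source tag (assumed)
    content="You are a helpful assistant...",
)
score = PromptCandidateScore(
    candidate_id=candidate.candidate_id,
    scores={"faithfulness": 0.82, "answer_relevancy": 0.77},
    weighted_score=0.80,
)
result = PromptSuggestionResult(
    run_id="run-123",
    role="system",
    metrics=["faithfulness", "answer_relevancy"],
    weights={"faithfulness": 0.6, "answer_relevancy": 0.4},
    candidates=[candidate],
    scores=[score],
    ranking=[candidate.candidate_id],
    holdout_ratio=0.2,
)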
evalvault/domain/services/__init__.py

@@ -4,8 +4,11 @@ from evalvault.domain.services.analysis_service import AnalysisService
  from evalvault.domain.services.dataset_preprocessor import DatasetPreprocessor
  from evalvault.domain.services.domain_learning_hook import DomainLearningHook
  from evalvault.domain.services.evaluator import RagasEvaluator
+ from evalvault.domain.services.holdout_splitter import split_dataset_holdout
  from evalvault.domain.services.improvement_guide_service import ImprovementGuideService
  from evalvault.domain.services.method_runner import MethodRunnerService, MethodRunResult
+ from evalvault.domain.services.prompt_scoring_service import PromptScoringService
+ from evalvault.domain.services.prompt_suggestion_reporter import PromptSuggestionReporter
 
  __all__ = [
      "AnalysisService",
@@ -14,5 +17,8 @@ __all__ = [
      "ImprovementGuideService",
      "MethodRunnerService",
      "MethodRunResult",
+     "PromptScoringService",
+     "PromptSuggestionReporter",
      "RagasEvaluator",
+     "split_dataset_holdout",
  ]
evalvault/domain/services/evaluator.py

@@ -3,6 +3,7 @@
  from __future__ import annotations
 
  import asyncio
+ import importlib
  import json
  import logging
  import math
@@ -10,36 +11,10 @@ from collections.abc import Callable, Sequence
  from contextlib import suppress
  from dataclasses import dataclass
  from datetime import datetime
- from typing import Any
+ from typing import Any, Literal, cast, overload
 
  from ragas import SingleTurnSample
 
- try:  # Ragas >=0.2.0
-     from ragas.metrics.collections import (
-         AnswerRelevancy,
-         ContextPrecision,
-         ContextRecall,
-         FactualCorrectness,
-         Faithfulness,
-         SemanticSimilarity,
-     )
- except ImportError:  # pragma: no cover - fallback for older Ragas versions
-     from ragas.metrics import (
-         AnswerRelevancy,
-         ContextPrecision,
-         ContextRecall,
-         FactualCorrectness,
-         Faithfulness,
-         SemanticSimilarity,
-     )
- try:  # SummaryScore lives in different modules depending on Ragas version
-     from ragas.metrics.collections import SummaryScore as RagasSummaryScore
- except ImportError:  # pragma: no cover - fallback for older Ragas versions
-     try:
-         from ragas.metrics import SummarizationScore as RagasSummaryScore
-     except ImportError:  # pragma: no cover - no summary support available
-         RagasSummaryScore = None
-
  from evalvault.domain.entities import (
      ClaimLevelResult,
      ClaimVerdict,
@@ -59,9 +34,55 @@ from evalvault.domain.metrics.text_match import ExactMatch, F1Score
  from evalvault.domain.services.batch_executor import run_in_batches
  from evalvault.domain.services.dataset_preprocessor import DatasetPreprocessor
  from evalvault.domain.services.retriever_context import apply_retriever_to_dataset
- from evalvault.ports.outbound.korean_nlp_port import RetrieverPort
+ from evalvault.ports.outbound.korean_nlp_port import KoreanNLPToolkitPort, RetrieverPort
+ from evalvault.ports.outbound.llm_factory_port import LLMFactoryPort
  from evalvault.ports.outbound.llm_port import LLMPort
 
+ _SUMMARY_FAITHFULNESS_PROMPT_KO = (
+     "당신은 요약 충실도 판정자입니다.\n"
+     "컨텍스트와 요약을 보고 요약의 모든 주장이 컨텍스트에 의해 뒷받침되는지 판단하세요.\n"
+     "숫자, 조건, 면책, 기간, 자격 등이 누락되거나 추가되거나 모순되면 verdict는 unsupported입니다.\n"
+     'JSON만 반환: {"verdict": "supported|unsupported", "reason": "..."}\n\n'
+     "컨텍스트:\n{context}\n\n요약:\n{summary}\n"
+ )
+ _SUMMARY_FAITHFULNESS_PROMPT_EN = (
+     "You are a strict summarization faithfulness judge.\n"
+     "Given the CONTEXT and SUMMARY, determine whether every claim in SUMMARY is supported by CONTEXT.\n"
+     "If any numbers, conditions, exclusions, durations, or eligibility are missing, added, or "
+     "contradicted, verdict is unsupported.\n"
+     'Return JSON only: {"verdict": "supported|unsupported", "reason": "..."}\n\n'
+     "CONTEXT:\n{context}\n\nSUMMARY:\n{summary}\n"
+ )
+
+
+ def _import_metric(name: str) -> type[Any]:
+     for module_name in ("ragas.metrics.collections", "ragas.metrics"):
+         try:
+             module = importlib.import_module(module_name)
+         except Exception:
+             continue
+         if hasattr(module, name):
+             return cast(type[Any], getattr(module, name))
+     raise ImportError(f"Missing ragas metric: {name}")
+
+
+ def _import_optional_metric(names: list[str]) -> type[Any] | None:
+     for name in names:
+         try:
+             return _import_metric(name)
+         except Exception:
+             continue
+     return None
+
+
+ AnswerRelevancy = _import_metric("AnswerRelevancy")
+ ContextPrecision = _import_metric("ContextPrecision")
+ ContextRecall = _import_metric("ContextRecall")
+ FactualCorrectness = _import_metric("FactualCorrectness")
+ Faithfulness = _import_metric("Faithfulness")
+ SemanticSimilarity = _import_metric("SemanticSimilarity")
+ RagasSummaryScore = _import_optional_metric(["SummaryScore", "SummarizationScore"])
+
  logger = logging.getLogger(__name__)
 
 
@@ -247,9 +268,16 @@ class RagasEvaluator:
          "openai/gpt-5-nano": (5.00, 15.00),
      }
 
-     def __init__(self, *, preprocessor: DatasetPreprocessor | None = None) -> None:
+     def __init__(
+         self,
+         *,
+         preprocessor: DatasetPreprocessor | None = None,
+         korean_toolkit: KoreanNLPToolkitPort | None = None,
+         llm_factory: LLMFactoryPort | None = None,
+     ) -> None:
          self._preprocessor = preprocessor or DatasetPreprocessor()
-         self._korean_toolkit = None
+         self._korean_toolkit = korean_toolkit
+         self._llm_factory = llm_factory
          self._faithfulness_ragas_failed = False
          self._faithfulness_fallback_llm = None
          self._faithfulness_fallback_metric = None
@@ -258,6 +286,7 @@ class RagasEvaluator:
          self._active_llm_provider = None
          self._active_llm_model = None
          self._active_llm = None
+         self._prompt_language = None
 
      async def evaluate(
          self,
@@ -273,6 +302,7 @@
          on_progress: Callable[[int, int, str], None] | None = None,
          prompt_overrides: dict[str, str] | None = None,
          claim_level: bool = False,
+         language: str | None = None,
      ) -> EvaluationRun:
          """데이터셋을 Ragas로 평가.
 
@@ -299,6 +329,7 @@
          self._active_llm_provider = getattr(llm, "provider_name", None)
          self._active_llm_model = llm.get_model_name()
          self._active_llm = llm
+         self._prompt_language = self._normalize_language_hint(language) if language else None
          # Resolve thresholds: CLI > dataset > default(0.7)
          resolved_thresholds = {}
          for metric in metrics:
@@ -359,7 +390,11 @@
          eval_results_by_test_case = {}
          if ragas_metrics:
              run.tracker_metadata["ragas_config"] = self._build_ragas_config(llm)
-             eval_results_by_test_case, override_status = await self._evaluate_with_ragas(
+             (
+                 eval_results_by_test_case,
+                 override_status,
+                 prompt_snapshots,
+             ) = await self._evaluate_with_ragas(
                  dataset=dataset,
                  metrics=ragas_metrics,
                  llm=llm,
@@ -370,6 +405,8 @@
              )
              if override_status:
                  run.tracker_metadata["ragas_prompt_overrides"] = override_status
+             if prompt_snapshots:
+                 run.tracker_metadata["ragas_prompt_snapshots"] = prompt_snapshots
          elif prompt_overrides:
              logger.warning("Ragas prompt overrides provided but no Ragas metrics requested.")
 
@@ -485,7 +522,7 @@
          batch_size: int = 5,
          on_progress: Callable[[int, int, str], None] | None = None,
          prompt_overrides: dict[str, str] | None = None,
-     ) -> tuple[dict[str, TestCaseEvalResult], dict[str, str]]:
+     ) -> tuple[dict[str, TestCaseEvalResult], dict[str, str], dict[str, dict[str, Any]]]:
          """Ragas로 실제 평가 수행.
 
          Args:
@@ -496,7 +533,7 @@
              batch_size: 병렬 처리 시 배치 크기
 
          Returns:
-             (테스트 케이스 ID별 평가 결과, 프롬프트 오버라이드 적용 상태)
+             (테스트 케이스 ID별 평가 결과, 프롬프트 오버라이드 적용 상태, 프롬프트 스냅샷)
              예: {"tc-001": TestCaseEvalResult(...)}
          """
 
@@ -554,6 +591,12 @@
          if prompt_overrides:
              override_status = self._apply_prompt_overrides(ragas_metrics, prompt_overrides)
 
+         prompt_snapshots = self._collect_ragas_prompt_snapshots(
+             ragas_metrics,
+             prompt_overrides,
+             override_status,
+         )
+
          # 병렬 처리 vs 순차 처리
          if parallel and len(ragas_samples) > 1:
              return (
@@ -566,6 +609,7 @@
                      on_progress=on_progress,
                  ),
                  override_status,
+                 prompt_snapshots,
              )
          return (
              await self._evaluate_sequential(
@@ -576,6 +620,7 @@
                  on_progress=on_progress,
              ),
              override_status,
+             prompt_snapshots,
          )
 
      def _apply_answer_relevancy_prompt_defaults(
@@ -619,6 +664,8 @@
              self._apply_korean_factual_correctness_prompts(metric)
 
      def _resolve_dataset_language(self, dataset: Dataset) -> str | None:
+         if self._prompt_language:
+             return self._prompt_language
          metadata = dataset.metadata if isinstance(dataset.metadata, dict) else {}
          for key in ("language", "lang", "locale"):
              normalized = self._normalize_language_hint(metadata.get(key))
@@ -784,10 +831,10 @@
          if isinstance(target, str):
              metric.prompt = prompt_text
              return True
-         if hasattr(target, "template"):
+         if target is not None and hasattr(target, "template"):
              target.template = prompt_text
              return True
-         if hasattr(target, "instruction"):
+         if target is not None and hasattr(target, "instruction"):
              target.instruction = prompt_text
              return True
 
@@ -796,10 +843,10 @@
          if isinstance(target, str):
              metric.question_generation = prompt_text
              return True
-         if hasattr(target, "template"):
+         if target is not None and hasattr(target, "template"):
              target.template = prompt_text
              return True
-         if hasattr(target, "instruction"):
+         if target is not None and hasattr(target, "instruction"):
              target.instruction = prompt_text
              return True
 
@@ -829,6 +876,68 @@
 
          return False
 
+     @staticmethod
+     def _extract_prompt_text(value: Any) -> str | None:
+         if value is None:
+             return None
+         if isinstance(value, str):
+             return value
+         for attr in ("template", "instruction", "prompt", "text"):
+             try:
+                 candidate = getattr(value, attr)
+             except Exception:
+                 continue
+             if isinstance(candidate, str) and candidate.strip():
+                 return candidate
+         return None
+
+     def _collect_metric_prompt_text(self, metric: Any) -> str | None:
+         for attr in ("prompt", "question_generation"):
+             if hasattr(metric, attr):
+                 try:
+                     value = getattr(metric, attr)
+                 except Exception:
+                     continue
+                 text = self._extract_prompt_text(value)
+                 if text:
+                     return text
+         for attr in dir(metric):
+             if not attr.endswith("_prompt") or attr == "prompt":
+                 continue
+             try:
+                 value = getattr(metric, attr)
+             except Exception:
+                 continue
+             text = self._extract_prompt_text(value)
+             if text:
+                 return text
+         return None
+
+     def _collect_ragas_prompt_snapshots(
+         self,
+         ragas_metrics: list[Any],
+         prompt_overrides: dict[str, str] | None,
+         override_status: dict[str, str],
+     ) -> dict[str, dict[str, Any]]:
+         snapshots: dict[str, dict[str, Any]] = {}
+         for metric in ragas_metrics:
+             metric_name = getattr(metric, "name", None)
+             if not metric_name:
+                 continue
+             prompt_text = self._collect_metric_prompt_text(metric)
+             if not prompt_text:
+                 continue
+             requested = bool(prompt_overrides and metric_name in prompt_overrides)
+             status = override_status.get(metric_name)
+             source = "override" if status == "applied" else "default"
+             snapshots[str(metric_name)] = {
+                 "prompt": prompt_text,
+                 "source": source,
+                 "override_requested": requested,
+                 "override_status": status,
+             }
+         return snapshots
+
      async def _evaluate_sequential(
          self,
          dataset: Dataset,
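For reference, each entry collected by _collect_ragas_prompt_snapshots (and stored under run.tracker_metadata["ragas_prompt_snapshots"] in the evaluate hunk above) has roughly the shape sketched below; the metric name and prompt text are placeholders, while the keys and the "override"/"default" logic come from the code above.

# Illustrative shape of one snapshot entry; values are placeholders.
ragas_prompt_snapshots = {
    "faithfulness": {
        "prompt": "Given the question and answer, extract the claims...",  # prompt text captured from the metric
        "source": "override",        # "override" only when override_status == "applied", otherwise "default"
        "override_requested": True,  # an override for this metric was passed in prompt_overrides
        "override_status": "applied",
    }
}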
@@ -1173,6 +1282,22 @@
      def default_threshold_for(cls, metric_name: str) -> float:
          return cls.DEFAULT_METRIC_THRESHOLDS.get(metric_name, cls.DEFAULT_THRESHOLD_FALLBACK)
 
+     @overload
+     def _fallback_korean_faithfulness(
+         self,
+         sample: SingleTurnSample,
+         *,
+         return_details: Literal[True],
+     ) -> ClaimLevelResult | None: ...
+
+     @overload
+     def _fallback_korean_faithfulness(
+         self,
+         sample: SingleTurnSample,
+         *,
+         return_details: Literal[False] = False,
+     ) -> float | None: ...
+
      def _fallback_korean_faithfulness(
          self, sample: SingleTurnSample, *, return_details: bool = False
      ) -> float | ClaimLevelResult | None:
@@ -1194,11 +1319,7 @@
              return None
 
          if self._korean_toolkit is None:
-             try:
-                 from evalvault.adapters.outbound.nlp.korean.toolkit import KoreanNLPToolkit
-             except Exception:  # pragma: no cover - optional dependency
-                 return None
-             self._korean_toolkit = KoreanNLPToolkit()
+             return None
 
          try:
              result = self._korean_toolkit.check_faithfulness(
@@ -1212,6 +1333,8 @@
              return self._convert_to_claim_level_result(result, test_case_id="")
 
          score = getattr(result, "score", None)
+         if score is None:
+             return None
          try:
              return float(score)
          except (TypeError, ValueError):
@@ -1291,14 +1414,11 @@
              return None
 
          context = "\n\n".join(sample.retrieved_contexts)
-         prompt = (
-             "You are a strict summarization faithfulness judge.\n"
-             "Given the CONTEXT and SUMMARY, determine whether every claim in SUMMARY is supported by CONTEXT.\n"
-             "If any numbers, conditions, exclusions, durations, or eligibility are missing, added, or "
-             "contradicted, verdict is unsupported.\n"
-             'Return JSON only: {"verdict": "supported|unsupported", "reason": "..."}\n\n'
-             f"CONTEXT:\n{context}\n\nSUMMARY:\n{sample.response}\n"
+         language = self._prompt_language or "ko"
+         template = (
+             _SUMMARY_FAITHFULNESS_PROMPT_EN if language == "en" else _SUMMARY_FAITHFULNESS_PROMPT_KO
          )
+         prompt = template.format(context=context, summary=sample.response)
 
          try:
              response_text = await asyncio.to_thread(llm.generate_text, prompt, json_mode=True)
@@ -1340,7 +1460,7 @@
      ) -> float | None:
          metric = self._get_faithfulness_fallback_metric()
          if metric is None:
-             return self._fallback_korean_faithfulness(sample)
+             return self._fallback_korean_faithfulness(sample, return_details=False)
 
          try:
              if hasattr(metric, "ascore"):
@@ -1368,6 +1488,8 @@
              else:
                  score_value = result
 
+             if score_value is None:
+                 raise ValueError("Metric returned None")
              score_value = float(score_value)
              if math.isnan(score_value):
                  raise ValueError("Metric returned NaN")
@@ -1379,7 +1501,7 @@
                  self._summarize_ragas_error(exc),
              )
              self._faithfulness_fallback_failed = True
-             return self._fallback_korean_faithfulness(sample)
+             return self._fallback_korean_faithfulness(sample, return_details=False)
 
      def _get_faithfulness_fallback_metric(self):
          if self._faithfulness_fallback_failed:
@@ -1411,29 +1533,14 @@
              return None
          if self._faithfulness_fallback_llm is not None:
              return self._faithfulness_fallback_llm
-
-         try:
-             from evalvault.adapters.outbound.llm import create_llm_adapter_for_model
-             from evalvault.config.settings import Settings
-         except Exception:
-             return None
-
-         settings = Settings()
-         provider, model = self._resolve_faithfulness_fallback_config(settings)
-         if not provider or not model:
+         if self._llm_factory is None:
              return None
 
          try:
-             llm = create_llm_adapter_for_model(provider, model, settings)
-             self._faithfulness_fallback_llm = llm
-             if not self._faithfulness_fallback_logged:
-                 logger.warning(
-                     "Faithfulness fallback LLM enabled: %s/%s",
-                     provider,
-                     model,
-                 )
-                 self._faithfulness_fallback_logged = True
-             return llm
+             llm = self._llm_factory.create_faithfulness_fallback(
+                 self._active_llm_provider,
+                 self._active_llm_model,
+             )
          except Exception as exc:
              if not self._faithfulness_fallback_failed:
                  logger.warning(
@@ -1443,39 +1550,20 @@
              self._faithfulness_fallback_failed = True
              return None
 
-     def _resolve_faithfulness_fallback_config(self, settings) -> tuple[str | None, str | None]:
-         provider = (
-             settings.faithfulness_fallback_provider.strip().lower()
-             if settings.faithfulness_fallback_provider
-             else None
-         )
-         model = settings.faithfulness_fallback_model
-         active_provider = (
-             self._active_llm_provider.strip().lower()
-             if isinstance(self._active_llm_provider, str) and self._active_llm_provider.strip()
-             else None
-         )
-         default_provider = active_provider or settings.llm_provider.lower()
-
-         if not provider and model:
-             provider = default_provider
-         if provider and not model:
-             model = self._default_faithfulness_fallback_model(provider)
-         if not provider and not model:
-             provider = default_provider
-             model = self._default_faithfulness_fallback_model(default_provider)
-
-         if not provider or not model:
-             return None, None
-         return provider, model
+         if llm is None:
+             return None
 
-     @staticmethod
-     def _default_faithfulness_fallback_model(provider: str) -> str | None:
-         if provider == "ollama":
-             return "gpt-oss-safeguard:20b"
-         if provider == "vllm":
-             return "gpt-oss-120b"
-         return None
+         self._faithfulness_fallback_llm = llm
+         if not self._faithfulness_fallback_logged:
+             provider = getattr(llm, "provider_name", None)
+             model = llm.get_model_name()
+             logger.warning(
+                 "Faithfulness fallback LLM enabled: %s/%s",
+                 provider,
+                 model,
+             )
+             self._faithfulness_fallback_logged = True
+         return llm
 
      @staticmethod
      def _contains_korean(text: str) -> bool:
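Taken together, the evaluator hunks above replace the in-class imports of KoreanNLPToolkit and Settings with constructor injection. A minimal wiring sketch, not taken from the diff: the factory class below is hypothetical, and only the RagasEvaluator keyword arguments and the create_faithfulness_fallback(provider, model) call are visible in this diff (llm_factory_port.py itself is not shown here).

# Hypothetical wiring sketch for the new constructor parameters.
from evalvault.domain.services.evaluator import RagasEvaluator


class StaticLLMFactory:
    """Toy stand-in for an LLMFactoryPort implementation (assumed shape)."""

    def __init__(self, fallback_llm):
        self._fallback_llm = fallback_llm

    def create_faithfulness_fallback(self, provider, model):
        # provider/model are the active LLM's identifiers; this toy factory ignores them.
        return self._fallback_llm


def build_evaluator(fallback_llm):
    """fallback_llm: any LLMPort adapter; passing llm_factory=None disables the fallback path."""
    return RagasEvaluator(
        korean_toolkit=None,  # optional KoreanNLPToolkitPort; None now means "no Korean faithfulness fallback"
        llm_factory=StaticLLMFactory(fallback_llm),
    )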
evalvault/domain/services/holdout_splitter.py (new file)

@@ -0,0 +1,67 @@
+ from __future__ import annotations
+
+ import random
+
+ from evalvault.domain.entities import Dataset, TestCase
+
+
+ def split_dataset_holdout(
+     *,
+     dataset: Dataset,
+     holdout_ratio: float,
+     seed: int | None,
+ ) -> tuple[Dataset, Dataset]:
+     if holdout_ratio < 0 or holdout_ratio >= 1:
+         raise ValueError("holdout_ratio must be in [0, 1).")
+
+     total = len(dataset.test_cases)
+     if total == 0:
+         return _clone_dataset(dataset, "dev", []), _clone_dataset(dataset, "holdout", [])
+
+     holdout_size = int(total * holdout_ratio)
+     if holdout_ratio > 0 and holdout_size == 0:
+         holdout_size = 1
+     if holdout_size >= total:
+         holdout_size = total - 1
+
+     rng = random.Random(seed)
+     indices = list(range(total))
+     rng.shuffle(indices)
+
+     holdout_indices = set(indices[:holdout_size])
+     dev_cases: list[TestCase] = []
+     holdout_cases: list[TestCase] = []
+
+     for idx, test_case in enumerate(dataset.test_cases):
+         if idx in holdout_indices:
+             holdout_cases.append(test_case)
+         else:
+             dev_cases.append(test_case)
+
+     return (
+         _clone_dataset(dataset, "dev", dev_cases, holdout_ratio, seed),
+         _clone_dataset(dataset, "holdout", holdout_cases, holdout_ratio, seed),
+     )
+
+
+ def _clone_dataset(
+     dataset: Dataset,
+     split: str,
+     test_cases: list[TestCase],
+     holdout_ratio: float | None = None,
+     seed: int | None = None,
+ ) -> Dataset:
+     metadata = dict(dataset.metadata or {})
+     metadata["split"] = split
+     if holdout_ratio is not None:
+         metadata.setdefault("holdout_ratio", holdout_ratio)
+     if seed is not None:
+         metadata.setdefault("split_seed", seed)
+     return Dataset(
+         name=dataset.name,
+         version=dataset.version,
+         test_cases=list(test_cases),
+         metadata=metadata,
+         source_file=dataset.source_file,
+         thresholds=dict(dataset.thresholds),
+     )
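A brief usage sketch for the new splitter (not part of the diff), assuming dataset is an evalvault Dataset loaded elsewhere. A non-zero ratio rounds up to at least one holdout case but never takes the last dev case, and reusing the same seed reproduces the same split.

# Usage sketch: split an already-loaded Dataset into dev/holdout partitions.
from evalvault.domain.services import split_dataset_holdout


def make_splits(dataset):
    dev_set, holdout_set = split_dataset_holdout(
        dataset=dataset,    # an existing Dataset instance
        holdout_ratio=0.2,  # must be in [0, 1); 0 keeps every case on the dev side
        seed=42,            # fixes the shuffle so the split is reproducible
    )
    # _clone_dataset records the split name in each clone's metadata.
    assert dev_set.metadata["split"] == "dev"
    assert holdout_set.metadata["split"] == "holdout"
    return dev_set, holdout_set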