evalvault-1.64.0-py3-none-any.whl → evalvault-1.66.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. evalvault/adapters/inbound/api/adapter.py +14 -0
  2. evalvault/adapters/inbound/api/main.py +14 -4
  3. evalvault/adapters/inbound/api/routers/chat.py +543 -0
  4. evalvault/adapters/inbound/cli/commands/__init__.py +14 -7
  5. evalvault/adapters/inbound/cli/commands/artifacts.py +107 -0
  6. evalvault/adapters/inbound/cli/commands/calibrate_judge.py +283 -0
  7. evalvault/adapters/inbound/cli/commands/compare.py +290 -0
  8. evalvault/adapters/inbound/cli/commands/history.py +13 -85
  9. evalvault/adapters/inbound/cli/commands/ops.py +110 -0
  10. evalvault/adapters/inbound/cli/commands/profile_difficulty.py +160 -0
  11. evalvault/adapters/inbound/cli/commands/regress.py +251 -0
  12. evalvault/adapters/inbound/cli/commands/run.py +14 -0
  13. evalvault/adapters/inbound/cli/commands/run_helpers.py +21 -2
  14. evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py +49 -0
  15. evalvault/adapters/outbound/artifact_fs.py +16 -0
  16. evalvault/adapters/outbound/filesystem/__init__.py +3 -0
  17. evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py +50 -0
  18. evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py +13 -0
  19. evalvault/adapters/outbound/judge_calibration_adapter.py +36 -0
  20. evalvault/adapters/outbound/judge_calibration_reporter.py +57 -0
  21. evalvault/adapters/outbound/report/llm_report_generator.py +13 -1
  22. evalvault/adapters/outbound/storage/base_sql.py +41 -1
  23. evalvault/adapters/outbound/tracker/langfuse_adapter.py +13 -7
  24. evalvault/adapters/outbound/tracker/mlflow_adapter.py +5 -0
  25. evalvault/adapters/outbound/tracker/phoenix_adapter.py +68 -14
  26. evalvault/config/settings.py +21 -0
  27. evalvault/domain/entities/__init__.py +10 -0
  28. evalvault/domain/entities/judge_calibration.py +50 -0
  29. evalvault/domain/entities/prompt.py +1 -1
  30. evalvault/domain/entities/stage.py +11 -3
  31. evalvault/domain/metrics/__init__.py +8 -0
  32. evalvault/domain/metrics/registry.py +39 -3
  33. evalvault/domain/metrics/summary_accuracy.py +189 -0
  34. evalvault/domain/metrics/summary_needs_followup.py +45 -0
  35. evalvault/domain/metrics/summary_non_definitive.py +41 -0
  36. evalvault/domain/metrics/summary_risk_coverage.py +45 -0
  37. evalvault/domain/services/artifact_lint_service.py +268 -0
  38. evalvault/domain/services/benchmark_runner.py +1 -6
  39. evalvault/domain/services/custom_metric_snapshot.py +233 -0
  40. evalvault/domain/services/dataset_preprocessor.py +26 -0
  41. evalvault/domain/services/difficulty_profile_reporter.py +25 -0
  42. evalvault/domain/services/difficulty_profiling_service.py +304 -0
  43. evalvault/domain/services/evaluator.py +282 -27
  44. evalvault/domain/services/judge_calibration_service.py +495 -0
  45. evalvault/domain/services/ops_snapshot_service.py +159 -0
  46. evalvault/domain/services/prompt_registry.py +39 -10
  47. evalvault/domain/services/regression_gate_service.py +199 -0
  48. evalvault/domain/services/run_comparison_service.py +159 -0
  49. evalvault/domain/services/stage_event_builder.py +6 -1
  50. evalvault/domain/services/stage_metric_service.py +83 -18
  51. evalvault/domain/services/threshold_profiles.py +4 -0
  52. evalvault/domain/services/visual_space_service.py +79 -4
  53. evalvault/ports/outbound/__init__.py +4 -0
  54. evalvault/ports/outbound/artifact_fs_port.py +12 -0
  55. evalvault/ports/outbound/comparison_pipeline_port.py +22 -0
  56. evalvault/ports/outbound/difficulty_profile_port.py +15 -0
  57. evalvault/ports/outbound/judge_calibration_port.py +22 -0
  58. evalvault/ports/outbound/ops_snapshot_port.py +8 -0
  59. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/METADATA +25 -1
  60. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/RECORD +63 -31
  61. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/WHEEL +0 -0
  62. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/entry_points.txt +0 -0
  63. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/domain/services/evaluator.py
@@ -11,8 +11,9 @@ from collections.abc import Callable, Sequence
 from contextlib import suppress
 from dataclasses import dataclass
 from datetime import datetime
-from typing import Any, Literal, cast, overload
+from typing import Any, Literal, overload
 
+from pydantic import BaseModel, Field, field_validator
 from ragas import SingleTurnSample
 
 from evalvault.domain.entities import (
@@ -30,8 +31,13 @@ from evalvault.domain.metrics.entity_preservation import EntityPreservation
 from evalvault.domain.metrics.insurance import InsuranceTermAccuracy
 from evalvault.domain.metrics.no_answer import NoAnswerAccuracy
 from evalvault.domain.metrics.retrieval_rank import MRR, NDCG, HitRate
+from evalvault.domain.metrics.summary_accuracy import SummaryAccuracy
+from evalvault.domain.metrics.summary_needs_followup import SummaryNeedsFollowup
+from evalvault.domain.metrics.summary_non_definitive import SummaryNonDefinitive
+from evalvault.domain.metrics.summary_risk_coverage import SummaryRiskCoverage
 from evalvault.domain.metrics.text_match import ExactMatch, F1Score
 from evalvault.domain.services.batch_executor import run_in_batches
+from evalvault.domain.services.custom_metric_snapshot import build_custom_metric_snapshot
 from evalvault.domain.services.dataset_preprocessor import DatasetPreprocessor
 from evalvault.domain.services.retriever_context import apply_retriever_to_dataset
 from evalvault.ports.outbound.korean_nlp_port import KoreanNLPToolkitPort, RetrieverPort
@@ -55,14 +61,53 @@ _SUMMARY_FAITHFULNESS_PROMPT_EN = (
 )
 
 
+def _patch_ragas_faithfulness_output() -> None:
+    try:
+        from ragas.metrics import Faithfulness
+    except Exception:
+        return
+
+    prompt = getattr(Faithfulness, "nli_statements_prompt", None)
+    if prompt is None:
+        return
+
+    output_model = getattr(prompt, "output_model", None)
+    if output_model is None:
+        return
+
+    class _StatementFaithfulnessAnswer(BaseModel):
+        statement: str = Field(..., description="the original statement, word-by-word")
+        reason: str = Field(..., description="the reason of the verdict")
+        verdict: int = Field(..., description="the verdict(0/1) of the faithfulness.")
+
+        @field_validator("verdict", mode="before")
+        @classmethod
+        def _coerce_verdict(cls, value):
+            if isinstance(value, str):
+                normalized = value.strip()
+                if normalized.isdigit():
+                    return int(normalized)
+            return value
+
+    class _NLIStatementOutput(BaseModel):
+        statements: list[_StatementFaithfulnessAnswer]
+
+    try:
+        prompt.output_model = _NLIStatementOutput
+    except Exception:
+        return
+
+
 def _import_metric(name: str) -> type[Any]:
     for module_name in ("ragas.metrics.collections", "ragas.metrics"):
         try:
             module = importlib.import_module(module_name)
-        except Exception:
+            if hasattr(module, name):
+                if name == "Faithfulness":
+                    _patch_ragas_faithfulness_output()
+                return getattr(module, name)
+        except ImportError:
             continue
-        if hasattr(module, name):
-            return cast(type[Any], getattr(module, name))
     raise ImportError(f"Missing ragas metric: {name}")
 
 
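Note: the hunk above routes every successful Faithfulness lookup through _patch_ragas_faithfulness_output(), which swaps the NLI prompt's output_model for a variant whose verdict field tolerates string digits. A minimal standalone sketch of that coercion, reusing the models defined in the hunk (pydantic v2; the sample payload below is invented):

from pydantic import BaseModel, Field, field_validator

class StatementFaithfulnessAnswer(BaseModel):
    statement: str
    reason: str
    verdict: int

    @field_validator("verdict", mode="before")
    @classmethod
    def coerce_verdict(cls, value):
        # Accept "0"/"1" strings emitted by the judge LLM and coerce them to int.
        if isinstance(value, str) and value.strip().isdigit():
            return int(value.strip())
        return value

class NLIStatementOutput(BaseModel):
    statements: list[StatementFaithfulnessAnswer]

# Invented judge output: verdict arrives as a string instead of an int.
raw = {"statements": [{"statement": "Premiums are waived.", "reason": "Stated in the context.", "verdict": "1"}]}
assert NLIStatementOutput.model_validate(raw).statements[0].verdict == 1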
@@ -147,6 +192,10 @@ class RagasEvaluator:
     CUSTOM_METRIC_MAP = {
         "insurance_term_accuracy": InsuranceTermAccuracy,
         "entity_preservation": EntityPreservation,
+        "summary_accuracy": SummaryAccuracy,
+        "summary_risk_coverage": SummaryRiskCoverage,
+        "summary_non_definitive": SummaryNonDefinitive,
+        "summary_needs_followup": SummaryNeedsFollowup,
         "exact_match": ExactMatch,
         "f1_score": F1Score,
         "no_answer_accuracy": NoAnswerAccuracy,
@@ -198,6 +247,10 @@ class RagasEvaluator:
         "summary_faithfulness": 0.9,
         "summary_score": 0.85,
         "entity_preservation": 0.9,
+        "summary_accuracy": 0.9,
+        "summary_risk_coverage": 0.9,
+        "summary_non_definitive": 0.8,
+        "summary_needs_followup": 0.8,
         "contextual_relevancy": 0.35,
     }
     LANGUAGE_SAMPLE_LIMIT = 5
@@ -225,10 +278,28 @@
         "예시의 원자성 수준을 따르세요."
     )
     FACTUAL_CORRECTNESS_NLI_INSTRUCTION = (
-        "다음 CONTEXT를 바탕으로STATEMENT가 직접적으로 "
-        "추론 가능한지 판단하세요. "
-        "가능하면 verdict=1, 불가능하면 verdict=0으로 표시하고, "
-        "간단한 이유를 한국어로 적으세요."
+        "주어진 컨텍스트를 보고진술이 직접적으로 도출 가능한지 판단하세요. "
+        "가능하면 verdict=1, 불가능하면 verdict=0을 JSON으로 반환하세요."
+    )
+    SUMMARY_SCORE_QUESTION_INSTRUCTION = (
+        "다음 텍스트와 핵심 키워드를 기반으로, "
+        "텍스트에 근거해 반드시 1로 답할 수 있는 폐쇄형 질문을 생성하세요. "
+        "질문은 한국어로 작성하세요."
+    )
+    SUMMARY_SCORE_ANSWER_INSTRUCTION = (
+        "다음 질문 목록에 대해, 제공된 요약이 각 질문에 답할 수 있으면 '1', "
+        "그렇지 않으면 '0'을 JSON 배열로 반환하세요."
+    )
+    SUMMARY_SCORE_KEYPHRASE_INSTRUCTION = (
+        "다음 텍스트에서 인물, 기관, 위치, 날짜/시간, 금액, 비율과 같은 핵심 키워드를 추출하세요."
+    )
+    SUMMARY_FAITHFULNESS_STATEMENT_INSTRUCTION = (
+        "질문과 답변을 보고 각 문장을 이해 가능한 주장으로 분해하세요. "
+        "각 주장은 대명사 없이 독립적으로 이해 가능해야 합니다."
+    )
+    SUMMARY_FAITHFULNESS_NLI_INSTRUCTION = (
+        "주어진 컨텍스트를 보고 각 진술이 직접적으로 도출 가능한지 판단하세요. "
+        "가능하면 verdict=1, 불가능하면 verdict=0을 JSON으로 반환하세요."
     )
     FACTUAL_CORRECTNESS_CLAIM_EXAMPLES = [
         {
@@ -330,6 +401,8 @@ class RagasEvaluator:
         self._active_llm_model = llm.get_model_name()
         self._active_llm = llm
         self._prompt_language = self._normalize_language_hint(language) if language else None
+        if self._prompt_language is None:
+            self._prompt_language = self._resolve_dataset_language(dataset)
         # Resolve thresholds: CLI > dataset > default(0.7)
         resolved_thresholds = {}
         for metric in metrics:
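The comment in this hunk states the threshold precedence (CLI > dataset > default 0.7). A hedged sketch of that resolution order, using default values added in the hunks above; the function name and argument shapes are illustrative, not the evaluator's actual signature:

DEFAULT_THRESHOLDS = {"summary_accuracy": 0.9, "summary_needs_followup": 0.8}

def resolve_threshold(metric: str, cli_thresholds: dict[str, float], dataset_thresholds: dict[str, float]) -> float:
    # CLI value wins, then the dataset-provided value, then the per-metric default, then 0.7.
    if metric in cli_thresholds:
        return cli_thresholds[metric]
    if metric in dataset_thresholds:
        return dataset_thresholds[metric]
    return DEFAULT_THRESHOLDS.get(metric, 0.7)

assert resolve_threshold("summary_accuracy", {}, {}) == 0.9
assert resolve_threshold("summary_accuracy", {"summary_accuracy": 0.95}, {}) == 0.95
assert resolve_threshold("unknown_metric", {}, {}) == 0.7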
@@ -388,6 +461,7 @@
 
         # Evaluate with Ragas (if any Ragas metrics)
         eval_results_by_test_case = {}
+        prompt_snapshots = {}
         if ragas_metrics:
             run.tracker_metadata["ragas_config"] = self._build_ragas_config(llm)
             (
@@ -410,6 +484,13 @@
         elif prompt_overrides:
             logger.warning("Ragas prompt overrides provided but no Ragas metrics requested.")
 
+        custom_snapshot = build_custom_metric_snapshot(self.CUSTOM_METRIC_MAP, metrics)
+        if custom_snapshot:
+            run.tracker_metadata["custom_metric_snapshot"] = custom_snapshot
+            custom_prompt_snapshots = self._build_custom_prompt_snapshots(custom_snapshot)
+            if custom_prompt_snapshots:
+                run.tracker_metadata["custom_prompt_snapshots"] = custom_prompt_snapshots
+
         # Evaluate with custom metrics (if any custom metrics)
         if custom_metrics:
             custom_results = await self._evaluate_with_custom_metrics(
@@ -581,6 +662,11 @@
             ragas_metrics=ragas_metrics,
             prompt_overrides=prompt_overrides,
         )
+        self._apply_summary_prompt_defaults(
+            dataset=dataset,
+            ragas_metrics=ragas_metrics,
+            prompt_overrides=prompt_overrides,
+        )
         self._apply_factual_correctness_prompt_defaults(
             dataset=dataset,
             ragas_metrics=ragas_metrics,
@@ -643,6 +729,30 @@
                 continue
             self._apply_korean_answer_relevancy_prompt(metric)
 
+    def _apply_summary_prompt_defaults(
+        self,
+        *,
+        dataset: Dataset,
+        ragas_metrics: list[Any],
+        prompt_overrides: dict[str, str] | None,
+    ) -> None:
+        if not ragas_metrics:
+            return
+        if prompt_overrides and any(
+            metric in prompt_overrides for metric in ("summary_score", "summary_faithfulness")
+        ):
+            return
+        resolved_language = self._resolve_dataset_language(dataset)
+        if resolved_language == "en":
+            return
+
+        for metric in ragas_metrics:
+            metric_name = getattr(metric, "name", None)
+            if metric_name == "summary_score":
+                self._apply_korean_summary_score_prompts(metric)
+            elif metric_name == "summary_faithfulness":
+                self._apply_korean_summary_faithfulness_prompts(metric)
+
     def _apply_factual_correctness_prompt_defaults(
         self,
         *,
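One behavior worth calling out in _apply_summary_prompt_defaults: the Korean defaults are skipped entirely when the caller overrides either summary metric's prompt, or when the dataset resolves to English. A tiny sketch of the override guard (the override dict below is invented):

prompt_overrides = {"summary_score": "Use this English rubric instead."}

skip_korean_defaults = bool(prompt_overrides) and any(
    metric in prompt_overrides for metric in ("summary_score", "summary_faithfulness")
)
assert skip_korean_defaults  # defaults are left untouched when an override is present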
@@ -743,6 +853,56 @@
                 prompt.language = "ko"
         return True
 
+    def _apply_korean_summary_score_prompts(self, metric: Any) -> bool:
+        question_prompt = getattr(metric, "question_generation_prompt", None)
+        answer_prompt = getattr(metric, "answer_generation_prompt", None)
+        keyphrase_prompt = getattr(metric, "extract_keyphrases_prompt", None)
+        applied = False
+
+        if question_prompt and hasattr(question_prompt, "instruction"):
+            question_prompt.instruction = self.SUMMARY_SCORE_QUESTION_INSTRUCTION
+            if hasattr(question_prompt, "language"):
+                with suppress(Exception):
+                    question_prompt.language = "ko"
+            applied = True
+
+        if answer_prompt and hasattr(answer_prompt, "instruction"):
+            answer_prompt.instruction = self.SUMMARY_SCORE_ANSWER_INSTRUCTION
+            if hasattr(answer_prompt, "language"):
+                with suppress(Exception):
+                    answer_prompt.language = "ko"
+            applied = True
+
+        if keyphrase_prompt and hasattr(keyphrase_prompt, "instruction"):
+            keyphrase_prompt.instruction = self.SUMMARY_SCORE_KEYPHRASE_INSTRUCTION
+            if hasattr(keyphrase_prompt, "language"):
+                with suppress(Exception):
+                    keyphrase_prompt.language = "ko"
+            applied = True
+
+        return applied
+
+    def _apply_korean_summary_faithfulness_prompts(self, metric: Any) -> bool:
+        statement_prompt = getattr(metric, "statement_generator_prompt", None)
+        nli_prompt = getattr(metric, "nli_statements_prompt", None)
+        applied = False
+
+        if statement_prompt and hasattr(statement_prompt, "instruction"):
+            statement_prompt.instruction = self.SUMMARY_FAITHFULNESS_STATEMENT_INSTRUCTION
+            if hasattr(statement_prompt, "language"):
+                with suppress(Exception):
+                    statement_prompt.language = "ko"
+            applied = True
+
+        if nli_prompt and hasattr(nli_prompt, "instruction"):
+            nli_prompt.instruction = self.SUMMARY_FAITHFULNESS_NLI_INSTRUCTION
+            if hasattr(nli_prompt, "language"):
+                with suppress(Exception):
+                    nli_prompt.language = "ko"
+            applied = True
+
+        return applied
+
     def _apply_korean_factual_correctness_prompts(self, metric: Any) -> bool:
         claim_prompt = getattr(metric, "claim_decomposition_prompt", None)
         nli_prompt = getattr(metric, "nli_prompt", None)
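Both helpers above duck-type against the Ragas prompt objects: instruction is overwritten only when present, and language = "ko" is set under suppress(Exception) so a missing or read-only attribute never aborts the run. A small sketch of the pattern with a stand-in prompt class (hypothetical, not a Ragas type):

from contextlib import suppress

class FakePrompt:
    # Stand-in for a Ragas prompt object: exposes `instruction` but no `language` attribute.
    def __init__(self) -> None:
        self.instruction = "original instruction"

prompt = FakePrompt()
applied = False
if prompt and hasattr(prompt, "instruction"):
    prompt.instruction = "updated Korean instruction"
    if hasattr(prompt, "language"):  # False here, so the language assignment is skipped
        with suppress(Exception):
            prompt.language = "ko"
    applied = True

assert applied and prompt.instruction == "updated Korean instruction"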
@@ -817,6 +977,8 @@
                 continue
             prompt_text = prompt_overrides[metric_name]
             applied = self._override_metric_prompt(metric, prompt_text)
+            if not applied and metric_name == "faithfulness":
+                applied = self._override_faithfulness_prompt(metric, prompt_text)
             statuses[metric_name] = "applied" if applied else "unsupported"
             if not applied:
                 logger.warning("Prompt override for metric '%s' could not be applied.", metric_name)
@@ -876,6 +1038,16 @@
 
         return False
 
+    @staticmethod
+    def _override_faithfulness_prompt(metric: Any, prompt_text: str) -> bool:
+        target = getattr(metric, "nli_statements_prompt", None)
+        if target is None:
+            return False
+        if hasattr(target, "instruction"):
+            target.instruction = prompt_text
+            return True
+        return False
+
     @staticmethod
     def _extract_prompt_text(value: Any) -> str | None:
         if value is None:
@@ -924,18 +1096,50 @@
             metric_name = getattr(metric, "name", None)
             if not metric_name:
                 continue
-            prompt_text = self._collect_metric_prompt_text(metric)
-            if not prompt_text:
-                continue
             requested = bool(prompt_overrides and metric_name in prompt_overrides)
             status = override_status.get(metric_name)
             source = "override" if status == "applied" else "default"
-            snapshots[str(metric_name)] = {
-                "prompt": prompt_text,
-                "source": source,
-                "override_requested": requested,
-                "override_status": status,
-            }
+
+            prompts: dict[str, str] = {}
+            if metric_name == "summary_score":
+                prompts["question_generation"] = (
+                    self._extract_prompt_text(getattr(metric, "question_generation_prompt", None))
+                    or ""
+                )
+                prompts["answer_generation"] = (
+                    self._extract_prompt_text(getattr(metric, "answer_generation_prompt", None))
+                    or ""
+                )
+                prompts["extract_keyphrases"] = (
+                    self._extract_prompt_text(getattr(metric, "extract_keyphrases_prompt", None))
+                    or ""
+                )
+                prompts = {k: v for k, v in prompts.items() if v}
+            elif metric_name == "summary_faithfulness":
+                prompts["statement_generation"] = (
+                    self._extract_prompt_text(getattr(metric, "statement_generator_prompt", None))
+                    or ""
+                )
+                prompts["nli_statements"] = (
+                    self._extract_prompt_text(getattr(metric, "nli_statements_prompt", None)) or ""
+                )
+                prompts = {k: v for k, v in prompts.items() if v}
+
+            prompt_text = self._collect_metric_prompt_text(metric)
+            if prompts:
+                snapshots[str(metric_name)] = {
+                    "prompts": prompts,
+                    "source": source,
+                    "override_requested": requested,
+                    "override_status": status,
+                }
+            elif prompt_text:
+                snapshots[str(metric_name)] = {
+                    "prompt": prompt_text,
+                    "source": source,
+                    "override_requested": requested,
+                    "override_status": status,
+                }
         return snapshots
 
     async def _evaluate_sequential(
  async def _evaluate_sequential(
@@ -1133,16 +1337,26 @@ class RagasEvaluator:
1133
1337
  claim_details: dict[str, ClaimLevelResult] = {}
1134
1338
 
1135
1339
  for metric in ragas_metrics:
1136
- if metric.name in self.FAITHFULNESS_METRICS and self._faithfulness_ragas_failed:
1137
- if metric.name == "summary_faithfulness":
1138
- judge_score = await self._score_summary_faithfulness_judge(sample)
1139
- if judge_score is not None:
1140
- scores[metric.name] = judge_score
1340
+ if metric.name in self.FAITHFULNESS_METRICS:
1341
+ if self._active_llm_provider == "ollama":
1342
+ fallback_score = self._fallback_korean_faithfulness(
1343
+ sample, return_details=False
1344
+ )
1345
+ if fallback_score is None:
1346
+ fallback_score = await self._score_faithfulness_with_fallback(sample)
1347
+ if fallback_score is not None:
1348
+ scores[metric.name] = fallback_score
1349
+ continue
1350
+ if self._faithfulness_ragas_failed:
1351
+ if metric.name == "summary_faithfulness":
1352
+ judge_score = await self._score_summary_faithfulness_judge(sample)
1353
+ if judge_score is not None:
1354
+ scores[metric.name] = judge_score
1355
+ continue
1356
+ fallback_score = await self._score_faithfulness_with_fallback(sample)
1357
+ if fallback_score is not None:
1358
+ scores[metric.name] = fallback_score
1141
1359
  continue
1142
- fallback_score = await self._score_faithfulness_with_fallback(sample)
1143
- if fallback_score is not None:
1144
- scores[metric.name] = fallback_score
1145
- continue
1146
1360
  try:
1147
1361
  # Ragas >=0.4 uses ascore() with kwargs
1148
1362
  if hasattr(metric, "ascore"):
@@ -1270,6 +1484,32 @@
         normalized = str(domain).strip().lower()
         return cls.SUMMARY_SCORE_COEFF_BY_DOMAIN.get(normalized, cls.SUMMARY_SCORE_COEFF)
 
+    def _build_custom_prompt_snapshots(self, snapshot: dict[str, Any]) -> dict[str, dict[str, Any]]:
+        entries = snapshot.get("metrics") if isinstance(snapshot, dict) else None
+        if not isinstance(entries, list):
+            return {}
+        prompt_snapshot: dict[str, dict[str, Any]] = {}
+        for entry in entries:
+            if not isinstance(entry, dict):
+                continue
+            name = entry.get("metric_name")
+            if not isinstance(name, str) or not name:
+                continue
+            evaluation_process = entry.get("evaluation_process")
+            if not isinstance(evaluation_process, str) or not evaluation_process:
+                continue
+            rules = entry.get("rules") if isinstance(entry.get("rules"), dict) else None
+            prompts: dict[str, str] = {"rule": evaluation_process}
+            if rules:
+                prompts["rules"] = json.dumps(rules, ensure_ascii=False, indent=2)
+            prompt_snapshot[name] = {
+                "prompts": prompts,
+                "source": "custom_rules",
+                "rules": rules,
+                "inputs": entry.get("inputs"),
+            }
+        return prompt_snapshot
+
     def _build_summary_score_metric(self, metric_class, ragas_llm, coeff: float | None = None):
         if coeff is None:
             coeff = self.SUMMARY_SCORE_COEFF
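_build_custom_prompt_snapshots expects the snapshot from build_custom_metric_snapshot to carry a "metrics" list whose entries provide metric_name and evaluation_process (plus optional rules and inputs); entries missing either required field are skipped. A hedged sketch of the input and output shapes (all field values below are invented):

# Invented input in the shape the helper validates.
custom_snapshot = {
    "metrics": [
        {
            "metric_name": "summary_accuracy",
            "evaluation_process": "Compare the summary against the source, sentence by sentence.",
            "rules": {"penalize_hallucination": True},
            "inputs": ["answer", "contexts"],
        },
        {"metric_name": "incomplete_entry"},  # no evaluation_process -> skipped
    ]
}

# Expected result shape: one entry per valid metric, with rules serialized into the prompt bundle.
expected = {
    "summary_accuracy": {
        "prompts": {
            "rule": "Compare the summary against the source, sentence by sentence.",
            "rules": '{\n  "penalize_hallucination": true\n}',
        },
        "source": "custom_rules",
        "rules": {"penalize_hallucination": True},
        "inputs": ["answer", "contexts"],
    }
}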
@@ -1651,9 +1891,11 @@
                     contexts=test_case.contexts,
                 )
             else:
-                score = metric_instance.score(
+                score = self._score_custom_metric_with_metadata(
+                    metric_instance,
                     answer=test_case.answer,
                     contexts=test_case.contexts,
+                    metadata=test_case.metadata,
                 )
             scores[metric_name] = score
 
@@ -1674,6 +1916,19 @@
 
         return results
 
+    def _score_custom_metric_with_metadata(
+        self,
+        metric_instance: Any,
+        *,
+        answer: str,
+        contexts: list[str],
+        metadata: dict[str, Any],
+    ) -> float:
+        try:
+            return float(metric_instance.score(answer=answer, contexts=contexts, metadata=metadata))
+        except TypeError:
+            return float(metric_instance.score(answer=answer, contexts=contexts))
+
     def _calculate_cost(self, model_name: str, prompt_tokens: int, completion_tokens: int) -> float:
         """Calculate estimated cost in USD based on model pricing."""
         # Find matching model key (exact or substring match)
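The new helper passes metadata through to custom metrics but falls back to the old two-argument call when a metric's score() does not accept it, so older metric classes keep working. A small sketch of that behavior with hypothetical metric classes (not evalvault types):

from typing import Any

class LegacyMetric:
    # Hypothetical metric predating the metadata-aware signature.
    def score(self, *, answer: str, contexts: list[str]) -> float:
        return 1.0 if contexts else 0.0

class MetadataMetric:
    # Hypothetical metric that uses per-test-case metadata.
    def score(self, *, answer: str, contexts: list[str], metadata: dict[str, Any]) -> float:
        return 1.0 if metadata.get("domain") == "insurance" else 0.5

def score_with_metadata(metric: Any, *, answer: str, contexts: list[str], metadata: dict[str, Any]) -> float:
    try:
        return float(metric.score(answer=answer, contexts=contexts, metadata=metadata))
    except TypeError:
        # Metric does not accept `metadata`; retry with the legacy signature.
        return float(metric.score(answer=answer, contexts=contexts))

assert score_with_metadata(MetadataMetric(), answer="a", contexts=["c"], metadata={"domain": "insurance"}) == 1.0
assert score_with_metadata(LegacyMetric(), answer="a", contexts=["c"], metadata={}) == 1.0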