eval-framework 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161) hide show
  1. eval_framework/__init__.py +7 -0
  2. eval_framework/base_config.py +36 -0
  3. eval_framework/context/__init__.py +0 -0
  4. eval_framework/context/determined.py +170 -0
  5. eval_framework/context/eval.py +114 -0
  6. eval_framework/context/local.py +52 -0
  7. eval_framework/evaluation_generator.py +231 -0
  8. eval_framework/exceptions.py +2 -0
  9. eval_framework/external/ifeval_impl/README.md +5 -0
  10. eval_framework/external/ifeval_impl/instructions.py +1523 -0
  11. eval_framework/external/ifeval_impl/instructions_registry.py +161 -0
  12. eval_framework/external/ifeval_impl/instructions_util.py +1689 -0
  13. eval_framework/external/ifeval_impl/utils.py +135 -0
  14. eval_framework/llm/__init__.py +0 -0
  15. eval_framework/llm/aleph_alpha.py +323 -0
  16. eval_framework/llm/base.py +58 -0
  17. eval_framework/llm/huggingface.py +332 -0
  18. eval_framework/llm/mistral.py +73 -0
  19. eval_framework/llm/models.py +16 -0
  20. eval_framework/llm/openai.py +205 -0
  21. eval_framework/llm/vllm.py +438 -0
  22. eval_framework/logger.py +3 -0
  23. eval_framework/main.py +187 -0
  24. eval_framework/metrics/__init__.py +0 -0
  25. eval_framework/metrics/base.py +40 -0
  26. eval_framework/metrics/completion/__init__.py +1 -0
  27. eval_framework/metrics/completion/accuracy_completion.py +16 -0
  28. eval_framework/metrics/completion/bleu.py +76 -0
  29. eval_framework/metrics/completion/chrf.py +62 -0
  30. eval_framework/metrics/completion/code_assertion.py +44 -0
  31. eval_framework/metrics/completion/code_execution_pass_at_one.py +126 -0
  32. eval_framework/metrics/completion/comet.py +56 -0
  33. eval_framework/metrics/completion/concordance_index.py +38 -0
  34. eval_framework/metrics/completion/csv_format.py +102 -0
  35. eval_framework/metrics/completion/cwe_accuracy.py +49 -0
  36. eval_framework/metrics/completion/exponential_similarity.py +65 -0
  37. eval_framework/metrics/completion/f1.py +42 -0
  38. eval_framework/metrics/completion/format_checker.py +56 -0
  39. eval_framework/metrics/completion/grid_difference.py +77 -0
  40. eval_framework/metrics/completion/ifeval.py +73 -0
  41. eval_framework/metrics/completion/json_format.py +171 -0
  42. eval_framework/metrics/completion/language_checker.py +74 -0
  43. eval_framework/metrics/completion/length_control.py +83 -0
  44. eval_framework/metrics/completion/math_reasoning_completion.py +303 -0
  45. eval_framework/metrics/completion/niah_accuracy.py +163 -0
  46. eval_framework/metrics/completion/placeholder_checker.py +27 -0
  47. eval_framework/metrics/completion/repetition.py +88 -0
  48. eval_framework/metrics/completion/rouge_1.py +35 -0
  49. eval_framework/metrics/completion/rouge_2.py +45 -0
  50. eval_framework/metrics/completion/rouge_geometric_mean.py +36 -0
  51. eval_framework/metrics/completion/rouge_l.py +52 -0
  52. eval_framework/metrics/completion/struct_eval_metrics.py +248 -0
  53. eval_framework/metrics/completion/ter.py +67 -0
  54. eval_framework/metrics/completion/text_counter.py +182 -0
  55. eval_framework/metrics/efficiency/__init__.py +0 -0
  56. eval_framework/metrics/efficiency/bytes_per_sequence_position.py +48 -0
  57. eval_framework/metrics/llm/__init__.py +0 -0
  58. eval_framework/metrics/llm/base.py +8 -0
  59. eval_framework/metrics/llm/graders/chatbot_style_grader.py +92 -0
  60. eval_framework/metrics/llm/graders/comparison_grader.py +146 -0
  61. eval_framework/metrics/llm/graders/conciseness_grader.py +93 -0
  62. eval_framework/metrics/llm/graders/contains_names_grader.py +71 -0
  63. eval_framework/metrics/llm/graders/format_correctness_grader.py +109 -0
  64. eval_framework/metrics/llm/graders/instruction_grader.py +177 -0
  65. eval_framework/metrics/llm/graders/language.py +56 -0
  66. eval_framework/metrics/llm/graders/long_context_grader.py +72 -0
  67. eval_framework/metrics/llm/graders/models.py +74 -0
  68. eval_framework/metrics/llm/graders/refusal_grader.py +57 -0
  69. eval_framework/metrics/llm/graders/sql_quality_grader.py +145 -0
  70. eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +103 -0
  71. eval_framework/metrics/llm/llm_judge_chatbot_style.py +36 -0
  72. eval_framework/metrics/llm/llm_judge_completion_accuracy.py +39 -0
  73. eval_framework/metrics/llm/llm_judge_conciseness.py +37 -0
  74. eval_framework/metrics/llm/llm_judge_contains_names.py +36 -0
  75. eval_framework/metrics/llm/llm_judge_format_correctness.py +43 -0
  76. eval_framework/metrics/llm/llm_judge_instruction.py +58 -0
  77. eval_framework/metrics/llm/llm_judge_mtbench_pair.py +205 -0
  78. eval_framework/metrics/llm/llm_judge_mtbench_single.py +188 -0
  79. eval_framework/metrics/llm/llm_judge_refusal.py +35 -0
  80. eval_framework/metrics/llm/llm_judge_sql.py +394 -0
  81. eval_framework/metrics/llm/llm_judge_world_knowledge.py +37 -0
  82. eval_framework/metrics/loglikelihood/__init__.py +0 -0
  83. eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +51 -0
  84. eval_framework/metrics/loglikelihood/probability_mass.py +56 -0
  85. eval_framework/py.typed +0 -0
  86. eval_framework/response_generator.py +416 -0
  87. eval_framework/result_processors/__init__.py +0 -0
  88. eval_framework/result_processors/base.py +74 -0
  89. eval_framework/result_processors/hf_processor.py +87 -0
  90. eval_framework/result_processors/result_processor.py +129 -0
  91. eval_framework/run.py +314 -0
  92. eval_framework/run_direct.py +42 -0
  93. eval_framework/shared/types.py +227 -0
  94. eval_framework/tasks/__init__.py +6 -0
  95. eval_framework/tasks/base.py +314 -0
  96. eval_framework/tasks/benchmarks/__init__.py +0 -0
  97. eval_framework/tasks/benchmarks/arc.py +46 -0
  98. eval_framework/tasks/benchmarks/arc_de.py +46 -0
  99. eval_framework/tasks/benchmarks/arc_fi.py +46 -0
  100. eval_framework/tasks/benchmarks/belebele.py +60 -0
  101. eval_framework/tasks/benchmarks/bigcodebench.py +155 -0
  102. eval_framework/tasks/benchmarks/casehold.py +47 -0
  103. eval_framework/tasks/benchmarks/chembench.py +85 -0
  104. eval_framework/tasks/benchmarks/copa.py +39 -0
  105. eval_framework/tasks/benchmarks/duc.py +91 -0
  106. eval_framework/tasks/benchmarks/flores200.py +62 -0
  107. eval_framework/tasks/benchmarks/flores_plus.py +84 -0
  108. eval_framework/tasks/benchmarks/gpqa.py +177 -0
  109. eval_framework/tasks/benchmarks/gsm8k.py +148 -0
  110. eval_framework/tasks/benchmarks/hellaswag.py +44 -0
  111. eval_framework/tasks/benchmarks/hellaswag_de.py +52 -0
  112. eval_framework/tasks/benchmarks/humaneval.py +97 -0
  113. eval_framework/tasks/benchmarks/ifeval.py +78 -0
  114. eval_framework/tasks/benchmarks/include.py +119 -0
  115. eval_framework/tasks/benchmarks/infinitebench.py +302 -0
  116. eval_framework/tasks/benchmarks/math_reasoning.py +569 -0
  117. eval_framework/tasks/benchmarks/mbpp.py +192 -0
  118. eval_framework/tasks/benchmarks/mmlu.py +190 -0
  119. eval_framework/tasks/benchmarks/mmlu_de.py +109 -0
  120. eval_framework/tasks/benchmarks/mmlu_pro.py +139 -0
  121. eval_framework/tasks/benchmarks/mmmlu.py +529 -0
  122. eval_framework/tasks/benchmarks/openbookqa.py +37 -0
  123. eval_framework/tasks/benchmarks/opengptx_eu20.py +363 -0
  124. eval_framework/tasks/benchmarks/pawsx.py +65 -0
  125. eval_framework/tasks/benchmarks/piqa.py +39 -0
  126. eval_framework/tasks/benchmarks/quality.py +56 -0
  127. eval_framework/tasks/benchmarks/sciq.py +44 -0
  128. eval_framework/tasks/benchmarks/sphyr.py +75 -0
  129. eval_framework/tasks/benchmarks/squad.py +89 -0
  130. eval_framework/tasks/benchmarks/struct_eval.py +110 -0
  131. eval_framework/tasks/benchmarks/tablebench.py +117 -0
  132. eval_framework/tasks/benchmarks/triviaqa.py +42 -0
  133. eval_framework/tasks/benchmarks/truthfulqa.py +95 -0
  134. eval_framework/tasks/benchmarks/winogender.py +39 -0
  135. eval_framework/tasks/benchmarks/winogrande.py +44 -0
  136. eval_framework/tasks/benchmarks/winox.py +57 -0
  137. eval_framework/tasks/benchmarks/wmt.py +160 -0
  138. eval_framework/tasks/benchmarks/zero_scrolls.py +197 -0
  139. eval_framework/tasks/eval_config.py +112 -0
  140. eval_framework/tasks/perturbation.py +83 -0
  141. eval_framework/tasks/registry.py +186 -0
  142. eval_framework/tasks/task_loader.py +80 -0
  143. eval_framework/tasks/task_names.py +138 -0
  144. eval_framework/tasks/utils.py +578 -0
  145. eval_framework/utils/constants.py +9 -0
  146. eval_framework/utils/generate_task_docs.py +229 -0
  147. eval_framework/utils/helpers.py +3 -0
  148. eval_framework/utils/logging.py +50 -0
  149. eval_framework/utils/packaging.py +52 -0
  150. eval_framework-0.2.0.dist-info/METADATA +514 -0
  151. eval_framework-0.2.0.dist-info/RECORD +161 -0
  152. eval_framework-0.2.0.dist-info/WHEEL +4 -0
  153. eval_framework-0.2.0.dist-info/entry_points.txt +3 -0
  154. template_formatting/README.md +83 -0
  155. template_formatting/__init__.py +0 -0
  156. template_formatting/formatter.py +536 -0
  157. template_formatting/mistral_formatter.py +159 -0
  158. template_formatting/py.typed +0 -0
  159. template_formatting/tests/test_formatter_eval.py +408 -0
  160. template_formatting/tests/test_formatter_scaling.py +253 -0
  161. template_formatting/tests/test_mistral_formatter.py +136 -0
@@ -0,0 +1,103 @@
1
+ from collections.abc import Mapping
2
+
3
+ from eval_framework.llm.base import BaseLLM as StructuredOutputChatModel
4
+ from eval_framework.metrics.llm.graders.language import Language
5
+ from eval_framework.metrics.llm.graders.models import GradingOutput, PromptTemplate, parse_json_output
6
+
7
+
8
class SummarizationWorldKnowledgeGradingOutput(GradingOutput):
    """Judge verdict on whether a summary contains information beyond its reference text.

    Both fields are optional: the grader in this file fills them with
    ``dict.get`` on the parsed judge answer, so a missing key becomes ``None``.
    """

    # Reasoning text taken from the judge's "contains_world_knowledge_thought_process" key.
    contains_world_knowledge_thought_process: str | None
    # Verdict taken from the judge's "contains_world_knowledge" key.
    contains_world_knowledge: bool | None
11
+
12
+
13
class SummarizationWorldKnowledgeGrader:
    """Ask a judge model whether a summary adds information not found in its reference text.

    Each language has a ``PromptTemplate`` whose user prompt carries two
    placeholders, ``{reference_input}`` and ``{completion}``; the user prompts
    are f-strings with tripled braces so the placeholder survives as a
    single-brace field for later substitution.
    """

    REFERENCE_INPUT_KEY = "reference_input"
    COMPLETION_KEY = "completion"
    PROMPT_TEMPLATES = {
        Language("de"): PromptTemplate(
            system_prompt="""Deine Aufgabe ist es, zu bewerten ob eine Zusammenfassung Informationen, die über den Referenztext hinausgehen (auch genannt "Weltwissen") enthält.

Gebe die Antwort im folgenden JSON-Format:
{
"contains_world_knowledge_thought_process": str (Achte sehr genau auf die Antwort und argumentiere in ein paar Sätzen, ob die Zusammenfassung Informationen enthält, die über den Referenztext hinausgehen),
"contains_world_knowledge": bool (Enthält die Zusammenfassung Informationen die über den Referenztext hinausgehen?)
}""",  # noqa: E501
            user_prompt=f"""**Referenztext**
{{{REFERENCE_INPUT_KEY}}}

---
**Zusammenfassung**
{{{COMPLETION_KEY}}}""",
        ),
        Language("en"): PromptTemplate(
            system_prompt="""Your task is to evaluate a summary regarding whether it contains information that goes beyond the reference text (also known as "world knowledge").

You must provide your evaluation in the following JSON format:
{
"contains_world_knowledge_thought_process": str (Pay very close attention to the summary and argue whether the response contains world knowledge or not in a few sentences),
"contains_world_knowledge": bool (Does the summary contain information that goes beyond the reference text?),
}""",  # noqa: E501
            user_prompt=f"""**Reference Text**
{{{REFERENCE_INPUT_KEY}}}

---
**Summary**
{{{COMPLETION_KEY}}}""",
        ),
        Language("fr"): PromptTemplate(
            system_prompt="""Votre tâche consiste à évaluer une résumé pour déterminer s'il contient des informations qui vont au-delà du texte de référence (également appelé "connaissance du monde").

Vous devez fournir votre évaluation dans le format JSON suivant :
{
"contains_world_knowledge_thought_process": str (Prêtez une attention particulière au résumé et argumentez si le résumé contient des informations qui vont au-delà du texte de référence ou non en quelques phrases),
"contains_world_knowledge": bool (Le résumé contient-il des informations qui vont au-delà du texte de référence ?),
}""",  # noqa: E501
            user_prompt=f"""**Texte de référence**
{{{REFERENCE_INPUT_KEY}}}

---
**Résumé**
{{{COMPLETION_KEY}}}""",
        ),
    }

    def __init__(
        self,
        grading_model: StructuredOutputChatModel,
        prompt_templates: Mapping[Language, PromptTemplate] = PROMPT_TEMPLATES,
    ) -> None:
        """Store the judge model and validate the prompt templates.

        Args:
            grading_model: Model used to produce the judge completion.
            prompt_templates: Per-language templates; defaults to ``PROMPT_TEMPLATES``.

        Raises:
            ValueError: If any template's user prompt lacks the reference-input
                key or the completion key.
        """
        self._grading_model = grading_model

        # Substring check on the bare key names, exactly as before: every
        # template must mention both keys somewhere in its user prompt.
        required_keys = (self.REFERENCE_INPUT_KEY, self.COMPLETION_KEY)
        templates_are_valid = all(
            key in prompt_template.user_prompt
            for key in required_keys
            for prompt_template in prompt_templates.values()
        )
        if not templates_are_valid:
            # Both literals need the f prefix; adjacent literals are formatted
            # independently, so the second one used to print its braces verbatim.
            raise ValueError(
                f"At least one PromptTemplate is invalid, must contain '{self.COMPLETION_KEY}' "
                f"and '{self.REFERENCE_INPUT_KEY}'."
            )
        self._prompt_templates = prompt_templates

    def grade(
        self, reference_input: str, completion: str, language: Language
    ) -> SummarizationWorldKnowledgeGradingOutput:
        """Grade one summary against its reference text.

        Args:
            reference_input: Text the summary was produced from.
            completion: Summary to be judged.
            language: Used to pick the prompt template via ``language.language_config``.

        Returns:
            The parsed verdict together with the judge prompt and raw judge
            response. Verdict fields are ``None`` when the parsed JSON lacks them.
        """
        prompt_template = language.language_config(self._prompt_templates)

        # The first argument is passed as an empty list, as in the original call;
        # the second supplies the values for the two user-prompt placeholders.
        messages = prompt_template.to_messages(
            [],
            [
                (self.REFERENCE_INPUT_KEY, reference_input),
                (self.COMPLETION_KEY, completion),
            ],
        )

        # The model takes a batch; send a batch of one and take its single result.
        raw_completion = self._grading_model.generate_from_messages([messages])[0]
        loaded_json = parse_json_output(raw_completion.completion)

        return SummarizationWorldKnowledgeGradingOutput(
            contains_world_knowledge_thought_process=loaded_json.get("contains_world_knowledge_thought_process"),
            contains_world_knowledge=loaded_json.get("contains_world_knowledge"),
            judge_prompt=raw_completion.prompt,
            judge_response=raw_completion.completion,
        )
@@ -0,0 +1,36 @@
1
+ from eval_framework.llm.base import BaseLLM
2
+ from eval_framework.metrics.base import MetricResult
3
+ from eval_framework.metrics.llm.base import BaseLLMJudgeMetric
4
+ from eval_framework.metrics.llm.graders.chatbot_style_grader import ChatbotStyleGrader
5
+ from eval_framework.metrics.llm.graders.language import Language
6
+ from eval_framework.shared.types import Completion
7
+
8
+
9
class LLMJudgeChatbotStyle(BaseLLMJudgeMetric):
    """Metric that asks a judge model whether a completion reads as chatbot style."""

    NAME = "Chatbot Style"

    def __init__(self, llm_judge: BaseLLM):
        super().__init__(llm_judge)
        self._grader = ChatbotStyleGrader(llm_judge)

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Return a single-element list holding the chatbot-style score.

        A response that already carries an error is not graded; its error is
        passed through with a ``None`` value. Otherwise the judge's boolean
        verdict is converted to a float, or ``None`` if the judge gave no verdict.
        """
        failure = response.error
        if failure is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=failure)]

        instruction_language = Language(response.get_instruction_language())
        verdict = self._grader.grade(
            completion=response.sanitized_completion,
            language=instruction_language,
        )

        flag = verdict.is_chatbot_style
        score = None if flag is None else float(flag)

        outcome = MetricResult(
            metric_name=self.NAME,
            value=score,
            higher_is_better=True,
            llm_judge_prompt=verdict.judge_prompt,
            llm_judge_response=verdict.judge_response,
            error=response.error,
        )
        return [outcome]
@@ -0,0 +1,39 @@
1
+ from eval_framework.llm.base import BaseLLM
2
+ from eval_framework.metrics.base import MetricResult
3
+ from eval_framework.metrics.llm.base import BaseLLMJudgeMetric
4
+ from eval_framework.metrics.llm.graders.language import Language
5
+ from eval_framework.metrics.llm.graders.long_context_grader import LongContextGrader
6
+ from eval_framework.shared.types import Completion
7
+
8
+
9
class LLMJudgeCompletionAccuracy(BaseLLMJudgeMetric):
    """Metric that asks a judge model whether a completion matches the expected output."""

    NAME = "Judge Completion Accuracy"

    def __init__(self, llm_judge: BaseLLM):
        super().__init__(llm_judge)
        self._grader = LongContextGrader(llm_judge)

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Return a single-element list holding the judged accuracy.

        A response that already carries an error is not graded; its error is
        passed through with a ``None`` value. The ground truth must be a single
        string (enforced with an assertion, as before).
        """
        failure = response.error
        if failure is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=failure)]

        assert isinstance(response.ground_truth, str)

        instruction_language = Language(response.get_instruction_language())
        verdict = self._grader.grade(
            expected_output=response.ground_truth,
            completion=response.sanitized_completion,
            language=instruction_language,
        )

        flag = verdict.answer_is_correct
        score = None if flag is None else float(flag)

        outcome = MetricResult(
            metric_name=self.NAME,
            value=score,
            higher_is_better=True,
            llm_judge_prompt=verdict.judge_prompt,
            llm_judge_response=verdict.judge_response,
            error=response.error,
        )
        return [outcome]
@@ -0,0 +1,37 @@
1
+ from eval_framework.llm.base import BaseLLM
2
+ from eval_framework.metrics.base import MetricResult
3
+ from eval_framework.metrics.llm.base import BaseLLMJudgeMetric
4
+ from eval_framework.metrics.llm.graders.conciseness_grader import ConcisenessGrader
5
+ from eval_framework.metrics.llm.graders.language import Language
6
+ from eval_framework.shared.types import Completion
7
+
8
+
9
class LLMJudgeConciseness(BaseLLMJudgeMetric):
    """Metric that asks a judge model whether a completion is concise for its instruction."""

    NAME = "Conciseness"

    def __init__(self, llm_judge: BaseLLM):
        super().__init__(llm_judge)
        self._grader = ConcisenessGrader(llm_judge)

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Return a single-element list holding the conciseness score.

        A response that already carries an error is not graded; its error is
        passed through with a ``None`` value. Otherwise the judge's boolean
        verdict is converted to a float, or ``None`` if the judge gave no verdict.
        """
        failure = response.error
        if failure is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=failure)]

        instruction_language = Language(response.get_instruction_language())
        verdict = self._grader.grade(
            instruction=response.system_user_instruction,
            completion=response.sanitized_completion,
            language=instruction_language,
        )

        flag = verdict.is_concise
        score = None if flag is None else float(flag)

        outcome = MetricResult(
            metric_name=self.NAME,
            value=score,
            higher_is_better=True,
            llm_judge_prompt=verdict.judge_prompt,
            llm_judge_response=verdict.judge_response,
            error=response.error,
        )
        return [outcome]
@@ -0,0 +1,36 @@
1
+ from eval_framework.llm.base import BaseLLM
2
+ from eval_framework.metrics.base import MetricResult
3
+ from eval_framework.metrics.llm.base import BaseLLMJudgeMetric
4
+ from eval_framework.metrics.llm.graders.contains_names_grader import ContainsNamesGrader
5
+ from eval_framework.metrics.llm.graders.language import Language
6
+ from eval_framework.shared.types import Completion
7
+
8
+
9
class LLMJudgeAvoidsNames(BaseLLMJudgeMetric):
    """Metric that rewards completions in which the judge model finds no names."""

    NAME = "Avoids Names"

    def __init__(self, llm_judge: BaseLLM):
        super().__init__(llm_judge)
        self._grader = ContainsNamesGrader(llm_judge)

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Return a single-element list holding the avoids-names score.

        A response that already carries an error is not graded; its error is
        passed through with a ``None`` value. The grader reports whether names
        are present, so the verdict is negated before conversion to a float.
        """
        failure = response.error
        if failure is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=failure)]

        instruction_language = Language(response.get_instruction_language())
        verdict = self._grader.grade(
            completion=response.sanitized_completion,
            language=instruction_language,
        )

        has_names = verdict.contains_names
        # Invert: the metric is "avoids names", the grader answers "contains names".
        score = None if has_names is None else float(not has_names)

        outcome = MetricResult(
            metric_name=self.NAME,
            value=score,
            higher_is_better=True,
            llm_judge_prompt=verdict.judge_prompt,
            llm_judge_response=verdict.judge_response,
            error=response.error,
        )
        return [outcome]
@@ -0,0 +1,43 @@
1
+ from eval_framework.llm.base import BaseLLM
2
+ from eval_framework.metrics.base import (
3
+ MetricResult,
4
+ )
5
+ from eval_framework.metrics.llm.base import BaseLLMJudgeMetric
6
+ from eval_framework.metrics.llm.graders.format_correctness_grader import FormatCorrectnessGrader
7
+ from eval_framework.metrics.llm.graders.language import Language
8
+ from eval_framework.shared.types import BaseMetricContext, Completion, LanguageMetricContext, extract_context_metric
9
+
10
+
11
class LLMJudgeFormatCorrectnessContext(BaseMetricContext):
    """Metric context carrying a single ``language`` string.

    NOTE(review): LLMJudgeFormatCorrectness.calculate in this file extracts
    ``LanguageMetricContext`` rather than this class — confirm which one task
    definitions are expected to supply.
    """

    # Language identifier as a plain string.
    language: str
13
+
14
+
15
class LLMJudgeFormatCorrectness(BaseLLMJudgeMetric):
    """Metric that asks a judge model whether a completion has the requested format."""

    NAME = "Format Correctness"

    def __init__(self, llm_judge: BaseLLM):
        super().__init__(llm_judge)
        self._grader = FormatCorrectnessGrader(llm_judge)

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Return a single-element list holding the format-correctness score.

        A response that already carries an error is not graded; its error is
        passed through with a ``None`` value. The grading language is read from
        the response's ``LanguageMetricContext``.
        """
        failure = response.error
        if failure is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=failure)]

        metric_context = extract_context_metric(response, LanguageMetricContext)

        verdict = self._grader.grade(
            instruction=response.system_user_instruction,
            completion=response.sanitized_completion,
            language=Language(metric_context.language),
        )

        correctness = verdict.format_correctness
        score = None if correctness is None else float(correctness)

        outcome = MetricResult(
            metric_name=self.NAME,
            value=score,
            higher_is_better=True,
            llm_judge_prompt=verdict.judge_prompt,
            llm_judge_response=verdict.judge_response,
            error=response.error,
        )
        return [outcome]
@@ -0,0 +1,58 @@
1
+ from eval_framework.llm.base import BaseLLM
2
+ from eval_framework.metrics.base import MetricResult
3
+ from eval_framework.metrics.llm.base import BaseLLMJudgeMetric
4
+ from eval_framework.metrics.llm.graders.instruction_grader import InstructionGrader
5
+ from eval_framework.metrics.llm.graders.language import Language
6
+ from eval_framework.shared.types import Completion
7
+
8
+
9
class LLMJudgeInstruction(BaseLLMJudgeMetric):
    """Metric that reports one judge-model score per instruction-following criterion.

    Every call to ``calculate`` yields one ``MetricResult`` per entry in
    ``KEYS``, named ``"<NAME>/<key>"``, on both the error path and the
    graded path.
    """

    NAME = "Instruction Following"
    KEYS = [
        "quality",
        "is_following_instruction",
        "has_correct_grammar_and_spelling",
        "is_context_consistent",
        "is_not_repeating",
        "is_trustworthy",
        "is_safe",
    ]

    def __init__(self, llm_judge: BaseLLM):
        super().__init__(llm_judge)
        self._grader = InstructionGrader(llm_judge)

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Grade a completion and return one result per criterion in ``KEYS``.

        Args:
            response: Completion to grade.

        Returns:
            A list with ``len(KEYS)`` results. If ``response.error`` is set, no
            grading happens and every result has ``value=None`` plus that error.
            Otherwise each value is the grader attribute of the same name
            converted to float, or ``None`` when the grader left it unset.
        """
        if response.error is not None:
            # Previously the return sat inside the loop, so only the first key
            # was reported, and under a " - " name that did not match the
            # "/" names used for graded results.
            return [
                MetricResult(
                    metric_name=f"{self.NAME}/{key}",
                    value=None,
                    higher_is_better=True,
                    error=response.error,
                )
                for key in self.KEYS
            ]

        language = Language(response.get_instruction_language())

        grading = self._grader.grade(
            instruction=response.system_user_instruction,
            completion=response.sanitized_completion,
            language=language,
        )

        results = []
        for key in self.KEYS:
            raw_value = getattr(grading, key)
            if raw_value is None:
                value = None
            elif key == "quality":
                # [0, 1] normalization required for visualizer: maps 1 -> 0 and 5 -> 1
                # (presumably the judge scores quality on a 1-5 scale — confirm in the grader).
                value = (float(raw_value) - 1) / 4
            else:
                value = float(raw_value)
            results.append(
                MetricResult(
                    metric_name=f"{self.NAME}/{key}",
                    value=value,
                    higher_is_better=True,
                    llm_judge_prompt=grading.judge_prompt,
                    llm_judge_response=grading.judge_response,
                    error=response.error,
                )
            )
        return results
@@ -0,0 +1,205 @@
1
+ import re
2
+
3
+ from pydantic import BaseModel
4
+
5
+ from eval_framework.logger import logger
6
+ from eval_framework.metrics.base import MetricResult
7
+ from eval_framework.metrics.llm.base import BaseLLMJudgeMetric
8
+ from eval_framework.shared.types import BaseMetricContext, Completion, extract_context_metric
9
+ from template_formatting.formatter import Message, Role
10
+
11
# English judge prompt templates for pairwise comparison of two assistants ("A" vs "B").
# Keys select the scenario: single or multi turn, with or without a reference answer.
# The {placeholders} are filled via str.format in generate_pair_judge_prompts; this set is
# used whenever the response subject starts with neither "de" nor "fi".
PAIR_JUDGE_PROMPTS = {
    "pair_assistant_single_turn": {
        "prompt_template": "[System]\nPlease act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below. You should choose the assistant that follows the user's instructions and answers the user's question better. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their responses. Begin your evaluation by comparing the two responses and provide a short explanation. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.\n\n[User Question]\n{question}\n\n[The Start of Assistant A's Answer]\n{answer_a}\n[The End of Assistant A's Answer]\n\n[The Start of Assistant B's Answer]\n{answer_b}\n[The End of Assistant B's Answer]"  # noqa: E501
    },
    "pair_assistant_multi_turn": {
        "prompt_template": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user questions. You should choose the assistant that follows the user's instructions and answers the user's questions better. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their responses. You should focus on who provides a better answer to the second user question. Begin your evaluation by comparing the responses of the two assistants and provide a short explanation. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.\n\n<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_a_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_a_2}\n\n<|The End of Assistant A's Conversation with User|>\n\n\n<|The Start of Assistant B's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant B:\n{answer_b_1}\n\n### User:\n{question_2}\n\n### Assistant B:\n{answer_b_2}\n\n<|The End of Assistant B's Conversation with User|>"  # noqa: E501
    },
    "pair_assistant_single_turn_w_reference": {
        "prompt_template": "[System]\nPlease act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below. Your evaluation should consider correctness and helpfulness. You will be given a reference answer, assistant A's answer, and assistant B's answer. Your job is to evaluate which assistant's answer is better. Begin your evaluation by comparing both assistants' answers with the reference answer. Identify and correct any mistakes. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.\n\n[User Question]\n{question}\n\n[The Start of Reference Answer]\n{ref_answer_1}\n[The End of Reference Answer]\n\n[The Start of Assistant A's Answer]\n{answer_a}\n[The End of Assistant A's Answer]\n\n[The Start of Assistant B's Answer]\n{answer_b}\n[The End of Assistant B's Answer]"  # noqa: E501
    },
    "pair_assistant_multi_turn_w_reference": {
        "prompt_template": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user questions. Your evaluation should consider correctness and helpfulness. You will be given reference answers, the assistant A's answers, the assistant B's answers. Your job is to determine which assistant provides correct and helpful answers to the second user question. Begin your evaluation by comparing both assistants' answers with the reference answers. Identify and correct any mistakes. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.\n\n<|The Start of Reference Answer|>\n\n### User:\n{question_1}\n\n### Reference answer:\n{ref_answer_1}\n\n### User:\n{question_2}\n\n### Reference answer:\n{ref_answer_2}\n\n<|The End of Reference Answer|>\n\n\n<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_a_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_a_2}\n\n<|The End of Assistant A's Conversation with User|>\n\n\n<|The Start of Assistant B's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant B:\n{answer_b_1}\n\n### User:\n{question_2}\n\n### Assistant B:\n{answer_b_2}\n\n<|The End of Assistant B's Conversation with User|>"  # noqa: E501
    },
}
25
+
26
# German judge prompt templates for pairwise comparison of two assistants ("A" vs "B").
# Same keys and {placeholders} as PAIR_JUDGE_PROMPTS; selected in generate_pair_judge_prompts
# when the response subject starts with "de".
# The section delimiters must pair up ("Der Anfang ..." / "Das Ende ...") so the judge can tell
# where each answer starts and stops.
PAIR_JUDGE_PROMPTS_DE = {
    "pair_assistant_single_turn": {
        "prompt_template": '[System]\nBitte agieren Sie als unparteiischer Beurteiler und bewerten Sie die Qualität der Antworten, die von zwei KI-Assistenten auf die unten angezeigte Nutzerfrage gegeben wurden. Wählen Sie den Assistenten aus, der die Anweisungen des Nutzers besser befolgt und die Nutzerfrage besser beantwortet. Ihre Bewertung sollte Faktoren wie Nützlichkeit, Relevanz, Genauigkeit, Tiefe, Kreativität und Detaillierungsgrad der Antworten berücksichtigen. Beginnen Sie Ihre Bewertung mit einem Vergleich der beiden Antworten und geben Sie eine kurze Erklärung ab. Vermeiden Sie jeglichen Bias bezüglich der Position der Antworten und stellen Sie sicher, dass die Reihenfolge, in der die Antworten präsentiert wurden, Ihre Entscheidung nicht beeinflusst. Lassen Sie nicht zu, dass die Länge der Antworten Ihre Bewertung beeinflusst. Bevorzugen Sie keine bestimmten Namen der Assistenten. Seien Sie so objektiv wie möglich. Geben Sie nach Ihrer Erklärung Ihr endgültiges Urteil streng nach folgendem Format aus: "[[A]]" wenn Assistent A besser ist, "[[B]]" wenn Assistent B besser ist und "[[C]]" bei einem Unentschieden.\n\n[Nutzerfrage]\n{question}\n\n[Der Anfang von Assistent A\'s Antwort]\n{answer_a}\n[Das Ende von Assistent A\'s Antwort]\n\n[Der Anfang von Assistent B\'s Antwort]\n{answer_b}\n[Das Ende von Assistent B\'s Antwort]'  # noqa: E501
    },
    "pair_assistant_multi_turn": {
        "prompt_template": 'Bitte agieren Sie als unparteiischer Beurteiler und bewerten Sie die Qualität der Antworten, die von zwei KI-Assistenten auf die Nutzerfragen gegeben wurden. Wählen Sie den Assistenten aus, der die Anweisungen des Nutzers besser befolgt und die Nutzerfragen besser beantwortet. Ihre Bewertung sollte Faktoren wie Nützlichkeit, Relevanz, Genauigkeit, Tiefe, Kreativität und Detaillierungsgrad der Antworten berücksichtigen. Konzentrieren Sie sich darauf, wer die bessere Antwort auf die zweite Nutzerfrage liefert. Beginnen Sie Ihre Bewertung mit einem Vergleich der Antworten der beiden Assistenten und geben Sie eine kurze Erklärung ab. Vermeiden Sie jegliche Positionsvoreingenommenheit und stellen Sie sicher, dass die Reihenfolge, in der die Antworten präsentiert wurden, Ihre Entscheidung nicht beeinflusst. Lassen Sie nicht zu, dass die Länge der Antworten Ihre Bewertung beeinflusst. Bevorzugen Sie keine bestimmten Namen der Assistenten. Seien Sie so objektiv wie möglich. Geben Sie nach Ihrer Erklärung Ihr endgültiges Urteil streng nach folgendem Format aus: "[[A]]" wenn Assistent A besser ist, "[[B]]" wenn Assistent B besser ist und "[[C]]" bei einem Unentschieden.\n\n<|Der Anfang von Assistent A\'s Konversation mit dem User|>\n\n### User:\n{question_1}\n\n### Assistent A:\n{answer_a_1}\n\n### User:\n{question_2}\n\n### Assistent A:\n{answer_a_2}\n\n<|Das Ende von Assistent A\'s Konversation mit dem User|>\n\n\n<|Der Anfang von Assistent B\'s Konversation mit dem User|>\n\n### User:\n{question_1}\n\n### Assistent B:\n{answer_b_1}\n\n### User:\n{question_2}\n\n### Assistent B:\n{answer_b_2}\n\n<|Das Ende von Assistent B\'s Konversation mit dem User|>'  # noqa: E501
    },
    "pair_assistant_single_turn_w_reference": {
        "prompt_template": '[System]\nBitte agieren Sie als unparteiischer Beurteiler und bewerten Sie die Qualität der Antworten, die von zwei KI-Assistenten auf die unten angezeigte Nutzerfrage gegeben wurden. Ihre Bewertung sollte Richtigkeit und Hilfreichkeit berücksichtigen. Sie erhalten eine Referenzantwort, die Antwort von Assistent A und die Antwort von Assistent B. Ihre Aufgabe ist es zu beurteilen, welche Antwort der Assistenten besser ist. Beginnen Sie Ihre Bewertung damit, die Antworten beider Assistenten mit der Referenzantwort zu vergleichen. Identifizieren und korrigieren Sie etwaige Fehler. Vermeiden Sie jegliche Positionsvoreingenommenheit und stellen Sie sicher, dass die Reihenfolge, in der die Antworten präsentiert wurden, Ihre Entscheidung nicht beeinflusst. Lassen Sie nicht zu, dass die Länge der Antworten Ihre Bewertung beeinflusst. Bevorzugen Sie keine bestimmten Namen der Assistenten. Seien Sie so objektiv wie möglich. Geben Sie nach Ihrer Erklärung Ihr endgültiges Urteil streng nach folgendem Format aus: "[[A]]" wenn Assistent A besser ist, "[[B]]" wenn Assistent B besser ist und "[[C]]" bei einem Unentschieden.\n\n[Nutzerfrage]\n{question}\n\n[Der Anfang der Referenzantwort]\n{ref_answer_1}\n[Das Ende der Referenzantwort]\n\n[Der Anfang von Assistent A\'s Antwort]\n{answer_a}\n[Das Ende von Assistent A\'s Antwort]\n\n[Der Anfang von Assistent B\'s Antwort]\n{answer_b}\n[Das Ende von Assistent B\'s Antwort]'  # noqa: E501
    },
    "pair_assistant_multi_turn_w_reference": {
        "prompt_template": 'Bitte agieren Sie als unparteiischer Beurteiler und bewerten Sie die Qualität der Antworten, die von zwei KI-Assistenten auf die Nutzerfragen gegeben wurden. Ihre Bewertung sollte Richtigkeit und Hilfreichkeit berücksichtigen. Sie erhalten Referenzantworten, die Antworten von Assistent A und die Antworten von Assistent B. Ihre Aufgabe ist es zu ermitteln, welcher Assistent richtige und hilfreiche Antworten auf die zweite Nutzerfrage liefert. Beginnen Sie Ihre Bewertung damit, die Antworten beider Assistenten mit den Referenzantworten zu vergleichen. Identifizieren und korrigieren Sie etwaige Fehler. Vermeiden Sie jegliche Positionsvoreingenommenheit und stellen Sie sicher, dass die Reihenfolge, in der die Antworten präsentiert wurden, Ihre Entscheidung nicht beeinflusst. Lassen Sie nicht zu, dass die Länge der Antworten Ihre Bewertung beeinflusst. Bevorzugen Sie keine bestimmten Namen der Assistenten. Seien Sie so objektiv wie möglich. Geben Sie nach Ihrer Erklärung Ihr endgültiges Urteil streng nach folgendem Format aus: "[[A]]" wenn Assistent A besser ist, "[[B]]" wenn Assistent B besser ist und "[[C]]" bei einem Unentschieden.\n\n<|Der Anfang der Referenzantwort|>\n\n### User:\n{question_1}\n\n### Referenzantwort:\n{ref_answer_1}\n\n### User:\n{question_2}\n\n### Referenzantwort:\n{ref_answer_2}\n\n<|Das Ende der Referenzantwort|>\n\n\n<|Der Anfang von Assistent A\'s Konversation mit dem User|>\n\n### User:\n{question_1}\n\n### Assistent A:\n{answer_a_1}\n\n### User:\n{question_2}\n\n### Assistent A:\n{answer_a_2}\n\n<|Das Ende von Assistent A\'s Konversation mit dem User|>\n\n\n<|Der Anfang von Assistent B\'s Konversation mit dem User|>\n\n### User:\n{question_1}\n\n### Assistent B:\n{answer_b_1}\n\n### User:\n{question_2}\n\n### Assistent B:\n{answer_b_2}\n\n<|Das Ende von Assistent B\'s Konversation mit dem User|>'  # noqa: E501
    },
}
40
+
41
# Finnish judge prompt templates for pairwise comparison of two assistants ("A" vs "B").
# Same keys and {placeholders} as PAIR_JUDGE_PROMPTS; selected in generate_pair_judge_prompts
# when the response subject starts with "fi".
PAIR_JUDGE_PROMPTS_FI = {
    "pair_assistant_single_turn": {
        "prompt_template": '[Järjestelmä]\nToimi puolueettomana tuomarina ja arvioi kahden AI-avustajan antamien vastausten laatua alla näkyvään käyttäjän kysymykseen. Sinun tulee valita se avustaja, joka noudattaa käyttäjän ohjeita ja vastaa käyttäjän kysymykseen paremmin. Arviosi tulisi ottaa huomioon tekijät kuten hyödyllisyys, asiaankuuluvuus, tarkkuus, syvällisyys, luovuus ja yksityiskohtien taso. Aloita arviointisi vertaamalla kahta vastausta ja anna lyhyt selitys. Vältä mahdollisia asemointiharhoja ja varmista, että vastausten esitysjärjestys ei vaikuta päätökseesi. Älä anna vastausten pituuden vaikuttaa arvioosi. Älä suosi tiettyjä avustajien nimiä. Ole mahdollisimman objektiivinen. Selityksen jälkeen anna lopullinen päätöksesi noudattamalla tarkasti tätä muotoa: "[[A]]", jos avustaja A on parempi, "[[B]]", jos avustaja B on parempi, ja "[[C]]" tasapelin tapauksessa.\n\n[Käyttäjän kysymys]\n{question}\n\n[Avustaja A:n vastauksen alku]\n{answer_a}\n[Avustaja A:n vastauksen loppu]\n\n[Avustaja B:n vastauksen alku]\n{answer_b}\n[Avustaja B:n vastauksen loppu]'  # noqa: E501
    },
    "pair_assistant_multi_turn": {
        "prompt_template": 'Toimi puolueettomana tuomarina ja arvioi kahden AI-avustajan antamien vastausten laatua käyttäjän kysymyksiin. Sinun tulee valita se avustaja, joka noudattaa käyttäjän ohjeita ja vastaa käyttäjän kysymyksiin paremmin. Arviosi tulisi ottaa huomioon tekijät kuten hyödyllisyys, asiaankuuluvuus, tarkkuus, syvällisyys, luovuus ja yksityiskohtien taso. Arviosi tulisi keskittyä siihen, kuka antaa paremman vastauksen toiseen käyttäjän kysymykseen. Aloita arviointisi vertaamalla kahden avustajan vastauksia ja anna lyhyt selitys. Vältä mahdollisia asemointiharhoja ja varmista, että vastausten esitysjärjestys ei vaikuta päätökseesi. Älä anna vastausten pituuden vaikuttaa arvioosi. Älä suosi tiettyjä avustajien nimiä. Ole mahdollisimman objektiivinen. Selityksen jälkeen anna lopullinen päätöksesi noudattamalla tarkasti tätä muotoa: "[[A]]", jos avustaja A on parempi, "[[B]]", jos avustaja B on parempi, ja "[[C]]" tasapelin tapauksessa.\n\n<|Avustaja A:n keskustelun alku käyttäjän kanssa|>\n\n### Käyttäjä:\n{question_1}\n\n### Avustaja A:\n{answer_a_1}\n\n### Käyttäjä:\n{question_2}\n\n### Avustaja A:\n{answer_a_2}\n\n<|Avustaja A:n keskustelun loppu käyttäjän kanssa|>\n\n\n<|Avustaja B:n keskustelun alku käyttäjän kanssa|>\n\n### Käyttäjä:\n{question_1}\n\n### Avustaja B:\n{answer_b_1}\n\n### Käyttäjä:\n{question_2}\n\n### Avustaja B:\n{answer_b_2}\n\n<|Avustaja B:n keskustelun loppu käyttäjän kanssa|>'  # noqa: E501
    },
    "pair_assistant_single_turn_w_reference": {
        "prompt_template": '[Järjestelmä]\nToimi puolueettomana tuomarina ja arvioi kahden AI-avustajan antamien vastausten laatua alla näkyvään käyttäjän kysymykseen. Arviosi tulisi ottaa huomioon oikeellisuus ja hyödyllisyys. Sinulle annetaan viitevastaus, avustajan A vastaus ja avustajan B vastaus. Tehtäväsi on arvioida, kumpi avustaja antoi paremman vastauksen. Aloita arviointisi vertaamalla molempien avustajien vastauksia viitevastaukseen. Tunnista ja korjaa mahdolliset virheet. Vältä mahdollisia asemointiharhoja ja varmista, että vastausten esitysjärjestys ei vaikuta päätökseesi. Älä anna vastausten pituuden vaikuttaa arvioosi. Älä suosi tiettyjä avustajien nimiä. Ole mahdollisimman objektiivinen. Selityksen jälkeen anna lopullinen päätöksesi noudattamalla tarkasti tätä muotoa: "[[A]]", jos avustaja A on parempi, "[[B]]", jos avustaja B on parempi, ja "[[C]]" tasapelin tapauksessa.\n\n[Käyttäjän kysymys]\n{question}\n\n[Viitevastauksen alku]\n{ref_answer_1}\n[Viitevastauksen loppu]\n\n[Avustaja A:n vastauksen alku]\n{answer_a}\n[Avustaja A:n vastauksen loppu]\n\n[Avustaja B:n vastauksen alku]\n{answer_b}\n[Avustaja B:n vastauksen loppu]'  # noqa: E501
    },
    "pair_assistant_multi_turn_w_reference": {
        "prompt_template": 'Toimi puolueettomana tuomarina ja arvioi kahden AI-avustajan antamien vastausten laatua käyttäjän kysymyksiin. Arviosi tulisi ottaa huomioon oikeellisuus ja hyödyllisyys. Sinulle annetaan viitevastaukset, avustajan A vastaukset ja avustajan B vastaukset. Tehtäväsi on määrittää, kumpi avustaja antoi oikeat ja hyödylliset vastaukset toiseen käyttäjän kysymykseen. Aloita arviointisi vertaamalla molempien avustajien vastauksia viitevastauksiin. Tunnista ja korjaa mahdolliset virheet. Vältä mahdollisia asemointiharhoja ja varmista, että vastausten esitysjärjestys ei vaikuta päätökseesi. Älä anna vastausten pituuden vaikuttaa arvioosi. Älä suosi tiettyjä avustajien nimiä. Ole mahdollisimman objektiivinen. Selityksen jälkeen anna lopullinen päätöksesi noudattamalla tarkasti tätä muotoa: "[[A]]", jos avustaja A on parempi, "[[B]]", jos avustaja B on parempi, ja "[[C]]" tasapelin tapauksessa.\n\n<|Viitevastauksen alku|>\n\n### Käyttäjä:\n{question_1}\n\n### Viitevastaus:\n{ref_answer_1}\n\n### Käyttäjä:\n{question_2}\n\n### Viitevastaus:\n{ref_answer_2}\n\n<|Viitevastauksen loppu|>\n\n\n<|Avustaja A:n keskustelun alku käyttäjän kanssa|>\n\n### Käyttäjä:\n{question_1}\n\n### Avustaja A:\n{answer_a_1}\n\n### Käyttäjä:\n{question_2}\n\n### Avustaja A:\n{answer_a_2}\n\n<|Avustaja A:n keskustelun loppu käyttäjän kanssa|>\n\n\n<|Avustaja B:n keskustelun alku käyttäjän kanssa|>\n\n### Käyttäjä:\n{question_1}\n\n### Avustaja B:\n{answer_b_1}\n\n### Käyttäjä:\n{question_2}\n\n### Avustaja B:\n{answer_b_2}\n\n<|Avustaja B:n keskustelun loppu käyttäjän kanssa|>'  # noqa: E501
    },
}
55
+
56
+
57
# Sample categories for which generate_pair_judge_prompts uses the "*_w_reference" templates,
# i.e. the judge is shown a reference answer alongside the two assistants' answers.
NEED_REF_CATEGORIES = ["math", "reasoning", "coding", "arena-hard-200"]
58
+
59
+
60
+ class PromptToJudge(BaseModel):
61
+ comparison_type: str
62
+ prompt_text: str
63
+
64
+
65
+ class MTBenchJudgePairMetricContext(BaseMetricContext):
66
+ category: str
67
+ answer: list[str] | str
68
+ reference: list[str] | str | None
69
+
70
+
71
+ def generate_pair_judge_prompts(response: Completion) -> list[PromptToJudge]:
72
+ context = extract_context_metric(response, MTBenchJudgePairMetricContext)
73
+ assert response.messages is not None
74
+
75
+ if response.subject.startswith("de"):
76
+ prompt_templates = PAIR_JUDGE_PROMPTS_DE
77
+ elif response.subject.startswith("fi"):
78
+ prompt_templates = PAIR_JUDGE_PROMPTS_FI
79
+ else:
80
+ prompt_templates = PAIR_JUDGE_PROMPTS
81
+ prompts_to_judge = []
82
+
83
+ context = extract_context_metric(response, MTBenchJudgePairMetricContext)
84
+
85
+ assert context.category is not None, "Category must be provided in the context for MTBenchJudgePairMetricContext"
86
+ assert context.answer is not None, "Answer must be provided in the context for MTBenchJudgePairMetricContext"
87
+
88
+ # No reference answer needed
89
+ if context.category not in NEED_REF_CATEGORIES:
90
+ # SINLGE TURN
91
+ if len(response.messages) <= 2:
92
+ # turn 1
93
+ question = response.last_user_instruction
94
+ answer_a = response.completion
95
+ answer_b = context.answer[0]
96
+ # format prompt
97
+ single_turn_prompt = prompt_templates["pair_assistant_single_turn"]["prompt_template"].format(
98
+ question=question, answer_a=answer_a, answer_b=answer_b
99
+ )
100
+ prompts_to_judge.append(PromptToJudge(comparison_type="pairwise_judgement", prompt_text=single_turn_prompt))
101
+
102
+ # MULTI TURN
103
+ else:
104
+ # turn 1
105
+ question_1 = response.first_user_instruction
106
+ answer_a_1 = response.messages[1].content
107
+ answer_b_1 = context.answer[0]
108
+ # turn 2
109
+ question_2 = response.last_user_instruction
110
+ answer_a_2 = response.completion
111
+ answer_b_2 = context.answer[1]
112
+ # format prompt
113
+ multi_turn_prompt = prompt_templates["pair_assistant_multi_turn"]["prompt_template"].format(
114
+ question_1=question_1,
115
+ answer_a_1=answer_a_1,
116
+ answer_b_1=answer_b_1,
117
+ question_2=question_2,
118
+ answer_a_2=answer_a_2,
119
+ answer_b_2=answer_b_2,
120
+ )
121
+ prompts_to_judge.append(PromptToJudge(comparison_type="pairwise_judgement", prompt_text=multi_turn_prompt))
122
+ # Reference answer needed
123
+ elif context.reference:
124
+ # SINGLE TURN
125
+ if len(response.messages) <= 2 and len(context.reference) >= 1:
126
+ # turn 1
127
+ question = response.last_user_instruction
128
+ answer_a = response.completion
129
+ answer_b = context.answer[0]
130
+ ref_answer_1 = context.reference[0]
131
+ # format prompt
132
+ single_turn_prompt = prompt_templates["pair_assistant_single_turn_w_reference"]["prompt_template"].format(
133
+ question=question, answer_a=answer_a, answer_b=answer_b, ref_answer_1=ref_answer_1
134
+ )
135
+ prompts_to_judge.append(PromptToJudge(comparison_type="pairwise_judgement", prompt_text=single_turn_prompt))
136
+ # MULTI TURN
137
+ elif len(context.reference) >= 2:
138
+ # turn 1
139
+ question_1 = response.first_user_instruction
140
+ answer_a_1 = response.messages[1].content
141
+ answer_b_1 = context.answer[0]
142
+ ref_answer_1 = context.reference[0]
143
+ # turn 2
144
+ question_2 = response.last_user_instruction
145
+ answer_a_2 = response.completion
146
+ answer_b_2 = context.answer[1]
147
+ ref_answer_2 = context.reference[1]
148
+ # format prompt
149
+ multi_turn_prompt = prompt_templates["pair_assistant_multi_turn_w_reference"]["prompt_template"].format(
150
+ question_1=question_1,
151
+ answer_a_1=answer_a_1,
152
+ answer_b_1=answer_b_1,
153
+ ref_answer_1=ref_answer_1,
154
+ question_2=question_2,
155
+ answer_a_2=answer_a_2,
156
+ answer_b_2=answer_b_2,
157
+ ref_answer_2=ref_answer_2,
158
+ )
159
+ prompts_to_judge.append(PromptToJudge(comparison_type="pairwise_judgement", prompt_text=multi_turn_prompt))
160
+ else:
161
+ logger.info(
162
+ f"Warning: No reference answer found for this sample (category: "
163
+ f"{context.category}), even though it is needed."
164
+ )
165
+
166
+ return prompts_to_judge
167
+
168
+
169
+ class MTBenchJudgePair(BaseLLMJudgeMetric):
170
+ NAME = "pairwise_judgement"
171
+
172
+ def calculate(self, response: Completion) -> list[MetricResult]:
173
+ try:
174
+ prompts_to_judge: list[PromptToJudge] = generate_pair_judge_prompts(response)
175
+
176
+ all_metrics = []
177
+ for prompt_to_judge in prompts_to_judge:
178
+ messages = [Message(role=Role.USER, content=prompt_to_judge.prompt_text)]
179
+ output = self._llm_judge.generate_from_messages([messages])
180
+ parsed_output = self._output_to_rating(output[0].completion)
181
+
182
+ all_metrics.append(
183
+ MetricResult(
184
+ metric_name=prompt_to_judge.comparison_type,
185
+ value=parsed_output,
186
+ higher_is_better=True,
187
+ )
188
+ )
189
+
190
+ return all_metrics
191
+
192
+ except KeyError as e:
193
+ logger.info(f"LLM judge did not produce an expected output, sample will be ignored in aggregations. {e}")
194
+ return []
195
+
196
+ @staticmethod
197
+ def _output_to_rating(output: str) -> float:
198
+ match = re.search(r"\[\[(.*?)\]\]", output)
199
+ # A = Win, B = Lose, C = Tie
200
+ letters = {"A": 1, "B": 0, "C": 0.5}
201
+ if match:
202
+ value = match.group(1)
203
+ if value in letters:
204
+ return letters[value]
205
+ return 0.5 # Tie