eval-framework 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161) hide show
  1. eval_framework/__init__.py +7 -0
  2. eval_framework/base_config.py +36 -0
  3. eval_framework/context/__init__.py +0 -0
  4. eval_framework/context/determined.py +170 -0
  5. eval_framework/context/eval.py +114 -0
  6. eval_framework/context/local.py +52 -0
  7. eval_framework/evaluation_generator.py +231 -0
  8. eval_framework/exceptions.py +2 -0
  9. eval_framework/external/ifeval_impl/README.md +5 -0
  10. eval_framework/external/ifeval_impl/instructions.py +1523 -0
  11. eval_framework/external/ifeval_impl/instructions_registry.py +161 -0
  12. eval_framework/external/ifeval_impl/instructions_util.py +1689 -0
  13. eval_framework/external/ifeval_impl/utils.py +135 -0
  14. eval_framework/llm/__init__.py +0 -0
  15. eval_framework/llm/aleph_alpha.py +323 -0
  16. eval_framework/llm/base.py +58 -0
  17. eval_framework/llm/huggingface.py +332 -0
  18. eval_framework/llm/mistral.py +73 -0
  19. eval_framework/llm/models.py +16 -0
  20. eval_framework/llm/openai.py +205 -0
  21. eval_framework/llm/vllm.py +438 -0
  22. eval_framework/logger.py +3 -0
  23. eval_framework/main.py +187 -0
  24. eval_framework/metrics/__init__.py +0 -0
  25. eval_framework/metrics/base.py +40 -0
  26. eval_framework/metrics/completion/__init__.py +1 -0
  27. eval_framework/metrics/completion/accuracy_completion.py +16 -0
  28. eval_framework/metrics/completion/bleu.py +76 -0
  29. eval_framework/metrics/completion/chrf.py +62 -0
  30. eval_framework/metrics/completion/code_assertion.py +44 -0
  31. eval_framework/metrics/completion/code_execution_pass_at_one.py +126 -0
  32. eval_framework/metrics/completion/comet.py +56 -0
  33. eval_framework/metrics/completion/concordance_index.py +38 -0
  34. eval_framework/metrics/completion/csv_format.py +102 -0
  35. eval_framework/metrics/completion/cwe_accuracy.py +49 -0
  36. eval_framework/metrics/completion/exponential_similarity.py +65 -0
  37. eval_framework/metrics/completion/f1.py +42 -0
  38. eval_framework/metrics/completion/format_checker.py +56 -0
  39. eval_framework/metrics/completion/grid_difference.py +77 -0
  40. eval_framework/metrics/completion/ifeval.py +73 -0
  41. eval_framework/metrics/completion/json_format.py +171 -0
  42. eval_framework/metrics/completion/language_checker.py +74 -0
  43. eval_framework/metrics/completion/length_control.py +83 -0
  44. eval_framework/metrics/completion/math_reasoning_completion.py +303 -0
  45. eval_framework/metrics/completion/niah_accuracy.py +163 -0
  46. eval_framework/metrics/completion/placeholder_checker.py +27 -0
  47. eval_framework/metrics/completion/repetition.py +88 -0
  48. eval_framework/metrics/completion/rouge_1.py +35 -0
  49. eval_framework/metrics/completion/rouge_2.py +45 -0
  50. eval_framework/metrics/completion/rouge_geometric_mean.py +36 -0
  51. eval_framework/metrics/completion/rouge_l.py +52 -0
  52. eval_framework/metrics/completion/struct_eval_metrics.py +248 -0
  53. eval_framework/metrics/completion/ter.py +67 -0
  54. eval_framework/metrics/completion/text_counter.py +182 -0
  55. eval_framework/metrics/efficiency/__init__.py +0 -0
  56. eval_framework/metrics/efficiency/bytes_per_sequence_position.py +48 -0
  57. eval_framework/metrics/llm/__init__.py +0 -0
  58. eval_framework/metrics/llm/base.py +8 -0
  59. eval_framework/metrics/llm/graders/chatbot_style_grader.py +92 -0
  60. eval_framework/metrics/llm/graders/comparison_grader.py +146 -0
  61. eval_framework/metrics/llm/graders/conciseness_grader.py +93 -0
  62. eval_framework/metrics/llm/graders/contains_names_grader.py +71 -0
  63. eval_framework/metrics/llm/graders/format_correctness_grader.py +109 -0
  64. eval_framework/metrics/llm/graders/instruction_grader.py +177 -0
  65. eval_framework/metrics/llm/graders/language.py +56 -0
  66. eval_framework/metrics/llm/graders/long_context_grader.py +72 -0
  67. eval_framework/metrics/llm/graders/models.py +74 -0
  68. eval_framework/metrics/llm/graders/refusal_grader.py +57 -0
  69. eval_framework/metrics/llm/graders/sql_quality_grader.py +145 -0
  70. eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +103 -0
  71. eval_framework/metrics/llm/llm_judge_chatbot_style.py +36 -0
  72. eval_framework/metrics/llm/llm_judge_completion_accuracy.py +39 -0
  73. eval_framework/metrics/llm/llm_judge_conciseness.py +37 -0
  74. eval_framework/metrics/llm/llm_judge_contains_names.py +36 -0
  75. eval_framework/metrics/llm/llm_judge_format_correctness.py +43 -0
  76. eval_framework/metrics/llm/llm_judge_instruction.py +58 -0
  77. eval_framework/metrics/llm/llm_judge_mtbench_pair.py +205 -0
  78. eval_framework/metrics/llm/llm_judge_mtbench_single.py +188 -0
  79. eval_framework/metrics/llm/llm_judge_refusal.py +35 -0
  80. eval_framework/metrics/llm/llm_judge_sql.py +394 -0
  81. eval_framework/metrics/llm/llm_judge_world_knowledge.py +37 -0
  82. eval_framework/metrics/loglikelihood/__init__.py +0 -0
  83. eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +51 -0
  84. eval_framework/metrics/loglikelihood/probability_mass.py +56 -0
  85. eval_framework/py.typed +0 -0
  86. eval_framework/response_generator.py +416 -0
  87. eval_framework/result_processors/__init__.py +0 -0
  88. eval_framework/result_processors/base.py +74 -0
  89. eval_framework/result_processors/hf_processor.py +87 -0
  90. eval_framework/result_processors/result_processor.py +129 -0
  91. eval_framework/run.py +314 -0
  92. eval_framework/run_direct.py +42 -0
  93. eval_framework/shared/types.py +227 -0
  94. eval_framework/tasks/__init__.py +6 -0
  95. eval_framework/tasks/base.py +314 -0
  96. eval_framework/tasks/benchmarks/__init__.py +0 -0
  97. eval_framework/tasks/benchmarks/arc.py +46 -0
  98. eval_framework/tasks/benchmarks/arc_de.py +46 -0
  99. eval_framework/tasks/benchmarks/arc_fi.py +46 -0
  100. eval_framework/tasks/benchmarks/belebele.py +60 -0
  101. eval_framework/tasks/benchmarks/bigcodebench.py +155 -0
  102. eval_framework/tasks/benchmarks/casehold.py +47 -0
  103. eval_framework/tasks/benchmarks/chembench.py +85 -0
  104. eval_framework/tasks/benchmarks/copa.py +39 -0
  105. eval_framework/tasks/benchmarks/duc.py +91 -0
  106. eval_framework/tasks/benchmarks/flores200.py +62 -0
  107. eval_framework/tasks/benchmarks/flores_plus.py +84 -0
  108. eval_framework/tasks/benchmarks/gpqa.py +177 -0
  109. eval_framework/tasks/benchmarks/gsm8k.py +148 -0
  110. eval_framework/tasks/benchmarks/hellaswag.py +44 -0
  111. eval_framework/tasks/benchmarks/hellaswag_de.py +52 -0
  112. eval_framework/tasks/benchmarks/humaneval.py +97 -0
  113. eval_framework/tasks/benchmarks/ifeval.py +78 -0
  114. eval_framework/tasks/benchmarks/include.py +119 -0
  115. eval_framework/tasks/benchmarks/infinitebench.py +302 -0
  116. eval_framework/tasks/benchmarks/math_reasoning.py +569 -0
  117. eval_framework/tasks/benchmarks/mbpp.py +192 -0
  118. eval_framework/tasks/benchmarks/mmlu.py +190 -0
  119. eval_framework/tasks/benchmarks/mmlu_de.py +109 -0
  120. eval_framework/tasks/benchmarks/mmlu_pro.py +139 -0
  121. eval_framework/tasks/benchmarks/mmmlu.py +529 -0
  122. eval_framework/tasks/benchmarks/openbookqa.py +37 -0
  123. eval_framework/tasks/benchmarks/opengptx_eu20.py +363 -0
  124. eval_framework/tasks/benchmarks/pawsx.py +65 -0
  125. eval_framework/tasks/benchmarks/piqa.py +39 -0
  126. eval_framework/tasks/benchmarks/quality.py +56 -0
  127. eval_framework/tasks/benchmarks/sciq.py +44 -0
  128. eval_framework/tasks/benchmarks/sphyr.py +75 -0
  129. eval_framework/tasks/benchmarks/squad.py +89 -0
  130. eval_framework/tasks/benchmarks/struct_eval.py +110 -0
  131. eval_framework/tasks/benchmarks/tablebench.py +117 -0
  132. eval_framework/tasks/benchmarks/triviaqa.py +42 -0
  133. eval_framework/tasks/benchmarks/truthfulqa.py +95 -0
  134. eval_framework/tasks/benchmarks/winogender.py +39 -0
  135. eval_framework/tasks/benchmarks/winogrande.py +44 -0
  136. eval_framework/tasks/benchmarks/winox.py +57 -0
  137. eval_framework/tasks/benchmarks/wmt.py +160 -0
  138. eval_framework/tasks/benchmarks/zero_scrolls.py +197 -0
  139. eval_framework/tasks/eval_config.py +112 -0
  140. eval_framework/tasks/perturbation.py +83 -0
  141. eval_framework/tasks/registry.py +186 -0
  142. eval_framework/tasks/task_loader.py +80 -0
  143. eval_framework/tasks/task_names.py +138 -0
  144. eval_framework/tasks/utils.py +578 -0
  145. eval_framework/utils/constants.py +9 -0
  146. eval_framework/utils/generate_task_docs.py +229 -0
  147. eval_framework/utils/helpers.py +3 -0
  148. eval_framework/utils/logging.py +50 -0
  149. eval_framework/utils/packaging.py +52 -0
  150. eval_framework-0.2.0.dist-info/METADATA +514 -0
  151. eval_framework-0.2.0.dist-info/RECORD +161 -0
  152. eval_framework-0.2.0.dist-info/WHEEL +4 -0
  153. eval_framework-0.2.0.dist-info/entry_points.txt +3 -0
  154. template_formatting/README.md +83 -0
  155. template_formatting/__init__.py +0 -0
  156. template_formatting/formatter.py +536 -0
  157. template_formatting/mistral_formatter.py +159 -0
  158. template_formatting/py.typed +0 -0
  159. template_formatting/tests/test_formatter_eval.py +408 -0
  160. template_formatting/tests/test_formatter_scaling.py +253 -0
  161. template_formatting/tests/test_mistral_formatter.py +136 -0
@@ -0,0 +1,92 @@
1
+ from collections.abc import Mapping
2
+
3
+ from eval_framework.llm.base import BaseLLM as StructuredOutputChatModel
4
+ from eval_framework.metrics.llm.graders.language import Language
5
+ from eval_framework.metrics.llm.graders.models import GradingOutput, PromptTemplate, parse_json_output
6
+
7
+
8
class ChatbotStyleGradingOutput(GradingOutput):
    """Result of a chatbot-style classification produced by ``ChatbotStyleGrader.grade``."""

    # Judge's free-text justification; None when the parsed judge output has no "thought_process" key.
    thought_process: str | None
    # Judge's verdict; None when the parsed judge output has no "is_chatbot_style" key.
    is_chatbot_style: bool | None
11
+
12
+
13
+ class ChatbotStyleGrader:
14
+ COMPLETION_KEY = "completion"
15
+ PROMPT_TEMPLATES = {
16
+ Language("de"): PromptTemplate(
17
+ system_prompt="""Deine Aufgabe ist es zu klassifizieren, ob eine von einem Textgenerator gelieferte Antwort dem Stile eines Chatbots entspricht.
18
+
19
+ Hier sind einige Schlüsselmerkmale einer Antwort im Stile eines Chatbots:
20
+ * Sie leitet den Hauptinhalt mit Phrasen wie "Natürlich, ich helfe Dir gerne", "Na klar!" oder "Selbstverständlich kann ich" ein.
21
+ * Sie endet mit Phrasen wie "Ich hoffe, ich konnte Dir weiterhelfen!"
22
+ * Sie stellt Nachfragen an den Benutzer.
23
+ * Sie neigt dazu, überaus wortreich zu sein.
24
+ * Sie enthält Gesprächs- und Unterhaltungsfloskeln.
25
+ * Sie enthält Text, der zum Verständnis des Inhalts nicht zwingend notwendig ist.
26
+ * Sie bewahrt einen überaus freundlichen Ton.
27
+
28
+ Beachte, dass die Erfüllung von nur einem dieser Merkmale ausreicht, um die Antwort als Chatbot-Stil zu klassifizieren.
29
+
30
+ Gebe deine Bewertung in folgendem JSON-Format:
31
+ {
32
+ "thought_process": str (Achte sehr genau auf die Antwort und argumentiere in ein paar Sätzen, ob die Antwort dem Chatbot-Stil folgt oder nicht),
33
+ "is_chatbot_style": bool
34
+ }""", # noqa: E501
35
+ user_prompt=f"""**Antwort des Textgenerators**
36
+ {{{COMPLETION_KEY}}}""",
37
+ ),
38
+ Language("en"): PromptTemplate(
39
+ system_prompt="""Your task is to classify if a text generation model's response follows a chatbot-style format.
40
+
41
+ Here are some key characteristics of a chatbot-style response:
42
+ * It introduces the main content with phrases like "Certainly, here is", "Sure!" or "Of course."
43
+ * It ends with phrases such as "I hope this helps!"
44
+ * It asks follow-up questions.
45
+ * It tends to be verbose.
46
+ * It tends to contain fluff that is not necessary to understand the content.
47
+ * It maintains a friendly tone.
48
+
49
+ Note that even one of these characteristics is enough to classify the response as following a chatbot-style format.
50
+
51
+ You must provide your evaluation in the following JSON format:
52
+ {
53
+ "thought_process": str (Pay very close attention to the response and argue whether the response follows a chatbot-style or not in a few sentences),
54
+ "is_chatbot_style": bool
55
+ }""", # noqa: E501
56
+ user_prompt=f"""**Model Response**:
57
+ {{{COMPLETION_KEY}}}""",
58
+ ),
59
+ }
60
+
61
def __init__(
    self,
    grading_model: StructuredOutputChatModel,
    prompt_templates: Mapping[Language, PromptTemplate] = PROMPT_TEMPLATES,
) -> None:
    """Store the judge model and the per-language prompt templates.

    Args:
        grading_model: Model whose ``generate_from_messages`` is called by ``grade``.
        prompt_templates: Prompt template per language; defaults to the class-level
            ``PROMPT_TEMPLATES``.

    Raises:
        ValueError: If any template's ``user_prompt`` does not contain ``COMPLETION_KEY``.
    """
    self._grading_model = grading_model

    # Plain substring check: it looks for the key text, not for a complete placeholder.
    for template in prompt_templates.values():
        if self.COMPLETION_KEY not in template.user_prompt:
            raise ValueError(f"At least one PromptTemplate is invalid, must contain '{self.COMPLETION_KEY}'.")
    self._prompt_templates = prompt_templates
71
+
72
def grade(self, completion: str, language: Language) -> ChatbotStyleGradingOutput:
    """Ask the judge model whether ``completion`` reads like a chatbot reply.

    Args:
        completion: Model response to classify.
        language: Language used to select the prompt template. If the lookup raises,
            the English template is used instead.

    Returns:
        The parsed verdict together with the judge prompt and the raw judge response.
        ``thought_process`` and ``is_chatbot_style`` are ``None`` when the parsed judge
        output lacks the corresponding key.
    """
    templates = self._prompt_templates
    try:
        template = language.language_config(templates)
    except Exception:
        # Any lookup failure falls back to the English prompt.
        template = Language("en").language_config(templates)

    substitutions = [(self.COMPLETION_KEY, completion)]
    conversation = template.to_messages([], substitutions)
    judge_result = self._grading_model.generate_from_messages([conversation])[0]
    verdict = parse_json_output(judge_result.completion)

    return ChatbotStyleGradingOutput(
        thought_process=verdict.get("thought_process"),
        is_chatbot_style=verdict.get("is_chatbot_style"),
        judge_prompt=judge_result.prompt,
        judge_response=judge_result.completion,
    )
@@ -0,0 +1,146 @@
1
+ from collections.abc import Mapping
2
+ from enum import Enum
3
+
4
+ from eval_framework.llm.base import BaseLLM as StructuredOutputChatModel
5
+ from eval_framework.metrics.llm.graders.language import Language
6
+ from eval_framework.metrics.llm.graders.models import (
7
+ GradingOutput,
8
+ PromptTemplateWithParseMap,
9
+ parse_json_output,
10
+ )
11
+
12
+
13
class MatchOutcome(str, Enum):
    """Outcome of a pairwise comparison between answer A and answer B."""

    A_WINS = "a_wins"
    DRAW = "draw"
    B_WINS = "b_wins"

    @property
    def payoff(self) -> tuple[float, float]:
        """Return the ``(score_a, score_b)`` pair for this outcome.

        The winner gets 1 and the loser 0; a draw gives both sides 0.5.
        """
        # Compare against class-qualified members instead of reaching members
        # through an instance (``self.A_WINS``).
        if self is MatchOutcome.A_WINS:
            return (1, 0)
        if self is MatchOutcome.DRAW:
            return (0.5, 0.5)
        return (0, 1)

    @staticmethod
    def from_rank_literal(rank: int) -> "MatchOutcome":
        """Translate a rank literal into an outcome.

        Args:
            rank: ``1`` for "A wins", ``2`` for "B wins", ``3`` for a draw.

        Returns:
            The matching ``MatchOutcome`` member.

        Raises:
            ValueError: If ``rank`` is not 1, 2 or 3.
        """
        # Position in this tuple (1-based) is the rank literal.
        ranked_outcomes = (MatchOutcome.A_WINS, MatchOutcome.B_WINS, MatchOutcome.DRAW)
        for literal, outcome in enumerate(ranked_outcomes, start=1):
            if rank == literal:
                return outcome
        raise ValueError(f"Got unexpected rank {rank}")
37
+
38
+
39
class ComparisonGradingOutput(GradingOutput):
    """Result of a pairwise comparison produced by ``ComparisonGrader.grade``."""

    # Judge's explanation, read from the parsed judge output under ComparisonGrader.REASONING_KEY.
    # NOTE(review): grade() passes None here when that key is missing, although the annotation
    # is non-optional — confirm how GradingOutput treats None.
    reasoning: str
    # Outcome looked up in the template's parse_map from the judge's "better answer" text.
    # NOTE(review): the lookup yields None for unrecognised text — same caveat as for reasoning.
    outcome: MatchOutcome
42
+
43
+
44
+ class ComparisonGrader:
45
+ INSTRUCTION_KEY = "instruction"
46
+ ANSWER_1_KEY = "answer_1"
47
+ ANSWER_2_KEY = "answer_2"
48
+ REASONING_KEY = "explanation"
49
+ BETTER_ANSWER_KEY = "better_answer"
50
+ PROMPT_TEMPLATES = {
51
+ Language("de"): PromptTemplateWithParseMap(
52
+ system_prompt=f"""Beachte die gegebene Aufgabe und dazugehörigen Antworten. Entscheide, welche Antwort besser ist, Antwort 1 oder Antwort 2. Gebe anschließend "Antwort 1 ist besser", "Antwort 2 ist besser" oder "Beide gleich" aus.
53
+
54
+ Eine gute Antwort ist:
55
+ 1. Inhaltlich korrekt.
56
+ 2. Beachtet die Anforderungen der Aufgabe präzise.
57
+ 3. Ist im Rahmen der Aufgabenstellung kreativ und nicht repetetiv.
58
+ 4. In der Sprache der Aufgabe verfasst.
59
+
60
+ Gebe die Antwort im folgenden json-Format:
61
+ {{
62
+ "{REASONING_KEY}": str (Beschreibe in wenigen Sätzen (max. 5) die Unterschiede der beiden Antworten und begründe, warum eine der beiden Antworten besser ist oder warum die Antworten ähnlich gut sind.),
63
+ "{BETTER_ANSWER_KEY}": Literal["Antwort 1 ist besser", "Antwort 2 ist besser", "Beide gleich"]
64
+ }}""", # noqa: E501
65
+ user_prompt=f"""Aufgabe:
66
+ {{{INSTRUCTION_KEY}}}
67
+ ---
68
+ Antwort 1:
69
+ {{{ANSWER_1_KEY}}}
70
+ ---
71
+ Antwort 2:
72
+ {{{ANSWER_2_KEY}}}""",
73
+ parse_map={
74
+ "Antwort 1 ist besser": MatchOutcome.A_WINS,
75
+ "Antwort 2 ist besser": MatchOutcome.B_WINS,
76
+ "Beide gleich": MatchOutcome.DRAW,
77
+ },
78
+ ),
79
+ Language("en"): PromptTemplateWithParseMap(
80
+ system_prompt=f"""Note the given task and the corresponding answers. Decide which answer is better, answer 1 or answer 2. Then output "Answer 1 is better", "Answer 2 is better" or "Both equal".
81
+
82
+ A good answer is:
83
+ 1. correct in content.
84
+ 2. follows the requirements of the task precisely.
85
+ 3. is creative and not repetitive in the context of the task.
86
+ 4. written in the same language as the task.
87
+
88
+ Enter the answer in the following json format:
89
+ {{
90
+ "{REASONING_KEY}": str (Describe in a few sentences (max. 5) the differences between the two answers and give reasons why one of the two answers is better or why the answers are similarly good),
91
+ "{BETTER_ANSWER_KEY}": Literal["Answer 1 is better", "Answer 2 is better", "Both equal"]
92
+ }}""", # noqa: E501
93
+ user_prompt=f"""Task:
94
+ {{{INSTRUCTION_KEY}}}
95
+ ---
96
+ Answer 1:
97
+ {{{ANSWER_1_KEY}}}
98
+ ---
99
+ Answer 2:
100
+ {{{ANSWER_2_KEY}}}""",
101
+ parse_map={
102
+ "Answer 1 is better": MatchOutcome.A_WINS,
103
+ "Answer 2 is better": MatchOutcome.B_WINS,
104
+ "Both equal": MatchOutcome.DRAW,
105
+ },
106
+ ),
107
+ }
108
+
109
def __init__(
    self,
    grading_model: StructuredOutputChatModel,
    prompt_templates: Mapping[Language, PromptTemplateWithParseMap] = PROMPT_TEMPLATES,
) -> None:
    """Store the judge model and the per-language prompt templates.

    Args:
        grading_model: Model whose ``generate_from_messages`` is called by ``grade``.
        prompt_templates: Prompt template (with parse map) per language; defaults to
            the class-level ``PROMPT_TEMPLATES``.

    Raises:
        ValueError: If any template's ``user_prompt`` lacks ``INSTRUCTION_KEY``,
            ``ANSWER_1_KEY`` or ``ANSWER_2_KEY``.
    """
    self._grading_model = grading_model

    # grade() substitutes all three keys, so a template missing any of them
    # (including the second answer) cannot produce a meaningful comparison.
    required_keys = (self.INSTRUCTION_KEY, self.ANSWER_1_KEY, self.ANSWER_2_KEY)
    for prompt_template in prompt_templates.values():
        if any(key not in prompt_template.user_prompt for key in required_keys):
            raise ValueError(
                f"At least one PromptTemplate invalid, must contain '{self.ANSWER_1_KEY}', "
                f"'{self.ANSWER_2_KEY}' and '{self.INSTRUCTION_KEY}'."
            )
    self._prompt_templates = prompt_templates
123
+
124
def grade(
    self, instruction: str, completion_1: str, completion_2: str, language: Language
) -> ComparisonGradingOutput:
    """Ask the judge model which of two completions answers ``instruction`` better.

    Args:
        instruction: Task both completions respond to.
        completion_1: Completion shown to the judge as answer 1.
        completion_2: Completion shown to the judge as answer 2.
        language: Language used to select the prompt template (no fallback here).

    Returns:
        The judge's explanation and the outcome mapped through the template's
        ``parse_map``, plus the judge prompt and the raw judge response.
    """
    template = language.language_config(self._prompt_templates)

    substitutions = [
        (self.INSTRUCTION_KEY, instruction),
        (self.ANSWER_1_KEY, completion_1),
        (self.ANSWER_2_KEY, completion_2),
    ]
    conversation = template.to_messages([], substitutions)

    judge_result = self._grading_model.generate_from_messages([conversation])[0]
    verdict = parse_json_output(judge_result.completion)

    explanation = verdict.get(self.REASONING_KEY)
    # A missing key becomes the string "None", which simply misses the parse map.
    better_answer = str(verdict.get(self.BETTER_ANSWER_KEY))
    # NOTE(review): both values may be None while ComparisonGradingOutput declares
    # them non-optional — confirm how GradingOutput handles that.
    return ComparisonGradingOutput(
        reasoning=explanation,
        outcome=template.parse_map.get(better_answer),
        judge_prompt=judge_result.prompt,
        judge_response=judge_result.completion,
    )
@@ -0,0 +1,93 @@
1
+ from collections.abc import Mapping
2
+
3
+ from eval_framework.llm.base import BaseLLM as StructuredOutputChatModel
4
+ from eval_framework.metrics.llm.graders.language import Language
5
+ from eval_framework.metrics.llm.graders.models import GradingOutput, PromptTemplate, parse_json_output
6
+
7
+
8
class ConcisenessGradingOutput(GradingOutput):
    """Result of a conciseness classification produced by ``ConcisenessGrader.grade``."""

    # Judge's free-text justification; None when the parsed judge output has no "thought_process" key.
    thought_process: str | None
    # Judge's verdict; None when the parsed judge output has no "is_concise" key.
    is_concise: bool | None
11
+
12
+
13
+ class ConcisenessGrader:
14
+ INSTRUCTION_KEY = "instruction"
15
+ COMPLETION_KEY = "completion"
16
+ PROMPT_TEMPLATES = {
17
+ Language("de"): PromptTemplate(
18
+ system_prompt="""Deine Aufgabe ist es zu klassifizieren, ob eine von einem Textgenerator gelieferte Antwort kurz und prägnant ist.
19
+
20
+ Eine kurz und prägnante ("concise") Antwort ist eine Antwort, die knapp und auf den Punkt ist, ohne unnötige Details oder Ausführungen.
21
+
22
+ Gebe deine Bewertung in folgendem JSON-Format:
23
+ {
24
+ "thought_process": str (Achte sehr genau auf die Antwort und argumentiere in ein paar Sätzen, ob die Antwort kurz und prägnant ("concise") ist oder nicht),
25
+ "is_concise": bool
26
+ }""", # noqa: E501
27
+ user_prompt=f"""**Benutzeranweisung**
28
+ {{{INSTRUCTION_KEY}}}
29
+
30
+ ---
31
+ **Antwort des Textgenerators**
32
+ {{{COMPLETION_KEY}}}""",
33
+ ),
34
+ Language("en"): PromptTemplate(
35
+ system_prompt="""Your task is to classify if a text generation model's response is concise.
36
+
37
+ A concise response is one that is brief and to the point, without unnecessary details or elaboration.
38
+
39
+ You must provide your evaluation in the following JSON format:
40
+ {
41
+ "thought_process": str (Pay very close attention to the response and argue whether the response is concise or not in a few sentences),
42
+ "is_concise": bool
43
+ }""", # noqa: E501
44
+ user_prompt=f"""**User Instruction**:
45
+ {{{INSTRUCTION_KEY}}}
46
+
47
+ ---
48
+ **Model Response**:
49
+ {{{COMPLETION_KEY}}}""",
50
+ ),
51
+ }
52
+
53
def __init__(
    self,
    grading_model: StructuredOutputChatModel,
    prompt_templates: Mapping[Language, PromptTemplate] = PROMPT_TEMPLATES,
) -> None:
    """Store the judge model and the per-language prompt templates.

    Args:
        grading_model: Model whose ``generate_from_messages`` is called by ``grade``.
        prompt_templates: Prompt template per language; defaults to the class-level
            ``PROMPT_TEMPLATES``.

    Raises:
        ValueError: If any template's ``user_prompt`` lacks ``INSTRUCTION_KEY`` or
            ``COMPLETION_KEY``.
    """
    self._grading_model = grading_model

    required_keys = (self.INSTRUCTION_KEY, self.COMPLETION_KEY)
    for prompt_template in prompt_templates.values():
        if any(key not in prompt_template.user_prompt for key in required_keys):
            # Both fragments must be f-strings; otherwise the second key name is
            # emitted as the literal text "{self.INSTRUCTION_KEY}".
            raise ValueError(
                f"At least one PromptTemplate is invalid, must contain '{self.COMPLETION_KEY}' "
                f"and '{self.INSTRUCTION_KEY}'."
            )
    self._prompt_templates = prompt_templates
70
+
71
def grade(self, instruction: str, completion: str, language: Language) -> ConcisenessGradingOutput:
    """Ask the judge model whether ``completion`` is a concise answer to ``instruction``.

    Args:
        instruction: User instruction the completion responds to.
        completion: Model response to classify.
        language: Language used to select the prompt template. If the lookup raises,
            the English template is used instead.

    Returns:
        The parsed verdict together with the judge prompt and the raw judge response.
        ``thought_process`` and ``is_concise`` are ``None`` when the parsed judge output
        lacks the corresponding key.
    """
    templates = self._prompt_templates
    try:
        template = language.language_config(templates)
    except Exception:
        # Any lookup failure falls back to the English prompt.
        template = Language("en").language_config(templates)

    substitutions = [
        (self.INSTRUCTION_KEY, instruction),
        (self.COMPLETION_KEY, completion),
    ]
    conversation = template.to_messages([], substitutions)

    judge_result = self._grading_model.generate_from_messages([conversation])[0]
    verdict = parse_json_output(judge_result.completion)

    return ConcisenessGradingOutput(
        thought_process=verdict.get("thought_process"),
        is_concise=verdict.get("is_concise"),
        judge_prompt=judge_result.prompt,
        judge_response=judge_result.completion,
    )
@@ -0,0 +1,71 @@
1
+ from collections.abc import Mapping
2
+
3
+ from eval_framework.llm.base import BaseLLM as StructuredOutputChatModel
4
+ from eval_framework.metrics.llm.graders.language import Language
5
+ from eval_framework.metrics.llm.graders.models import GradingOutput, PromptTemplate, parse_json_output
6
+
7
+
8
class ContainsNamesGradingOutput(GradingOutput):
    """Result of a name-detection check produced by ``ContainsNamesGrader.grade``."""

    # Judge's verdict; None when the parsed judge output has no "contains_names" key.
    contains_names: bool | None
10
+
11
+
12
+ class ContainsNamesGrader:
13
+ COMPLETION_KEY = "completion"
14
+ PROMPT_TEMPLATES = {
15
+ Language("en"): PromptTemplate(
16
+ system_prompt="""Your assignment is to assess whether text contains names of individuals.
17
+
18
+ Your assessment should be formatted in JSON as follows:
19
+ {
20
+ "contains_names": bool (State with a true or false whether the text contains any names.)
21
+ }""",
22
+ user_prompt=f"""{{{COMPLETION_KEY}}}""",
23
+ ),
24
+ Language("de"): PromptTemplate(
25
+ system_prompt="""Deine Aufgabe ist es zu bewerten, ob ein Text Namen von Personen enthält.
26
+
27
+ Deine Bewertung sollte im JSON-Format wie folgt aussehen:
28
+ {
29
+ "contains_names": bool (Gib an, ob der Text Namen enthält oder nicht.)
30
+ }""",
31
+ user_prompt=f"""{{{COMPLETION_KEY}}}""",
32
+ ),
33
+ Language("fr"): PromptTemplate(
34
+ system_prompt="""Votre tâche consiste à évaluer si un texte contient des noms de personnes.
35
+
36
+ Votre évaluation doit être formatée en JSON comme suit :
37
+ {
38
+ "contains_names": bool (Indiquez si le texte contient des noms ou non.)
39
+ }""",
40
+ user_prompt=f"""{{{COMPLETION_KEY}}}""",
41
+ ),
42
+ }
43
+
44
def __init__(
    self,
    grading_model: StructuredOutputChatModel,
    prompt_templates: Mapping[Language, PromptTemplate] = PROMPT_TEMPLATES,
) -> None:
    """Store the judge model and the per-language prompt templates.

    Args:
        grading_model: Model whose ``generate_from_messages`` is called by ``grade``.
        prompt_templates: Prompt template per language; defaults to the class-level
            ``PROMPT_TEMPLATES``.

    Raises:
        ValueError: If any template's ``user_prompt`` does not contain ``COMPLETION_KEY``.
    """
    self._grading_model = grading_model

    # Plain substring check: it looks for the key text, not for a complete placeholder.
    for template in prompt_templates.values():
        if self.COMPLETION_KEY not in template.user_prompt:
            raise ValueError(f"At least one PromptTemplate is invalid, must contain '{self.COMPLETION_KEY}'.")
    self._prompt_templates = prompt_templates
54
+
55
def grade(self, completion: str, language: Language) -> ContainsNamesGradingOutput:
    """Ask the judge model whether ``completion`` contains names of individuals.

    Args:
        completion: Text to inspect.
        language: Language used to select the prompt template (no fallback here).

    Returns:
        The parsed verdict together with the judge prompt and the raw judge response.
        ``contains_names`` is ``None`` when the parsed judge output lacks that key.
    """
    template = language.language_config(self._prompt_templates)
    substitutions = [(self.COMPLETION_KEY, completion)]
    conversation = template.to_messages([], substitutions)

    judge_result = self._grading_model.generate_from_messages([conversation])[0]
    verdict = parse_json_output(judge_result.completion)

    return ContainsNamesGradingOutput(
        contains_names=verdict.get("contains_names"),
        judge_prompt=judge_result.prompt,
        judge_response=judge_result.completion,
    )
@@ -0,0 +1,109 @@
1
+ from collections.abc import Mapping
2
+
3
+ from eval_framework.llm.base import BaseLLM as StructuredOutputChatModel
4
+ from eval_framework.metrics.llm.graders.language import Language
5
+ from eval_framework.metrics.llm.graders.models import (
6
+ FOFOPromptTemplate,
7
+ GradingOutput,
8
+ parse_json_output,
9
+ )
10
+
11
+
12
class FormatCorrectnessOutput(GradingOutput):
    """Result of a format-correctness judgement produced by ``FormatCorrectnessGrader.grade``."""

    # Judge's reasons rendered as "• "-prefixed bullet text; None when the judge output has no "reasons".
    reasons: str | None
    # Value of the judge's "format_correctness" field (the prompt asks for 1 = correct, 0 = incorrect);
    # None when the field is missing.
    format_correctness: int | None
15
+
16
+
17
+ class FormatCorrectnessGrader:
18
+ INSTRUCTION_KEY = "<instruction>"
19
+ COMPLETION_KEY = "<completion>"
20
+
21
+ PROMPT_TEMPLATES = {
22
+ Language("en"): FOFOPromptTemplate(
23
+ system_prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
24
+ user_prompt=f"""
25
+ I would like you to create a leaderboard that evaluates the correctness of the format of answers from
26
+ various large language models. To accomplish this, you will need to analyze the text prompts given to
27
+ the models and their corresponding answers. Specifically, please ensure that your evaluation outputs are
28
+ properly formatted as a json string. I will provide both the prompts and the responses for this purpose.\n
29
+
30
+ Here is the prompt: {{
31
+ "instruction": {INSTRUCTION_KEY}
32
+ }}
33
+
34
+ Here are the outputs of the models:
35
+ [
36
+ {{
37
+ "model": "model",
38
+ "answer": {COMPLETION_KEY}
39
+ }},
40
+ ]
41
+
42
+ Please evaluate the formatting of the model’s responses by checking if they comply with the format
43
+ specifications stated in the prompt. Perform a thorough format check and provide a detailed explanation
44
+ for why the format is correct or incorrect. Your feedback should include the name of the model, followed
45
+ by the format correctness status represented as ’1’ for correct and ’0’ for incorrect. Present your
46
+ reasoning as bullet points within a single string for each model assessed. In other words, you should
47
+ produce the following output:
48
+ ```json
49
+ [
50
+ {{
51
+ "model": <model-name>,
52
+ "format_correctness": <correctness>,
53
+ "reasons": <reasons-of-format-correctness>
54
+ }}
55
+ ]```
56
+
57
+ Please note that your response should be a properly formatted JSON string and should not contain any
58
+ additional content. We will load it directly as a JSON string in Python.
59
+ """,
60
+ ),
61
+ }
62
+
63
def __init__(
    self,
    grading_model: StructuredOutputChatModel,
    prompt_templates: Mapping[Language, FOFOPromptTemplate] = PROMPT_TEMPLATES,
) -> None:
    """Store the judge model and the per-language prompt templates.

    Args:
        grading_model: Model whose ``generate_from_messages`` is called by ``grade``.
        prompt_templates: Prompt template per language; defaults to the class-level
            ``PROMPT_TEMPLATES``.

    Raises:
        ValueError: If any template's ``user_prompt`` lacks ``INSTRUCTION_KEY`` or
            ``COMPLETION_KEY``.
    """
    self._grading_model = grading_model

    required_keys = (self.INSTRUCTION_KEY, self.COMPLETION_KEY)
    for prompt_template in prompt_templates.values():
        if any(key not in prompt_template.user_prompt for key in required_keys):
            # Both fragments must be f-strings; otherwise the second key name is
            # emitted as the literal text "{self.INSTRUCTION_KEY}".
            raise ValueError(
                f"At least one PromptTemplate is invalid, must contain '{self.COMPLETION_KEY}' "
                f"and '{self.INSTRUCTION_KEY}'."
            )
    self._prompt_templates = prompt_templates
80
+
81
def grade(self, instruction: str, completion: str, language: Language) -> FormatCorrectnessOutput:
    """Ask the judge model whether ``completion`` follows the format ``instruction`` requires.

    Args:
        instruction: Prompt that states the expected output format.
        completion: Model response whose format is judged.
        language: Language used to select the prompt template. If the lookup raises,
            the English template is used instead.

    Returns:
        The judge's reasons rendered as bullet text, its ``format_correctness`` value,
        the judge prompt and the raw judge response. ``reasons`` and
        ``format_correctness`` are ``None`` when the parsed judge output lacks the key.
    """
    try:
        prompt_template = language.language_config(self._prompt_templates)
    except Exception:
        # Any lookup failure falls back to the English prompt.
        prompt_template = Language("en").language_config(self._prompt_templates)

    messages = prompt_template.to_messages(
        [],
        [
            (self.INSTRUCTION_KEY, instruction),
            (self.COMPLETION_KEY, completion),
        ],
    )

    raw_completion = self._grading_model.generate_from_messages([messages])[0]
    # NOTE(review): the prompt asks the judge for a JSON array of objects, while the
    # result is used as a mapping below — confirm that parse_json_output unwraps it.
    loaded_json = parse_json_output(raw_completion.completion)

    reasons = loaded_json.get("reasons", None)
    if isinstance(reasons, list):
        # f-string formatting coerces non-string items the judge may emit instead of
        # failing with TypeError on "str + non-str".
        reasons = "".join(f"• {reason}\n " for reason in reasons)
    elif isinstance(reasons, str):
        reasons = f"• {reasons}\n "

    return FormatCorrectnessOutput(
        reasons=reasons,
        format_correctness=loaded_json.get("format_correctness", None),
        judge_prompt=raw_completion.prompt,
        judge_response=raw_completion.completion,
    )