eval-framework 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eval_framework/__init__.py +7 -0
- eval_framework/base_config.py +36 -0
- eval_framework/context/__init__.py +0 -0
- eval_framework/context/determined.py +170 -0
- eval_framework/context/eval.py +114 -0
- eval_framework/context/local.py +52 -0
- eval_framework/evaluation_generator.py +231 -0
- eval_framework/exceptions.py +2 -0
- eval_framework/external/ifeval_impl/README.md +5 -0
- eval_framework/external/ifeval_impl/instructions.py +1523 -0
- eval_framework/external/ifeval_impl/instructions_registry.py +161 -0
- eval_framework/external/ifeval_impl/instructions_util.py +1689 -0
- eval_framework/external/ifeval_impl/utils.py +135 -0
- eval_framework/llm/__init__.py +0 -0
- eval_framework/llm/aleph_alpha.py +323 -0
- eval_framework/llm/base.py +58 -0
- eval_framework/llm/huggingface.py +332 -0
- eval_framework/llm/mistral.py +73 -0
- eval_framework/llm/models.py +16 -0
- eval_framework/llm/openai.py +205 -0
- eval_framework/llm/vllm.py +438 -0
- eval_framework/logger.py +3 -0
- eval_framework/main.py +187 -0
- eval_framework/metrics/__init__.py +0 -0
- eval_framework/metrics/base.py +40 -0
- eval_framework/metrics/completion/__init__.py +1 -0
- eval_framework/metrics/completion/accuracy_completion.py +16 -0
- eval_framework/metrics/completion/bleu.py +76 -0
- eval_framework/metrics/completion/chrf.py +62 -0
- eval_framework/metrics/completion/code_assertion.py +44 -0
- eval_framework/metrics/completion/code_execution_pass_at_one.py +126 -0
- eval_framework/metrics/completion/comet.py +56 -0
- eval_framework/metrics/completion/concordance_index.py +38 -0
- eval_framework/metrics/completion/csv_format.py +102 -0
- eval_framework/metrics/completion/cwe_accuracy.py +49 -0
- eval_framework/metrics/completion/exponential_similarity.py +65 -0
- eval_framework/metrics/completion/f1.py +42 -0
- eval_framework/metrics/completion/format_checker.py +56 -0
- eval_framework/metrics/completion/grid_difference.py +77 -0
- eval_framework/metrics/completion/ifeval.py +73 -0
- eval_framework/metrics/completion/json_format.py +171 -0
- eval_framework/metrics/completion/language_checker.py +74 -0
- eval_framework/metrics/completion/length_control.py +83 -0
- eval_framework/metrics/completion/math_reasoning_completion.py +303 -0
- eval_framework/metrics/completion/niah_accuracy.py +163 -0
- eval_framework/metrics/completion/placeholder_checker.py +27 -0
- eval_framework/metrics/completion/repetition.py +88 -0
- eval_framework/metrics/completion/rouge_1.py +35 -0
- eval_framework/metrics/completion/rouge_2.py +45 -0
- eval_framework/metrics/completion/rouge_geometric_mean.py +36 -0
- eval_framework/metrics/completion/rouge_l.py +52 -0
- eval_framework/metrics/completion/struct_eval_metrics.py +248 -0
- eval_framework/metrics/completion/ter.py +67 -0
- eval_framework/metrics/completion/text_counter.py +182 -0
- eval_framework/metrics/efficiency/__init__.py +0 -0
- eval_framework/metrics/efficiency/bytes_per_sequence_position.py +48 -0
- eval_framework/metrics/llm/__init__.py +0 -0
- eval_framework/metrics/llm/base.py +8 -0
- eval_framework/metrics/llm/graders/chatbot_style_grader.py +92 -0
- eval_framework/metrics/llm/graders/comparison_grader.py +146 -0
- eval_framework/metrics/llm/graders/conciseness_grader.py +93 -0
- eval_framework/metrics/llm/graders/contains_names_grader.py +71 -0
- eval_framework/metrics/llm/graders/format_correctness_grader.py +109 -0
- eval_framework/metrics/llm/graders/instruction_grader.py +177 -0
- eval_framework/metrics/llm/graders/language.py +56 -0
- eval_framework/metrics/llm/graders/long_context_grader.py +72 -0
- eval_framework/metrics/llm/graders/models.py +74 -0
- eval_framework/metrics/llm/graders/refusal_grader.py +57 -0
- eval_framework/metrics/llm/graders/sql_quality_grader.py +145 -0
- eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +103 -0
- eval_framework/metrics/llm/llm_judge_chatbot_style.py +36 -0
- eval_framework/metrics/llm/llm_judge_completion_accuracy.py +39 -0
- eval_framework/metrics/llm/llm_judge_conciseness.py +37 -0
- eval_framework/metrics/llm/llm_judge_contains_names.py +36 -0
- eval_framework/metrics/llm/llm_judge_format_correctness.py +43 -0
- eval_framework/metrics/llm/llm_judge_instruction.py +58 -0
- eval_framework/metrics/llm/llm_judge_mtbench_pair.py +205 -0
- eval_framework/metrics/llm/llm_judge_mtbench_single.py +188 -0
- eval_framework/metrics/llm/llm_judge_refusal.py +35 -0
- eval_framework/metrics/llm/llm_judge_sql.py +394 -0
- eval_framework/metrics/llm/llm_judge_world_knowledge.py +37 -0
- eval_framework/metrics/loglikelihood/__init__.py +0 -0
- eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +51 -0
- eval_framework/metrics/loglikelihood/probability_mass.py +56 -0
- eval_framework/py.typed +0 -0
- eval_framework/response_generator.py +416 -0
- eval_framework/result_processors/__init__.py +0 -0
- eval_framework/result_processors/base.py +74 -0
- eval_framework/result_processors/hf_processor.py +87 -0
- eval_framework/result_processors/result_processor.py +129 -0
- eval_framework/run.py +314 -0
- eval_framework/run_direct.py +42 -0
- eval_framework/shared/types.py +227 -0
- eval_framework/tasks/__init__.py +6 -0
- eval_framework/tasks/base.py +314 -0
- eval_framework/tasks/benchmarks/__init__.py +0 -0
- eval_framework/tasks/benchmarks/arc.py +46 -0
- eval_framework/tasks/benchmarks/arc_de.py +46 -0
- eval_framework/tasks/benchmarks/arc_fi.py +46 -0
- eval_framework/tasks/benchmarks/belebele.py +60 -0
- eval_framework/tasks/benchmarks/bigcodebench.py +155 -0
- eval_framework/tasks/benchmarks/casehold.py +47 -0
- eval_framework/tasks/benchmarks/chembench.py +85 -0
- eval_framework/tasks/benchmarks/copa.py +39 -0
- eval_framework/tasks/benchmarks/duc.py +91 -0
- eval_framework/tasks/benchmarks/flores200.py +62 -0
- eval_framework/tasks/benchmarks/flores_plus.py +84 -0
- eval_framework/tasks/benchmarks/gpqa.py +177 -0
- eval_framework/tasks/benchmarks/gsm8k.py +148 -0
- eval_framework/tasks/benchmarks/hellaswag.py +44 -0
- eval_framework/tasks/benchmarks/hellaswag_de.py +52 -0
- eval_framework/tasks/benchmarks/humaneval.py +97 -0
- eval_framework/tasks/benchmarks/ifeval.py +78 -0
- eval_framework/tasks/benchmarks/include.py +119 -0
- eval_framework/tasks/benchmarks/infinitebench.py +302 -0
- eval_framework/tasks/benchmarks/math_reasoning.py +569 -0
- eval_framework/tasks/benchmarks/mbpp.py +192 -0
- eval_framework/tasks/benchmarks/mmlu.py +190 -0
- eval_framework/tasks/benchmarks/mmlu_de.py +109 -0
- eval_framework/tasks/benchmarks/mmlu_pro.py +139 -0
- eval_framework/tasks/benchmarks/mmmlu.py +529 -0
- eval_framework/tasks/benchmarks/openbookqa.py +37 -0
- eval_framework/tasks/benchmarks/opengptx_eu20.py +363 -0
- eval_framework/tasks/benchmarks/pawsx.py +65 -0
- eval_framework/tasks/benchmarks/piqa.py +39 -0
- eval_framework/tasks/benchmarks/quality.py +56 -0
- eval_framework/tasks/benchmarks/sciq.py +44 -0
- eval_framework/tasks/benchmarks/sphyr.py +75 -0
- eval_framework/tasks/benchmarks/squad.py +89 -0
- eval_framework/tasks/benchmarks/struct_eval.py +110 -0
- eval_framework/tasks/benchmarks/tablebench.py +117 -0
- eval_framework/tasks/benchmarks/triviaqa.py +42 -0
- eval_framework/tasks/benchmarks/truthfulqa.py +95 -0
- eval_framework/tasks/benchmarks/winogender.py +39 -0
- eval_framework/tasks/benchmarks/winogrande.py +44 -0
- eval_framework/tasks/benchmarks/winox.py +57 -0
- eval_framework/tasks/benchmarks/wmt.py +160 -0
- eval_framework/tasks/benchmarks/zero_scrolls.py +197 -0
- eval_framework/tasks/eval_config.py +112 -0
- eval_framework/tasks/perturbation.py +83 -0
- eval_framework/tasks/registry.py +186 -0
- eval_framework/tasks/task_loader.py +80 -0
- eval_framework/tasks/task_names.py +138 -0
- eval_framework/tasks/utils.py +578 -0
- eval_framework/utils/constants.py +9 -0
- eval_framework/utils/generate_task_docs.py +229 -0
- eval_framework/utils/helpers.py +3 -0
- eval_framework/utils/logging.py +50 -0
- eval_framework/utils/packaging.py +52 -0
- eval_framework-0.2.0.dist-info/METADATA +514 -0
- eval_framework-0.2.0.dist-info/RECORD +161 -0
- eval_framework-0.2.0.dist-info/WHEEL +4 -0
- eval_framework-0.2.0.dist-info/entry_points.txt +3 -0
- template_formatting/README.md +83 -0
- template_formatting/__init__.py +0 -0
- template_formatting/formatter.py +536 -0
- template_formatting/mistral_formatter.py +159 -0
- template_formatting/py.typed +0 -0
- template_formatting/tests/test_formatter_eval.py +408 -0
- template_formatting/tests/test_formatter_scaling.py +253 -0
- template_formatting/tests/test_mistral_formatter.py +136 -0
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
from collections.abc import Mapping
|
|
2
|
+
|
|
3
|
+
from eval_framework.llm.base import BaseLLM as StructuredOutputChatModel
|
|
4
|
+
from eval_framework.metrics.llm.graders.language import Language
|
|
5
|
+
from eval_framework.metrics.llm.graders.models import GradingOutput, PromptTemplate, parse_json_output
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ChatbotStyleGradingOutput(GradingOutput):
    """Structured verdict returned by ``ChatbotStyleGrader.grade``.

    Both fields are optional because the grader fills them with ``dict.get``
    on the judge's parsed JSON and therefore stores ``None`` when a key is
    absent. The grader additionally passes ``judge_prompt`` and
    ``judge_response``; those are presumably declared on ``GradingOutput``
    (not visible here — confirm against the base class).
    """

    # Free-text justification written by the judge model, if it supplied one.
    thought_process: str | None
    # Judge's verdict: whether the completion reads like a chatbot answer.
    is_chatbot_style: bool | None
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ChatbotStyleGrader:
    """LLM-as-judge grader deciding whether a completion is written in chatbot style.

    A judge model receives a language-specific system prompt that lists the
    characteristics of a chatbot-style answer, plus the completion to assess,
    and is asked to answer with a JSON object containing ``thought_process``
    and ``is_chatbot_style``.
    """

    # Placeholder name that every user prompt must contain.
    COMPLETION_KEY = "completion"
    # Built-in judge prompts, keyed by language.
    PROMPT_TEMPLATES = {
        Language("de"): PromptTemplate(
            system_prompt="""Deine Aufgabe ist es zu klassifizieren, ob eine von einem Textgenerator gelieferte Antwort dem Stile eines Chatbots entspricht.

Hier sind einige Schlüsselmerkmale einer Antwort im Stile eines Chatbots:
* Sie leitet den Hauptinhalt mit Phrasen wie "Natürlich, ich helfe Dir gerne", "Na klar!" oder "Selbstverständlich kann ich" ein.
* Sie endet mit Phrasen wie "Ich hoffe, ich konnte Dir weiterhelfen!"
* Sie stellt Nachfragen an den Benutzer.
* Sie neigt dazu, überaus wortreich zu sein.
* Sie enthält Gesprächs- und Unterhaltungsfloskeln.
* Sie enthält Text, der zum Verständnis des Inhalts nicht zwingend notwendig ist.
* Sie bewahrt einen überaus freundlichen Ton.

Beachte, dass die Erfüllung von nur einem dieser Merkmale ausreicht, um die Antwort als Chatbot-Stil zu klassifizieren.

Gebe deine Bewertung in folgendem JSON-Format:
{
"thought_process": str (Achte sehr genau auf die Antwort und argumentiere in ein paar Sätzen, ob die Antwort dem Chatbot-Stil folgt oder nicht),
"is_chatbot_style": bool
}""",  # noqa: E501
            user_prompt=f"""**Antwort des Textgenerators**
{{{COMPLETION_KEY}}}""",
        ),
        Language("en"): PromptTemplate(
            system_prompt="""Your task is to classify if a text generation model's response follows a chatbot-style format.

Here are some key characteristics of a chatbot-style response:
* It introduces the main content with phrases like "Certainly, here is", "Sure!" or "Of course."
* It ends with phrases such as "I hope this helps!"
* It asks follow-up questions.
* It tends to be verbose.
* It tends to contain fluff that is not necessary to understand the content.
* It maintains a friendly tone.

Note that even one of these characteristics is enough to classify the response as following a chatbot-style format.

You must provide your evaluation in the following JSON format:
{
"thought_process": str (Pay very close attention to the response and argue whether the response follows a chatbot-style or not in a few sentences),
"is_chatbot_style": bool
}""",  # noqa: E501
            user_prompt=f"""**Model Response**:
{{{COMPLETION_KEY}}}""",
        ),
    }

    def __init__(
        self,
        grading_model: StructuredOutputChatModel,
        prompt_templates: Mapping[Language, PromptTemplate] = PROMPT_TEMPLATES,
    ) -> None:
        """Store the judge model and the prompt templates after validating them.

        Args:
            grading_model: Model used as the judge.
            prompt_templates: Judge prompts per language; defaults to the
                built-in ``PROMPT_TEMPLATES``.

        Raises:
            ValueError: If any template's user prompt does not contain
                ``COMPLETION_KEY``.
        """
        # A template without the completion placeholder could never show the
        # judge the text it is supposed to assess.
        if any(self.COMPLETION_KEY not in template.user_prompt for template in prompt_templates.values()):
            raise ValueError(f"At least one PromptTemplate is invalid, must contain '{self.COMPLETION_KEY}'.")

        self._grading_model = grading_model
        self._prompt_templates = prompt_templates

    def grade(self, completion: str, language: Language) -> ChatbotStyleGradingOutput:
        """Ask the judge model whether ``completion`` is written in chatbot style.

        Args:
            completion: Text to classify.
            language: Language used to select the prompt template; if the
                lookup raises, the English template is used instead.

        Returns:
            The judge's reasoning and verdict (each ``None`` when missing from
            the parsed JSON) together with the judge prompt and raw response.
        """
        try:
            template = language.language_config(self._prompt_templates)
        except Exception:
            # Any lookup failure falls back to the English prompt.
            template = Language("en").language_config(self._prompt_templates)

        substitutions = [(self.COMPLETION_KEY, completion)]
        # The first positional argument of to_messages is passed empty, as before.
        messages = template.to_messages([], substitutions)

        judge_result = self._grading_model.generate_from_messages([messages])[0]
        verdict = parse_json_output(judge_result.completion)

        return ChatbotStyleGradingOutput(
            thought_process=verdict.get("thought_process"),
            is_chatbot_style=verdict.get("is_chatbot_style"),
            judge_prompt=judge_result.prompt,
            judge_response=judge_result.completion,
        )
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
from collections.abc import Mapping
|
|
2
|
+
from enum import Enum
|
|
3
|
+
|
|
4
|
+
from eval_framework.llm.base import BaseLLM as StructuredOutputChatModel
|
|
5
|
+
from eval_framework.metrics.llm.graders.language import Language
|
|
6
|
+
from eval_framework.metrics.llm.graders.models import (
|
|
7
|
+
GradingOutput,
|
|
8
|
+
PromptTemplateWithParseMap,
|
|
9
|
+
parse_json_output,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class MatchOutcome(str, Enum):
    """Outcome of a pairwise comparison between answer A and answer B."""

    A_WINS = "a_wins"
    DRAW = "draw"
    B_WINS = "b_wins"

    @property
    def payoff(self) -> tuple[float, float]:
        """Return the ``(score_a, score_b)`` pair awarded for this outcome.

        A win gives the winner 1 and the loser 0; a draw gives 0.5 to each.
        """
        if self is MatchOutcome.DRAW:
            return (0.5, 0.5)
        return (1, 0) if self is MatchOutcome.A_WINS else (0, 1)

    @staticmethod
    def from_rank_literal(rank: int) -> "MatchOutcome":
        """Translate a numeric rank into an outcome.

        Args:
            rank: ``1`` means A wins, ``2`` means B wins, ``3`` means a draw.

        Returns:
            The corresponding ``MatchOutcome`` member.

        Raises:
            ValueError: If ``rank`` is not one of the three known values.
        """
        if rank == 1:
            return MatchOutcome.A_WINS
        if rank == 2:
            return MatchOutcome.B_WINS
        if rank == 3:
            return MatchOutcome.DRAW
        raise ValueError(f"Got unexpected rank {rank}")
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class ComparisonGradingOutput(GradingOutput):
    """Structured verdict returned by ``ComparisonGrader.grade``.

    The grader additionally passes ``judge_prompt`` and ``judge_response``;
    those are presumably declared on ``GradingOutput`` (not visible here —
    confirm against the base class).
    """

    # Judge's explanation of how the two answers differ.
    reasoning: str
    # Which answer the judge preferred, or a draw.
    outcome: MatchOutcome
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class ComparisonGrader:
    """LLM-as-judge grader that decides which of two answers to a task is better.

    The judge model is shown the instruction and both answers and is asked to
    reply with a JSON object holding an explanation and one of three fixed,
    language-specific verdict phrases. Each template's ``parse_map``
    translates that phrase into a ``MatchOutcome``.
    """

    # Placeholder names that every user prompt must contain.
    INSTRUCTION_KEY = "instruction"
    ANSWER_1_KEY = "answer_1"
    ANSWER_2_KEY = "answer_2"
    # JSON keys the judge is asked to produce.
    REASONING_KEY = "explanation"
    BETTER_ANSWER_KEY = "better_answer"
    # Built-in judge prompts, keyed by language.
    PROMPT_TEMPLATES = {
        Language("de"): PromptTemplateWithParseMap(
            system_prompt=f"""Beachte die gegebene Aufgabe und dazugehörigen Antworten. Entscheide, welche Antwort besser ist, Antwort 1 oder Antwort 2. Gebe anschließend "Antwort 1 ist besser", "Antwort 2 ist besser" oder "Beide gleich" aus.

Eine gute Antwort ist:
1. Inhaltlich korrekt.
2. Beachtet die Anforderungen der Aufgabe präzise.
3. Ist im Rahmen der Aufgabenstellung kreativ und nicht repetetiv.
4. In der Sprache der Aufgabe verfasst.

Gebe die Antwort im folgenden json-Format:
{{
"{REASONING_KEY}": str (Beschreibe in wenigen Sätzen (max. 5) die Unterschiede der beiden Antworten und begründe, warum eine der beiden Antworten besser ist oder warum die Antworten ähnlich gut sind.),
"{BETTER_ANSWER_KEY}": Literal["Antwort 1 ist besser", "Antwort 2 ist besser", "Beide gleich"]
}}""",  # noqa: E501
            user_prompt=f"""Aufgabe:
{{{INSTRUCTION_KEY}}}
---
Antwort 1:
{{{ANSWER_1_KEY}}}
---
Antwort 2:
{{{ANSWER_2_KEY}}}""",
            parse_map={
                "Antwort 1 ist besser": MatchOutcome.A_WINS,
                "Antwort 2 ist besser": MatchOutcome.B_WINS,
                "Beide gleich": MatchOutcome.DRAW,
            },
        ),
        Language("en"): PromptTemplateWithParseMap(
            system_prompt=f"""Note the given task and the corresponding answers. Decide which answer is better, answer 1 or answer 2. Then output "Answer 1 is better", "Answer 2 is better" or "Both equal".

A good answer is:
1. correct in content.
2. follows the requirements of the task precisely.
3. is creative and not repetitive in the context of the task.
4. written in the same language as the task.

Enter the answer in the following json format:
{{
"{REASONING_KEY}": str (Describe in a few sentences (max. 5) the differences between the two answers and give reasons why one of the two answers is better or why the answers are similarly good),
"{BETTER_ANSWER_KEY}": Literal["Answer 1 is better", "Answer 2 is better", "Both equal"]
}}""",  # noqa: E501
            user_prompt=f"""Task:
{{{INSTRUCTION_KEY}}}
---
Answer 1:
{{{ANSWER_1_KEY}}}
---
Answer 2:
{{{ANSWER_2_KEY}}}""",
            parse_map={
                "Answer 1 is better": MatchOutcome.A_WINS,
                "Answer 2 is better": MatchOutcome.B_WINS,
                "Both equal": MatchOutcome.DRAW,
            },
        ),
    }

    def __init__(
        self,
        grading_model: StructuredOutputChatModel,
        prompt_templates: Mapping[Language, PromptTemplateWithParseMap] = PROMPT_TEMPLATES,
    ) -> None:
        """Store the judge model and the prompt templates after validating them.

        Args:
            grading_model: Model used as the judge.
            prompt_templates: Judge prompts per language; defaults to the
                built-in ``PROMPT_TEMPLATES``.

        Raises:
            ValueError: If any template's user prompt lacks one of the
                instruction, answer-1 or answer-2 placeholders.
        """
        self._grading_model = grading_model

        # All three placeholders are required. Previously ANSWER_2_KEY was not
        # checked, so a template that never showed the second answer to the
        # judge was silently accepted.
        required_keys = (self.INSTRUCTION_KEY, self.ANSWER_1_KEY, self.ANSWER_2_KEY)
        for prompt_template in prompt_templates.values():
            if any(key not in prompt_template.user_prompt for key in required_keys):
                raise ValueError(
                    "At least one PromptTemplate invalid, must contain "
                    f"'{self.INSTRUCTION_KEY}', '{self.ANSWER_1_KEY}' and '{self.ANSWER_2_KEY}'."
                )
        self._prompt_templates = prompt_templates

    def grade(
        self, instruction: str, completion_1: str, completion_2: str, language: Language
    ) -> ComparisonGradingOutput:
        """Ask the judge model which of two completions answers ``instruction`` better.

        Args:
            instruction: The task both completions respond to.
            completion_1: Answer shown to the judge as answer 1 (maps to A).
            completion_2: Answer shown to the judge as answer 2 (maps to B).
            language: Language used to select the prompt template. Unlike some
                sibling graders there is no fallback here, so a failing lookup
                propagates to the caller.

        Returns:
            The judge's reasoning and the parsed outcome, together with the
            judge prompt and raw response.
        """
        prompt_template = language.language_config(self._prompt_templates)

        # The first positional argument of to_messages is passed empty, as before.
        messages = prompt_template.to_messages(
            [],
            [
                (self.INSTRUCTION_KEY, instruction),
                (self.ANSWER_1_KEY, completion_1),
                (self.ANSWER_2_KEY, completion_2),
            ],
        )

        raw_completion = self._grading_model.generate_from_messages([messages])[0]
        loaded_json = parse_json_output(raw_completion.completion)

        # The verdict phrase is stringified before lookup so that a missing or
        # non-string value simply fails to match.
        # NOTE(review): in that case `outcome` is None although the field is
        # annotated as MatchOutcome — confirm how GradingOutput treats this.
        verdict_phrase = str(loaded_json.get(self.BETTER_ANSWER_KEY, None))

        return ComparisonGradingOutput(
            reasoning=loaded_json.get(self.REASONING_KEY, None),
            outcome=prompt_template.parse_map.get(verdict_phrase, None),
            judge_prompt=raw_completion.prompt,
            judge_response=raw_completion.completion,
        )
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
from collections.abc import Mapping
|
|
2
|
+
|
|
3
|
+
from eval_framework.llm.base import BaseLLM as StructuredOutputChatModel
|
|
4
|
+
from eval_framework.metrics.llm.graders.language import Language
|
|
5
|
+
from eval_framework.metrics.llm.graders.models import GradingOutput, PromptTemplate, parse_json_output
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ConcisenessGradingOutput(GradingOutput):
    """Structured verdict returned by ``ConcisenessGrader.grade``.

    Both fields are optional because the grader fills them with ``dict.get``
    on the judge's parsed JSON and therefore stores ``None`` when a key is
    absent. The grader additionally passes ``judge_prompt`` and
    ``judge_response``; those are presumably declared on ``GradingOutput``
    (not visible here — confirm against the base class).
    """

    # Free-text justification written by the judge model, if it supplied one.
    thought_process: str | None
    # Judge's verdict: whether the completion is concise.
    is_concise: bool | None
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ConcisenessGrader:
    """LLM-as-judge grader deciding whether a completion is concise.

    A judge model is shown the user instruction and the completion and is
    asked to answer with a JSON object containing ``thought_process`` and
    ``is_concise``.
    """

    # Placeholder names that every user prompt must contain.
    INSTRUCTION_KEY = "instruction"
    COMPLETION_KEY = "completion"
    # Built-in judge prompts, keyed by language.
    PROMPT_TEMPLATES = {
        Language("de"): PromptTemplate(
            system_prompt="""Deine Aufgabe ist es zu klassifizieren, ob eine von einem Textgenerator gelieferte Antwort kurz und prägnant ist.

Eine kurz und prägnante ("concise") Antwort ist eine Antwort, die knapp und auf den Punkt ist, ohne unnötige Details oder Ausführungen.

Gebe deine Bewertung in folgendem JSON-Format:
{
"thought_process": str (Achte sehr genau auf die Antwort und argumentiere in ein paar Sätzen, ob die Antwort kurz und prägnant ("concise") ist oder nicht),
"is_concise": bool
}""",  # noqa: E501
            user_prompt=f"""**Benutzeranweisung**
{{{INSTRUCTION_KEY}}}

---
**Antwort des Textgenerators**
{{{COMPLETION_KEY}}}""",
        ),
        Language("en"): PromptTemplate(
            system_prompt="""Your task is to classify if a text generation model's response is concise.

A concise response is one that is brief and to the point, without unnecessary details or elaboration.

You must provide your evaluation in the following JSON format:
{
"thought_process": str (Pay very close attention to the response and argue whether the response is concise or not in a few sentences),
"is_concise": bool
}""",  # noqa: E501
            user_prompt=f"""**User Instruction**:
{{{INSTRUCTION_KEY}}}

---
**Model Response**:
{{{COMPLETION_KEY}}}""",
        ),
    }

    def __init__(
        self,
        grading_model: StructuredOutputChatModel,
        prompt_templates: Mapping[Language, PromptTemplate] = PROMPT_TEMPLATES,
    ) -> None:
        """Store the judge model and the prompt templates after validating them.

        Args:
            grading_model: Model used as the judge.
            prompt_templates: Judge prompts per language; defaults to the
                built-in ``PROMPT_TEMPLATES``.

        Raises:
            ValueError: If any template's user prompt lacks the instruction
                or completion placeholder.
        """
        self._grading_model = grading_model

        required_keys = (self.INSTRUCTION_KEY, self.COMPLETION_KEY)
        for prompt_template in prompt_templates.values():
            if any(key not in prompt_template.user_prompt for key in required_keys):
                # Both halves of the message must be f-strings; the second one
                # previously was not, so '{self.INSTRUCTION_KEY}' was shown
                # literally instead of the key name.
                raise ValueError(
                    f"At least one PromptTemplate is invalid, must contain '{self.COMPLETION_KEY}' "
                    f"and '{self.INSTRUCTION_KEY}'."
                )
        self._prompt_templates = prompt_templates

    def grade(self, instruction: str, completion: str, language: Language) -> ConcisenessGradingOutput:
        """Ask the judge model whether ``completion`` is a concise answer to ``instruction``.

        Args:
            instruction: The user instruction the completion responds to.
            completion: Text to classify.
            language: Language used to select the prompt template; if the
                lookup raises, the English template is used instead.

        Returns:
            The judge's reasoning and verdict (each ``None`` when missing from
            the parsed JSON) together with the judge prompt and raw response.
        """
        try:
            prompt_template = language.language_config(self._prompt_templates)
        except Exception:
            # Any lookup failure falls back to the English prompt.
            prompt_template = Language("en").language_config(self._prompt_templates)

        # The first positional argument of to_messages is passed empty, as before.
        messages = prompt_template.to_messages(
            [],
            [
                (self.INSTRUCTION_KEY, instruction),
                (self.COMPLETION_KEY, completion),
            ],
        )

        raw_completion = self._grading_model.generate_from_messages([messages])[0]
        loaded_json = parse_json_output(raw_completion.completion)

        return ConcisenessGradingOutput(
            thought_process=loaded_json.get("thought_process", None),
            is_concise=loaded_json.get("is_concise", None),
            judge_prompt=raw_completion.prompt,
            judge_response=raw_completion.completion,
        )
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
from collections.abc import Mapping
|
|
2
|
+
|
|
3
|
+
from eval_framework.llm.base import BaseLLM as StructuredOutputChatModel
|
|
4
|
+
from eval_framework.metrics.llm.graders.language import Language
|
|
5
|
+
from eval_framework.metrics.llm.graders.models import GradingOutput, PromptTemplate, parse_json_output
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ContainsNamesGradingOutput(GradingOutput):
    """Structured verdict returned by ``ContainsNamesGrader.grade``.

    The grader additionally passes ``judge_prompt`` and ``judge_response``;
    those are presumably declared on ``GradingOutput`` (not visible here —
    confirm against the base class).
    """

    # Judge's verdict: whether the text contains names of individuals.
    # None when the judge's parsed JSON has no "contains_names" key.
    contains_names: bool | None
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ContainsNamesGrader:
    """LLM-as-judge grader deciding whether a text contains names of individuals.

    The judge model receives the text as the entire user message and is asked
    to answer with a JSON object containing a single ``contains_names`` flag.
    """

    # Placeholder name that every user prompt must contain.
    COMPLETION_KEY = "completion"
    # Built-in judge prompts, keyed by language.
    PROMPT_TEMPLATES = {
        Language("en"): PromptTemplate(
            system_prompt="""Your assignment is to assess whether text contains names of individuals.

Your assessment should be formatted in JSON as follows:
{
"contains_names": bool (State with a true or false whether the text contains any names.)
}""",
            user_prompt=f"""{{{COMPLETION_KEY}}}""",
        ),
        Language("de"): PromptTemplate(
            system_prompt="""Deine Aufgabe ist es zu bewerten, ob ein Text Namen von Personen enthält.

Deine Bewertung sollte im JSON-Format wie folgt aussehen:
{
"contains_names": bool (Gib an, ob der Text Namen enthält oder nicht.)
}""",
            user_prompt=f"""{{{COMPLETION_KEY}}}""",
        ),
        Language("fr"): PromptTemplate(
            system_prompt="""Votre tâche consiste à évaluer si un texte contient des noms de personnes.

Votre évaluation doit être formatée en JSON comme suit :
{
"contains_names": bool (Indiquez si le texte contient des noms ou non.)
}""",
            user_prompt=f"""{{{COMPLETION_KEY}}}""",
        ),
    }

    def __init__(
        self,
        grading_model: StructuredOutputChatModel,
        prompt_templates: Mapping[Language, PromptTemplate] = PROMPT_TEMPLATES,
    ) -> None:
        """Store the judge model and the prompt templates after validating them.

        Args:
            grading_model: Model used as the judge.
            prompt_templates: Judge prompts per language; defaults to the
                built-in ``PROMPT_TEMPLATES``.

        Raises:
            ValueError: If any template's user prompt does not contain
                ``COMPLETION_KEY``.
        """
        # A template without the completion placeholder could never show the
        # judge the text it is supposed to assess.
        if any(self.COMPLETION_KEY not in template.user_prompt for template in prompt_templates.values()):
            raise ValueError(f"At least one PromptTemplate is invalid, must contain '{self.COMPLETION_KEY}'.")

        self._grading_model = grading_model
        self._prompt_templates = prompt_templates

    def grade(self, completion: str, language: Language) -> ContainsNamesGradingOutput:
        """Ask the judge model whether ``completion`` mentions any person names.

        Args:
            completion: Text to inspect.
            language: Language used to select the prompt template. There is no
                fallback here, so a failing lookup propagates to the caller.

        Returns:
            The judge's verdict (``None`` when missing from the parsed JSON)
            together with the judge prompt and raw response.
        """
        template = language.language_config(self._prompt_templates)

        substitutions = [(self.COMPLETION_KEY, completion)]
        # The first positional argument of to_messages is passed empty, as before.
        messages = template.to_messages([], substitutions)

        judge_result = self._grading_model.generate_from_messages([messages])[0]
        verdict = parse_json_output(judge_result.completion)

        return ContainsNamesGradingOutput(
            contains_names=verdict.get("contains_names"),
            judge_prompt=judge_result.prompt,
            judge_response=judge_result.completion,
        )
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
from collections.abc import Mapping
|
|
2
|
+
|
|
3
|
+
from eval_framework.llm.base import BaseLLM as StructuredOutputChatModel
|
|
4
|
+
from eval_framework.metrics.llm.graders.language import Language
|
|
5
|
+
from eval_framework.metrics.llm.graders.models import (
|
|
6
|
+
FOFOPromptTemplate,
|
|
7
|
+
GradingOutput,
|
|
8
|
+
parse_json_output,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class FormatCorrectnessOutput(GradingOutput):
    """Structured verdict returned by ``FormatCorrectnessGrader.grade``.

    The grader additionally passes ``judge_prompt`` and ``judge_response``;
    those are presumably declared on ``GradingOutput`` (not visible here —
    confirm against the base class).
    """

    # Judge's reasoning, rendered by the grader as one bullet-pointed string.
    # None when the judge's parsed JSON has no "reasons" key.
    reasons: str | None
    # The prompt asks the judge for 1 (format correct) or 0 (incorrect).
    # None when the judge's parsed JSON has no "format_correctness" key.
    format_correctness: int | None
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class FormatCorrectnessGrader:
    """LLM-as-judge grader checking whether a completion follows the requested output format.

    The judge model is shown the instruction and the completion inside a
    leaderboard-style prompt and is asked to return JSON with ``model``,
    ``format_correctness`` (1 or 0) and ``reasons``.
    """

    # Placeholder tokens embedded verbatim in the user prompt.
    INSTRUCTION_KEY = "<instruction>"
    COMPLETION_KEY = "<completion>"

    # Built-in judge prompt; only English is provided.
    PROMPT_TEMPLATES = {
        Language("en"): FOFOPromptTemplate(
            system_prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
            user_prompt=f"""
I would like you to create a leaderboard that evaluates the correctness of the format of answers from
various large language models. To accomplish this, you will need to analyze the text prompts given to
the models and their corresponding answers. Specifically, please ensure that your evaluation outputs are
properly formatted as a json string. I will provide both the prompts and the responses for this purpose.\n

Here is the prompt: {{
"instruction": {INSTRUCTION_KEY}
}}

Here are the outputs of the models:
[
{{
"model": "model",
"answer": {COMPLETION_KEY}
}},
]

Please evaluate the formatting of the model’s responses by checking if they comply with the format
specifications stated in the prompt. Perform a thorough format check and provide a detailed explanation
for why the format is correct or incorrect. Your feedback should include the name of the model, followed
by the format correctness status represented as ’1’ for correct and ’0’ for incorrect. Present your
reasoning as bullet points within a single string for each model assessed. In other words, you should
produce the following output:
```json
[
{{
"model": <model-name>,
"format_correctness": <correctness>,
"reasons": <reasons-of-format-correctness>
}}
]```

Please note that your response should be a properly formatted JSON string and should not contain any
additional content. We will load it directly as a JSON string in Python.
""",
        ),
    }

    def __init__(
        self,
        grading_model: StructuredOutputChatModel,
        prompt_templates: Mapping[Language, FOFOPromptTemplate] = PROMPT_TEMPLATES,
    ) -> None:
        """Store the judge model and the prompt templates after validating them.

        Args:
            grading_model: Model used as the judge.
            prompt_templates: Judge prompts per language; defaults to the
                built-in ``PROMPT_TEMPLATES``.

        Raises:
            ValueError: If any template's user prompt lacks the instruction
                or completion placeholder.
        """
        self._grading_model = grading_model

        required_keys = (self.INSTRUCTION_KEY, self.COMPLETION_KEY)
        for prompt_template in prompt_templates.values():
            if any(key not in prompt_template.user_prompt for key in required_keys):
                # Both halves of the message must be f-strings; the second one
                # previously was not, so '{self.INSTRUCTION_KEY}' was shown
                # literally instead of the key name.
                raise ValueError(
                    f"At least one PromptTemplate is invalid, must contain '{self.COMPLETION_KEY}' "
                    f"and '{self.INSTRUCTION_KEY}'."
                )
        self._prompt_templates = prompt_templates

    def grade(self, instruction: str, completion: str, language: Language) -> FormatCorrectnessOutput:
        """Ask the judge model whether ``completion`` obeys the format demanded by ``instruction``.

        Args:
            instruction: The prompt whose format requirements are checked.
            completion: The model answer to check.
            language: Language used to select the prompt template; if the
                lookup raises, the English template is used instead.

        Returns:
            The judge's reasons as a bullet-pointed string and the 0/1
            correctness flag (each ``None`` when missing from the parsed JSON)
            together with the judge prompt and raw response.
        """
        try:
            prompt_template = language.language_config(self._prompt_templates)
        except Exception:
            # Any lookup failure falls back to the English prompt.
            prompt_template = Language("en").language_config(self._prompt_templates)

        # The first positional argument of to_messages is passed empty, as before.
        messages = prompt_template.to_messages(
            [],
            [
                (self.INSTRUCTION_KEY, instruction),
                (self.COMPLETION_KEY, completion),
            ],
        )

        raw_completion = self._grading_model.generate_from_messages([messages])[0]
        loaded_json = parse_json_output(raw_completion.completion)

        # The prompt asks the judge for a JSON array with one object per
        # model, so accept that shape by taking the first object in it. A list
        # has no .get and would otherwise fail below.
        # NOTE(review): parse_json_output may already unwrap arrays — confirm.
        if isinstance(loaded_json, list):
            loaded_json = next((entry for entry in loaded_json if isinstance(entry, dict)), {})

        # Normalise the reasons into one bullet-pointed string whether the
        # judge returned a list of reasons or a single string.
        reasons = loaded_json.get("reasons", None)
        if isinstance(reasons, list):
            reasons = "".join("• " + reason + "\n " for reason in reasons)
        elif isinstance(reasons, str):
            reasons = "• " + reasons + "\n "

        return FormatCorrectnessOutput(
            reasons=reasons,
            format_correctness=loaded_json.get("format_correctness", None),
            judge_prompt=raw_completion.prompt,
            judge_response=raw_completion.completion,
        )
|