eval-framework 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eval_framework/__init__.py +7 -0
- eval_framework/base_config.py +36 -0
- eval_framework/context/__init__.py +0 -0
- eval_framework/context/determined.py +170 -0
- eval_framework/context/eval.py +114 -0
- eval_framework/context/local.py +52 -0
- eval_framework/evaluation_generator.py +231 -0
- eval_framework/exceptions.py +2 -0
- eval_framework/external/ifeval_impl/README.md +5 -0
- eval_framework/external/ifeval_impl/instructions.py +1523 -0
- eval_framework/external/ifeval_impl/instructions_registry.py +161 -0
- eval_framework/external/ifeval_impl/instructions_util.py +1689 -0
- eval_framework/external/ifeval_impl/utils.py +135 -0
- eval_framework/llm/__init__.py +0 -0
- eval_framework/llm/aleph_alpha.py +323 -0
- eval_framework/llm/base.py +58 -0
- eval_framework/llm/huggingface.py +332 -0
- eval_framework/llm/mistral.py +73 -0
- eval_framework/llm/models.py +16 -0
- eval_framework/llm/openai.py +205 -0
- eval_framework/llm/vllm.py +438 -0
- eval_framework/logger.py +3 -0
- eval_framework/main.py +187 -0
- eval_framework/metrics/__init__.py +0 -0
- eval_framework/metrics/base.py +40 -0
- eval_framework/metrics/completion/__init__.py +1 -0
- eval_framework/metrics/completion/accuracy_completion.py +16 -0
- eval_framework/metrics/completion/bleu.py +76 -0
- eval_framework/metrics/completion/chrf.py +62 -0
- eval_framework/metrics/completion/code_assertion.py +44 -0
- eval_framework/metrics/completion/code_execution_pass_at_one.py +126 -0
- eval_framework/metrics/completion/comet.py +56 -0
- eval_framework/metrics/completion/concordance_index.py +38 -0
- eval_framework/metrics/completion/csv_format.py +102 -0
- eval_framework/metrics/completion/cwe_accuracy.py +49 -0
- eval_framework/metrics/completion/exponential_similarity.py +65 -0
- eval_framework/metrics/completion/f1.py +42 -0
- eval_framework/metrics/completion/format_checker.py +56 -0
- eval_framework/metrics/completion/grid_difference.py +77 -0
- eval_framework/metrics/completion/ifeval.py +73 -0
- eval_framework/metrics/completion/json_format.py +171 -0
- eval_framework/metrics/completion/language_checker.py +74 -0
- eval_framework/metrics/completion/length_control.py +83 -0
- eval_framework/metrics/completion/math_reasoning_completion.py +303 -0
- eval_framework/metrics/completion/niah_accuracy.py +163 -0
- eval_framework/metrics/completion/placeholder_checker.py +27 -0
- eval_framework/metrics/completion/repetition.py +88 -0
- eval_framework/metrics/completion/rouge_1.py +35 -0
- eval_framework/metrics/completion/rouge_2.py +45 -0
- eval_framework/metrics/completion/rouge_geometric_mean.py +36 -0
- eval_framework/metrics/completion/rouge_l.py +52 -0
- eval_framework/metrics/completion/struct_eval_metrics.py +248 -0
- eval_framework/metrics/completion/ter.py +67 -0
- eval_framework/metrics/completion/text_counter.py +182 -0
- eval_framework/metrics/efficiency/__init__.py +0 -0
- eval_framework/metrics/efficiency/bytes_per_sequence_position.py +48 -0
- eval_framework/metrics/llm/__init__.py +0 -0
- eval_framework/metrics/llm/base.py +8 -0
- eval_framework/metrics/llm/graders/chatbot_style_grader.py +92 -0
- eval_framework/metrics/llm/graders/comparison_grader.py +146 -0
- eval_framework/metrics/llm/graders/conciseness_grader.py +93 -0
- eval_framework/metrics/llm/graders/contains_names_grader.py +71 -0
- eval_framework/metrics/llm/graders/format_correctness_grader.py +109 -0
- eval_framework/metrics/llm/graders/instruction_grader.py +177 -0
- eval_framework/metrics/llm/graders/language.py +56 -0
- eval_framework/metrics/llm/graders/long_context_grader.py +72 -0
- eval_framework/metrics/llm/graders/models.py +74 -0
- eval_framework/metrics/llm/graders/refusal_grader.py +57 -0
- eval_framework/metrics/llm/graders/sql_quality_grader.py +145 -0
- eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +103 -0
- eval_framework/metrics/llm/llm_judge_chatbot_style.py +36 -0
- eval_framework/metrics/llm/llm_judge_completion_accuracy.py +39 -0
- eval_framework/metrics/llm/llm_judge_conciseness.py +37 -0
- eval_framework/metrics/llm/llm_judge_contains_names.py +36 -0
- eval_framework/metrics/llm/llm_judge_format_correctness.py +43 -0
- eval_framework/metrics/llm/llm_judge_instruction.py +58 -0
- eval_framework/metrics/llm/llm_judge_mtbench_pair.py +205 -0
- eval_framework/metrics/llm/llm_judge_mtbench_single.py +188 -0
- eval_framework/metrics/llm/llm_judge_refusal.py +35 -0
- eval_framework/metrics/llm/llm_judge_sql.py +394 -0
- eval_framework/metrics/llm/llm_judge_world_knowledge.py +37 -0
- eval_framework/metrics/loglikelihood/__init__.py +0 -0
- eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +51 -0
- eval_framework/metrics/loglikelihood/probability_mass.py +56 -0
- eval_framework/py.typed +0 -0
- eval_framework/response_generator.py +416 -0
- eval_framework/result_processors/__init__.py +0 -0
- eval_framework/result_processors/base.py +74 -0
- eval_framework/result_processors/hf_processor.py +87 -0
- eval_framework/result_processors/result_processor.py +129 -0
- eval_framework/run.py +314 -0
- eval_framework/run_direct.py +42 -0
- eval_framework/shared/types.py +227 -0
- eval_framework/tasks/__init__.py +6 -0
- eval_framework/tasks/base.py +314 -0
- eval_framework/tasks/benchmarks/__init__.py +0 -0
- eval_framework/tasks/benchmarks/arc.py +46 -0
- eval_framework/tasks/benchmarks/arc_de.py +46 -0
- eval_framework/tasks/benchmarks/arc_fi.py +46 -0
- eval_framework/tasks/benchmarks/belebele.py +60 -0
- eval_framework/tasks/benchmarks/bigcodebench.py +155 -0
- eval_framework/tasks/benchmarks/casehold.py +47 -0
- eval_framework/tasks/benchmarks/chembench.py +85 -0
- eval_framework/tasks/benchmarks/copa.py +39 -0
- eval_framework/tasks/benchmarks/duc.py +91 -0
- eval_framework/tasks/benchmarks/flores200.py +62 -0
- eval_framework/tasks/benchmarks/flores_plus.py +84 -0
- eval_framework/tasks/benchmarks/gpqa.py +177 -0
- eval_framework/tasks/benchmarks/gsm8k.py +148 -0
- eval_framework/tasks/benchmarks/hellaswag.py +44 -0
- eval_framework/tasks/benchmarks/hellaswag_de.py +52 -0
- eval_framework/tasks/benchmarks/humaneval.py +97 -0
- eval_framework/tasks/benchmarks/ifeval.py +78 -0
- eval_framework/tasks/benchmarks/include.py +119 -0
- eval_framework/tasks/benchmarks/infinitebench.py +302 -0
- eval_framework/tasks/benchmarks/math_reasoning.py +569 -0
- eval_framework/tasks/benchmarks/mbpp.py +192 -0
- eval_framework/tasks/benchmarks/mmlu.py +190 -0
- eval_framework/tasks/benchmarks/mmlu_de.py +109 -0
- eval_framework/tasks/benchmarks/mmlu_pro.py +139 -0
- eval_framework/tasks/benchmarks/mmmlu.py +529 -0
- eval_framework/tasks/benchmarks/openbookqa.py +37 -0
- eval_framework/tasks/benchmarks/opengptx_eu20.py +363 -0
- eval_framework/tasks/benchmarks/pawsx.py +65 -0
- eval_framework/tasks/benchmarks/piqa.py +39 -0
- eval_framework/tasks/benchmarks/quality.py +56 -0
- eval_framework/tasks/benchmarks/sciq.py +44 -0
- eval_framework/tasks/benchmarks/sphyr.py +75 -0
- eval_framework/tasks/benchmarks/squad.py +89 -0
- eval_framework/tasks/benchmarks/struct_eval.py +110 -0
- eval_framework/tasks/benchmarks/tablebench.py +117 -0
- eval_framework/tasks/benchmarks/triviaqa.py +42 -0
- eval_framework/tasks/benchmarks/truthfulqa.py +95 -0
- eval_framework/tasks/benchmarks/winogender.py +39 -0
- eval_framework/tasks/benchmarks/winogrande.py +44 -0
- eval_framework/tasks/benchmarks/winox.py +57 -0
- eval_framework/tasks/benchmarks/wmt.py +160 -0
- eval_framework/tasks/benchmarks/zero_scrolls.py +197 -0
- eval_framework/tasks/eval_config.py +112 -0
- eval_framework/tasks/perturbation.py +83 -0
- eval_framework/tasks/registry.py +186 -0
- eval_framework/tasks/task_loader.py +80 -0
- eval_framework/tasks/task_names.py +138 -0
- eval_framework/tasks/utils.py +578 -0
- eval_framework/utils/constants.py +9 -0
- eval_framework/utils/generate_task_docs.py +229 -0
- eval_framework/utils/helpers.py +3 -0
- eval_framework/utils/logging.py +50 -0
- eval_framework/utils/packaging.py +52 -0
- eval_framework-0.2.0.dist-info/METADATA +514 -0
- eval_framework-0.2.0.dist-info/RECORD +161 -0
- eval_framework-0.2.0.dist-info/WHEEL +4 -0
- eval_framework-0.2.0.dist-info/entry_points.txt +3 -0
- template_formatting/README.md +83 -0
- template_formatting/__init__.py +0 -0
- template_formatting/formatter.py +536 -0
- template_formatting/mistral_formatter.py +159 -0
- template_formatting/py.typed +0 -0
- template_formatting/tests/test_formatter_eval.py +408 -0
- template_formatting/tests/test_formatter_scaling.py +253 -0
- template_formatting/tests/test_mistral_formatter.py +136 -0
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
# Copyright 2023 The Google Research Authors.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
# mypy: ignore-errors
|
|
16
|
+
|
|
17
|
+
"""Registry of all instructions."""
|
|
18
|
+
|
|
19
|
+
from eval_framework.external.ifeval_impl import instructions
|
|
20
|
+
|
|
21
|
+
# Category prefixes used to build instruction ids.  A full id is the prefix
# followed by the instruction name, e.g. _KEYWORD + "existence" yields
# "keywords:existence".
_KEYWORD = "keywords:"
_LANGUAGE = "language:"
_LENGTH = "length_constraints:"
_CONTENT = "detectable_content:"
_FORMAT = "detectable_format:"
# In this module _MULTITURN only appears in commented-out registry entries.
_MULTITURN = "multi-turn:"
_COMBINATION = "combination:"
_STARTEND = "startend:"
_CHANGE_CASES = "change_case:"
_PUNCTUATION = "punctuation:"
|
|
40
|
+
|
|
41
|
+
# Registry mapping each supported instruction id ("<category>:<name>") to the
# class in `instructions` registered for it.  Insertion order is kept exactly
# as before; entries marked "disabled" are intentionally not registered.
INSTRUCTION_DICT = {
    # --- keywords ---
    f"{_KEYWORD}existence": instructions.KeywordChecker,
    f"{_KEYWORD}frequency": instructions.KeywordFrequencyChecker,
    # disabled: f"{_KEYWORD}key_sentences": instructions.KeySentenceChecker,
    f"{_KEYWORD}forbidden_words": instructions.ForbiddenWords,
    f"{_KEYWORD}letter_frequency": instructions.LetterFrequencyChecker,
    # --- language ---
    f"{_LANGUAGE}response_language": instructions.ResponseLanguageChecker,
    # --- length constraints ---
    f"{_LENGTH}number_sentences": instructions.NumberOfSentences,
    f"{_LENGTH}number_paragraphs": instructions.ParagraphChecker,
    f"{_LENGTH}number_words": instructions.NumberOfWords,
    f"{_LENGTH}nth_paragraph_first_word": instructions.ParagraphFirstWordCheck,
    # --- detectable content ---
    f"{_CONTENT}number_placeholders": instructions.PlaceholderChecker,
    f"{_CONTENT}postscript": instructions.PostscriptChecker,
    # --- detectable format ---
    f"{_FORMAT}number_bullet_lists": instructions.BulletListChecker,
    # disabled: f"{_CONTENT}rephrase_paragraph": instructions.RephraseParagraph,
    f"{_FORMAT}constrained_response": instructions.ConstrainedResponseChecker,
    f"{_FORMAT}number_highlighted_sections": instructions.HighlightSectionChecker,
    f"{_FORMAT}multiple_sections": instructions.SectionChecker,
    # disabled: f"{_FORMAT}rephrase": instructions.RephraseChecker,
    f"{_FORMAT}json_format": instructions.JsonFormat,
    f"{_FORMAT}title": instructions.TitleChecker,
    # disabled: f"{_MULTITURN}constrained_start": instructions.ConstrainedStartChecker,
    # --- combination ---
    f"{_COMBINATION}two_responses": instructions.TwoResponsesChecker,
    f"{_COMBINATION}repeat_prompt": instructions.RepeatPromptThenAnswer,
    # --- start/end, case and punctuation ---
    f"{_STARTEND}end_checker": instructions.EndChecker,
    f"{_CHANGE_CASES}capital_word_frequency": instructions.CapitalWordFrequencyChecker,
    f"{_CHANGE_CASES}english_capital": instructions.CapitalLettersEnglishChecker,
    f"{_CHANGE_CASES}english_lowercase": instructions.LowercaseLettersEnglishChecker,
    f"{_PUNCTUATION}no_comma": instructions.CommaChecker,
    f"{_STARTEND}quotation": instructions.QuotationChecker,
}
|
|
72
|
+
|
|
73
|
+
# For each instruction id, the set of instruction ids it conflicts with.
# Every entry lists at least itself.  The table is not symmetric as written
# (e.g. "quotation" lists "title" but not vice versa).
# NOTE(review): presumably callers pass this through conflict_make() to add
# the reverse links -- confirm at the call site.
INSTRUCTION_CONFLICTS = {
    f"{_KEYWORD}existence": {f"{_KEYWORD}existence"},
    f"{_KEYWORD}frequency": {f"{_KEYWORD}frequency"},
    # disabled id: f"{_KEYWORD}key_sentences"
    f"{_KEYWORD}forbidden_words": {f"{_KEYWORD}forbidden_words"},
    f"{_KEYWORD}letter_frequency": {f"{_KEYWORD}letter_frequency"},
    f"{_LANGUAGE}response_language": {
        f"{_LANGUAGE}response_language",
        f"{_FORMAT}multiple_sections",
        f"{_KEYWORD}existence",
        f"{_KEYWORD}frequency",
        f"{_KEYWORD}forbidden_words",
        f"{_STARTEND}end_checker",
        f"{_CHANGE_CASES}english_capital",
        f"{_CHANGE_CASES}english_lowercase",
    },
    f"{_LENGTH}number_sentences": {f"{_LENGTH}number_sentences"},
    f"{_LENGTH}number_paragraphs": {
        f"{_LENGTH}number_paragraphs",
        f"{_LENGTH}nth_paragraph_first_word",
        f"{_LENGTH}number_sentences",
    },
    f"{_LENGTH}number_words": {f"{_LENGTH}number_words"},
    f"{_LENGTH}nth_paragraph_first_word": {
        f"{_LENGTH}nth_paragraph_first_word",
        f"{_LENGTH}number_paragraphs",
    },
    f"{_CONTENT}number_placeholders": {f"{_CONTENT}number_placeholders"},
    f"{_CONTENT}postscript": {f"{_CONTENT}postscript"},
    f"{_FORMAT}number_bullet_lists": {f"{_FORMAT}number_bullet_lists"},
    # disabled id: f"{_CONTENT}rephrase_paragraph"
    # A constrained response conflicts with every registered instruction.
    f"{_FORMAT}constrained_response": set(INSTRUCTION_DICT),
    f"{_FORMAT}number_highlighted_sections": {f"{_FORMAT}number_highlighted_sections"},
    f"{_FORMAT}multiple_sections": {
        f"{_FORMAT}multiple_sections",
        f"{_LANGUAGE}response_language",
        f"{_FORMAT}number_highlighted_sections",
    },
    # disabled id: f"{_FORMAT}rephrase"
    # The next three conflict with everything except the ids subtracted.
    f"{_FORMAT}json_format": set(INSTRUCTION_DICT) - {
        f"{_KEYWORD}forbidden_words",
        f"{_KEYWORD}existence",
    },
    f"{_FORMAT}title": {f"{_FORMAT}title"},
    # disabled id: f"{_MULTITURN}constrained_start"
    f"{_COMBINATION}two_responses": set(INSTRUCTION_DICT) - {
        f"{_KEYWORD}forbidden_words",
        f"{_KEYWORD}existence",
        f"{_LANGUAGE}response_language",
        f"{_FORMAT}title",
        f"{_PUNCTUATION}no_comma",
    },
    f"{_COMBINATION}repeat_prompt": set(INSTRUCTION_DICT) - {
        f"{_KEYWORD}existence",
        f"{_FORMAT}title",
        f"{_PUNCTUATION}no_comma",
    },
    f"{_STARTEND}end_checker": {f"{_STARTEND}end_checker"},
    f"{_CHANGE_CASES}capital_word_frequency": {
        f"{_CHANGE_CASES}capital_word_frequency",
        f"{_CHANGE_CASES}english_lowercase",
        f"{_CHANGE_CASES}english_capital",
    },
    f"{_CHANGE_CASES}english_capital": {f"{_CHANGE_CASES}english_capital"},
    f"{_CHANGE_CASES}english_lowercase": {
        f"{_CHANGE_CASES}english_lowercase",
        f"{_CHANGE_CASES}english_capital",
    },
    f"{_PUNCTUATION}no_comma": {f"{_PUNCTUATION}no_comma"},
    f"{_STARTEND}quotation": {f"{_STARTEND}quotation", f"{_FORMAT}title"},
}
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def conflict_make(conflicts):
    """Makes sure if A conflicts with B, B will conflict with A.

    The dictionary and its sets are updated in place; the same dictionary
    object is returned for convenience.

    Args:
      conflicts: Dictionary of potential conflicts where key is instruction id
        and value is set of instruction ids that it conflicts with.

    Returns:
      Revised version of the dictionary. All instructions conflict with
      themselves. If A conflicts with B, B will conflict with A.

    Raises:
      KeyError: If a set contains an instruction id that is not itself a key
        of `conflicts`.
    """
    for key in conflicts:
        # Iterate over a snapshot: if two ids share the same set object,
        # adding to conflicts[other] would resize the very set being iterated
        # and raise "Set changed size during iteration".
        for other in tuple(conflicts[key]):
            conflicts[other].add(key)
        conflicts[key].add(key)
    return conflicts
|