eval-framework 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161) hide show
  1. eval_framework/__init__.py +7 -0
  2. eval_framework/base_config.py +36 -0
  3. eval_framework/context/__init__.py +0 -0
  4. eval_framework/context/determined.py +170 -0
  5. eval_framework/context/eval.py +114 -0
  6. eval_framework/context/local.py +52 -0
  7. eval_framework/evaluation_generator.py +231 -0
  8. eval_framework/exceptions.py +2 -0
  9. eval_framework/external/ifeval_impl/README.md +5 -0
  10. eval_framework/external/ifeval_impl/instructions.py +1523 -0
  11. eval_framework/external/ifeval_impl/instructions_registry.py +161 -0
  12. eval_framework/external/ifeval_impl/instructions_util.py +1689 -0
  13. eval_framework/external/ifeval_impl/utils.py +135 -0
  14. eval_framework/llm/__init__.py +0 -0
  15. eval_framework/llm/aleph_alpha.py +323 -0
  16. eval_framework/llm/base.py +58 -0
  17. eval_framework/llm/huggingface.py +332 -0
  18. eval_framework/llm/mistral.py +73 -0
  19. eval_framework/llm/models.py +16 -0
  20. eval_framework/llm/openai.py +205 -0
  21. eval_framework/llm/vllm.py +438 -0
  22. eval_framework/logger.py +3 -0
  23. eval_framework/main.py +187 -0
  24. eval_framework/metrics/__init__.py +0 -0
  25. eval_framework/metrics/base.py +40 -0
  26. eval_framework/metrics/completion/__init__.py +1 -0
  27. eval_framework/metrics/completion/accuracy_completion.py +16 -0
  28. eval_framework/metrics/completion/bleu.py +76 -0
  29. eval_framework/metrics/completion/chrf.py +62 -0
  30. eval_framework/metrics/completion/code_assertion.py +44 -0
  31. eval_framework/metrics/completion/code_execution_pass_at_one.py +126 -0
  32. eval_framework/metrics/completion/comet.py +56 -0
  33. eval_framework/metrics/completion/concordance_index.py +38 -0
  34. eval_framework/metrics/completion/csv_format.py +102 -0
  35. eval_framework/metrics/completion/cwe_accuracy.py +49 -0
  36. eval_framework/metrics/completion/exponential_similarity.py +65 -0
  37. eval_framework/metrics/completion/f1.py +42 -0
  38. eval_framework/metrics/completion/format_checker.py +56 -0
  39. eval_framework/metrics/completion/grid_difference.py +77 -0
  40. eval_framework/metrics/completion/ifeval.py +73 -0
  41. eval_framework/metrics/completion/json_format.py +171 -0
  42. eval_framework/metrics/completion/language_checker.py +74 -0
  43. eval_framework/metrics/completion/length_control.py +83 -0
  44. eval_framework/metrics/completion/math_reasoning_completion.py +303 -0
  45. eval_framework/metrics/completion/niah_accuracy.py +163 -0
  46. eval_framework/metrics/completion/placeholder_checker.py +27 -0
  47. eval_framework/metrics/completion/repetition.py +88 -0
  48. eval_framework/metrics/completion/rouge_1.py +35 -0
  49. eval_framework/metrics/completion/rouge_2.py +45 -0
  50. eval_framework/metrics/completion/rouge_geometric_mean.py +36 -0
  51. eval_framework/metrics/completion/rouge_l.py +52 -0
  52. eval_framework/metrics/completion/struct_eval_metrics.py +248 -0
  53. eval_framework/metrics/completion/ter.py +67 -0
  54. eval_framework/metrics/completion/text_counter.py +182 -0
  55. eval_framework/metrics/efficiency/__init__.py +0 -0
  56. eval_framework/metrics/efficiency/bytes_per_sequence_position.py +48 -0
  57. eval_framework/metrics/llm/__init__.py +0 -0
  58. eval_framework/metrics/llm/base.py +8 -0
  59. eval_framework/metrics/llm/graders/chatbot_style_grader.py +92 -0
  60. eval_framework/metrics/llm/graders/comparison_grader.py +146 -0
  61. eval_framework/metrics/llm/graders/conciseness_grader.py +93 -0
  62. eval_framework/metrics/llm/graders/contains_names_grader.py +71 -0
  63. eval_framework/metrics/llm/graders/format_correctness_grader.py +109 -0
  64. eval_framework/metrics/llm/graders/instruction_grader.py +177 -0
  65. eval_framework/metrics/llm/graders/language.py +56 -0
  66. eval_framework/metrics/llm/graders/long_context_grader.py +72 -0
  67. eval_framework/metrics/llm/graders/models.py +74 -0
  68. eval_framework/metrics/llm/graders/refusal_grader.py +57 -0
  69. eval_framework/metrics/llm/graders/sql_quality_grader.py +145 -0
  70. eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +103 -0
  71. eval_framework/metrics/llm/llm_judge_chatbot_style.py +36 -0
  72. eval_framework/metrics/llm/llm_judge_completion_accuracy.py +39 -0
  73. eval_framework/metrics/llm/llm_judge_conciseness.py +37 -0
  74. eval_framework/metrics/llm/llm_judge_contains_names.py +36 -0
  75. eval_framework/metrics/llm/llm_judge_format_correctness.py +43 -0
  76. eval_framework/metrics/llm/llm_judge_instruction.py +58 -0
  77. eval_framework/metrics/llm/llm_judge_mtbench_pair.py +205 -0
  78. eval_framework/metrics/llm/llm_judge_mtbench_single.py +188 -0
  79. eval_framework/metrics/llm/llm_judge_refusal.py +35 -0
  80. eval_framework/metrics/llm/llm_judge_sql.py +394 -0
  81. eval_framework/metrics/llm/llm_judge_world_knowledge.py +37 -0
  82. eval_framework/metrics/loglikelihood/__init__.py +0 -0
  83. eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +51 -0
  84. eval_framework/metrics/loglikelihood/probability_mass.py +56 -0
  85. eval_framework/py.typed +0 -0
  86. eval_framework/response_generator.py +416 -0
  87. eval_framework/result_processors/__init__.py +0 -0
  88. eval_framework/result_processors/base.py +74 -0
  89. eval_framework/result_processors/hf_processor.py +87 -0
  90. eval_framework/result_processors/result_processor.py +129 -0
  91. eval_framework/run.py +314 -0
  92. eval_framework/run_direct.py +42 -0
  93. eval_framework/shared/types.py +227 -0
  94. eval_framework/tasks/__init__.py +6 -0
  95. eval_framework/tasks/base.py +314 -0
  96. eval_framework/tasks/benchmarks/__init__.py +0 -0
  97. eval_framework/tasks/benchmarks/arc.py +46 -0
  98. eval_framework/tasks/benchmarks/arc_de.py +46 -0
  99. eval_framework/tasks/benchmarks/arc_fi.py +46 -0
  100. eval_framework/tasks/benchmarks/belebele.py +60 -0
  101. eval_framework/tasks/benchmarks/bigcodebench.py +155 -0
  102. eval_framework/tasks/benchmarks/casehold.py +47 -0
  103. eval_framework/tasks/benchmarks/chembench.py +85 -0
  104. eval_framework/tasks/benchmarks/copa.py +39 -0
  105. eval_framework/tasks/benchmarks/duc.py +91 -0
  106. eval_framework/tasks/benchmarks/flores200.py +62 -0
  107. eval_framework/tasks/benchmarks/flores_plus.py +84 -0
  108. eval_framework/tasks/benchmarks/gpqa.py +177 -0
  109. eval_framework/tasks/benchmarks/gsm8k.py +148 -0
  110. eval_framework/tasks/benchmarks/hellaswag.py +44 -0
  111. eval_framework/tasks/benchmarks/hellaswag_de.py +52 -0
  112. eval_framework/tasks/benchmarks/humaneval.py +97 -0
  113. eval_framework/tasks/benchmarks/ifeval.py +78 -0
  114. eval_framework/tasks/benchmarks/include.py +119 -0
  115. eval_framework/tasks/benchmarks/infinitebench.py +302 -0
  116. eval_framework/tasks/benchmarks/math_reasoning.py +569 -0
  117. eval_framework/tasks/benchmarks/mbpp.py +192 -0
  118. eval_framework/tasks/benchmarks/mmlu.py +190 -0
  119. eval_framework/tasks/benchmarks/mmlu_de.py +109 -0
  120. eval_framework/tasks/benchmarks/mmlu_pro.py +139 -0
  121. eval_framework/tasks/benchmarks/mmmlu.py +529 -0
  122. eval_framework/tasks/benchmarks/openbookqa.py +37 -0
  123. eval_framework/tasks/benchmarks/opengptx_eu20.py +363 -0
  124. eval_framework/tasks/benchmarks/pawsx.py +65 -0
  125. eval_framework/tasks/benchmarks/piqa.py +39 -0
  126. eval_framework/tasks/benchmarks/quality.py +56 -0
  127. eval_framework/tasks/benchmarks/sciq.py +44 -0
  128. eval_framework/tasks/benchmarks/sphyr.py +75 -0
  129. eval_framework/tasks/benchmarks/squad.py +89 -0
  130. eval_framework/tasks/benchmarks/struct_eval.py +110 -0
  131. eval_framework/tasks/benchmarks/tablebench.py +117 -0
  132. eval_framework/tasks/benchmarks/triviaqa.py +42 -0
  133. eval_framework/tasks/benchmarks/truthfulqa.py +95 -0
  134. eval_framework/tasks/benchmarks/winogender.py +39 -0
  135. eval_framework/tasks/benchmarks/winogrande.py +44 -0
  136. eval_framework/tasks/benchmarks/winox.py +57 -0
  137. eval_framework/tasks/benchmarks/wmt.py +160 -0
  138. eval_framework/tasks/benchmarks/zero_scrolls.py +197 -0
  139. eval_framework/tasks/eval_config.py +112 -0
  140. eval_framework/tasks/perturbation.py +83 -0
  141. eval_framework/tasks/registry.py +186 -0
  142. eval_framework/tasks/task_loader.py +80 -0
  143. eval_framework/tasks/task_names.py +138 -0
  144. eval_framework/tasks/utils.py +578 -0
  145. eval_framework/utils/constants.py +9 -0
  146. eval_framework/utils/generate_task_docs.py +229 -0
  147. eval_framework/utils/helpers.py +3 -0
  148. eval_framework/utils/logging.py +50 -0
  149. eval_framework/utils/packaging.py +52 -0
  150. eval_framework-0.2.0.dist-info/METADATA +514 -0
  151. eval_framework-0.2.0.dist-info/RECORD +161 -0
  152. eval_framework-0.2.0.dist-info/WHEEL +4 -0
  153. eval_framework-0.2.0.dist-info/entry_points.txt +3 -0
  154. template_formatting/README.md +83 -0
  155. template_formatting/__init__.py +0 -0
  156. template_formatting/formatter.py +536 -0
  157. template_formatting/mistral_formatter.py +159 -0
  158. template_formatting/py.typed +0 -0
  159. template_formatting/tests/test_formatter_eval.py +408 -0
  160. template_formatting/tests/test_formatter_scaling.py +253 -0
  161. template_formatting/tests/test_mistral_formatter.py +136 -0
@@ -0,0 +1,161 @@
1
+ # Copyright 2023 The Google Research Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # mypy: ignore-errors
16
+
17
+ """Registry of all instructions."""
18
+
19
+ from eval_framework.external.ifeval_impl import instructions
20
+
21
# Category prefixes. A full instruction id is "<prefix><name>", for example
# "keywords:existence".
_KEYWORD = "keywords:"
_LANGUAGE = "language:"
_LENGTH = "length_constraints:"
_CONTENT = "detectable_content:"
_FORMAT = "detectable_format:"
_MULTITURN = "multi-turn:"  # only referenced by a disabled entry below
_COMBINATION = "combination:"
_STARTEND = "startend:"
_CHANGE_CASES = "change_case:"
_PUNCTUATION = "punctuation:"

# Maps every instruction id to its checker from the `instructions` module.
# Entries that are commented out are disabled and therefore not registered.
INSTRUCTION_DICT = {
    # --- keywords ---
    _KEYWORD + "existence": instructions.KeywordChecker,
    _KEYWORD + "frequency": instructions.KeywordFrequencyChecker,
    # disabled: _KEYWORD + "key_sentences": instructions.KeySentenceChecker,
    _KEYWORD + "forbidden_words": instructions.ForbiddenWords,
    _KEYWORD + "letter_frequency": instructions.LetterFrequencyChecker,
    # --- language ---
    _LANGUAGE + "response_language": instructions.ResponseLanguageChecker,
    # --- length constraints ---
    _LENGTH + "number_sentences": instructions.NumberOfSentences,
    _LENGTH + "number_paragraphs": instructions.ParagraphChecker,
    _LENGTH + "number_words": instructions.NumberOfWords,
    _LENGTH + "nth_paragraph_first_word": instructions.ParagraphFirstWordCheck,
    # --- detectable content ---
    _CONTENT + "number_placeholders": instructions.PlaceholderChecker,
    _CONTENT + "postscript": instructions.PostscriptChecker,
    # --- detectable format ---
    _FORMAT + "number_bullet_lists": instructions.BulletListChecker,
    # disabled: _CONTENT + "rephrase_paragraph": instructions.RephraseParagraph,
    _FORMAT + "constrained_response": instructions.ConstrainedResponseChecker,
    _FORMAT + "number_highlighted_sections": instructions.HighlightSectionChecker,
    _FORMAT + "multiple_sections": instructions.SectionChecker,
    # disabled: _FORMAT + "rephrase": instructions.RephraseChecker,
    _FORMAT + "json_format": instructions.JsonFormat,
    _FORMAT + "title": instructions.TitleChecker,
    # disabled: _MULTITURN + "constrained_start": instructions.ConstrainedStartChecker,
    # --- combination ---
    _COMBINATION + "two_responses": instructions.TwoResponsesChecker,
    _COMBINATION + "repeat_prompt": instructions.RepeatPromptThenAnswer,
    # --- start / end, change case, punctuation ---
    _STARTEND + "end_checker": instructions.EndChecker,
    _CHANGE_CASES + "capital_word_frequency": instructions.CapitalWordFrequencyChecker,
    _CHANGE_CASES + "english_capital": instructions.CapitalLettersEnglishChecker,
    _CHANGE_CASES + "english_lowercase": instructions.LowercaseLettersEnglishChecker,
    _PUNCTUATION + "no_comma": instructions.CommaChecker,
    _STARTEND + "quotation": instructions.QuotationChecker,
}
72
+
73
# Maps each instruction id to the set of instruction ids it conflicts with.
# The relation as written here is not symmetric for every pair (for example
# "startend:quotation" lists "detectable_format:title" but not the other way
# round); conflict_make() can be used to make it symmetric.
INSTRUCTION_CONFLICTS = {
    _KEYWORD + "existence": {_KEYWORD + "existence"},
    _KEYWORD + "frequency": {_KEYWORD + "frequency"},
    # no entry for _KEYWORD + "key_sentences": it is disabled in INSTRUCTION_DICT
    _KEYWORD + "forbidden_words": {_KEYWORD + "forbidden_words"},
    _KEYWORD + "letter_frequency": {_KEYWORD + "letter_frequency"},
    _LANGUAGE + "response_language": {
        _LANGUAGE + "response_language",
        _FORMAT + "multiple_sections",
        _KEYWORD + "existence",
        _KEYWORD + "frequency",
        _KEYWORD + "forbidden_words",
        _STARTEND + "end_checker",
        _CHANGE_CASES + "english_capital",
        _CHANGE_CASES + "english_lowercase",
    },
    _LENGTH + "number_sentences": {_LENGTH + "number_sentences"},
    # "nth_paragraph_first_word" used to be listed twice in this set literal;
    # the duplicate was a no-op and has been removed.
    _LENGTH + "number_paragraphs": {
        _LENGTH + "number_paragraphs",
        _LENGTH + "nth_paragraph_first_word",
        _LENGTH + "number_sentences",
    },
    _LENGTH + "number_words": {_LENGTH + "number_words"},
    _LENGTH + "nth_paragraph_first_word": {
        _LENGTH + "nth_paragraph_first_word",
        _LENGTH + "number_paragraphs",
    },
    _CONTENT + "number_placeholders": {_CONTENT + "number_placeholders"},
    _CONTENT + "postscript": {_CONTENT + "postscript"},
    _FORMAT + "number_bullet_lists": {_FORMAT + "number_bullet_lists"},
    # no entry for _CONTENT + "rephrase_paragraph": it is disabled in INSTRUCTION_DICT
    # A constrained response conflicts with every registered instruction.
    _FORMAT + "constrained_response": set(INSTRUCTION_DICT),
    _FORMAT + "number_highlighted_sections": {_FORMAT + "number_highlighted_sections"},
    _FORMAT + "multiple_sections": {
        _FORMAT + "multiple_sections",
        _LANGUAGE + "response_language",
        _FORMAT + "number_highlighted_sections",
    },
    # no entry for _FORMAT + "rephrase": it is disabled in INSTRUCTION_DICT
    # The next entries conflict with everything except the ids listed.
    _FORMAT + "json_format": set(INSTRUCTION_DICT).difference(
        {_KEYWORD + "forbidden_words", _KEYWORD + "existence"}
    ),
    _FORMAT + "title": {_FORMAT + "title"},
    # no entry for _MULTITURN + "constrained_start": it is disabled in INSTRUCTION_DICT
    _COMBINATION + "two_responses": set(INSTRUCTION_DICT).difference(
        {
            _KEYWORD + "forbidden_words",
            _KEYWORD + "existence",
            _LANGUAGE + "response_language",
            _FORMAT + "title",
            _PUNCTUATION + "no_comma",
        }
    ),
    _COMBINATION + "repeat_prompt": set(INSTRUCTION_DICT).difference(
        {_KEYWORD + "existence", _FORMAT + "title", _PUNCTUATION + "no_comma"}
    ),
    _STARTEND + "end_checker": {_STARTEND + "end_checker"},
    _CHANGE_CASES + "capital_word_frequency": {
        _CHANGE_CASES + "capital_word_frequency",
        _CHANGE_CASES + "english_lowercase",
        _CHANGE_CASES + "english_capital",
    },
    _CHANGE_CASES + "english_capital": {_CHANGE_CASES + "english_capital"},
    _CHANGE_CASES + "english_lowercase": {
        _CHANGE_CASES + "english_lowercase",
        _CHANGE_CASES + "english_capital",
    },
    _PUNCTUATION + "no_comma": {_PUNCTUATION + "no_comma"},
    _STARTEND + "quotation": {_STARTEND + "quotation", _FORMAT + "title"},
}
144
+
145
+
146
def conflict_make(conflicts):
    """Makes sure if A conflicts with B, B will conflict with A.

    The dictionary and the sets it holds are modified in place.

    Args:
      conflicts: Dictionary of potential conflicts where key is instruction id
        and value is set of instruction ids that it conflicts with.

    Returns:
      The same dictionary object, revised. All instructions conflict with
      themselves. If A conflicts with B, B will conflict with A. An id that
      only appears inside a value set gets its own entry instead of raising
      KeyError.
    """
    # Snapshot the keys: entries may be added below for ids that so far only
    # appear as values, and a dict must not change size while it is iterated.
    for key in list(conflicts):
        # Snapshot the members as well: when two ids share one set object,
        # adding to conflicts[other] would resize the set being iterated.
        for other in tuple(conflicts[key]):
            # A newly created entry starts out conflicting with itself.
            conflicts.setdefault(other, {other}).add(key)
        conflicts[key].add(key)
    return conflicts