evalscope 0.8.0__py3-none-any.whl → 0.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic. Click here for more details.
- evalscope/__init__.py +2 -0
- evalscope/arguments.py +11 -3
- evalscope/backend/base.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
- evalscope/backend/rag_eval/utils/clip.py +2 -2
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/__init__.py +20 -1
- evalscope/benchmarks/arc/__init__.py +0 -5
- evalscope/benchmarks/arc/arc_adapter.py +24 -102
- evalscope/benchmarks/bbh/__init__.py +0 -4
- evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
- evalscope/benchmarks/benchmark.py +70 -59
- evalscope/benchmarks/ceval/__init__.py +0 -5
- evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
- evalscope/benchmarks/cmmlu/__init__.py +0 -5
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
- evalscope/benchmarks/competition_math/__init__.py +0 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
- evalscope/benchmarks/data_adapter.py +115 -87
- evalscope/benchmarks/general_qa/__init__.py +0 -5
- evalscope/benchmarks/general_qa/general_qa_adapter.py +24 -80
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +103 -0
- evalscope/benchmarks/gsm8k/__init__.py +0 -4
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +22 -101
- evalscope/benchmarks/hellaswag/__init__.py +0 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +33 -99
- evalscope/benchmarks/humaneval/__init__.py +0 -4
- evalscope/benchmarks/humaneval/humaneval_adapter.py +93 -9
- evalscope/benchmarks/ifeval/__init__.py +0 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +56 -0
- evalscope/benchmarks/ifeval/instructions.py +1477 -0
- evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
- evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
- evalscope/benchmarks/ifeval/utils.py +134 -0
- evalscope/benchmarks/iquiz/__init__.py +0 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
- evalscope/benchmarks/mmlu/__init__.py +0 -5
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
- evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
- evalscope/benchmarks/race/__init__.py +0 -5
- evalscope/benchmarks/race/race_adapter.py +27 -123
- evalscope/benchmarks/trivia_qa/__init__.py +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
- evalscope/benchmarks/truthful_qa/__init__.py +0 -5
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +30 -0
- evalscope/collections/__init__.py +3 -0
- evalscope/collections/evaluator.py +198 -0
- evalscope/collections/sampler.py +138 -0
- evalscope/collections/schema.py +126 -0
- evalscope/config.py +45 -7
- evalscope/constants.py +7 -38
- evalscope/evaluator/__init__.py +0 -1
- evalscope/evaluator/evaluator.py +89 -121
- evalscope/evaluator/rating_eval.py +1 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +14 -5
- evalscope/metrics/__init__.py +3 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/math_accuracy.py +193 -50
- evalscope/metrics/metrics.py +18 -6
- evalscope/metrics/named_metrics.py +17 -0
- evalscope/metrics/rouge_metric.py +13 -8
- evalscope/models/__init__.py +14 -1
- evalscope/models/base_adapter.py +52 -0
- evalscope/models/chat_adapter.py +140 -0
- evalscope/models/choice_adapter.py +211 -0
- evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +1 -1
- evalscope/models/custom_adapter.py +67 -0
- evalscope/models/local_model.py +74 -0
- evalscope/models/model.py +141 -0
- evalscope/models/server_adapter.py +111 -0
- evalscope/perf/__init__.py +1 -0
- evalscope/perf/arguments.py +3 -1
- evalscope/perf/benchmark.py +3 -3
- evalscope/perf/main.py +5 -7
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +54 -50
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/longalpaca.py +1 -1
- evalscope/perf/plugin/registry.py +3 -3
- evalscope/perf/utils/benchmark_util.py +4 -4
- evalscope/perf/utils/db_util.py +66 -22
- evalscope/perf/utils/local_server.py +4 -1
- evalscope/report/__init__.py +5 -0
- evalscope/report/app.py +693 -0
- evalscope/report/combinator.py +73 -0
- evalscope/report/generator.py +80 -0
- evalscope/report/utils.py +133 -0
- evalscope/run.py +64 -125
- evalscope/run_arena.py +3 -2
- evalscope/summarizer.py +15 -27
- evalscope/third_party/longbench_write/eval.py +2 -1
- evalscope/third_party/longbench_write/longbench_write.py +2 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +1 -0
- evalscope/utils/chat_service.py +6 -5
- evalscope/utils/io_utils.py +170 -0
- evalscope/utils/logger.py +13 -0
- evalscope/utils/model_utils.py +15 -2
- evalscope/utils/utils.py +3 -200
- evalscope/version.py +2 -2
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/METADATA +129 -23
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/RECORD +119 -115
- tests/cli/test_collection.py +57 -0
- tests/cli/test_run.py +57 -7
- tests/perf/test_perf.py +3 -2
- tests/rag/test_mteb.py +3 -2
- tests/vlm/test_vlmeval.py +3 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
- evalscope/evaluator/humaneval_evaluator.py +0 -158
- evalscope/models/api/__init__.py +0 -3
- evalscope/models/dummy_chat_model.py +0 -49
- evalscope/models/model_adapter.py +0 -525
- evalscope/models/openai_model.py +0 -103
- evalscope/tools/__init__.py +0 -1
- evalscope/tools/combine_reports.py +0 -135
- evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
- /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/LICENSE +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/WHEEL +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
# Copyright 2023 The Google Research Authors.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
"""Registry of all instructions."""
|
|
15
|
+
|
|
16
|
+
from evalscope.benchmarks.ifeval import instructions
|
|
17
|
+
|
|
18
|
+
# Category prefixes for instruction ids. Each prefix is concatenated with an
# instruction name to form the keys of INSTRUCTION_DICT and
# INSTRUCTION_CONFLICTS, e.g. _KEYWORD + 'existence' -> 'keywords:existence'.
_KEYWORD = 'keywords:'
|
|
19
|
+
|
|
20
|
+
_LANGUAGE = 'language:'
|
|
21
|
+
|
|
22
|
+
_LENGTH = 'length_constraints:'
|
|
23
|
+
|
|
24
|
+
_CONTENT = 'detectable_content:'
|
|
25
|
+
|
|
26
|
+
_FORMAT = 'detectable_format:'
|
|
27
|
+
|
|
28
|
+
# NOTE(review): in the registry below this prefix only appears inside
# commented-out (disabled) entries; confirm no other module relies on it.
_MULTITURN = 'multi-turn:'
|
|
29
|
+
|
|
30
|
+
_COMBINATION = 'combination:'
|
|
31
|
+
|
|
32
|
+
_STARTEND = 'startend:'
|
|
33
|
+
|
|
34
|
+
_CHANGE_CASES = 'change_case:'
|
|
35
|
+
|
|
36
|
+
_PUNCTUATION = 'punctuation:'
|
|
37
|
+
|
|
38
|
+
# Registry mapping each instruction id ('<category prefix><name>') to the
# object in the `instructions` module that implements its check.
# Entry order is kept stable because callers may iterate the mapping.
INSTRUCTION_DICT = {
    # Keyword instructions.
    _KEYWORD + 'existence': instructions.KeywordChecker,
    _KEYWORD + 'frequency': instructions.KeywordFrequencyChecker,
    # TODO(jeffreyzhou): make a proper set of sentences to choose from
    # _KEYWORD + "key_sentences": instructions.KeySentenceChecker,
    _KEYWORD + 'forbidden_words': instructions.ForbiddenWords,
    _KEYWORD + 'letter_frequency': instructions.LetterFrequencyChecker,
    # Language instructions.
    _LANGUAGE + 'response_language': instructions.ResponseLanguageChecker,
    # Length-constraint instructions.
    _LENGTH + 'number_sentences': instructions.NumberOfSentences,
    _LENGTH + 'number_paragraphs': instructions.ParagraphChecker,
    _LENGTH + 'number_words': instructions.NumberOfWords,
    _LENGTH + 'nth_paragraph_first_word': instructions.ParagraphFirstWordCheck,
    # Detectable-content instructions.
    _CONTENT + 'number_placeholders': instructions.PlaceholderChecker,
    _CONTENT + 'postscript': instructions.PostscriptChecker,
    # Detectable-format instructions.
    _FORMAT + 'number_bullet_lists': instructions.BulletListChecker,
    # TODO(jeffreyzhou): Pre-create paragraph or use prompt to replace
    # _CONTENT + "rephrase_paragraph": instructions.RephraseParagraph,
    _FORMAT + 'constrained_response': instructions.ConstrainedResponseChecker,
    _FORMAT + 'number_highlighted_sections': instructions.HighlightSectionChecker,
    _FORMAT + 'multiple_sections': instructions.SectionChecker,
    # TODO(tianjianlu): Re-enable rephrasing with preprocessing the message.
    # _FORMAT + "rephrase": instructions.RephraseChecker,
    _FORMAT + 'json_format': instructions.JsonFormat,
    _FORMAT + 'title': instructions.TitleChecker,
    # TODO(tianjianlu): Re-enable with specific prompts.
    # _MULTITURN + "constrained_start": instructions.ConstrainedStartChecker,
    # Combination instructions.
    _COMBINATION + 'two_responses': instructions.TwoResponsesChecker,
    _COMBINATION + 'repeat_prompt': instructions.RepeatPromptThenAnswer,
    # Start/end, letter-case and punctuation instructions.
    _STARTEND + 'end_checker': instructions.EndChecker,
    _CHANGE_CASES + 'capital_word_frequency': instructions.CapitalWordFrequencyChecker,
    _CHANGE_CASES + 'english_capital': instructions.CapitalLettersEnglishChecker,
    _CHANGE_CASES + 'english_lowercase': instructions.LowercaseLettersEnglishChecker,
    _PUNCTUATION + 'no_comma': instructions.CommaChecker,
    _STARTEND + 'quotation': instructions.QuotationChecker,
}
|
|
97
|
+
|
|
98
|
+
# Maps each instruction id to the set of instruction ids it must not be
# combined with. The table is written one-directionally in places (e.g.
# 'number_paragraphs' lists 'number_sentences' but not the reverse);
# conflict_make() is what makes the relation symmetric.
INSTRUCTION_CONFLICTS = {
    _KEYWORD + 'existence': {_KEYWORD + 'existence'},
    _KEYWORD + 'frequency': {_KEYWORD + 'frequency'},
    # TODO(jeffreyzhou): make a proper set of sentences to choose from
    # _KEYWORD + "key_sentences": instructions.KeySentenceChecker,
    _KEYWORD + 'forbidden_words': {_KEYWORD + 'forbidden_words'},
    _KEYWORD + 'letter_frequency': {_KEYWORD + 'letter_frequency'},
    _LANGUAGE + 'response_language': {
        _LANGUAGE + 'response_language',
        _FORMAT + 'multiple_sections',
        _KEYWORD + 'existence',
        _KEYWORD + 'frequency',
        _KEYWORD + 'forbidden_words',
        _STARTEND + 'end_checker',
        _CHANGE_CASES + 'english_capital',
        _CHANGE_CASES + 'english_lowercase',
    },
    _LENGTH + 'number_sentences': {_LENGTH + 'number_sentences'},
    # 'nth_paragraph_first_word' used to be listed twice in this set literal;
    # the duplicate had no effect and has been dropped.
    _LENGTH + 'number_paragraphs': {
        _LENGTH + 'number_paragraphs',
        _LENGTH + 'nth_paragraph_first_word',
        _LENGTH + 'number_sentences',
    },
    _LENGTH + 'number_words': {_LENGTH + 'number_words'},
    _LENGTH + 'nth_paragraph_first_word': {
        _LENGTH + 'nth_paragraph_first_word',
        _LENGTH + 'number_paragraphs',
    },
    _CONTENT + 'number_placeholders': {_CONTENT + 'number_placeholders'},
    _CONTENT + 'postscript': {_CONTENT + 'postscript'},
    _FORMAT + 'number_bullet_lists': {_FORMAT + 'number_bullet_lists'},
    # TODO(jeffreyzhou): Pre-create paragraph or use prompt to replace
    # _CONTENT + "rephrase_paragraph": instructions.RephraseParagraph,
    # Conflicts with every registered instruction; a fresh set is built so
    # later in-place updates do not leak into other entries.
    _FORMAT + 'constrained_response': set(INSTRUCTION_DICT),
    _FORMAT + 'number_highlighted_sections': {_FORMAT + 'number_highlighted_sections'},
    _FORMAT + 'multiple_sections': {
        _FORMAT + 'multiple_sections',
        _LANGUAGE + 'response_language',
        _FORMAT + 'number_highlighted_sections',
    },
    # TODO(tianjianlu): Re-enable rephrasing with preprocessing the message.
    # _FORMAT + "rephrase": instructions.RephraseChecker,
    # Conflicts with everything except the listed ids.
    _FORMAT + 'json_format': set(INSTRUCTION_DICT) - {
        _KEYWORD + 'forbidden_words',
        _KEYWORD + 'existence',
    },
    _FORMAT + 'title': {_FORMAT + 'title'},
    # TODO(tianjianlu): Re-enable with specific prompts.
    # _MULTITURN + "constrained_start": instructions.ConstrainedStartChecker,
    # Conflicts with everything except the listed ids.
    _COMBINATION + 'two_responses': set(INSTRUCTION_DICT) - {
        _KEYWORD + 'forbidden_words',
        _KEYWORD + 'existence',
        _LANGUAGE + 'response_language',
        _FORMAT + 'title',
        _PUNCTUATION + 'no_comma',
    },
    # Conflicts with everything except the listed ids.
    _COMBINATION + 'repeat_prompt': set(INSTRUCTION_DICT) - {
        _KEYWORD + 'existence',
        _FORMAT + 'title',
        _PUNCTUATION + 'no_comma',
    },
    _STARTEND + 'end_checker': {_STARTEND + 'end_checker'},
    _CHANGE_CASES + 'capital_word_frequency': {
        _CHANGE_CASES + 'capital_word_frequency',
        _CHANGE_CASES + 'english_lowercase',
        _CHANGE_CASES + 'english_capital',
    },
    _CHANGE_CASES + 'english_capital': {_CHANGE_CASES + 'english_capital'},
    _CHANGE_CASES + 'english_lowercase': {
        _CHANGE_CASES + 'english_lowercase',
        _CHANGE_CASES + 'english_capital',
    },
    _PUNCTUATION + 'no_comma': {_PUNCTUATION + 'no_comma'},
    _STARTEND + 'quotation': {_STARTEND + 'quotation', _FORMAT + 'title'},
}
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def conflict_make(conflicts):
    """Makes sure if A conflicts with B, B will conflict with A.

    The dictionary is updated in place and the same object is returned.

    Args:
      conflicts: Dictionary of potential conflicts where key is instruction id
        and value is set of instruction ids that it conflicts with.

    Returns:
      Revised version of the dictionary. All instructions conflict with
      themselves. If A conflicts with B, B will conflict with A. An
      instruction id that only appears inside a conflict set (and not as a
      key) gets its own entry instead of raising KeyError.
    """
    # Iterate over snapshots: new keys may be inserted below, and two keys may
    # share one set object, either of which would otherwise invalidate the
    # live iterator ("changed size during iteration").
    for key in list(conflicts):
        for other in tuple(conflicts[key]):
            # A newly created entry starts out conflicting with itself.
            conflicts.setdefault(other, {other}).add(key)
        conflicts[key].add(key)
    return conflicts
|