evalscope 0.13.1__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (72)
  1. evalscope/arguments.py +1 -1
  2. evalscope/backend/rag_eval/__init__.py +1 -1
  3. evalscope/backend/rag_eval/backend_manager.py +21 -5
  4. evalscope/backend/rag_eval/cmteb/arguments.py +10 -0
  5. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  6. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +7 -2
  7. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -5
  8. evalscope/backend/rag_eval/utils/embedding.py +49 -3
  9. evalscope/backend/rag_eval/utils/llm.py +8 -9
  10. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -2
  11. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  12. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +109 -0
  13. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  14. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  15. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +120 -0
  16. evalscope/benchmarks/arena_hard/utils.py +162 -0
  17. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +2 -5
  18. evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -1
  19. evalscope/benchmarks/data_adapter.py +30 -2
  20. evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
  21. evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -12
  22. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  23. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -5
  24. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +1 -3
  25. evalscope/benchmarks/live_code_bench/testing_util.py +365 -549
  26. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  27. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +79 -0
  28. evalscope/benchmarks/mmlu/mmlu_adapter.py +5 -7
  29. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  30. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  31. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +182 -0
  32. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  33. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +2 -5
  34. evalscope/collections/evaluator.py +4 -2
  35. evalscope/config.py +2 -2
  36. evalscope/metrics/llm_judge.py +1 -1
  37. evalscope/models/chat_adapter.py +32 -11
  38. evalscope/perf/arguments.py +30 -9
  39. evalscope/perf/benchmark.py +57 -103
  40. evalscope/perf/http_client.py +2 -3
  41. evalscope/perf/plugin/api/custom_api.py +1 -1
  42. evalscope/perf/plugin/api/openai_api.py +4 -2
  43. evalscope/perf/plugin/datasets/custom.py +4 -1
  44. evalscope/perf/plugin/datasets/line_by_line.py +4 -1
  45. evalscope/perf/plugin/datasets/longalpaca.py +4 -1
  46. evalscope/perf/plugin/datasets/openqa.py +4 -1
  47. evalscope/perf/plugin/datasets/random_dataset.py +13 -6
  48. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  49. evalscope/perf/utils/benchmark_util.py +12 -6
  50. evalscope/perf/utils/db_util.py +3 -3
  51. evalscope/perf/utils/log_utils.py +41 -0
  52. evalscope/report/app.py +11 -11
  53. evalscope/run.py +7 -0
  54. evalscope/summarizer.py +2 -1
  55. evalscope/utils/utils.py +36 -25
  56. evalscope/version.py +2 -2
  57. {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/METADATA +21 -55
  58. {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/RECORD +70 -62
  59. tests/cli/test_all.py +36 -27
  60. tests/cli/test_collection.py +2 -1
  61. tests/cli/test_run.py +38 -20
  62. tests/perf/test_perf.py +1 -2
  63. tests/rag/test_clip_benchmark.py +0 -1
  64. tests/rag/test_mteb.py +37 -8
  65. tests/rag/test_ragas.py +33 -27
  66. tests/vlm/test_vlmeval.py +37 -1
  67. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  68. evalscope/benchmarks/live_code_bench/execute_utils.py +0 -267
  69. {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/LICENSE +0 -0
  70. {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/WHEEL +0 -0
  71. {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/entry_points.txt +0 -0
  72. {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py ADDED
@@ -0,0 +1,79 @@
+from typing import Any
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType, OutputType
+from evalscope.metrics import exact_match
+from evalscope.utils.utils import ResponseParser
+
+SUBSET_LIST = ['default']
+
+
+@Benchmark.register(
+    name='maritime_bench',
+    pretty_name='MaritimeBench',
+    dataset_id='HiDolphin/MaritimeBench',
+    model_adapter=OutputType.GENERATION,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
+    subset_list=SUBSET_LIST,
+    metric_list=['AverageAccuracy'],
+    eval_split='test',
+    prompt_template=
+    '题目来自于{subset_name}请回答单选题。要求只输出选项,不输出解释,将选项放在<>里,直接输出答案。示例:\n\n题目:在船舶主推进动力装置中,传动轴系在运转中承受以下复杂的应力和负荷,但不包括______。\n选项:\nA. 电磁力\nB. 压拉应力\nC. 弯曲应力\nD. 扭应力\n答:<A> 当前题目\n {query}',  # noqa: E501
+)
+class MaritimeBenchAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.choices = ['A', 'B', 'C', 'D']
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
+
+        prefix = ''
+        query = prefix + input_d['question'] + '\n'
+        available_choices = []
+        for option in self.choices:
+            if option in input_d and input_d[option]:
+                query += option + ':' + input_d[option] + '\n'
+                available_choices.append(option)
+
+        full_prompt = self.prompt_template.format(subset_name=subset_name, query=query)
+        return self.gen_prompt_data(full_prompt, choices=available_choices)
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        """
+        Parse the raw input labels (gold).
+
+        Args:
+            input_d: input raw data. Depending on the dataset.
+
+        Returns:
+            The parsed input. e.g. gold answer ... Depending on the dataset.
+        """
+        return input_d['answer']
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        """
+        Parse the raw model prediction (pred).
+
+        Args:
+            pred: model prediction. Depending on the model.
+
+        Returns:
+            The parsed prediction. e.g. model answer... Depending on the model.
+        """
+
+        return ResponseParser.parse_bracketed_answer(result, options=self.choices)
+
+    def match(self, gold: Any, pred: Any) -> Any:
+        """
+        Match the gold answer with the predicted answer.
+
+        Args:
+            gold: The gold answer.
+            pred: The predicted answer.
+
+        Returns:
+            The result of the match.
+        """
+        return exact_match(gold=gold, pred=pred)
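
The new MaritimeBench adapter can be exercised through the usual evalscope entry point. A minimal sketch, assuming the existing run_task API and dict-style task config; the model ID and limit below are placeholders, only the dataset name 'maritime_bench' comes from the registration above:

from evalscope.run import run_task

# Hypothetical smoke test of the new benchmark; 'model' and 'limit' values are examples only.
task_cfg = {
    'model': 'Qwen/Qwen2.5-0.5B-Instruct',  # placeholder model ID
    'datasets': ['maritime_bench'],         # name registered by @Benchmark.register above
    'limit': 10,                            # evaluate a small subset first
}
run_task(task_cfg=task_cfg)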
evalscope/benchmarks/mmlu/mmlu_adapter.py CHANGED
@@ -145,7 +145,7 @@ SUBJECT_MAPPING = {
     train_split='train',
     eval_split='test',
     prompt_template=
-    'Answer the following multiple choice question about {subset_name}. There is only one correct answer. The last line of your response should be in the format "Answer: LETTER" (without quotes), where LETTER is one of A, B, C, D. \n{query}',
+    """Answer the following multiple choice question about {subset_name}. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\n{query}""",  # noqa: E501
 )
 class MMLUAdapter(DataAdapter):

@@ -224,9 +224,8 @@ class MMLUAdapter(DataAdapter):

         context: str = '\n'.join(few_shot_prompts) + '\n'
         context += self._generate_prompt(input_d=input_d, include_answer=False)
-        query = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)

-        full_prompt = self.prompt_template.format(subset_name=self._format_subject(subset_name), query=query)
+        full_prompt = self.prompt_template.format(subset_name=self._format_subject(subset_name), query=context.strip())

         return self.gen_prompt_data(full_prompt)

@@ -249,7 +248,7 @@ class MMLUAdapter(DataAdapter):
         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
         else:
-            return ResponseParser.parse_first_option(result)
+            return ResponseParser.parse_first_option(result, options=self.choices)

     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
@@ -260,11 +259,10 @@ class MMLUAdapter(DataAdapter):

         example: str = input_d['input']
         for j in range(len(self.choices)):
-            example += '\n{}. {}'.format(self.choices[j], input_choices[j])
+            example += f'\n{self.choices[j]}) {input_choices[j]}'

-        example += '\nAnswer:'
         if include_answer:
-            example += ' {}\n\n'.format(input_d['target'])
+            example += f"\nAnswer: {input_d['target']}\n\n"

         return example

evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py CHANGED
@@ -92,7 +92,7 @@ class MMLUProAdapter(DataAdapter):
         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
         else:
-            return ResponseParser.parse_first_option(result)
+            return ResponseParser.parse_first_option(result, options=self.choices)

     def match(self, gold: str, pred: str) -> float:
         """
evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py ADDED
@@ -0,0 +1,182 @@
+from collections import defaultdict
+from typing import Any, Dict
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType, OutputType
+from evalscope.metrics import exact_match
+from evalscope.utils.logger import get_logger
+from evalscope.utils.utils import ResponseParser
+
+logger = get_logger()
+
+SUBSET_LIST = [
+    'abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology',
+    'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics',
+    'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics',
+    'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science',
+    'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics',
+    'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics',
+    'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history',
+    'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning',
+    'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition',
+    'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine',
+    'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology',
+    'world_religions'
+]
+
+SUBJECT_MAPPING = {
+    'abstract_algebra': ['Abstract Algebra', 'math', 'STEM'],
+    'anatomy': ['Anatomy', 'health', 'Other'],
+    'astronomy': ['Astronomy', 'physics', 'STEM'],
+    'business_ethics': ['Business Ethics', 'business', 'Other'],
+    'clinical_knowledge': ['Clinical Knowledge', 'health', 'Other'],
+    'college_biology': ['College Biology', 'biology', 'STEM'],
+    'college_chemistry': ['College Chemistry', 'chemistry', 'STEM'],
+    'college_computer_science': ['College Computer Science', 'computer science', 'STEM'],
+    'college_mathematics': ['College Mathematics', 'math', 'STEM'],
+    'college_medicine': ['College Medicine', 'health', 'Other'],
+    'college_physics': ['College Physics', 'physics', 'STEM'],
+    'computer_security': ['Computer Security', 'computer science', 'STEM'],
+    'conceptual_physics': ['Conceptual Physics', 'physics', 'STEM'],
+    'econometrics': ['Econometrics', 'economics', 'Social Science'],
+    'electrical_engineering': ['Electrical Engineering', 'engineering', 'STEM'],
+    'elementary_mathematics': ['Elementary Mathematics', 'math', 'STEM'],
+    'formal_logic': ['Formal Logic', 'philosophy', 'Humanities'],
+    'global_facts': ['Global Facts', 'other', 'Other'],
+    'high_school_biology': ['High School Biology', 'biology', 'STEM'],
+    'high_school_chemistry': ['High School Chemistry', 'chemistry', 'STEM'],
+    'high_school_computer_science': ['High School Computer Science', 'computer science', 'STEM'],
+    'high_school_european_history': ['High School European History', 'history', 'Humanities'],
+    'high_school_geography': ['High School Geography', 'geography', 'Social Science'],
+    'high_school_government_and_politics': ['High School Government And Politics', 'politics', 'Social Science'],
+    'high_school_macroeconomics': ['High School Macroeconomics', 'economics', 'Social Science'],
+    'high_school_mathematics': ['High School Mathematics', 'math', 'STEM'],
+    'high_school_microeconomics': ['High School Microeconomics', 'economics', 'Social Science'],
+    'high_school_physics': ['High School Physics', 'physics', 'STEM'],
+    'high_school_psychology': ['High School Psychology', 'psychology', 'Social Science'],
+    'high_school_statistics': ['High School Statistics', 'math', 'STEM'],
+    'high_school_us_history': ['High School Us History', 'history', 'Humanities'],
+    'high_school_world_history': ['High School World History', 'history', 'Humanities'],
+    'human_aging': ['Human Aging', 'health', 'Other'],
+    'human_sexuality': ['Human Sexuality', 'culture', 'Social Science'],
+    'international_law': ['International Law', 'law', 'Humanities'],
+    'jurisprudence': ['Jurisprudence', 'law', 'Humanities'],
+    'logical_fallacies': ['Logical Fallacies', 'philosophy', 'Humanities'],
+    'machine_learning': ['Machine Learning', 'computer science', 'STEM'],
+    'management': ['Management', 'business', 'Other'],
+    'marketing': ['Marketing', 'business', 'Other'],
+    'medical_genetics': ['Medical Genetics', 'health', 'Other'],
+    'miscellaneous': ['Miscellaneous', 'other', 'Other'],
+    'moral_disputes': ['Moral Disputes', 'philosophy', 'Humanities'],
+    'moral_scenarios': ['Moral Scenarios', 'philosophy', 'Humanities'],
+    'nutrition': ['Nutrition', 'health', 'Other'],
+    'philosophy': ['Philosophy', 'philosophy', 'Humanities'],
+    'prehistory': ['Prehistory', 'history', 'Humanities'],
+    'professional_accounting': ['Professional Accounting', 'other', 'Other'],
+    'professional_law': ['Professional Law', 'law', 'Humanities'],
+    'professional_medicine': ['Professional Medicine', 'health', 'Other'],
+    'professional_psychology': ['Professional Psychology', 'psychology', 'Social Science'],
+    'public_relations': ['Public Relations', 'politics', 'Social Science'],
+    'security_studies': ['Security Studies', 'politics', 'Social Science'],
+    'sociology': ['Sociology', 'culture', 'Social Science'],
+    'us_foreign_policy': ['Us Foreign Policy', 'politics', 'Social Science'],
+    'virology': ['Virology', 'health', 'Other'],
+    'world_religions': ['World Religions', 'philosophy', 'Humanities'],
+}
+
+
+@Benchmark.register(
+    name='mmlu_redux',
+    pretty_name='MMLU-Redux',
+    dataset_id='AI-ModelScope/mmlu-redux-2.0',
+    model_adapter=OutputType.GENERATION,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
+    subset_list=SUBSET_LIST,
+    metric_list=['AverageAccuracy'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+    prompt_template=
+    'The following are multiple choice questions (with answers) about {subset_name}. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n{query}',  # noqa: E501
+)
+class MMLUReduxAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        if self.few_shot_num > 0:
+            self.few_shot_num = 0
+            logger.warning('Few-shot examples are not supported for MMLU-Redux dataset. Setting few_shot_num to 0.')
+
+        self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
+        self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
+
+    def gen_prompt(self, input_d: Dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
+        if self.few_shot_num > 0:
+            prefix = self.format_fewshot_examples(few_shot_list)
+        else:
+            prefix = ''
+        query = prefix + 'Q: ' + input_d['question'] + '\n' + \
+            self.__form_options(input_d['choices']) + '\n'
+
+        full_prompt = self.prompt_template.format(subset_name=subset_name, query=query)
+        return self.gen_prompt_data(full_prompt)
+
+    def format_fewshot_examples(self, few_shot_list):
+        # load few-shot prompts for each category
+        prompts = ''
+        for index, d in enumerate(few_shot_list):
+            prompts += 'Q: ' + d['question'] + '\n' + \
+                self.__form_options(d['choices']) + '\n'
+        return prompts
+
+    def __form_options(self, options: list):
+        option_str = 'Options are:\n'
+        for opt, choice in zip(options, self.choices):
+            option_str += f'({choice}): {opt}' + '\n'
+        return option_str
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        """
+        Parse the raw input labels (gold).
+
+        Args:
+            input_d: input raw data. Depending on the dataset.
+
+        Returns:
+            The parsed input. e.g. gold answer ... Depending on the dataset.
+        """
+        answer_index = int(input_d['answer'])
+        return self.choices[answer_index]
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        """
+        Parse the predicted result and extract proper answer.
+
+        Args:
+            result: Predicted answer from the model. Usually a string for chat.
+            raw_input_d: The raw input. Depending on the dataset.
+            eval_type: 'checkpoint' or 'service' or `custom`, default: 'checkpoint'
+
+        Returns:
+            The parsed answer. Depending on the dataset. Usually a string for chat.
+        """
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
+            return result
+        else:
+            return ResponseParser.parse_first_option(result, options=self.choices)
+
+    def match(self, gold: str, pred: str) -> float:
+        """
+        Match the gold answer and the predicted answer.
+
+        Args:
+            gold (Any): The golden answer. Usually a string for chat/multiple-choice-questions.
+                        e.g. 'A', extracted from get_gold_answer method.
+            pred (Any): The predicted answer. Usually a string for chat/multiple-choice-questions.
+                        e.g. 'B', extracted from parse_pred_result method.
+
+        Returns:
+            The match result. Usually a score (float) for chat/multiple-choice-questions.
+        """
+        return exact_match(gold=gold, pred=pred)
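
To illustrate the prompt this adapter builds, here is a standalone sketch that mirrors gen_prompt and __form_options from the code above; the question, subset name, and field values are made up for illustration and do not come from the dataset:

# Mirrors the adapter's prompt construction; not an evalscope API call.
choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
prompt_template = (
    'The following are multiple choice questions (with answers) about {subset_name}. '
    'Think step by step and then finish your answer with "the answer is (X)" '
    'where X is the correct letter choice.\n{query}')

# Hypothetical dataset row; the field names follow the adapter code above.
input_d = {'question': 'What is 2 + 2?', 'choices': ['3', '4', '5', '6'], 'answer': 1}

option_str = 'Options are:\n'
for opt, choice in zip(input_d['choices'], choices):
    option_str += f'({choice}): {opt}' + '\n'

query = 'Q: ' + input_d['question'] + '\n' + option_str + '\n'
print(prompt_template.format(subset_name='elementary mathematics', query=query))

gold = choices[int(input_d['answer'])]  # -> 'B', as in get_gold_answer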
evalscope/benchmarks/musr/musr_adapter.py CHANGED
@@ -62,7 +62,7 @@ class MuSRAdapter(DataAdapter):
         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
         else:
-            return ResponseParser.parse_first_option(result)
+            return ResponseParser.parse_first_option(result, options=self.choices)

     def match(self, gold: str, pred: str) -> float:
         """
evalscope/benchmarks/simple_qa/simple_qa_adapter.py CHANGED
@@ -126,7 +126,7 @@ class SimpleQAAdapter(DataAdapter):

     def match(self, gold: str, pred: str) -> float:
         # simple match
-        logger.warning(f'Please use LLMJudge to match the result for SimpleQA')
+        logger.warning(f'Please use LLMJudge to match the result for {self.name}')
         is_correct = 1 if gold.lower().strip() == pred.lower().strip() else 0
         is_incorrect = not is_correct
         is_not_attempted = 0
@@ -159,9 +159,6 @@
             review_res_list: [{'is_correct': 1, 'is_incorrect': 0, 'is_not_attempted': 0}, ...]
         """
         # zip dict answers
-        res_dict = defaultdict(list)
-        for res in review_res_list:
-            for key, value in res.items():
-                res_dict[key].append(value)
+        res_dict = super().compute_dict_metric(review_res_list, **kwargs)

         return super().compute_metric(res_dict, **kwargs)
evalscope/collections/evaluator.py CHANGED
@@ -65,7 +65,7 @@ class EvaluatorCollection:
         self.evaluators = self._initialize_evaluators()

     def load(self) -> tuple[list[DatasetEntry], str]:
-        dataset_name = os.path.basename(self.data_adapter.dataset_id).split('.')[0]
+        dataset_name = os.path.splitext(os.path.basename(self.data_adapter.dataset_id))[0]
         raw_dataset = self.data_adapter.load()
         # limit the dataset
         if self.task_cfg.limit:
@@ -174,6 +174,7 @@ class EvaluatorCollection:
         os.makedirs(os.path.dirname(report_file_path), exist_ok=True)
         with open(report_file_path, 'w', encoding='utf-8') as f:
             json.dump(report.to_dict(), f, ensure_ascii=False, indent=4)
+        return report

     def _filter_answer(self, pred_file_path):
         answer_dict = defaultdict(dict)
@@ -274,4 +275,5 @@ class EvaluatorCollection:
         answers = self.get_answers()
         reviews = self.get_reviews(answers)
         scores = self.get_scores(reviews)
-        self.get_report(scores)
+        report = self.get_report(scores)
+        return report
evalscope/config.py CHANGED
@@ -75,7 +75,7 @@ class TaskConfig:

     # LLMJudge arguments
     judge_strategy: str = JudgeStrategy.AUTO
-    judge_worker_num: int = 8
+    judge_worker_num: int = 1
     judge_model_args: Optional[Dict] = field(default_factory=lambda: {})

     def __post_init__(self):
@@ -212,7 +212,7 @@ def parse_task_config(task_cfg) -> TaskConfig:
         logger.info('Args: Task config is provided with CommandLine type.')
         task_cfg = TaskConfig.from_args(task_cfg)
     elif isinstance(task_cfg, str):
-        extension = task_cfg.split('.')[-1]
+        extension = os.path.splitext(task_cfg)[-1]
         logger.info(f'Args: Task config is provided with {extension} file type.')
         if extension in ['yaml', 'yml']:
             task_cfg = TaskConfig.from_yaml(task_cfg)
evalscope/metrics/llm_judge.py CHANGED
@@ -49,7 +49,7 @@ class LLMJudge:
         """
         self.api_key = api_key or os.environ.get('OPENAI_API_KEY', 'EMPTY')
         self.api_url = api_url or os.environ.get('OPENAI_API_BASE', 'https://api.openai.com/v1')
-        self.model_id = model_id or os.environ.get('LOCAL_LLM', 'gpt-3.5-turbo')
+        self.model_id = model_id or os.environ.get('LOCAL_LLM', 'gpt-4')
         self.system_prompt = system_prompt or os.environ.get('JUDGE_SYSTEM_PROMPT', None)
         self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
         self.generation_config = generation_config
evalscope/models/chat_adapter.py CHANGED
@@ -1,13 +1,13 @@
 import os
 import time
 import torch
-from typing import List, Union
+from typing import Any, Dict, List, Tuple, Union

 from evalscope.constants import OutputType
 from evalscope.models.base_adapter import BaseModelAdapter
 from evalscope.models.local_model import LocalModel
 from evalscope.models.register import register_model_adapter
-from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage
+from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage, Usage
 from evalscope.utils.logger import get_logger
 from evalscope.utils.model_utils import fix_do_sample_warning

@@ -60,7 +60,10 @@ class ChatGenerationModelAdapter(BaseModelAdapter):

         return generation_config

-    def _model_generate(self, queries: List[str], system_prompts: List[str] = None, infer_cfg: dict = {}) -> List[str]:
+    def _model_generate(self,
+                        queries: List[str],
+                        system_prompts: List[str] = None,
+                        infer_cfg: Dict[str, Any] = None) -> Tuple[List[List[str]], List[int]]:
         """
         Args:
             queries: The input queries.
@@ -69,6 +72,11 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
         Returns:
             The prediction results.
         """
+        if system_prompts is None:
+            system_prompts = []
+        if infer_cfg is None:
+            infer_cfg = {}
+
         # Process infer_cfg
         num_return_sequences = infer_cfg.get('num_return_sequences', 1)
         if num_return_sequences > 1:
@@ -111,7 +119,9 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
         # Run inference
         output_ids = self.model.generate(**inputs, generation_config=self.generation_config)

+        # Decode output
         responses = []
+        input_lengths = [len(self.tokenizer.encode(prompt)) for prompt in formatted_prompts]
         for i in range(0, len(output_ids), num_return_sequences):
             query_responses = []
             for j in range(num_return_sequences):
@@ -121,7 +131,7 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
                 query_responses.append(response)
             responses.append(query_responses)

-        return responses
+        return responses, input_lengths

     @torch.no_grad()
     def predict(self, inputs: List[dict], infer_cfg: dict = {}) -> List[dict]:
@@ -141,22 +151,33 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
             queries.append(input_item['data'][0])
             system_prompts.append(input_item.get('system_prompt', None))

-        responses = self._model_generate(queries, system_prompts, infer_cfg)
+        # Run inference
+        responses, input_lengths = self._model_generate(queries, system_prompts, infer_cfg)

+        # Process outputs
         results = []
-        for response in responses:
-            choices_list = [
-                ChatCompletionResponseChoice(
+        for response, input_length in zip(responses, input_lengths):
+            choices_list = []
+            completion_tokens = 0
+
+            for index, one_response in enumerate(response):
+                choice = ChatCompletionResponseChoice(
                     index=index, message=ChatMessage(content=one_response, role='assistant'), finish_reason='stop')
-                for index, one_response in enumerate(response)
-            ]
+                choices_list.append(choice)
+
+                completion_tokens += len(self.tokenizer.encode(one_response))
+
+            usage = Usage(
+                prompt_tokens=input_length,
+                completion_tokens=completion_tokens,
+                total_tokens=input_length + completion_tokens)

             res_d = ChatCompletionResponse(
                 model=self.model_id,
                 choices=choices_list,
                 object='chat.completion',
                 created=int(time.time()),
-                usage=None).model_dump(exclude_unset=True)
+                usage=usage).model_dump(exclude_unset=True)

             results.append(res_d)

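
The net effect of this change is that locally generated responses now report token usage instead of usage=None. A standalone sketch of the same accounting with a Hugging Face tokenizer; the model name is only an example and is not part of the diff:

from transformers import AutoTokenizer

# Example tokenizer; any causal-LM tokenizer works the same way for this sketch.
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')

prompt = 'What is the capital of France?'
responses = ['The capital of France is Paris.']  # one entry per returned sequence

prompt_tokens = len(tokenizer.encode(prompt))
completion_tokens = sum(len(tokenizer.encode(r)) for r in responses)

usage = {
    'prompt_tokens': prompt_tokens,
    'completion_tokens': completion_tokens,
    'total_tokens': prompt_tokens + completion_tokens,
}
print(usage)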
evalscope/perf/arguments.py CHANGED
@@ -27,7 +27,7 @@ class Arguments:
     no_test_connection: bool = False  # Test the connection before starting the benchmark

     # Performance and parallelism
-    number: Optional[int] = None  # Number of requests to be made
+    number: int = 1000  # Number of requests to be made
     parallel: int = 1  # Number of parallel requests
     rate: int = -1  # Rate limit for requests (default: -1, no limit)

@@ -35,6 +35,7 @@ class Arguments:
     log_every_n_query: int = 10  # Log every N queries
     debug: bool = False  # Debug mode
     wandb_api_key: Optional[str] = None  # WandB API key for logging
+    swanlab_api_key: Optional[str] = None  # SwanLab API key for logging
     name: Optional[str] = None  # Name for the run

     # Output settings
@@ -46,6 +47,7 @@ class Arguments:
     prefix_length: int = 0  # Length of the prefix, only for random dataset
     prompt: Optional[str] = None  # The prompt text
     query_template: Optional[str] = None  # Template for the query
+    apply_chat_template: Optional[bool] = None  # Whether to apply chat template

     # Dataset settings
     dataset: str = 'openqa'  # Dataset type (default: 'line_by_line')
@@ -57,13 +59,14 @@ class Arguments:
     max_tokens: Optional[int] = 2048  # Maximum number of tokens in the response
     min_tokens: Optional[int] = None  # Minimum number of tokens in the response
     n_choices: Optional[int] = None  # Number of response choices
-    seed: Optional[int] = 42  # Random seed for reproducibility
+    seed: Optional[int] = 0  # Random seed for reproducibility
     stop: Optional[List[str]] = field(default_factory=list)  # Stop sequences for the response
     stop_token_ids: Optional[List[str]] = field(default_factory=list)  # Stop token IDs for the response
-    stream: Optional[bool] = None  # Whether to stream the response
-    temperature: Optional[float] = None  # Temperature setting for the response
+    stream: Optional[bool] = True  # Whether to stream the response
+    temperature: float = 0.0  # Temperature setting for the response
     top_p: Optional[float] = None  # Top-p (nucleus) sampling setting for the response
     top_k: Optional[int] = None  # Top-k sampling setting for the response
+    extra_args: Optional[Dict[str, Any]] = None  # Extra arguments

     @staticmethod
     def from_args(args):
@@ -75,12 +78,26 @@ class Arguments:
         return Arguments(**args_dict)

     def __post_init__(self):
+        # Set the default headers
         self.headers = self.headers or {}  # Default to empty dictionary
         if self.api_key:
             # Assuming the API key is used as a Bearer token
             self.headers['Authorization'] = f'Bearer {self.api_key}'
+
+        # Set the model ID based on the model name
         self.model_id = os.path.basename(self.model)

+        # Set the URL based on the dataset type
+        if self.api.startswith('local'):
+            if self.dataset.startswith('speed_benchmark'):
+                self.url = f'http://127.0.0.1:{self.port}/v1/completions'
+            else:
+                self.url = f'http://127.0.0.1:{self.port}/v1/chat/completions'
+
+        # Set the apply_chat_template flag based on the URL
+        if self.apply_chat_template is None:
+            self.apply_chat_template = self.url.strip('/').endswith('chat/completions')
+
     def __str__(self):
         return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)

@@ -126,7 +143,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--no-test-connection', action='store_false', default=False, help='Do not test the connection before starting the benchmark')  # noqa: E501

     # Performance and parallelism
-    parser.add_argument('-n', '--number', type=int, default=None, help='How many requests to be made')
+    parser.add_argument('-n', '--number', type=int, default=1000, help='How many requests to be made')
     parser.add_argument('--parallel', type=int, default=1, help='Set number of concurrency requests, default 1')
     parser.add_argument('--rate', type=int, default=-1, help='Number of requests per second. default None')

@@ -134,7 +151,8 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--log-every-n-query', type=int, default=10, help='Logging every n query')
     parser.add_argument('--debug', action='store_true', default=False, help='Debug request send')
     parser.add_argument('--wandb-api-key', type=str, default=None, help='The wandb API key')
-    parser.add_argument('--name', type=str, help='The wandb db result name and result db name')
+    parser.add_argument('--swanlab-api-key', type=str, default=None, help='The swanlab API key')
+    parser.add_argument('--name', type=str, help='The wandb/swanlab db result name and result db name')

     # Prompt settings
     parser.add_argument('--max-prompt-length', type=int, default=sys.maxsize, help='Maximum input prompt length')
@@ -142,6 +160,8 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--prefix-length', type=int, default=0, help='The prefix length')
     parser.add_argument('--prompt', type=str, required=False, default=None, help='Specified the request prompt')
     parser.add_argument('--query-template', type=str, default=None, help='Specify the query template')
+    parser.add_argument(
+        '--apply-chat-template', type=argparse.BooleanOptionalAction, default=None, help='Apply chat template to the prompt')  # noqa: E501

     # Output settings
     parser.add_argument('--outputs-dir', help='Outputs dir.', default='outputs')
@@ -158,13 +178,14 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument(
         '--min-tokens', type=int, help='The minimum number of tokens that can be generated', default=None)
     parser.add_argument('--n-choices', type=int, help='How many completion choices to generate', default=None)
-    parser.add_argument('--seed', type=int, help='The random seed', default=42)
+    parser.add_argument('--seed', type=int, help='The random seed', default=0)
     parser.add_argument('--stop', nargs='*', help='The stop tokens', default=None)
     parser.add_argument('--stop-token-ids', nargs='*', help='Set the stop token IDs', default=None)
-    parser.add_argument('--stream', action='store_true', help='Stream output with SSE', default=None)
-    parser.add_argument('--temperature', type=float, help='The sample temperature', default=None)
+    parser.add_argument('--stream', action=argparse.BooleanOptionalAction, help='Stream output with SSE', default=True)
+    parser.add_argument('--temperature', type=float, help='The sample temperature', default=0.0)
     parser.add_argument('--top-p', type=float, help='Sampling top p', default=None)
     parser.add_argument('--top-k', type=int, help='Sampling top k', default=None)
+    parser.add_argument('--extra-args', type=json.loads, default='{}', help='Extra arguments, should in JSON format',)
     # yapf: enable

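
One behavioral detail worth calling out: when --apply-chat-template is left unset, __post_init__ now infers it from the request URL. A minimal sketch of that rule, lifted from the code above into a hypothetical standalone helper for illustration:

def infer_apply_chat_template(url: str, apply_chat_template=None) -> bool:
    # Same rule as Arguments.__post_init__: default to True only for chat/completions endpoints.
    if apply_chat_template is None:
        apply_chat_template = url.strip('/').endswith('chat/completions')
    return apply_chat_template

assert infer_apply_chat_template('http://127.0.0.1:8000/v1/chat/completions') is True
assert infer_apply_chat_template('http://127.0.0.1:8000/v1/completions') is False
assert infer_apply_chat_template('http://127.0.0.1:8000/v1/completions', True) is True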