evalscope 0.16.3__py3-none-any.whl → 0.17.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (157)
  1. evalscope/app/app.py +9 -762
  2. evalscope/app/constants.py +1 -0
  3. evalscope/app/ui/__init__.py +20 -0
  4. evalscope/app/ui/app_ui.py +52 -0
  5. evalscope/app/ui/multi_model.py +323 -0
  6. evalscope/app/ui/sidebar.py +42 -0
  7. evalscope/app/ui/single_model.py +202 -0
  8. evalscope/app/ui/visualization.py +36 -0
  9. evalscope/app/utils/data_utils.py +178 -0
  10. evalscope/app/utils/localization.py +221 -0
  11. evalscope/app/utils/text_utils.py +119 -0
  12. evalscope/app/utils/visualization.py +91 -0
  13. evalscope/backend/opencompass/backend_manager.py +2 -1
  14. evalscope/backend/rag_eval/backend_manager.py +2 -1
  15. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  16. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
  17. evalscope/benchmarks/__init__.py +15 -1
  18. evalscope/benchmarks/aime/aime24_adapter.py +2 -1
  19. evalscope/benchmarks/aime/aime25_adapter.py +2 -1
  20. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
  21. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  22. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
  23. evalscope/benchmarks/arena_hard/utils.py +0 -12
  24. evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
  25. evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
  26. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
  27. evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
  28. evalscope/benchmarks/data_adapter.py +29 -9
  29. evalscope/benchmarks/general_arena/__init__.py +0 -0
  30. evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
  31. evalscope/benchmarks/general_arena/utils.py +226 -0
  32. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +3 -2
  33. evalscope/benchmarks/general_qa/general_qa_adapter.py +44 -30
  34. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  35. evalscope/benchmarks/hle/__init__.py +0 -0
  36. evalscope/benchmarks/hle/hle_adapter.py +118 -0
  37. evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
  38. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
  39. evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
  40. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
  41. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
  42. evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
  43. evalscope/benchmarks/mmlu/mmlu_adapter.py +2 -2
  44. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  45. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
  46. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  47. evalscope/benchmarks/race/race_adapter.py +1 -1
  48. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  49. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
  50. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
  51. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
  52. evalscope/benchmarks/utils.py +2 -2
  53. evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
  54. evalscope/config.py +8 -123
  55. evalscope/constants.py +5 -21
  56. evalscope/evaluator/__init__.py +1 -1
  57. evalscope/evaluator/evaluator.py +20 -15
  58. evalscope/metrics/__init__.py +9 -1
  59. evalscope/{utils/utils.py → metrics/completion_parsers.py} +71 -176
  60. evalscope/metrics/llm_judge.py +106 -20
  61. evalscope/metrics/metrics.py +20 -8
  62. evalscope/models/__init__.py +4 -8
  63. evalscope/models/adapters/__init__.py +4 -9
  64. evalscope/models/adapters/base_adapter.py +4 -0
  65. evalscope/models/adapters/bfcl_adapter.py +2 -0
  66. evalscope/models/adapters/chat_adapter.py +3 -0
  67. evalscope/models/adapters/choice_adapter.py +4 -0
  68. evalscope/models/adapters/custom_adapter.py +7 -3
  69. evalscope/models/adapters/server_adapter.py +4 -2
  70. evalscope/models/adapters/t2i_adapter.py +3 -0
  71. evalscope/models/adapters/tau_bench_adapter.py +189 -0
  72. evalscope/models/custom/dummy_model.py +3 -3
  73. evalscope/models/register.py +0 -14
  74. evalscope/perf/arguments.py +15 -16
  75. evalscope/perf/benchmark.py +38 -39
  76. evalscope/perf/http_client.py +30 -86
  77. evalscope/perf/main.py +3 -3
  78. evalscope/perf/plugin/__init__.py +3 -2
  79. evalscope/perf/plugin/api/__init__.py +4 -3
  80. evalscope/perf/plugin/api/base.py +22 -4
  81. evalscope/perf/plugin/api/custom_api.py +212 -55
  82. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  83. evalscope/perf/plugin/api/default_api.py +105 -0
  84. evalscope/perf/plugin/api/openai_api.py +17 -19
  85. evalscope/perf/plugin/datasets/__init__.py +10 -7
  86. evalscope/perf/plugin/datasets/base.py +22 -1
  87. evalscope/perf/plugin/datasets/custom.py +2 -1
  88. evalscope/perf/plugin/datasets/flickr8k.py +4 -27
  89. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  90. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  91. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  92. evalscope/perf/plugin/datasets/openqa.py +2 -1
  93. evalscope/perf/plugin/datasets/random_dataset.py +15 -4
  94. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  95. evalscope/perf/plugin/registry.py +36 -16
  96. evalscope/perf/utils/analysis_result.py +24 -23
  97. evalscope/perf/utils/benchmark_util.py +14 -20
  98. evalscope/perf/utils/db_util.py +79 -61
  99. evalscope/report/__init__.py +1 -1
  100. evalscope/report/utils.py +34 -15
  101. evalscope/run.py +1 -1
  102. evalscope/summarizer.py +1 -2
  103. evalscope/utils/__init__.py +63 -2
  104. evalscope/utils/argument_utils.py +64 -0
  105. evalscope/utils/import_utils.py +16 -0
  106. evalscope/utils/io_utils.py +55 -4
  107. evalscope/utils/model_utils.py +37 -1
  108. evalscope/version.py +2 -2
  109. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/METADATA +100 -51
  110. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/RECORD +129 -133
  111. tests/aigc/test_t2i.py +1 -1
  112. tests/cli/test_all.py +68 -4
  113. tests/cli/test_collection.py +1 -1
  114. tests/cli/test_custom.py +261 -0
  115. tests/cli/test_run.py +34 -70
  116. tests/perf/test_perf.py +31 -4
  117. tests/rag/test_clip_benchmark.py +2 -1
  118. tests/rag/test_mteb.py +3 -1
  119. tests/rag/test_ragas.py +3 -1
  120. tests/swift/test_run_swift_eval.py +2 -1
  121. tests/swift/test_run_swift_vlm_eval.py +2 -1
  122. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
  123. tests/utils.py +13 -0
  124. tests/vlm/test_vlmeval.py +8 -2
  125. evalscope/evaluator/rating_eval.py +0 -157
  126. evalscope/evaluator/reviewer/__init__.py +0 -1
  127. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  128. evalscope/models/model.py +0 -189
  129. evalscope/registry/__init__.py +0 -1
  130. evalscope/registry/config/cfg_arena.yaml +0 -77
  131. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  132. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  133. evalscope/registry/config/cfg_single.yaml +0 -78
  134. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  135. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  136. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  137. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  138. evalscope/registry/data/question.jsonl +0 -80
  139. evalscope/registry/tasks/arc.yaml +0 -28
  140. evalscope/registry/tasks/bbh.yaml +0 -26
  141. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  142. evalscope/registry/tasks/ceval.yaml +0 -27
  143. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  144. evalscope/registry/tasks/cmmlu.yaml +0 -27
  145. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  146. evalscope/registry/tasks/general_qa.yaml +0 -27
  147. evalscope/registry/tasks/gsm8k.yaml +0 -29
  148. evalscope/registry/tasks/mmlu.yaml +0 -29
  149. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  150. evalscope/run_arena.py +0 -202
  151. evalscope/utils/arena_utils.py +0 -217
  152. evalscope/utils/completion_parsers.py +0 -82
  153. /evalscope/{utils → benchmarks}/filters.py +0 -0
  154. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/LICENSE +0 -0
  155. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/WHEEL +0 -0
  156. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/entry_points.txt +0 -0
  157. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/top_level.txt +0 -0
@@ -14,7 +14,8 @@ logger = get_logger()
  @Benchmark.register(
      name='general_qa',
      pretty_name='General-QA',
-     description='General Question Answering dataset',
+     description='A general question answering dataset for custom evaluation. '
+     'For detailed instructions on how to use this benchmark, please refer to the [User Guide](https://evalscope.readthedocs.io/zh-cn/latest/advanced_guides/custom_dataset/llm.html#qa).', # noqa: E501
      tags=['QA', 'Custom'],
      dataset_id='general_qa',
      subset_list=['default'],
@@ -25,13 +26,21 @@ logger = get_logger()
      prompt_template='请回答问题\n{query}',
  )
  class GeneralQAAdapter(DataAdapter):
-     # TODO: set few_shot_num

      def __init__(self, **kwargs):
-
          super().__init__(**kwargs)

      def load(self, dataset_name_or_path: str = None, subset_list: list = None, **kwargs) -> dict:
+         """
+         Load dataset from the given path or dataset name.
+
+         Args:
+             dataset_name_or_path (str): Path to dataset directory or file.
+             subset_list (list): List of subset names to load.
+
+         Returns:
+             dict: Loaded dataset organized by subset.
+         """
          dataset_name_or_path = dataset_name_or_path or self.dataset_id
          subset_list = subset_list or self.subset_list

@@ -61,58 +70,64 @@ class GeneralQAAdapter(DataAdapter):

      def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
          """
+         Generate prompt for the model based on input data.
+
          Args:
-             input_d:
-                 format1: {'history': [['q1', 'a1'], ['q2', 'a2']], 'question': '', 'answer': ''}
-                 format2: {'history': [['q1', 'a1'], ['q2', 'a2']], 'query': '', 'response': ''}
+             input_d (dict): Input data dictionary.
+             subset_name (str): Name of the subset.
+             few_shot_list (list): List of few-shot examples.

          Returns:
-             {'data': [prompt]}
-
+             dict: Dictionary containing the generated prompt.
          """
-         # prompt = f"'<|im_start|>user\n{input_d['input']}<|im_end|>\n<|im_start|>assistant\n'"
-         history = input_d.get('history', [])  # history: [['q1', 'a1'], ['q2', 'a2'], ...]
-         if len(history) > 0:
-             logger.warning('The history is not included in the prompt for GeneralQA. \
-                 To be supported in the future.')
-
+         messages = input_d.get('messages')
          query = input_d.get('question', '') or input_d.get('query', '')
          system_prompt = input_d.get('system')
          prompt = self.prompt_template.format(query=query)
-         return self.gen_prompt_data(prompt, system_prompt=system_prompt)
+         return self.gen_prompt_data(prompt, system_prompt=system_prompt, messages=messages)

      def get_gold_answer(self, input_d: dict) -> str:
          """
+         Extract the gold (reference) answer from the input data.
+
          Args:
-             input_d: {'history': [], 'question': '', 'answer': ''}
+             input_d (dict): Input data dictionary.

          Returns:
-             gold_answer: str
-
+             str: Gold answer string.
          """
-         return input_d.get('answer', '') or input_d.get('response', '')
+         return input_d.get('answer') or input_d.get('response')

      def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
          """
+         Parse the prediction result.
+
          Args:
-             result: str
+             result (str): Model prediction result.
+             raw_input_d (dict, optional): Original input data.
+             eval_type (str): Evaluation type.

          Returns:
-             pred_result: str
-
+             str: Parsed prediction result.
          """
          return result

      def match(self, gold: str, pred: str) -> dict:
          """
+         Compute metric scores between gold and predicted answers.
+
          Args:
-             gold: str
-             pred: str
+             gold (str): Gold answer.
+             pred (str): Predicted answer.

          Returns:
-             bleu_score: dict
-
+             dict: Dictionary of computed metric scores.
          """
+         # reference free metrics
+         if gold is None:
+             return {'AverageAccuracy': -1}
+
+         # calculate rouge and bleu scores
          res = dict()
          if 'AverageRouge' in self.metric_list:
              from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
@@ -128,14 +143,13 @@ class GeneralQAAdapter(DataAdapter):

      def compute_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
          """
-         compute weighted mean of the bleu score of all samples
+         Compute weighted mean of the metric scores for all samples.

          Args:
-             review_res_list: [score1, score2, ...]
+             review_res_list (list): List of metric score dictionaries.

          Returns:
-             avg_res: List[dict]
-
+             list: List of dictionaries with averaged metric results.
          """
          items = super().compute_dict_metric(review_res_list, **kwargs)
          return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]
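
For readers following the GeneralQAAdapter changes above: the adapter now reads an optional `system` prompt and a pre-built `messages` list in addition to the `question`/`query` and `answer`/`response` fields. A minimal sketch of records that would pass through the new `gen_prompt`/`get_gold_answer` paths is shown below; the field names are taken from the diff, while the exact custom-dataset file layout is documented in the linked User Guide.

```python
# Hypothetical custom QA records, shaped after the fields the adapter reads.
single_turn = {
    'query': 'What is the capital of France?',
    'response': 'Paris',
    'system': 'You are a concise assistant.',
}
multi_turn = {
    # when 'messages' is present it is forwarded to gen_prompt_data as-is
    'messages': [
        {'role': 'user', 'content': 'Hi there'},
        {'role': 'assistant', 'content': 'Hello! How can I help?'},
        {'role': 'user', 'content': 'What is 2 + 2?'},
    ],
    'response': '4',
}
```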
@@ -6,9 +6,9 @@ import re
  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType, OutputType
  from evalscope.metrics import exact_match
+ from evalscope.metrics.completion_parsers import ResponseParser
  from evalscope.utils.io_utils import jsonl_to_list
  from evalscope.utils.logger import get_logger
- from evalscope.utils.utils import ResponseParser

  # flake8: noqa

@@ -0,0 +1,118 @@
+ import re
+ from collections import defaultdict
+ from typing import Any, List
+
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.metrics import DEFAULT_PROMPT_TEMPLATE, LLMJudge, exact_match, mean
+ from evalscope.utils.logger import get_logger
+
+ # flake8: noqa
+
+ logger = get_logger()
+
+ SUBSET_LIST = [
+     'Biology/Medicine',
+     'Chemistry',
+     'Computer Science/AI',
+     'Engineering',
+     'Humanities/Social Science',
+     'Math',
+     'Physics',
+     'Other',
+ ]
+
+
+ @Benchmark.register(
+     name='hle',
+     pretty_name="Humanity's-Last-Exam",
+     tags=['Knowledge', 'QA'],
+     description=
+     'Humanity\'s Last Exam (HLE) is a language model benchmark consisting of 2,500 questions across a broad range of subjects. It was created jointly by the Center for AI Safety and Scale AI. The benchmark classifies the questions into the following broad subjects: mathematics (41%), physics (9%), biology/medicine (11%), humanities/social science (9%), computer science/artificial intelligence (10%), engineering (4%), chemistry (7%), and other (9%). Around 14% of the questions require the ability to understand both text and images, i.e., multi-modality. 24% of the questions are multiple-choice; the rest are short-answer, exact-match questions.', # noqa: E501
+     dataset_id='cais/hle',
+     subset_list=SUBSET_LIST,
+     metric_list=['AverageAccuracy'],
+     few_shot_num=0,
+     train_split=None,
+     eval_split='test',
+     prompt_template='{query}\n\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+ )
+ class HLEAdapter(DataAdapter):
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+         self.llm_as_a_judge = True
+
+     def load(self, **kwargs):
+         kwargs['subset_list'] = ['default']
+         data_dict = super().load(**kwargs)
+         return self.reformat_subset(data_dict, subset_key='category', format='{}')
+
+     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+         # remove image preview
+         input_d.pop('image_preview', None)
+         input_d.pop('rationale_image', None)
+         # generate prompt
+         question = input_d['question']
+         prompt = self.prompt_template.format(query=question)
+         image = input_d.get('image', None)
+         # build messages for multi-modal input
+         messages = []
+         if self.system_prompt:
+             messages.append({'role': 'system', 'content': self.system_prompt})
+         if image:
+             messages.append({
+                 'role':
+                 'user',
+                 'content': [{
+                     'type': 'text',
+                     'text': prompt
+                 }, {
+                     'type': 'image_url',
+                     'image_url': {
+                         'url': image
+                     }
+                 }]
+             })
+         else:
+             messages.append({'role': 'user', 'content': prompt})
+         return self.gen_prompt_data(prompt='', messages=messages)
+
+     def get_gold_answer(self, input_d: dict) -> str:
+         return input_d['answer']
+
+     def parse_pred_result(self, result: str, raw_input_d: dict = None, **kwargs) -> str:
+         # Extract the answer from the model output \boxed{answer}
+         match = re.search(r'\\boxed{([^}]*)}', result)
+         if match:
+             return match.group(1).strip()
+         else:
+             logger.warning(f'No answer found in the model output: {result}')
+             return ''
+
+     def llm_parse_pred_result(self, result, raw_input_d=None, **kwargs) -> str:
+         return result.strip()
+
+     def match(self, gold: str, pred: str) -> dict:
+         # simple match
+         return {
+             'AverageAccuracy': 1.0 if exact_match(gold, pred) else 0.0,
+         }
+
+     def llm_match(self, gold: Any, pred: Any, judge: LLMJudge, **kwargs) -> dict:
+         raw_input = kwargs.get('raw_input', None)
+         question = raw_input['question']
+         # get grading response
+         prompt = judge.build_prompt(pred, gold, question)
+         judge_response = judge(prompt)
+         score = judge.get_score(judge_response)
+         return {
+             'AverageAccuracy': score,
+             'response': judge_response,
+         }
+
+     def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
+         # zip dict answers
+         res_dict = super().compute_dict_metric(review_res_list, **kwargs)
+
+         return super().compute_metric(res_dict, **kwargs)
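
As an illustration of the answer-extraction step in the new HLEAdapter above, the following standalone snippet reproduces the `\boxed{}` parsing logic; the helper name is chosen here for illustration and is not part of the package.

```python
import re

def extract_boxed_answer(result: str) -> str:
    # Mirror of HLEAdapter.parse_pred_result: pull the content of \boxed{...}.
    match = re.search(r'\\boxed{([^}]*)}', result)
    return match.group(1).strip() if match else ''

print(extract_boxed_answer('Therefore the answer is \\boxed{ 42 }.'))  # -> '42'
print(extract_boxed_answer('No boxed answer given.'))                  # -> ''
```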
@@ -22,7 +22,8 @@ logger = get_logger()
      few_shot_num=0,
      train_split=None,
      eval_split='test',
-     prompt_template='Complete the following python code:\n{query}',
+     prompt_template=
+     'Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{query}', # noqa: E501
      extra_params={
          'num_workers': 4,
          'timeout': 4
@@ -76,26 +77,9 @@ class HumanevalAdapter(DataAdapter):

      @classmethod
      def _postprocess(cls, text: str) -> str:
-         if '```' in text:
-             blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
-             if len(blocks) == 0:
-                 text = text.split('```')[1]  # fall back to default strategy
-             else:
-                 text = blocks[0]  # fetch the first code block
-                 if not text.startswith('\n'):  # in case starting with ```python
-                     text = text[max(text.find('\n') + 1, 0):]
-         if text.strip().startswith('from') or text.strip().startswith('import'):
-             def_idx = text.find('def')
-             if def_idx != -1:
-                 text = text[max(text.find('\n', def_idx) + 1, 0):]
-         text = text.split('\n\n')[0]
-         if text.strip().startswith('def'):
-             text = '\n'.join(text.split('\n')[1:])
-         if not text.startswith('    '):
-             if text.startswith(' '):
-                 text = '    ' + text.lstrip()
-             else:
-                 text = '\n'.join(['    ' + line for line in text.split('\n')])
+         blocks = re.findall(r'```\w*\n(.*?)```', text, re.DOTALL)
+         if len(blocks) >= 1:
+             text = blocks[0]
          return text

      def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
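
The simplified `_postprocess` above replaces the multi-step heuristics with a single fenced-code-block regex. A small standalone sketch of that behaviour (the helper name is chosen here for illustration):

```python
import re

def first_code_block(text: str) -> str:
    # Same pattern as the new HumanevalAdapter._postprocess: take the first
    # fenced block, tolerating a language tag such as ```python; if no block
    # is found, the text is returned unchanged.
    blocks = re.findall(r'```\w*\n(.*?)```', text, re.DOTALL)
    return blocks[0] if blocks else text

completion = 'Sure, here it is:\n```python\ndef add(a, b):\n    return a + b\n```\nLet me know.'
print(first_code_block(completion))
```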
@@ -2,7 +2,6 @@ from collections import defaultdict
  from typing import Any, Dict, List

  from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.benchmarks.ifeval.utils import process_results
  from evalscope.constants import EvalType
  from evalscope.metrics import Metric, mean, metric_registry

@@ -43,10 +42,9 @@ class IFEvalAdapter(DataAdapter):
      def get_gold_answer(self, input_d: dict) -> str:
          return input_d

-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-         return result
-
      def match(self, gold: Any, pred: Any) -> Dict:
+         from evalscope.benchmarks.ifeval.utils import process_results
+
          return process_results(gold, [pred])

      def compute_metric(self, review_res_list: List[dict], **kwargs) -> Any:
@@ -1,7 +1,7 @@
  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType, OutputType
  from evalscope.metrics import exact_match
- from evalscope.utils.utils import ResponseParser
+ from evalscope.metrics.completion_parsers import ResponseParser


  @Benchmark.register(
@@ -69,12 +69,6 @@
          # Extract the gold answer from the input dict.
          return input_d

-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
-         """
-         Parse the model output to get the answer. Could be the best choice index.
-         """
-         return result
-
      def match(self, gold: dict, pred: str) -> float:
          from .evaluate_utils import codegen_metrics
          from .extract_utils import extract_code_generation
@@ -3,7 +3,7 @@ from typing import Any
  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType, OutputType
  from evalscope.metrics import exact_match
- from evalscope.utils.utils import ResponseParser
+ from evalscope.metrics.completion_parsers import ResponseParser

  SUBSET_LIST = ['default']

@@ -54,4 +54,5 @@ class Math500Adapter(DataAdapter):
          return result

      def match(self, gold: str, pred: str) -> float:
-         return math_equal(pred, gold)
+         res = math_equal(pred, gold)
+         return 1.0 if res else 0.0
@@ -5,7 +5,7 @@ import os
  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType, OutputType
  from evalscope.metrics import exact_match
- from evalscope.utils import ResponseParser
+ from evalscope.metrics.completion_parsers import ResponseParser
  from evalscope.utils.logger import get_logger

  # flake8: noqa
@@ -144,7 +144,7 @@ SUBJECT_MAPPING = {
      output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
      subset_list=SUBSET_LIST,
      metric_list=['AverageAccuracy'],
-     few_shot_num=5,
+     few_shot_num=0,
      train_split='train',
      eval_split='test',
      prompt_template=
@@ -4,7 +4,7 @@ from typing import Any, Dict
  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType, OutputType
  from evalscope.metrics import exact_match
- from evalscope.utils.utils import ResponseParser
+ from evalscope.metrics.completion_parsers import ResponseParser

  SUBSET_LIST = [
      'computer science', 'math', 'chemistry', 'engineering', 'law', 'biology', 'health', 'physics', 'business',
@@ -4,8 +4,8 @@ from typing import Any, Dict
  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType, OutputType
  from evalscope.metrics import exact_match
+ from evalscope.metrics.completion_parsers import ResponseParser
  from evalscope.utils.logger import get_logger
- from evalscope.utils.utils import ResponseParser

  logger = get_logger()

@@ -4,7 +4,7 @@ from typing import Any
  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType, OutputType
  from evalscope.metrics import exact_match
- from evalscope.utils.utils import ResponseParser
+ from evalscope.metrics.completion_parsers import ResponseParser


  @Benchmark.register(
@@ -5,7 +5,7 @@ import os
  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType, OutputType
  from evalscope.metrics import exact_match
- from evalscope.utils import ResponseParser
+ from evalscope.metrics.completion_parsers import ResponseParser
  from evalscope.utils.io_utils import jsonl_to_list
  from evalscope.utils.logger import get_logger

@@ -0,0 +1,110 @@
+ import importlib
+ from collections import defaultdict
+ from typing import Dict, List
+
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.metrics import Metric, mean, metric_registry
+ from evalscope.utils import get_logger
+
+ logger = get_logger()
+
+
+ @Benchmark.register(
+     name='tau_bench',
+     pretty_name='τ-bench',
+     tags=['Reasoning', 'Agent', 'Function Calling'],
+     description='A benchmark emulating dynamic conversations between a user (simulated by language models) '
+     'and a language agent provided with domain-specific API tools and policy guidelines. '
+     'Please install it with `pip install git+https://github.com/sierra-research/tau-bench` before evaluating and set a user model. ', # noqa: E501
+     dataset_id='https://github.com/sierra-research/tau-bench',
+     model_adapter='tau_bench_server',
+     subset_list=['airline', 'retail'],
+     metric_list=['Pass^1'],
+     eval_split='test',
+     extra_params={
+         'user_model': 'qwen-plus',
+         'api_key': 'EMPTY',
+         'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+         'generation_config': {
+             'temperature': 0.7,
+             'max_new_tokens': 1024
+         }
+     })
+ class TauBenchAdapter(DataAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+         spec = importlib.util.find_spec('tau_bench')
+         if spec is None:
+             raise ImportError(
+                 '`tau_bench` not found, please install it with `pip install git+https://github.com/sierra-research/tau-bench` before evaluating.' # noqa: E501
+             )
+
+         metric_registry.register(Metric(name='Pass^1', object=mean))
+
+         # setup user model args
+         extra_params = kwargs.get('extra_params', {})
+         self.user_model = extra_params.get('user_model', 'qwen-plus')
+         self.api_key = extra_params.get('api_key', 'EMPTY')
+         self.api_base = extra_params.get('api_base', 'https://dashscope.aliyuncs.com/compatible-mode/v1')
+         self.generation_config = extra_params.get('generation_config', {'temperature': 0.7, 'max_new_tokens': 1024})
+
+         self._patch_env_completion()
+
+     def _patch_env_completion(self) -> str:
+         from tau_bench.envs.user import LLMUserSimulationEnv
+
+         def new_generate_next_message(self, messages):
+             from evalscope.models import ServerModelAdapter
+
+             user_server = ServerModelAdapter(
+                 api_url=adapter_instance.api_base,
+                 model_id=adapter_instance.user_model,
+                 api_key=adapter_instance.api_key)
+             request_json = user_server.make_request(
+                 input_item={'messages': messages}, infer_cfg=adapter_instance.generation_config)
+             res = user_server.send_request(request_json)
+
+             message = res['choices'][0]['message']
+             self.messages.append(message)
+             self.total_cost = 0
+             return message['content']
+
+         # get the current instance of TauBenchAdapter
+         adapter_instance = self
+         LLMUserSimulationEnv.generate_next_message = new_generate_next_message
+
+     def load(self, **kwargs):
+         from tau_bench.envs import get_env
+
+         data_dict = defaultdict(dict)
+         for env_name in self.subset_list:
+             logger.info(f'Loading TauBench environment: {env_name}')
+             env = get_env(
+                 env_name=env_name,
+                 user_strategy='llm',
+                 user_model='dummy', # Use dummy model to prevent errors
+                 user_provider='openai', # Use dummy provider to prevent errors
+                 task_split=self.eval_split,
+             )
+             tasks = []
+             for i in range(len(env.tasks)):
+                 tasks.append({
+                     'task_index': i,
+                     'env_name': env_name,
+                 })
+             data_dict[env_name][self.eval_split] = tasks
+
+         return data_dict
+
+     def gen_prompt(self, input_d, subset_name, few_shot_list, **kwargs):
+         return self.gen_prompt_data(extra_data=input_d)
+
+     def get_gold_answer(self, input_d):
+         return ''
+
+     def match(self, gold, pred):
+         import json
+         res = json.loads(pred)
+         return res.get('reward', 0.0)
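
To show how the `extra_params` registered above are meant to be supplied at run time, here is a hedged sketch assuming the usual `TaskConfig`/`run_task` entry points; the model name and subset selection are placeholders, and the extra_params simply mirror the defaults declared in the adapter.

```python
from evalscope import TaskConfig, run_task

# Sketch only: 'my-agent-model' is a placeholder for the agent under test.
task_cfg = TaskConfig(
    model='my-agent-model',
    datasets=['tau_bench'],
    dataset_args={
        'tau_bench': {
            'subset_list': ['airline'],
            'extra_params': {
                'user_model': 'qwen-plus',
                'api_key': 'EMPTY',
                'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                'generation_config': {'temperature': 0.7, 'max_new_tokens': 1024},
            },
        }
    },
)
run_task(task_cfg=task_cfg)
```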
@@ -1,3 +1,4 @@
+ import json
  from typing import Dict, List

  from evalscope.benchmarks import Benchmark, DataAdapter
@@ -8,7 +9,7 @@ from evalscope.metrics import Metric, mean, metric_registry
  @Benchmark.register(
      name='tool_bench',
      pretty_name='ToolBench-Static',
-     tags=['Reasoning', 'Agent'],
+     tags=['Reasoning', 'Agent', 'Function Calling'],
      description='ToolBench is a benchmark for evaluating AI models on tool use tasks. '
      'It includes various subsets such as in-domain and out-of-domain, '
      'each with its own set of problems that require step-by-step reasoning to arrive at the correct answer. '
@@ -40,6 +41,11 @@ class ToolBenchAdapter(DataAdapter):
          for message in messages:
              if 'name' in message:
                  del message['name']
+             if 'role' in message:
+                 if message['role'] == 'function':
+                     content = json.dumps(message, ensure_ascii=False)
+                     message['role'] = 'user'
+                     message['content'] = content
          return self.gen_prompt_data(prompt='', messages=messages)

      def get_gold_answer(self, input_d: dict) -> str:
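
The ToolBench change above folds `function`-role messages into plain user messages before they are sent to the model. A self-contained illustration of that transformation (the tool payload here is invented):

```python
import json

messages = [
    {'role': 'assistant', 'content': 'Let me call the weather tool.'},
    {'role': 'function', 'name': 'get_weather', 'content': '{"temp_c": 21}'},
]
for message in messages:
    if 'name' in message:
        del message['name']
    if 'role' in message and message['role'] == 'function':
        # the whole function message is serialized and replayed as user content
        content = json.dumps(message, ensure_ascii=False)
        message['role'] = 'user'
        message['content'] = content

print(messages)
# the second entry is now a 'user' message whose content is the JSON-serialized original
```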
@@ -96,13 +96,16 @@ class TriviaQaAdapter(DataAdapter):
          def get_sys_prompt(inp: dict) -> str:
              return inp['input'][0]['content']

-         prompt = get_sys_prompt(input_d)
+         if self.few_shot_num > 0:
+             sys_prompt = get_sys_prompt(input_d)
+         else:
+             sys_prompt = None
          few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
-         context: str = '\n'.join(few_shot_prompts) + '\n'
+         context = '\n'.join(few_shot_prompts) + '\n'
          context += self._generate_prompt(input_d=input_d, include_answer=False)
          full_prompt = context

-         return self.gen_prompt_data(full_prompt)
+         return self.gen_prompt_data(full_prompt, system_prompt=sys_prompt)

      def get_gold_answer(self, input_d: dict) -> list:
          # Get the gold choice
@@ -124,7 +127,9 @@
          return result

      def match(self, gold: list, pred: str) -> float:
-         is_correct = any([cand in pred for cand in gold])
+         lower_pred = pred.lower()
+         gold = [g.lower() for g in gold]
+         is_correct = any([cand in lower_pred for cand in gold])
          return 1 if is_correct else 0

      @classmethod
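
The TriviaQA `match` above is now case-insensitive. A tiny standalone check of the new behaviour (the alias list below is made up):

```python
def match(gold: list, pred: str) -> int:
    # Same logic as the updated TriviaQaAdapter.match: lowercase both sides
    # before the substring containment test.
    lower_pred = pred.lower()
    gold = [g.lower() for g in gold]
    return 1 if any(cand in lower_pred for cand in gold) else 0

print(match(['Paris', 'Paris, France'], 'The answer is PARIS.'))  # -> 1
print(match(['Paris'], 'The answer is Lyon.'))                    # -> 0
```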
@@ -2,8 +2,7 @@ from dataclasses import asdict, dataclass
  from functools import wraps
  from typing import Dict, List, Optional, Union

- from evalscope.constants import EvalType
- from evalscope.utils.filters import Filter
+ from .filters import Filter


  @dataclass
@@ -14,6 +13,7 @@ class PromptData:
      multi_choices: Optional[List[str]] = None
      id: Optional[str] = None
      messages: Optional[List[dict]] = None
+     extra_data: Optional[Dict] = None

      def to_dict(self) -> Dict:
          return {k: v for k, v in asdict(self).items() if v is not None}
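
For context on the `extra_data` field added to `PromptData` above, here is a condensed, self-contained sketch; fields not visible in this hunk (such as `data` and `system_prompt`) are assumed for illustration only.

```python
from dataclasses import asdict, dataclass
from typing import Dict, List, Optional

@dataclass
class PromptData:
    data: List[str]                             # assumed field, not shown in the hunk
    system_prompt: Optional[str] = None         # assumed field, not shown in the hunk
    multi_choices: Optional[List[str]] = None
    id: Optional[str] = None
    messages: Optional[List[dict]] = None
    extra_data: Optional[Dict] = None           # new field, passed through by adapters

    def to_dict(self) -> Dict:
        # None-valued fields are dropped, so extra_data only appears when set
        return {k: v for k, v in asdict(self).items() if v is not None}

print(PromptData(data=['hello'], extra_data={'task_index': 0}).to_dict())
# -> {'data': ['hello'], 'extra_data': {'task_index': 0}}
```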
@@ -1,7 +1,7 @@
  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType, OutputType
  from evalscope.metrics import exact_match
- from evalscope.utils.utils import ResponseParser
+ from evalscope.metrics.completion_parsers import ResponseParser


  @Benchmark.register(