evalscope 0.10.1__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (81)
  1. evalscope/arguments.py +3 -0
  2. evalscope/benchmarks/aime/__init__.py +0 -0
  3. evalscope/benchmarks/aime/aime24_adapter.py +49 -0
  4. evalscope/benchmarks/aime/aime25_adapter.py +49 -0
  5. evalscope/benchmarks/arc/arc_adapter.py +5 -7
  6. evalscope/benchmarks/bbh/bbh_adapter.py +17 -14
  7. evalscope/benchmarks/benchmark.py +5 -3
  8. evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
  9. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
  10. evalscope/benchmarks/competition_math/competition_math_adapter.py +21 -24
  11. evalscope/benchmarks/data_adapter.py +88 -29
  12. evalscope/benchmarks/data_collection/__init__.py +0 -0
  13. evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
  14. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  15. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +125 -0
  16. evalscope/benchmarks/general_qa/general_qa_adapter.py +10 -11
  17. evalscope/benchmarks/gpqa/gpqa_adapter.py +27 -9
  18. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +9 -14
  19. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
  20. evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
  21. evalscope/benchmarks/ifeval/ifeval_adapter.py +15 -14
  22. evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
  23. evalscope/benchmarks/math_500/__init__.py +0 -0
  24. evalscope/benchmarks/math_500/math_500_adapter.py +58 -0
  25. evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
  26. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +32 -36
  27. evalscope/benchmarks/musr/__init__.py +0 -0
  28. evalscope/benchmarks/musr/musr_adapter.py +68 -0
  29. evalscope/benchmarks/process_bench/__init__.py +0 -0
  30. evalscope/benchmarks/process_bench/critique_template.txt +13 -0
  31. evalscope/benchmarks/process_bench/process_bench_adapter.py +96 -0
  32. evalscope/benchmarks/race/race_adapter.py +3 -3
  33. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
  34. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +9 -9
  35. evalscope/cli/start_app.py +4 -1
  36. evalscope/cli/start_eval.py +4 -3
  37. evalscope/cli/start_perf.py +4 -2
  38. evalscope/collections/evaluator.py +109 -39
  39. evalscope/collections/sampler.py +2 -1
  40. evalscope/collections/schema.py +1 -2
  41. evalscope/config.py +4 -1
  42. evalscope/evaluator/evaluator.py +81 -65
  43. evalscope/metrics/__init__.py +2 -1
  44. evalscope/metrics/math_parser.py +526 -0
  45. evalscope/metrics/metrics.py +39 -3
  46. evalscope/metrics/named_metrics.py +31 -7
  47. evalscope/models/base_adapter.py +7 -1
  48. evalscope/models/chat_adapter.py +69 -49
  49. evalscope/models/choice_adapter.py +52 -45
  50. evalscope/models/custom_adapter.py +2 -2
  51. evalscope/models/local_model.py +7 -2
  52. evalscope/models/server_adapter.py +106 -61
  53. evalscope/perf/__init__.py +0 -1
  54. evalscope/perf/arguments.py +5 -1
  55. evalscope/perf/http_client.py +2 -2
  56. evalscope/perf/plugin/api/openai_api.py +11 -1
  57. evalscope/perf/utils/benchmark_util.py +6 -2
  58. evalscope/report/app.py +42 -23
  59. evalscope/run.py +11 -8
  60. evalscope/third_party/thinkbench/__init__.py +3 -0
  61. evalscope/third_party/thinkbench/eval.py +264 -0
  62. evalscope/third_party/thinkbench/infer.py +100 -0
  63. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  64. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  65. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  66. evalscope/third_party/thinkbench/tools/llm.py +47 -0
  67. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  68. evalscope/utils/chat_service.py +2 -2
  69. evalscope/utils/io_utils.py +1 -1
  70. evalscope/utils/model_utils.py +17 -1
  71. evalscope/utils/utils.py +45 -45
  72. evalscope/version.py +2 -2
  73. {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/METADATA +22 -8
  74. {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/RECORD +79 -58
  75. tests/cli/test_run.py +108 -19
  76. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  77. evalscope/metrics/math_accuracy.py +0 -200
  78. {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/LICENSE +0 -0
  79. {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/WHEEL +0 -0
  80. {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/entry_points.txt +0 -0
  81. {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/top_level.txt +0 -0
evalscope/perf/utils/benchmark_util.py CHANGED
@@ -23,6 +23,7 @@ class BenchmarkData:
  n_chunks: int = 0
  n_chunks_time: float = 0.0
  max_gpu_memory_cost = 0
+ time_per_output_token: float = 0.0

  prompt_tokens = None
  completion_tokens = None
@@ -37,6 +38,7 @@ class BenchmarkData:
  self.first_chunk_latency = self.query_latency
  self.n_chunks = 1
  self.n_chunks_time = self.query_latency
+ self.time_per_output_token = self.query_latency / self.completion_tokens

  def _calculate_tokens(self, api_plugin):
  self.prompt_tokens, self.completion_tokens = \
@@ -63,6 +65,7 @@ class BenchmarkMetrics:
  start_time: Optional[float] = None
  total_time: float = 1.0
  n_total_queries: int = 0
+ n_time_per_output_token: float = 0.0

  avg_first_chunk_latency: float = -1
  avg_latency: float = -1
@@ -92,6 +95,7 @@ class BenchmarkMetrics:
  self.total_first_chunk_latency += benchmark_data.first_chunk_latency
  self.n_total_chunks += benchmark_data.n_chunks
  self.total_chunks_time += benchmark_data.n_chunks_time
+ self.n_time_per_output_token += benchmark_data.time_per_output_token
  else:
  self.n_failed_queries += 1

@@ -108,7 +112,7 @@
  self.avg_prompt_tokens = self.n_total_prompt_tokens / self.n_succeed_queries
  self.avg_completion_tokens = self.n_total_completion_tokens / self.n_succeed_queries
  self.avg_token_per_seconds = self.n_total_completion_tokens / self.total_time
- self.avg_time_per_token = self.total_time / self.n_total_completion_tokens
+ self.avg_time_per_token = self.n_time_per_output_token / self.n_succeed_queries
  self.qps = self.n_succeed_queries / self.total_time
  except ZeroDivisionError as e:
  logger.exception(e)
@@ -125,7 +129,7 @@
  'Average QPS': round(self.qps, default_ndigits),
  'Average latency (s)': round(self.avg_latency, default_ndigits),
  'Average time to first token (s)': round(self.avg_first_chunk_latency, default_ndigits),
- 'Average time per output token (s)': round(self.avg_time_per_token, 5),
+ 'Average time per output token (s)': round(self.avg_time_per_token, default_ndigits),
  'Average input tokens per request': round(self.avg_prompt_tokens, default_ndigits),
  'Average output tokens per request': round(self.avg_completion_tokens, default_ndigits),
  'Average package latency (s)': round(self.avg_chunk_time, default_ndigits),
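
The hunks above replace an aggregate ratio (total elapsed time divided by total completion tokens) with the mean of per-query ratios. A minimal sketch of the difference, with made-up numbers and the elapsed time approximated by the sum of query latencies (in the real code it is wall-clock time, so the two formulas diverge further under concurrency):

# Hypothetical queries: (latency in seconds, completion tokens)
queries = [(2.0, 100), (1.0, 10)]

total_time = sum(latency for latency, _ in queries)    # 3.0 s, stand-in for wall-clock time
total_tokens = sum(tokens for _, tokens in queries)    # 110 tokens

old_avg = total_time / total_tokens                    # ~0.027 s/token, dominated by the long query
new_avg = sum(latency / tokens for latency, tokens in queries) / len(queries)  # 0.060 s/token

print(f'aggregate ratio: {old_avg:.3f} s/token, per-query mean: {new_avg:.3f} s/token')
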
evalscope/report/app.py CHANGED
@@ -6,6 +6,7 @@ import os
  import pandas as pd
  import plotly.express as px
  import plotly.graph_objects as go
+ import re
  from dataclasses import dataclass
  from typing import Any, List, Union

@@ -18,6 +19,9 @@ from evalscope.version import __version__
  logger = get_logger()

  PLOTLY_THEME = 'plotly_dark'
+ REPORT_TOKEN = '@@'
+ MODEL_TOKEN = '::'
+ DATASET_TOKEN = ', '


  def scan_for_report_folders(root_path):
@@ -41,8 +45,9 @@ def scan_for_report_folders(root_path):
  datasets = []
  for dataset_item in glob.glob(os.path.join(model_item, '*.json')):
  datasets.append(os.path.basename(dataset_item).split('.')[0])
- datasets = ','.join(datasets)
- reports.append(f'{os.path.basename(folder)}@{os.path.basename(model_item)}:{datasets}')
+ datasets = DATASET_TOKEN.join(datasets)
+ reports.append(
+ f'{os.path.basename(folder)}{REPORT_TOKEN}{os.path.basename(model_item)}{MODEL_TOKEN}{datasets}')

  reports = sorted(reports, reverse=True)
  logger.debug(f'reports: {reports}')
@@ -50,9 +55,9 @@


  def process_report_name(report_name: str):
- prefix, report_name = report_name.split('@')
- model_name, datasets = report_name.split(':')
- datasets = datasets.split(',')
+ prefix, report_name = report_name.split(REPORT_TOKEN)
+ model_name, datasets = report_name.split(MODEL_TOKEN)
+ datasets = datasets.split(DATASET_TOKEN)
  return prefix, model_name, datasets


@@ -170,7 +175,7 @@ def plot_single_dataset_scores(df: pd.DataFrame):
  text=df[ReportKey.score],
  barmode='group')

- width = 0.2 if len(df[ReportKey.subset_name]) <= 5 else None
+ width = 0.2 if len(df[ReportKey.subset_name]) <= 3 else None
  plot.update_traces(width=width, texttemplate='%{text:.2f}', textposition='outside')
  plot.update_layout(uniformtext_minsize=12, uniformtext_mode='hide', yaxis=dict(range=[0, 1]), template=PLOTLY_THEME)
  return plot
@@ -218,7 +223,16 @@ def dict_to_markdown(data) -> str:
  return '\n\n'.join(markdown_lines)


+ def convert_html_tags(text):
+ # match begin label
+ text = re.sub(r'<(\w+)>', r'[\1]', text)
+ # match end label
+ text = re.sub(r'</(\w+)>', r'[/\1]', text)
+ return text
+
+
  def process_string(string: str, max_length: int = 2048) -> str:
+ string = convert_html_tags(string) # for display labels e.g. `<think>`
  if len(string) > max_length:
  return f'{string[:max_length // 2]}......{string[-max_length // 2:]}'
  return string
@@ -226,9 +240,11 @@ def process_string(string: str, max_length: int = 2048) -> str:

  def process_model_prediction(item: Any):
  if isinstance(item, dict):
- return dict_to_markdown(item)
+ res = dict_to_markdown(item)
+ return process_string(res)
  elif isinstance(item, list):
- return '\n'.join([process_model_prediction(item) for item in item])
+ res = '\n'.join([process_model_prediction(item) for item in item])
+ return process_string(res)
  else:
  return process_string(str(item))

@@ -257,19 +273,20 @@ def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subs
  ds = []
  for i, item in origin_df.iterrows():
  raw_input = item['raw_input']
- raw_pred_answer = item['choices'][0]['message']['content']
- parsed_gold_answer = item['choices'][0]['review']['gold']
- parsed_pred_answer = item['choices'][0]['review']['pred']
- score = item['choices'][0]['review']['result']
- raw_d = {
- 'Input': raw_input,
- 'Generated': raw_pred_answer,
- 'Gold': parsed_gold_answer if parsed_gold_answer != raw_input else '*Same as Input*',
- 'Pred': parsed_pred_answer if parsed_pred_answer != raw_pred_answer else '*Same as Generated*',
- 'Score': score,
- 'NScore': normalize_score(score)
- }
- ds.append(raw_d)
+ for choice in item['choices']:
+ raw_pred_answer = choice['message']['content']
+ parsed_gold_answer = choice['review']['gold']
+ parsed_pred_answer = choice['review']['pred']
+ score = choice['review']['result']
+ raw_d = {
+ 'Input': raw_input,
+ 'Generated': raw_pred_answer,
+ 'Gold': parsed_gold_answer if parsed_gold_answer != raw_input else '*Same as Input*',
+ 'Pred': parsed_pred_answer if parsed_pred_answer != raw_pred_answer else '*Same as Generated*',
+ 'Score': score,
+ 'NScore': normalize_score(score)
+ }
+ ds.append(raw_d)

  df_subset = pd.DataFrame(ds)
  return df_subset
@@ -284,6 +301,8 @@ def get_table_data(data_review_df: pd.DataFrame, page: int = 1, rows_per_page: i
  end = start + rows_per_page
  df_subset = data_review_df.iloc[start:end].copy()
  df_subset['Input'] = df_subset['Input'].map(process_model_prediction).astype(str)
+ df_subset['Generated'] = df_subset['Generated'].map(process_model_prediction).astype(str)
+ df_subset['Pred'] = df_subset['Pred'].map(process_model_prediction).astype(str)
  df_subset['Score'] = df_subset['Score'].map(process_model_prediction).astype(str)
  styler = style_df(df_subset, columns=['NScore'])
  return df_subset, styler
@@ -504,8 +523,8 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
  outputs=[report_list, task_config, dataset_radio, work_dir, model_name])
  def update_single_report_data(root_path, report_name):
  report_list, datasets, task_cfg = load_single_report(root_path, report_name)
- work_dir = os.path.join(root_path, report_name.split('@')[0])
- model_name = report_name.split('@')[1].split(':')[0]
+ work_dir = os.path.join(root_path, report_name.split(REPORT_TOKEN)[0])
+ model_name = report_name.split(REPORT_TOKEN)[1].split(MODEL_TOKEN)[0]
  return (report_list, task_cfg, gr.update(choices=datasets, value=datasets[0]), work_dir, model_name)

  @report_list.change(inputs=[report_list], outputs=[score_plot, score_table, sunburst_plot])
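
The report/app.py hunks above also introduce explicit separators for report names. A minimal sketch of the scheme: process_report_name mirrors the function in the diff, while build_report_name is a hypothetical helper added only for illustration.

REPORT_TOKEN = '@@'   # run folder vs. model part
MODEL_TOKEN = '::'    # model name vs. dataset list
DATASET_TOKEN = ', '  # between dataset names

def build_report_name(folder, model, datasets):
    # Hypothetical inverse of process_report_name, for illustration only.
    return f'{folder}{REPORT_TOKEN}{model}{MODEL_TOKEN}{DATASET_TOKEN.join(datasets)}'

def process_report_name(report_name):
    prefix, rest = report_name.split(REPORT_TOKEN)
    model_name, datasets = rest.split(MODEL_TOKEN)
    return prefix, model_name, datasets.split(DATASET_TOKEN)

name = build_report_name('20250221_104202', 'deepseek-r1', ['math_500', 'gsm8k'])
print(process_report_name(name))  # ('20250221_104202', 'deepseek-r1', ['math_500', 'gsm8k'])
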
evalscope/run.py CHANGED
@@ -46,11 +46,13 @@ def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:

  def setup_work_directory(task_cfg: TaskConfig, run_time: str):
  """Set the working directory for the task."""
+ # use cache
  if task_cfg.use_cache:
  task_cfg.work_dir = task_cfg.use_cache
  logger.info(f'Set resume from {task_cfg.work_dir}')
  # elif are_paths_same(task_cfg.work_dir, DEFAULT_WORK_DIR):
- task_cfg.work_dir = os.path.join(task_cfg.work_dir, run_time)
+ else:
+ task_cfg.work_dir = os.path.join(task_cfg.work_dir, run_time)

  outputs = OutputsStructure(outputs_dir=task_cfg.work_dir)

@@ -112,8 +114,8 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
  logger.info(task_cfg)

  for evaluator in evaluators:
- res_dict = evaluator.eval(infer_cfg=task_cfg.generation_config, debug=task_cfg.debug, limit=task_cfg.limit)
- eval_results[dataset_name] = res_dict
+ res_dict = evaluator.eval()
+ eval_results[evaluator.dataset_name] = res_dict

  return eval_results

@@ -124,21 +126,22 @@ def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsSt
  from evalscope.evaluator import Evaluator
  from evalscope.models import initialize_model_adapter

+ benchmark: BenchmarkMeta = Benchmark.get(dataset_name)
+ # Initialize data adapter
+ data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
+
  if dataset_name == DataCollection.NAME:
  # EvaluatorCollection is a collection of evaluators
  from evalscope.collections import EvaluatorCollection
- return EvaluatorCollection(task_cfg, outputs)
+ return EvaluatorCollection(task_cfg, data_adapter, outputs)

- benchmark: BenchmarkMeta = Benchmark.get(dataset_name)
-
- data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
+ # Initialize model adapter
  model_adapter = initialize_model_adapter(task_cfg, benchmark.model_adapter, base_model)

  # update task_cfg.dataset_args
  task_cfg.dataset_args[dataset_name] = benchmark.to_string_dict()

  return Evaluator(
- dataset_name_or_path=benchmark.dataset_id,
  data_adapter=data_adapter,
  model_adapter=model_adapter,
  outputs=outputs,
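
The setup_work_directory fix means a cached run directory is now reused as-is, while fresh runs still get a timestamped sub-directory; previously the timestamp was appended in both cases. A simplified sketch of the corrected branching (the function name here is illustrative, not the evalscope API):

import os
from typing import Optional

def resolve_work_dir(work_dir: str, run_time: str, use_cache: Optional[str] = None) -> str:
    if use_cache:
        return use_cache                      # resume from the cached run directory
    return os.path.join(work_dir, run_time)   # otherwise create a timestamped output directory

print(resolve_work_dir('outputs', '20250221_104202'))                           # outputs/20250221_104202
print(resolve_work_dir('outputs', '20250221_104202', use_cache='outputs/old'))  # outputs/old
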
evalscope/third_party/thinkbench/__init__.py ADDED
@@ -0,0 +1,3 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ from evalscope.third_party.thinkbench.eval import run_task
evalscope/third_party/thinkbench/eval.py ADDED
@@ -0,0 +1,264 @@
+ import json
+ import os
+ import pandas as pd
+ import plotly.graph_objects as go
+ import re
+ from collections import defaultdict
+ from functools import lru_cache
+ from modelscope import AutoTokenizer
+ from plotly.subplots import make_subplots
+ from tqdm.contrib.concurrent import thread_map
+ from typing import List
+
+ from evalscope.third_party.thinkbench.tools.llm import request_url
+ from evalscope.third_party.thinkbench.tools.utils import extract_answer
+ from evalscope.utils.io_utils import dump_jsonl_data
+
+ cur_path = os.path.dirname(os.path.abspath(__file__))
+
+ class EvalThink:
+ def __init__(self, report_path, tokenizer_path, model_name, dataset_name, subsets, split_strategies='llm', judge_config=None):
+ self.report_path = report_path
+ self.reformat_template = open(os.path.join(cur_path, 'resources/reformat_template.txt'), 'r').read()
+ self.critique_template = open(os.path.join(cur_path, 'resources/critique_template.txt'), 'r').read()
+ self.switch_tokens = ['alternatively', 'but wait', 'let me reconsider', 'another way', 'another approach', 'another method', 'another angle']
+ self.subset_dict = defaultdict(lambda: defaultdict(list))
+ self.think_end_token = '</think>'
+ self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
+ self.model_name = model_name
+ self.dataset_name = dataset_name
+ self.subsets = subsets
+ self.metrics = ['token_efficiency', 'completion_len', 'thought_num', 'accuracy']
+ self.split_strategies = split_strategies # split by llm, keywords, separator
+ self.judge_config = judge_config
+
+ @lru_cache(maxsize=None)
+ def get_think_part(self, text):
+ last_think_end = text.rfind(self.think_end_token)
+ return text[:last_think_end].lower()
+
+ @lru_cache(maxsize=None)
+ def cal_tokens(self, text: str):
+ return len(self.tokenizer.encode(text, add_special_tokens=False))
+
+ def process_choice(self, choice, problem):
+ think_part = self.get_think_part(choice['message']['content'])
+ answer = choice['review']['gold']
+ tokens = self.cal_tokens(think_part)
+ switch_count = sum(think_part.count(token) for token in self.switch_tokens)
+ useful_tokens = self.cal_tokens(self.get_first_correct(think_part, problem, answer))
+ score = choice['review']['result']
+ return tokens, switch_count, useful_tokens, score
+
+ def process_item(self, item):
+ problem = item['raw_input'].get('question') or item['raw_input'].get('problem') or ''
+ results = []
+ for choice in item['choices']:
+ results.append(self.process_choice(choice, problem))
+ break # only process the first choice
+
+ tokens, switch_counts, useful_tokens, scores = zip(*results)
+
+ avg_tokens = sum(tokens) / len(tokens)
+ avg_thought_num = sum(switch_counts) / len(switch_counts)
+ avg_token_efficiency = sum(useful_tokens) / sum(tokens)
+ avg_accuracy = sum(scores) / len(scores)
+
+ return avg_tokens, avg_thought_num, avg_token_efficiency, avg_accuracy
+
+ def split_by_llm(self, response, problem) -> List[str]:
+ response = response.replace('\n', ' ') # remove newline characters
+ prompt = self.reformat_template.format(problem=problem, response=response)
+ llm_response = request_url(self.judge_config, prompt)
+ return llm_response.split('\n\n')
+
+ def split_by_keywords(self, text) -> List[str]:
+ pattern = r'(?=\b(?:{})\b)'.format('|'.join(map(re.escape, self.switch_tokens)))
+ segments = re.split(pattern, text)
+ # remove empty segments
+ segments = [segment.strip() for segment in segments if segment.strip()]
+
+ return segments if segments else [text]
+
+ def split_by_separator(self, text) -> List[str]:
+ return text.split('\n\n')
+
+ def get_answer_index(self, response: List[str], problem: str, answer: str) -> int:
+ tagged_response = ''
+ for sdx, step in enumerate(response):
+ tagged_response += f'<paragraph_{sdx}>\n{step}\n</paragraph_{sdx}>\n\n'
+ tagged_response = tagged_response.strip()
+
+ prompt = self.critique_template.format(problem=problem, answer=answer, tagged_response=tagged_response)
+ llm_response = request_url(self.judge_config, prompt)
+ answer_index = extract_answer(llm_response)
+
+ dump_jsonl_data({'prompt': prompt, 'response': llm_response, 'answer_index': answer_index},
+ os.path.join(self.report_path, 'answer_index.jsonl'),
+ dump_mode='append')
+ try:
+ answer_index = int(answer_index)
+ except Exception:
+ answer_index = -1
+ return answer_index
+
+ def get_first_correct(self, response: str, problem: str, answer: str) -> str:
+ if self.split_strategies == 'llm':
+ text_list = self.split_by_llm(response, problem)
+ elif self.split_strategies == 'keywords':
+ text_list = self.split_by_keywords(response)
+ else:
+ text_list = self.split_by_separator(response)
+
+ answer_index = self.get_answer_index(text_list, problem, answer)
+
+ if answer_index == -1: # no correct answer found
+ first_correct = ''
+ else:
+ first_correct = '\n\n'.join(text_list[: answer_index])
+ return first_correct
+
+ def plot_metrics(self, results, output_dir):
+ fig = make_subplots(rows=1, cols=len(self.metrics),
+ subplot_titles=('Token Efficiency', 'Completion Length', 'Thought Num', 'Accuracy'),
+ shared_xaxes=True, x_title='Subsets')
+
+
+ for i, metric in enumerate(self.metrics, start=1):
+ y_values = [results[metric][subset] for subset in self.subsets]
+ fig.add_trace(
+ go.Scatter(x=list(range(len(self.subsets))), y=y_values,
+ mode='lines+markers',
+ name=metric.replace('_', ' ').title()),
+ row=1, col=i
+ )
+ # Add annotations for each data point
+ for j, y in enumerate(y_values):
+ fig.add_annotation(
+ x=j,
+ y=y,
+ text=f'{y:.2f}',
+ showarrow=False,
+ yshift=10,
+ row=1,
+ col=i
+ )
+
+ fig.update_layout(
+ height=500,
+ width=1500,
+ title_text=f'Evaluation Metrics for {self.model_name} on {self.dataset_name}',
+ legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1)
+ )
+
+ for i in range(1, len(self.metrics) + 1):
+ fig.update_xaxes(
+ ticktext=self.subsets,
+ tickvals=list(range(len(self.subsets))),
+ row=1, col=i
+ )
+ fig.update_yaxes(title_text=self.metrics[i-1].replace('_', ' ').title(), row=1, col=i)
+ # Update y-axis ranges
+ fig.update_yaxes(range=[0, 1], row=1, col=1) # Token Efficiency
+ fig.update_yaxes(range=[0, 13], row=1, col=3) # Switch Frequency
+ fig.update_yaxes(range=[0, 1], row=1, col=4) # Accuracy
+
+ os.makedirs(output_dir, exist_ok=True)
+ output_path = os.path.join(output_dir, f'{self.model_name}_{self.dataset_name}_metrics.png')
+ fig.write_image(output_path)
+ print(f'save figure to: {output_path}')
+
+
+
+ def filter_df(self, df, response_len: int = 8000, count: int=10):
+ def is_valid_row(row):
+ return all(self.cal_tokens(choice['message']['content']) <= response_len for choice in row['choices'])
+
+ bools = df.apply(is_valid_row, axis=1)
+
+ return df[bools].head(count)
+
+
+ def evaluate(self, output_dir, max_tokens=8000, count=50):
+ for subset in self.subsets:
+ review_path = os.path.join(self.report_path, 'reviews', self.model_name, f'{self.dataset_name}_{subset}.jsonl')
+ review_df = pd.read_json(review_path, lines=True)
+
+ review_df = self.filter_df(review_df, response_len=max_tokens, count=count)
+
+ results = thread_map(
+ self.process_item,
+ (item for _, item in review_df.iterrows()),
+ desc=f'Evaluating {subset}',
+ total=len(review_df),
+ max_workers=16
+ )
+
+ avg_tokens, avg_thought_num, avg_token_efficiency, avg_accuracy = zip(*results)
+
+ self.subset_dict[subset]['completion_len'] = sum(avg_tokens) / len(avg_tokens)
+ self.subset_dict[subset]['thought_num'] = sum(avg_thought_num) / len(avg_thought_num)
+ self.subset_dict[subset]['token_efficiency'] = sum(avg_token_efficiency) / len(avg_token_efficiency)
+ self.subset_dict[subset]['accuracy'] = sum(avg_accuracy) / len(avg_accuracy)
+
+
+ results = {metric: {subset: self.subset_dict[subset][metric] for subset in self.subsets}
+ for metric in self.metrics}
+
+ self.plot_metrics(results, output_dir)
+
+ return results
+
+ def run_task(config, output_dir='outputs', max_tokens=8000, count=50):
+ evaluator = EvalThink(**config,)
+ results = evaluator.evaluate(output_dir, max_tokens, count)
+ print(results)
+
+ judge_config = dict(
+ api_key='EMPTY',
+ base_url='http://0.0.0.0:8801/v1',
+ model_name='Qwen2.5-72B-Instruct',
+ )
+
+ distill_qwen_config = dict(
+ report_path = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250218_180219',
+ model_name = 'DeepSeek-R1-Distill-Qwen-7B',
+ tokenizer_path = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
+ dataset_name = 'math_500',
+ subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+ split_strategies='separator',
+ judge_config=judge_config
+ )
+
+ math_qwen_config = dict(
+ report_path = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250219_202358',
+ model_name = 'Qwen2.5-Math-7B-Instruct',
+ tokenizer_path = 'Qwen/Qwen2.5-Math-7B-Instruct',
+ dataset_name = 'math_500',
+ subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+ split_strategies='separator'
+ )
+
+ r1_config = dict(
+ report_path = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250221_104202',
+ model_name = 'deepseek-r1',
+ tokenizer_path = 'deepseek-ai/DeepSeek-R1',
+ dataset_name = 'math_500',
+ subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+ split_strategies='separator'
+ )
+
+ qwq_config = dict(
+ report_path = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250221_105911',
+ model_name = 'qwq-32b-preview',
+ tokenizer_path = 'Qwen/QwQ-32B-Preview',
+ dataset_name = 'math_500',
+ subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+ split_strategies='separator'
+ )
+
+ if __name__ == '__main__':
+ run_task(distill_qwen_config)
+ # run_task(math_qwen_config)
+ # run_task(r1_config)
+ # run_task(qwq_config)
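
For orientation, the token_efficiency metric computed in EvalThink.process_item above is the share of thinking tokens spent before the earliest correct answer. A tiny worked example with made-up counts (the shipped code scores only the first choice per item, but the formula is the same):

tokens = [1200, 800]        # total <think> tokens per response (hypothetical)
useful_tokens = [300, 800]  # tokens up to the earliest correct answer per response (hypothetical)

token_efficiency = sum(useful_tokens) / sum(tokens)
print(f'token efficiency: {token_efficiency:.2f}')  # 0.55
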
evalscope/third_party/thinkbench/infer.py ADDED
@@ -0,0 +1,100 @@
+ import os
+
+ from evalscope import TaskConfig, run_task
+
+
+ def eval_distill_qwen():
+ model_name = 'DeepSeek-R1-Distill-Qwen-7B'
+ dataset_name = 'math_500'
+ subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
+
+ task_config = TaskConfig(
+ api_url='http://0.0.0.0:8801/v1/chat/completions',
+ model=model_name,
+ eval_type='service',
+ datasets=[dataset_name],
+ dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
+ eval_batch_size=32,
+ generation_config={
+ 'max_tokens': 20000, # avoid exceed max length
+ 'temperature': 0.6,
+ 'top_p': 0.95,
+ 'n': 1,
+ },
+ )
+ run_task(task_config)
+
+
+ def eval_math_qwen():
+ model_name = 'Qwen2.5-Math-7B-Instruct'
+ dataset_name = 'math_500'
+ subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
+
+ task_config = TaskConfig(
+ api_url='http://0.0.0.0:8801/v1/chat/completions',
+ model=model_name,
+ eval_type='service',
+ datasets=[dataset_name],
+ dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
+ eval_batch_size=32,
+ generation_config={
+ 'max_tokens': 3000, # avoid exceed max length
+ 'temperature': 0.6,
+ 'top_p': 0.95,
+ 'n': 3,
+ },
+ )
+ run_task(task_config)
+
+ def eval_r1():
+ model_name = 'deepseek-r1'
+ dataset_name = 'math_500'
+ subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
+
+ task_config = TaskConfig(
+ api_url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+ api_key=os.environ['DASHSCOPE_API_KEY'],
+ model=model_name,
+ eval_type='service',
+ datasets=[dataset_name],
+ dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
+ eval_batch_size=3,
+ generation_config={
+ 'max_tokens': 12000, # avoid exceed max length
+ 'temperature': 0.6,
+ 'top_p': 0.95,
+ 'n': 1,
+ },
+ limit=50,
+ use_cache='/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250221_104202'
+ )
+ run_task(task_config)
+
+ def eval_qwq():
+ model_name = 'qwq-32b-preview'
+ dataset_name = 'math_500'
+ subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
+
+ task_config = TaskConfig(
+ api_url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+ api_key=os.environ['DASHSCOPE_API_KEY'],
+ model=model_name,
+ eval_type='service',
+ datasets=[dataset_name],
+ dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
+ eval_batch_size=32,
+ generation_config={
+ 'max_tokens': 8000, # avoid exceed max length
+ 'temperature': 0.6,
+ 'top_p': 0.95,
+ 'n': 1,
+ },
+ use_cache='/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250221_105911'
+ )
+ run_task(task_config)
+
+ if __name__ == '__main__':
+ # eval_distill_qwen()
+ # eval_math_qwen()
+ # eval_r1()
+ eval_qwq()
evalscope/third_party/thinkbench/resources/critique_template.txt ADDED
@@ -0,0 +1,17 @@
+ The following is a math problem and a solution (split into paragraphs, enclosed with tags and indexed from 0):
+
+ [Math Problem]
+
+ {problem}
+
+ [Correct Answer]
+
+ {answer}
+
+ [Solution]
+
+ {tagged_response}
+
+ Your task is to review and critique the solution paragraph by paragraph. Once you identify an correct answer in a paragraph, return the index of the paragraph where the earliest correct answer occurs. Otherwise, return the index of -1 (which typically denotes "not found").
+
+ Please put your final answer (i.e., the index) in \boxed{{}}.
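
A minimal sketch of how EvalThink.get_answer_index (in eval.py above) fills this template; the problem, answer, and tagged response are made up, and the path is assumed to resolve relative to the thinkbench package, as in eval.py:

with open('resources/critique_template.txt') as f:  # path assumption, mirroring eval.py's cur_path join
    critique_template = f.read()

tagged_response = '<paragraph_0>\n2 + 2 = 4\n</paragraph_0>'
prompt = critique_template.format(problem='What is 2 + 2?', answer='4', tagged_response=tagged_response)
# The judge model is expected to reply with the index wrapped in \boxed{...}, which extract_answer parses.
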
evalscope/third_party/thinkbench/resources/reformat_template.txt ADDED
@@ -0,0 +1,31 @@
+ I will present you with a solution to a math problem. Unfortunately, the solution lacks proper paragraphing, making it hard to read. Your task is to improve readability by reformatting the solution into well-structured paragraphs. Follow these specific guidelines:
+
+ * Insert \n\n for paragraph breaks within the original solution. Do **NOT** alter any content of the original solution (the only exception is for itemized lists; see below).
+
+ - Each paragraph should represent a distinct, concise reasoning step that logically advances the solution.
+
+ - Reasoning steps can include case discussions, formula simplifications, or formula derivations. Each of these should be treated as an individual reasoning step and paragraphed accordingly.
+
+ - If an introductory analysis exists in the original solution, treat it as an initial reasoning step and place it as the first paragraph.
+
+ - Do **NOT** place any mathematical formulas in their own separate paragraphs; instead, include them within the same paragraph as the preceding text to form a cohesive reasoning step.
+
+ * For any itemized lists (ordered or unordered), convert them into a written format, such as "First/Second/Third." This is the **ONLY** content modification allowed.
+
+ * Avoid making paragraphs too lengthy, as long paragraphs might contain multiple reasoning steps that should be paragraphed separately.
+
+ * Disregard the accuracy of the solution content. Do **NOT** alter any of the original solution's content; focus solely on structuring it into logical, readable paragraphs.
+
+ * Reply with the reformatted solution directly.
+
+ --------------------------------------------------
+
+ Here is the math problem, and the solution that needs to be reformatted:
+
+ [Math Problem]
+
+ {problem}
+
+ [Solution]
+
+ {response}