evalscope 0.10.1__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/arguments.py +3 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +49 -0
- evalscope/benchmarks/aime/aime25_adapter.py +49 -0
- evalscope/benchmarks/arc/arc_adapter.py +5 -7
- evalscope/benchmarks/bbh/bbh_adapter.py +17 -14
- evalscope/benchmarks/benchmark.py +5 -3
- evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
- evalscope/benchmarks/competition_math/competition_math_adapter.py +21 -24
- evalscope/benchmarks/data_adapter.py +88 -29
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +125 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +10 -11
- evalscope/benchmarks/gpqa/gpqa_adapter.py +27 -9
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +9 -14
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
- evalscope/benchmarks/ifeval/ifeval_adapter.py +15 -14
- evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +58 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +32 -36
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +68 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/critique_template.txt +13 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +96 -0
- evalscope/benchmarks/race/race_adapter.py +3 -3
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +9 -9
- evalscope/cli/start_app.py +4 -1
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +4 -2
- evalscope/collections/evaluator.py +109 -39
- evalscope/collections/sampler.py +2 -1
- evalscope/collections/schema.py +1 -2
- evalscope/config.py +4 -1
- evalscope/evaluator/evaluator.py +81 -65
- evalscope/metrics/__init__.py +2 -1
- evalscope/metrics/math_parser.py +526 -0
- evalscope/metrics/metrics.py +39 -3
- evalscope/metrics/named_metrics.py +31 -7
- evalscope/models/base_adapter.py +7 -1
- evalscope/models/chat_adapter.py +69 -49
- evalscope/models/choice_adapter.py +52 -45
- evalscope/models/custom_adapter.py +2 -2
- evalscope/models/local_model.py +7 -2
- evalscope/models/server_adapter.py +106 -61
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +5 -1
- evalscope/perf/http_client.py +2 -2
- evalscope/perf/plugin/api/openai_api.py +11 -1
- evalscope/perf/utils/benchmark_util.py +6 -2
- evalscope/report/app.py +42 -23
- evalscope/run.py +11 -8
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +264 -0
- evalscope/third_party/thinkbench/infer.py +100 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +47 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/utils/chat_service.py +2 -2
- evalscope/utils/io_utils.py +1 -1
- evalscope/utils/model_utils.py +17 -1
- evalscope/utils/utils.py +45 -45
- evalscope/version.py +2 -2
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/METADATA +22 -8
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/RECORD +79 -58
- tests/cli/test_run.py +108 -19
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/metrics/math_accuracy.py +0 -200
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/LICENSE +0 -0
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/WHEEL +0 -0
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/top_level.txt +0 -0
evalscope/perf/utils/benchmark_util.py
CHANGED

@@ -23,6 +23,7 @@ class BenchmarkData:
     n_chunks: int = 0
     n_chunks_time: float = 0.0
     max_gpu_memory_cost = 0
+    time_per_output_token: float = 0.0

     prompt_tokens = None
     completion_tokens = None
@@ -37,6 +38,7 @@ class BenchmarkData:
         self.first_chunk_latency = self.query_latency
         self.n_chunks = 1
         self.n_chunks_time = self.query_latency
+        self.time_per_output_token = self.query_latency / self.completion_tokens

     def _calculate_tokens(self, api_plugin):
         self.prompt_tokens, self.completion_tokens = \
@@ -63,6 +65,7 @@ class BenchmarkMetrics:
     start_time: Optional[float] = None
     total_time: float = 1.0
     n_total_queries: int = 0
+    n_time_per_output_token: float = 0.0

     avg_first_chunk_latency: float = -1
     avg_latency: float = -1
@@ -92,6 +95,7 @@ class BenchmarkMetrics:
             self.total_first_chunk_latency += benchmark_data.first_chunk_latency
             self.n_total_chunks += benchmark_data.n_chunks
             self.total_chunks_time += benchmark_data.n_chunks_time
+            self.n_time_per_output_token += benchmark_data.time_per_output_token
         else:
             self.n_failed_queries += 1

@@ -108,7 +112,7 @@ class BenchmarkMetrics:
             self.avg_prompt_tokens = self.n_total_prompt_tokens / self.n_succeed_queries
             self.avg_completion_tokens = self.n_total_completion_tokens / self.n_succeed_queries
             self.avg_token_per_seconds = self.n_total_completion_tokens / self.total_time
-            self.avg_time_per_token = self.
+            self.avg_time_per_token = self.n_time_per_output_token / self.n_succeed_queries
             self.qps = self.n_succeed_queries / self.total_time
         except ZeroDivisionError as e:
             logger.exception(e)
@@ -125,7 +129,7 @@ class BenchmarkMetrics:
             'Average QPS': round(self.qps, default_ndigits),
             'Average latency (s)': round(self.avg_latency, default_ndigits),
             'Average time to first token (s)': round(self.avg_first_chunk_latency, default_ndigits),
-            'Average time per output token (s)': round(self.avg_time_per_token,
+            'Average time per output token (s)': round(self.avg_time_per_token, default_ndigits),
             'Average input tokens per request': round(self.avg_prompt_tokens, default_ndigits),
             'Average output tokens per request': round(self.avg_completion_tokens, default_ndigits),
             'Average package latency (s)': round(self.avg_chunk_time, default_ndigits),
evalscope/report/app.py
CHANGED
@@ -6,6 +6,7 @@ import os
 import pandas as pd
 import plotly.express as px
 import plotly.graph_objects as go
+import re
 from dataclasses import dataclass
 from typing import Any, List, Union

@@ -18,6 +19,9 @@ from evalscope.version import __version__
 logger = get_logger()

 PLOTLY_THEME = 'plotly_dark'
+REPORT_TOKEN = '@@'
+MODEL_TOKEN = '::'
+DATASET_TOKEN = ', '


 def scan_for_report_folders(root_path):
@@ -41,8 +45,9 @@ def scan_for_report_folders(root_path):
         datasets = []
         for dataset_item in glob.glob(os.path.join(model_item, '*.json')):
             datasets.append(os.path.basename(dataset_item).split('.')[0])
-        datasets =
-        reports.append(
+        datasets = DATASET_TOKEN.join(datasets)
+        reports.append(
+            f'{os.path.basename(folder)}{REPORT_TOKEN}{os.path.basename(model_item)}{MODEL_TOKEN}{datasets}')

     reports = sorted(reports, reverse=True)
     logger.debug(f'reports: {reports}')
@@ -50,9 +55,9 @@ def scan_for_report_folders(root_path):


 def process_report_name(report_name: str):
-    prefix, report_name = report_name.split(
-    model_name, datasets = report_name.split(
-    datasets = datasets.split(
+    prefix, report_name = report_name.split(REPORT_TOKEN)
+    model_name, datasets = report_name.split(MODEL_TOKEN)
+    datasets = datasets.split(DATASET_TOKEN)
     return prefix, model_name, datasets


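A standalone sketch of how the new token constants compose and parse a report entry, mirroring scan_for_report_folders and process_report_name above; the folder, model, and dataset names are hypothetical.

# Mirrors the report-name composition/parsing from the hunks above; made-up names.
REPORT_TOKEN = '@@'
MODEL_TOKEN = '::'
DATASET_TOKEN = ', '

folder, model, datasets = '20250218_180219', 'Qwen2.5-7B-Instruct', ['gsm8k', 'math_500']
report_name = f'{folder}{REPORT_TOKEN}{model}{MODEL_TOKEN}{DATASET_TOKEN.join(datasets)}'
# -> '20250218_180219@@Qwen2.5-7B-Instruct::gsm8k, math_500'

prefix, rest = report_name.split(REPORT_TOKEN)
model_name, dataset_str = rest.split(MODEL_TOKEN)
assert (prefix, model_name, dataset_str.split(DATASET_TOKEN)) == (folder, model, datasets)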
@@ -170,7 +175,7 @@ def plot_single_dataset_scores(df: pd.DataFrame):
                 text=df[ReportKey.score],
                 barmode='group')

-    width = 0.2 if len(df[ReportKey.subset_name]) <=
+    width = 0.2 if len(df[ReportKey.subset_name]) <= 3 else None
     plot.update_traces(width=width, texttemplate='%{text:.2f}', textposition='outside')
     plot.update_layout(uniformtext_minsize=12, uniformtext_mode='hide', yaxis=dict(range=[0, 1]), template=PLOTLY_THEME)
     return plot
@@ -218,7 +223,16 @@ def dict_to_markdown(data) -> str:
     return '\n\n'.join(markdown_lines)


+def convert_html_tags(text):
+    # match begin label
+    text = re.sub(r'<(\w+)>', r'[\1]', text)
+    # match end label
+    text = re.sub(r'</(\w+)>', r'[/\1]', text)
+    return text
+
+
 def process_string(string: str, max_length: int = 2048) -> str:
+    string = convert_html_tags(string)  # for display labels e.g. `<think>`
     if len(string) > max_length:
         return f'{string[:max_length // 2]}......{string[-max_length // 2:]}'
     return string
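The new convert_html_tags pass rewrites angle-bracket tags into square brackets before display; per the inline comment this is for labels such as `<think>` in model output (presumably so the report UI does not treat them as HTML, which is my reading rather than something stated in the diff). A standalone illustration using the same substitutions:

import re

# Same two substitutions as convert_html_tags in the hunk above.
def convert_html_tags(text: str) -> str:
    text = re.sub(r'<(\w+)>', r'[\1]', text)    # opening tags: <think> -> [think]
    return re.sub(r'</(\w+)>', r'[/\1]', text)  # closing tags: </think> -> [/think]

print(convert_html_tags('<think>try a direct proof...</think> The answer is 42.'))
# [think]try a direct proof...[/think] The answer is 42.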
@@ -226,9 +240,11 @@ def process_string(string: str, max_length: int = 2048) -> str:

 def process_model_prediction(item: Any):
     if isinstance(item, dict):
-
+        res = dict_to_markdown(item)
+        return process_string(res)
     elif isinstance(item, list):
-
+        res = '\n'.join([process_model_prediction(item) for item in item])
+        return process_string(res)
     else:
         return process_string(str(item))

@@ -257,19 +273,20 @@ def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subs
     ds = []
     for i, item in origin_df.iterrows():
         raw_input = item['raw_input']
-
-
-
-
-
-
-
-
-
-
-
-
-
+        for choice in item['choices']:
+            raw_pred_answer = choice['message']['content']
+            parsed_gold_answer = choice['review']['gold']
+            parsed_pred_answer = choice['review']['pred']
+            score = choice['review']['result']
+            raw_d = {
+                'Input': raw_input,
+                'Generated': raw_pred_answer,
+                'Gold': parsed_gold_answer if parsed_gold_answer != raw_input else '*Same as Input*',
+                'Pred': parsed_pred_answer if parsed_pred_answer != raw_pred_answer else '*Same as Generated*',
+                'Score': score,
+                'NScore': normalize_score(score)
+            }
+            ds.append(raw_d)

     df_subset = pd.DataFrame(ds)
     return df_subset
@@ -284,6 +301,8 @@ def get_table_data(data_review_df: pd.DataFrame, page: int = 1, rows_per_page: i
     end = start + rows_per_page
     df_subset = data_review_df.iloc[start:end].copy()
     df_subset['Input'] = df_subset['Input'].map(process_model_prediction).astype(str)
+    df_subset['Generated'] = df_subset['Generated'].map(process_model_prediction).astype(str)
+    df_subset['Pred'] = df_subset['Pred'].map(process_model_prediction).astype(str)
     df_subset['Score'] = df_subset['Score'].map(process_model_prediction).astype(str)
     styler = style_df(df_subset, columns=['NScore'])
     return df_subset, styler
@@ -504,8 +523,8 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
         outputs=[report_list, task_config, dataset_radio, work_dir, model_name])
     def update_single_report_data(root_path, report_name):
         report_list, datasets, task_cfg = load_single_report(root_path, report_name)
-        work_dir = os.path.join(root_path, report_name.split(
-        model_name = report_name.split(
+        work_dir = os.path.join(root_path, report_name.split(REPORT_TOKEN)[0])
+        model_name = report_name.split(REPORT_TOKEN)[1].split(MODEL_TOKEN)[0]
         return (report_list, task_cfg, gr.update(choices=datasets, value=datasets[0]), work_dir, model_name)

     @report_list.change(inputs=[report_list], outputs=[score_plot, score_table, sunburst_plot])
evalscope/run.py
CHANGED
@@ -46,11 +46,13 @@ def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:

 def setup_work_directory(task_cfg: TaskConfig, run_time: str):
     """Set the working directory for the task."""
+    # use cache
     if task_cfg.use_cache:
         task_cfg.work_dir = task_cfg.use_cache
         logger.info(f'Set resume from {task_cfg.work_dir}')
     # elif are_paths_same(task_cfg.work_dir, DEFAULT_WORK_DIR):
-
+    else:
+        task_cfg.work_dir = os.path.join(task_cfg.work_dir, run_time)

     outputs = OutputsStructure(outputs_dir=task_cfg.work_dir)

@@ -112,8 +114,8 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
     logger.info(task_cfg)

     for evaluator in evaluators:
-        res_dict = evaluator.eval(
-        eval_results[dataset_name] = res_dict
+        res_dict = evaluator.eval()
+        eval_results[evaluator.dataset_name] = res_dict

     return eval_results

@@ -124,21 +126,22 @@ def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsSt
     from evalscope.evaluator import Evaluator
     from evalscope.models import initialize_model_adapter

+    benchmark: BenchmarkMeta = Benchmark.get(dataset_name)
+    # Initialize data adapter
+    data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
+
     if dataset_name == DataCollection.NAME:
         # EvaluatorCollection is a collection of evaluators
         from evalscope.collections import EvaluatorCollection
-        return EvaluatorCollection(task_cfg, outputs)
+        return EvaluatorCollection(task_cfg, data_adapter, outputs)

-
-
-    data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
+    # Initialize model adapter
     model_adapter = initialize_model_adapter(task_cfg, benchmark.model_adapter, base_model)

     # update task_cfg.dataset_args
     task_cfg.dataset_args[dataset_name] = benchmark.to_string_dict()

     return Evaluator(
-        dataset_name_or_path=benchmark.dataset_id,
         data_adapter=data_adapter,
         model_adapter=model_adapter,
         outputs=outputs,
evalscope/third_party/thinkbench/eval.py
ADDED

@@ -0,0 +1,264 @@
+import json
+import os
+import pandas as pd
+import plotly.graph_objects as go
+import re
+from collections import defaultdict
+from functools import lru_cache
+from modelscope import AutoTokenizer
+from plotly.subplots import make_subplots
+from tqdm.contrib.concurrent import thread_map
+from typing import List
+
+from evalscope.third_party.thinkbench.tools.llm import request_url
+from evalscope.third_party.thinkbench.tools.utils import extract_answer
+from evalscope.utils.io_utils import dump_jsonl_data
+
+cur_path = os.path.dirname(os.path.abspath(__file__))
+
+class EvalThink:
+    def __init__(self, report_path, tokenizer_path, model_name, dataset_name, subsets, split_strategies='llm', judge_config=None):
+        self.report_path = report_path
+        self.reformat_template = open(os.path.join(cur_path, 'resources/reformat_template.txt'), 'r').read()
+        self.critique_template = open(os.path.join(cur_path, 'resources/critique_template.txt'), 'r').read()
+        self.switch_tokens = ['alternatively', 'but wait', 'let me reconsider', 'another way', 'another approach', 'another method', 'another angle']
+        self.subset_dict = defaultdict(lambda: defaultdict(list))
+        self.think_end_token = '</think>'
+        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
+        self.model_name = model_name
+        self.dataset_name = dataset_name
+        self.subsets = subsets
+        self.metrics = ['token_efficiency', 'completion_len', 'thought_num', 'accuracy']
+        self.split_strategies = split_strategies  # split by llm, keywords, separator
+        self.judge_config = judge_config
+
+    @lru_cache(maxsize=None)
+    def get_think_part(self, text):
+        last_think_end = text.rfind(self.think_end_token)
+        return text[:last_think_end].lower()
+
+    @lru_cache(maxsize=None)
+    def cal_tokens(self, text: str):
+        return len(self.tokenizer.encode(text, add_special_tokens=False))
+
+    def process_choice(self, choice, problem):
+        think_part = self.get_think_part(choice['message']['content'])
+        answer = choice['review']['gold']
+        tokens = self.cal_tokens(think_part)
+        switch_count = sum(think_part.count(token) for token in self.switch_tokens)
+        useful_tokens = self.cal_tokens(self.get_first_correct(think_part, problem, answer))
+        score = choice['review']['result']
+        return tokens, switch_count, useful_tokens, score
+
+    def process_item(self, item):
+        problem = item['raw_input'].get('question') or item['raw_input'].get('problem') or ''
+        results = []
+        for choice in item['choices']:
+            results.append(self.process_choice(choice, problem))
+            break  # only process the first choice
+
+        tokens, switch_counts, useful_tokens, scores = zip(*results)
+
+        avg_tokens = sum(tokens) / len(tokens)
+        avg_thought_num = sum(switch_counts) / len(switch_counts)
+        avg_token_efficiency = sum(useful_tokens) / sum(tokens)
+        avg_accuracy = sum(scores) / len(scores)
+
+        return avg_tokens, avg_thought_num, avg_token_efficiency, avg_accuracy
+
+    def split_by_llm(self, response, problem) -> List[str]:
+        response = response.replace('\n', ' ')  # remove newline characters
+        prompt = self.reformat_template.format(problem=problem, response=response)
+        llm_response = request_url(self.judge_config, prompt)
+        return llm_response.split('\n\n')
+
+    def split_by_keywords(self, text) -> List[str]:
+        pattern = r'(?=\b(?:{})\b)'.format('|'.join(map(re.escape, self.switch_tokens)))
+        segments = re.split(pattern, text)
+        # remove empty segments
+        segments = [segment.strip() for segment in segments if segment.strip()]
+
+        return segments if segments else [text]
+
+    def split_by_separator(self, text) -> List[str]:
+        return text.split('\n\n')
+
+    def get_answer_index(self, response: List[str], problem: str, answer: str) -> int:
+        tagged_response = ''
+        for sdx, step in enumerate(response):
+            tagged_response += f'<paragraph_{sdx}>\n{step}\n</paragraph_{sdx}>\n\n'
+        tagged_response = tagged_response.strip()
+
+        prompt = self.critique_template.format(problem=problem, answer=answer, tagged_response=tagged_response)
+        llm_response = request_url(self.judge_config, prompt)
+        answer_index = extract_answer(llm_response)
+
+        dump_jsonl_data({'prompt': prompt, 'response': llm_response, 'answer_index': answer_index},
+                        os.path.join(self.report_path, 'answer_index.jsonl'),
+                        dump_mode='append')
+        try:
+            answer_index = int(answer_index)
+        except Exception:
+            answer_index = -1
+        return answer_index
+
+    def get_first_correct(self, response: str, problem: str, answer: str) -> str:
+        if self.split_strategies == 'llm':
+            text_list = self.split_by_llm(response, problem)
+        elif self.split_strategies == 'keywords':
+            text_list = self.split_by_keywords(response)
+        else:
+            text_list = self.split_by_separator(response)
+
+        answer_index = self.get_answer_index(text_list, problem, answer)
+
+        if answer_index == -1:  # no correct answer found
+            first_correct = ''
+        else:
+            first_correct = '\n\n'.join(text_list[: answer_index])
+        return first_correct
+
+    def plot_metrics(self, results, output_dir):
+        fig = make_subplots(rows=1, cols=len(self.metrics),
+                            subplot_titles=('Token Efficiency', 'Completion Length', 'Thought Num', 'Accuracy'),
+                            shared_xaxes=True, x_title='Subsets')
+
+
+        for i, metric in enumerate(self.metrics, start=1):
+            y_values = [results[metric][subset] for subset in self.subsets]
+            fig.add_trace(
+                go.Scatter(x=list(range(len(self.subsets))), y=y_values,
+                           mode='lines+markers',
+                           name=metric.replace('_', ' ').title()),
+                row=1, col=i
+            )
+            # Add annotations for each data point
+            for j, y in enumerate(y_values):
+                fig.add_annotation(
+                    x=j,
+                    y=y,
+                    text=f'{y:.2f}',
+                    showarrow=False,
+                    yshift=10,
+                    row=1,
+                    col=i
+                )
+
+        fig.update_layout(
+            height=500,
+            width=1500,
+            title_text=f'Evaluation Metrics for {self.model_name} on {self.dataset_name}',
+            legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1)
+        )
+
+        for i in range(1, len(self.metrics) + 1):
+            fig.update_xaxes(
+                ticktext=self.subsets,
+                tickvals=list(range(len(self.subsets))),
+                row=1, col=i
+            )
+            fig.update_yaxes(title_text=self.metrics[i-1].replace('_', ' ').title(), row=1, col=i)
+        # Update y-axis ranges
+        fig.update_yaxes(range=[0, 1], row=1, col=1)  # Token Efficiency
+        fig.update_yaxes(range=[0, 13], row=1, col=3)  # Switch Frequency
+        fig.update_yaxes(range=[0, 1], row=1, col=4)  # Accuracy
+
+        os.makedirs(output_dir, exist_ok=True)
+        output_path = os.path.join(output_dir, f'{self.model_name}_{self.dataset_name}_metrics.png')
+        fig.write_image(output_path)
+        print(f'save figure to: {output_path}')
+
+
+
+    def filter_df(self, df, response_len: int = 8000, count: int=10):
+        def is_valid_row(row):
+            return all(self.cal_tokens(choice['message']['content']) <= response_len for choice in row['choices'])
+
+        bools = df.apply(is_valid_row, axis=1)
+
+        return df[bools].head(count)
+
+
+    def evaluate(self, output_dir, max_tokens=8000, count=50):
+        for subset in self.subsets:
+            review_path = os.path.join(self.report_path, 'reviews', self.model_name, f'{self.dataset_name}_{subset}.jsonl')
+            review_df = pd.read_json(review_path, lines=True)
+
+            review_df = self.filter_df(review_df, response_len=max_tokens, count=count)
+
+            results = thread_map(
+                self.process_item,
+                (item for _, item in review_df.iterrows()),
+                desc=f'Evaluating {subset}',
+                total=len(review_df),
+                max_workers=16
+            )
+
+            avg_tokens, avg_thought_num, avg_token_efficiency, avg_accuracy = zip(*results)
+
+            self.subset_dict[subset]['completion_len'] = sum(avg_tokens) / len(avg_tokens)
+            self.subset_dict[subset]['thought_num'] = sum(avg_thought_num) / len(avg_thought_num)
+            self.subset_dict[subset]['token_efficiency'] = sum(avg_token_efficiency) / len(avg_token_efficiency)
+            self.subset_dict[subset]['accuracy'] = sum(avg_accuracy) / len(avg_accuracy)
+
+
+        results = {metric: {subset: self.subset_dict[subset][metric] for subset in self.subsets}
+                   for metric in self.metrics}
+
+        self.plot_metrics(results, output_dir)
+
+        return results
+
+def run_task(config, output_dir='outputs', max_tokens=8000, count=50):
+    evaluator = EvalThink(**config,)
+    results = evaluator.evaluate(output_dir, max_tokens, count)
+    print(results)
+
+judge_config = dict(
+    api_key='EMPTY',
+    base_url='http://0.0.0.0:8801/v1',
+    model_name='Qwen2.5-72B-Instruct',
+)
+
+distill_qwen_config = dict(
+    report_path = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250218_180219',
+    model_name = 'DeepSeek-R1-Distill-Qwen-7B',
+    tokenizer_path = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
+    dataset_name = 'math_500',
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+    split_strategies='separator',
+    judge_config=judge_config
+)
+
+math_qwen_config = dict(
+    report_path = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250219_202358',
+    model_name = 'Qwen2.5-Math-7B-Instruct',
+    tokenizer_path = 'Qwen/Qwen2.5-Math-7B-Instruct',
+    dataset_name = 'math_500',
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+    split_strategies='separator'
+)
+
+r1_config = dict(
+    report_path = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250221_104202',
+    model_name = 'deepseek-r1',
+    tokenizer_path = 'deepseek-ai/DeepSeek-R1',
+    dataset_name = 'math_500',
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+    split_strategies='separator'
+)
+
+qwq_config = dict(
+    report_path = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250221_105911',
+    model_name = 'qwq-32b-preview',
+    tokenizer_path = 'Qwen/QwQ-32B-Preview',
+    dataset_name = 'math_500',
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+    split_strategies='separator'
+)
+
+if __name__ == '__main__':
+    run_task(distill_qwen_config)
+    # run_task(math_qwen_config)
+    # run_task(r1_config)
+    # run_task(qwq_config)
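To make the 'keywords' split strategy above concrete, here is a standalone illustration of the same zero-width-lookahead pattern that split_by_keywords builds from switch_tokens; the sample reasoning text and the shortened token list are made up for the example.

import re

# Split a reasoning trace right before each "switch" marker, as split_by_keywords does.
switch_tokens = ['alternatively', 'but wait', 'let me reconsider']
pattern = r'(?=\b(?:{})\b)'.format('|'.join(map(re.escape, switch_tokens)))

text = 'try x = 2. but wait, that fails the second equation. alternatively, use substitution.'
segments = [s.strip() for s in re.split(pattern, text) if s.strip()]
print(segments)
# ['try x = 2.', 'but wait, that fails the second equation.', 'alternatively, use substitution.']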
evalscope/third_party/thinkbench/infer.py
ADDED

@@ -0,0 +1,100 @@
+import os
+
+from evalscope import TaskConfig, run_task
+
+
+def eval_distill_qwen():
+    model_name = 'DeepSeek-R1-Distill-Qwen-7B'
+    dataset_name = 'math_500'
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
+
+    task_config = TaskConfig(
+        api_url='http://0.0.0.0:8801/v1/chat/completions',
+        model=model_name,
+        eval_type='service',
+        datasets=[dataset_name],
+        dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
+        eval_batch_size=32,
+        generation_config={
+            'max_tokens': 20000,  # avoid exceed max length
+            'temperature': 0.6,
+            'top_p': 0.95,
+            'n': 1,
+        },
+    )
+    run_task(task_config)
+
+
+def eval_math_qwen():
+    model_name = 'Qwen2.5-Math-7B-Instruct'
+    dataset_name = 'math_500'
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
+
+    task_config = TaskConfig(
+        api_url='http://0.0.0.0:8801/v1/chat/completions',
+        model=model_name,
+        eval_type='service',
+        datasets=[dataset_name],
+        dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
+        eval_batch_size=32,
+        generation_config={
+            'max_tokens': 3000,  # avoid exceed max length
+            'temperature': 0.6,
+            'top_p': 0.95,
+            'n': 3,
+        },
+    )
+    run_task(task_config)
+
+def eval_r1():
+    model_name = 'deepseek-r1'
+    dataset_name = 'math_500'
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
+
+    task_config = TaskConfig(
+        api_url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+        api_key=os.environ['DASHSCOPE_API_KEY'],
+        model=model_name,
+        eval_type='service',
+        datasets=[dataset_name],
+        dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
+        eval_batch_size=3,
+        generation_config={
+            'max_tokens': 12000,  # avoid exceed max length
+            'temperature': 0.6,
+            'top_p': 0.95,
+            'n': 1,
+        },
+        limit=50,
+        use_cache='/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250221_104202'
+    )
+    run_task(task_config)
+
+def eval_qwq():
+    model_name = 'qwq-32b-preview'
+    dataset_name = 'math_500'
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
+
+    task_config = TaskConfig(
+        api_url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+        api_key=os.environ['DASHSCOPE_API_KEY'],
+        model=model_name,
+        eval_type='service',
+        datasets=[dataset_name],
+        dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
+        eval_batch_size=32,
+        generation_config={
+            'max_tokens': 8000,  # avoid exceed max length
+            'temperature': 0.6,
+            'top_p': 0.95,
+            'n': 1,
+        },
+        use_cache='/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250221_105911'
+    )
+    run_task(task_config)
+
+if __name__ == '__main__':
+    # eval_distill_qwen()
+    # eval_math_qwen()
+    # eval_r1()
+    eval_qwq()
evalscope/third_party/thinkbench/resources/critique_template.txt
ADDED

@@ -0,0 +1,17 @@
+The following is a math problem and a solution (split into paragraphs, enclosed with tags and indexed from 0):
+
+[Math Problem]
+
+{problem}
+
+[Correct Answer]
+
+{answer}
+
+[Solution]
+
+{tagged_response}
+
+Your task is to review and critique the solution paragraph by paragraph. Once you identify an correct answer in a paragraph, return the index of the paragraph where the earliest correct answer occurs. Otherwise, return the index of -1 (which typically denotes "not found").
+
+Please put your final answer (i.e., the index) in \boxed{{}}.
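For orientation (not part of the package): eval.py renders this template with str.format, which is why the final \boxed{{}} uses doubled braces. A minimal sketch with a made-up problem and a shortened inline template whose placeholder names match the file above:

# Hypothetical fill-in of the critique template; only the placeholder names are real.
template = (
    '[Math Problem]\n\n{problem}\n\n'
    '[Correct Answer]\n\n{answer}\n\n'
    '[Solution]\n\n{tagged_response}\n\n'
    'Please put your final answer (i.e., the index) in \\boxed{{}}.'
)
prompt = template.format(
    problem='What is 2 + 2?',
    answer='4',
    tagged_response='<paragraph_0>\n2 + 2 = 4.\n</paragraph_0>',
)
print('\\boxed{}' in prompt)  # True: the doubled braces survive format as literal {}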
evalscope/third_party/thinkbench/resources/reformat_template.txt
ADDED

@@ -0,0 +1,31 @@
+I will present you with a solution to a math problem. Unfortunately, the solution lacks proper paragraphing, making it hard to read. Your task is to improve readability by reformatting the solution into well-structured paragraphs. Follow these specific guidelines:
+
+* Insert \n\n for paragraph breaks within the original solution. Do **NOT** alter any content of the original solution (the only exception is for itemized lists; see below).
+
+- Each paragraph should represent a distinct, concise reasoning step that logically advances the solution.
+
+- Reasoning steps can include case discussions, formula simplifications, or formula derivations. Each of these should be treated as an individual reasoning step and paragraphed accordingly.
+
+- If an introductory analysis exists in the original solution, treat it as an initial reasoning step and place it as the first paragraph.
+
+- Do **NOT** place any mathematical formulas in their own separate paragraphs; instead, include them within the same paragraph as the preceding text to form a cohesive reasoning step.
+
+* For any itemized lists (ordered or unordered), convert them into a written format, such as "First/Second/Third." This is the **ONLY** content modification allowed.
+
+* Avoid making paragraphs too lengthy, as long paragraphs might contain multiple reasoning steps that should be paragraphed separately.
+
+* Disregard the accuracy of the solution content. Do **NOT** alter any of the original solution's content; focus solely on structuring it into logical, readable paragraphs.
+
+* Reply with the reformatted solution directly.
+
+--------------------------------------------------
+
+Here is the math problem, and the solution that needs to be reformatted:
+
+[Math Problem]
+
+{problem}
+
+[Solution]
+
+{response}