evalscope 0.13.1__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic. Click here for more details.
- evalscope/arguments.py +1 -1
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +21 -5
- evalscope/backend/rag_eval/cmteb/arguments.py +10 -0
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +7 -2
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -5
- evalscope/backend/rag_eval/utils/embedding.py +49 -3
- evalscope/backend/rag_eval/utils/llm.py +8 -9
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -2
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +109 -0
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +120 -0
- evalscope/benchmarks/arena_hard/utils.py +162 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +2 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -1
- evalscope/benchmarks/data_adapter.py +30 -2
- evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -12
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -5
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +1 -3
- evalscope/benchmarks/live_code_bench/testing_util.py +365 -549
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +79 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +5 -7
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +182 -0
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +2 -5
- evalscope/collections/evaluator.py +4 -2
- evalscope/config.py +2 -2
- evalscope/metrics/llm_judge.py +1 -1
- evalscope/models/chat_adapter.py +32 -11
- evalscope/perf/arguments.py +30 -9
- evalscope/perf/benchmark.py +57 -103
- evalscope/perf/http_client.py +2 -3
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +4 -2
- evalscope/perf/plugin/datasets/custom.py +4 -1
- evalscope/perf/plugin/datasets/line_by_line.py +4 -1
- evalscope/perf/plugin/datasets/longalpaca.py +4 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -1
- evalscope/perf/plugin/datasets/random_dataset.py +13 -6
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/utils/benchmark_util.py +12 -6
- evalscope/perf/utils/db_util.py +3 -3
- evalscope/perf/utils/log_utils.py +41 -0
- evalscope/report/app.py +11 -11
- evalscope/run.py +7 -0
- evalscope/summarizer.py +2 -1
- evalscope/utils/utils.py +36 -25
- evalscope/version.py +2 -2
- {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/METADATA +21 -55
- {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/RECORD +70 -62
- tests/cli/test_all.py +36 -27
- tests/cli/test_collection.py +2 -1
- tests/cli/test_run.py +38 -20
- tests/perf/test_perf.py +1 -2
- tests/rag/test_clip_benchmark.py +0 -1
- tests/rag/test_mteb.py +37 -8
- tests/rag/test_ragas.py +33 -27
- tests/vlm/test_vlmeval.py +37 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/live_code_bench/execute_utils.py +0 -267
- {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/LICENSE +0 -0
- {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/WHEEL +0 -0
- {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
import math
|
|
2
|
+
import numpy as np
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import re
|
|
5
|
+
from collections import defaultdict
|
|
6
|
+
from sklearn.linear_model import LogisticRegression
|
|
7
|
+
from tqdm import tqdm
|
|
8
|
+
|
|
9
|
+
from evalscope.utils.logger import get_logger
|
|
10
|
+
|
|
11
|
+
logger = get_logger()
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def post_process_arenahard(completion):
    """Extract the first ``[[...]]`` verdict token from a judge completion.

    Args:
        completion: Text produced by the judge model.

    Returns:
        The content of the first double-bracketed token made up only of the
        characters ``A``, ``B``, ``<``, ``>`` and ``=`` (for example
        ``'A>>B'``), or ``None`` when the text contains no such token.
    """
    verdict = re.search(r'\[\[([AB<>=]+)\]\]', completion)
    return verdict.group(1) if verdict is not None else None
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# Judge verdict -> (winner as scored for the first game, strong-preference flag).
_ARENA_VERDICTS = {
    'A=B': ('tie', False),
    'A>B': ('model_a', False),
    'A>>B': ('model_a', True),
    'B>A': ('model_b', False),
    'B>>A': ('model_b', True),
}

# Winner relabelling applied to the second game, where 'A' counts for model_b.
_SWAPPED_WINNER = {'tie': 'tie', 'model_a': 'model_b', 'model_b': 'model_a'}


def _battles_from_game(row, game, swapped, multiplier):
    """Convert one judged game into a list of (possibly repeated) battle records."""
    battle = {'model_a': row['model_a'], 'model_b': row['model_b']}
    score = game['score']
    # Anything that is not one of the five known verdict strings (including
    # None) contributes no battle at all.
    verdict = _ARENA_VERDICTS.get(score) if isinstance(score, str) else None
    if verdict is None:
        return []
    winner, strong = verdict
    battle['winner'] = _SWAPPED_WINNER[winner] if swapped else winner
    # A strong preference ('>>') is counted ``multiplier`` times.
    return [battle] * (multiplier if strong else 1)


def get_battles_from_row(row, first_game_only=False, multiplier=3):
    """Expand the judged games of one row into a DataFrame of battles.

    Args:
        row: Mapping with the keys ``'model_a'``, ``'model_b'`` and ``'games'``;
            ``row['games']`` is indexed at 0 and (unless ``first_game_only``)
            at 1, and every game is read through ``game['score']``.
        first_game_only: When true, only ``row['games'][0]`` is used.
        multiplier: Number of times a strong verdict (``'A>>B'`` / ``'B>>A'``)
            is repeated in the output.

    Returns:
        ``pd.DataFrame`` with one line per counted battle and the columns
        ``model_a``, ``model_b`` and ``winner`` (``'model_a'``, ``'model_b'``
        or ``'tie'``). The frame is empty when no game has a known verdict.

    Note:
        In the second game the verdict letters are credited to the opposite
        model ('A>B' makes ``model_b`` the winner) — presumably because the
        two answers were shown to the judge in swapped order.
    """
    results = _battles_from_game(row, row['games'][0], swapped=False, multiplier=multiplier)
    if not first_game_only:
        results += _battles_from_game(row, row['games'][1], swapped=True, multiplier=multiplier)
    return pd.DataFrame(results)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
    """Fit Elo-style ratings to pairwise battles with a logistic regression.

    Args:
        df: DataFrame with the columns ``model_a``, ``model_b`` and ``winner``.
        SCALE: Factor applied to the fitted coefficients.
        BASE: Base whose natural logarithm is used as the feature magnitude.
        INIT_RATING: Offset added to every scaled coefficient.

    Returns:
        ``pd.Series`` of ratings indexed by model name, sorted from highest to
        lowest. When the labels contain a single class no model is fitted:
        every model gets ``INIT_RATING`` and the model on the winning side of
        the first battle gets ``SCALE`` extra.
    """
    model_names = pd.concat([df['model_a'], df['model_b']]).unique()
    # Maps each model name to its column position in the design matrix.
    model_pos = pd.Series(np.arange(len(model_names)), index=model_names)

    # Every battle appears twice so that a tie can be encoded as one win for
    # each side (first copy labelled 1, second copy labelled 0).
    battles = pd.concat([df, df], ignore_index=True)
    num_models = len(model_pos.index)
    num_battles = battles.shape[0]
    row_ids = np.arange(num_battles)

    features = np.zeros([num_battles, num_models])
    features[row_ids, model_pos[battles['model_a']]] = +math.log(BASE)
    features[row_ids, model_pos[battles['model_b']]] = -math.log(BASE)

    # A model_a win is labelled 1 in both copies.
    labels = np.zeros(num_battles)
    labels[battles['winner'] == 'model_a'] = 1.0

    # Ties (including 'tie (bothbad)') are labelled 1 in the first half only.
    is_tie = (battles['winner'] == 'tie') | (battles['winner'] == 'tie (bothbad)')
    is_tie[len(is_tie) // 2:] = False
    labels[is_tie] = 1.0

    if len(np.unique(labels)) < 2:
        # The regression cannot be fitted on a single class; fall back to a
        # fixed boost for the side that won.
        logger.info('Warning: Only one class in the data')
        fallback = pd.Series(INIT_RATING, index=model_pos.index)
        if np.all(labels == 1.0):
            fallback[battles['model_a'].iloc[0]] += SCALE  # Boost the winning model
        elif np.all(labels == 0.0):
            fallback[battles['model_b'].iloc[0]] += SCALE  # Boost the winning model
        return fallback.sort_values(ascending=False)

    # The tolerance may need tuning when the judge model is not GPT-4.
    classifier = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-8)
    classifier.fit(features, labels)

    ratings = SCALE * classifier.coef_[0] + INIT_RATING

    # Shift all ratings so that the anchor model 'gpt4-0314' sits at 1000.
    if 'gpt4-0314' in model_pos.index:
        ratings += 1000 - ratings[model_pos['gpt4-0314']]
    return pd.Series(ratings, index=model_pos.index).sort_values(ascending=False)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def get_bootstrap_result(battles, func_compute_elo, num_round):
    """Run a rating function on repeated resamples of the battles.

    Args:
        battles: DataFrame of battles; each round draws a same-size sample
            with replacement from it.
        func_compute_elo: Callable applied to every resampled DataFrame.
            Rounds for which it returns ``None`` are dropped.
        num_round: Number of resampling rounds.

    Returns:
        DataFrame with one row per kept round, columns ordered by descending
        column median.
    """
    samples = []
    for _ in tqdm(range(num_round), desc='bootstrap'):
        resampled = battles.sample(frac=1.0, replace=True)
        ratings = func_compute_elo(resampled)
        if ratings is None:
            continue
        samples.append(ratings)

    table = pd.DataFrame(samples)
    column_order = table.median().sort_values(ascending=False).index
    return table[column_order]
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def preety_print_two_ratings(ratings_1, ratings_2, column_names):
    """Build a side-by-side table of two rating mappings.

    Args:
        ratings_1: Mapping from model name to rating; its keys define the rows
            and its values define the sort order.
        ratings_2: Mapping queried with every key of ``ratings_1``.
        column_names: Sequence whose first two items name the rating columns.

    Returns:
        DataFrame with the columns ``Model``, ``column_names[0]`` and
        ``column_names[1]``, sorted by the first rating column in descending
        order, ratings converted to integers after adding 0.5, and an index
        starting at 1.
    """
    first_col, second_col = column_names[0], column_names[1]

    records = []
    for model in ratings_1.keys():
        records.append([model, ratings_1[model], ratings_2[model]])

    table = pd.DataFrame(records, columns=['Model', first_col, second_col])
    table = table.sort_values(first_col, ascending=False).reset_index(drop=True)

    # Adding 0.5 before the integer cast rounds positive ratings half-up.
    for col in (first_col, second_col):
        table[col] = (table[col] + 0.5).astype(int)

    # Ranks are shown 1-based.
    table.index = table.index + 1
    return table
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def predict_win_rate(elo_ratings, SCALE=400, BASE=10, INIT_RATING=1000):
    """Compute the pairwise expected-score table implied by a set of ratings.

    Args:
        elo_ratings: Mapping (anything with ``keys()`` and item access) from
            model name to rating.
        SCALE: Rating difference divisor used in the exponent.
        BASE: Base of the exponent.
        INIT_RATING: Unused; kept so existing callers keep working.

    Returns:
        Square DataFrame over the sorted model names. The entry at
        ``[a, b]`` is the expected score of ``a`` against ``b``; the diagonal
        is NaN. After the final transpose the row axis is named ``model_b``
        and the column axis ``model_a``.
    """
    names = sorted(elo_ratings.keys())
    wins = {name: {} for name in names}
    # Both orientations are written on every visit, exactly as before, so the
    # value that ends up in each cell (and its floating-point rounding) does
    # not change.
    for a in names:
        for b in names:
            expected_a = 1 / (1 + BASE**((elo_ratings[b] - elo_ratings[a]) / SCALE))
            wins[a][b] = expected_a
            wins[b][a] = 1 - expected_a

    # ``np.nan`` is the spelling available on every NumPy version; the
    # upper-case ``np.NAN`` alias is missing from the NumPy 2.x namespace.
    data = {a: [wins[a][b] if a != b else np.nan for b in names] for a in names}

    df = pd.DataFrame(data, index=names)
    df.index.name = 'model_a'
    df.columns.name = 'model_b'
    return df.T
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def get_win_rate_column(df, column, baseline='gpt4-0314'):
    """Return each model's predicted win rate against a baseline model.

    Args:
        df: DataFrame with a ``model`` column and the rating column ``column``.
        column: Name of the column holding the ratings.
        baseline: Model name whose column is selected from the table returned
            by ``predict_win_rate``.

    Returns:
        Series taken from the ``baseline`` column of the win-rate table, with
        missing values replaced by 0.5 and every value rounded to 4 decimals.
    """
    ratings_frame = df[['model', column]].set_index('model')
    ratings_by_model = ratings_frame.to_dict()[column]

    win_rate_table = predict_win_rate(ratings_by_model)
    against_baseline = win_rate_table[baseline]

    # The only NaN produced by predict_win_rate is a model paired with
    # itself; it is reported as an even 0.5.
    filled = against_baseline.fillna(0.5)
    return filled.apply(lambda x: round(x, 4))
|
|
@@ -126,7 +126,7 @@ class ChineseSimpleQAAdapter(DataAdapter):
|
|
|
126
126
|
|
|
127
127
|
def match(self, gold: str, pred: str) -> float:
|
|
128
128
|
# simple match
|
|
129
|
-
logger.warning(f'Please use LLMJudge to match the result for
|
|
129
|
+
logger.warning(f'Please use LLMJudge to match the result for {self.name}')
|
|
130
130
|
is_correct = 1 if gold.lower().strip() == pred.lower().strip() else 0
|
|
131
131
|
is_incorrect = not is_correct
|
|
132
132
|
is_not_attempted = 0
|
|
@@ -160,9 +160,6 @@ class ChineseSimpleQAAdapter(DataAdapter):
|
|
|
160
160
|
review_res_list: [{'is_correct': 1, 'is_incorrect': 0, 'is_not_attempted': 0}, ...]
|
|
161
161
|
"""
|
|
162
162
|
# zip dict answers
|
|
163
|
-
res_dict =
|
|
164
|
-
for res in review_res_list:
|
|
165
|
-
for key, value in res.items():
|
|
166
|
-
res_dict[key].append(value)
|
|
163
|
+
res_dict = super().compute_dict_metric(review_res_list, **kwargs)
|
|
167
164
|
|
|
168
165
|
return super().compute_metric(res_dict, **kwargs)
|
|
@@ -8,7 +8,6 @@ from collections import defaultdict
|
|
|
8
8
|
from evalscope.benchmarks import Benchmark, DataAdapter
|
|
9
9
|
from evalscope.constants import AnswerKeys
|
|
10
10
|
from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
|
|
11
|
-
from evalscope.models import ChatGenerationModelAdapter
|
|
12
11
|
from evalscope.utils.logger import get_logger
|
|
13
12
|
|
|
14
13
|
# flake8: noqa
|
|
@@ -245,6 +245,29 @@ class DataAdapter(ABC):
|
|
|
245
245
|
res_list.append({'metric_name': metric_name, 'score': metric_func(review_res), 'num': len(review_res)})
|
|
246
246
|
return res_list
|
|
247
247
|
|
|
248
|
+
def compute_dict_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> dict:
    """
    Group per-sample review results by metric name.

    Args:
        review_res_list: Review results, either a flat list or a list of lists
            (flattened one level when the first element is a list). Each item is
            a dict of ``{metric_name: value}``; any non-dict item is collected
            under the key ``'AverageAccuracy'``.

    Returns:
        A ``defaultdict(list)`` mapping each metric name to the list of values
        seen for it, in input order. Empty when ``review_res_list`` is empty.

    """
    items = defaultdict(list)
    # Guard the ``[0]`` probe below: an empty input has nothing to aggregate.
    if not review_res_list:
        return items

    if isinstance(review_res_list[0], list):
        review_res_list = [item for sublist in review_res_list for item in sublist]

    for scores in review_res_list:
        if isinstance(scores, dict):
            for k, v in scores.items():
                items[k].append(v)
        else:
            items['AverageAccuracy'].append(scores)
    return items
|
|
270
|
+
|
|
248
271
|
def gen_report(self, subset_score_map: dict, report_name: str = None, **kwargs) -> Report:
|
|
249
272
|
"""
|
|
250
273
|
Generate report for the evaluation results for all subsets.
|
|
@@ -291,10 +314,15 @@ class DataAdapter(ABC):
|
|
|
291
314
|
kwargs['metric_list'] = self.metric_list
|
|
292
315
|
return ReportGenerator.gen_report(subset_score_map, report_name, **kwargs)
|
|
293
316
|
|
|
294
|
-
def gen_prompt_data(self,
|
|
317
|
+
def gen_prompt_data(self,
|
|
318
|
+
prompt: str,
|
|
319
|
+
system_prompt: Optional[str] = None,
|
|
320
|
+
choices: Optional[List[str]] = None,
|
|
321
|
+
**kwargs) -> dict:
|
|
295
322
|
if not isinstance(prompt, list):
|
|
296
323
|
prompt = [prompt]
|
|
297
|
-
prompt_data = PromptData(
|
|
324
|
+
prompt_data = PromptData(
|
|
325
|
+
data=prompt, multi_choices=choices or self.choices, system_prompt=system_prompt or self.system_prompt)
|
|
298
326
|
return prompt_data.to_dict()
|
|
299
327
|
|
|
300
328
|
def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
|
|
@@ -5,7 +5,6 @@ from typing import Any, Optional
|
|
|
5
5
|
|
|
6
6
|
from evalscope.benchmarks import Benchmark, DataAdapter
|
|
7
7
|
from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, EvalType, HubType
|
|
8
|
-
from evalscope.models import ChatGenerationModelAdapter
|
|
9
8
|
from evalscope.utils.io_utils import jsonl_to_list
|
|
10
9
|
from evalscope.utils.logger import get_logger
|
|
11
10
|
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
2
2
|
import os.path
|
|
3
3
|
from collections import defaultdict
|
|
4
|
-
from typing import List
|
|
4
|
+
from typing import List, Optional, Union
|
|
5
5
|
|
|
6
6
|
from evalscope.benchmarks import Benchmark, DataAdapter
|
|
7
7
|
from evalscope.metrics import bleu_ngram_one_sample, compute_rouge_score_one_sample_zh, mean
|
|
@@ -40,7 +40,7 @@ class GeneralQAAdapter(DataAdapter):
|
|
|
40
40
|
for subset_name in subset_list:
|
|
41
41
|
data_file_dict[subset_name] = os.path.join(dataset_name_or_path, f'{subset_name}.jsonl')
|
|
42
42
|
elif os.path.isfile(dataset_name_or_path):
|
|
43
|
-
cur_subset_name = os.path.basename(dataset_name_or_path)
|
|
43
|
+
cur_subset_name = os.path.splitext(os.path.basename(dataset_name_or_path))[0]
|
|
44
44
|
data_file_dict[cur_subset_name] = dataset_name_or_path
|
|
45
45
|
else:
|
|
46
46
|
raise ValueError(f'Invalid dataset path: {dataset_name_or_path}')
|
|
@@ -74,8 +74,9 @@ class GeneralQAAdapter(DataAdapter):
|
|
|
74
74
|
To be supported in the future.')
|
|
75
75
|
|
|
76
76
|
query = input_d.get('question', '') or input_d.get('query', '')
|
|
77
|
+
system_prompt = input_d.get('system')
|
|
77
78
|
prompt = self.prompt_template.format(query=query)
|
|
78
|
-
return self.gen_prompt_data(prompt)
|
|
79
|
+
return self.gen_prompt_data(prompt, system_prompt=system_prompt)
|
|
79
80
|
|
|
80
81
|
def get_gold_answer(self, input_d: dict) -> str:
|
|
81
82
|
"""
|
|
@@ -118,7 +119,7 @@ class GeneralQAAdapter(DataAdapter):
|
|
|
118
119
|
res.update(bleu_dict)
|
|
119
120
|
return res
|
|
120
121
|
|
|
121
|
-
def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
|
|
122
|
+
def compute_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
|
|
122
123
|
"""
|
|
123
124
|
compute weighted mean of the bleu score of all samples
|
|
124
125
|
|
|
@@ -129,12 +130,5 @@ class GeneralQAAdapter(DataAdapter):
|
|
|
129
130
|
avg_res: List[dict]
|
|
130
131
|
|
|
131
132
|
"""
|
|
132
|
-
items =
|
|
133
|
-
for scores in review_res_list:
|
|
134
|
-
if isinstance(scores, dict):
|
|
135
|
-
for k, v in scores.items():
|
|
136
|
-
items[k].append(v)
|
|
137
|
-
else:
|
|
138
|
-
items['AverageAccuracy'].append(scores)
|
|
139
|
-
# items = [(score, 1.0) for score in review_res_list]
|
|
133
|
+
items = super().compute_dict_metric(review_res_list, **kwargs)
|
|
140
134
|
return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]
|
|
@@ -108,7 +108,7 @@ class HellaSwagAdapter(DataAdapter):
|
|
|
108
108
|
if self.model_adapter == OutputType.MULTIPLE_CHOICE:
|
|
109
109
|
return result
|
|
110
110
|
else:
|
|
111
|
-
return ResponseParser.parse_first_option(result)
|
|
111
|
+
return ResponseParser.parse_first_option(result, options=self.choices)
|
|
112
112
|
|
|
113
113
|
def match(self, gold: str, pred: str) -> float:
|
|
114
114
|
return exact_match(gold=str(gold), pred=str(pred))
|
|
@@ -48,9 +48,6 @@ class IFEvalAdapter(DataAdapter):
|
|
|
48
48
|
|
|
49
49
|
def compute_metric(self, review_res_list: List[dict], **kwargs) -> Any:
|
|
50
50
|
# aggregate review results
|
|
51
|
-
res_dict =
|
|
52
|
-
for res in review_res_list:
|
|
53
|
-
for k, v in res.items():
|
|
54
|
-
res_dict[k].append(v)
|
|
51
|
+
res_dict = super().compute_dict_metric(review_res_list, **kwargs)
|
|
55
52
|
|
|
56
|
-
return super().compute_metric(res_dict)
|
|
53
|
+
return super().compute_metric(res_dict, **kwargs)
|
|
@@ -18,7 +18,6 @@ logger = get_logger()
|
|
|
18
18
|
extra_params={
|
|
19
19
|
'start_date': None,
|
|
20
20
|
'end_date': None,
|
|
21
|
-
'num_process_evaluate': 1,
|
|
22
21
|
'timeout': 6
|
|
23
22
|
},
|
|
24
23
|
system_prompt=
|
|
@@ -33,7 +32,6 @@ class LiveCodeBenchAdapter(DataAdapter):
|
|
|
33
32
|
|
|
34
33
|
extra_params = kwargs.get('extra_params', {})
|
|
35
34
|
|
|
36
|
-
self.num_process_evaluate = extra_params.get('num_process_evaluate', 1)
|
|
37
35
|
self.timeout = extra_params.get('timeout', 6)
|
|
38
36
|
self.start_date = extra_params.get('start_date')
|
|
39
37
|
self.end_date = extra_params.get('end_date')
|
|
@@ -84,7 +82,7 @@ class LiveCodeBenchAdapter(DataAdapter):
|
|
|
84
82
|
references,
|
|
85
83
|
predictions,
|
|
86
84
|
k_list=[1],
|
|
87
|
-
num_process_evaluate=
|
|
85
|
+
num_process_evaluate=1,
|
|
88
86
|
timeout=self.timeout,
|
|
89
87
|
)
|
|
90
88
|
return metrics['pass@1'] / 100 # convert to point scale
|