evalscope 0.8.0__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/backend/base.py +1 -1
- evalscope/backend/rag_eval/utils/clip.py +2 -2
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +2 -1
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +2 -1
- evalscope/benchmarks/humaneval/humaneval_adapter.py +193 -7
- evalscope/benchmarks/race/race_adapter.py +2 -1
- evalscope/config.py +35 -1
- evalscope/constants.py +24 -38
- evalscope/evaluator/__init__.py +0 -1
- evalscope/evaluator/evaluator.py +5 -4
- evalscope/evaluator/rating_eval.py +1 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +2 -1
- evalscope/perf/arguments.py +2 -1
- evalscope/perf/benchmark.py +2 -2
- evalscope/perf/main.py +2 -5
- evalscope/perf/plugin/api/openai_api.py +2 -2
- evalscope/perf/plugin/registry.py +3 -3
- evalscope/perf/utils/benchmark_util.py +4 -4
- evalscope/perf/utils/db_util.py +66 -22
- evalscope/perf/utils/local_server.py +3 -1
- evalscope/run.py +45 -82
- evalscope/run_arena.py +2 -1
- evalscope/summarizer.py +14 -26
- evalscope/third_party/longbench_write/eval.py +2 -1
- evalscope/third_party/longbench_write/longbench_write.py +2 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/tools/combine_reports.py +2 -4
- evalscope/tools/rewrite_eval_results.py +1 -1
- evalscope/utils/__init__.py +1 -0
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/io_utils.py +162 -0
- evalscope/utils/logger.py +8 -0
- evalscope/utils/utils.py +0 -175
- evalscope/version.py +2 -2
- {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/METADATA +1 -1
- {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/RECORD +46 -46
- tests/cli/test_run.py +11 -12
- tests/perf/test_perf.py +2 -1
- tests/vlm/test_vlmeval.py +3 -2
- evalscope/evaluator/humaneval_evaluator.py +0 -158
- {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0
evalscope/backend/base.py
CHANGED
evalscope/backend/rag_eval/utils/clip.py
CHANGED
@@ -4,7 +4,7 @@ import torch.nn.functional as F
 from langchain_core.embeddings import Embeddings
 from PIL import Image
 from transformers import AutoModel, AutoProcessor
-from typing import List
+from typing import List, Union
 
 from evalscope.backend.rag_eval.utils.tools import PIL_to_base64, download_model
 from evalscope.constants import HubType
@@ -86,7 +86,7 @@ class CLIPModel(Embeddings):
         self.transform = self.processor.image_processor
         self.tokenizer = self.processor.tokenizer
 
-    def encode_text(self, batch_texts: List[str]
+    def encode_text(self, batch_texts: Union[List[str], List[List[str]]]):
         if isinstance(batch_texts[0], list):
             batch_texts = [text for _, texts in enumerate(batch_texts) for text in texts]
         # Ensure that the input texts are within the token limit
evalscope/backend/rag_eval/utils/embedding.py
CHANGED
@@ -80,7 +80,7 @@ class BaseModel(Embeddings):
         """Embed query text. Compact mteb."""
         raise NotImplementedError
 
-    def encode_corpus(self, corpus: List[str]
+    def encode_corpus(self, corpus: Union[List[str], List[Dict[str, str]]], **kwargs) -> list[torch.Tensor]:
         """Embed search docs . Compact mteb."""
         raise NotImplementedError
 
evalscope/benchmarks/general_qa/general_qa_adapter.py
CHANGED
@@ -8,7 +8,7 @@ from typing import Any, Optional
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.metrics.metrics import bleu_ngram_one_sample, weighted_mean
 from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
-from evalscope.utils import jsonl_to_list
+from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
evalscope/benchmarks/gsm8k/gsm8k_adapter.py
CHANGED
@@ -6,7 +6,8 @@ import re
 
 from evalscope.benchmarks import DataAdapter
 from evalscope.metrics.metrics import exact_match, weighted_mean
-from evalscope.utils import
+from evalscope.utils import normalize_score
+from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
evalscope/benchmarks/hellaswag/hellaswag_adapter.py
CHANGED
@@ -5,7 +5,8 @@ import re
 
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.metrics.metrics import exact_match, weighted_mean
-from evalscope.utils import
+from evalscope.utils import normalize_score
+from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
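Across these adapters the change is the same: `jsonl_to_list` now lives in the new `evalscope.utils.io_utils` module instead of being re-exported from `evalscope.utils`. A minimal usage sketch of the relocated helper (the file path is a placeholder):

```python
from evalscope.utils.io_utils import jsonl_to_list

# Each line of a .jsonl file is parsed into one dict.
records = jsonl_to_list('outputs/predictions/example_predictions.jsonl')  # placeholder path
print(len(records), list(records[0].keys()))
```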
evalscope/benchmarks/humaneval/humaneval_adapter.py
CHANGED
@@ -1,20 +1,206 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import json
+import os
+import re
+from tqdm import tqdm
+from typing import List
 
-
+from evalscope.benchmarks.data_adapter import DataAdapter
+from evalscope.metrics.metrics import weighted_mean
+from evalscope.tools.combine_reports import gen_table
+from evalscope.utils import normalize_score
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
 
 DATASET_ID = 'modelscope/humaneval'
 SUBSET_LIST = ['openai_humaneval']
 
-# Note: ONLY FOR CLASS IMPORT, No implementation here.
-
 # Example:
-# {"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": " for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n"}
+# {"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": " for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n"} # noqa
 
 
-class HumanevalAdapter:
+class HumanevalAdapter(DataAdapter):
     """
     A placeholder for humaneval adapter, see HumanevalEvaluator for implementation.
     """
 
-    def __init__(self
-
+    def __init__(self,
+                 subset_list: list = None,
+                 metric_list: list = None,
+                 few_shot_num: int = None,
+                 train_split: str = None,
+                 eval_split: str = 'test',
+                 prompt_template: str = 'Complete the following python code:\n',
+                 **kwargs):
+        try:
+            from human_eval.data import stream_jsonl, write_jsonl
+            from human_eval.evaluation import check_correctness
+        except ImportError:
+            raise ImportError('Please install human_eval:'
+                              'https://github.com/openai/human-eval/tree/master#installation , '
+                              'Note that you need to enable the execution code in the human_eval/execution.py first.')
+
+        if subset_list is None:
+            subset_list = SUBSET_LIST
+
+        if metric_list is None:
+            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
+
+        self.k = [1]
+        self.num_workers = 4
+        self.timeout = 4.0
+        self.outputs = kwargs.get('outputs', None)
+
+        self.read_problems_func = stream_jsonl
+        self.write_jsonl_func = write_jsonl
+        self.eval_func = check_correctness
+
+        super().__init__(
+            subset_list=subset_list,
+            metric_list=metric_list,
+            few_shot_num=few_shot_num,
+            train_split=train_split,
+            eval_split=eval_split,
+            prompt_template=prompt_template,
+            **kwargs)
+
+    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
+        data_dict = {}
+        for subset_name in subset_list:
+            data_dict[subset_name] = {}
+            # [{'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}, ...]
+            data_dict[subset_name][self.eval_split] = [task for task in self.read_problems_func(dataset_name_or_path)]
+
+        return data_dict
+
+    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
+        """
+        Generate prompt for the model.
+
+        Args:
+            input_d (dict): The raw input. A single data format of the Humaneval:
+            {'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}
+        """
+        full_prompt = input_d['prompt']
+        full_prompt = f'{self.prompt_template}\n{full_prompt}' if self.prompt_template else full_prompt
+
+        return {'data': [full_prompt]}
+
+    def get_answers(self, infer_cfg: dict) -> List[dict]:
+        ans_list: list = []
+        system_prompt: str = ''
+        for task_id, data_d in tqdm(self.problems.items(), total=len(self.problems), desc='Predicting(problems)'):
+            prompt: str = system_prompt + data_d['prompt']
+            inputs: dict = {'data': [prompt]}
+
+            pred_res: dict = self.model_adapter.predict(inputs=inputs, infer_cfg=infer_cfg)
+
+            pred_ans: str = pred_res['choices'][0]['message']['content']
+            pred_ans = self._postprocess(pred_ans)
+
+            ans_list.append({'task_id': task_id, 'completion': pred_ans})
+
+        return ans_list
+
+    def eval(self, infer_cfg: dict, **kwargs):
+
+        # predict
+        ans_list: list = self.get_answers(infer_cfg)
+        ans_out_file: str = os.path.join(self.outputs_structure.predictions_dir, 'human_eval_predictions.jsonl')
+
+        self.write_jsonl_func(filename=ans_out_file, data=ans_list)
+        # logger.info(f'** Dump predictions to {ans_out_file} successfully.')
+        logger.info('** Dump predictions successfully.')
+
+        # evaluate results: e.g. {'pass@1': 0.333, 'pass@10': 0.111}
+        results = self.eval_func(
+            sample_file=ans_out_file,
+            k=self.k,
+            n_workers=self.num_workers,
+            timeout=self.timeout,
+            problem_file=self.problem_file)
+
+        # output: report
+        report_map: dict = self.gen_report(results=results)
+        report_dir: str = self.outputs_structure.reports_dir
+        report_file: str = os.path.join(report_dir, 'human_eval_report.json')
+
+        with open(report_file, 'w') as f:
+            f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
+        # logger.info(f'** Dump report to {report_file} \n')
+        logger.info('** Dump report \n')
+
+        try:
+            # Make table
+            report_table: str = gen_table([report_dir])
+            logger.info(f'** Report table: \n {report_table} \n')
+        except Exception:
+            logger.error('Failed to generate report table.')
+
+    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
+        total_num: int = sum([num for _, num in subset_score_map.values()])
+        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
+        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
+        cate_avg_list = [{
+            'name': subset_name,
+            'score': normalize_score(score=score)
+        } for subset_name, (score, _) in subset_score_map.items()]
+
+        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
+
+        res_map = dict(
+            name=report_name or 'HumanEval',
+            metric='pass@1',
+            score=weighted_avg_acc,
+            category=[category_d],
+            total_num=total_num)
+
+        return res_map
+
+    @classmethod
+    def _postprocess(cls, text: str) -> str:
+        if '```' in text:
+            blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
+            if len(blocks) == 0:
+                text = text.split('```')[1]  # fall back to default strategy
+            else:
+                text = blocks[0]  # fetch the first code block
+                if not text.startswith('\n'):  # in case starting with ```python
+                    text = text[max(text.find('\n') + 1, 0):]
+        if text.strip().startswith('from') or text.strip().startswith('import'):
+            def_idx = text.find('def')
+            if def_idx != -1:
+                text = text[max(text.find('\n', def_idx) + 1, 0):]
+        text = text.split('\n\n')[0]
+        if text.strip().startswith('def'):
+            text = '\n'.join(text.split('\n')[1:])
+        if not text.startswith('    '):
+            if text.startswith(' '):
+                text = '    ' + text.lstrip()
+            else:
+                text = '\n'.join(['    ' + line for line in text.split('\n')])
+        return text
+
+    def compute_metric(self, review_res_list: list) -> float:
+        """
+        Compute evaluation result by specific metric.
+
+        Args:
+            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
+
+        Returns:
+            The metric score.
+        """
+        items = [(score, 1.0) for score in review_res_list]
+        return weighted_mean(items)
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+        return self._postprocess(result)
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        return input_d
+
+    def match(self, gold: str, pred: str) -> float:
+        res = self.eval_func(gold, pred, self.timeout)
+        return float(res['passed'])
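The `_postprocess` classmethod above is what turns a chat-style completion into a HumanEval `completion` string: it pulls out the first fenced code block, drops a duplicated `def` header, and keeps only the indented function body. A small illustration of that behaviour (the reply text is made up, and the reconstruction above assumes the indentation literals are four spaces, which the diff viewer collapsed):

```python
from evalscope.benchmarks.humaneval.humaneval_adapter import HumanevalAdapter

reply = (
    "Here is the solution:\n"
    "```python\n"
    "def add(a, b):\n"
    "    return a + b\n"
    "```"
)

# _postprocess is a classmethod, so no adapter instance (and no human_eval install) is needed.
completion = HumanevalAdapter._postprocess(reply)
print(repr(completion))  # '    return a + b\n' -- only the indented body is kept
```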
evalscope/benchmarks/race/race_adapter.py
CHANGED
@@ -5,7 +5,8 @@ import os
 
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.metrics.metrics import exact_match, weighted_mean
-from evalscope.utils import
+from evalscope.utils import normalize_score
+from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
evalscope/config.py
CHANGED
@@ -9,7 +9,8 @@ from typing import Dict, List, Optional, Union
 
 from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, DEFAULT_WORK_DIR, EvalBackend, EvalStage, EvalType, HubType
 from evalscope.models.custom import CustomModel
-from evalscope.utils import
+from evalscope.utils import gen_hash
+from evalscope.utils.io_utils import dict_to_yaml, json_to_dict, yaml_to_dict
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -31,6 +32,7 @@ DEFAULT_GENERATION_CONFIG = {
 class TaskConfig:
     # Model-related arguments
     model: Union[str, CustomModel, None] = None
+    model_id: Optional[str] = None
     model_args: Optional[Dict] = field(default_factory=lambda: DEFAULT_MODEL_ARGS | {})
 
     # Template-related arguments
@@ -64,6 +66,13 @@ class TaskConfig:
     dry_run: bool = False
     seed: int = 42
 
+    def __post_init__(self):
+        if (not self.model_id) and self.model:
+            if isinstance(self.model, CustomModel):
+                self.model_id = type(self.model).__name__
+            else:
+                self.model_id = os.path.basename(self.model).rstrip(os.sep)
+
     def to_dict(self):
         # Note: to avoid serialization error for some model instance
         return self.__dict__
@@ -119,6 +128,7 @@ class TaskConfig:
                 continue
 
             task.model = custom_model
+            task.model_id = type(custom_model).__name__
            res_list.append(task)
 
         return res_list
@@ -168,6 +178,30 @@ tasks = ['arc', 'gsm8k', 'mmlu', 'cmmlu', 'ceval', 'bbh', 'general_qa']
 registry_tasks = {task: TaskConfig.from_yaml(os.path.join(cur_path, f'registry/tasks/{task}.yaml')) for task in tasks}
 
 
+def parse_task_config(task_cfg) -> TaskConfig:
+    """Parse task configuration from various formats into a TaskConfig object."""
+    if isinstance(task_cfg, TaskConfig):
+        logger.info('Args: Task config is provided with TaskConfig type.')
+    elif isinstance(task_cfg, dict):
+        logger.info('Args: Task config is provided with dictionary type.')
+        task_cfg = TaskConfig.from_dict(task_cfg)
+    elif isinstance(task_cfg, Namespace):
+        logger.info('Args: Task config is provided with CommandLine type.')
+        task_cfg = TaskConfig.from_args(task_cfg)
+    elif isinstance(task_cfg, str):
+        extension = task_cfg.split('.')[-1]
+        logger.info(f'Args: Task config is provided with {extension} file type.')
+        if extension in ['yaml', 'yml']:
+            task_cfg = TaskConfig.from_yaml(task_cfg)
+        elif extension == 'json':
+            task_cfg = TaskConfig.from_json(task_cfg)
+        else:
+            raise ValueError('Args: Unsupported file extension.')
+    else:
+        raise ValueError('Args: Please provide a valid task config.')
+    return task_cfg
+
+
 class TempModel(CustomModel):
 
     def __init__(self, config: dict):
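The new `parse_task_config` helper normalizes whatever the caller passes (a `TaskConfig`, a plain dict, an argparse `Namespace`, or a YAML/JSON path) into a `TaskConfig`, and `__post_init__` now fills in `model_id` when it is not given. A rough sketch of how this can be used (the config values and paths are placeholders):

```python
from evalscope.config import TaskConfig, parse_task_config

# From a dict (only the fields you care about; values are placeholders).
cfg = parse_task_config({'model': 'models/qwen2-7b-instruct'})
assert isinstance(cfg, TaskConfig)
print(cfg.model_id)  # 'qwen2-7b-instruct' -- derived from the model path in __post_init__

# From a YAML file on disk (hypothetical path).
cfg = parse_task_config('configs/my_task.yaml')
```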
evalscope/constants.py
CHANGED
@@ -1,5 +1,4 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os
 from modelscope.utils.constant import DEFAULT_REPOSITORY_REVISION
 from modelscope.utils.file_utils import get_dataset_cache_root, get_model_cache_root
 
@@ -7,6 +6,7 @@ DEFAULT_WORK_DIR = './outputs'
 DEFAULT_MODEL_REVISION = DEFAULT_REPOSITORY_REVISION  # master
 DEFAULT_MODEL_CACHE_DIR = get_model_cache_root()  # ~/.cache/modelscope/hub
 DEFAULT_DATASET_CACHE_DIR = get_dataset_cache_root()  # ~/.cache/modelscope/datasets
+DEFAULT_ROOT_CACHE_DIR = DEFAULT_DATASET_CACHE_DIR  # compatible with old version
 
 
 class HubType:
@@ -76,33 +76,6 @@ class ArenaMode:
     PAIRWISE_BASELINE = 'pairwise_baseline'
 
 
-class OutputsStructure:
-    LOGS_DIR = 'logs'
-    PREDICTIONS_DIR = 'predictions'
-    REVIEWS_DIR = 'reviews'
-    REPORTS_DIR = 'reports'
-    CONFIGS_DIR = 'configs'
-
-    def __init__(self, outputs_dir: str, is_make: bool = True):
-        self.outputs_dir = outputs_dir
-        self.logs_dir = os.path.join(outputs_dir, OutputsStructure.LOGS_DIR)
-        self.predictions_dir = os.path.join(outputs_dir, OutputsStructure.PREDICTIONS_DIR)
-        self.reviews_dir = os.path.join(outputs_dir, OutputsStructure.REVIEWS_DIR)
-        self.reports_dir = os.path.join(outputs_dir, OutputsStructure.REPORTS_DIR)
-        self.configs_dir = os.path.join(outputs_dir, OutputsStructure.CONFIGS_DIR)
-
-        if is_make:
-            self.create_directories()
-
-    def create_directories(self):
-        os.makedirs(self.outputs_dir, exist_ok=True)
-        os.makedirs(self.logs_dir, exist_ok=True)
-        os.makedirs(self.predictions_dir, exist_ok=True)
-        os.makedirs(self.reviews_dir, exist_ok=True)
-        os.makedirs(self.reports_dir, exist_ok=True)
-        os.makedirs(self.configs_dir, exist_ok=True)
-
-
 class AnswerKeys:
     ANSWER_ID = 'answer_id'
     RAW_INPUT = 'raw_input'
@@ -166,17 +139,30 @@ class EvalType:
 
 
 class EvalBackend:
-    # Use native evaluation pipeline of EvalScope
-    NATIVE = 'Native'
 
-
-
+    class _Backend:
+        # compatible with old version, set 'value'
+
+        def __init__(self, value):
+            self._value = value
+
+        @property
+        def value(self):
+            return self._value
+
+        def __str__(self):
+            return self._value
 
-
-
+        def __repr__(self):
+            return f"'{self._value}'"
 
-
-
+        def __eq__(self, other):
+            if isinstance(other, str):
+                return self._value == other
+            return NotImplemented
 
-
-
+    NATIVE = _Backend('Native')
+    OPEN_COMPASS = _Backend('OpenCompass')
+    VLM_EVAL_KIT = _Backend('VLMEvalKit')
+    RAG_EVAL = _Backend('RAGEval')
+    THIRD_PARTY = _Backend('ThirdParty')
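The `_Backend` wrapper exists purely for backward compatibility: old code that accessed `EvalBackend.NATIVE.value` keeps working, while new code can compare the members directly against plain strings. A quick sketch of both styles:

```python
from evalscope.constants import EvalBackend

assert EvalBackend.NATIVE == 'Native'                    # __eq__ against a plain string
assert EvalBackend.OPEN_COMPASS.value == 'OpenCompass'   # old .value access still works
print(str(EvalBackend.VLM_EVAL_KIT))                     # VLMEvalKit
print(repr(EvalBackend.RAG_EVAL))                        # 'RAGEval'
```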
evalscope/evaluator/__init__.py
CHANGED
evalscope/evaluator/evaluator.py
CHANGED
@@ -11,10 +11,11 @@ from typing import Any, Dict, List, Optional, Union
 from evalscope.benchmarks import DataAdapter
 from evalscope.config import TaskConfig
 from evalscope.constants import (DEFAULT_DATASET_CACHE_DIR, AnswerKeys, DumpMode, EvalStage, EvalType, HubType,
-
+                                 ReviewKeys)
 from evalscope.models.model_adapter import BaseModelAdapter, CustomModelAdapter
 from evalscope.tools.combine_reports import gen_table
-from evalscope.utils import dict_torch_dtype_to_str,
+from evalscope.utils import dict_torch_dtype_to_str, gen_hash
+from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -56,8 +57,8 @@ class Evaluator(object):
                  **kwargs):
 
         self.dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
-        self.dataset_name = os.path.basename(self.dataset_name_or_path.rstrip(os.sep))
-        self.model_name =
+        self.dataset_name = os.path.basename(self.dataset_name_or_path.rstrip(os.sep)).split('.')[0]
+        self.model_name = overall_task_cfg.model_id
         self.custom_task_name = f'{self.model_name}_{self.dataset_name}'
 
         self.datasets_dir = os.path.expanduser(datasets_dir)
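Two small but user-visible changes here: the evaluator's `model_name` now comes from `overall_task_cfg.model_id`, and `dataset_name` strips a file extension, so a local JSONL path produces a clean task name. The expression used above, checked in isolation:

```python
import os

path = '/data/eval/general_qa.jsonl'  # placeholder local dataset file
name = os.path.basename(path.rstrip(os.sep)).split('.')[0]
print(name)  # general_qa -- the '.jsonl' suffix no longer ends up in the task name
```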
evalscope/evaluator/rating_eval.py
CHANGED
@@ -5,8 +5,8 @@ import pyarrow as pa
 from typing import List, Union
 
 from evalscope.constants import MetricMembers
-from evalscope.utils import jsonl_to_list
 from evalscope.utils.arena_utils import compute_elo
+from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
evalscope/evaluator/reviewer/auto_reviewer.py
CHANGED
@@ -12,8 +12,9 @@ from typing import Any, List
 
 from evalscope.constants import ArenaMode, EvalConfigKeys, FnCompletionParser, PositionBiasMitigation
 from evalscope.models.openai_model import OpenAIModel
-from evalscope.utils import completion_parsers,
+from evalscope.utils import completion_parsers, random_seeded_choice
 from evalscope.utils.arena_utils import get_battle_pairs, merge_ques_ans, shuffle_pairwise_preferences
+from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
evalscope/perf/arguments.py
CHANGED
@@ -16,7 +16,7 @@ class Arguments:
     attn_implementation: Optional[str] = None  # Attention implementaion, only for local inference
     api: str = 'openai'  # API to be used (default: 'openai')
     tokenizer_path: Optional[str] = None  # Path to the tokenizer
-    port:
+    port: int = 8877  # Port number for the local API server
 
     # Connection settings
     url: str = 'http://127.0.0.1:8877/v1/chat/completions'  # URL for the API connection
@@ -138,6 +138,7 @@ def add_argument(parser: argparse.ArgumentParser):
 
     # Connection settings
     parser.add_argument('--url', type=str, default='http://127.0.0.1:8877/v1/chat/completions')
+    parser.add_argument('--port', type=int, default=8877, help='The port for local inference')
     parser.add_argument('--headers', nargs='+', dest='headers', action=ParseKVAction, help='Extra HTTP headers')
     parser.add_argument('--api-key', type=str, required=False, default='EMPTY', help='The API key for authentication')
     parser.add_argument('--connect-timeout', type=int, default=120, help='The network connection timeout')
evalscope/perf/benchmark.py
CHANGED
@@ -195,9 +195,9 @@ async def start_server(args: Arguments) -> bool:
     server.start()
 
     if args.dataset.startswith('speed_benchmark'):
-        args.url = 'http://127.0.0.1:
+        args.url = f'http://127.0.0.1:{args.port}/v1/completions'
     else:
-        args.url = 'http://127.0.0.1:
+        args.url = f'http://127.0.0.1:{args.port}/v1/chat/completions'
 
     if not await test_connection(args):
         raise TimeoutError('Test connection failed')
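Together with the new `port` field in `Arguments` and the `--port` CLI flag, the local-server URL is no longer hard-coded to 8877. A hedged sketch of what this enables (assuming the remaining `Arguments` fields keep their defaults; model and dataset names are placeholders):

```python
from evalscope.perf.arguments import Arguments

# Hypothetical local run on a non-default port; only the fields relevant here are set.
args = Arguments(model='models/qwen2-0.5b-instruct', dataset='openqa', port=8006)

# start_server() now formats the URL from args.port, e.g.
#   http://127.0.0.1:8006/v1/chat/completions   (regular datasets)
#   http://127.0.0.1:8006/v1/completions        (speed_benchmark datasets)
```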
evalscope/perf/main.py
CHANGED
@@ -8,7 +8,7 @@ from evalscope.perf.arguments import Arguments, parse_args
 from evalscope.perf.benchmark import benchmark
 from evalscope.perf.utils.db_util import get_output_path
 from evalscope.perf.utils.handler import add_signal_handlers
-from evalscope.utils.logger import get_logger
+from evalscope.utils.logger import configure_logging, get_logger
 from evalscope.utils.utils import seed_everything
 
 logger = get_logger()
@@ -23,10 +23,7 @@ def run_perf_benchmark(args):
 
     # Setup logger and output
     args.outputs_dir = get_output_path(args)
-
-
-    if args.debug:
-        get_logger(log_level=logging.DEBUG, force=True)
+    configure_logging(args.debug, os.path.join(args.outputs_dir, 'benchmark.log'))
 
     logger.info('Starting benchmark...')
     logger.info(args)
evalscope/perf/plugin/api/openai_api.py
CHANGED
@@ -1,7 +1,7 @@
 import json
 import os
 from transformers import AutoTokenizer
-from typing import Any, Dict, Iterator, List
+from typing import Any, Dict, Iterator, List, Union
 
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.api.base import ApiPluginBase
@@ -29,7 +29,7 @@ class OpenaiPlugin(ApiPluginBase):
         else:
             self.tokenizer = None
 
-    def build_request(self, messages: List[Dict]
+    def build_request(self, messages: Union[List[Dict], str], param: Arguments) -> Dict:
         """Build the openai format request based on prompt, dataset
 
         Args:
evalscope/perf/plugin/registry.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import Any, List, Type
+from typing import Any, List, Type, Union
 
 
 class PluginRegistry:
@@ -20,7 +20,7 @@ class PluginRegistry:
         return self.get_class(name)
 
 
-def register_dataset(name: str
+def register_dataset(name: Union[str, List[str]]):
 
     def class_decorator(cls: Type):
         if isinstance(name, str):
@@ -35,7 +35,7 @@ def register_dataset(name: str | List[str]):
     return class_decorator
 
 
-def register_api(name: str
+def register_api(name: Union[str, List[str]]):
 
     def class_decorator(cls: Type):
         if isinstance(name, str):
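With the widened `Union[str, List[str]]` signature, a plugin can be registered under several aliases in one decorator call. A hedged sketch (the plugin classes and names below are hypothetical, not part of the package):

```python
from evalscope.perf.plugin.registry import register_api, register_dataset

@register_dataset('my_jsonl')            # single name, as before
class MyJsonlDatasetPlugin:
    ...

@register_api(['my_api', 'my_api_v2'])   # a list now registers every alias
class MyApiPlugin:
    ...
```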
evalscope/perf/utils/benchmark_util.py
CHANGED
@@ -116,19 +116,19 @@ class BenchmarkMetrics:
 
     def create_message(self, default_ndigits=3):
         message = {
-            'Time taken for tests (
+            'Time taken for tests (s)': round(self.total_time, default_ndigits),
             'Number of concurrency': self.concurrency,
             'Total requests': int(self.n_total_queries),
             'Succeed requests': self.n_succeed_queries,
             'Failed requests': self.n_failed_queries,
+            'Throughput(average tokens/s)': round(self.avg_token_per_seconds, default_ndigits),
             'Average QPS': round(self.qps, default_ndigits),
             'Average latency (s)': round(self.avg_latency, default_ndigits),
             'Average time to first token (s)': round(self.avg_first_chunk_latency, default_ndigits),
             'Average time per output token (s)': round(self.avg_time_per_token, 5),
-            'Average package latency (s)': round(self.avg_chunk_time, default_ndigits),
-            'Average package per request': round(self.n_avg_chunks, default_ndigits),
-            'Throughput(average output tokens per second)': round(self.avg_token_per_seconds, default_ndigits),
             'Average input tokens per request': round(self.avg_prompt_tokens, default_ndigits),
             'Average output tokens per request': round(self.avg_completion_tokens, default_ndigits),
+            'Average package latency (s)': round(self.avg_chunk_time, default_ndigits),
+            'Average package per request': round(self.n_avg_chunks, default_ndigits),
         }
         return message
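The net effect on the perf summary: the elapsed-time key now carries an explicit `(s)` unit, the throughput entry is renamed to `Throughput(average tokens/s)` and moves up next to the request counters, and the package metrics move to the end. Roughly, the rendered mapping now looks like this (all values are placeholders):

```python
message = {
    'Time taken for tests (s)': 10.58,
    'Number of concurrency': 1,
    'Total requests': 15,
    'Succeed requests': 15,
    'Failed requests': 0,
    'Throughput(average tokens/s)': 324.059,
    'Average QPS': 1.417,
    'Average latency (s)': 0.697,
    'Average time to first token (s)': 0.697,
    'Average time per output token (s)': 0.00309,
    'Average input tokens per request': 50.133,
    'Average output tokens per request': 228.667,
    'Average package latency (s)': 0.003,
    'Average package per request': 228.667,
}
```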