evalscope 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +3 -0
- evalscope/backend/__init__.py +3 -0
- evalscope/backend/base.py +27 -0
- evalscope/backend/opencompass/__init__.py +3 -0
- evalscope/backend/opencompass/api_meta_template.py +64 -0
- evalscope/backend/opencompass/backend_manager.py +247 -0
- evalscope/backend/opencompass/tasks/__init__.py +1 -0
- evalscope/backend/opencompass/tasks/eval_api.py +30 -0
- evalscope/backend/opencompass/tasks/eval_datasets.py +71 -0
- evalscope/backend/vlm_eval_kit/__init__.py +1 -0
- evalscope/backend/vlm_eval_kit/backend_manager.py +153 -0
- evalscope/benchmarks/__init__.py +4 -0
- evalscope/benchmarks/arc/__init__.py +5 -0
- evalscope/benchmarks/arc/ai2_arc.py +148 -0
- evalscope/benchmarks/arc/arc_adapter.py +231 -0
- evalscope/benchmarks/bbh/__init__.py +6 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +308 -0
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +23 -0
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +25 -0
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +33 -0
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +72 -0
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +44 -0
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +78 -0
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +28 -0
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +42 -0
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +25 -0
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +43 -0
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +41 -0
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +63 -0
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +44 -0
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +40 -0
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +30 -0
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +10 -0
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +77 -0
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +40 -0
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +40 -0
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +40 -0
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +28 -0
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +17 -0
- evalscope/benchmarks/benchmark.py +65 -0
- evalscope/benchmarks/ceval/__init__.py +5 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +340 -0
- evalscope/benchmarks/ceval/ceval_exam.py +159 -0
- evalscope/benchmarks/cmmlu/__init__.py +5 -0
- evalscope/benchmarks/cmmlu/cmmlu.py +166 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +369 -0
- evalscope/benchmarks/competition_math/__init__.py +5 -0
- evalscope/benchmarks/competition_math/competition_math.py +88 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +470 -0
- evalscope/benchmarks/data_adapter.py +263 -0
- evalscope/benchmarks/general_qa/__init__.py +5 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +186 -0
- evalscope/benchmarks/gsm8k/__init__.py +5 -0
- evalscope/benchmarks/gsm8k/gsm8k.py +127 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +236 -0
- evalscope/benchmarks/hellaswag/__init__.py +5 -0
- evalscope/benchmarks/hellaswag/hellaswag.py +116 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +222 -0
- evalscope/benchmarks/humaneval/__init__.py +5 -0
- evalscope/benchmarks/humaneval/humaneval.py +82 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +21 -0
- evalscope/benchmarks/mmlu/__init__.py +5 -0
- evalscope/benchmarks/mmlu/mmlu.py +174 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +375 -0
- evalscope/benchmarks/race/__init__.py +5 -0
- evalscope/benchmarks/race/race.py +118 -0
- evalscope/benchmarks/race/race_adapter.py +229 -0
- evalscope/benchmarks/trivia_qa/__init__.py +5 -0
- evalscope/benchmarks/trivia_qa/trivia_qa.py +104 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +207 -0
- evalscope/benchmarks/truthful_qa/__init__.py +5 -0
- evalscope/benchmarks/truthful_qa/truthful_qa.py +167 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +351 -0
- evalscope/cache.py +98 -0
- evalscope/cli/__init__.py +1 -0
- evalscope/cli/base.py +20 -0
- evalscope/cli/cli.py +26 -0
- evalscope/cli/start_perf.py +37 -0
- evalscope/cli/start_server.py +138 -0
- evalscope/config.py +165 -0
- evalscope/constants.py +150 -0
- evalscope/evaluator/__init__.py +3 -0
- evalscope/evaluator/evaluator.py +689 -0
- evalscope/evaluator/rating_eval.py +178 -0
- evalscope/evaluator/reviewer/__init__.py +1 -0
- evalscope/evaluator/reviewer/auto_reviewer.py +411 -0
- evalscope/metrics/__init__.py +1 -0
- evalscope/metrics/bundled_rouge_score/__init__.py +14 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +342 -0
- evalscope/metrics/code_metric.py +104 -0
- evalscope/metrics/math_accuracy.py +60 -0
- evalscope/metrics/metrics.py +405 -0
- evalscope/metrics/rouge_metric.py +129 -0
- evalscope/models/__init__.py +4 -0
- evalscope/models/custom/__init__.py +4 -0
- evalscope/models/custom/custom_model.py +53 -0
- evalscope/models/dummy_chat_model.py +50 -0
- evalscope/models/model.py +88 -0
- evalscope/models/model_adapter.py +586 -0
- evalscope/models/openai_model.py +103 -0
- evalscope/models/template.py +1446 -0
- evalscope/perf/__init__.py +0 -0
- evalscope/perf/_logging.py +32 -0
- evalscope/perf/api_plugin_base.py +60 -0
- evalscope/perf/custom_api.py +87 -0
- evalscope/perf/dashscope_api.py +84 -0
- evalscope/perf/dataset_plugin_base.py +64 -0
- evalscope/perf/datasets/__init__.py +0 -0
- evalscope/perf/datasets/line_by_line.py +18 -0
- evalscope/perf/datasets/longalpaca_12k.py +20 -0
- evalscope/perf/datasets/openqa.py +22 -0
- evalscope/perf/how_to_analysis_result.py +24 -0
- evalscope/perf/http_client.py +756 -0
- evalscope/perf/openai_api.py +130 -0
- evalscope/perf/plugin_registry.py +35 -0
- evalscope/perf/query_parameters.py +42 -0
- evalscope/perf/server_sent_event.py +43 -0
- evalscope/preprocess/__init__.py +1 -0
- evalscope/preprocess/tokenizers/__init__.py +0 -0
- evalscope/preprocess/tokenizers/gpt2_tokenizer.py +221 -0
- evalscope/registry/__init__.py +1 -0
- evalscope/registry/tasks/arc.yaml +29 -0
- evalscope/registry/tasks/bbh.yaml +27 -0
- evalscope/registry/tasks/bbh_mini.yaml +27 -0
- evalscope/registry/tasks/ceval.yaml +27 -0
- evalscope/registry/tasks/ceval_mini.yaml +27 -0
- evalscope/registry/tasks/cmmlu.yaml +27 -0
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +28 -0
- evalscope/registry/tasks/general_qa.yaml +27 -0
- evalscope/registry/tasks/gsm8k.yaml +29 -0
- evalscope/registry/tasks/mmlu.yaml +29 -0
- evalscope/registry/tasks/mmlu_mini.yaml +27 -0
- evalscope/run.py +404 -0
- evalscope/run_arena.py +204 -0
- evalscope/run_ms.py +140 -0
- evalscope/summarizer.py +144 -0
- evalscope/third_party/__init__.py +1 -0
- evalscope/third_party/toolbench_static/__init__.py +3 -0
- evalscope/third_party/toolbench_static/eval.py +219 -0
- evalscope/third_party/toolbench_static/infer.py +278 -0
- evalscope/third_party/toolbench_static/llm/__init__.py +1 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +45 -0
- evalscope/third_party/toolbench_static/toolbench_static.py +50 -0
- evalscope/tools/__init__.py +1 -0
- evalscope/tools/combine_reports.py +140 -0
- evalscope/tools/gen_mmlu_subject_mapping.py +90 -0
- evalscope/tools/rewrite_eval_results.py +95 -0
- evalscope/utils/__init__.py +4 -0
- evalscope/utils/arena_utils.py +247 -0
- evalscope/utils/completion_parsers.py +87 -0
- evalscope/utils/logger.py +64 -0
- evalscope/utils/task_cfg_parser.py +10 -0
- evalscope/utils/task_utils.py +19 -0
- evalscope/utils/utils.py +625 -0
- evalscope/version.py +4 -0
- evalscope-0.5.0.dist-info/METADATA +566 -0
- evalscope-0.5.0.dist-info/RECORD +165 -0
- evalscope-0.5.0.dist-info/WHEEL +5 -0
- evalscope-0.5.0.dist-info/entry_points.txt +3 -0
- evalscope-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,37 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from abc import abstractmethod
+import os, sys, time
+from argparse import ArgumentParser
+import subprocess
+
+
+from evalscope.cli.base import CLICommand
+from evalscope.perf.http_client import add_argument, run_perf_benchmark
+
+current_path = os.path.dirname(os.path.abspath(__file__))
+root_path = os.path.dirname(current_path)
+def subparser_func(args):
+    """ Function which will be called for a specific sub parser.
+    """
+    return PerfBenchCMD(args)
+
+class PerfBenchCMD(CLICommand):
+    name = 'perf'
+
+    def __init__(self, args):
+        self.args = args
+
+    @staticmethod
+    def define_args(parsers: ArgumentParser):
+        """ define args for create pipeline template command.
+        """
+        parser = parsers.add_parser(PerfBenchCMD.name)
+        add_argument(parser)
+        parser.set_defaults(func=subparser_func)
+
+    def execute(self):
+        run_perf_benchmark(self.args)
+
+
+
+
evalscope/cli/start_server.py
ADDED
@@ -0,0 +1,138 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os, sys, time
+from argparse import ArgumentParser
+import subprocess
+
+
+from evalscope.cli.base import CLICommand
+
+
+current_path = os.path.dirname(os.path.abspath(__file__))
+print(current_path)
+root_path = os.path.dirname(current_path)
+print(root_path)
+
+def subparser_func(args):
+    """ Function which will be called for a specific sub parser.
+    """
+    return PerfServerCMD(args)
+
+def add_perf_args(parser):
+    parser.add_argument(
+        '--server-command', required=True, type=str, help='The start server command.')
+    parser.add_argument(
+        '--logdir', required=True, type=str, help='The monitor log save dir, tensorboard start at this path for display!')
+    parser.add_argument(
+        '--host', type=str, default='0.0.0.0', help='The tensorboard host'
+    )
+    parser.add_argument(
+        '--tensorboard-port', type=str, default='6006', help='The tensorboard port'
+    )
+
+def async_run_command_with_popen(cmd):
+    sub_process = subprocess.Popen(
+        cmd,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        bufsize=1,
+        universal_newlines=True,
+        encoding='utf8')
+    return sub_process
+
+def start_monitor(args):
+    cmd = ['python',
+           '%s/perf/monitor.py'%root_path,
+           '--logdir',
+           args.logdir]
+    print(cmd)
+    p = async_run_command_with_popen(cmd)
+    os.set_blocking(p.stdout.fileno(), False)
+    return p
+
+def start_tensorboard(args):
+    cmd = ['tensorboard',
+           '--logdir',
+           args.logdir,
+           '--host',
+           args.host,
+           '--port',
+           args.tensorboard_port
+           ]
+    p = async_run_command_with_popen(cmd)
+    os.set_blocking(p.stdout.fileno(), False)
+    return p
+
+def start_server(args):
+    cmd = args.server_command
+    print(cmd)
+    sub_process = subprocess.Popen(
+        cmd,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        bufsize=1,
+        shell=True,
+        universal_newlines=True,
+        encoding='utf8')
+
+    os.set_blocking(sub_process.stdout.fileno(), False)
+    return sub_process
+
+
+def wait_for_workers(workers):
+    while True:
+        for idx, worker in enumerate(workers):
+            if worker is None:
+                continue
+            # check worker is completed.
+            if worker.poll() is None:
+                for line in iter(worker.stdout.readline, ''):
+                    if line != '':
+                        sys.stdout.write(line)
+                    else:
+                        break
+            else:
+                print('Worker %s completed!'%idx)
+                for line in iter(worker.stdout.readline, ''):
+                    if line != '':
+                        sys.stdout.write(line)
+                    else:
+                        break
+                workers[idx] = None
+
+        is_all_completed = True
+        for idx, worker in enumerate(workers):
+            if worker is not None:
+                is_all_completed = False
+                break
+
+        if is_all_completed:
+            break
+        time.sleep(0.1)
+
+class PerfServerCMD(CLICommand):
+    name = 'server'
+
+    def __init__(self, args):
+        self.args = args
+
+    @staticmethod
+    def define_args(parsers: ArgumentParser):
+        """ define args for create pipeline template command.
+        """
+        parser = parsers.add_parser(PerfServerCMD.name)
+        add_perf_args(parser)
+        parser.set_defaults(func=subparser_func)
+
+    def execute(self):
+        # start monitor
+        p_monitor = start_monitor(self.args)
+        # start tensorboard
+        p_tensorboard = start_tensorboard(self.args)
+        # start server
+        p_server = start_server(self.args)
+
+        wait_for_workers([p_monitor, p_tensorboard, p_server])
+
+
+
+
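The two hunks above add the `perf` and `server` subcommands (evalscope/cli/start_perf.py and evalscope/cli/start_server.py in the file list). Each `CLICommand` subclass registers its own subparser in `define_args` and binds `func` to a factory that returns the command instance. The top-level dispatcher lives in evalscope/cli/cli.py, whose body is not shown in this diff; the sketch below is only an assumption about how such commands are typically wired together, not the package's actual cli.py.

# Hypothetical wiring sketch (not taken from evalscope/cli/cli.py): dispatch
# follows from the code shown above, where subparser_func(args) returns the
# command object and execute() runs it.
from argparse import ArgumentParser

from evalscope.cli.start_perf import PerfBenchCMD
from evalscope.cli.start_server import PerfServerCMD


def main():
    parser = ArgumentParser('evalscope command line interface')
    subparsers = parser.add_subparsers(dest='command')

    # Each subcommand adds its own options and calls
    # parser.set_defaults(func=subparser_func) on its subparser.
    PerfBenchCMD.define_args(subparsers)
    PerfServerCMD.define_args(subparsers)

    args = parser.parse_args()
    # args.func(args) builds the command instance; execute() runs it.
    args.func(args).execute()


if __name__ == '__main__':
    main()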
evalscope/config.py
ADDED
@@ -0,0 +1,165 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os
+import copy
+from dataclasses import dataclass, asdict, field
+from typing import Optional, List
+
+from evalscope.constants import DEFAULT_ROOT_CACHE_DIR
+from evalscope.models.custom import CustomModel
+from evalscope.utils import yaml_to_dict
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+cur_path = os.path.dirname(os.path.abspath(__file__))
+
+registry_tasks = {
+    'arc': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/arc.yaml')),
+    'gsm8k': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/gsm8k.yaml')),
+    'mmlu': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/mmlu.yaml')),
+    'cmmlu': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/cmmlu.yaml')),
+    'ceval': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/ceval.yaml')),
+    'bbh': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/bbh.yaml')),
+    'general_qa': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/general_qa.yaml')),
+
+    # 'bbh_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/bbh_mini.yaml')),
+    # 'mmlu_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/mmlu_mini.yaml')),
+    # 'ceval_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/ceval_mini.yaml')),
+
+}
+
+
+@dataclass
+class TaskConfig:
+    model_args: Optional[dict] = field(default_factory=dict)
+    generation_config: Optional[dict] = field(default_factory=dict)
+    dataset_args: Optional[dict] = field(default_factory=dict)
+    dry_run: bool = False
+    model: CustomModel = None
+    eval_type: str = 'custom'
+    datasets: list = field(default_factory=list)
+    work_dir: str = DEFAULT_ROOT_CACHE_DIR
+    outputs: str = None
+    mem_cache: bool = False
+    use_cache: bool = True
+    stage: str = 'all' # `all` or `infer` or `review`
+    dataset_hub: str = 'ModelScope'
+    dataset_dir: str = DEFAULT_ROOT_CACHE_DIR
+    limit: int = None
+    eval_backend: str = 'Native'
+    eval_config: dict = field(default_factory=dict)
+
+    # def __post_init__(self):
+    #     self.registry_tasks = {
+    #         'arc': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/arc.yaml')),
+    #         'gsm8k': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/gsm8k.yaml')),
+    #         'mmlu': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/mmlu.yaml')),
+    #         'ceval': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/ceval.yaml')),
+    #         'bbh': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/bbh.yaml')),
+    #
+    #         'bbh_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/bbh_mini.yaml')),
+    #         'mmlu_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/mmlu_mini.yaml')),
+    #         'ceval_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/ceval_mini.yaml')),
+    #
+    #     }
+
+    @staticmethod
+    def registry(name: str, data_pattern: str, dataset_dir: str = None, subset_list: list = None) -> None:
+        """
+        Register a new task (dataset) for evaluation.
+
+        Args:
+            name: str, the dataset name.
+            data_pattern: str, the data pattern for the task.
+                e.g. `mmlu`, `ceval`, `gsm8k`, ...
+                refer to task_config.list() for all available datasets.
+            dataset_dir: str, the directory to store multiple datasets files. e.g. /path/to/data,
+                then your specific custom dataset directory will be /path/to/data/{name}
+            subset_list: list, the subset list for the dataset.
+                e.g. ['middle_school_politics', 'operating_system']
+                refer to the mmlu for example. https://github.com/hendrycks/test/blob/master/categories.py
+        """
+        available_datasets = list(registry_tasks.keys())
+        if data_pattern not in available_datasets:
+            logger.error(f'No dataset found in available datasets: {available_datasets}, got data_pattern: {data_pattern}')
+            return
+
+        # Reuse the existing task config and update the datasets
+        pattern_config = registry_tasks.get(data_pattern)
+
+        custom_config = copy.deepcopy(pattern_config)
+        custom_config.update({'datasets': [data_pattern]})
+        custom_config.update({'dataset_hub': 'Local'}) # TODO: to support `ModelScope`
+        if 'dataset_args' in custom_config:
+            if data_pattern not in custom_config:
+                custom_config['dataset_args'].update({data_pattern: {}})
+        else:
+            custom_config.update({'dataset_args': {data_pattern: {}}})
+
+        if dataset_dir is not None:
+            custom_config['dataset_args'][data_pattern].update({'local_path': dataset_dir})
+
+        if subset_list is not None:
+            # custom_config['dataset_args'].get(data_pattern, {}).update({'subset_list': subset_list})
+            custom_config['dataset_args'][data_pattern].update({'subset_list': subset_list})
+
+        registry_tasks.update({name: custom_config})
+        logger.info(f'** Registered task: {name} with data pattern: {data_pattern}')
+
+    def to_dict(self):
+        # Note: to avoid serialization error for some model instance
+        _tmp_model = copy.copy(self.model)
+        self.model = None
+        res_dict = asdict(self)
+        res_dict.update({'model': _tmp_model})
+        self.model = _tmp_model
+
+        return res_dict
+
+    @staticmethod
+    def load(custom_model: CustomModel, tasks: List[str]) -> List['TaskConfig']:
+        res_list = []
+        for task_name in tasks:
+            task: dict = registry_tasks.get(task_name, None)
+            if task is None:
+                logger.error(f'No task found in tasks: {list(registry_tasks.keys())}, got task_name: {task_name}')
+                continue
+
+            res = TaskConfig(**task)
+            res.model = custom_model
+            if res.outputs is None:
+                res.outputs = os.path.join(res.work_dir,
+                                           'outputs',
+                                           f"eval_{'-'.join(tasks)}_{res.model.config['model_id']}_{res.model_args.get('revision', 'default')}")
+            res_list.append(res)
+
+        return res_list
+
+    @staticmethod
+    def list():
+        return list(registry_tasks.keys())
+
+
+class TempModel(CustomModel):
+
+    def __init__(self, config: dict):
+        super().__init__(config=config)
+
+    def predict(self, prompts: str, **kwargs):
+        return [item + ': response' for item in prompts]
+
+
+if __name__ == '__main__':
+    model = TempModel(config={'model_id': 'test-swift-dummy-model'})
+    task_config = TaskConfig()
+
+    # Register a new task
+    TaskConfig.registry(name='arc_swift', data_pattern='arc', dataset_dir='/path/to/swift_custom_work')
+
+    import json
+    swift_eval_task: List[TaskConfig] = TaskConfig.load(custom_model=model, tasks=['gsm8k', 'arc', 'arc_swift'])
+    for item in swift_eval_task:
+        print(item.to_dict())
+        print()
+
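evalscope/config.py keeps a module-level `registry_tasks` dict seeded from the bundled YAML task files, and `TaskConfig.registry` clones one of those entries under a new name, switches `dataset_hub` to 'Local', and points its `dataset_args` at a local directory and optional subset list. A minimal sketch of the resulting entry follows; the dataset directory and subset name are illustrative placeholders, not values taken from the package.

# Illustrative sketch of what TaskConfig.registry produces, following the
# code above. Paths and subset names here are made up.
from evalscope.config import TaskConfig, registry_tasks

TaskConfig.registry(name='arc_local',
                    data_pattern='arc',
                    dataset_dir='/path/to/data',
                    subset_list=['ARC-Easy'])

entry = registry_tasks['arc_local']
print(entry['datasets'])     # ['arc']
print(entry['dataset_hub'])  # 'Local'
# dataset_args['arc'] now carries the overrides applied above:
# {'local_path': '/path/to/data', 'subset_list': ['ARC-Easy']}
print(entry['dataset_args']['arc'])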
evalscope/constants.py
ADDED
@@ -0,0 +1,150 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from enum import Enum
+
+DEFAULT_ROOT_CACHE_DIR = '~/.cache/evalscope'
+
+
+class DumpMode:
+    OVERWRITE = 'overwrite'
+    APPEND = 'append'
+
+
+class MetricsConstant:
+    EPSILON = float(1e-6)
+    INVALID_VALUE = -9999999
+    ROUGE_KEYS = [
+        'rouge-1-r',
+        'rouge-1-p',
+        'rouge-1-f',
+        'rouge-2-r',
+        'rouge-2-p',
+        'rouge-2-f',
+        'rouge-l-r',
+        'rouge-l-p',
+        'rouge-l-f',
+    ]
+
+
+class MetricMembers(Enum):
+
+    # Math accuracy metric
+    MATH_ACCURACY = 'math_accuracy'
+
+    # Code pass@k metric
+    CODE_PASS_K = 'code_pass_k'
+
+    # Code rouge metric
+    ROUGE = 'rouge'
+
+    # ELO rating system for pairwise comparison
+    ELO = 'elo'
+
+    # Pairwise comparison win/lose and tie(optional)
+    PAIRWISE = 'pairwise'
+
+    # Rating score for single model
+    SCORE = 'score'
+
+
+class ArenaWinner:
+
+    MODEL_A = 'model_a'
+
+    MODEL_B = 'model_b'
+
+    TIE = 'tie'
+
+    TIE_BOTH_BAD = 'tie_both_bad'
+
+    UNKNOWN = 'unknown'
+
+
+class ArenaMode:
+    SINGLE = 'single'
+    PAIRWISE = 'pairwise'
+    PAIRWISE_BASELINE = 'pairwise_baseline'
+
+
+class OutputsStructure:
+
+    LOGS_DIR = 'logs_dir'
+
+    PREDICTIONS_DIR = 'predictions_dir'
+
+    REVIEWS_DIR = 'reviews_dir'
+
+    REPORTS_DIR = 'reports_dir'
+
+    CONFIGS_DIR = 'configs_dir'
+
+
+class AnswerKeys:
+
+    ANSWER_ID = 'answer_id'
+
+    RAW_INPUT = 'raw_input'
+
+    ORIGIN_PROMPT = 'origin_prompt'
+
+    MODEL_SPEC = 'model_spec'
+
+    SUBSET_NAME = 'subset_name'
+
+    CHOICES = 'choices'
+
+
+class ReviewKeys:
+
+    REVIEW_ID = 'review_id'
+
+    REVIEWED = 'reviewed'
+
+    REVIEWER_SPEC = 'reviewer_spec'
+
+    REVIEW_TIME = 'review_time'
+
+    MESSAGE = 'message'
+
+    CONTENT = 'content'
+
+    GOLD = 'gold'
+
+    PRED = 'pred'
+
+    RESULT = 'result'
+
+    REVIEW = 'review'
+
+
+class EvalConfigKeys:
+    CLASS_REF = 'ref'
+    CLASS_ARGS = 'args'
+    ENABLE = 'enable'
+    POSITION_BIAS_MITIGATION = 'position_bias_mitigation'
+    RANDOM_SEED = 'random_seed'
+    FN_COMPLETION_PARSER = 'fn_completion_parser'
+    COMPLETION_PARSER_KWARGS = 'completion_parser_kwargs'
+    OUTPUT_FILE = 'output_file'
+    MODEL_ID_OR_PATH = 'model_id_or_path'
+    MODEL_REVISION = 'revision'
+    GENERATION_CONFIG = 'generation_config'
+    PRECISION = 'precision'
+    TEMPLATE_TYPE = 'template_type'
+
+
+class FnCompletionParser:
+    LMSYS_PARSER: str = 'lmsys_parser'
+    RANKING_PARSER: str = 'ranking_parser'
+
+
+class PositionBiasMitigation:
+    NONE = 'none'
+    RANDOMIZE_ORDER = 'randomize_order'
+    SWAP_POSITION = 'swap_position'
+
+
+class EvalStage:
+    # Enums: `all`, `infer`, `review`
+    ALL = 'all'
+    INFER = 'infer'
+    REVIEW = 'review'