evalscope 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. evalscope/__init__.py +3 -0
  2. evalscope/backend/__init__.py +3 -0
  3. evalscope/backend/base.py +27 -0
  4. evalscope/backend/opencompass/__init__.py +3 -0
  5. evalscope/backend/opencompass/api_meta_template.py +64 -0
  6. evalscope/backend/opencompass/backend_manager.py +247 -0
  7. evalscope/backend/opencompass/tasks/__init__.py +1 -0
  8. evalscope/backend/opencompass/tasks/eval_api.py +30 -0
  9. evalscope/backend/opencompass/tasks/eval_datasets.py +71 -0
  10. evalscope/backend/vlm_eval_kit/__init__.py +1 -0
  11. evalscope/backend/vlm_eval_kit/backend_manager.py +153 -0
  12. evalscope/benchmarks/__init__.py +4 -0
  13. evalscope/benchmarks/arc/__init__.py +5 -0
  14. evalscope/benchmarks/arc/ai2_arc.py +148 -0
  15. evalscope/benchmarks/arc/arc_adapter.py +231 -0
  16. evalscope/benchmarks/bbh/__init__.py +6 -0
  17. evalscope/benchmarks/bbh/bbh_adapter.py +308 -0
  18. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +23 -0
  19. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +25 -0
  20. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +33 -0
  21. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +37 -0
  22. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +72 -0
  23. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +44 -0
  24. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +78 -0
  25. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +28 -0
  26. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +37 -0
  27. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +37 -0
  28. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +37 -0
  29. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +42 -0
  30. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +25 -0
  31. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +43 -0
  32. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +37 -0
  33. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +41 -0
  34. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +63 -0
  35. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +44 -0
  36. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +40 -0
  37. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +30 -0
  38. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +10 -0
  39. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +77 -0
  40. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +40 -0
  41. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +40 -0
  42. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +40 -0
  43. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +28 -0
  44. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +17 -0
  45. evalscope/benchmarks/benchmark.py +65 -0
  46. evalscope/benchmarks/ceval/__init__.py +5 -0
  47. evalscope/benchmarks/ceval/ceval_adapter.py +340 -0
  48. evalscope/benchmarks/ceval/ceval_exam.py +159 -0
  49. evalscope/benchmarks/cmmlu/__init__.py +5 -0
  50. evalscope/benchmarks/cmmlu/cmmlu.py +166 -0
  51. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +369 -0
  52. evalscope/benchmarks/competition_math/__init__.py +5 -0
  53. evalscope/benchmarks/competition_math/competition_math.py +88 -0
  54. evalscope/benchmarks/competition_math/competition_math_adapter.py +470 -0
  55. evalscope/benchmarks/data_adapter.py +263 -0
  56. evalscope/benchmarks/general_qa/__init__.py +5 -0
  57. evalscope/benchmarks/general_qa/general_qa_adapter.py +186 -0
  58. evalscope/benchmarks/gsm8k/__init__.py +5 -0
  59. evalscope/benchmarks/gsm8k/gsm8k.py +127 -0
  60. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +236 -0
  61. evalscope/benchmarks/hellaswag/__init__.py +5 -0
  62. evalscope/benchmarks/hellaswag/hellaswag.py +116 -0
  63. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +222 -0
  64. evalscope/benchmarks/humaneval/__init__.py +5 -0
  65. evalscope/benchmarks/humaneval/humaneval.py +82 -0
  66. evalscope/benchmarks/humaneval/humaneval_adapter.py +21 -0
  67. evalscope/benchmarks/mmlu/__init__.py +5 -0
  68. evalscope/benchmarks/mmlu/mmlu.py +174 -0
  69. evalscope/benchmarks/mmlu/mmlu_adapter.py +375 -0
  70. evalscope/benchmarks/race/__init__.py +5 -0
  71. evalscope/benchmarks/race/race.py +118 -0
  72. evalscope/benchmarks/race/race_adapter.py +229 -0
  73. evalscope/benchmarks/trivia_qa/__init__.py +5 -0
  74. evalscope/benchmarks/trivia_qa/trivia_qa.py +104 -0
  75. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +207 -0
  76. evalscope/benchmarks/truthful_qa/__init__.py +5 -0
  77. evalscope/benchmarks/truthful_qa/truthful_qa.py +167 -0
  78. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +351 -0
  79. evalscope/cache.py +98 -0
  80. evalscope/cli/__init__.py +1 -0
  81. evalscope/cli/base.py +20 -0
  82. evalscope/cli/cli.py +26 -0
  83. evalscope/cli/start_perf.py +37 -0
  84. evalscope/cli/start_server.py +138 -0
  85. evalscope/config.py +165 -0
  86. evalscope/constants.py +150 -0
  87. evalscope/evaluator/__init__.py +3 -0
  88. evalscope/evaluator/evaluator.py +689 -0
  89. evalscope/evaluator/rating_eval.py +178 -0
  90. evalscope/evaluator/reviewer/__init__.py +1 -0
  91. evalscope/evaluator/reviewer/auto_reviewer.py +411 -0
  92. evalscope/metrics/__init__.py +1 -0
  93. evalscope/metrics/bundled_rouge_score/__init__.py +14 -0
  94. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +342 -0
  95. evalscope/metrics/code_metric.py +104 -0
  96. evalscope/metrics/math_accuracy.py +60 -0
  97. evalscope/metrics/metrics.py +405 -0
  98. evalscope/metrics/rouge_metric.py +129 -0
  99. evalscope/models/__init__.py +4 -0
  100. evalscope/models/custom/__init__.py +4 -0
  101. evalscope/models/custom/custom_model.py +53 -0
  102. evalscope/models/dummy_chat_model.py +50 -0
  103. evalscope/models/model.py +88 -0
  104. evalscope/models/model_adapter.py +586 -0
  105. evalscope/models/openai_model.py +103 -0
  106. evalscope/models/template.py +1446 -0
  107. evalscope/perf/__init__.py +0 -0
  108. evalscope/perf/_logging.py +32 -0
  109. evalscope/perf/api_plugin_base.py +60 -0
  110. evalscope/perf/custom_api.py +87 -0
  111. evalscope/perf/dashscope_api.py +84 -0
  112. evalscope/perf/dataset_plugin_base.py +64 -0
  113. evalscope/perf/datasets/__init__.py +0 -0
  114. evalscope/perf/datasets/line_by_line.py +18 -0
  115. evalscope/perf/datasets/longalpaca_12k.py +20 -0
  116. evalscope/perf/datasets/openqa.py +22 -0
  117. evalscope/perf/how_to_analysis_result.py +24 -0
  118. evalscope/perf/http_client.py +756 -0
  119. evalscope/perf/openai_api.py +130 -0
  120. evalscope/perf/plugin_registry.py +35 -0
  121. evalscope/perf/query_parameters.py +42 -0
  122. evalscope/perf/server_sent_event.py +43 -0
  123. evalscope/preprocess/__init__.py +1 -0
  124. evalscope/preprocess/tokenizers/__init__.py +0 -0
  125. evalscope/preprocess/tokenizers/gpt2_tokenizer.py +221 -0
  126. evalscope/registry/__init__.py +1 -0
  127. evalscope/registry/tasks/arc.yaml +29 -0
  128. evalscope/registry/tasks/bbh.yaml +27 -0
  129. evalscope/registry/tasks/bbh_mini.yaml +27 -0
  130. evalscope/registry/tasks/ceval.yaml +27 -0
  131. evalscope/registry/tasks/ceval_mini.yaml +27 -0
  132. evalscope/registry/tasks/cmmlu.yaml +27 -0
  133. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +28 -0
  134. evalscope/registry/tasks/general_qa.yaml +27 -0
  135. evalscope/registry/tasks/gsm8k.yaml +29 -0
  136. evalscope/registry/tasks/mmlu.yaml +29 -0
  137. evalscope/registry/tasks/mmlu_mini.yaml +27 -0
  138. evalscope/run.py +404 -0
  139. evalscope/run_arena.py +204 -0
  140. evalscope/run_ms.py +140 -0
  141. evalscope/summarizer.py +144 -0
  142. evalscope/third_party/__init__.py +1 -0
  143. evalscope/third_party/toolbench_static/__init__.py +3 -0
  144. evalscope/third_party/toolbench_static/eval.py +219 -0
  145. evalscope/third_party/toolbench_static/infer.py +278 -0
  146. evalscope/third_party/toolbench_static/llm/__init__.py +1 -0
  147. evalscope/third_party/toolbench_static/llm/swift_infer.py +45 -0
  148. evalscope/third_party/toolbench_static/toolbench_static.py +50 -0
  149. evalscope/tools/__init__.py +1 -0
  150. evalscope/tools/combine_reports.py +140 -0
  151. evalscope/tools/gen_mmlu_subject_mapping.py +90 -0
  152. evalscope/tools/rewrite_eval_results.py +95 -0
  153. evalscope/utils/__init__.py +4 -0
  154. evalscope/utils/arena_utils.py +247 -0
  155. evalscope/utils/completion_parsers.py +87 -0
  156. evalscope/utils/logger.py +64 -0
  157. evalscope/utils/task_cfg_parser.py +10 -0
  158. evalscope/utils/task_utils.py +19 -0
  159. evalscope/utils/utils.py +625 -0
  160. evalscope/version.py +4 -0
  161. evalscope-0.5.0.dist-info/METADATA +566 -0
  162. evalscope-0.5.0.dist-info/RECORD +165 -0
  163. evalscope-0.5.0.dist-info/WHEEL +5 -0
  164. evalscope-0.5.0.dist-info/entry_points.txt +3 -0
  165. evalscope-0.5.0.dist-info/top_level.txt +1 -0
evalscope/cli/start_perf.py ADDED
@@ -0,0 +1,37 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ from abc import abstractmethod
+ import os, sys, time
+ from argparse import ArgumentParser
+ import subprocess
+
+
+ from evalscope.cli.base import CLICommand
+ from evalscope.perf.http_client import add_argument, run_perf_benchmark
+
+ current_path = os.path.dirname(os.path.abspath(__file__))
+ root_path = os.path.dirname(current_path)
+ def subparser_func(args):
+     """ Function which will be called for a specific sub parser.
+     """
+     return PerfBenchCMD(args)
+
+ class PerfBenchCMD(CLICommand):
+     name = 'perf'
+
+     def __init__(self, args):
+         self.args = args
+
+     @staticmethod
+     def define_args(parsers: ArgumentParser):
+         """ define args for create pipeline template command.
+         """
+         parser = parsers.add_parser(PerfBenchCMD.name)
+         add_argument(parser)
+         parser.set_defaults(func=subparser_func)
+
+     def execute(self):
+         run_perf_benchmark(self.args)
+
+
+
+
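Each CLI subcommand follows the same pattern: a CLICommand subclass registers its own subparser in define_args and binds subparser_func via set_defaults, so the top-level CLI can dispatch generically. The top-level wiring in evalscope/cli/cli.py is not shown in this diff, so the dispatcher below is only a sketch of how that pattern is typically driven, not the package's actual code.

# Hypothetical dispatcher sketch -- evalscope/cli/cli.py itself is not shown in this diff.
from argparse import ArgumentParser

from evalscope.cli.start_perf import PerfBenchCMD
from evalscope.cli.start_server import PerfServerCMD


def main():
    parser = ArgumentParser('evalscope command line tool')
    subparsers = parser.add_subparsers(help='subcommands')
    # Each command registers its own subparser and binds `func` to its factory.
    PerfBenchCMD.define_args(subparsers)
    PerfServerCMD.define_args(subparsers)

    args = parser.parse_args()
    # set_defaults(func=subparser_func) means args.func(args) builds the command object.
    cmd = args.func(args)
    cmd.execute()


if __name__ == '__main__':
    main()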
evalscope/cli/start_server.py ADDED
@@ -0,0 +1,138 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ import os, sys, time
+ from argparse import ArgumentParser
+ import subprocess
+
+
+ from evalscope.cli.base import CLICommand
+
+
+ current_path = os.path.dirname(os.path.abspath(__file__))
+ print(current_path)
+ root_path = os.path.dirname(current_path)
+ print(root_path)
+
+ def subparser_func(args):
+     """ Function which will be called for a specific sub parser.
+     """
+     return PerfServerCMD(args)
+
+ def add_perf_args(parser):
+     parser.add_argument(
+         '--server-command', required=True, type=str, help='The start server command.')
+     parser.add_argument(
+         '--logdir', required=True, type=str, help='The monitor log save dir, tensorboard start at this path for display!')
+     parser.add_argument(
+         '--host', type=str, default='0.0.0.0', help='The tensorboard host'
+     )
+     parser.add_argument(
+         '--tensorboard-port', type=str, default='6006', help='The tensorboard port'
+     )
+
+ def async_run_command_with_popen(cmd):
+     sub_process = subprocess.Popen(
+         cmd,
+         stdout=subprocess.PIPE,
+         stderr=subprocess.STDOUT,
+         bufsize=1,
+         universal_newlines=True,
+         encoding='utf8')
+     return sub_process
+
+ def start_monitor(args):
+     cmd = ['python',
+            '%s/perf/monitor.py'%root_path,
+            '--logdir',
+            args.logdir]
+     print(cmd)
+     p = async_run_command_with_popen(cmd)
+     os.set_blocking(p.stdout.fileno(), False)
+     return p
+
+ def start_tensorboard(args):
+     cmd = ['tensorboard',
+            '--logdir',
+            args.logdir,
+            '--host',
+            args.host,
+            '--port',
+            args.tensorboard_port
+     ]
+     p = async_run_command_with_popen(cmd)
+     os.set_blocking(p.stdout.fileno(), False)
+     return p
+
+ def start_server(args):
+     cmd = args.server_command
+     print(cmd)
+     sub_process = subprocess.Popen(
+         cmd,
+         stdout=subprocess.PIPE,
+         stderr=subprocess.STDOUT,
+         bufsize=1,
+         shell=True,
+         universal_newlines=True,
+         encoding='utf8')
+
+     os.set_blocking(sub_process.stdout.fileno(), False)
+     return sub_process
+
+
+ def wait_for_workers(workers):
+     while True:
+         for idx, worker in enumerate(workers):
+             if worker is None:
+                 continue
+             # check worker is completed.
+             if worker.poll() is None:
+                 for line in iter(worker.stdout.readline, ''):
+                     if line != '':
+                         sys.stdout.write(line)
+                     else:
+                         break
+             else:
+                 print('Worker %s completed!'%idx)
+                 for line in iter(worker.stdout.readline, ''):
+                     if line != '':
+                         sys.stdout.write(line)
+                     else:
+                         break
+                 workers[idx] = None
+
+         is_all_completed = True
+         for idx, worker in enumerate(workers):
+             if worker is not None:
+                 is_all_completed = False
+                 break
+
+         if is_all_completed:
+             break
+         time.sleep(0.1)
+
+ class PerfServerCMD(CLICommand):
+     name = 'server'
+
+     def __init__(self, args):
+         self.args = args
+
+     @staticmethod
+     def define_args(parsers: ArgumentParser):
+         """ define args for create pipeline template command.
+         """
+         parser = parsers.add_parser(PerfServerCMD.name)
+         add_perf_args(parser)
+         parser.set_defaults(func=subparser_func)
+
+     def execute(self):
+         # start monitor
+         p_monitor = start_monitor(self.args)
+         # start tensorboard
+         p_tensorboard = start_tensorboard(self.args)
+         # start server
+         p_server = start_server(self.args)
+
+         wait_for_workers([p_monitor, p_tensorboard, p_server])
+
+
+
+
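Concretely, the `server` subcommand spawns three child processes and multiplexes their output. The sketch below spells out, with placeholder values, roughly what those three commands look like; the monitor path comes from start_monitor above, and the server command is whatever the user passes via --server-command.

# Illustrative only: the three processes PerfServerCMD.execute() launches, written out
# with placeholder values. <root_path> stands for the installed evalscope package directory.
monitor_cmd = ['python', '<root_path>/perf/monitor.py', '--logdir', './perf_logs']
tensorboard_cmd = ['tensorboard', '--logdir', './perf_logs',
                   '--host', '0.0.0.0', '--port', '6006']
server_cmd = 'python -m my_inference_server --port 8000'  # hypothetical --server-command value

# Each runs via subprocess.Popen with non-blocking stdout; wait_for_workers() then polls
# all three, echoing their output until every process has exited.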
evalscope/config.py ADDED
@@ -0,0 +1,165 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ import os
+ import copy
+ from dataclasses import dataclass, asdict, field
+ from typing import Optional, List
+
+ from evalscope.constants import DEFAULT_ROOT_CACHE_DIR
+ from evalscope.models.custom import CustomModel
+ from evalscope.utils import yaml_to_dict
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ cur_path = os.path.dirname(os.path.abspath(__file__))
+
+ registry_tasks = {
+     'arc': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/arc.yaml')),
+     'gsm8k': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/gsm8k.yaml')),
+     'mmlu': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/mmlu.yaml')),
+     'cmmlu': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/cmmlu.yaml')),
+     'ceval': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/ceval.yaml')),
+     'bbh': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/bbh.yaml')),
+     'general_qa': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/general_qa.yaml')),
+
+     # 'bbh_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/bbh_mini.yaml')),
+     # 'mmlu_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/mmlu_mini.yaml')),
+     # 'ceval_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/ceval_mini.yaml')),
+
+ }
+
+
+ @dataclass
+ class TaskConfig:
+     model_args: Optional[dict] = field(default_factory=dict)
+     generation_config: Optional[dict] = field(default_factory=dict)
+     dataset_args: Optional[dict] = field(default_factory=dict)
+     dry_run: bool = False
+     model: CustomModel = None
+     eval_type: str = 'custom'
+     datasets: list = field(default_factory=list)
+     work_dir: str = DEFAULT_ROOT_CACHE_DIR
+     outputs: str = None
+     mem_cache: bool = False
+     use_cache: bool = True
+     stage: str = 'all' # `all` or `infer` or `review`
+     dataset_hub: str = 'ModelScope'
+     dataset_dir: str = DEFAULT_ROOT_CACHE_DIR
+     limit: int = None
+     eval_backend: str = 'Native'
+     eval_config: dict = field(default_factory=dict)
+
+     # def __post_init__(self):
+     #     self.registry_tasks = {
+     #         'arc': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/arc.yaml')),
+     #         'gsm8k': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/gsm8k.yaml')),
+     #         'mmlu': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/mmlu.yaml')),
+     #         'ceval': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/ceval.yaml')),
+     #         'bbh': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/bbh.yaml')),
+     #
+     #         'bbh_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/bbh_mini.yaml')),
+     #         'mmlu_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/mmlu_mini.yaml')),
+     #         'ceval_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/ceval_mini.yaml')),
+     #
+     #     }
+
+     @staticmethod
+     def registry(name: str, data_pattern: str, dataset_dir: str = None, subset_list: list = None) -> None:
+         """
+         Register a new task (dataset) for evaluation.
+
+         Args:
+             name: str, the dataset name.
+             data_pattern: str, the data pattern for the task.
+                 e.g. `mmlu`, `ceval`, `gsm8k`, ...
+                 refer to task_config.list() for all available datasets.
+             dataset_dir: str, the directory to store multiple datasets files. e.g. /path/to/data,
+                 then your specific custom dataset directory will be /path/to/data/{name}
+             subset_list: list, the subset list for the dataset.
+                 e.g. ['middle_school_politics', 'operating_system']
+                 refer to the mmlu for example. https://github.com/hendrycks/test/blob/master/categories.py
+         """
+         available_datasets = list(registry_tasks.keys())
+         if data_pattern not in available_datasets:
+             logger.error(f'No dataset found in available datasets: {available_datasets}, got data_pattern: {data_pattern}')
+             return
+
+         # Reuse the existing task config and update the datasets
+         pattern_config = registry_tasks.get(data_pattern)
+
+         custom_config = copy.deepcopy(pattern_config)
+         custom_config.update({'datasets': [data_pattern]})
+         custom_config.update({'dataset_hub': 'Local'}) # TODO: to support `ModelScope`
+         if 'dataset_args' in custom_config:
+             if data_pattern not in custom_config:
+                 custom_config['dataset_args'].update({data_pattern: {}})
+         else:
+             custom_config.update({'dataset_args': {data_pattern: {}}})
+
+         if dataset_dir is not None:
+             custom_config['dataset_args'][data_pattern].update({'local_path': dataset_dir})
+
+         if subset_list is not None:
+             # custom_config['dataset_args'].get(data_pattern, {}).update({'subset_list': subset_list})
+             custom_config['dataset_args'][data_pattern].update({'subset_list': subset_list})
+
+         registry_tasks.update({name: custom_config})
+         logger.info(f'** Registered task: {name} with data pattern: {data_pattern}')
+
+     def to_dict(self):
+         # Note: to avoid serialization error for some model instance
+         _tmp_model = copy.copy(self.model)
+         self.model = None
+         res_dict = asdict(self)
+         res_dict.update({'model': _tmp_model})
+         self.model = _tmp_model
+
+         return res_dict
+
+     @staticmethod
+     def load(custom_model: CustomModel, tasks: List[str]) -> List['TaskConfig']:
+         res_list = []
+         for task_name in tasks:
+             task: dict = registry_tasks.get(task_name, None)
+             if task is None:
+                 logger.error(f'No task found in tasks: {list(registry_tasks.keys())}, got task_name: {task_name}')
+                 continue
+
+             res = TaskConfig(**task)
+             res.model = custom_model
+             if res.outputs is None:
+                 res.outputs = os.path.join(res.work_dir,
+                                            'outputs',
+                                            f"eval_{'-'.join(tasks)}_{res.model.config['model_id']}_{res.model_args.get('revision', 'default')}")
+             res_list.append(res)
+
+         return res_list
+
+     @staticmethod
+     def list():
+         return list(registry_tasks.keys())
+
+
+ class TempModel(CustomModel):
+
+     def __init__(self, config: dict):
+         super().__init__(config=config)
+
+     def predict(self, prompts: str, **kwargs):
+         return [item + ': response' for item in prompts]
+
+
+ if __name__ == '__main__':
+     model = TempModel(config={'model_id': 'test-swift-dummy-model'})
+     task_config = TaskConfig()
+
+     # Register a new task
+     TaskConfig.registry(name='arc_swift', data_pattern='arc', dataset_dir='/path/to/swift_custom_work')
+
+     import json
+     swift_eval_task: List[TaskConfig] = TaskConfig.load(custom_model=model, tasks=['gsm8k', 'arc', 'arc_swift'])
+     for item in swift_eval_task:
+         print(item.to_dict())
+         print()
+
evalscope/constants.py ADDED
@@ -0,0 +1,150 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ from enum import Enum
+
+ DEFAULT_ROOT_CACHE_DIR = '~/.cache/evalscope'
+
+
+ class DumpMode:
+     OVERWRITE = 'overwrite'
+     APPEND = 'append'
+
+
+ class MetricsConstant:
+     EPSILON = float(1e-6)
+     INVALID_VALUE = -9999999
+     ROUGE_KEYS = [
+         'rouge-1-r',
+         'rouge-1-p',
+         'rouge-1-f',
+         'rouge-2-r',
+         'rouge-2-p',
+         'rouge-2-f',
+         'rouge-l-r',
+         'rouge-l-p',
+         'rouge-l-f',
+     ]
+
+
+ class MetricMembers(Enum):
+
+     # Math accuracy metric
+     MATH_ACCURACY = 'math_accuracy'
+
+     # Code pass@k metric
+     CODE_PASS_K = 'code_pass_k'
+
+     # Code rouge metric
+     ROUGE = 'rouge'
+
+     # ELO rating system for pairwise comparison
+     ELO = 'elo'
+
+     # Pairwise comparison win/lose and tie(optional)
+     PAIRWISE = 'pairwise'
+
+     # Rating score for single model
+     SCORE = 'score'
+
+
+ class ArenaWinner:
+
+     MODEL_A = 'model_a'
+
+     MODEL_B = 'model_b'
+
+     TIE = 'tie'
+
+     TIE_BOTH_BAD = 'tie_both_bad'
+
+     UNKNOWN = 'unknown'
+
+
+ class ArenaMode:
+     SINGLE = 'single'
+     PAIRWISE = 'pairwise'
+     PAIRWISE_BASELINE = 'pairwise_baseline'
+
+
+ class OutputsStructure:
+
+     LOGS_DIR = 'logs_dir'
+
+     PREDICTIONS_DIR = 'predictions_dir'
+
+     REVIEWS_DIR = 'reviews_dir'
+
+     REPORTS_DIR = 'reports_dir'
+
+     CONFIGS_DIR = 'configs_dir'
+
+
+ class AnswerKeys:
+
+     ANSWER_ID = 'answer_id'
+
+     RAW_INPUT = 'raw_input'
+
+     ORIGIN_PROMPT = 'origin_prompt'
+
+     MODEL_SPEC = 'model_spec'
+
+     SUBSET_NAME = 'subset_name'
+
+     CHOICES = 'choices'
+
+
+ class ReviewKeys:
+
+     REVIEW_ID = 'review_id'
+
+     REVIEWED = 'reviewed'
+
+     REVIEWER_SPEC = 'reviewer_spec'
+
+     REVIEW_TIME = 'review_time'
+
+     MESSAGE = 'message'
+
+     CONTENT = 'content'
+
+     GOLD = 'gold'
+
+     PRED = 'pred'
+
+     RESULT = 'result'
+
+     REVIEW = 'review'
+
+
+ class EvalConfigKeys:
+     CLASS_REF = 'ref'
+     CLASS_ARGS = 'args'
+     ENABLE = 'enable'
+     POSITION_BIAS_MITIGATION = 'position_bias_mitigation'
+     RANDOM_SEED = 'random_seed'
+     FN_COMPLETION_PARSER = 'fn_completion_parser'
+     COMPLETION_PARSER_KWARGS = 'completion_parser_kwargs'
+     OUTPUT_FILE = 'output_file'
+     MODEL_ID_OR_PATH = 'model_id_or_path'
+     MODEL_REVISION = 'revision'
+     GENERATION_CONFIG = 'generation_config'
+     PRECISION = 'precision'
+     TEMPLATE_TYPE = 'template_type'
+
+
+ class FnCompletionParser:
+     LMSYS_PARSER: str = 'lmsys_parser'
+     RANKING_PARSER: str = 'ranking_parser'
+
+
+ class PositionBiasMitigation:
+     NONE = 'none'
+     RANDOMIZE_ORDER = 'randomize_order'
+     SWAP_POSITION = 'swap_position'
+
+
+ class EvalStage:
+     # Enums: `all`, `infer`, `review`
+     ALL = 'all'
+     INFER = 'infer'
+     REVIEW = 'review'
evalscope/evaluator/__init__.py ADDED
@@ -0,0 +1,3 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ from evalscope.evaluator.evaluator import Evaluator