evalscope 0.16.3__py3-none-any.whl → 0.17.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope has been flagged as potentially problematic by the registry.
- evalscope/app/app.py +9 -762
- evalscope/app/constants.py +1 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +52 -0
- evalscope/app/ui/multi_model.py +323 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +202 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +178 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +91 -0
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/backend_manager.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
- evalscope/benchmarks/__init__.py +15 -1
- evalscope/benchmarks/aime/aime24_adapter.py +2 -1
- evalscope/benchmarks/aime/aime25_adapter.py +2 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/utils.py +0 -12
- evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
- evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
- evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
- evalscope/benchmarks/data_adapter.py +29 -9
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
- evalscope/benchmarks/general_arena/utils.py +226 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +44 -30
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +118 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
- evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
- evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
- evalscope/benchmarks/mmlu/mmlu_adapter.py +2 -2
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/benchmarks/race/race_adapter.py +1 -1
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
- evalscope/benchmarks/utils.py +2 -2
- evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
- evalscope/config.py +8 -123
- evalscope/constants.py +5 -21
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +20 -15
- evalscope/metrics/__init__.py +9 -1
- evalscope/{utils/utils.py → metrics/completion_parsers.py} +71 -176
- evalscope/metrics/llm_judge.py +106 -20
- evalscope/metrics/metrics.py +20 -8
- evalscope/models/__init__.py +4 -8
- evalscope/models/adapters/__init__.py +4 -9
- evalscope/models/adapters/base_adapter.py +4 -0
- evalscope/models/adapters/bfcl_adapter.py +2 -0
- evalscope/models/adapters/chat_adapter.py +3 -0
- evalscope/models/adapters/choice_adapter.py +4 -0
- evalscope/models/adapters/custom_adapter.py +7 -3
- evalscope/models/adapters/server_adapter.py +4 -2
- evalscope/models/adapters/t2i_adapter.py +3 -0
- evalscope/models/adapters/tau_bench_adapter.py +189 -0
- evalscope/models/custom/dummy_model.py +3 -3
- evalscope/models/register.py +0 -14
- evalscope/perf/arguments.py +15 -16
- evalscope/perf/benchmark.py +38 -39
- evalscope/perf/http_client.py +30 -86
- evalscope/perf/main.py +3 -3
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +22 -4
- evalscope/perf/plugin/api/custom_api.py +212 -55
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +105 -0
- evalscope/perf/plugin/api/openai_api.py +17 -19
- evalscope/perf/plugin/datasets/__init__.py +10 -7
- evalscope/perf/plugin/datasets/base.py +22 -1
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +4 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +2 -1
- evalscope/perf/plugin/datasets/random_dataset.py +15 -4
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +14 -20
- evalscope/perf/utils/db_util.py +79 -61
- evalscope/report/__init__.py +1 -1
- evalscope/report/utils.py +34 -15
- evalscope/run.py +1 -1
- evalscope/summarizer.py +1 -2
- evalscope/utils/__init__.py +63 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/import_utils.py +16 -0
- evalscope/utils/io_utils.py +55 -4
- evalscope/utils/model_utils.py +37 -1
- evalscope/version.py +2 -2
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/METADATA +100 -51
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/RECORD +129 -133
- tests/aigc/test_t2i.py +1 -1
- tests/cli/test_all.py +68 -4
- tests/cli/test_collection.py +1 -1
- tests/cli/test_custom.py +261 -0
- tests/cli/test_run.py +34 -70
- tests/perf/test_perf.py +31 -4
- tests/rag/test_clip_benchmark.py +2 -1
- tests/rag/test_mteb.py +3 -1
- tests/rag/test_ragas.py +3 -1
- tests/swift/test_run_swift_eval.py +2 -1
- tests/swift/test_run_swift_vlm_eval.py +2 -1
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
- tests/utils.py +13 -0
- tests/vlm/test_vlmeval.py +8 -2
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/models/model.py +0 -189
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- /evalscope/{utils → benchmarks}/filters.py +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/LICENSE +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/WHEEL +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/top_level.txt +0 -0
evalscope/config.py
CHANGED
@@ -1,7 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
 import copy
-import json
 import os
 from argparse import Namespace
 from dataclasses import dataclass, field
@@ -10,18 +9,15 @@ from typing import Dict, List, Optional, Union
 from evalscope.constants import (DEFAULT_DATASET_CACHE_DIR, DEFAULT_WORK_DIR, EvalBackend, EvalStage, EvalType, HubType,
                                  JudgeStrategy, ModelTask, OutputType)
 from evalscope.models import CustomModel, DummyCustomModel
-from evalscope.utils import
-from evalscope.utils.io_utils import dict_to_yaml,
+from evalscope.utils.argument_utils import BaseArgument, parse_int_or_float
+from evalscope.utils.io_utils import dict_to_yaml, gen_hash
 from evalscope.utils.logger import get_logger
-from evalscope.utils.utils import parse_int_or_float
 
 logger = get_logger()
 
-cur_path = os.path.dirname(os.path.abspath(__file__))
-
 
 @dataclass
-class TaskConfig:
+class TaskConfig(BaseArgument):
     # Model-related arguments
     model: Union[str, 'CustomModel', None] = None
     model_id: Optional[str] = None
@@ -132,15 +128,6 @@ class TaskConfig:
         'precision': 'torch.float16',
     }
 
-    def to_dict(self):
-        result = self.__dict__.copy()
-        if isinstance(self.model, CustomModel):
-            result['model'] = self.model.__class__.__name__
-        return result
-
-    def __str__(self):
-        return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
-
     def update(self, other: Union['TaskConfig', dict]):
         if isinstance(other, TaskConfig):
            other = other.to_dict()
@@ -155,91 +142,11 @@ class TaskConfig:
         except Exception as e:
             logger.warning(f'Failed to dump overall task config: {e}')
 
-
-
-
-
-    def from_yaml(yaml_file: str):
-        return TaskConfig.from_dict(yaml_to_dict(yaml_file))
-
-    @staticmethod
-    def from_dict(d: dict):
-        return TaskConfig(**d)
-
-    @staticmethod
-    def from_json(json_file: str):
-        return TaskConfig.from_dict(json_to_dict(json_file))
-
-    @staticmethod
-    def from_args(args: Namespace):
-        # Convert Namespace to a dictionary and filter out None values
-        args_dict = {k: v for k, v in vars(args).items() if v is not None}
-
-        if 'func' in args_dict:
-            del args_dict['func']  # Note: compat CLI arguments
-
-        return TaskConfig.from_dict(args_dict)
-
-    @staticmethod
-    def load(custom_model: CustomModel, tasks: List[str]) -> List['TaskConfig']:
-        res_list = []
-        for task_name in tasks:
-            task = registry_tasks.get(task_name, None)
-            if task is None:
-                logger.error(f'No task found in tasks: {list(registry_tasks.keys())}, got task_name: {task_name}')
-                continue
-
-            task.model = custom_model
-            task.model_args = custom_model.config
-            task.model_id = type(custom_model).__name__
-            res_list.append(task)
-
-        return res_list
-
-    @staticmethod
-    def registry(name: str, data_pattern: str, dataset_dir: str = None, subset_list: list = None) -> None:
-        """
-        Register a new task (dataset) for evaluation.
-
-        Args:
-            name: str, the dataset name.
-            data_pattern: str, the data pattern for the task.
-                    e.g. `mmlu`, `ceval`, `gsm8k`, ...
-                    refer to task_config.list() for all available datasets.
-            dataset_dir: str, the directory to store multiple datasets files. e.g. /path/to/data,
-                    then your specific custom dataset directory will be /path/to/data/{name}
-            subset_list: list, the subset list for the dataset.
-                    e.g. ['middle_school_politics', 'operating_system']
-                    refer to the mmlu for example. https://github.com/hendrycks/test/blob/master/categories.py
-        """
-        available_datasets = list(registry_tasks.keys())
-        if data_pattern not in available_datasets:
-            logger.error(
-                f'No dataset found in available datasets: {available_datasets}, got data_pattern: {data_pattern}')
-            return
-
-        # Reuse the existing task config and update the datasets
-        pattern_config = registry_tasks[data_pattern]
-
-        custom_config = copy.deepcopy(pattern_config)
-        custom_config.datasets = [data_pattern]
-        custom_config.dataset_args = {data_pattern: {}}
-        custom_config.eval_type = EvalType.CHECKPOINT
-
-        if dataset_dir is not None:
-            custom_config.dataset_args[data_pattern].update({'local_path': dataset_dir})
-
-        if subset_list is not None:
-            custom_config.dataset_args[data_pattern].update({'subset_list': subset_list})
-
-        registry_tasks.update({name: custom_config})
-        logger.info(f'** Registered task: {name} with data pattern: {data_pattern}')
-
-
-tasks = ['arc', 'gsm8k', 'mmlu', 'cmmlu', 'ceval', 'bbh', 'general_qa']
-
-registry_tasks = {task: TaskConfig.from_yaml(os.path.join(cur_path, f'registry/tasks/{task}.yaml')) for task in tasks}
+    def to_dict(self):
+        result = self.__dict__.copy()
+        if isinstance(self.model, CustomModel):
+            result['model'] = self.model.__class__.__name__
+        return result
 
 
 def parse_task_config(task_cfg) -> TaskConfig:
@@ -264,25 +171,3 @@ def parse_task_config(task_cfg) -> TaskConfig:
     else:
         raise ValueError('Args: Please provide a valid task config.')
     return task_cfg
-
-
-class TempModel(CustomModel):
-
-    def __init__(self, config: dict):
-        super().__init__(config=config)
-
-    def predict(self, prompts: str, **kwargs):
-        return [item + ': response' for item in prompts]
-
-
-if __name__ == '__main__':
-    model = TempModel(config={'model_id': 'test-swift-dummy-model'})
-    task_config = TaskConfig()
-
-    # Register a new task
-    TaskConfig.registry(name='arc_swift', data_pattern='arc', dataset_dir='/path/to/swift_custom_work')
-
-    swift_eval_task: List[TaskConfig] = TaskConfig.load(custom_model=model, tasks=['gsm8k', 'arc', 'arc_swift'])
-    for item in swift_eval_task:
-        print(item)
-        print()
evalscope/constants.py
CHANGED
@@ -41,27 +41,6 @@ class MetricsConstant:
     ]
 
 
-class MetricMembers:
-
-    # Math accuracy metric
-    MATH_ACCURACY = 'math_accuracy'
-
-    # Code pass@k metric
-    CODE_PASS_K = 'code_pass_k'
-
-    # Code rouge metric
-    ROUGE = 'rouge'
-
-    # ELO rating system for pairwise comparison
-    ELO = 'elo'
-
-    # Pairwise comparison win/lose and tie(optional)
-    PAIRWISE = 'pairwise'
-
-    # Rating score for single model
-    SCORE = 'score'
-
-
 class ArenaWinner:
 
     MODEL_A = 'model_a'
@@ -172,6 +151,11 @@ class JudgeStrategy:
     LLM_RECALL = 'llm_recall'
 
 
+class JudgeScoreType:
+    NUMERIC = 'numeric'  # numeric score
+    PATTERN = 'pattern'  # pattern matching score
+
+
 class ModelTask:
     TEXT_GENERATION = 'text_generation'
     IMAGE_GENERATION = 'image_generation'
evalscope/evaluator/__init__.py
CHANGED
evalscope/evaluator/evaluator.py
CHANGED
@@ -7,16 +7,18 @@ from collections import OrderedDict, defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from copy import deepcopy
 from tqdm import tqdm
-from typing import Any, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
 
 from evalscope.benchmarks import DataAdapter
 from evalscope.config import TaskConfig
 from evalscope.constants import AnswerKeys, DumpMode, EvalStage, EvalType, JudgeStrategy, ReviewKeys
-from evalscope.models import BaseModelAdapter
 from evalscope.report import Report, gen_table
-from evalscope.utils import
-from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
+from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, gen_hash, jsonl_to_list
 from evalscope.utils.logger import get_logger
+from evalscope.utils.model_utils import dict_torch_dtype_to_str
+
+if TYPE_CHECKING:
+    from evalscope.models import BaseModelAdapter
 
 logger = get_logger()
 
@@ -38,7 +40,7 @@ class Evaluator(object):
 
     def __init__(self,
                  data_adapter: DataAdapter,
-                 model_adapter: BaseModelAdapter,
+                 model_adapter: 'BaseModelAdapter',
                  outputs: OutputsStructure = None,
                  task_cfg: TaskConfig = None,
                  **kwargs):
@@ -237,9 +239,10 @@ class Evaluator(object):
         if use_llm:
             # Use LLM as judge
             assert self.judge is not None, f'Judge model is required for LLM judging {self.data_adapter.name}'
+            pred_content = self.data_adapter.llm_parse_pred_result(
+                result=answer_content, raw_input_d=raw_input_d, eval_type=self.eval_type)
             review_result = self.data_adapter.llm_match(
-                gold_content,
-                pred = answer_content
+                gold_content, pred_content, self.judge, raw_input=raw_input_d)
         else:
             # Use rule-based judging
             pred_content = self.data_adapter.parse_pred_result(
@@ -250,15 +253,14 @@ class Evaluator(object):
         if (self.task_cfg.judge_strategy == JudgeStrategy.LLM_RECALL
                 and isinstance(review_result, (bool, int, float)) and not bool(review_result)):
             assert self.judge is not None, f'Judge model is required for LLM_RECALL strategy {self.data_adapter.name}'  # noqa: E501
+            pred_content = self.data_adapter.llm_parse_pred_result(
+                result=answer_content, raw_input_d=raw_input_d, eval_type=self.eval_type)
             review_result = self.data_adapter.llm_match(
-                gold_content,
-                pred = answer_content
-        else:
-            pred = pred_content
+                gold_content, pred_content, self.judge, raw_input=raw_input_d)
 
         choice[ReviewKeys.REVIEW] = {
             ReviewKeys.GOLD: gold_content if gold_content != raw_input_d else '*Same as Input*',
-            ReviewKeys.PRED:
+            ReviewKeys.PRED: pred_content,
             ReviewKeys.RESULT: review_result
         }
         rev_choices.append(choice)
@@ -394,9 +396,6 @@ class Evaluator(object):
         report_map: Report = self.data_adapter.gen_report(
             subset_score_map=reviews_score_all, model_name=self.model_name)
 
-        # Post process report
-        self.data_adapter.post_process_report(report_map, report_path=report_path)
-
         # Make table
         try:
             report_table = gen_table(report_list=[report_map], add_overall_metric=True)
@@ -418,6 +417,12 @@ class Evaluator(object):
         report_map.to_json(report_file)
         logger.info(f'Dump report to: {report_file} \n')
 
+        # Post process report
+        try:
+            self.data_adapter.post_process_report(report_map, report_path=report_path)
+        except Exception as e:
+            logger.error(f'Failed to post process report: {e}')
+
         return report_map
 
     def eval(self, **kwargs) -> dict:
evalscope/metrics/__init__.py
CHANGED
@@ -4,7 +4,8 @@ from typing import TYPE_CHECKING
 from evalscope.utils.import_utils import _LazyModule
 
 if TYPE_CHECKING:
-    from .
+    from .completion_parsers import ResponseParser, lmsys_parser, ranking_parser
+    from .llm_judge import DEFAULT_NUMERIC_SCORE_TEMPLATE, DEFAULT_PROMPT_TEMPLATE, LLMJudge
     from .math_parser import extract_answer, math_equal, strip_answer_string
     from .metrics import (bleu_ngram_one_sample, exact_match, macro_mean, mean, micro_mean, simple_f1_score,
                           weighted_mean)
@@ -33,12 +34,19 @@ else:
        ],
        'llm_judge': [
            'LLMJudge',
+           'DEFAULT_PROMPT_TEMPLATE',
+           'DEFAULT_NUMERIC_SCORE_TEMPLATE',
        ],
        'math_parser': [
            'extract_answer',
            'math_equal',
            'strip_answer_string',
        ],
+       'completion_parsers': [
+           'ResponseParser',
+           'lmsys_parser',
+           'ranking_parser',
+       ],
    }
 
    import sys
evalscope/{utils/utils.py → metrics/completion_parsers.py}
RENAMED
@@ -1,77 +1,85 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-#
+# flake8: noqa
 
-import
-import hashlib
-import importlib
-import importlib.util
-import numpy as np
-import os
-import random
+import ast
 import re
-import torch
-from inspect import signature
-from typing import Any, Dict, List, Tuple, Union
 
+# from . import utils as ann_utils
+from evalscope.constants import ArenaWinner
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
 
-
+one_score_pattern = re.compile('\[\[(\d+\.?\d*)\]\]')
+one_score_pattern_backup = re.compile('\[(\d+\.?\d*)\]')
 
-# Example: export TEST_LEVEL_LIST=0,1
-TEST_LEVEL_LIST_STR = 'TEST_LEVEL_LIST'
 
+# modified from: https://github.com/lm-sys/FastChat/blob/main/fastchat/eval/eval_gpt_review.py#L47
+# does not work with batched completions
+def lmsys_parser(completion, output_format):
+    if output_format == '[[rating]]':
+        match = re.search(one_score_pattern, completion)
+        if not match:
+            match = re.search(one_score_pattern_backup, completion)
 
-
-
-
-
-
-
-
-
-
-
+        if match:
+            rating = ast.literal_eval(match.groups()[0])
+        else:
+            logger.error(f'Content: {completion}\n'
+                         'You must manually fix the score.')
+            rating = -1
+
+        return rating
+    if output_format == '[[rating_a,rating_b]]':
+        try:
+            score_pair = completion.split('\n')[0]
+            score_pair = score_pair.replace(',', ' ')
+            sp = score_pair.split(' ')
+            if len(sp) == 2:
+                score_1 = float(sp[0])
+                score_2 = float(sp[1])
+                if score_1 > score_2:
+                    winner = ArenaWinner.MODEL_A
+                elif score_1 < score_2:
+                    winner = ArenaWinner.MODEL_B
+                else:
+                    if score_1 == score_1 == -1:
+                        winner = ArenaWinner.UNKNOWN
+                    winner = ArenaWinner.TIE
+                return winner, [score_1, score_2]
+            else:
+                raise Exception('Invalid score pair.')
+        except Exception as e:
+            logger.error(f'{e}\nContent: {completion}\nYou must manually fix the score pair.')
+            return ArenaWinner.UNKNOWN, [-1, -1]
+    elif output_format == '[[A]]':
+        if '[[A]]' in completion:
+            winner = ArenaWinner.MODEL_A
+        elif '[[B]]' in completion:
+            winner = ArenaWinner.MODEL_B
+        elif '[[C]]' in completion:
+            winner = ArenaWinner.TIE
+        else:
+            logger.error(f'\nContent: {completion}\nYou must manually fix the score.')
+            winner = ArenaWinner.UNKNOWN
+        return winner
+
+
+def ranking_parser(completion, **kwargs):
     try:
-
-
-
-
-
-        if spliter:
-            for attr in cls_name.split('.'):
-                obj_cls = getattr(obj_cls, attr)
+        if isinstance(completion, str):
+            ordered_completions = ast.literal_eval(completion)
+        else:
+            ordered_completions = completion
 
-
+        rank = [c for c in ordered_completions if c['model'] == 'model_a'][0]['rank']
+        assert rank in [1, 2]
 
-
-
-
-
-
-
-def gen_hash(name: str, bits: int = 32):
-    return hashlib.md5(name.encode(encoding='UTF-8')).hexdigest()[:bits]
-
-
-def dict_torch_dtype_to_str(d: Dict[str, Any]) -> dict:
-    """
-    Checks whether the passed dictionary and its nested dicts have a *torch_dtype* key and if it's not None,
-    converts torch.dtype to a string of just the type. For example, `torch.float32` get converted into *"float32"*
-    string, which can then be stored in the json format.
-
-    Refer to: https://github.com/huggingface/transformers/pull/16065/files for details.
-    """
-    if d.get('torch_dtype', None) is not None and not isinstance(d['torch_dtype'], str):
-        d['torch_dtype'] = str(d['torch_dtype']).split('.')[1]
-
-    for value in d.values():
-        if isinstance(value, dict):
-            dict_torch_dtype_to_str(value)
-
-    return d
+        return ArenaWinner.MODEL_A if rank == 1 else ArenaWinner.MODEL_B
+    except Exception as e:
+        logger.error(f'{e}\nContent: {completion}\n'
+                     'You must manually fix the score pair.')
+        return ArenaWinner.UNKNOWN
 
 
 class ResponseParser:
@@ -194,7 +202,6 @@ class ResponseParser:
             return last_capital
         return 'No valid option found'
 
-
     @staticmethod
     def parse_bracketed_answer(text: str, options: list[str]) -> str:
         options = ResponseParser.process_options(options)
@@ -212,121 +219,9 @@ class ResponseParser:
         options_pattern = '|'.join(escaped_options)
         return options_pattern
 
-def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]:
-    """
-    Normalize score.
-
-    Args:
-        score: input score, could be float or dict. e.g. 0.12345678 or {'acc': 0.12345678, 'f1': 0.12345678}
-        keep_num: number of digits to keep.
-
-    Returns:
-        Union[float, dict]: normalized score. e.g. 0.1234 or {'acc': 0.1234, 'f1': 0.1234}
-    """
-    if isinstance(score, float):
-        score = round(score, keep_num)
-    elif isinstance(score, dict):
-        score = {k: round(v, keep_num) for k, v in score.items()}
-    else:
-        logger.warning(f'Unknown score type: {type(score)}')
-
-    return score
-
-
-def is_module_installed(module_name):
-    try:
-        importlib.import_module(module_name)
-        return True
-    except ImportError:
-        return False
-
-
-def get_module_path(module_name):
-    spec = importlib.util.find_spec(module_name)
-    if spec and spec.origin:
-        return os.path.abspath(spec.origin)
-    else:
-        raise ValueError(f'Cannot find module: {module_name}')
-
-
-def get_valid_list(input_list, candidate_list):
-    """
-    Get the valid and invalid list from input_list based on candidate_list.
-    Args:
-        input_list: The input list.
-        candidate_list: The candidate list.
-
-    Returns:
-        valid_list: The valid list.
-        invalid_list: The invalid list.
-    """
-    return [i for i in input_list if i in candidate_list], \
-        [i for i in input_list if i not in candidate_list]
-
-
-def get_latest_folder_path(work_dir):
-    from datetime import datetime
-
-    # Get all subdirectories in the work_dir
-    folders = [f for f in os.listdir(work_dir) if os.path.isdir(os.path.join(work_dir, f))]
-
-    # Get the timestamp(YYYYMMDD_HHMMSS)
-    timestamp_pattern = re.compile(r'^\d{8}_\d{6}$')
-
-    # Filter out the folders
-    timestamped_folders = [f for f in folders if timestamp_pattern.match(f)]
-
-    if not timestamped_folders:
-        print(f'>> No timestamped folders found in {work_dir}!')
-        return None
-
-    # timestamp parser
-    def parse_timestamp(folder_name):
-        return datetime.strptime(folder_name, '%Y%m%d_%H%M%S')
-
-    # Find the latest folder
-    latest_folder = max(timestamped_folders, key=parse_timestamp)
-
-    return os.path.join(work_dir, latest_folder)
-
-
-def csv_to_list(file_path: str) -> List[dict]:
-    import csv
-
-    with open(file_path, mode='r', newline='', encoding='utf-8') as csv_file:
-        csv_reader = csv.DictReader(csv_file)
-        result = [row for row in csv_reader]
-
-    return result
-
-
-def seed_everything(seed: int):
-    """Set all random seeds to a fixed value for reproducibility.
-
-    Args:
-        seed (int): The seed value.
-    """
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed_all(seed)
-        torch.backends.cudnn.deterministic = True
-        torch.backends.cudnn.benchmark = False
-
-def get_supported_params(func):
-    """Get the supported parameters of a function."""
-    sig = signature(func)
-    return list(sig.parameters.keys())
-
-def parse_int_or_float(num):
-    number = float(num)
-    if number.is_integer():
-        return int(number)
-    return number
 
 if __name__ == '__main__':
+    result = '**Answer: A **Answer: C**'
     options = ['A', 'B', 'C', 'D']
-
-
-    print(ResponseParser.parse_first_option(answer, options))
+    parsed_result = ResponseParser.parse_first_option(result, options)
+    print(f'Parsed result: {parsed_result}')  # Should print 'C'