evalscope 0.8.2__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- evalscope/__init__.py +2 -0
- evalscope/arguments.py +10 -3
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/__init__.py +20 -1
- evalscope/benchmarks/arc/__init__.py +0 -5
- evalscope/benchmarks/arc/arc_adapter.py +23 -99
- evalscope/benchmarks/bbh/__init__.py +0 -4
- evalscope/benchmarks/bbh/bbh_adapter.py +19 -89
- evalscope/benchmarks/benchmark.py +70 -59
- evalscope/benchmarks/ceval/__init__.py +0 -5
- evalscope/benchmarks/ceval/ceval_adapter.py +22 -46
- evalscope/benchmarks/cmmlu/__init__.py +0 -5
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +20 -41
- evalscope/benchmarks/competition_math/__init__.py +0 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
- evalscope/benchmarks/data_adapter.py +114 -85
- evalscope/benchmarks/general_qa/__init__.py +0 -5
- evalscope/benchmarks/general_qa/general_qa_adapter.py +16 -19
- evalscope/benchmarks/gsm8k/__init__.py +0 -4
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +19 -98
- evalscope/benchmarks/hellaswag/__init__.py +0 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +23 -96
- evalscope/benchmarks/humaneval/__init__.py +0 -4
- evalscope/benchmarks/humaneval/humaneval_adapter.py +16 -117
- evalscope/benchmarks/mmlu/__init__.py +0 -5
- evalscope/benchmarks/mmlu/mmlu_adapter.py +26 -48
- evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
- evalscope/benchmarks/race/__init__.py +0 -5
- evalscope/benchmarks/race/race_adapter.py +25 -53
- evalscope/benchmarks/trivia_qa/__init__.py +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +24 -97
- evalscope/benchmarks/truthful_qa/__init__.py +0 -5
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +23 -33
- evalscope/collections/__init__.py +3 -0
- evalscope/collections/evaluator.py +178 -0
- evalscope/collections/sampler.py +132 -0
- evalscope/collections/schema.py +122 -0
- evalscope/config.py +7 -5
- evalscope/constants.py +7 -28
- evalscope/evaluator/evaluator.py +66 -109
- evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
- evalscope/metrics/__init__.py +6 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/math_accuracy.py +193 -50
- evalscope/metrics/metrics.py +7 -4
- evalscope/metrics/rouge_metric.py +13 -8
- evalscope/models/__init__.py +14 -1
- evalscope/models/base_adapter.py +52 -0
- evalscope/models/chat_adapter.py +138 -0
- evalscope/models/choice_adapter.py +211 -0
- evalscope/models/custom_adapter.py +67 -0
- evalscope/models/local_model.py +74 -0
- evalscope/models/model.py +141 -0
- evalscope/models/server_adapter.py +104 -0
- evalscope/run.py +37 -66
- evalscope/run_arena.py +1 -1
- evalscope/utils/__init__.py +1 -1
- evalscope/utils/chat_service.py +4 -3
- evalscope/utils/io_utils.py +8 -0
- evalscope/utils/logger.py +4 -0
- evalscope/utils/model_utils.py +10 -0
- evalscope/utils/utils.py +3 -25
- evalscope/version.py +2 -2
- {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/METADATA +32 -15
- {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/RECORD +75 -66
- tests/cli/test_collection.py +53 -0
- tests/cli/test_run.py +43 -1
- tests/rag/test_mteb.py +3 -2
- evalscope/models/api/__init__.py +0 -3
- evalscope/models/dummy_chat_model.py +0 -49
- evalscope/models/model_adapter.py +0 -525
- evalscope/models/openai_model.py +0 -103
- /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/LICENSE +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/WHEEL +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/top_level.txt +0 -0
evalscope/collections/schema.py ADDED
@@ -0,0 +1,122 @@
+import copy
+import json
+from dataclasses import asdict, dataclass, field
+from typing import List, Union
+
+
+@dataclass
+class DatasetInfo:
+    name: str
+    weight: float = 1.0  # sample weight in each collection
+    task_type: str = ''
+    tags: List[str] = field(default_factory=list)
+    args: dict = field(default_factory=dict)
+
+    def get_data(self) -> dict:
+        from evalscope.benchmarks import Benchmark
+
+        benchmark_meta = Benchmark.get(self.name)
+
+        data_adapter = benchmark_meta.get_data_adapter(config=self.args)
+        data_dict = data_adapter.load(
+            dataset_name_or_path=benchmark_meta.dataset_id, subset_list=benchmark_meta.subset_list)
+        prompts = data_adapter.gen_prompts(data_dict)
+        return prompts
+
+
+def flatten_weight(collection: 'CollectionSchema', base_weight=1):
+    total_weight = sum(dataset.weight for dataset in collection.datasets)
+    for dataset in collection.datasets:
+        current_weight = dataset.weight / total_weight * base_weight
+        if isinstance(dataset, CollectionSchema):
+            flatten_weight(dataset, current_weight)
+        else:
+            dataset.weight = current_weight
+
+
+def flatten_tags(collection: 'CollectionSchema', parent_names=None):
+    if parent_names is None:
+        parent_names = []
+    current_names = parent_names + [collection.name]
+    for dataset in collection.datasets:
+        if isinstance(dataset, CollectionSchema):
+            flatten_tags(dataset, current_names)
+        else:
+            # Add all parent CollectionSchema names to the tags of each DatasetInfo
+            for name in current_names:
+                if name not in dataset.tags:
+                    dataset.tags.append(name)
+
+
+def flatten_datasets(collection: 'CollectionSchema') -> List[DatasetInfo]:
+    flat_datasets = []
+    for dataset in collection.datasets:
+        if isinstance(dataset, CollectionSchema):
+            flat_datasets.extend(flatten_datasets(dataset))
+        else:
+            flat_datasets.append(dataset)
+    return flat_datasets
+
+
+@dataclass
+class CollectionSchema:
+    name: str
+    weight: float = 1.0
+    datasets: List[Union[DatasetInfo, 'CollectionSchema']] = field(default_factory=list)
+
+    def __str__(self):
+        return json.dumps(self.to_dict(), ensure_ascii=False, indent=4)
+
+    def to_dict(self):
+        return {
+            'name':
+                self.name,
+            'weight':
+                self.weight,
+            'datasets':
+                [asdict(dataset) if isinstance(dataset, DatasetInfo) else dataset.to_dict() for dataset in self.datasets],
+        }
+
+    @classmethod
+    def from_dict(cls, data):
+        instance = cls(name=data.get('name', ''), weight=data.get('weight', 1))
+        for dataset in data.get('datasets', []):
+            if 'datasets' in dataset:
+                instance.datasets.append(CollectionSchema.from_dict(dataset))
+            else:
+                instance.datasets.append(DatasetInfo(**dataset))
+        return instance
+
+    def dump_json(self, file_path):
+        d = self.to_dict()
+        with open(file_path, 'w') as f:
+            json.dump(d, f, ensure_ascii=False, indent=4)
+
+    @classmethod
+    def from_json(cls, file_path):
+        with open(file_path, 'r') as f:
+            data = json.load(f)
+        return cls.from_dict(data)
+
+    def flatten(self) -> List[DatasetInfo]:
+        collection = copy.deepcopy(self)
+        flatten_tags(collection)
+        flatten_weight(collection)
+        return flatten_datasets(collection)
+
+
+if __name__ == '__main__':
+    schema = CollectionSchema(
+        name='reasoning',
+        datasets=[
+            DatasetInfo(name='arc', weight=1, task_type='reasoning', tags=['en']),
+            DatasetInfo(name='ceval', weight=1, task_type='reasoning', tags=['zh'], args={'subset_list': ['logic']})
+        ])
+    print(schema)
+    print(schema.flatten())
+    schema.dump_json('outputs/schema.json')
+
+    schema = CollectionSchema.from_json('outputs/schema.json')
+    print(schema)
+    for dataset in schema.flatten():
+        print(dataset)
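For orientation, here is a small usage sketch based only on the schema code above; the nested collection layout, weights, and dataset choices are illustrative, not taken from the package. `flatten()` deep-copies the schema, normalizes weights level by level, and copies every enclosing collection name into each dataset's tags.

```python
# Illustrative sketch of the new collection schema: nested collections, then flatten().
# Weights are normalized per level (gsm8k ends up at 1/2 * 3/4 = 3/8),
# and each DatasetInfo inherits its parent collection names as tags.
from evalscope.collections.schema import CollectionSchema, DatasetInfo

schema = CollectionSchema(
    name='all',
    datasets=[
        CollectionSchema(name='math', weight=3, datasets=[
            DatasetInfo(name='gsm8k', weight=1, tags=['en']),
            DatasetInfo(name='competition_math', weight=1, tags=['en']),
        ]),
        CollectionSchema(name='reasoning', weight=1, datasets=[
            DatasetInfo(name='arc', weight=1, tags=['en']),
        ]),
    ])

for info in schema.flatten():
    print(info.name, info.weight, info.tags)
# gsm8k 0.375 ['en', 'all', 'math']
# competition_math 0.375 ['en', 'all', 'math']
# arc 0.25 ['en', 'all', 'reasoning']
```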
evalscope/config.py CHANGED
@@ -31,7 +31,7 @@ DEFAULT_GENERATION_CONFIG = {
 @dataclass
 class TaskConfig:
     # Model-related arguments
-    model: Union[str, CustomModel, None] = None
+    model: Union[str, 'CustomModel', None] = None
     model_id: Optional[str] = None
     model_args: Optional[Dict] = field(default_factory=lambda: DEFAULT_MODEL_ARGS | {})

@@ -40,8 +40,8 @@ class TaskConfig:
     chat_template: Optional[str] = None

     # Dataset-related arguments
-    datasets:
-    dataset_args:
+    datasets: List[str] = field(default_factory=list)
+    dataset_args: Dict = field(default_factory=dict)
     dataset_dir: str = DEFAULT_DATASET_CACHE_DIR
     dataset_hub: str = HubType.MODELSCOPE

@@ -64,7 +64,9 @@ class TaskConfig:
     # Debug and runtime mode arguments
     debug: bool = False
     dry_run: bool = False
-    seed: int = 42
+    seed: Optional[int] = 42
+    api_url: Optional[str] = None  # Only used for server model
+    api_key: Optional[str] = 'EMPTY'  # Only used for server model

     def __post_init__(self):
         if (not self.model_id) and self.model:

@@ -74,7 +76,6 @@ class TaskConfig:
             self.model_id = os.path.basename(self.model).rstrip(os.sep)

     def to_dict(self):
-        # Note: to avoid serialization error for some model instance
         return self.__dict__

     def __str__(self):

@@ -130,6 +131,7 @@ class TaskConfig:
                 continue

             task.model = custom_model
+            task.model_args = custom_model.config
             task.model_id = type(custom_model).__name__
             res_list.append(task)

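As a rough sketch of how the new `TaskConfig` fields combine for evaluating a served model. The endpoint, model name, and per-dataset arguments below are placeholders, and `eval_type` is assumed to be an existing `TaskConfig` field (the evaluator changes further down read `task_cfg.eval_type`).

```python
# Hypothetical configuration of an evaluation against an OpenAI-compatible
# endpoint, using the server-model fields added in this release.
from evalscope.config import TaskConfig
from evalscope.constants import EvalType

task_cfg = TaskConfig(
    model='my-served-model',              # placeholder served-model name
    eval_type=EvalType.SERVICE,           # new 'service' eval type (see constants.py below)
    api_url='http://127.0.0.1:8000/v1',   # only used for server model
    api_key='EMPTY',                      # only used for server model
    datasets=['gsm8k', 'arc'],            # datasets now defaults to an empty list
    dataset_args={'gsm8k': {'few_shot_num': 4}},  # illustrative per-dataset overrides
)
print(task_cfg)
```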
evalscope/constants.py CHANGED
@@ -135,34 +135,13 @@ class EvalStage:
 class EvalType:

     CUSTOM = 'custom'
-    CHECKPOINT = 'checkpoint'
+    CHECKPOINT = 'checkpoint'  # native model checkpoint
+    SERVICE = 'service'  # model service


 class EvalBackend:
-
-
-
-
-
-        self._value = value
-
-    @property
-    def value(self):
-        return self._value
-
-    def __str__(self):
-        return self._value
-
-    def __repr__(self):
-        return f"'{self._value}'"
-
-    def __eq__(self, other):
-        if isinstance(other, str):
-            return self._value == other
-        return NotImplemented
-
-    NATIVE = _Backend('Native')
-    OPEN_COMPASS = _Backend('OpenCompass')
-    VLM_EVAL_KIT = _Backend('VLMEvalKit')
-    RAG_EVAL = _Backend('RAGEval')
-    THIRD_PARTY = _Backend('ThirdParty')
+    NATIVE = 'Native'
+    OPEN_COMPASS = 'OpenCompass'
+    VLM_EVAL_KIT = 'VLMEvalKit'
+    RAG_EVAL = 'RAGEval'
+    THIRD_PARTY = 'ThirdParty'
evalscope/evaluator/evaluator.py CHANGED
@@ -10,9 +10,8 @@ from typing import Any, Dict, List, Optional, Union

 from evalscope.benchmarks import DataAdapter
 from evalscope.config import TaskConfig
-from evalscope.constants import
-
-from evalscope.models.model_adapter import BaseModelAdapter, CustomModelAdapter
+from evalscope.constants import AnswerKeys, DumpMode, EvalStage, ReviewKeys
+from evalscope.models import BaseModelAdapter, CustomModelAdapter
 from evalscope.tools.combine_reports import gen_table
 from evalscope.utils import dict_torch_dtype_to_str, gen_hash
 from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list

@@ -30,73 +29,63 @@ class Evaluator(object):
             if the dataset is a local path, e.g. /path/to/your_dataset_name,
             then the task name will be the basename of the path, which is `your_dataset_name`.
         data_adapter: DataAdapter, the data adapter for the dataset.
-        subset_list: list, the subset list for the dataset.
         model_adapter: BaseModelAdapter, the model adapter for the model.
-
-
-        datasets_dir: str, the datasets dir. Default: DEFAULT_ROOT_CACHE_DIR
-        datasets_hub: str, the datasets hub. `Local`, `ModelScope` or `HuggingFace`. Default: 'ModelScope'
-        stage: str, the stage of evaluation. `all` or `infer` or `review`. Default: 'all'
-        eval_type: str, the evaluation type. `checkpoint` or `service` or `custom`. Default: 'checkpoint'
-        overall_task_cfg: dict, the overall task config. Default: None
+        outputs: OutputsStructure, the outputs dir. Default: None
+        task_cfg: TaskConfig, the overall task config. Default: None
         **kwargs: kwargs.
     """

     def __init__(self,
                  dataset_name_or_path: str,
                  data_adapter: DataAdapter,
-
-
-
-                 outputs: Optional[OutputsStructure] = None,
-                 datasets_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
-                 datasets_hub: Optional[str] = HubType.MODELSCOPE,
-                 stage: Optional[str] = EvalStage.ALL,
-                 eval_type: Optional[str] = EvalType.CHECKPOINT,
-                 overall_task_cfg: Optional[TaskConfig] = None,
+                 model_adapter: BaseModelAdapter,
+                 outputs: OutputsStructure = None,
+                 task_cfg: TaskConfig = None,
                  **kwargs):

         self.dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
         self.dataset_name = os.path.basename(self.dataset_name_or_path.rstrip(os.sep)).split('.')[0]
-        self.model_name =
+        self.model_name = task_cfg.model_id
         self.custom_task_name = f'{self.model_name}_{self.dataset_name}'

-        self.datasets_dir = os.path.expanduser(datasets_dir)
-        self.kwargs = kwargs
         self.data_adapter = data_adapter
         self.model_adapter = model_adapter
-        self.
-        self.
-        self.
-        self.
-
-
-
-        self.model_cfg = self.model_adapter.model_cfg
-
+        self.model_cfg = model_adapter.model_cfg
+        self.eval_type = task_cfg.eval_type
+        self.dataset_hub = task_cfg.dataset_hub
+        self.stage = task_cfg.stage
+        self.use_cache = task_cfg.use_cache
+        self.task_cfg = task_cfg
         # Deal with the output paths
         self.outputs_structure = outputs

-
-        self.dataset = self.data_adapter.load(
-            dataset_name_or_path=dataset_name_or_path,
-            subset_list=subset_list,
-            work_dir=self.datasets_dir,
-            datasets_hub=datasets_hub,
-            **kwargs)
-
-        # Get prompts from dataset
-        # TODO: support sampler
-        self.prompts = self.data_adapter.gen_prompts(data_dict=self.dataset)
-        del self.dataset
-
-    def _pred_answer(self, input_d: dict, infer_cfg: dict, subset_name: str, answer_id: str = None) -> dict:
+        self.kwargs = kwargs

-
-
-
+    def load_dataset(self):
+        dataset = self.data_adapter.load(
+            dataset_name_or_path=self.dataset_name_or_path,
+            subset_list=self.data_adapter.subset_list,
+            work_dir=os.path.expanduser(self.task_cfg.dataset_dir),
+            datasets_hub=self.dataset_hub,
+            **self.kwargs)

-
+        # Get prompts from dataset
+        prompts = self.data_adapter.gen_prompts(data_dict=dataset)
+        return prompts
+
+    def _generate_answer_id(self, model_cfg, input_d, infer_cfg):
+        model_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(model_cfg).items())), ensure_ascii=False)
+        input_prompt_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(input_d).items())), ensure_ascii=False)
+        infer_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False)
+        return 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)
+
+    def _process_answer(self, answer_d, input_d, subset_name, answer_id):
+        answer_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
+        answer_d[AnswerKeys.ANSWER_ID] = answer_id
+        answer_d[AnswerKeys.SUBSET_NAME] = subset_name
+        answer_d[AnswerKeys.RAW_INPUT] = input_d[AnswerKeys.RAW_INPUT]
+        answer_d[AnswerKeys.ORIGIN_PROMPT] = input_d
+        return answer_d

     def get_answers(self,
                     subset_name: str,

@@ -147,57 +136,24 @@ class Evaluator(object):
             resp_answers_list: List[Dict[str, Any]] = self.model_adapter.predict(
                 inputs=prompts_list, infer_cfg=infer_cfg)

-
-
-
-
-
-                # Gen answer_id (concat: model_cfg + input_prompt + infer_cfg)
-                model_cfg_str = json.dumps(
-                    OrderedDict(sorted(dict_torch_dtype_to_str(self.model_adapter.model_cfg).items())),
-                    ensure_ascii=False)
-                input_prompt_str = json.dumps(
-                    OrderedDict(sorted(dict_torch_dtype_to_str(in_d).items())), ensure_ascii=False)
-                infer_cfg_str = json.dumps(
-                    OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False)
-                answer_id = 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)
-
-                resp_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
-                resp_d[AnswerKeys.ANSWER_ID] = answer_id
-                resp_d[AnswerKeys.SUBSET_NAME] = subset_name
-                resp_d[AnswerKeys.RAW_INPUT] = in_d[AnswerKeys.RAW_INPUT]
-                resp_d[AnswerKeys.ORIGIN_PROMPT] = in_d
-
-                answers_list.append(resp_d)
-                dump_jsonl_data(resp_d, pred_file_path, dump_mode=DumpMode.APPEND)
+            for input_prompt, answer_d in zip(prompts_list, resp_answers_list):
+                answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
+                processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
+                answers_list.append(processed_answer)
+                dump_jsonl_data(processed_answer, pred_file_path, dump_mode=DumpMode.APPEND)

         else:
             for input_prompt in tqdm(prompts_list, total=len(prompts_list), desc=f'Predicting({subset_name}): '):
-
-
-
-                    OrderedDict(sorted(dict_torch_dtype_to_str(self.model_adapter.model_cfg).items())),
-                    ensure_ascii=False)
-                input_prompt_str = json.dumps(
-                    OrderedDict(sorted(dict_torch_dtype_to_str(input_prompt).items())), ensure_ascii=False)
-                infer_cfg_str = json.dumps(
-                    OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False)
-                answer_id = 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)
-
-                # Get answers
-                answer_d: dict = self._pred_answer(
-                    input_d=input_prompt, infer_cfg=infer_cfg, subset_name=subset_name, answer_id=answer_id)
-
-                answer_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
-                answer_d[AnswerKeys.RAW_INPUT] = input_prompt[AnswerKeys.RAW_INPUT]
-                answer_d[AnswerKeys.ORIGIN_PROMPT] = input_prompt
+                answer_d: dict = self.model_adapter.predict(inputs=input_prompt, infer_cfg=infer_cfg)
+                answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
+                processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)

                 if debug:
                     logger.info(f'**input_prompt: {json.dumps(input_prompt, ensure_ascii=False)} \n')
-                    logger.info(f'**predicted ans: {json.dumps(
+                    logger.info(f'**predicted ans: {json.dumps(processed_answer, ensure_ascii=False)} \n')

-                answers_list.append(
-                dump_jsonl_data(
+                answers_list.append(processed_answer)
+                dump_jsonl_data(processed_answer, pred_file_path, dump_mode=DumpMode.APPEND)

         logger.info(f'Dump predictions to {pred_file_path}.')
         return answers_list

@@ -241,6 +197,19 @@ class Evaluator(object):

         return review_res

+    def _generate_review_id(self, answer_d):
+        # Gen review_id (concat: answer_id + reviewer_spec)
+        answer_id = answer_d[AnswerKeys.ANSWER_ID]
+        reviewer_spec = {
+            'metric': [metric_d['name'] for metric_d in self.data_adapter.metric_list],
+            'reviewer': ['Evaluator'],
+            'revision': ['default']
+        }
+        reviewer_spec_str = json.dumps(
+            OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())), ensure_ascii=False)
+        review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str)
+        return review_id, reviewer_spec
+
     def get_reviews(self, subset_name: str, answers_list: List[dict], debug: bool = False, **kwargs) -> list:
         """
         Get reviews from answers.

@@ -264,19 +233,7 @@ class Evaluator(object):
             logger.warning(f'Ignore use_cache={self.use_cache}, updating the review file: {review_file_path} ...')

         for answer_d in tqdm(answers_list, total=len(answers_list), desc=f'Reviewing({subset_name}): '):
-
-            # Gen review_id (concat: answer_id + reviewer_spec)
-            answer_id = answer_d[AnswerKeys.ANSWER_ID]
-
-            reviewer_spec: dict = {
-                'metric': [metric_d['name'] for metric_d in self.data_adapter.metric_list],
-                'reviewer': ['Evaluator'],
-                'revision': ['default']
-            }
-            reviewer_spec_str = json.dumps(
-                OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())), ensure_ascii=False)
-            review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str)
-
+            review_id, reviewer_spec = self._generate_review_id(answer_d)
             # Get review
             review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)

@@ -284,7 +241,6 @@ class Evaluator(object):
                 logger.info(review_d)

             reviews_list.append(review_d)
-
             # Dump reviews
             dump_jsonl_data(review_d, review_file_path, dump_mode=DumpMode.APPEND)

@@ -380,7 +336,8 @@ class Evaluator(object):
         stage_answers_dict = {}
         stage_reviews_dict = {}

-
+        prompts = self.load_dataset()
+        for subset_name, prompts_list in prompts.items():
             limit = kwargs.get('limit', len(prompts_list))
             prompts_list = prompts_list[:limit]

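The caching behaviour rests on the deterministic answer id factored out into `_generate_answer_id` above: the id is a hash of the model config, the input prompt, and the inference config, each serialized with sorted keys. Below is a standalone sketch of that idea; evalscope's own `gen_hash` and `dict_torch_dtype_to_str` helpers are approximated here with `hashlib` and plain dicts.

```python
import hashlib
import json
from collections import OrderedDict


def sketch_answer_id(model_cfg: dict, input_d: dict, infer_cfg: dict) -> str:
    # Serialize each piece with sorted keys so the same inputs always
    # produce the same id, which is what lets cached answers be reused.
    parts = [
        json.dumps(OrderedDict(sorted(d.items())), ensure_ascii=False)
        for d in (model_cfg, input_d, infer_cfg)
    ]
    return 'answer-' + hashlib.md5(''.join(parts).encode('utf-8')).hexdigest()


print(sketch_answer_id({'model_id': 'demo'}, {'data': ['2+2=']}, {'max_new_tokens': 8}))
```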
evalscope/evaluator/reviewer/auto_reviewer.py CHANGED
@@ -8,10 +8,10 @@ import sys
 import time
 from abc import ABC, abstractmethod
 from functools import partial
-from typing import Any, List
+from typing import Any, List, Tuple

 from evalscope.constants import ArenaMode, EvalConfigKeys, FnCompletionParser, PositionBiasMitigation
-from evalscope.models.
+from evalscope.models.model import OpenAIModel
 from evalscope.utils import completion_parsers, random_seeded_choice
 from evalscope.utils.arena_utils import get_battle_pairs, merge_ques_ans, shuffle_pairwise_preferences
 from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list

@@ -240,7 +240,15 @@ class AutoReviewerGpt4(BaseReviewer):
                                      review_text=review_text)
         return review_result

-    def _get_review_pair(self,
+    def _get_review_pair(self,
+                         model_a,
+                         model_b,
+                         question,
+                         category,
+                         ans1,
+                         ans2,
+                         dry_run=False,
+                         **kwargs) -> Tuple[str, Any]:
         input_msg = dict(ques=question, category=category, ans1=ans1, ans2=ans2)

         if self.reference_list:

@@ -263,7 +271,7 @@ class AutoReviewerGpt4(BaseReviewer):
             result = (result, None)
         return review_text, *result

-    def _get_review_single(self, model, question, category, answer, dry_run=False, **kwargs) ->
+    def _get_review_single(self, model, question, category, answer, dry_run=False, **kwargs) -> Tuple[str, Any]:
         input_msg = dict(ques=question, category=category, ans1=answer)

         if self.reference_list:
evalscope/metrics/__init__.py CHANGED
@@ -1 +1,7 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+from evalscope.metrics.metrics import bleu_ngram_one_sample, exact_match, weighted_mean
+from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
+
+WeightedAverageAccuracy = {'name': 'WeightedAverageAccuracy', 'object': weighted_mean}
+WeightedAverageBLEU = {'name': 'WeightedAverageBLEU', 'object': weighted_mean}
+Pass1 = {'name': 'Pass@1', 'object': weighted_mean}

evalscope/metrics/bundled_rouge_score/rouge_scorer.py CHANGED
@@ -55,7 +55,7 @@ try:
             os.system(f'wget --timeout=10 --tries=3 -P {nltk_dir} {punkt_tab_url}')
             os.system(f'unzip {punkt_path} -d {nltk_dir}')
         else:
-            logger.
+            logger.debug(f'{punkt_path} already exists, skipping download')
 except Exception as e:
     logger.error(f'Try to download punkt_tab.zip for nltk failed: {e}')

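The new module-level names in `evalscope/metrics/__init__.py` are small registry dicts pairing a display name with the aggregation callable. A sketch of referencing them directly is below; `weighted_mean`'s own signature lives in `evalscope.metrics.metrics` and is not shown in this diff.

```python
# The metric registry entries added above all point at weighted_mean.
from evalscope.metrics import Pass1, WeightedAverageAccuracy, WeightedAverageBLEU

for metric in (WeightedAverageAccuracy, WeightedAverageBLEU, Pass1):
    print(metric['name'], '->', metric['object'].__name__)
```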