evalscope 0.8.1__py3-none-any.whl → 0.9.0__py3-none-any.whl
- evalscope/__init__.py +2 -0
- evalscope/arguments.py +10 -3
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/__init__.py +20 -1
- evalscope/benchmarks/arc/__init__.py +0 -5
- evalscope/benchmarks/arc/arc_adapter.py +23 -99
- evalscope/benchmarks/bbh/__init__.py +0 -4
- evalscope/benchmarks/bbh/bbh_adapter.py +19 -89
- evalscope/benchmarks/benchmark.py +70 -59
- evalscope/benchmarks/ceval/__init__.py +0 -5
- evalscope/benchmarks/ceval/ceval_adapter.py +22 -46
- evalscope/benchmarks/cmmlu/__init__.py +0 -5
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +20 -41
- evalscope/benchmarks/competition_math/__init__.py +0 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
- evalscope/benchmarks/data_adapter.py +114 -85
- evalscope/benchmarks/general_qa/__init__.py +0 -5
- evalscope/benchmarks/general_qa/general_qa_adapter.py +16 -19
- evalscope/benchmarks/gsm8k/__init__.py +0 -4
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +19 -98
- evalscope/benchmarks/hellaswag/__init__.py +0 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +23 -96
- evalscope/benchmarks/humaneval/__init__.py +0 -4
- evalscope/benchmarks/humaneval/humaneval_adapter.py +16 -117
- evalscope/benchmarks/mmlu/__init__.py +0 -5
- evalscope/benchmarks/mmlu/mmlu_adapter.py +26 -48
- evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
- evalscope/benchmarks/race/__init__.py +0 -5
- evalscope/benchmarks/race/race_adapter.py +25 -53
- evalscope/benchmarks/trivia_qa/__init__.py +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +24 -97
- evalscope/benchmarks/truthful_qa/__init__.py +0 -5
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +23 -33
- evalscope/collections/__init__.py +3 -0
- evalscope/collections/evaluator.py +178 -0
- evalscope/collections/sampler.py +132 -0
- evalscope/collections/schema.py +122 -0
- evalscope/config.py +10 -6
- evalscope/constants.py +7 -28
- evalscope/evaluator/evaluator.py +66 -108
- evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
- evalscope/metrics/__init__.py +6 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/math_accuracy.py +193 -50
- evalscope/metrics/metrics.py +7 -4
- evalscope/metrics/rouge_metric.py +13 -8
- evalscope/models/__init__.py +14 -1
- evalscope/models/base_adapter.py +52 -0
- evalscope/models/chat_adapter.py +138 -0
- evalscope/models/choice_adapter.py +211 -0
- evalscope/models/custom_adapter.py +67 -0
- evalscope/models/local_model.py +74 -0
- evalscope/models/model.py +141 -0
- evalscope/models/server_adapter.py +104 -0
- evalscope/perf/arguments.py +1 -0
- evalscope/perf/benchmark.py +1 -1
- evalscope/perf/main.py +3 -1
- evalscope/perf/plugin/api/openai_api.py +51 -47
- evalscope/perf/utils/local_server.py +1 -0
- evalscope/run.py +37 -66
- evalscope/run_arena.py +1 -1
- evalscope/utils/__init__.py +1 -1
- evalscope/utils/chat_service.py +4 -3
- evalscope/utils/io_utils.py +8 -0
- evalscope/utils/logger.py +4 -0
- evalscope/utils/model_utils.py +10 -0
- evalscope/utils/utils.py +3 -25
- evalscope/version.py +2 -2
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/METADATA +46 -17
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/RECORD +81 -92
- tests/cli/test_collection.py +53 -0
- tests/cli/test_run.py +43 -1
- tests/perf/test_perf.py +3 -3
- tests/rag/test_mteb.py +3 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
- evalscope/models/api/__init__.py +0 -3
- evalscope/models/dummy_chat_model.py +0 -49
- evalscope/models/model_adapter.py +0 -525
- evalscope/models/openai_model.py +0 -103
- /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/LICENSE +0 -0
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/WHEEL +0 -0
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/top_level.txt +0 -0
evalscope/collections/schema.py
ADDED
@@ -0,0 +1,122 @@
+import copy
+import json
+from dataclasses import asdict, dataclass, field
+from typing import List, Union
+
+
+@dataclass
+class DatasetInfo:
+    name: str
+    weight: float = 1.0  # sample weight in each collection
+    task_type: str = ''
+    tags: List[str] = field(default_factory=list)
+    args: dict = field(default_factory=dict)
+
+    def get_data(self) -> dict:
+        from evalscope.benchmarks import Benchmark
+
+        benchmark_meta = Benchmark.get(self.name)
+
+        data_adapter = benchmark_meta.get_data_adapter(config=self.args)
+        data_dict = data_adapter.load(
+            dataset_name_or_path=benchmark_meta.dataset_id, subset_list=benchmark_meta.subset_list)
+        prompts = data_adapter.gen_prompts(data_dict)
+        return prompts
+
+
+def flatten_weight(collection: 'CollectionSchema', base_weight=1):
+    total_weight = sum(dataset.weight for dataset in collection.datasets)
+    for dataset in collection.datasets:
+        current_weight = dataset.weight / total_weight * base_weight
+        if isinstance(dataset, CollectionSchema):
+            flatten_weight(dataset, current_weight)
+        else:
+            dataset.weight = current_weight
+
+
+def flatten_tags(collection: 'CollectionSchema', parent_names=None):
+    if parent_names is None:
+        parent_names = []
+    current_names = parent_names + [collection.name]
+    for dataset in collection.datasets:
+        if isinstance(dataset, CollectionSchema):
+            flatten_tags(dataset, current_names)
+        else:
+            # Add all parent CollectionSchema names to the tags of each DatasetInfo
+            for name in current_names:
+                if name not in dataset.tags:
+                    dataset.tags.append(name)
+
+
+def flatten_datasets(collection: 'CollectionSchema') -> List[DatasetInfo]:
+    flat_datasets = []
+    for dataset in collection.datasets:
+        if isinstance(dataset, CollectionSchema):
+            flat_datasets.extend(flatten_datasets(dataset))
+        else:
+            flat_datasets.append(dataset)
+    return flat_datasets
+
+
+@dataclass
+class CollectionSchema:
+    name: str
+    weight: float = 1.0
+    datasets: List[Union[DatasetInfo, 'CollectionSchema']] = field(default_factory=list)
+
+    def __str__(self):
+        return json.dumps(self.to_dict(), ensure_ascii=False, indent=4)
+
+    def to_dict(self):
+        return {
+            'name':
+            self.name,
+            'weight':
+            self.weight,
+            'datasets':
+            [asdict(dataset) if isinstance(dataset, DatasetInfo) else dataset.to_dict() for dataset in self.datasets],
+        }
+
+    @classmethod
+    def from_dict(cls, data):
+        instance = cls(name=data.get('name', ''), weight=data.get('weight', 1))
+        for dataset in data.get('datasets', []):
+            if 'datasets' in dataset:
+                instance.datasets.append(CollectionSchema.from_dict(dataset))
+            else:
+                instance.datasets.append(DatasetInfo(**dataset))
+        return instance
+
+    def dump_json(self, file_path):
+        d = self.to_dict()
+        with open(file_path, 'w') as f:
+            json.dump(d, f, ensure_ascii=False, indent=4)
+
+    @classmethod
+    def from_json(cls, file_path):
+        with open(file_path, 'r') as f:
+            data = json.load(f)
+        return cls.from_dict(data)
+
+    def flatten(self) -> List[DatasetInfo]:
+        collection = copy.deepcopy(self)
+        flatten_tags(collection)
+        flatten_weight(collection)
+        return flatten_datasets(collection)
+
+
+if __name__ == '__main__':
+    schema = CollectionSchema(
+        name='reasoning',
+        datasets=[
+            DatasetInfo(name='arc', weight=1, task_type='reasoning', tags=['en']),
+            DatasetInfo(name='ceval', weight=1, task_type='reasoning', tags=['zh'], args={'subset_list': ['logic']})
+        ])
+    print(schema)
+    print(schema.flatten())
+    schema.dump_json('outputs/schema.json')
+
+    schema = CollectionSchema.from_json('outputs/schema.json')
+    print(schema)
+    for dataset in schema.flatten():
+        print(dataset)
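
The new CollectionSchema supports arbitrary nesting: flatten() deep-copies the schema, appends every parent collection name to each DatasetInfo's tags, and normalizes sibling weights level by level so each dataset ends up with its share of the total. A minimal usage sketch of that behaviour (the nested collection and weights below are illustrative, not taken from the diff):

from evalscope.collections.schema import CollectionSchema, DatasetInfo

schema = CollectionSchema(
    name='all',
    datasets=[
        CollectionSchema(
            name='english',
            weight=3,
            datasets=[
                DatasetInfo(name='arc', weight=1, tags=['en']),
                DatasetInfo(name='gsm8k', weight=1, tags=['en']),
            ]),
        DatasetInfo(name='ceval', weight=1, tags=['zh']),
    ])

for info in schema.flatten():
    # flatten_weight: 'english' gets 3/4 of the budget, split evenly -> arc/gsm8k 0.375 each; ceval gets 0.25
    # flatten_tags: arc/gsm8k gain ['all', 'english'], ceval gains ['all']
    print(info.name, round(info.weight, 3), info.tags)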
evalscope/config.py
CHANGED
@@ -31,7 +31,7 @@ DEFAULT_GENERATION_CONFIG = {
 @dataclass
 class TaskConfig:
     # Model-related arguments
-    model: Union[str, CustomModel, None] = None
+    model: Union[str, 'CustomModel', None] = None
     model_id: Optional[str] = None
     model_args: Optional[Dict] = field(default_factory=lambda: DEFAULT_MODEL_ARGS | {})

@@ -40,8 +40,8 @@ class TaskConfig:
     chat_template: Optional[str] = None

     # Dataset-related arguments
-    datasets:
-    dataset_args:
+    datasets: List[str] = field(default_factory=list)
+    dataset_args: Dict = field(default_factory=dict)
     dataset_dir: str = DEFAULT_DATASET_CACHE_DIR
     dataset_hub: str = HubType.MODELSCOPE

@@ -64,7 +64,9 @@ class TaskConfig:
     # Debug and runtime mode arguments
     debug: bool = False
     dry_run: bool = False
-    seed: int = 42
+    seed: Optional[int] = 42
+    api_url: Optional[str] = None  # Only used for server model
+    api_key: Optional[str] = 'EMPTY'  # Only used for server model

     def __post_init__(self):
         if (not self.model_id) and self.model:
@@ -74,7 +76,6 @@ class TaskConfig:
             self.model_id = os.path.basename(self.model).rstrip(os.sep)

     def to_dict(self):
-        # Note: to avoid serialization error for some model instance
         return self.__dict__

     def __str__(self):
@@ -114,7 +115,9 @@ class TaskConfig:
     def from_args(args: Namespace):
         # Convert Namespace to a dictionary and filter out None values
         args_dict = {k: v for k, v in vars(args).items() if v is not None}
-
+
+        if 'func' in args_dict:
+            del args_dict['func']  # Note: compat CLI arguments

         return TaskConfig.from_dict(args_dict)

@@ -128,6 +131,7 @@ class TaskConfig:
                 continue

             task.model = custom_model
+            task.model_args = custom_model.config
             task.model_id = type(custom_model).__name__
             res_list.append(task)

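
The reworked fields can be exercised directly; a hedged sketch of a TaskConfig pointed at a served model (the model name, endpoint and per-dataset args are placeholders, not values from the diff):

from evalscope.config import TaskConfig

task = TaskConfig(
    model='my-served-model',                       # placeholder
    datasets=['gsm8k'],                            # `datasets` now defaults to an empty list
    dataset_args={'gsm8k': {'few_shot_num': 4}},   # hypothetical per-dataset override
    api_url='http://127.0.0.1:8000/v1/chat/completions',  # only used for server model
    api_key='EMPTY',                               # only used for server model
    seed=42,
)
print(task.model_id)  # derived in __post_init__ from `model` when not given explicitly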
evalscope/constants.py
CHANGED
@@ -135,34 +135,13 @@ class EvalStage:
 class EvalType:

     CUSTOM = 'custom'
-    CHECKPOINT = 'checkpoint'
+    CHECKPOINT = 'checkpoint'  # native model checkpoint
+    SERVICE = 'service'  # model service


 class EvalBackend:
-
-
-
-
-
-            self._value = value
-
-        @property
-        def value(self):
-            return self._value
-
-        def __str__(self):
-            return self._value
-
-        def __repr__(self):
-            return f"'{self._value}'"
-
-        def __eq__(self, other):
-            if isinstance(other, str):
-                return self._value == other
-            return NotImplemented
-
-    NATIVE = _Backend('Native')
-    OPEN_COMPASS = _Backend('OpenCompass')
-    VLM_EVAL_KIT = _Backend('VLMEvalKit')
-    RAG_EVAL = _Backend('RAGEval')
-    THIRD_PARTY = _Backend('ThirdParty')
+    NATIVE = 'Native'
+    OPEN_COMPASS = 'OpenCompass'
+    VLM_EVAL_KIT = 'VLMEvalKit'
+    RAG_EVAL = 'RAGEval'
+    THIRD_PARTY = 'ThirdParty'
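
Since EvalBackend members are now plain strings (the removed wrapper class only provided custom __eq__/__str__ so instances behaved like strings), existing string comparisons keep working; a quick sanity sketch:

from evalscope.constants import EvalBackend, EvalType

assert EvalBackend.NATIVE == 'Native'          # plain string, no wrapper object
assert isinstance(EvalBackend.OPEN_COMPASS, str)
assert EvalType.SERVICE == 'service'           # new: evaluate a model served behind an API
print(EvalBackend.RAG_EVAL, EvalType.CHECKPOINT)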
evalscope/evaluator/evaluator.py
CHANGED
@@ -10,9 +10,8 @@ from typing import Any, Dict, List, Optional, Union

 from evalscope.benchmarks import DataAdapter
 from evalscope.config import TaskConfig
-from evalscope.constants import
-
-from evalscope.models.model_adapter import BaseModelAdapter, CustomModelAdapter
+from evalscope.constants import AnswerKeys, DumpMode, EvalStage, ReviewKeys
+from evalscope.models import BaseModelAdapter, CustomModelAdapter
 from evalscope.tools.combine_reports import gen_table
 from evalscope.utils import dict_torch_dtype_to_str, gen_hash
 from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
@@ -30,72 +29,63 @@ class Evaluator(object):
            if the dataset is a local path, e.g. /path/to/your_dataset_name,
            then the task name will be the basename of the path, which is `your_dataset_name`.
        data_adapter: DataAdapter, the data adapter for the dataset.
-       subset_list: list, the subset list for the dataset.
        model_adapter: BaseModelAdapter, the model adapter for the model.
-
-
-       datasets_dir: str, the datasets dir. Default: DEFAULT_ROOT_CACHE_DIR
-       datasets_hub: str, the datasets hub. `Local`, `ModelScope` or `HuggingFace`. Default: 'ModelScope'
-       stage: str, the stage of evaluation. `all` or `infer` or `review`. Default: 'all'
-       eval_type: str, the evaluation type. `checkpoint` or `service` or `custom`. Default: 'checkpoint'
-       overall_task_cfg: dict, the overall task config. Default: None
+       outputs: OutputsStructure, the outputs dir. Default: None
+       task_cfg: TaskConfig, the overall task config. Default: None
        **kwargs: kwargs.
    """

    def __init__(self,
                 dataset_name_or_path: str,
                 data_adapter: DataAdapter,
-
-
-
-                 outputs: Optional[OutputsStructure] = None,
-                 datasets_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
-                 datasets_hub: Optional[str] = HubType.MODELSCOPE,
-                 stage: Optional[str] = EvalStage.ALL,
-                 eval_type: Optional[str] = EvalType.CHECKPOINT,
-                 overall_task_cfg: Optional[TaskConfig] = None,
+                 model_adapter: BaseModelAdapter,
+                 outputs: OutputsStructure = None,
+                 task_cfg: TaskConfig = None,
                 **kwargs):

        self.dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
        self.dataset_name = os.path.basename(self.dataset_name_or_path.rstrip(os.sep)).split('.')[0]
-        self.model_name =
+        self.model_name = task_cfg.model_id
        self.custom_task_name = f'{self.model_name}_{self.dataset_name}'

-        self.datasets_dir = os.path.expanduser(datasets_dir)
-        self.kwargs = kwargs
        self.data_adapter = data_adapter
        self.model_adapter = model_adapter
-        self.
-        self.
-        self.
-        self.
-
-
-
-        self.model_cfg = self.model_adapter.model_cfg
-
+        self.model_cfg = model_adapter.model_cfg
+        self.eval_type = task_cfg.eval_type
+        self.dataset_hub = task_cfg.dataset_hub
+        self.stage = task_cfg.stage
+        self.use_cache = task_cfg.use_cache
+        self.task_cfg = task_cfg
        # Deal with the output paths
        self.outputs_structure = outputs

-
-        self.dataset = self.data_adapter.load(
-            dataset_name_or_path=dataset_name_or_path,
-            subset_list=subset_list,
-            work_dir=self.datasets_dir,
-            datasets_hub=datasets_hub,
-            **kwargs)
-
-        # Get prompts from dataset
-        self.prompts = self.data_adapter.gen_prompts(data_dict=self.dataset)
-        del self.dataset
-
-    def _pred_answer(self, input_d: dict, infer_cfg: dict, subset_name: str, answer_id: str = None) -> dict:
+        self.kwargs = kwargs

-
-
-
+    def load_dataset(self):
+        dataset = self.data_adapter.load(
+            dataset_name_or_path=self.dataset_name_or_path,
+            subset_list=self.data_adapter.subset_list,
+            work_dir=os.path.expanduser(self.task_cfg.dataset_dir),
+            datasets_hub=self.dataset_hub,
+            **self.kwargs)

-
+        # Get prompts from dataset
+        prompts = self.data_adapter.gen_prompts(data_dict=dataset)
+        return prompts
+
+    def _generate_answer_id(self, model_cfg, input_d, infer_cfg):
+        model_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(model_cfg).items())), ensure_ascii=False)
+        input_prompt_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(input_d).items())), ensure_ascii=False)
+        infer_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False)
+        return 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)
+
+    def _process_answer(self, answer_d, input_d, subset_name, answer_id):
+        answer_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
+        answer_d[AnswerKeys.ANSWER_ID] = answer_id
+        answer_d[AnswerKeys.SUBSET_NAME] = subset_name
+        answer_d[AnswerKeys.RAW_INPUT] = input_d[AnswerKeys.RAW_INPUT]
+        answer_d[AnswerKeys.ORIGIN_PROMPT] = input_d
+        return answer_d

    def get_answers(self,
                    subset_name: str,
@@ -146,57 +136,24 @@ class Evaluator(object):
            resp_answers_list: List[Dict[str, Any]] = self.model_adapter.predict(
                inputs=prompts_list, infer_cfg=infer_cfg)

-
-
-
-
-
-                # Gen answer_id (concat: model_cfg + input_prompt + infer_cfg)
-                model_cfg_str = json.dumps(
-                    OrderedDict(sorted(dict_torch_dtype_to_str(self.model_adapter.model_cfg).items())),
-                    ensure_ascii=False)
-                input_prompt_str = json.dumps(
-                    OrderedDict(sorted(dict_torch_dtype_to_str(in_d).items())), ensure_ascii=False)
-                infer_cfg_str = json.dumps(
-                    OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False)
-                answer_id = 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)
-
-                resp_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
-                resp_d[AnswerKeys.ANSWER_ID] = answer_id
-                resp_d[AnswerKeys.SUBSET_NAME] = subset_name
-                resp_d[AnswerKeys.RAW_INPUT] = in_d[AnswerKeys.RAW_INPUT]
-                resp_d[AnswerKeys.ORIGIN_PROMPT] = in_d
-
-                answers_list.append(resp_d)
-                dump_jsonl_data(resp_d, pred_file_path, dump_mode=DumpMode.APPEND)
+            for input_prompt, answer_d in zip(prompts_list, resp_answers_list):
+                answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
+                processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
+                answers_list.append(processed_answer)
+                dump_jsonl_data(processed_answer, pred_file_path, dump_mode=DumpMode.APPEND)

        else:
            for input_prompt in tqdm(prompts_list, total=len(prompts_list), desc=f'Predicting({subset_name}): '):
-
-
-
-                    OrderedDict(sorted(dict_torch_dtype_to_str(self.model_adapter.model_cfg).items())),
-                    ensure_ascii=False)
-                input_prompt_str = json.dumps(
-                    OrderedDict(sorted(dict_torch_dtype_to_str(input_prompt).items())), ensure_ascii=False)
-                infer_cfg_str = json.dumps(
-                    OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False)
-                answer_id = 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)
-
-                # Get answers
-                answer_d: dict = self._pred_answer(
-                    input_d=input_prompt, infer_cfg=infer_cfg, subset_name=subset_name, answer_id=answer_id)
-
-                answer_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
-                answer_d[AnswerKeys.RAW_INPUT] = input_prompt[AnswerKeys.RAW_INPUT]
-                answer_d[AnswerKeys.ORIGIN_PROMPT] = input_prompt
+                answer_d: dict = self.model_adapter.predict(inputs=input_prompt, infer_cfg=infer_cfg)
+                answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
+                processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)

                if debug:
                    logger.info(f'**input_prompt: {json.dumps(input_prompt, ensure_ascii=False)} \n')
-                    logger.info(f'**predicted ans: {json.dumps(
+                    logger.info(f'**predicted ans: {json.dumps(processed_answer, ensure_ascii=False)} \n')

-                answers_list.append(
-                dump_jsonl_data(
+                answers_list.append(processed_answer)
+                dump_jsonl_data(processed_answer, pred_file_path, dump_mode=DumpMode.APPEND)

        logger.info(f'Dump predictions to {pred_file_path}.')
        return answers_list
@@ -240,6 +197,19 @@ class Evaluator(object):

        return review_res

+    def _generate_review_id(self, answer_d):
+        # Gen review_id (concat: answer_id + reviewer_spec)
+        answer_id = answer_d[AnswerKeys.ANSWER_ID]
+        reviewer_spec = {
+            'metric': [metric_d['name'] for metric_d in self.data_adapter.metric_list],
+            'reviewer': ['Evaluator'],
+            'revision': ['default']
+        }
+        reviewer_spec_str = json.dumps(
+            OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())), ensure_ascii=False)
+        review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str)
+        return review_id, reviewer_spec
+
    def get_reviews(self, subset_name: str, answers_list: List[dict], debug: bool = False, **kwargs) -> list:
        """
        Get reviews from answers.
@@ -263,19 +233,7 @@ class Evaluator(object):
            logger.warning(f'Ignore use_cache={self.use_cache}, updating the review file: {review_file_path} ...')

        for answer_d in tqdm(answers_list, total=len(answers_list), desc=f'Reviewing({subset_name}): '):
-
-            # Gen review_id (concat: answer_id + reviewer_spec)
-            answer_id = answer_d[AnswerKeys.ANSWER_ID]
-
-            reviewer_spec: dict = {
-                'metric': [metric_d['name'] for metric_d in self.data_adapter.metric_list],
-                'reviewer': ['Evaluator'],
-                'revision': ['default']
-            }
-            reviewer_spec_str = json.dumps(
-                OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())), ensure_ascii=False)
-            review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str)
-
+            review_id, reviewer_spec = self._generate_review_id(answer_d)
            # Get review
            review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)

@@ -283,7 +241,6 @@ class Evaluator(object):
                logger.info(review_d)

            reviews_list.append(review_d)
-
            # Dump reviews
            dump_jsonl_data(review_d, review_file_path, dump_mode=DumpMode.APPEND)

@@ -379,7 +336,8 @@ class Evaluator(object):
        stage_answers_dict = {}
        stage_reviews_dict = {}

-
+        prompts = self.load_dataset()
+        for subset_name, prompts_list in prompts.items():
            limit = kwargs.get('limit', len(prompts_list))
            prompts_list = prompts_list[:limit]

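
The refactor above folds the previously duplicated hashing code into _generate_answer_id: the model config, the prompt and the inference config are serialized with sorted keys and hashed into a stable cache key. A standalone illustration of that idea (the real code uses evalscope's gen_hash and dict_torch_dtype_to_str; sha256 here is an assumption):

import hashlib
import json
from collections import OrderedDict

def make_answer_id(model_cfg: dict, input_d: dict, infer_cfg: dict) -> str:
    parts = [
        json.dumps(OrderedDict(sorted(d.items())), ensure_ascii=False)
        for d in (model_cfg, input_d, infer_cfg)
    ]
    # Identical (model config, prompt, inference config) triples always yield the same id,
    # which is what lets cached predictions be reused between runs.
    return 'answer-' + hashlib.sha256(''.join(parts).encode('utf-8')).hexdigest()

print(make_answer_id({'model': 'demo'}, {'data': ['1+1=']}, {'max_new_tokens': 16}))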
evalscope/evaluator/reviewer/auto_reviewer.py
CHANGED
@@ -8,10 +8,10 @@ import sys
 import time
 from abc import ABC, abstractmethod
 from functools import partial
-from typing import Any, List
+from typing import Any, List, Tuple

 from evalscope.constants import ArenaMode, EvalConfigKeys, FnCompletionParser, PositionBiasMitigation
-from evalscope.models.
+from evalscope.models.model import OpenAIModel
 from evalscope.utils import completion_parsers, random_seeded_choice
 from evalscope.utils.arena_utils import get_battle_pairs, merge_ques_ans, shuffle_pairwise_preferences
 from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list
@@ -240,7 +240,15 @@ class AutoReviewerGpt4(BaseReviewer):
            review_text=review_text)
        return review_result

-    def _get_review_pair(self,
+    def _get_review_pair(self,
+                         model_a,
+                         model_b,
+                         question,
+                         category,
+                         ans1,
+                         ans2,
+                         dry_run=False,
+                         **kwargs) -> Tuple[str, Any]:
        input_msg = dict(ques=question, category=category, ans1=ans1, ans2=ans2)

        if self.reference_list:
@@ -263,7 +271,7 @@ class AutoReviewerGpt4(BaseReviewer):
            result = (result, None)
        return review_text, *result

-    def _get_review_single(self, model, question, category, answer, dry_run=False, **kwargs) ->
+    def _get_review_single(self, model, question, category, answer, dry_run=False, **kwargs) -> Tuple[str, Any]:
        input_msg = dict(ques=question, category=category, ans1=answer)

        if self.reference_list:
evalscope/metrics/__init__.py
CHANGED
@@ -1 +1,7 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+from evalscope.metrics.metrics import bleu_ngram_one_sample, exact_match, weighted_mean
+from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
+
+WeightedAverageAccuracy = {'name': 'WeightedAverageAccuracy', 'object': weighted_mean}
+WeightedAverageBLEU = {'name': 'WeightedAverageBLEU', 'object': weighted_mean}
+Pass1 = {'name': 'Pass@1', 'object': weighted_mean}
@@ -55,7 +55,7 @@ try:
        os.system(f'wget --timeout=10 --tries=3 -P {nltk_dir} {punkt_tab_url}')
        os.system(f'unzip {punkt_path} -d {nltk_dir}')
    else:
-        logger.
+        logger.debug(f'{punkt_path} already exists, skipping download')
 except Exception as e:
    logger.error(f'Try to download punkt_tab.zip for nltk failed: {e}')

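
The metric registry entries added to evalscope/metrics/__init__.py are plain dicts pairing a display name with an aggregation callable, so data adapters can reference them declaratively. A small access sketch (assumes the package is installed; it only touches the fields shown in the hunk above):

from evalscope.metrics import Pass1, WeightedAverageAccuracy, WeightedAverageBLEU

for metric in (WeightedAverageAccuracy, WeightedAverageBLEU, Pass1):
    print(metric['name'], metric['object'])  # 'object' is weighted_mean in all three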