evalscope 0.8.2__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +2 -0
- evalscope/arguments.py +11 -3
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/__init__.py +20 -1
- evalscope/benchmarks/arc/__init__.py +0 -5
- evalscope/benchmarks/arc/arc_adapter.py +24 -102
- evalscope/benchmarks/bbh/__init__.py +0 -4
- evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
- evalscope/benchmarks/benchmark.py +70 -59
- evalscope/benchmarks/ceval/__init__.py +0 -5
- evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
- evalscope/benchmarks/cmmlu/__init__.py +0 -5
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
- evalscope/benchmarks/competition_math/__init__.py +0 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
- evalscope/benchmarks/data_adapter.py +115 -87
- evalscope/benchmarks/general_qa/__init__.py +0 -5
- evalscope/benchmarks/general_qa/general_qa_adapter.py +23 -79
- evalscope/benchmarks/gsm8k/__init__.py +0 -4
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +21 -101
- evalscope/benchmarks/hellaswag/__init__.py +0 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +32 -99
- evalscope/benchmarks/humaneval/__init__.py +0 -4
- evalscope/benchmarks/humaneval/humaneval_adapter.py +18 -120
- evalscope/benchmarks/ifeval/__init__.py +0 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
- evalscope/benchmarks/ifeval/instructions.py +1478 -0
- evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
- evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
- evalscope/benchmarks/ifeval/utils.py +134 -0
- evalscope/benchmarks/iquiz/__init__.py +0 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
- evalscope/benchmarks/mmlu/__init__.py +0 -5
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
- evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
- evalscope/benchmarks/race/__init__.py +0 -5
- evalscope/benchmarks/race/race_adapter.py +26 -123
- evalscope/benchmarks/trivia_qa/__init__.py +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
- evalscope/benchmarks/truthful_qa/__init__.py +0 -5
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +29 -0
- evalscope/collections/__init__.py +3 -0
- evalscope/collections/evaluator.py +198 -0
- evalscope/collections/sampler.py +138 -0
- evalscope/collections/schema.py +126 -0
- evalscope/config.py +7 -5
- evalscope/constants.py +9 -26
- evalscope/evaluator/evaluator.py +87 -121
- evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
- evalscope/metrics/__init__.py +3 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/math_accuracy.py +193 -50
- evalscope/metrics/metrics.py +18 -6
- evalscope/metrics/named_metrics.py +17 -0
- evalscope/metrics/rouge_metric.py +13 -8
- evalscope/models/__init__.py +14 -1
- evalscope/models/base_adapter.py +52 -0
- evalscope/models/chat_adapter.py +138 -0
- evalscope/models/choice_adapter.py +211 -0
- evalscope/models/custom_adapter.py +67 -0
- evalscope/models/local_model.py +74 -0
- evalscope/models/model.py +141 -0
- evalscope/models/server_adapter.py +111 -0
- evalscope/perf/__init__.py +1 -0
- evalscope/perf/main.py +0 -1
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +1 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/longalpaca.py +1 -1
- evalscope/report/__init__.py +5 -0
- evalscope/report/app.py +506 -0
- evalscope/report/combinator.py +73 -0
- evalscope/report/generator.py +80 -0
- evalscope/report/utils.py +133 -0
- evalscope/run.py +48 -72
- evalscope/run_arena.py +1 -1
- evalscope/summarizer.py +1 -1
- evalscope/utils/__init__.py +1 -1
- evalscope/utils/chat_service.py +5 -4
- evalscope/utils/io_utils.py +8 -0
- evalscope/utils/logger.py +5 -0
- evalscope/utils/model_utils.py +15 -2
- evalscope/utils/utils.py +3 -25
- evalscope/version.py +2 -2
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/METADATA +115 -21
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/RECORD +99 -78
- tests/cli/test_collection.py +57 -0
- tests/cli/test_run.py +52 -1
- tests/rag/test_mteb.py +3 -2
- evalscope/models/api/__init__.py +0 -3
- evalscope/models/dummy_chat_model.py +0 -49
- evalscope/models/model_adapter.py +0 -525
- evalscope/models/openai_model.py +0 -103
- evalscope/tools/__init__.py +0 -1
- evalscope/tools/combine_reports.py +0 -133
- evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
- /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
- /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,133 @@
+import json
+import pandas as pd
+from collections import defaultdict
+from dataclasses import asdict, dataclass, field
+from typing import Any, Dict, List
+
+from evalscope.metrics import macro_mean, micro_mean
+from evalscope.utils import normalize_score
+
+
+@dataclass
+class Subset:
+    name: str = 'default_subset'
+    score: float = 0.0
+    num: int = 0
+
+    def __post_init__(self):
+        self.score = normalize_score(self.score)
+
+
+@dataclass
+class Category:
+    name: tuple[str] = field(default_factory=tuple)
+    num: int = 0
+    score: float = 0.0
+    macro_score: float = 0.0
+    subsets: List[Subset] = field(default_factory=list)
+
+    def __post_init__(self):
+        if isinstance(self.name, str):
+            # ensure name is tuple format
+            self.name = (self.name, )
+        self.num = sum(subset.num for subset in self.subsets)
+        self.score = normalize_score(micro_mean(self.subsets))
+        self.macro_score = normalize_score(macro_mean(self.subsets))
+
+    @classmethod
+    def from_dict(cls, data: dict):
+        subsets = [Subset(**subset) for subset in data.get('subsets', [])]
+        return cls(name=data['name'], subsets=subsets)
+
+
+@dataclass
+class Metric:
+    name: str = 'default_metric'
+    num: int = 0
+    score: float = 0.0
+    macro_score: float = 0.0
+    categories: List[Category] = field(default_factory=list)
+
+    def __post_init__(self):
+        self.num = sum(category.num for category in self.categories)
+        self.score = normalize_score(micro_mean(self.categories))
+        self.macro_score = normalize_score(macro_mean(self.categories))
+
+    @classmethod
+    def from_dict(cls, data: dict):
+        categories = [Category.from_dict(category) for category in data.get('categories', [])]
+        return cls(name=data['name'], categories=categories)
+
+
+class ReportKey:
+    model_name = 'Model'
+    dataset_name = 'Dataset'
+    metric_name = 'Metric'
+    category_name = 'Category'
+    category_prefix = 'Cat.'
+    subset_name = 'Subset'
+    num = 'Num'
+    score = 'Score'
+
+
+@dataclass
+class Report:
+    name: str = 'default_report'
+    dataset_name: str = 'default_dataset'
+    model_name: str = 'default_model'
+    score: float = 0.0
+    metrics: List[Metric] = field(default_factory=list)
+
+    def __post_init__(self):
+        self.score = self.metrics[0].score  # NOTE: only use the first metric by default
+
+    def to_dict(self) -> Dict[str, Any]:
+        return asdict(self)
+
+    @classmethod
+    def from_dict(cls, data: dict):
+        metrics = [Metric.from_dict(metric) for metric in data.get('metrics', [])]
+        return cls(
+            name=data['name'],
+            score=data['score'],
+            metrics=metrics,
+            dataset_name=data['dataset_name'],
+            model_name=data['model_name'])
+
+    @classmethod
+    def from_json(cls, json_file: str):
+        with open(json_file, 'r') as f:
+            data = json.load(f)
+        return cls.from_dict(data)
+
+    def to_dataframe(self, flatten_metrics: bool = True, flatten_categories: bool = True):
+        table = defaultdict(list)
+        for metric in self.metrics:
+            for category in metric.categories:
+                for subset in category.subsets:
+                    table[ReportKey.model_name].append(self.model_name)
+                    table[ReportKey.dataset_name].append(self.dataset_name)
+                    table[ReportKey.metric_name].append(metric.name)
+                    table[ReportKey.category_name].append(category.name)
+                    table[ReportKey.subset_name].append(subset.name)
+                    table[ReportKey.num].append(subset.num)
+                    table[ReportKey.score].append(subset.score)  # TODO: convert to percentage
+            # NOTE: only flatten metrics if needed, use the first metric by default
+            if not flatten_metrics:
+                break
+        df = pd.DataFrame.from_dict(table, orient='columns')
+        if flatten_categories:
+            df = self._flatten_categories(df)
+        return df
+
+    def _flatten_categories(self, df: pd.DataFrame):
+        # expand categories to multiple rows
+        df_categories = df.copy()
+        # multi-level aggregation for categories
+        max_depth = df_categories[ReportKey.category_name].apply(len).max()
+        for level in range(max_depth):
+            df_categories[f'{ReportKey.category_prefix}{level}'] = df_categories[ReportKey.category_name].apply(
+                lambda x: x[level] if len(x) > level else None)
+
+        df_categories.drop(columns=[ReportKey.category_name], inplace=True)
+        return df_categories
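The new report schema above nests Subset → Category → Metric → Report, with micro/macro means and score normalization computed in each `__post_init__`. A minimal usage sketch follows; the `evalscope.report.utils` import path is assumed from the file list, and the model/dataset names are illustrative only.

```python
from evalscope.report.utils import Category, Metric, Report, Subset  # path assumed from the file list

# Build a tiny report: one metric, one two-level category, one subset.
report = Report(
    name='demo_report',
    dataset_name='mmlu',
    model_name='my-model',  # hypothetical model name
    metrics=[
        Metric(
            name='AverageAccuracy',
            categories=[
                Category(
                    name=('STEM', 'math'),  # tuple name becomes Cat.0 / Cat.1 columns
                    subsets=[Subset(name='abstract_algebra', score=0.42, num=100)],
                )
            ],
        )
    ],
)

df = report.to_dataframe()  # one row per subset; categories flattened into Cat.N columns
print(df[['Model', 'Dataset', 'Metric', 'Subset', 'Num', 'Score']])
```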
evalscope/run.py
CHANGED
@@ -2,26 +2,21 @@
 """
 Run evaluation for LLMs.
 """
-import logging
 import os.path
-import torch
 from argparse import Namespace
 from datetime import datetime
-from typing import List, Optional, Union
+from typing import TYPE_CHECKING, List, Optional, Union
 
-from evalscope.arguments import parse_args
 from evalscope.config import TaskConfig, parse_task_config
-from evalscope.constants import
-from evalscope.
-from evalscope.
-from evalscope.utils import import_module_util, seed_everything
-from evalscope.utils.io_utils import OutputsStructure, are_paths_same
+from evalscope.constants import DataCollection, EvalBackend
+from evalscope.utils import seed_everything
+from evalscope.utils.io_utils import OutputsStructure
 from evalscope.utils.logger import configure_logging, get_logger
 
-
+if TYPE_CHECKING:
+    from evalscope.models import LocalModel
 
-
-MEMBERS_TO_IMPORT = ['DATASET_ID', 'SUBSET_LIST', 'DataAdapterClass', 'ModelAdapterClass']
+logger = get_logger()
 
 
 def run_task(task_cfg: Union[str, dict, TaskConfig, List[TaskConfig], Namespace]) -> Union[dict, List[dict]]:
@@ -38,15 +33,13 @@ def run_task(task_cfg: Union[str, dict, TaskConfig, List[TaskConfig], Namespace]
 
 def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:
     """Run a single evaluation task."""
-
+    if task_cfg.seed is not None:
+        seed_everything(task_cfg.seed)
     outputs = setup_work_directory(task_cfg, run_time)
     configure_logging(task_cfg.debug, os.path.join(outputs.logs_dir, 'eval_log.log'))
 
-    task_cfg.dump_yaml(outputs.configs_dir)
-    logger.info(task_cfg)
-
     if task_cfg.eval_backend != EvalBackend.NATIVE:
-        return run_non_native_backend(task_cfg)
+        return run_non_native_backend(task_cfg, outputs)
     else:
         return evaluate_model(task_cfg, outputs)
 
@@ -56,8 +49,8 @@ def setup_work_directory(task_cfg: TaskConfig, run_time: str):
     if task_cfg.use_cache:
         task_cfg.work_dir = task_cfg.use_cache
         logger.info(f'Set resume from {task_cfg.work_dir}')
-    elif are_paths_same(task_cfg.work_dir, DEFAULT_WORK_DIR):
-
+    # elif are_paths_same(task_cfg.work_dir, DEFAULT_WORK_DIR):
+    task_cfg.work_dir = os.path.join(task_cfg.work_dir, run_time)
 
     outputs = OutputsStructure(outputs_dir=task_cfg.work_dir)
 
@@ -68,7 +61,7 @@ def setup_work_directory(task_cfg: TaskConfig, run_time: str):
     return outputs
 
 
-def run_non_native_backend(task_cfg: TaskConfig) -> dict:
+def run_non_native_backend(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
     """Run evaluation using a non-native backend."""
     eval_backend = task_cfg.eval_backend
     eval_config = task_cfg.eval_config
@@ -78,6 +71,10 @@ def run_non_native_backend(task_cfg: TaskConfig) -> dict:
 
     backend_manager_class = get_backend_manager_class(eval_backend)
     backend_manager = backend_manager_class(config=eval_config)
+
+    task_cfg.dump_yaml(outputs.configs_dir)
+    logger.info(task_cfg)
+
     backend_manager.run()
 
     return dict()
@@ -100,78 +97,57 @@ def get_backend_manager_class(eval_backend: EvalBackend):
 
 def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
     """Evaluate the model based on the provided task configuration."""
+    from evalscope.models import get_local_model
+
     # Initialize evaluator
     eval_results = {}
-
+    base_model = get_local_model(task_cfg)
+    evaluators = []
     for dataset_name in task_cfg.datasets:
-        evaluator = create_evaluator(task_cfg, dataset_name, outputs)
+        evaluator = create_evaluator(task_cfg, dataset_name, outputs, base_model)
+        evaluators.append(evaluator)
+
+    # dump task_cfg to outputs.configs_dir after creating evaluators
+    task_cfg.dump_yaml(outputs.configs_dir)
+    logger.info(task_cfg)
+
+    for evaluator in evaluators:
         res_dict = evaluator.eval(infer_cfg=task_cfg.generation_config, debug=task_cfg.debug, limit=task_cfg.limit)
         eval_results[dataset_name] = res_dict
 
     return eval_results
 
 
-def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsStructure):
+def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsStructure, base_model: 'LocalModel'):
     """Create an evaluator object for the specified dataset."""
-
-
-
-
-
-
-
-
-
-    data_adapter = imported_modules['DataAdapterClass'](
-        few_shot_num=few_shot_num,
-        few_shot_random=few_shot_random,
-        prompt_template=in_prompt_template,
-        outputs=outputs,
-    )
-    in_subset_list = dataset_config.get('subset_list', imported_modules['SUBSET_LIST'])
+    from evalscope.benchmarks import Benchmark, BenchmarkMeta
+    from evalscope.evaluator import Evaluator
+    from evalscope.models import initialize_model_adapter
+
+    if dataset_name == DataCollection.NAME:
+        # EvaluatorCollection is a collection of evaluators
+        from evalscope.collections import EvaluatorCollection
+        return EvaluatorCollection(task_cfg, outputs)
 
-
+    benchmark: BenchmarkMeta = Benchmark.get(dataset_name)
+
+    data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
+    model_adapter = initialize_model_adapter(task_cfg, benchmark.model_adapter, base_model)
+
+    # update task_cfg.dataset_args
+    task_cfg.dataset_args[dataset_name] = benchmark.to_string_dict()
 
     return Evaluator(
-        dataset_name_or_path=
-        subset_list=in_subset_list,
+        dataset_name_or_path=benchmark.dataset_id,
         data_adapter=data_adapter,
         model_adapter=model_adapter,
-        use_cache=task_cfg.use_cache,
         outputs=outputs,
-
-        datasets_hub=task_cfg.dataset_hub,
-        stage=task_cfg.stage,
-        eval_type=task_cfg.eval_type,
-        overall_task_cfg=task_cfg,
+        task_cfg=task_cfg,
     )
 
 
-def initialize_model_adapter(task_cfg: TaskConfig, dataset_name: str, imported_modules):
-    """Initialize the model adapter based on the task configuration."""
-    if task_cfg.dry_run:
-        from evalscope.models.dummy_chat_model import DummyChatModel
-        return DummyChatModel(model_cfg=dict())
-    elif task_cfg.eval_type == EvalType.CUSTOM:
-        if not isinstance(task_cfg.model, CustomModel):
-            raise ValueError(f'Expected evalscope.models.custom.CustomModel, but got {type(task_cfg.model)}.')
-        from evalscope.models.model_adapter import CustomModelAdapter
-        return CustomModelAdapter(custom_model=task_cfg.model)
-    else:
-        device_map = task_cfg.model_args.get('device_map', 'auto') if torch.cuda.is_available() else None
-        model_precision = task_cfg.model_args.get('precision', torch.float16)
-        if isinstance(model_precision, str) and model_precision != 'auto':
-            model_precision = eval(model_precision)
-        return imported_modules['ModelAdapterClass'](
-            model_id=task_cfg.model,
-            model_revision=task_cfg.model_args.get('revision', DEFAULT_MODEL_REVISION),
-            device_map=device_map,
-            torch_dtype=model_precision,
-            generation_config=task_cfg.generation_config,
-            chat_template=task_cfg.chat_template)
-
-
 def main():
+    from evalscope.arguments import parse_args
     args = parse_args()
     run_task(args)
 
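The refactor above drops the old `import_module_util`/`MEMBERS_TO_IMPORT` lookup in favour of the `Benchmark` registry and a single shared `LocalModel`. Below is a condensed sketch of the new native flow, paraphrasing the hunks above; `build_evaluators` is an illustrative helper name, everything else is taken from the diff.

```python
from evalscope.benchmarks import Benchmark
from evalscope.evaluator import Evaluator
from evalscope.models import get_local_model, initialize_model_adapter


def build_evaluators(task_cfg, outputs):
    base_model = get_local_model(task_cfg)  # loaded once, shared by every dataset
    evaluators = []
    for dataset_name in task_cfg.datasets:
        benchmark = Benchmark.get(dataset_name)  # registry lookup replaces import_module_util
        data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
        model_adapter = initialize_model_adapter(task_cfg, benchmark.model_adapter, base_model)
        evaluators.append(
            Evaluator(
                dataset_name_or_path=benchmark.dataset_id,
                data_adapter=data_adapter,
                model_adapter=model_adapter,
                outputs=outputs,
                task_cfg=task_cfg,
            ))
    return evaluators
```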
evalscope/run_arena.py
CHANGED
@@ -10,7 +10,7 @@ from tqdm import tqdm
 
 from evalscope.constants import EvalConfigKeys
 from evalscope.evaluator.rating_eval import RatingEvaluate
-from evalscope.models
+from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils import get_obj_from_cfg
 from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list, yaml_to_dict
 from evalscope.utils.logger import get_logger
evalscope/summarizer.py
CHANGED
@@ -6,7 +6,7 @@ from typing import List, Union
 
 from evalscope.config import TaskConfig, parse_task_config
 from evalscope.constants import EvalBackend
-from evalscope.
+from evalscope.report import gen_table
 from evalscope.utils import csv_to_list, get_latest_folder_path
 from evalscope.utils.io_utils import OutputsStructure, json_to_dict, yaml_to_dict
 from evalscope.utils.logger import get_logger
evalscope/utils/__init__.py
CHANGED
evalscope/utils/chat_service.py
CHANGED
@@ -3,11 +3,9 @@ import time
 import torch
 from contextlib import contextmanager
 from functools import partial
-from modelscope import AutoModelForCausalLM, AutoTokenizer
 from pydantic import BaseModel, Field
 from threading import Thread
-from
-from typing import List, Literal, Optional, Union
+from typing import Any, List, Literal, Optional, Union
 
 
 class Usage(BaseModel):
@@ -66,7 +64,7 @@ class ChatCompletionResponseStreamChoice(BaseModel):
 class ChatCompletionResponse(BaseModel):
     model: str
     object: Literal['chat.completion', 'chat.completion.chunk']
-    choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]]
+    choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice, Any]]
     created: Optional[int] = Field(default_factory=lambda: int(time.time()))
     usage: Optional[Usage]
 
@@ -96,6 +94,9 @@ class TextCompletionResponse(BaseModel):
 class ChatService:
 
     def __init__(self, model_path, attn_implementation):
+        from modelscope import AutoModelForCausalLM, AutoTokenizer
+        from transformers import TextIteratorStreamer
+
         self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
         self.model = AutoModelForCausalLM.from_pretrained(
             model_path,
evalscope/utils/io_utils.py
CHANGED
@@ -160,3 +160,11 @@ def are_paths_same(path1, path2):
     real_path2 = os.path.realpath(os.path.abspath(os.path.expanduser(path2)))
 
     return real_path1 == real_path2
+
+
+def dict_to_json(d: dict, json_file: str):
+    """
+    Dump dict to json file.
+    """
+    with open(json_file, 'w') as f:
+        json.dump(d, f, indent=4, ensure_ascii=False)
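A quick usage sketch of the new helper (the output path below is made up):

```python
from evalscope.utils.io_utils import dict_to_json

# Writes pretty-printed, UTF-8-preserving JSON (indent=4, ensure_ascii=False).
dict_to_json({'dataset': 'gsm8k', 'score': 0.873}, 'outputs/summary.json')
```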
evalscope/utils/logger.py
CHANGED
@@ -14,6 +14,11 @@ DEFAULT_LEVEL = logging.DEBUG if os.getenv('LOG_LEVEL', 'INFO') == 'DEBUG' else
 
 logging.basicConfig(format=simple_format, level=DEFAULT_LEVEL)
 
+# disable datasets logging
+logging.getLogger('datasets').setLevel(logging.WARNING)
+logging.getLogger('modelscope').setLevel(logging.WARNING)
+logging.getLogger('httpx').setLevel(logging.WARNING)
+
 
 def get_logger(log_file: Optional[str] = None, log_level: int = DEFAULT_LEVEL, file_mode: str = 'w', force=False):
     """Get logging logger
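With the module-level `setLevel` calls above, noisy third-party loggers are capped at WARNING as soon as `evalscope.utils.logger` is imported. A small sketch of the observable effect:

```python
from evalscope.utils.logger import get_logger

logger = get_logger()
logger.info('starting evaluation')  # evalscope's own logs still appear at INFO
# 'datasets', 'modelscope' and 'httpx' records now only surface at WARNING and above.
```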
evalscope/utils/model_utils.py
CHANGED
@@ -1,7 +1,20 @@
-from
+from enum import Enum
+from typing import TYPE_CHECKING
 
+if TYPE_CHECKING:
+    from transformers import GenerationConfig
 
-
+
+class EvalBackend(Enum):
+    # NOTE: compatible with ms-swfit v2.x
+    NATIVE = 'Native'
+    OPEN_COMPASS = 'OpenCompass'
+    VLM_EVAL_KIT = 'VLMEvalKit'
+    RAG_EVAL = 'RAGEval'
+    THIRD_PARTY = 'ThirdParty'
+
+
+def fix_do_sample_warning(generation_config: 'GenerationConfig') -> None:
     # Use the default values of temperature/top_p/top_k in generation_config.
     if generation_config.temperature == 0:
         generation_config.do_sample = False
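`EvalBackend` now lives in `evalscope/utils/model_utils.py`, and `fix_do_sample_warning` takes a lazily-typed `'GenerationConfig'`. A hedged sketch of how the helper behaves per the hunk above (assumes transformers is installed):

```python
from transformers import GenerationConfig

from evalscope.utils.model_utils import EvalBackend, fix_do_sample_warning

assert EvalBackend.NATIVE.value == 'Native'

gen_cfg = GenerationConfig(temperature=0.0, do_sample=True)
fix_do_sample_warning(gen_cfg)      # temperature == 0 -> fall back to greedy decoding
assert gen_cfg.do_sample is False   # avoids the HF "do_sample with temperature" warning
```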
evalscope/utils/utils.py
CHANGED
@@ -121,7 +121,6 @@ class ResponseParser:
             f'([{options_concat}])\s?是正确答案',
             f'选项\s?([{options_concat}])\s?正确',
             f'所以答\s?([{options_concat}])',
-            f'1.\s?([{options_concat}])[.。$]?$',
             f'所以\s?([{options_concat}][.。$]?$)',
             f'所有\s?([{options_concat}][.。$]?$)',
             f'[\s,::,]([{options_concat}])[。,,\.]?$',
@@ -137,16 +136,15 @@
             f'答案为(.*?)[{options_concat}]',
             f'固选(.*?)[{options_concat}]',
             f'答案应该是(.*?)[{options_concat}]',
-            f'[Tt]he answer is [{options_concat}]',
+            f'[Tt]he answer is \(?[{options_concat}]\)?',
             f'[Tt]he correct answer is [{options_concat}]',
             f'[Tt]he correct answer is:\n[{options_concat}]',
             f'(\s|^)[{options_concat}][\s。,,\.$]',  # noqa
-            f'[{options_concat}]',
             f'^选项\s?([{options_concat}])',
             f'^([{options_concat}])\s?选?项',
             f'(\s|^)[{options_concat}][\s。,,::\.$]',
             f'(\s|^)[{options_concat}](\s|$)',
-            f'
+            f'[{options_concat}]',
         ]
 
         regexes = [re.compile(pattern) for pattern in patterns]
@@ -169,6 +167,7 @@
         """
         patterns = [
             r'[Aa]nswer:\s*(\w+)',
+            r'answer is \(?(\w+)\)?',
             r'[Tt]he correct answer is:\s*(\w+)',
             r'[Tt]he correct answer is:\n\s*(\w+)',
             r'[Tt]he correct answer is:\n\n-\s*(\w+)',
@@ -199,27 +198,6 @@
 
 
 
-def import_module_util(import_path_prefix: str, module_name: str, members_to_import: list) -> dict:
-    """
-    Import module utility function.
-
-    Args:
-        import_path_prefix: e.g. 'evalscope.benchmarks.'
-        module_name: The module name to import. e.g. 'mmlu'
-        members_to_import: The members to import.
-            e.g. ['DATASET_ID', 'SUBJECT_MAPPING', 'SUBSET_LIST', 'DataAdapterClass']
-
-    Returns:
-        dict: imported modules map. e.g. {'DATASET_ID': 'mmlu', 'SUBJECT_MAPPING': {...}, ...}
-    """
-    imported_modules = {}
-    module = importlib.import_module(import_path_prefix + module_name)
-    for member_name in members_to_import:
-        imported_modules[member_name] = getattr(module, member_name)
-
-    return imported_modules
-
-
 def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]:
     """
     Normalize score.
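The regex updates above let the answer extractors tolerate parentheses around the option letter (e.g. MMLU-Pro style "The answer is (B)") and demote the bare `[{options_concat}]` catch-all to the lowest-priority pattern. A standalone check of the new patterns (the options string is illustrative):

```python
import re

options_concat = 'ABCD'
pattern = rf'[Tt]he answer is \(?[{options_concat}]\)?'

assert re.search(pattern, 'The answer is (B).')  # parenthesised option now matches
assert re.search(pattern, 'the answer is C')     # plain option still matches
assert re.search(r'answer is \(?(\w+)\)?', 'The answer is (42)').group(1) == '42'
```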
evalscope/version.py
CHANGED