evalscope 0.9.0__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/arguments.py +1 -0
- evalscope/benchmarks/arc/arc_adapter.py +3 -5
- evalscope/benchmarks/bbh/bbh_adapter.py +3 -3
- evalscope/benchmarks/benchmark.py +1 -1
- evalscope/benchmarks/ceval/ceval_adapter.py +5 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +5 -79
- evalscope/benchmarks/competition_math/competition_math_adapter.py +4 -4
- evalscope/benchmarks/data_adapter.py +69 -70
- evalscope/benchmarks/general_qa/general_qa_adapter.py +10 -63
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +4 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +12 -6
- evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -4
- evalscope/benchmarks/ifeval/__init__.py +0 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
- evalscope/benchmarks/ifeval/instructions.py +1478 -0
- evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
- evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
- evalscope/benchmarks/ifeval/utils.py +134 -0
- evalscope/benchmarks/iquiz/__init__.py +0 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -84
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +2 -2
- evalscope/benchmarks/race/race_adapter.py +4 -73
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -6
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -57
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +29 -0
- evalscope/collections/evaluator.py +82 -62
- evalscope/collections/sampler.py +47 -41
- evalscope/collections/schema.py +14 -10
- evalscope/constants.py +4 -0
- evalscope/evaluator/evaluator.py +22 -13
- evalscope/metrics/__init__.py +2 -5
- evalscope/metrics/metrics.py +11 -2
- evalscope/metrics/named_metrics.py +17 -0
- evalscope/models/server_adapter.py +11 -4
- evalscope/perf/__init__.py +1 -0
- evalscope/perf/main.py +0 -1
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +1 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/longalpaca.py +1 -1
- evalscope/report/__init__.py +5 -0
- evalscope/report/app.py +506 -0
- evalscope/report/combinator.py +73 -0
- evalscope/report/generator.py +80 -0
- evalscope/report/utils.py +133 -0
- evalscope/run.py +16 -11
- evalscope/summarizer.py +1 -1
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/logger.py +1 -0
- evalscope/utils/model_utils.py +5 -2
- evalscope/version.py +2 -2
- {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/METADATA +84 -7
- {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/RECORD +62 -50
- tests/cli/test_collection.py +11 -7
- tests/cli/test_run.py +13 -4
- evalscope/tools/__init__.py +0 -1
- evalscope/tools/combine_reports.py +0 -133
- evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
- /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
- {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
- {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
- {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
evalscope/report/generator.py
ADDED
@@ -0,0 +1,80 @@
+import pandas as pd
+from pandas import DataFrame
+
+from evalscope.constants import DataCollection
+from evalscope.report.utils import *
+
+
+class ReportGenerator:
+
+    @staticmethod
+    def gen_report(subset_score_map: dict, report_name: str, **kwargs) -> Report:
+        """
+        Generate report for specific dataset.
+        subset_score_map: e.g. {subset_name: [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}, {'metric_name': 'WeightedAverageAccuracy', 'score': 0.3389, 'num': 100}]}
+        category_map: e.g. {'subset_name': ['category_name1', 'category_name2'], ...}
+        metric_list: e.g. [{'object': AverageAccuracy, 'name': 'AverageAccuracy'}, {'object': 'WeightedAverageAccuracy', 'name': 'WeightedAverageAccuracy'}]
+        """  # noqa: E501
+
+        dataset_name = kwargs.get('dataset_name', None)
+        model_name = kwargs.get('model_name', None)
+        category_map = kwargs.get('category_map', {})
+
+        def flatten_subset() -> DataFrame:
+            """
+            Flatten subset score map to a DataFrame.
+
+            Example:
+                        name  score  num categories      metric_name
+            0       ARC-Easy    0.5    2  [default]  AverageAccuracy
+            1  ARC-Challenge    0.5    2  [default]  AverageAccuracy
+            """
+            subsets = []
+            for subset_name, scores in subset_score_map.items():
+                for score_item in scores:
+                    categories = category_map.get(subset_name, ['default'])
+                    if isinstance(categories, str):
+                        categories = [categories]
+                    subsets.append(
+                        dict(
+                            name=subset_name,
+                            score=score_item['score'],
+                            num=score_item['num'],
+                            metric_name=score_item['metric_name'],
+                            categories=tuple(categories)))
+            df = pd.DataFrame(subsets)
+            return df
+
+        df = flatten_subset()
+
+        metrics_list = []
+        for metric_name, group_metric in df.groupby('metric_name'):
+            categories = []
+            for category_name, group_category in group_metric.groupby('categories'):
+                subsets = []
+                for _, row in group_category.iterrows():
+                    subsets.append(Subset(name=row['name'], score=row['score'], num=row['num']))
+
+                categories.append(Category(name=category_name, subsets=subsets))
+
+            metrics_list.append(Metric(name=metric_name, categories=categories))
+
+        report = Report(name=report_name, metrics=metrics_list, dataset_name=dataset_name, model_name=model_name)
+        return report
+
+    @staticmethod
+    def gen_collection_report(df: DataFrame, all_dataset_name: str, model_name: str) -> Report:
+        categories = []
+        for category_name, group_category in df.groupby('categories'):
+            subsets = []
+            for (dataset_name, subset_name), group_subset in group_category.groupby(['dataset_name', 'subset_name']):
+                avg_score = group_subset['score'].mean()
+                num = group_subset['score'].count()
+                subsets.append(Subset(name=f'{dataset_name}/{subset_name}', score=float(avg_score), num=int(num)))
+
+            categories.append(Category(name=category_name, subsets=subsets))
+        return Report(
+            name=DataCollection.NAME,
+            metrics=[Metric(name='Average', categories=categories)],
+            dataset_name=all_dataset_name,
+            model_name=model_name)
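As a rough illustration (not part of the diff), this is how the new `ReportGenerator` could be exercised once evalscope 0.10.0 is installed; the subset scores and the report/dataset/model names are made up for the example, and the import path assumes the module layout shown in the hunk above.

```python
# Hypothetical usage sketch, assuming evalscope 0.10.0 with the layout shown above.
from evalscope.report.generator import ReportGenerator

subset_score_map = {
    'ARC-Easy': [{'metric_name': 'AverageAccuracy', 'score': 0.8, 'num': 5}],
    'ARC-Challenge': [{'metric_name': 'AverageAccuracy', 'score': 0.4, 'num': 5}],
}
report = ReportGenerator.gen_report(
    subset_score_map,
    report_name='arc_demo',              # illustrative names, not from the diff
    dataset_name='ai2_arc',
    model_name='Qwen2.5-0.5B-Instruct',
)
print(report.to_dict())                  # nested metrics -> categories -> subsets
```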
evalscope/report/utils.py
ADDED
@@ -0,0 +1,133 @@
+import json
+import pandas as pd
+from collections import defaultdict
+from dataclasses import asdict, dataclass, field
+from typing import Any, Dict, List
+
+from evalscope.metrics import macro_mean, micro_mean
+from evalscope.utils import normalize_score
+
+
+@dataclass
+class Subset:
+    name: str = 'default_subset'
+    score: float = 0.0
+    num: int = 0
+
+    def __post_init__(self):
+        self.score = normalize_score(self.score)
+
+
+@dataclass
+class Category:
+    name: tuple[str] = field(default_factory=tuple)
+    num: int = 0
+    score: float = 0.0
+    macro_score: float = 0.0
+    subsets: List[Subset] = field(default_factory=list)
+
+    def __post_init__(self):
+        if isinstance(self.name, str):
+            # ensure name is tuple format
+            self.name = (self.name, )
+        self.num = sum(subset.num for subset in self.subsets)
+        self.score = normalize_score(micro_mean(self.subsets))
+        self.macro_score = normalize_score(macro_mean(self.subsets))
+
+    @classmethod
+    def from_dict(cls, data: dict):
+        subsets = [Subset(**subset) for subset in data.get('subsets', [])]
+        return cls(name=data['name'], subsets=subsets)
+
+
+@dataclass
+class Metric:
+    name: str = 'default_metric'
+    num: int = 0
+    score: float = 0.0
+    macro_score: float = 0.0
+    categories: List[Category] = field(default_factory=list)
+
+    def __post_init__(self):
+        self.num = sum(category.num for category in self.categories)
+        self.score = normalize_score(micro_mean(self.categories))
+        self.macro_score = normalize_score(macro_mean(self.categories))
+
+    @classmethod
+    def from_dict(cls, data: dict):
+        categories = [Category.from_dict(category) for category in data.get('categories', [])]
+        return cls(name=data['name'], categories=categories)
+
+
+class ReportKey:
+    model_name = 'Model'
+    dataset_name = 'Dataset'
+    metric_name = 'Metric'
+    category_name = 'Category'
+    category_prefix = 'Cat.'
+    subset_name = 'Subset'
+    num = 'Num'
+    score = 'Score'
+
+
+@dataclass
+class Report:
+    name: str = 'default_report'
+    dataset_name: str = 'default_dataset'
+    model_name: str = 'default_model'
+    score: float = 0.0
+    metrics: List[Metric] = field(default_factory=list)
+
+    def __post_init__(self):
+        self.score = self.metrics[0].score  # NOTE: only use the first metric by default
+
+    def to_dict(self) -> Dict[str, Any]:
+        return asdict(self)
+
+    @classmethod
+    def from_dict(cls, data: dict):
+        metrics = [Metric.from_dict(metric) for metric in data.get('metrics', [])]
+        return cls(
+            name=data['name'],
+            score=data['score'],
+            metrics=metrics,
+            dataset_name=data['dataset_name'],
+            model_name=data['model_name'])
+
+    @classmethod
+    def from_json(cls, json_file: str):
+        with open(json_file, 'r') as f:
+            data = json.load(f)
+        return cls.from_dict(data)
+
+    def to_dataframe(self, flatten_metrics: bool = True, flatten_categories: bool = True):
+        table = defaultdict(list)
+        for metric in self.metrics:
+            for category in metric.categories:
+                for subset in category.subsets:
+                    table[ReportKey.model_name].append(self.model_name)
+                    table[ReportKey.dataset_name].append(self.dataset_name)
+                    table[ReportKey.metric_name].append(metric.name)
+                    table[ReportKey.category_name].append(category.name)
+                    table[ReportKey.subset_name].append(subset.name)
+                    table[ReportKey.num].append(subset.num)
+                    table[ReportKey.score].append(subset.score)  # TODO: convert to percentage
+            # NOTE: only flatten metrics if needed, use the first metric by default
+            if not flatten_metrics:
+                break
+        df = pd.DataFrame.from_dict(table, orient='columns')
+        if flatten_categories:
+            df = self._flatten_categories(df)
+        return df
+
+    def _flatten_categories(self, df: pd.DataFrame):
+        # expand categories to multiple rows
+        df_categories = df.copy()
+        # multi-level aggregation for categories
+        max_depth = df_categories[ReportKey.category_name].apply(len).max()
+        for level in range(max_depth):
+            df_categories[f'{ReportKey.category_prefix}{level}'] = df_categories[ReportKey.category_name].apply(
+                lambda x: x[level] if len(x) > level else None)
+
+        df_categories.drop(columns=[ReportKey.category_name], inplace=True)
+        return df_categories
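The dataclasses above normalize scores and derive micro/macro means in `__post_init__`, and `Report.to_dataframe` flattens the metric/category/subset tree into one row per subset, expanding the category tuple into `Cat.0`, `Cat.1`, ... columns. A minimal round-trip sketch (illustrative only, assuming the module layout shown above):

```python
# Illustrative sketch, assuming evalscope 0.10.0 with the module shown above.
from evalscope.report.utils import Category, Metric, Report, Subset

subsets = [
    Subset(name='ARC-Easy', score=0.8, num=5),
    Subset(name='ARC-Challenge', score=0.4, num=5),
]
category = Category(name='default', subsets=subsets)            # str name becomes ('default',)
metric = Metric(name='AverageAccuracy', categories=[category])  # num/score/macro_score derived
report = Report(name='arc_demo', dataset_name='ai2_arc',
                model_name='Qwen2.5-0.5B-Instruct', metrics=[metric])

df = report.to_dataframe()   # one row per subset; 'Category' tuple expanded into 'Cat.0', ...
print(df[['Model', 'Dataset', 'Metric', 'Subset', 'Num', 'Score']])
```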
evalscope/run.py
CHANGED
@@ -5,18 +5,17 @@ Run evaluation for LLMs.
 import os.path
 from argparse import Namespace
 from datetime import datetime
-from typing import List, Optional, Union
+from typing import TYPE_CHECKING, List, Optional, Union
 
-from evalscope.arguments import parse_args
-from evalscope.benchmarks import Benchmark, BenchmarkMeta
 from evalscope.config import TaskConfig, parse_task_config
-from evalscope.constants import
-from evalscope.evaluator import Evaluator
-from evalscope.models import LocalModel, get_local_model, initialize_model_adapter
+from evalscope.constants import DataCollection, EvalBackend
 from evalscope.utils import seed_everything
-from evalscope.utils.io_utils import OutputsStructure
+from evalscope.utils.io_utils import OutputsStructure
 from evalscope.utils.logger import configure_logging, get_logger
 
+if TYPE_CHECKING:
+    from evalscope.models import LocalModel
+
 logger = get_logger()
 
 
@@ -50,8 +49,8 @@ def setup_work_directory(task_cfg: TaskConfig, run_time: str):
     if task_cfg.use_cache:
         task_cfg.work_dir = task_cfg.use_cache
         logger.info(f'Set resume from {task_cfg.work_dir}')
-    elif are_paths_same(task_cfg.work_dir, DEFAULT_WORK_DIR):
-
+    # elif are_paths_same(task_cfg.work_dir, DEFAULT_WORK_DIR):
+    task_cfg.work_dir = os.path.join(task_cfg.work_dir, run_time)
 
     outputs = OutputsStructure(outputs_dir=task_cfg.work_dir)
 
@@ -98,6 +97,8 @@ def get_backend_manager_class(eval_backend: EvalBackend):
 
 
 def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
     """Evaluate the model based on the provided task configuration."""
+    from evalscope.models import get_local_model
+
     # Initialize evaluator
     eval_results = {}
     base_model = get_local_model(task_cfg)
@@ -117,10 +118,13 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
     return eval_results
 
 
-def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsStructure, base_model: LocalModel):
+def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsStructure, base_model: 'LocalModel'):
     """Create an evaluator object for the specified dataset."""
+    from evalscope.benchmarks import Benchmark, BenchmarkMeta
+    from evalscope.evaluator import Evaluator
+    from evalscope.models import initialize_model_adapter
 
-    if dataset_name ==
+    if dataset_name == DataCollection.NAME:
         # EvaluatorCollection is a collection of evaluators
         from evalscope.collections import EvaluatorCollection
         return EvaluatorCollection(task_cfg, outputs)
@@ -143,6 +147,7 @@ def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsSt
 
 
 def main():
+    from evalscope.arguments import parse_args
     args = parse_args()
     run_task(args)
 
evalscope/summarizer.py
CHANGED
@@ -6,7 +6,7 @@ from typing import List, Union
 
 from evalscope.config import TaskConfig, parse_task_config
 from evalscope.constants import EvalBackend
-from evalscope.
+from evalscope.report import gen_table
 from evalscope.utils import csv_to_list, get_latest_folder_path
 from evalscope.utils.io_utils import OutputsStructure, json_to_dict, yaml_to_dict
 from evalscope.utils.logger import get_logger
evalscope/utils/chat_service.py
CHANGED
@@ -5,7 +5,6 @@ from contextlib import contextmanager
 from functools import partial
 from pydantic import BaseModel, Field
 from threading import Thread
-from transformers import TextIteratorStreamer
 from typing import Any, List, Literal, Optional, Union
 
 
@@ -96,6 +95,7 @@ class ChatService:
 
     def __init__(self, model_path, attn_implementation):
        from modelscope import AutoModelForCausalLM, AutoTokenizer
+        from transformers import TextIteratorStreamer
 
         self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
         self.model = AutoModelForCausalLM.from_pretrained(
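Together with the `run.py` changes above, this moves heavy third-party imports (here `TextIteratorStreamer` from transformers) from module-import time to first use, which keeps lightweight entry points such as the new `evalscope app` command quick to start. A generic sketch of the pattern (illustrative only, not evalscope code; all names are hypothetical):

```python
# Illustrative deferred-import pattern (assumes transformers is installed).
class StreamingChat:

    def __init__(self, model_path: str):
        # transformers is only imported when an instance is created,
        # so importing this module stays cheap.
        from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForCausalLM.from_pretrained(model_path)
        self.streamer = TextIteratorStreamer(self.tokenizer)
```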
evalscope/utils/logger.py
CHANGED
@@ -17,6 +17,7 @@ logging.basicConfig(format=simple_format, level=DEFAULT_LEVEL)
 # disable datasets logging
 logging.getLogger('datasets').setLevel(logging.WARNING)
 logging.getLogger('modelscope').setLevel(logging.WARNING)
+logging.getLogger('httpx').setLevel(logging.WARNING)
 
 
 def get_logger(log_file: Optional[str] = None, log_level: int = DEFAULT_LEVEL, file_mode: str = 'w', force=False):
evalscope/utils/model_utils.py
CHANGED
@@ -1,5 +1,8 @@
 from enum import Enum
-from
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from transformers import GenerationConfig
 
 
 class EvalBackend(Enum):
@@ -11,7 +14,7 @@ class EvalBackend(Enum):
     THIRD_PARTY = 'ThirdParty'
 
 
-def fix_do_sample_warning(generation_config: GenerationConfig) -> None:
+def fix_do_sample_warning(generation_config: 'GenerationConfig') -> None:
     # Use the default values of temperature/top_p/top_k in generation_config.
     if generation_config.temperature == 0:
         generation_config.do_sample = False
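Since `GenerationConfig` is now imported only under `TYPE_CHECKING`, the annotation is a plain string at runtime while type checkers still resolve it. A hedged usage sketch of the helper (assumes transformers is installed; not part of the diff):

```python
# Hypothetical usage sketch; assumes transformers is installed.
from transformers import GenerationConfig

from evalscope.utils.model_utils import fix_do_sample_warning

gen_cfg = GenerationConfig(temperature=0.0, do_sample=True)
fix_do_sample_warning(gen_cfg)
# temperature == 0 implies greedy decoding, so sampling is switched off
# to silence the transformers do_sample warning.
assert gen_cfg.do_sample is False
```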
evalscope/version.py
CHANGED
{evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.9.0
+Version: 0.10.0
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -92,6 +92,11 @@ Requires-Dist: numpy; extra == "all"
 Requires-Dist: sse-starlette; extra == "all"
 Requires-Dist: transformers; extra == "all"
 Requires-Dist: unicorn; extra == "all"
+Requires-Dist: gradio>=5.4.0; extra == "all"
+Requires-Dist: plotly>=5.23.0; extra == "all"
+Provides-Extra: app
+Requires-Dist: gradio>=5.4.0; extra == "app"
+Requires-Dist: plotly>=5.23.0; extra == "app"
 Provides-Extra: inner
 Requires-Dist: absl-py; extra == "inner"
 Requires-Dist: accelerate; extra == "inner"
@@ -210,6 +215,8 @@ Please scan the QR code below to join our community groups:
 
 
 ## 🎉 News
+- 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visulization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
+- 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
 - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
 - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
 - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
@@ -374,15 +381,85 @@ run_task(task_cfg="config.json")
 - `--limit`: Maximum amount of evaluation data for each dataset. If not specified, it defaults to evaluating all data. Can be used for quick validation
 
 ### Output Results
+```text
++-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
+| Model Name            | Dataset Name   | Metric Name     | Category Name   | Subset Name   |   Num |   Score |
++=======================+================+=================+=================+===============+=======+=========+
+| Qwen2.5-0.5B-Instruct | gsm8k          | AverageAccuracy | default         | main          |     5 |     0.4 |
++-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
+| Qwen2.5-0.5B-Instruct | ai2_arc        | AverageAccuracy | default         | ARC-Easy      |     5 |     0.8 |
++-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
+| Qwen2.5-0.5B-Instruct | ai2_arc        | AverageAccuracy | default         | ARC-Challenge |     5 |     0.4 |
++-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
+```
+
+## 📈 Visualization of Evaluation Results
+
+1. Install the dependencies required for visualization, including gradio, plotly, etc.
+```bash
+pip install 'evalscope[app]'
 ```
-
-
-
-
-
+
+2. Start the Visualization Service
+
+Run the following command to start the visualization service.
+```bash
+evalscope app
+```
+You can access the visualization service in the browser if the following output appears.
+```text
+* Running on local URL:  http://127.0.0.1:7861
+
+To create a public link, set `share=True` in `launch()`.
 ```
 
-
+<table>
+<tr>
+<td style="text-align: center;">
+<img src="docs/zh/get_started/images/setting.png" alt="Setting" style="width: 100%;" />
+<p>Setting Interface</p>
+</td>
+<td style="text-align: center;">
+<img src="docs/zh/get_started/images/model_compare.png" alt="Model Compare" style="width: 100%;" />
+<p>Model Comparison</p>
+</td>
+</tr>
+<tr>
+<td style="text-align: center;">
+<img src="docs/zh/get_started/images/report_overview.png" alt="Report Overview" style="width: 100%;" />
+<p>Report Overview</p>
+</td>
+<td style="text-align: center;">
+<img src="docs/zh/get_started/images/report_details.png" alt="Report Details" style="width: 100%;" />
+<p>Report Details</p>
+</td>
+</tr>
+</table>
+
+For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visulization.html)
+
+## 🌐 Evaluation of Specified Model API
+
+Specify the model API service address (api_url) and API Key (api_key) to evaluate the deployed model API service. In this case, the `eval-type` parameter must be specified as `service`, for example:
+
+For example, to launch a model service using [vLLM](https://github.com/vllm-project/vllm):
+
+```shell
+export VLLM_USE_MODELSCOPE=True && python -m vllm.entrypoints.openai.api_server --model Qwen/Qwen2.5-0.5B-Instruct --served-model-name qwen2.5 --trust_remote_code --port 8801
+```
+Then, you can use the following command to evaluate the model API service:
+```shell
+evalscope eval \
+ --model qwen2.5 \
+ --api-url http://127.0.0.1:8801/v1/chat/completions \
+ --api-key EMPTY \
+ --eval-type service \
+ --datasets gsm8k \
+ --limit 10
+```
+
+## ⚙️ Custom Parameter Evaluation
+
 For more customized evaluations, such as customizing model parameters or dataset parameters, you can use the following command. The evaluation startup method is the same as simple evaluation. Below shows how to start the evaluation using the `eval` command:
 
 ```shell