evalscope 0.16.3__py3-none-any.whl → 0.17.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/app/app.py +9 -762
- evalscope/app/constants.py +1 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +52 -0
- evalscope/app/ui/multi_model.py +323 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +202 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +178 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +91 -0
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/backend_manager.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
- evalscope/benchmarks/__init__.py +15 -1
- evalscope/benchmarks/aime/aime24_adapter.py +2 -1
- evalscope/benchmarks/aime/aime25_adapter.py +2 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/utils.py +0 -12
- evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
- evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
- evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
- evalscope/benchmarks/data_adapter.py +29 -9
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
- evalscope/benchmarks/general_arena/utils.py +226 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +44 -30
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +118 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
- evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
- evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
- evalscope/benchmarks/mmlu/mmlu_adapter.py +2 -2
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/benchmarks/race/race_adapter.py +1 -1
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
- evalscope/benchmarks/utils.py +2 -2
- evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
- evalscope/config.py +8 -123
- evalscope/constants.py +5 -21
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +20 -15
- evalscope/metrics/__init__.py +9 -1
- evalscope/{utils/utils.py → metrics/completion_parsers.py} +71 -176
- evalscope/metrics/llm_judge.py +106 -20
- evalscope/metrics/metrics.py +20 -8
- evalscope/models/__init__.py +4 -8
- evalscope/models/adapters/__init__.py +4 -9
- evalscope/models/adapters/base_adapter.py +4 -0
- evalscope/models/adapters/bfcl_adapter.py +2 -0
- evalscope/models/adapters/chat_adapter.py +3 -0
- evalscope/models/adapters/choice_adapter.py +4 -0
- evalscope/models/adapters/custom_adapter.py +7 -3
- evalscope/models/adapters/server_adapter.py +4 -2
- evalscope/models/adapters/t2i_adapter.py +3 -0
- evalscope/models/adapters/tau_bench_adapter.py +189 -0
- evalscope/models/custom/dummy_model.py +3 -3
- evalscope/models/register.py +0 -14
- evalscope/perf/arguments.py +15 -16
- evalscope/perf/benchmark.py +38 -39
- evalscope/perf/http_client.py +30 -86
- evalscope/perf/main.py +3 -3
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +22 -4
- evalscope/perf/plugin/api/custom_api.py +212 -55
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +105 -0
- evalscope/perf/plugin/api/openai_api.py +17 -19
- evalscope/perf/plugin/datasets/__init__.py +10 -7
- evalscope/perf/plugin/datasets/base.py +22 -1
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +4 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +2 -1
- evalscope/perf/plugin/datasets/random_dataset.py +15 -4
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +14 -20
- evalscope/perf/utils/db_util.py +79 -61
- evalscope/report/__init__.py +1 -1
- evalscope/report/utils.py +34 -15
- evalscope/run.py +1 -1
- evalscope/summarizer.py +1 -2
- evalscope/utils/__init__.py +63 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/import_utils.py +16 -0
- evalscope/utils/io_utils.py +55 -4
- evalscope/utils/model_utils.py +37 -1
- evalscope/version.py +2 -2
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/METADATA +100 -51
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/RECORD +129 -133
- tests/aigc/test_t2i.py +1 -1
- tests/cli/test_all.py +68 -4
- tests/cli/test_collection.py +1 -1
- tests/cli/test_custom.py +261 -0
- tests/cli/test_run.py +34 -70
- tests/perf/test_perf.py +31 -4
- tests/rag/test_clip_benchmark.py +2 -1
- tests/rag/test_mteb.py +3 -1
- tests/rag/test_ragas.py +3 -1
- tests/swift/test_run_swift_eval.py +2 -1
- tests/swift/test_run_swift_vlm_eval.py +2 -1
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
- tests/utils.py +13 -0
- tests/vlm/test_vlmeval.py +8 -2
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/models/model.py +0 -189
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- /evalscope/{utils → benchmarks}/filters.py +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/LICENSE +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/WHEEL +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/top_level.txt +0 -0
evalscope/run.py
CHANGED
@@ -9,9 +9,9 @@ from typing import TYPE_CHECKING, List, Optional, Union

 from evalscope.config import TaskConfig, parse_task_config
 from evalscope.constants import DataCollection, EvalBackend
-from evalscope.utils import seed_everything
 from evalscope.utils.io_utils import OutputsStructure
 from evalscope.utils.logger import configure_logging, get_logger
+from evalscope.utils.model_utils import seed_everything

 if TYPE_CHECKING:
     from evalscope.models import LocalModel
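For downstream code, the hunk above only relocates the seeding helper; a minimal sketch of the updated import, assuming evalscope 0.17.1 is installed:

```python
# In 0.16.3 this was `from evalscope.utils import seed_everything`;
# 0.17.1 moves the canonical definition into model_utils.
from evalscope.utils.model_utils import seed_everything

seed_everything(42)  # fix random/numpy/torch seeds before launching an evaluation
```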
evalscope/summarizer.py
CHANGED
@@ -7,8 +7,7 @@ from typing import List, Union
 from evalscope.config import TaskConfig, parse_task_config
 from evalscope.constants import EvalBackend
 from evalscope.report import gen_table
-from evalscope.utils import csv_to_list, get_latest_folder_path
-from evalscope.utils.io_utils import OutputsStructure, json_to_dict, yaml_to_dict
+from evalscope.utils.io_utils import OutputsStructure, csv_to_list, get_latest_folder_path, json_to_dict, yaml_to_dict
 from evalscope.utils.logger import get_logger

 logger = get_logger()
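The summarizer change is the same kind of consolidation: the CSV and run-folder helpers it used now come from `io_utils`. A small usage sketch, assuming the 0.17.1 layout:

```python
# get_latest_folder_path (and csv_to_list) now live in evalscope.utils.io_utils.
from evalscope.utils.io_utils import get_latest_folder_path

latest_run = get_latest_folder_path('outputs')  # newest YYYYMMDD_HHMMSS folder, or None
if latest_run is not None:
    print('Most recent evaluation outputs:', latest_run)
```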
evalscope/utils/__init__.py
CHANGED
@@ -1,4 +1,65 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

-from
-
+from typing import TYPE_CHECKING
+
+from .import_utils import _LazyModule
+
+if TYPE_CHECKING:
+    from .argument_utils import BaseArgument, get_supported_params, parse_int_or_float
+    from .deprecation_utils import deprecated
+    from .import_utils import get_module_path, is_module_installed
+    from .io_utils import (OutputsStructure, csv_to_jsonl, csv_to_list, dict_to_yaml, gen_hash, get_latest_folder_path,
+                           get_valid_list, json_to_dict, jsonl_to_csv, jsonl_to_list, yaml_to_dict)
+    from .logger import configure_logging, get_logger
+    from .model_utils import EvalBackend, dict_torch_dtype_to_str, fix_do_sample_warning, get_device, seed_everything
+
+else:
+    _import_structure = {
+        'argument_utils': [
+            'BaseArgument',
+            'parse_int_or_float',
+            'get_supported_params',
+        ],
+        'model_utils': [
+            'EvalBackend',
+            'get_device',
+            'seed_everything',
+            'dict_torch_dtype_to_str',
+            'fix_do_sample_warning',
+        ],
+        'import_utils': [
+            'is_module_installed',
+            'get_module_path',
+        ],
+        'io_utils': [
+            'OutputsStructure',
+            'csv_to_list',
+            'json_to_dict',
+            'yaml_to_dict',
+            'get_latest_folder_path',
+            'gen_hash',
+            'dict_to_yaml',
+            'csv_to_jsonl',
+            'jsonl_to_csv',
+            'jsonl_to_list',
+            'gen_hash',
+            'get_valid_list',
+        ],
+        'deprecation_utils': [
+            'deprecated',
+        ],
+        'logger': [
+            'get_logger',
+            'configure_logging',
+        ],
+    }
+
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
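The rewritten `__init__.py` defers the actual submodule imports to a `_LazyModule`, so heavy dependencies load only when an attribute is first touched. Assuming the lazy table resolves attributes on access (as the transformers-style `_LazyModule` it mirrors does), existing top-level imports should keep working; a hedged sketch:

```python
# Nothing heavy is imported yet; torch/numpy load only when seed_everything is resolved.
from evalscope.utils import gen_hash, seed_everything

seed_everything(0)
print(gen_hash('example-key'))  # truncated md5 digest from io_utils
```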
evalscope/utils/argument_utils.py
ADDED
@@ -0,0 +1,64 @@
+import json
+from argparse import Namespace
+from inspect import signature
+
+from evalscope.utils.io_utils import json_to_dict, yaml_to_dict
+
+
+class BaseArgument:
+    """
+    BaseArgument is a base class designed to facilitate the creation and manipulation
+    of argument classes in the evalscope framework. It provides utility methods for
+    instantiating objects from various data formats and converting objects back into
+    dictionary representations.
+    """
+
+    @classmethod
+    def from_dict(cls, d: dict):
+        """Instantiate the class from a dictionary."""
+        return cls(**d)
+
+    @classmethod
+    def from_json(cls, json_file: str):
+        """Instantiate the class from a JSON file."""
+        return cls.from_dict(json_to_dict(json_file))
+
+    @classmethod
+    def from_yaml(cls, yaml_file: str):
+        """Instantiate the class from a YAML file."""
+        return cls.from_dict(yaml_to_dict(yaml_file))
+
+    @classmethod
+    def from_args(cls, args: Namespace):
+        """
+        Instantiate the class from an argparse.Namespace object.
+        Filters out None values and removes 'func' if present.
+        """
+        args_dict = {k: v for k, v in vars(args).items() if v is not None}
+
+        if 'func' in args_dict:
+            del args_dict['func']  # Note: compat CLI arguments
+
+        return cls.from_dict(args_dict)
+
+    def to_dict(self):
+        """Convert the instance to a dictionary."""
+        result = self.__dict__.copy()
+        return result
+
+    def __str__(self):
+        """Return a JSON-formatted string representation of the instance."""
+        return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
+
+
+def parse_int_or_float(num):
+    number = float(num)
+    if number.is_integer():
+        return int(number)
+    return number
+
+
+def get_supported_params(func):
+    """Get the supported parameters of a function."""
+    sig = signature(func)
+    return list(sig.parameters.keys())
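A minimal sketch of how the new `BaseArgument` helpers might be used; the `MyEvalArgs` dataclass and its fields are hypothetical, not part of the package:

```python
from dataclasses import dataclass, field
from typing import List

from evalscope.utils.argument_utils import BaseArgument, parse_int_or_float


@dataclass
class MyEvalArgs(BaseArgument):  # hypothetical argument container
    model: str = 'Qwen/Qwen2.5-0.5B-Instruct'
    datasets: List[str] = field(default_factory=lambda: ['gsm8k'])
    limit: int = 5


args = MyEvalArgs.from_dict({'limit': parse_int_or_float('10')})  # '10' -> int 10
print(args)            # BaseArgument.__str__ renders the instance as indented JSON
print(args.to_dict())  # plain dict copy of __dict__, handy for logging
```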
evalscope/utils/import_utils.py
CHANGED
@@ -64,3 +64,19 @@ class _LazyModule(ModuleType):

     def __reduce__(self):
         return self.__class__, (self._name, self.__file__, self._import_structure)
+
+
+def is_module_installed(module_name):
+    try:
+        importlib.import_module(module_name)
+        return True
+    except ImportError:
+        return False
+
+
+def get_module_path(module_name):
+    spec = importlib.util.find_spec(module_name)
+    if spec and spec.origin:
+        return os.path.abspath(spec.origin)
+    else:
+        raise ValueError(f'Cannot find module: {module_name}')
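The two helpers added above are thin wrappers around `importlib`; an illustrative check (the module names are arbitrary examples):

```python
from evalscope.utils.import_utils import get_module_path, is_module_installed

for name in ('yaml', 'some_missing_module'):
    if is_module_installed(name):
        print(name, '->', get_module_path(name))
    else:
        print(name, 'is not installed')
```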
evalscope/utils/io_utils.py
CHANGED
@@ -1,8 +1,13 @@
+import base64
 import csv
+import hashlib
 import json
 import jsonlines as jsonl
 import os
+import re
 import yaml
+from io import BytesIO
+from PIL import Image

 from evalscope.constants import DumpMode
 from evalscope.utils.logger import get_logger
@@ -221,7 +226,53 @@ def dict_to_json(d: dict, json_file: str):
         json.dump(d, f, indent=4, ensure_ascii=False)


-
-
-
-
+def get_latest_folder_path(work_dir):
+    from datetime import datetime
+
+    # Get all subdirectories in the work_dir
+    folders = [f for f in os.listdir(work_dir) if os.path.isdir(os.path.join(work_dir, f))]
+
+    # Get the timestamp (YYYYMMDD_HHMMSS)
+    timestamp_pattern = re.compile(r'^\d{8}_\d{6}$')
+
+    # Filter out the folders
+    timestamped_folders = [f for f in folders if timestamp_pattern.match(f)]
+
+    if not timestamped_folders:
+        print(f'>> No timestamped folders found in {work_dir}!')
+        return None
+
+    # timestamp parser
+    def parse_timestamp(folder_name):
+        return datetime.strptime(folder_name, '%Y%m%d_%H%M%S')
+
+    # Find the latest folder
+    latest_folder = max(timestamped_folders, key=parse_timestamp)
+
+    return os.path.join(work_dir, latest_folder)
+
+
+def gen_hash(name: str, bits: int = 32):
+    return hashlib.md5(name.encode(encoding='UTF-8')).hexdigest()[:bits]
+
+
+def get_valid_list(input_list, candidate_list):
+    """
+    Get the valid and invalid list from input_list based on candidate_list.
+    Args:
+        input_list: The input list.
+        candidate_list: The candidate list.
+
+    Returns:
+        valid_list: The valid list.
+        invalid_list: The invalid list.
+    """
+    return [i for i in input_list if i in candidate_list], \
+        [i for i in input_list if i not in candidate_list]
+
+
+def PIL_to_base64(image: Image.Image, format: str = 'JPEG') -> str:
+    buffered = BytesIO()
+    image.save(buffered, format=format)
+    img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
+    return img_str
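An illustrative sketch of the new `io_utils` helpers (inputs are made up): `get_valid_list` splits requested names into supported and unsupported ones, `gen_hash` returns a truncated md5 hex digest, and `PIL_to_base64` encodes an image for embedding in JSON payloads.

```python
from PIL import Image

from evalscope.utils.io_utils import PIL_to_base64, gen_hash, get_valid_list

valid, invalid = get_valid_list(['gsm8k', 'arc', 'not_a_dataset'], ['gsm8k', 'arc', 'mmlu'])
print(valid, invalid)              # ['gsm8k', 'arc'] ['not_a_dataset']

print(gen_hash('gsm8k#prompt-0'))  # 32-character md5 hex digest

image = Image.new('RGB', (8, 8), 'white')
print(PIL_to_base64(image)[:24] + '...')  # base64-encoded JPEG bytes
```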
evalscope/utils/model_utils.py
CHANGED
@@ -1,6 +1,9 @@
+import numpy as np
 import os
+import random
+import torch
 from enum import Enum
-from typing import TYPE_CHECKING, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union

 if TYPE_CHECKING:
     from transformers import GenerationConfig
@@ -38,3 +41,36 @@ def get_device() -> str:
         device = 'cpu'

     return device
+
+
+def dict_torch_dtype_to_str(d: Dict[str, Any]) -> dict:
+    """
+    Checks whether the passed dictionary and its nested dicts have a *torch_dtype* key and if it's not None,
+    converts torch.dtype to a string of just the type. For example, `torch.float32` get converted into *"float32"*
+    string, which can then be stored in the json format.
+
+    Refer to: https://github.com/huggingface/transformers/pull/16065/files for details.
+    """
+    if d.get('torch_dtype', None) is not None and not isinstance(d['torch_dtype'], str):
+        d['torch_dtype'] = str(d['torch_dtype']).split('.')[1]
+
+    for value in d.values():
+        if isinstance(value, dict):
+            dict_torch_dtype_to_str(value)
+
+    return d
+
+
+def seed_everything(seed: int):
+    """Set all random seeds to a fixed value for reproducibility.
+
+    Args:
+        seed (int): The seed value.
+    """
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)
+        torch.backends.cudnn.deterministic = True
+        torch.backends.cudnn.benchmark = False
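A short sketch of the two additions (values are illustrative): `seed_everything` seeds Python, NumPy and PyTorch RNGs in one call, and `dict_torch_dtype_to_str` makes a config dict containing `torch.dtype` values JSON-serializable.

```python
import torch

from evalscope.utils.model_utils import dict_torch_dtype_to_str, seed_everything

seed_everything(42)  # also seeds CUDA and switches cuDNN to deterministic mode on GPU machines

cfg = {'max_new_tokens': 512, 'torch_dtype': torch.float32,
       'sub_config': {'torch_dtype': torch.bfloat16}}
print(dict_torch_dtype_to_str(cfg))
# {'max_new_tokens': 512, 'torch_dtype': 'float32', 'sub_config': {'torch_dtype': 'bfloat16'}}
```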
evalscope/version.py
CHANGED

{evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/METADATA
CHANGED
@@ -1,30 +1,31 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.16.3
+Version: 0.17.1
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
 Author-email: contact@modelscope.cn
+License: Apache License 2.0
 Keywords: python,llm,evaluation
 Classifier: Development Status :: 4 - Beta
-Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
-
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: accelerate
-Requires-Dist: datasets
+Requires-Dist: datasets==3.2.0
 Requires-Dist: immutabledict
 Requires-Dist: jieba
 Requires-Dist: jsonlines
 Requires-Dist: langdetect
 Requires-Dist: latex2sympy2-extended
 Requires-Dist: matplotlib
-Requires-Dist: modelscope[framework]
+Requires-Dist: modelscope[framework]>=1.27
 Requires-Dist: nltk>=3.9
 Requires-Dist: openai
 Requires-Dist: pandas
@@ -52,14 +53,14 @@ Requires-Dist: opencv-python; extra == "aigc"
 Requires-Dist: torchvision; extra == "aigc"
 Provides-Extra: all
 Requires-Dist: accelerate; extra == "all"
-Requires-Dist: datasets; extra == "all"
+Requires-Dist: datasets==3.2.0; extra == "all"
 Requires-Dist: immutabledict; extra == "all"
 Requires-Dist: jieba; extra == "all"
 Requires-Dist: jsonlines; extra == "all"
 Requires-Dist: langdetect; extra == "all"
 Requires-Dist: latex2sympy2-extended; extra == "all"
 Requires-Dist: matplotlib; extra == "all"
-Requires-Dist: modelscope[framework]; extra == "all"
+Requires-Dist: modelscope[framework]>=1.27; extra == "all"
 Requires-Dist: nltk>=3.9; extra == "all"
 Requires-Dist: openai; extra == "all"
 Requires-Dist: pandas; extra == "all"
@@ -102,9 +103,27 @@ Requires-Dist: omegaconf; extra == "all"
 Requires-Dist: open-clip-torch; extra == "all"
 Requires-Dist: opencv-python; extra == "all"
 Requires-Dist: torchvision; extra == "all"
+Requires-Dist: bfcl-eval; extra == "all"
+Requires-Dist: human-eval; extra == "all"
+Requires-Dist: pytest; extra == "all"
+Requires-Dist: pytest-cov; extra == "all"
+Requires-Dist: python-dotenv; extra == "all"
 Provides-Extra: app
 Requires-Dist: gradio==5.4.0; extra == "app"
 Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
+Provides-Extra: dev
+Requires-Dist: bfcl-eval; extra == "dev"
+Requires-Dist: human-eval; extra == "dev"
+Requires-Dist: pytest; extra == "dev"
+Requires-Dist: pytest-cov; extra == "dev"
+Requires-Dist: python-dotenv; extra == "dev"
+Provides-Extra: docs
+Requires-Dist: docutils>=0.16.0; extra == "docs"
+Requires-Dist: myst-parser; extra == "docs"
+Requires-Dist: recommonmark; extra == "docs"
+Requires-Dist: sphinx>=5.3.0; extra == "docs"
+Requires-Dist: sphinx-design; extra == "docs"
+Requires-Dist: sphinxawesome-theme; extra == "docs"
 Provides-Extra: opencompass
 Requires-Dist: ms-opencompass>=0.1.6; extra == "opencompass"
 Provides-Extra: perf
@@ -165,16 +184,17 @@ Requires-Dist: ms-vlmeval>=0.0.17; extra == "vlmeval"
 - [Basic Parameter](#basic-parameter)
 - [Output Results](#output-results)
 - [π Visualization of Evaluation Results](#-visualization-of-evaluation-results)
-- [π Evaluation of
+- [π Evaluation of Model API](#-evaluation-of-model-api)
 - [βοΈ Custom Parameter Evaluation](#οΈ-custom-parameter-evaluation)
-- [Parameter](#parameter)
-- [Evaluation
+- [Parameter Description](#parameter-description)
+- [π§ͺ Other Evaluation Backends](#-other-evaluation-backends)
 - [π Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
 - [ποΈ Custom Dataset Evaluation](#οΈ-custom-dataset-evaluation)
-- [
+- [βοΈ Arena Mode](#οΈ-arena-mode)
 - [π·ββοΈ Contribution](#οΈ-contribution)
+- [π Citation](#-citation)
 - [π Roadmap](#-roadmap)
-- [Star History](
+- [β Star History](#-star-history)


 ## π Introduction
@@ -198,24 +218,33 @@ EvalScope is not merely an evaluation tool; it is a valuable ally in your model
 Below is the overall architecture diagram of EvalScope:

 <p align="center">
-<img src="
+<img src="https://sail-moe.oss-cn-hangzhou.aliyuncs.com/yunlin/images/evalscope/doc/EvalScope%E6%9E%B6%E6%9E%84%E5%9B%BE.png" width="70%">
 <br>EvalScope Framework.
 </p>

 <details><summary>Framework Description</summary>

 The architecture includes the following modules:
-1.
-
-
-
-
-
-
-
-
-
-
+1. Input Layer
+   - **Model Sources**: API models (OpenAI API), local models (ModelScope)
+   - **Datasets**: Standard evaluation benchmarks (MMLU/GSM8k, etc.), custom data (MCQ/QA)
+
+2. Core Functions
+   - **Multi-backend Evaluation**
+     - Native backends: Unified evaluation for LLM/VLM/Embedding/T2I models
+     - Integrated frameworks: OpenCompass/MTEB/VLMEvalKit/RAGAS
+
+   - **Performance Monitoring**
+     - Model plugins: Supports various model service APIs
+     - Data plugins: Supports multiple data formats
+     - Metric tracking: TTFT/TPOP/Stability and other metrics
+
+   - **Tool Extensions**
+     - Integration: Tool-Bench/Needle-in-a-Haystack/BFCL-v3
+
+3. Output Layer
+   - **Structured Reports**: Supports JSON/Tables/Logs
+   - **Visualization Platforms**: Supports Gradio/Wandb/SwanLab

 </details>

@@ -229,8 +258,12 @@ Please scan the QR code below to join our community groups:


 ## π News
-
-- π₯ **[2025.
+- π₯ **[2025.07.18]** The model stress testing now supports randomly generating image-text data for multimodal model evaluation. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#id4).
+- π₯ **[2025.07.16]** Support for [τ-bench](https://github.com/sierra-research/tau-bench) has been added, enabling the evaluation of AI Agent performance and reliability in real-world scenarios involving dynamic user and tool interactions. For usage instructions, please refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset/llm.html#bench).
+- π₯ **[2025.07.14]** Support for "Humanity's Last Exam" ([Humanity's-Last-Exam](https://modelscope.cn/datasets/cais/hle)), a highly challenging evaluation benchmark. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset/llm.html#humanity-s-last-exam).
+- π₯ **[2025.07.03]** Refactored Arena Mode: now supports custom model battles, outputs a model leaderboard, and provides battle result visualization. See [reference](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html) for details.
+- π₯ **[2025.06.28]** Optimized custom dataset evaluation: now supports evaluation without reference answers. Enhanced LLM judge usage, with built-in modes for "scoring directly without reference answers" and "checking answer consistency with reference answers". See [reference](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/llm.html#qa) for details.
+- π₯ **[2025.06.19]** Added support for the [BFCL-v3](https://modelscope.cn/datasets/AI-ModelScope/bfcl_v3) benchmark, designed to evaluate model function-calling capabilities across various scenarios. For more information, refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html).
 - π₯ **[2025.06.02]** Added support for the Needle-in-a-Haystack test. Simply specify `needle_haystack` to conduct the test, and a corresponding heatmap will be generated in the `outputs/reports` folder, providing a visual representation of the model's performance. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/needle_haystack.html) for more details.
 - π₯ **[2025.05.29]** Added support for two long document evaluation benchmarks: [DocMath](https://modelscope.cn/datasets/yale-nlp/DocMath-Eval/summary) and [FRAMES](https://modelscope.cn/datasets/iic/frames/summary). For usage guidelines, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html).
 - π₯ **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
@@ -239,6 +272,8 @@ Please scan the QR code below to join our community groups:
 - π₯ **[2025.04.27]** Support for text-to-image evaluation: Supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse, GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
 - π₯ **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
 - π₯ **[2025.04.08]** Support for evaluating embedding model services compatible with the OpenAI API has been added. For more details, check the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html#configure-evaluation-parameters).
+<details><summary>More</summary>
+
 - π₯ **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
 - π₯ **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
 - π₯ **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
@@ -252,8 +287,6 @@ Please scan the QR code below to join our community groups:
 - π₯ **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets, refer to [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
 - π₯ **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [π Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
 - π₯ **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [π Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
-<details><summary>More</summary>
-
 - π₯π₯ **[2024.12.31]** Support for adding benchmark evaluations, refer to the [π Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [π Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
 - π₯ **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [π User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
 - π₯ **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [π User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
@@ -345,33 +378,31 @@ evalscope eval \

 When using Python code for evaluation, you need to submit the evaluation task using the `run_task` function, passing a `TaskConfig` as a parameter. It can also be a Python dictionary, yaml file path, or json file path, for example:

-**Using
+**Using `TaskConfig`**

 ```python
-from evalscope
+from evalscope import run_task, TaskConfig

-task_cfg =
-
-
-
-
+task_cfg = TaskConfig(
+    model='Qwen/Qwen2.5-0.5B-Instruct',
+    datasets=['gsm8k', 'arc'],
+    limit=5
+)

 run_task(task_cfg=task_cfg)
 ```
-
 <details><summary>More Startup Methods</summary>

-**Using
+**Using Python Dictionary**

 ```python
 from evalscope.run import run_task
-from evalscope.config import TaskConfig

-task_cfg =
-model
-datasets
-limit
-
+task_cfg = {
+    'model': 'Qwen/Qwen2.5-0.5B-Instruct',
+    'datasets': ['gsm8k', 'arc'],
+    'limit': 5
+}

 run_task(task_cfg=task_cfg)
 ```
@@ -474,7 +505,7 @@ To create a public link, set `share=True` in `launch()`.

 For more details, refer to: [π Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html)

-## π Evaluation of
+## π Evaluation of Model API

 Specify the model API service address (api_url) and API Key (api_key) to evaluate the deployed model API service. In this case, the `eval-type` parameter must be specified as `service`, for example:

@@ -525,7 +556,7 @@ evalscope eval \
 Reference: [Full Parameter Description](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html)


-## Evaluation
+## π§ͺ Other Evaluation Backends
 EvalScope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call Evaluation Backend. Currently supported Evaluation Backend includes:
 - **Native**: EvalScope's own **default evaluation framework**, supporting various evaluation modes including single model evaluation, arena mode, and baseline model comparison mode.
 - [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework ms-swift. [π User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/opencompass_backend.html)
@@ -572,10 +603,17 @@ Speed Benchmark Results:
 EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [πUser Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/index.html)


-##
-The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report.
+## βοΈ Arena Mode

-Refer to: Arena Mode [π User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html)
+Arena mode allows you to configure multiple candidate models and specify a baseline model. Evaluation is performed by pairwise battles between each candidate model and the baseline model, with the final output including each model's win rate and ranking. This method is suitable for comparative evaluation among multiple models, providing an intuitive reflection of each model's strengths and weaknesses. Refer to: Arena Mode [π User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html)
+
+```text
+Model         WinRate (%)  CI (%)
+------------  -----------  ---------------
+qwen2.5-72b   69.3         (-13.3 / +12.2)
+qwen2.5-7b    50           (+0.0 / +0.0)
+qwen2.5-0.5b  4.7          (-2.5 / +4.4)
+```


 ## π·ββοΈ Contribution
@@ -591,6 +629,17 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
 </table>
 </a>

+## π Citation
+
+```bibtex
+@misc{evalscope_2024,
+    title={{EvalScope}: Evaluation Framework for Large Models},
+    author={ModelScope Team},
+    year={2024},
+    url={https://github.com/modelscope/evalscope}
+}
+```
+
 ## π Roadmap
 - [x] Support for better evaluation report visualization
 - [x] Support for mixed evaluations across multiple datasets
@@ -601,11 +650,11 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
 - [ ] Distributed evaluating
 - [x] Multi-modal evaluation
 - [ ] Benchmarks
-- [
+- [x] BFCL-v3
 - [x] GPQA
 - [x] MBPP


-## Star History
+## β Star History

 [](https://star-history.com/#modelscope/evalscope&Date)