evalscope 0.16.1__py3-none-any.whl → 0.16.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic.
- evalscope/app/app.py +20 -5
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +23 -11
- evalscope/backend/rag_eval/utils/embedding.py +2 -4
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +1 -0
- evalscope/benchmarks/aime/aime24_adapter.py +3 -1
- evalscope/benchmarks/aime/aime25_adapter.py +3 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +5 -0
- evalscope/benchmarks/arc/arc_adapter.py +3 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +7 -3
- evalscope/benchmarks/bbh/bbh_adapter.py +3 -0
- evalscope/benchmarks/benchmark.py +1 -0
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/bfcl_adapter.py +237 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +3 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +4 -1
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +3 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -0
- evalscope/benchmarks/data_adapter.py +2 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +1 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +1 -0
- evalscope/benchmarks/drop/drop_adapter.py +3 -0
- evalscope/benchmarks/frames/frames_adapter.py +1 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +19 -23
- evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +3 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +3 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +3 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +3 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +3 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +3 -0
- evalscope/benchmarks/musr/musr_adapter.py +3 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +15 -8
- evalscope/benchmarks/needle_haystack/utils.py +2 -2
- evalscope/benchmarks/process_bench/process_bench_adapter.py +3 -0
- evalscope/benchmarks/race/race_adapter.py +3 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +3 -0
- evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +1 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +21 -3
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +5 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +3 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +3 -0
- evalscope/collections/evaluator.py +50 -28
- evalscope/constants.py +1 -1
- evalscope/evaluator/evaluator.py +6 -5
- evalscope/metrics/t2v_metrics/__init__.py +9 -23
- evalscope/models/adapters/__init__.py +2 -0
- evalscope/models/adapters/base_adapter.py +29 -27
- evalscope/models/adapters/bfcl_adapter.py +244 -0
- evalscope/models/adapters/server_adapter.py +78 -17
- evalscope/models/custom/custom_model.py +0 -3
- evalscope/models/custom/dummy_model.py +77 -39
- evalscope/models/local_model.py +1 -1
- evalscope/models/register.py +2 -1
- evalscope/perf/arguments.py +2 -0
- evalscope/perf/benchmark.py +16 -3
- evalscope/perf/plugin/api/openai_api.py +2 -0
- evalscope/report/combinator.py +38 -12
- evalscope/report/utils.py +24 -1
- evalscope/run.py +1 -1
- evalscope/summarizer.py +1 -1
- evalscope/utils/io_utils.py +59 -2
- evalscope/version.py +2 -2
- {evalscope-0.16.1.dist-info → evalscope-0.16.3.dist-info}/METADATA +4 -3
- {evalscope-0.16.1.dist-info → evalscope-0.16.3.dist-info}/RECORD +82 -79
- tests/aigc/test_t2i.py +8 -8
- tests/cli/test_all.py +40 -33
- tests/cli/test_collection.py +4 -3
- tests/cli/test_run.py +36 -21
- tests/rag/test_clip_benchmark.py +5 -1
- tests/rag/test_mteb.py +46 -2
- {evalscope-0.16.1.dist-info → evalscope-0.16.3.dist-info}/LICENSE +0 -0
- {evalscope-0.16.1.dist-info → evalscope-0.16.3.dist-info}/WHEEL +0 -0
- {evalscope-0.16.1.dist-info → evalscope-0.16.3.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.1.dist-info → evalscope-0.16.3.dist-info}/top_level.txt +0 -0
evalscope/app/app.py
CHANGED
@@ -1,6 +1,7 @@
 import argparse
 import glob
 import gradio as gr
+import json
 import numpy as np
 import os
 import pandas as pd
@@ -135,11 +136,11 @@ def plot_single_report_scores(df: pd.DataFrame):
 
 def plot_single_report_sunburst(report_list: List[Report]):
     if report_list[0].name == DataCollection.NAME:
-        df = get_data_frame(report_list)
+        df = get_data_frame(report_list=report_list)
         categories = sorted([i for i in df.columns if i.startswith(ReportKey.category_prefix)])
         path = categories + [ReportKey.subset_name]
     else:
-        df = get_data_frame(report_list, flatten_metrics=False)
+        df = get_data_frame(report_list=report_list, flatten_metrics=False)
         categories = sorted([i for i in df.columns if i.startswith(ReportKey.category_prefix)])
         path = [ReportKey.dataset_name] + categories + [ReportKey.subset_name]
     logger.debug(f'df: {df}')
@@ -233,7 +234,7 @@ def convert_html_tags(text):
 def process_string(string: str, max_length: int = 2048) -> str:
     string = convert_html_tags(string)  # for display labels e.g.
     if max_length and len(string) > max_length:
-        return f'{string[:max_length // 2]}
+        return f'{string[:max_length // 2]}...[truncate]...{string[-max_length // 2:]}'
     return string
 
 
@@ -257,7 +258,7 @@ def dict_to_markdown(data) -> str:
     return '\n\n'.join(markdown_lines)
 
 
-def
+def process_model_prediction_old(item: Any, max_length: int = 2048) -> str:
     """
     Process model prediction output into a formatted string.
 
@@ -281,6 +282,20 @@ def process_model_prediction(item: Any, max_length: int = 2048) -> str:
     return result
 
 
+def process_model_prediction(item: Any, max_length: int = 4096) -> str:
+    if isinstance(item, (dict, list)):
+        result = json.dumps(item, ensure_ascii=False, indent=2)
+        result = f'```json\n{result}\n```'
+    else:
+        result = str(item)
+
+    # Apply HTML tag conversion and truncation only at the final output
+    if max_length is not None:
+        return process_string(result, max_length)
+
+    return result
+
+
 def normalize_score(score):
     try:
         if isinstance(score, bool):
@@ -583,7 +598,7 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
         outputs=[dataset_plot, dataset_table, subset_select, data_review_df, report_analysis])
     def update_single_report_dataset(dataset_name, report_list):
        logger.debug(f'Updating single report dataset: {dataset_name}')
-        report_df = get_data_frame(report_list)
+        report_df = get_data_frame(report_list=report_list)
        analysis = get_report_analysis(report_list, dataset_name)
        data_score_df, styler = get_single_dataset_df(report_df, dataset_name)
        data_score_plot = plot_single_dataset_scores(data_score_df)
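Note: the new `process_model_prediction` above pretty-prints dict/list predictions as a fenced JSON block before truncating. A minimal standalone sketch of that behavior (the helper names below are illustrative; the real function also goes through `process_string`, which additionally converts HTML tags):

import json
from typing import Any


def truncate(text: str, max_length: int = 4096) -> str:
    # Same truncation rule as process_string in the diff above
    if max_length and len(text) > max_length:
        return f'{text[:max_length // 2]}...[truncate]...{text[-max_length // 2:]}'
    return text


def render_prediction(item: Any, max_length: int = 4096) -> str:
    # dict/list predictions become a pretty-printed JSON block, everything else is str()-ed
    if isinstance(item, (dict, list)):
        rendered = '```json\n' + json.dumps(item, ensure_ascii=False, indent=2) + '\n```'
    else:
        rendered = str(item)
    return truncate(rendered, max_length)


print(render_prediction({'answer': 'A', 'score': 0.92}))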
evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py
CHANGED
@@ -1,4 +1,5 @@
 import os
+import posixpath  # For URL path handling
 import torch
 from torch.utils.data import DataLoader
 from torch.utils.data import Dataset as TorchDataset
@@ -186,42 +187,53 @@ def build_wds_dataset(dataset_name, transform, split='test', data_dir='root', ca
 
     Set `cache_dir` to a path to cache the dataset, otherwise, no caching will occur.
     """
+    import requests
     import webdataset as wds
 
     def read_txt(fname):
-        if '://'
-
-
-
-
+        if fname.startswith(('http://', 'https://')):
+            try:
+                response = requests.get(fname)
+                response.raise_for_status()  # Ensure the HTTP request was successful
+                return response.text
+            except requests.exceptions.RequestException as e:
+                raise FileNotFoundError(f'Failed to read {fname}: {e}')
         else:
             with open(fname, 'r') as file:
-
-
+                return file.read()
+
+    def url_path_join(*parts):
+        """Join URL path parts with forward slashes regardless of platform"""
+        return posixpath.join(*parts)
 
     if not data_dir:
         data_dir = f'https://modelscope.cn/datasets/clip-benchmark/wds_{dataset_name}/resolve/master'
 
     # Git LFS files have a different file path to access the raw data than other files
-
+    is_url = data_dir.startswith(('http://', 'https://'))
+    if is_url and data_dir.startswith('https://modelscope.cn/datasets'):
         *split_url_head, _, url_path = data_dir.split('/', 7)
         url_head = '/'.join(split_url_head)
         metadata_dir = '/'.join([url_head, 'resolve', url_path])
         tardata_dir = '/'.join([url_head, 'resolve', url_path])
     else:
         metadata_dir = tardata_dir = data_dir
+
+    # Use appropriate path joining function based on whether we're dealing with a URL
+    path_join = url_path_join if is_url else os.path.join
+
     # Get number of shards
-    nshards_fname =
+    nshards_fname = path_join(metadata_dir, split, 'nshards.txt')
     nshards = int(read_txt(nshards_fname))  # Do not catch FileNotFound, nshards.txt should be mandatory
 
     # Get dataset type (classification or retrieval)
-    type_fname =
+    type_fname = path_join(metadata_dir, 'dataset_type.txt')
     try:
         dataset_type = read_txt(type_fname).strip().lower()
     except FileNotFoundError:
         dataset_type = 'classification'
 
-    filepattern =
+    filepattern = path_join(tardata_dir, split, '{0..%d}.tar' % (nshards - 1))
     # Load webdataset (support WEBP, PNG, and JPG for now)
     if not cache_dir or not isinstance(cache_dir, str):
         cache_dir = None
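Note: `url_path_join` is needed because `os.path.join` uses the platform separator (a backslash on Windows), which corrupts URLs, while `posixpath.join` always joins with '/'. A small standard-library illustration of the difference (the dataset name in the URL is filled in only for this example):

import ntpath  # Windows-style joining, importable on any platform
import posixpath

base = 'https://modelscope.cn/datasets/clip-benchmark/wds_mnist/resolve/master'

# Windows-style joining inserts backslashes and breaks the URL
print(ntpath.join(base, 'test', 'nshards.txt'))

# POSIX-style joining keeps forward slashes on every platform
print(posixpath.join(base, 'test', 'nshards.txt'))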
evalscope/backend/rag_eval/utils/embedding.py
CHANGED
@@ -172,11 +172,9 @@ class CrossEncoderModel(BaseModel):
             kwargs.pop(key)
         self.encode_kwargs.update(kwargs)
 
-        if len(sentences[0]) ==
+        if len(sentences[0]) == 2:  # Note: For mteb retrieval task
             processed_sentences = []
-            for query, docs
-                if isinstance(docs, dict):
-                    docs = docs['text']
+            for query, docs in sentences:
                 processed_sentences.append((self.prompt + query, docs))
             sentences = processed_sentences
         embeddings = self.model.predict(sentences, **self.encode_kwargs)
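Note: the branch changed above handles the mteb retrieval case, where each input item is a (query, document) pair and the instruction prompt is prepended to the query only. A tiny sketch of that transformation (the prompt text and sample pairs are made up):

prompt = 'Represent this sentence for searching relevant passages: '

sentences = [
    ('what is the capital of France?', 'Paris is the capital and largest city of France.'),
    ('who wrote Hamlet?', 'Hamlet is a tragedy written by William Shakespeare.'),
]

# Mirrors the loop in the diff: prepend the prompt to the query, keep the document unchanged
processed = [(prompt + query, doc) for query, doc in sentences]
print(processed[0])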
evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py
CHANGED
@@ -69,6 +69,7 @@ class EvalMuseAdapter(T2IBaseAdapter):
            if 'FGA_BLIP2Score' in metric_name and '(' in metric_name:  # FGA_BLIP2Score element score
                metrics_prefix = metric_name.split(':')[0]
                category = metric_name.rpartition('(')[-1].split(')')[0]
+                category = category.split('-')[0].lower()  # remove the suffix if exists
                new_items[f'{metrics_prefix}:{category}'].extend(value_list)
            else:
                new_items[metric_name].extend(value_list)
evalscope/benchmarks/aime/aime24_adapter.py
CHANGED
@@ -1,5 +1,4 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import OutputType
 from evalscope.metrics import extract_answer, math_equal, strip_answer_string
 from evalscope.utils.logger import get_logger
 
@@ -11,6 +10,9 @@ logger = get_logger()
 @Benchmark.register(
     name='aime24',
     pretty_name='AIME-2024',
+    tags=['Mathematics'],
+    description=
+    'The AIME 2024 benchmark is based on problems from the American Invitational Mathematics Examination, a prestigious high school mathematics competition. This benchmark tests a model’s ability to solve challenging mathematics problems by generating step-by-step solutions and providing the correct final answer.',  # noqa: E501
     dataset_id='HuggingFaceH4/aime_2024',
     subset_list=['default'],
     metric_list=['AveragePass@1'],
evalscope/benchmarks/aime/aime25_adapter.py
CHANGED
@@ -1,5 +1,4 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import OutputType
 from evalscope.metrics import extract_answer, math_equal, strip_answer_string
 from evalscope.utils.logger import get_logger
 
@@ -11,6 +10,9 @@ logger = get_logger()
 @Benchmark.register(
     name='aime25',
     pretty_name='AIME-2025',
+    tags=['Mathematics'],
+    description=
+    'The AIME 2025 benchmark is based on problems from the American Invitational Mathematics Examination, a prestigious high school mathematics competition. This benchmark tests a model’s ability to solve challenging mathematics problems by generating step-by-step solutions and providing the correct final answer.',
     dataset_id='opencompass/AIME2025',
     subset_list=['AIME2025-I', 'AIME2025-II'],
     metric_list=['AveragePass@1'],
evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py
CHANGED
@@ -47,6 +47,11 @@ Evaluate the models based on the quality and relevance of their outputs, and sel
 @Benchmark.register(
     name='alpaca_eval',
     pretty_name='AlpacaEval2.0',
+    tags=['Instruction-Following', 'Reasoning'],
+    description='Alpaca Eval 2.0 is an enhanced framework for evaluating instruction-following language models, '
+    'featuring an improved auto-annotator, updated baselines, and continuous preference calculation to '
+    'provide more accurate and cost-effective model assessments. '
+    'Currently not support `length-controlled winrate`; the official Judge model is `gpt-4-1106-preview`, while the baseline model is `gpt-4-turbo`.',  # noqa: E501
     dataset_id='AI-ModelScope/alpaca_eval',
     subset_list=['alpaca_eval_gpt4_baseline'],
     metric_list=['winrate'],
evalscope/benchmarks/arc/arc_adapter.py
CHANGED
@@ -17,6 +17,9 @@ logger = get_logger()
 @Benchmark.register(
     name='arc',
     pretty_name='ARC',
+    tags=['Reasoning', 'MCQ'],
+    description=
+    'The ARC (AI2 Reasoning Challenge) benchmark is designed to evaluate the reasoning capabilities of AI models through multiple-choice questions derived from science exams. It includes two subsets: ARC-Easy and ARC-Challenge, which vary in difficulty.',  # noqa: E501
     dataset_id='modelscope/ai2_arc',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/arena_hard/arena_hard_adapter.py
CHANGED
@@ -1,5 +1,3 @@
-import re
-from collections import defaultdict
 from typing import Any, List
 
 from evalscope.benchmarks import Benchmark, DataAdapter
@@ -19,12 +17,18 @@ GRADER_TEMPLATE = "<|User Prompt|>\n{question}\n\n<|The Start of Assistant A's A
 @Benchmark.register(
     name='arena_hard',
     pretty_name='ArenaHard',
+    tags=['Instruction-Following', 'Reasoning'],
+    description=
+    'ArenaHard is a benchmark designed to evaluate the performance of large language models in a competitive setting, '
+    'where models are pitted against each other in a series of tasks to determine their relative strengths and weaknesses. '
+    'It includes a set of challenging tasks that require reasoning, understanding, and generation capabilities. '
+    'Currently not support `style-controlled winrate`; the official Judge model is `gpt-4-1106-preview`, while the baseline model is `gpt-4-0314`.',  # noqa: E501
     dataset_id='AI-ModelScope/arena-hard-auto-v0.1',
     metric_list=['winrate'],
     few_shot_num=0,
     train_split=None,
     eval_split='test')
-class
+class ArenaHardAdapter(DataAdapter):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
evalscope/benchmarks/bbh/bbh_adapter.py
CHANGED
@@ -59,6 +59,9 @@ SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST
 @Benchmark.register(
     name='bbh',
     pretty_name='BBH',
+    tags=['Reasoning'],
+    description=
+    'The BBH (Big Bench Hard) benchmark is a collection of challenging tasks designed to evaluate the reasoning capabilities of AI models. It includes both free-form and multiple-choice tasks, covering a wide range of reasoning skills.',  # noqa: E501
     dataset_id='modelscope/bbh',
     subset_list=SUBSET_LIST,
     metric_list=['AverageAccuracy'],
evalscope/benchmarks/benchmark.py
CHANGED
@@ -29,6 +29,7 @@ class BenchmarkMeta:
     query_template: Optional[str] = None
     pretty_name: Optional[str] = None
     description: Optional[str] = None
+    tags: Optional[List[str]] = field(default_factory=list)
     filters: Optional[OrderedDict] = None
     extra_params: Optional[Dict] = field(default_factory=dict)
 
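Note: `field(default_factory=list)` gives every `BenchmarkMeta` instance its own empty tag list instead of one shared mutable default. A minimal standalone illustration of the pattern (not the real class):

from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class Meta:
    pretty_name: Optional[str] = None
    tags: Optional[List[str]] = field(default_factory=list)


m1, m2 = Meta(), Meta()
m1.tags.append('Reasoning')
print(m1.tags, m2.tags)  # ['Reasoning'] [] -- the default lists are independent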
evalscope/benchmarks/bfcl/__init__.py
File without changes
evalscope/benchmarks/bfcl/bfcl_adapter.py
ADDED
@@ -0,0 +1,237 @@
+import copy
+import importlib
+import json
+import re
+import traceback
+from typing import Any, List
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+SUBJECT_MAPPING = {
+    'simple': 'AST_NON_LIVE',
+    'multiple': 'AST_NON_LIVE',
+    'parallel': 'AST_NON_LIVE',
+    'parallel_multiple': 'AST_NON_LIVE',
+    'java': 'AST_NON_LIVE',
+    'javascript': 'AST_NON_LIVE',
+    'live_simple': 'AST_LIVE',
+    'live_multiple': 'AST_LIVE',
+    'live_parallel': 'AST_LIVE',
+    'live_parallel_multiple': 'AST_LIVE',
+    'irrelevance': 'RELEVANCE',
+    'live_relevance': 'RELEVANCE',
+    'live_irrelevance': 'RELEVANCE',
+    'multi_turn_base': 'MULTI_TURN',
+    'multi_turn_miss_func': 'MULTI_TURN',
+    'multi_turn_miss_param': 'MULTI_TURN',
+    'multi_turn_long_context': 'MULTI_TURN'
+}
+
+
+@Benchmark.register(
+    name='bfcl_v3',
+    pretty_name='BFCL-v3',
+    tags=['Agent'],
+    description=
+    'Berkeley Function Calling Leaderboard (BFCL), the **first comprehensive and executable function call evaluation** '
+    'dedicated to assessing Large Language Models\' (LLMs) ability to invoke functions. Unlike previous evaluations, '
+    'BFCL accounts for various forms of function calls, diverse scenarios, and executability. '
+    'Need to run `pip install bfcl-eval` before evaluating. '
+    '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html)',  # noqa: E501
+    dataset_id='AI-ModelScope/bfcl_v3',
+    subset_list=list(SUBJECT_MAPPING.keys()),
+    model_adapter='bfcl_server',
+    metric_list=['AverageAccuracy'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='train',
+    extra_params={
+        'underscore_to_dot': True,
+        'is_fc_model': True,
+    })
+class BFCLAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        spec = importlib.util.find_spec('bfcl_eval')
+        if spec is None:
+            raise ImportError(
+                '`bfcl_eval` not found, please install it with `pip install bfcl-eval` before evaluating.')
+
+        self.category_map = SUBJECT_MAPPING
+
+        extra_params = kwargs.get('extra_params', {})
+        self.underscore_to_dot = extra_params.get('underscore_to_dot', False)
+        self.is_fc_model = extra_params.get('is_fc_model', True)
+
+    def load(self, **kwargs):
+        kwargs['subset_list'] = ['default']
+        data_dict = super().load(**kwargs)
+        return self.reformat_subset(data_dict, subset_key='subset', format='{}')
+
+    def preprocess_row(self, row: dict):
+        """
+        Inplace preprocess the row to ensure it has the correct format for BFCL evaluation.
+        """
+        row['should_execute_tool_calls'] = True if row['multi_turn'] else False
+        row['functions'] = json.loads(row['functions'])
+        row['tools'] = json.loads(row['tools'])
+        row['turns'] = json.loads(row['turns'])
+        row['missing_functions'] = json.loads(row['missed_functions'])
+        row['ground_truth'] = json.loads(row.get('ground_truth', '{}'))
+        row['initial_config'] = json.loads(row['initial_config'])
+        row['is_fc_model'] = self.is_fc_model
+
+    def gen_prompt(self, input_d, subset_name, few_shot_list, **kwargs):
+        self.preprocess_row(input_d)
+
+        # If the model is a function calling model, we need to remove the system prompt
+        if self.is_fc_model:
+            turns = input_d['turns']
+            new_turns = []
+            for turn_idx, messages in enumerate(turns):
+                current_messages = messages.copy()
+                if len(current_messages) > 0 and current_messages[0]['role'] == 'system':
+                    current_messages = current_messages[1:]
+                new_turns.append(current_messages)
+            input_d['turns'] = new_turns
+
+        return self.gen_prompt_data(prompt='', messages=input_d)
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        # Get the gold choice
+        return input_d.get('ground_truth', )
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> dict:
+        row = copy.deepcopy(raw_input_d)
+        del row['turns']  # Remove turns as they are not needed for the match function
+
+        row['generation'] = result
+        return row
+
+    def match(self, gold: dict, pred: dict) -> dict:
+        from bfcl_eval.eval_checker.ast_eval.ast_checker import ast_checker
+        from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_checker import multi_turn_checker
+        from bfcl_eval.model_handler.utils import (convert_to_function_call, default_decode_ast_prompting,
+                                                   default_decode_execute_prompting)
+        from bfcl_eval.utils import is_empty_output
+
+        # NOTE: This is hardcoded dummy model since its only use is to infer underscore_to_dot
+        # which decides if model was provided with functions of the type
+        # spotify.list_songs or spotify_list_songs
+        # It is False for all llama models (when using via prompting)
+        # and True for API calls
+        if self.underscore_to_dot:
+            dummy_model = 'gpt-4o-2024-11-20-FC'
+        else:
+            dummy_model = 'meta-llama/Llama-3.3-70B-Instruct-FC'
+
+        row = pred
+        test_category = re.sub(r'_[0-9_-]+$', '', row['id'])
+        if test_category in {'irrelevance', 'live_irrelevance', 'live_relevance'}:
+            error = None
+            try:
+                if self.is_fc_model:
+                    decoded_tool_calls = []
+                    for tool_call in row['generation'][0]:
+                        name = list(tool_call.keys())[0]
+                        params = json.loads(tool_call[name])
+                        decoded_tool_calls.append({name: params})
+                else:
+                    decoded_tool_calls = default_decode_ast_prompting(row['generation'][0][0], row['language'])
+
+                # successful decode means valid function call was present
+                contains_func_call = True
+                if is_empty_output(decoded_tool_calls):
+                    # Empty output is not considered as a valid function call
+                    contains_func_call = False
+                    error = 'Empty decoded output.'
+            except Exception:
+                contains_func_call = False
+                error = f'Failed to decode with traceback: {traceback.format_exc()}'
+            finally:
+                valid = contains_func_call if test_category == 'live_relevance' else not contains_func_call
+                score_result = {'valid': valid, 'error_message': error}
+
+        elif row['multi_turn']:
+            # each step might give a list of tool calls and each turn is multi-step
+            # and multi-turn has generations of all the turns
+            # hence in a multi-turn setting,
+            # multi_turn_decoded_generations is a list of list of list of strings
+            multi_turn_decoded_generations: list[list[list[str]]] = []
+            for single_turn_generations in row['generation']:
+                single_turn_decoded_generations: list[list[str]] = []
+                for generation in single_turn_generations:
+                    try:
+                        if self.is_fc_model:
+                            tool_calls = convert_to_function_call(generation)
+                        else:
+                            tool_calls = default_decode_execute_prompting(generation)
+
+                        single_turn_decoded_generations.append(tool_calls)
+                    except Exception:
+                        single_turn_decoded_generations.append([generation])
+
+                multi_turn_decoded_generations.append(single_turn_decoded_generations)
+
+            try:
+                raw_score_result = multi_turn_checker(
+                    multi_turn_decoded_generations,
+                    row['ground_truth'],
+                    row,
+                    test_category,
+                    dummy_model,
+                )
+            except Exception:
+                raw_score_result = {
+                    'valid': False,
+                    'error_type': 'multi_turn:checker_failed',
+                    'error_message': f'Failed to grade multi-turn. Traceback: {traceback.format_exc()}',
+                }
+
+            score_result = {
+                'valid': float(raw_score_result['valid']),
+                'error_message': raw_score_result.get('error_message', ''),
+                'error_type': raw_score_result.get('error_type', ''),
+            }
+        else:
+            try:
+                if self.is_fc_model:
+                    decoded_tool_calls = []
+                    for tool_call in row['generation'][0]:
+                        name = list(tool_call.keys())[0]
+                        params = json.loads(tool_call[name])
+                        decoded_tool_calls.append({name: params})
+                else:
+                    decoded_tool_calls = default_decode_ast_prompting(row['generation'][0][0], row['language'])
+
+                score_result = ast_checker(
+                    row['functions'],
+                    decoded_tool_calls,
+                    row['ground_truth'],
+                    row['language'],
+                    row['test_category'],
+                    dummy_model,
+                )
+            except Exception:
+                score_result = {
+                    'valid': False,
+                    'error_message': f'Invalid syntax. Failed to decode AST. Traceback: {traceback.format_exc()}',
+                    'error_type': 'ast_decoder:decoder_failed',
+                }
+
+        return {
+            'AverageAccuracy': float(score_result['valid']),
+            'raw_score_result': score_result,
+        }
+
+    def compute_metric(self, review_res_list: List[dict], **kwargs) -> Any:
+        # aggregate review results
+        res_dict = super().compute_dict_metric(review_res_list, **kwargs)
+
+        return super().compute_metric(res_dict, **kwargs)
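Note: in `match` above, a sample id is reduced to its test category with `re.sub(r'_[0-9_-]+$', '', row['id'])`, and that category then selects the grading path (relevance check, multi-turn checker, or AST checker) and its score group. A small sketch of that mapping step (the sample ids are made up and the mapping is abbreviated):

import re

# Abbreviated copy of SUBJECT_MAPPING from the adapter above
SUBJECT_MAPPING = {
    'simple': 'AST_NON_LIVE',
    'live_irrelevance': 'RELEVANCE',
    'multi_turn_miss_func': 'MULTI_TURN',
}

for sample_id in ['simple_102', 'live_irrelevance_3-0-17', 'multi_turn_miss_func_12']:
    # Strip the trailing numeric index, e.g. 'multi_turn_miss_func_12' -> 'multi_turn_miss_func'
    test_category = re.sub(r'_[0-9_-]+$', '', sample_id)
    print(sample_id, '->', test_category, '->', SUBJECT_MAPPING[test_category])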
evalscope/benchmarks/ceval/ceval_adapter.py
CHANGED
@@ -126,6 +126,9 @@ SUBJECT_MAPPING = {
 @Benchmark.register(
     name='ceval',
     pretty_name='C-Eval',
+    tags=['Knowledge', 'MCQ', 'Chinese'],
+    description=
+    'C-Eval is a benchmark designed to evaluate the performance of AI models on Chinese exams across various subjects, including STEM, social sciences, and humanities. It consists of multiple-choice questions that test knowledge and reasoning abilities in these areas.',  # noqa: E501
     dataset_id='modelscope/ceval-exam',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py
CHANGED
@@ -87,7 +87,10 @@ SUBSET_LIST = ['中华文化', '人文与社会科学', '工程、技术与应
 
 @Benchmark.register(
     name='chinese_simpleqa',
-    pretty_name='Chinese
+    pretty_name='Chinese-SimpleQA',
+    tags=['Knowledge', 'QA', 'Chinese'],
+    description=
+    "Chinese SimpleQA is a Chinese question-answering dataset designed to evaluate the performance of language models on simple factual questions. It includes a variety of topics and is structured to test the model's ability to understand and generate correct answers in Chinese.",  # noqa: E501
     subset_list=SUBSET_LIST,
     dataset_id='AI-ModelScope/Chinese-SimpleQA',
     metric_list=['is_correct', 'is_incorrect', 'is_not_attempted'],
evalscope/benchmarks/cmmlu/cmmlu_adapter.py
CHANGED
@@ -103,6 +103,9 @@ SUBJECT_MAPPING = {
 @Benchmark.register(
     name='cmmlu',
     pretty_name='C-MMLU',
+    tags=['Knowledge', 'MCQ', 'Chinese'],
+    description=
+    'C-MMLU is a benchmark designed to evaluate the performance of AI models on Chinese language tasks, including reading comprehension, text classification, and more.',
     dataset_id='modelscope/cmmlu',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/competition_math/competition_math_adapter.py
CHANGED
@@ -17,6 +17,9 @@ logger = get_logger()
 @Benchmark.register(
     name='competition_math',
     pretty_name='MATH',
+    tags=['Mathematics'],
+    description=
+    'The MATH (Mathematics) benchmark is designed to evaluate the mathematical reasoning abilities of AI models through a variety of problem types, including arithmetic, algebra, geometry, and more.',
     dataset_id='modelscope/competition_math',
     subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
     metric_list=['AveragePass@1'],
evalscope/benchmarks/data_adapter.py
CHANGED
@@ -39,6 +39,7 @@ class DataAdapter(ABC):
                  query_template: Optional[str] = None,
                  pretty_name: Optional[str] = None,
                  description: Optional[str] = None,
+                 tags: Optional[List[str]] = None,
                  **kwargs):
        """
        Args:
@@ -76,6 +77,7 @@ class DataAdapter(ABC):
        self.query_template = query_template
        self.pretty_name = pretty_name
        self.description = description
+        self.tags = tags or []
        self.config_kwargs = kwargs
        self.category_map = kwargs.get('category_map', {})
        self.choices = kwargs.get('choices', None)
evalscope/benchmarks/docmath/docmath_adapter.py
CHANGED
@@ -16,6 +16,7 @@ Format your response as follows: "Therefore, the answer is (insert answer here)"
 @Benchmark.register(
     name='docmath',
     pretty_name='DocMath',
+    tags=['Reasoning', 'Mathematics', 'Long Context'],
     description=
     'DocMath-Eval is a comprehensive benchmark focused on numerical reasoning within specialized domains. It requires the model to comprehend long and specialized documents and perform numerical reasoning to answer the given question.',  # noqa: E501
     dataset_id='yale-nlp/DocMath-Eval',
evalscope/benchmarks/drop/drop_adapter.py
CHANGED
@@ -31,6 +31,9 @@ Answer: 43
 @Benchmark.register(
     name='drop',
     pretty_name='DROP',
+    tags=['Reasoning'],
+    description=
+    'The DROP (Discrete Reasoning Over Paragraphs) benchmark is designed to evaluate the reading comprehension and reasoning capabilities of AI models. It includes a variety of tasks that require models to read passages and answer questions based on the content.',  # noqa: E501
     dataset_id='AI-ModelScope/DROP',
     metric_list=['AverageAccuracy'],
     few_shot_num=0,
evalscope/benchmarks/frames/frames_adapter.py
CHANGED
@@ -16,6 +16,7 @@ Format your response as follows: "Therefore, the answer is (insert answer here)"
 @Benchmark.register(
     name='frames',
     pretty_name='FRAMES',
+    tags=['Reasoning', 'Long Context'],
     description=
     'FRAMES is a comprehensive evaluation dataset designed to test the capabilities of Retrieval-Augmented Generation (RAG) systems across factuality, retrieval accuracy, and reasoning.',  # noqa: E501
     dataset_id='iic/frames',