evalscope 0.9.0__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- evalscope/arguments.py +1 -0
- evalscope/benchmarks/arc/arc_adapter.py +3 -5
- evalscope/benchmarks/bbh/bbh_adapter.py +3 -3
- evalscope/benchmarks/benchmark.py +1 -1
- evalscope/benchmarks/ceval/ceval_adapter.py +5 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +5 -79
- evalscope/benchmarks/competition_math/competition_math_adapter.py +4 -4
- evalscope/benchmarks/data_adapter.py +69 -70
- evalscope/benchmarks/general_qa/general_qa_adapter.py +10 -63
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +4 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +12 -6
- evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -4
- evalscope/benchmarks/ifeval/__init__.py +0 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
- evalscope/benchmarks/ifeval/instructions.py +1478 -0
- evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
- evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
- evalscope/benchmarks/ifeval/utils.py +134 -0
- evalscope/benchmarks/iquiz/__init__.py +0 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -84
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +2 -2
- evalscope/benchmarks/race/race_adapter.py +4 -73
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -6
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -57
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +29 -0
- evalscope/collections/evaluator.py +82 -62
- evalscope/collections/sampler.py +47 -41
- evalscope/collections/schema.py +14 -10
- evalscope/constants.py +4 -0
- evalscope/evaluator/evaluator.py +22 -13
- evalscope/metrics/__init__.py +2 -5
- evalscope/metrics/metrics.py +11 -2
- evalscope/metrics/named_metrics.py +17 -0
- evalscope/models/server_adapter.py +11 -4
- evalscope/perf/__init__.py +1 -0
- evalscope/perf/main.py +0 -1
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +1 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/longalpaca.py +1 -1
- evalscope/report/__init__.py +5 -0
- evalscope/report/app.py +506 -0
- evalscope/report/combinator.py +73 -0
- evalscope/report/generator.py +80 -0
- evalscope/report/utils.py +133 -0
- evalscope/run.py +16 -11
- evalscope/summarizer.py +1 -1
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/logger.py +1 -0
- evalscope/utils/model_utils.py +5 -2
- evalscope/version.py +2 -2
- {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/METADATA +84 -7
- {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/RECORD +62 -50
- tests/cli/test_collection.py +11 -7
- tests/cli/test_run.py +13 -4
- evalscope/tools/__init__.py +0 -1
- evalscope/tools/combine_reports.py +0 -133
- evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
- /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
- {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
- {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
- {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
evalscope/perf/plugin/api/custom_api.py

@@ -1,5 +1,4 @@
 import json
-from transformers import AutoTokenizer
 from typing import Any, Dict, Iterator, List
 
 from evalscope.perf.arguments import Arguments
@@ -25,6 +24,7 @@ class CustomPlugin(ApiPluginBase):
         """
         super().__init__(model_path=mode_path)
         if mode_path is not None:
+            from transformers import AutoTokenizer
             self.tokenizer = AutoTokenizer.from_pretrained(mode_path)
         else:
             self.tokenizer = None
evalscope/perf/plugin/api/openai_api.py

@@ -1,6 +1,5 @@
 import json
 import os
-from transformers import AutoTokenizer
 from typing import Any, Dict, Iterator, List, Union
 
 from evalscope.perf.arguments import Arguments
@@ -25,6 +24,7 @@ class OpenaiPlugin(ApiPluginBase):
         """
         super().__init__(model_path=mode_path)
         if mode_path is not None:
+            from transformers import AutoTokenizer
             self.tokenizer = AutoTokenizer.from_pretrained(mode_path)
         else:
             self.tokenizer = None
evalscope/perf/plugin/datasets/flickr8k.py

@@ -1,6 +1,5 @@
 import base64
 from io import BytesIO
-from modelscope.msdatasets import MsDataset
 from PIL import Image
 from typing import Any, Dict, Iterator, List
 
@@ -26,6 +25,7 @@ class FlickrDatasetPlugin(DatasetPluginBase):
         super().__init__(query_parameters)
 
     def build_messages(self) -> Iterator[List[Dict]]:
+        from modelscope.msdatasets import MsDataset
         dataset = MsDataset.load('clip-benchmark/wds_flickr8k', split='test')
 
         for item in dataset:
evalscope/perf/plugin/datasets/longalpaca.py

@@ -1,4 +1,3 @@
-from modelscope import MsDataset
 from typing import Any, Dict, Iterator, List
 
 from evalscope.perf.arguments import Arguments
@@ -17,6 +16,7 @@ class LongAlpacaDatasetPlugin(DatasetPluginBase):
 
     def build_messages(self) -> Iterator[List[Dict]]:
         if not self.query_parameters.dataset_path:
+            from modelscope import MsDataset
             ds = MsDataset.load('AI-ModelScope/LongAlpaca-12k', subset_name='default', split='train')
         else:
             ds = self.dataset_json_list(self.query_parameters.dataset_path)
evalscope/report/app.py
ADDED

@@ -0,0 +1,506 @@
+import glob
+import gradio as gr
+import numpy as np
+import os
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+from dataclasses import dataclass
+from typing import Any, List, Union
+
+from evalscope.constants import DataCollection
+from evalscope.report import Report, ReportKey, get_data_frame, get_report_list
+from evalscope.utils.io_utils import OutputsStructure, yaml_to_dict
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+def scan_for_report_folders(root_path):
+    """Scan for folders containing reports subdirectories"""
+    logger.debug(f'Scanning for report folders in {root_path}')
+    if not os.path.exists(root_path):
+        return []
+
+    reports = []
+    # Iterate over all folders in the root path
+    for folder in glob.glob(os.path.join(root_path, '*')):
+        # Check if reports folder exists
+        reports_path = os.path.join(folder, OutputsStructure.REPORTS_DIR)
+        if not os.path.exists(reports_path):
+            continue
+
+        # Iterate over all items in reports folder
+        for model_item in glob.glob(os.path.join(reports_path, '*')):
+            if not os.path.isdir(model_item):
+                continue
+            datasets = []
+            for dataset_item in glob.glob(os.path.join(model_item, '*.json')):
+                datasets.append(os.path.basename(dataset_item).split('.')[0])
+            datasets = ','.join(datasets)
+            reports.append(f'{os.path.basename(folder)}@{os.path.basename(model_item)}:{datasets}')
+
+    reports = sorted(reports, reverse=True)
+    logger.debug(f'reports: {reports}')
+    return reports
+
+
+def process_report_name(report_name: str):
+    prefix, report_name = report_name.split('@')
+    model_name, datasets = report_name.split(':')
+    datasets = datasets.split(',')
+    return prefix, model_name, datasets
+
+
+def load_single_report(root_path: str, report_name: str):
+    prefix, model_name, datasets = process_report_name(report_name)
+    report_path_list = os.path.join(root_path, prefix, OutputsStructure.REPORTS_DIR, model_name)
+    report_list = get_report_list([report_path_list])
+
+    task_cfg_path = glob.glob(os.path.join(root_path, prefix, OutputsStructure.CONFIGS_DIR, '*.yaml'))[0]
+    task_cfg = yaml_to_dict(task_cfg_path)
+    return report_list, datasets, task_cfg
+
+
+def load_multi_report(root_path: str, report_names: List[str]):
+    report_list = []
+    for report_name in report_names:
+        prefix, model_name, datasets = process_report_name(report_name)
+        report_path_list = os.path.join(root_path, prefix, OutputsStructure.REPORTS_DIR, model_name)
+        reports = get_report_list([report_path_list])
+        report_list.extend(reports)
+    return report_list
+
+
+def get_acc_report_df(report_list: List[Report]):
+    data_dict = []
+    for report in report_list:
+        if report.name == DataCollection.NAME:
+            for metric in report.metrics:
+                for category in metric.categories:
+                    item = {
+                        ReportKey.model_name: report.model_name,
+                        ReportKey.dataset_name: '/'.join(category.name),
+                        ReportKey.score: category.score,
+                        ReportKey.num: category.num,
+                    }
+                    data_dict.append(item)
+        else:
+            item = {
+                ReportKey.model_name: report.model_name,
+                ReportKey.dataset_name: report.dataset_name,
+                ReportKey.score: report.score,
+                ReportKey.num: report.metrics[0].num,
+            }
+            data_dict.append(item)
+    df = pd.DataFrame.from_dict(data_dict, orient='columns')
+    return df
+
+
+def get_compare_report_df(acc_df: pd.DataFrame):
+    df = acc_df.pivot_table(index=ReportKey.model_name, columns=ReportKey.dataset_name, values=ReportKey.score)
+    df.reset_index(inplace=True)
+    styler = df.style.background_gradient(cmap='RdYlGn', vmin=0.0, vmax=1.0, axis=0)
+    styler.format(precision=4)
+    return styler
+
+
+def plot_single_report_scores(df: pd.DataFrame):
+    plot = px.bar(
+        df,
+        x=df[ReportKey.dataset_name],
+        y=df[ReportKey.score],
+        color=df[ReportKey.dataset_name],
+        template='plotly_dark')
+    return plot
+
+
+def plot_single_report_sunburst(report_list: List[Report]):
+    if report_list[0].name == DataCollection.NAME:
+        df = get_data_frame(report_list)
+        categories = sorted([i for i in df.columns if i.startswith(ReportKey.category_prefix)])
+        path = categories + [ReportKey.subset_name]
+    else:
+        df = get_data_frame(report_list, flatten_metrics=False)
+        categories = sorted([i for i in df.columns if i.startswith(ReportKey.category_prefix)])
+        path = [ReportKey.dataset_name] + categories + [ReportKey.subset_name]
+    logger.debug(f'df: {df}')
+    df[categories] = df[categories].fillna('default')  # NOTE: fillna for empty categories
+    plot = px.sunburst(
+        df,
+        path=path,
+        values=ReportKey.num,
+        color=ReportKey.score,
+        color_continuous_scale='RdYlGn',  # see https://plotly.com/python/builtin-colorscales/
+        color_continuous_midpoint=np.average(df[ReportKey.score], weights=df[ReportKey.num]),
+        template='plotly_dark',
+        maxdepth=3)
+    plot.update_traces(insidetextorientation='radial')
+    plot.update_layout(margin=dict(t=10, l=10, r=10, b=10), coloraxis=dict(cmin=0, cmax=1))
+    return plot
+
+
+def get_single_dataset_data(df: pd.DataFrame, dataset_name: str):
+    return df[df[ReportKey.dataset_name] == dataset_name]
+
+
+def plot_single_dataset_scores(df: pd.DataFrame):
+    # TODO: add metric radio and replace category name
+    plot = px.bar(
+        df,
+        x=df[ReportKey.metric_name],
+        y=df[ReportKey.score],
+        color=df[ReportKey.subset_name],
+        template='plotly_dark',
+        barmode='group')
+    return plot
+
+
+def plot_multi_report_radar(df: pd.DataFrame):
+    fig = go.Figure()
+
+    grouped = df.groupby(ReportKey.model_name)
+    common_datasets = set.intersection(*[set(group[ReportKey.dataset_name]) for _, group in grouped])
+
+    for model_name, group in grouped:
+        common_group = group[group[ReportKey.dataset_name].isin(common_datasets)]
+        fig.add_trace(
+            go.Scatterpolar(
+                r=common_group[ReportKey.score],
+                theta=common_group[ReportKey.dataset_name],
+                name=model_name,
+                fill='toself'))
+
+    fig.update_layout(
+        template='plotly_dark',
+        polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
+        margin=dict(t=20, l=20, r=20, b=20))
+    return fig
+
+
+def dict_to_markdown(data) -> str:
+    markdown_lines = []
+
+    for key, value in data.items():
+        bold_key = f'**{key}**'
+
+        if isinstance(value, list):
+            value_str = '\n' + '\n'.join([f' - {item}' for item in value])
+        elif isinstance(value, dict):
+            value_str = dict_to_markdown(value)
+        else:
+            value_str = str(value)
+
+        value_str = process_string(value_str)
+        markdown_line = f'{bold_key}: {value_str}'
+        markdown_lines.append(markdown_line)
+
+    return '\n\n'.join(markdown_lines)
+
+
+def process_string(string: str, max_length: int = 2048) -> str:
+    if len(string) > max_length:
+        return f'{string[:max_length // 2]}......{string[-max_length // 2:]}'
+    return string
+
+
+def process_model_prediction(item: Any):
+    if isinstance(item, dict):
+        return dict_to_markdown(item)
+    elif isinstance(item, list):
+        return '\n'.join([process_model_prediction(item) for item in item])
+    else:
+        return process_string(str(item))
+
+
+def normalize_score(score):
+    if isinstance(score, bool):
+        return 1.0 if score else 0.0
+    elif isinstance(score, dict):
+        for key in score:
+            return float(score[key])
+        return 0.0
+    else:
+        try:
+            return float(score)
+        except (ValueError, TypeError):
+            return 0.0
+
+
+def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subset_name: str):
+    data_path = os.path.join(work_dir, OutputsStructure.REVIEWS_DIR, model_name)
+    subset_name = subset_name.replace('/', '_')  # for collection report
+    origin_df = pd.read_json(os.path.join(data_path, f'{dataset_name}_{subset_name}.jsonl'), lines=True)
+    ds = []
+    for i, item in origin_df.iterrows():
+        raw_input = item['raw_input']
+        raw_pred_answer = item['choices'][0]['message']['content']
+        parsed_gold_answer = item['choices'][0]['review']['gold']
+        parsed_pred_answer = item['choices'][0]['review']['pred']
+        score = item['choices'][0]['review']['result']
+        raw_d = {
+            'Input': raw_input,
+            'Generated': raw_pred_answer,
+            'Gold': parsed_gold_answer if parsed_gold_answer != raw_input else '*Same as Input*',
+            'Pred': parsed_pred_answer if parsed_pred_answer != raw_pred_answer else '*Same as Generated*',
+            'Score': score,
+            'NScore': normalize_score(score)
+        }
+        ds.append(raw_d)
+
+    df_subset = pd.DataFrame(ds)
+    return df_subset
+
+
+def get_table_data(data_review_df: pd.DataFrame, page: int = 1, rows_per_page: int = 1) -> pd.DataFrame:
+    if data_review_df is None:
+        return None
+
+    logger.debug(f'page: {page}, rows_per_page: {rows_per_page}')
+    start = (page - 1) * rows_per_page
+    end = start + rows_per_page
+    df_subset = data_review_df.iloc[start:end].copy()
+    df_subset['Input'] = df_subset['Input'].map(process_model_prediction).astype(str)
+    df_subset['Score'] = df_subset['Score'].map(process_model_prediction).astype(str)
+    return df_subset
+
+
+@dataclass
+class SidebarComponents:
+    root_path: gr.Textbox
+    reports_dropdown: gr.Dropdown
+    load_btn: gr.Button
+
+
+def create_sidebar():
+    gr.Markdown('## Settings')
+    root_path = gr.Textbox(label='Report(s) Root Path', value='./outputs', placeholder='./outputs', lines=1)
+    reports_dropdown = gr.Dropdown(label='Select Report(s)', choices=[], multiselect=True, interactive=True)
+    load_btn = gr.Button('Load & View')
+    gr.Markdown('### Note: Select report(s) and click `Load & View` to view the data!')
+
+    @reports_dropdown.focus(inputs=[root_path], outputs=[reports_dropdown])
+    def update_dropdown_choices(root_path):
+        folders = scan_for_report_folders(root_path)
+        if len(folders) == 0:
+            gr.Warning('No reports found, please check the path', duration=3)
+        return gr.update(choices=folders)
+
+    return SidebarComponents(
+        root_path=root_path,
+        reports_dropdown=reports_dropdown,
+        load_btn=load_btn,
+    )
+
+
+@dataclass
+class SingleModelComponents:
+    report_name: gr.Dropdown
+
+
+def create_single_model_tab(sidebar: SidebarComponents):
+    report_name = gr.Dropdown(label='Select Report', choices=[], interactive=True)
+    work_dir = gr.State(None)
+    model_name = gr.State(None)
+
+    with gr.Accordion('Task Config', open=False):
+        task_config = gr.JSON(value=None)
+
+    report_list = gr.State([])
+
+    with gr.Tab('Datasets Overview'):
+        gr.Markdown('### Dataset Components')
+        sunburst_plot = gr.Plot(value=None, scale=1, label='Components')
+        gr.Markdown('### Dataset Scores')
+        score_plot = gr.Plot(value=None, scale=1, label='Scores')
+        gr.Markdown('### Dataset Scores Table')
+        score_table = gr.DataFrame(value=None)
+
+    with gr.Tab('Dataset Details'):
+        dataset_radio = gr.Radio(label='Select Dataset', choices=[], show_label=True, interactive=True)
+        gr.Markdown('### Dataset Scores')
+        dataset_plot = gr.Plot(value=None, scale=1, label='Scores')
+        gr.Markdown('### Dataset Scores Table')
+        dataset_table = gr.DataFrame(value=None)
+
+        gr.Markdown('### Model Prediction')
+        subset_radio = gr.Radio(label='Select Subset', choices=[], show_label=True, interactive=True)
+        with gr.Row():
+            answer_mode_radio = gr.Radio(
+                label='Answer Mode', choices=['All', 'Pass', 'Fail'], value='All', interactive=True)
+            page_number = gr.Number(value=1, label='Page', minimum=1, maximum=1, step=1, interactive=True)
+            answer_mode_counts = gr.Markdown('', label='Counts')
+        data_review_df = gr.State(None)
+        filtered_review_df = gr.State(None)
+        data_review_table = gr.DataFrame(
+            value=None,
+            datatype=['markdown', 'markdown', 'markdown', 'markdown', 'markdown', 'number'],
+            # column_widths=['500px', '500px'],
+            wrap=True,
+            latex_delimiters=[{
+                'left': '$$',
+                'right': '$$',
+                'display': True
+            }, {
+                'left': '$',
+                'right': '$',
+                'display': False
+            }, {
+                'left': '\\(',
+                'right': '\\)',
+                'display': False
+            }, {
+                'left': '\\[',
+                'right': '\\]',
+                'display': True
+            }],
+            max_height=500)
+
+    @report_name.change(
+        inputs=[sidebar.root_path, report_name],
+        outputs=[report_list, task_config, dataset_radio, work_dir, model_name])
+    def update_single_report_data(root_path, report_name):
+        report_list, datasets, task_cfg = load_single_report(root_path, report_name)
+        work_dir = os.path.join(root_path, report_name.split('@')[0])
+        model_name = report_name.split('@')[1].split(':')[0]
+        return (report_list, task_cfg, gr.update(choices=datasets, value=datasets[0]), work_dir, model_name)
+
+    @report_list.change(inputs=[report_list], outputs=[score_plot, score_table, sunburst_plot])
+    def update_single_report_score(report_list):
+        report_score_df = get_acc_report_df(report_list)
+        report_score_plot = plot_single_report_scores(report_score_df)
+        report_sunburst_plot = plot_single_report_sunburst(report_list)
+        return report_score_plot, report_score_df, report_sunburst_plot
+
+    @gr.on(
+        triggers=[dataset_radio.change, report_list.change],
+        inputs=[dataset_radio, report_list],
+        outputs=[dataset_plot, dataset_table, subset_radio])
+    def update_single_report_dataset(dataset_name, report_list):
+        logger.debug(f'Updating single report dataset: {dataset_name}')
+        report_df = get_data_frame(report_list)
+        data_score_df = get_single_dataset_data(report_df, dataset_name)
+        data_score_plot = plot_single_dataset_scores(data_score_df)
+        subsets = data_score_df[ReportKey.subset_name].unique().tolist()
+        logger.debug(f'subsets: {subsets}')
+        return data_score_plot, data_score_df, gr.update(choices=subsets, value=subsets[0])
+
+    @subset_radio.change(
+        inputs=[work_dir, model_name, dataset_radio, subset_radio], outputs=[data_review_df, page_number])
+    def update_single_report_subset(work_dir, model_name, dataset_name, subset_name):
+        if not subset_name:
+            return gr.skip()
+        data_review_df = get_model_prediction(work_dir, model_name, dataset_name, subset_name)
+        return data_review_df, 1
+
+    @gr.on(
+        triggers=[data_review_df.change, answer_mode_radio.change],
+        inputs=[data_review_df, answer_mode_radio],
+        outputs=[filtered_review_df, page_number, answer_mode_counts])
+    def filter_data(data_review_df, answer_mode):
+        if data_review_df is None:
+            return None, gr.update(value=1, maximum=1), ''
+
+        all_count = len(data_review_df)
+        pass_df = data_review_df[data_review_df['NScore'] >= 0.99]
+        pass_count = len(pass_df)
+        fail_count = all_count - pass_count
+
+        counts_text = f'### All: {all_count} | Pass: {pass_count} | Fail: {fail_count}'
+
+        if answer_mode == 'Pass':
+            filtered_df = pass_df
+        elif answer_mode == 'Fail':
+            filtered_df = data_review_df[data_review_df['NScore'] < 0.99]
+        else:
+            filtered_df = data_review_df
+
+        max_page = max(1, len(filtered_df))
+
+        return (filtered_df, gr.update(value=1, maximum=max_page), counts_text)
+
+    @gr.on(
+        triggers=[filtered_review_df.change, page_number.change],
+        inputs=[filtered_review_df, page_number],
+        outputs=[data_review_table])
+    def update_table(filtered_df, page_number):
+        subset_df = get_table_data(filtered_df, page_number)
+        if subset_df is None:
+            return gr.skip()
+        return subset_df
+
+    return SingleModelComponents(report_name=report_name)
+
+
+@dataclass
+class MultiModelComponents:
+    multi_report_name: gr.Dropdown
+
+
+def create_multi_model_tab(sidebar: SidebarComponents):
+    multi_report_name = gr.Dropdown(label='Select Reports', choices=[], multiselect=True, interactive=True)
+    gr.Markdown('### Model Radar')
+    radar_plot = gr.Plot(value=None)
+    gr.Markdown('### Model Scores')
+    score_table = gr.DataFrame(value=None)
+
+    @multi_report_name.change(inputs=[sidebar.root_path, multi_report_name], outputs=[radar_plot, score_table])
+    def update_multi_report_data(root_path, multi_report_name):
+        if not multi_report_name:
+            return gr.skip()
+        report_list = load_multi_report(root_path, multi_report_name)
+        report_df = get_acc_report_df(report_list)
+        report_radar_plot = plot_multi_report_radar(report_df)
+        report_compare_df = get_compare_report_df(report_df)
+        return report_radar_plot, report_compare_df
+
+    return MultiModelComponents(multi_report_name=multi_report_name)
+
+
+def create_app():
+    with gr.Blocks(title='Evalscope Dashboard') as demo:
+        with gr.Row():
+            with gr.Column(scale=0, min_width=35):
+                toggle_btn = gr.Button('<')
+            with gr.Column(scale=1):
+                gr.HTML('<h1 style="text-align: left;">Evalscope Dashboard</h1>')  # text column
+
+        with gr.Row():
+            with gr.Column(scale=1) as sidebar_column:
+                sidebar_visible = gr.State(True)
+                sidebar = create_sidebar()
+
+            with gr.Column(scale=5):
+
+                with gr.Column(visible=True):
+                    gr.Markdown('## Visualization')
+                    with gr.Tabs():
+                        with gr.Tab('Single Model'):
+                            single = create_single_model_tab(sidebar)
+
+                        with gr.Tab('Multi Model'):
+                            multi = create_multi_model_tab(sidebar)
+
+        @sidebar.load_btn.click(
+            inputs=[sidebar.reports_dropdown], outputs=[single.report_name, multi.multi_report_name])
+        def update_displays(reports_dropdown):
+            if not reports_dropdown:
+                gr.Warning('No reports found, please check the path', duration=3)
+                return gr.skip()
+
+            return (
+                gr.update(choices=reports_dropdown, value=reports_dropdown[0]),  # update single model dropdown
+                gr.update(choices=reports_dropdown, value=reports_dropdown)  # update multi model dropdown
+            )
+
+        @toggle_btn.click(inputs=[sidebar_visible], outputs=[sidebar_column, sidebar_visible, toggle_btn])
+        def toggle_sidebar(visible):
+            new_visible = not visible
+            text = '<' if new_visible else '>'
+            return gr.update(visible=new_visible), new_visible, gr.update(value=text)
+
+    demo.launch()


+if __name__ == '__main__':
+    create_app()
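
The new dashboard module can also be started directly from Python. A minimal usage sketch, assuming an evaluation run has already written an `./outputs` directory; only `create_app` and the `./outputs` default come from the file above, the directory layout noted in the comment is illustrative:

    from evalscope.report.app import create_app

    if __name__ == '__main__':
        # Launches the Gradio dashboard; in the sidebar, set "Report(s) Root Path"
        # to an outputs directory (e.g. ./outputs) containing <run>/reports/<model>/*.json.
        create_app()
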
evalscope/report/combinator.py
ADDED

@@ -0,0 +1,73 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import glob
+import os
+import pandas as pd
+from tabulate import tabulate
+from typing import List, Tuple
+
+from evalscope.report.utils import Report
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+"""
+Combine and generate table for reports of LLMs.
+"""
+
+
+def get_report_list(reports_path_list: List[str]) -> List[Report]:
+    report_list: List[Report] = []
+    # Iterate over each report path
+    for report_path in reports_path_list:
+        model_report_dir = os.path.normpath(report_path)
+        report_files = glob.glob(os.path.join(model_report_dir, '**', '*.json'), recursive=True)
+        # Iterate over each report file
+        for file_path in report_files:
+            try:
+                report = Report.from_json(file_path)
+                report_list.append(report)
+            except Exception as e:
+                logger.error(f'Error loading report from {file_path}: {e}')
+    report_list = sorted(report_list, key=lambda x: (x.model_name, x.dataset_name))
+    return report_list
+
+
+def get_data_frame(report_list: List[Report],
+                   flatten_metrics: bool = True,
+                   flatten_categories: bool = True) -> pd.DataFrame:
+    tables = []
+    for report in report_list:
+        df = report.to_dataframe(flatten_metrics=flatten_metrics, flatten_categories=flatten_categories)
+        tables.append(df)
+    return pd.concat(tables, ignore_index=True)
+
+
+def gen_table(reports_path_list: list) -> str:
+    report_list = get_report_list(reports_path_list)
+    table = get_data_frame(report_list)
+    return tabulate(table, headers=table.columns, tablefmt='grid', showindex=False)
+
+
+class ReportsRecorder:
+    COMMON_DATASET_PATH = []
+    CUSTOM_DATASET_PATH = []
+
+    def __init__(self, oss_url: str = '', endpoint: str = ''):
+        pass
+
+
+if __name__ == '__main__':
+    report_dir_1 = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250117_151926'
+    # report_dir_2 = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250107_204445/reports'
+
+    report_table = gen_table([report_dir_1])
+    print(report_table)
+
+    # ALL VALUES ONLY FOR EXAMPLE
+    # +--------------------------+-------------------+-------------+
+    # | Model                    | CompetitionMath   | GSM8K       |
+    # +==========================+===================+=============+
+    # | ZhipuAI_chatglm2-6b-base | 25.0 (acc)        | 30.50 (acc) |
+    # +--------------------------+-------------------+-------------+
+    # | ZhipuAI_chatglm2-6b      | 30.5 (acc)        | 40.50 (acc) |
+    # +--------------------------+-------------------+-------------+