evalscope 0.10.0__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff compares publicly released versions of the package as they appear in their respective public registries; it is provided for informational purposes only.
- evalscope/arguments.py +1 -0
- evalscope/benchmarks/aime24/__init__.py +0 -0
- evalscope/benchmarks/aime24/aime24_adapter.py +49 -0
- evalscope/benchmarks/arc/arc_adapter.py +5 -7
- evalscope/benchmarks/bbh/bbh_adapter.py +17 -9
- evalscope/benchmarks/benchmark.py +2 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
- evalscope/benchmarks/competition_math/competition_math_adapter.py +34 -23
- evalscope/benchmarks/data_adapter.py +18 -12
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +129 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -6
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +121 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +8 -13
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
- evalscope/benchmarks/ifeval/ifeval_adapter.py +14 -14
- evalscope/benchmarks/ifeval/instructions.py +3 -4
- evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +49 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +27 -15
- evalscope/benchmarks/race/race_adapter.py +3 -3
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -8
- evalscope/cli/start_app.py +3 -2
- evalscope/collections/evaluator.py +103 -39
- evalscope/collections/sampler.py +2 -1
- evalscope/collections/schema.py +1 -2
- evalscope/config.py +1 -0
- evalscope/evaluator/evaluator.py +78 -64
- evalscope/metrics/math_parser.py +526 -0
- evalscope/metrics/metrics.py +16 -1
- evalscope/metrics/named_metrics.py +31 -7
- evalscope/models/chat_adapter.py +69 -47
- evalscope/models/choice_adapter.py +52 -45
- evalscope/models/custom_adapter.py +2 -2
- evalscope/models/local_model.py +4 -0
- evalscope/models/server_adapter.py +28 -34
- evalscope/report/app.py +298 -96
- evalscope/run.py +10 -7
- evalscope/utils/chat_service.py +2 -2
- evalscope/utils/io_utils.py +1 -1
- evalscope/version.py +2 -2
- {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/METADATA +20 -11
- {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/RECORD +57 -47
- tests/cli/test_run.py +93 -16
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/metrics/math_accuracy.py +0 -200
- {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/LICENSE +0 -0
- {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/WHEEL +0 -0
- {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/top_level.txt +0 -0
evalscope/report/app.py
CHANGED
@@ -1,3 +1,4 @@
+import argparse
 import glob
 import gradio as gr
 import numpy as np
@@ -5,16 +6,20 @@ import os
 import pandas as pd
 import plotly.express as px
 import plotly.graph_objects as go
+import re
 from dataclasses import dataclass
 from typing import Any, List, Union

 from evalscope.constants import DataCollection
 from evalscope.report import Report, ReportKey, get_data_frame, get_report_list
 from evalscope.utils.io_utils import OutputsStructure, yaml_to_dict
-from evalscope.utils.logger import get_logger
+from evalscope.utils.logger import configure_logging, get_logger
+from evalscope.version import __version__

 logger = get_logger()

+PLOTLY_THEME = 'plotly_dark'
+

 def scan_for_report_folders(root_path):
     """Scan for folders containing reports subdirectories"""
@@ -94,24 +99,33 @@ def get_acc_report_df(report_list: List[Report]):
         }
         data_dict.append(item)
     df = pd.DataFrame.from_dict(data_dict, orient='columns')
-
+
+    styler = style_df(df, columns=[ReportKey.score])
+    return df, styler
+
+
+def style_df(df: pd.DataFrame, columns: List[str] = None):
+    # Apply background gradient to the specified columns
+    styler = df.style.background_gradient(subset=columns, cmap='RdYlGn', vmin=0.0, vmax=1.0, axis=0)
+    # Format the dataframe with a precision of 4 decimal places
+    styler.format(precision=4)
+    return styler


 def get_compare_report_df(acc_df: pd.DataFrame):
     df = acc_df.pivot_table(index=ReportKey.model_name, columns=ReportKey.dataset_name, values=ReportKey.score)
     df.reset_index(inplace=True)
-
-    styler
-    return styler
+
+    styler = style_df(df)
+    return df, styler


 def plot_single_report_scores(df: pd.DataFrame):
-    plot = px.bar(
-
-
-
-
-        template='plotly_dark')
+    plot = px.bar(df, x=df[ReportKey.dataset_name], y=df[ReportKey.score], text=df[ReportKey.score])
+
+    width = 0.2 if len(df[ReportKey.dataset_name]) <= 5 else None
+    plot.update_traces(width=width, texttemplate='%{text:.2f}', textposition='outside')
+    plot.update_layout(uniformtext_minsize=12, uniformtext_mode='hide', yaxis=dict(range=[0, 1]), template=PLOTLY_THEME)
     return plot

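The new `style_df` helper centralizes table styling, and the table-producing helpers above now return a `(df, styler)` pair so callers keep the raw DataFrame while Gradio renders the styled view (the dashboard hands the Styler straight to `gr.DataFrame` outputs). A minimal sketch of the same pattern; the sample data here is made up:

    import pandas as pd

    def style_df(df: pd.DataFrame, columns=None):
        # red-to-green gradient over the score columns, clamped to [0, 1]
        styler = df.style.background_gradient(subset=columns, cmap='RdYlGn', vmin=0.0, vmax=1.0, axis=0)
        styler.format(precision=4)  # render scores to 4 decimal places
        return styler

    df = pd.DataFrame({'dataset': ['gsm8k', 'mmlu'], 'score': [0.8123, 0.4567]})
    styler = style_df(df, columns=['score'])
    print(styler.to_html()[:60])  # a pandas Styler renders to HTML behind the scenes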
@@ -126,6 +140,7 @@ def plot_single_report_sunburst(report_list: List[Report]):
     path = [ReportKey.dataset_name] + categories + [ReportKey.subset_name]
     logger.debug(f'df: {df}')
     df[categories] = df[categories].fillna('default')  # NOTE: fillna for empty categories
+
     plot = px.sunburst(
         df,
         path=path,
@@ -133,15 +148,17 @@ def plot_single_report_sunburst(report_list: List[Report]):
         color=ReportKey.score,
         color_continuous_scale='RdYlGn',  # see https://plotly.com/python/builtin-colorscales/
         color_continuous_midpoint=np.average(df[ReportKey.score], weights=df[ReportKey.num]),
-        template=
-        maxdepth=
+        template=PLOTLY_THEME,
+        maxdepth=4)
     plot.update_traces(insidetextorientation='radial')
-    plot.update_layout(margin=dict(t=10, l=10, r=10, b=10), coloraxis=dict(cmin=0, cmax=1))
+    plot.update_layout(margin=dict(t=10, l=10, r=10, b=10), coloraxis=dict(cmin=0, cmax=1), height=600)
     return plot


-def
-
+def get_single_dataset_df(df: pd.DataFrame, dataset_name: str):
+    df = df[df[ReportKey.dataset_name] == dataset_name]
+    styler = style_df(df, columns=[ReportKey.score])
+    return df, styler


 def plot_single_dataset_scores(df: pd.DataFrame):
@@ -151,8 +168,12 @@ def plot_single_dataset_scores(df: pd.DataFrame):
         x=df[ReportKey.metric_name],
         y=df[ReportKey.score],
         color=df[ReportKey.subset_name],
-
+        text=df[ReportKey.score],
         barmode='group')
+
+    width = 0.2 if len(df[ReportKey.subset_name]) <= 5 else None
+    plot.update_traces(width=width, texttemplate='%{text:.2f}', textposition='outside')
+    plot.update_layout(uniformtext_minsize=12, uniformtext_mode='hide', yaxis=dict(range=[0, 1]), template=PLOTLY_THEME)
     return plot

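Both bar-chart helpers now share the same presentation tweaks: a fixed bar width when there are five or fewer categories (so sparse charts do not stretch a single bar across the canvas), score labels placed outside the bars, and the module-wide `PLOTLY_THEME`. Reduced to its core, with made-up sample data:

    import plotly.express as px

    data = {'dataset': ['gsm8k', 'mmlu'], 'score': [0.81, 0.46]}
    plot = px.bar(data, x='dataset', y='score', text='score')
    width = 0.2 if len(data['dataset']) <= 5 else None  # keep bars narrow on sparse charts
    plot.update_traces(width=width, texttemplate='%{text:.2f}', textposition='outside')
    plot.update_layout(yaxis=dict(range=[0, 1]), template='plotly_dark')
    # plot.show()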
@@ -172,7 +193,7 @@ def plot_multi_report_radar(df: pd.DataFrame):
             fill='toself'))

     fig.update_layout(
-        template=
+        template=PLOTLY_THEME,
         polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
         margin=dict(t=20, l=20, r=20, b=20))
     return fig
@@ -198,7 +219,16 @@ def dict_to_markdown(data) -> str:
     return '\n\n'.join(markdown_lines)


+def convert_html_tags(text):
+    # match begin label
+    text = re.sub(r'<(\w+)>', r'[\1]', text)
+    # match end label
+    text = re.sub(r'</(\w+)>', r'[/\1]', text)
+    return text
+
+
 def process_string(string: str, max_length: int = 2048) -> str:
+    string = convert_html_tags(string)  # for display labels e.g. `<think>`
     if len(string) > max_length:
         return f'{string[:max_length // 2]}......{string[-max_length // 2:]}'
     return string
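`convert_html_tags` exists because model output such as `<think>…</think>` looks like HTML to the Markdown-rendering review table and would otherwise be swallowed; rewriting the tags to bracket form keeps them visible. The two regexes behave like this:

    import re

    def convert_html_tags(text):
        text = re.sub(r'<(\w+)>', r'[\1]', text)    # <think>  -> [think]
        text = re.sub(r'</(\w+)>', r'[/\1]', text)  # </think> -> [/think]
        return text

    print(convert_html_tags('<think>2 + 2 = 4</think> The answer is 4.'))
    # [think]2 + 2 = 4[/think] The answer is 4.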
@@ -206,9 +236,11 @@ def process_string(string: str, max_length: int = 2048) -> str:

 def process_model_prediction(item: Any):
     if isinstance(item, dict):
-
+        res = dict_to_markdown(item)
+        return process_string(res)
     elif isinstance(item, list):
-
+        res = '\n'.join([process_model_prediction(item) for item in item])
+        return process_string(res)
     else:
         return process_string(str(item))

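`process_model_prediction` now renders dicts through `dict_to_markdown` and recurses over lists before truncating, so structured predictions display instead of falling back to `str()`. `dict_to_markdown`'s body is outside this hunk, so the inline join below is only a stand-in for it:

    def process_string(s: str, max_length: int = 2048) -> str:
        return s if len(s) <= max_length else f'{s[:max_length // 2]}......{s[-max_length // 2:]}'

    def process_model_prediction(item):
        if isinstance(item, dict):
            # stand-in for dict_to_markdown, which is defined elsewhere in app.py
            return process_string('\n\n'.join(f'**{k}**: {v}' for k, v in item.items()))
        elif isinstance(item, list):
            return process_string('\n'.join(process_model_prediction(x) for x in item))
        return process_string(str(item))

    print(process_model_prediction([{'role': 'assistant'}, 'done']))
    # **role**: assistant
    # done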
@@ -230,23 +262,27 @@ def normalize_score(score):
 def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subset_name: str):
     data_path = os.path.join(work_dir, OutputsStructure.REVIEWS_DIR, model_name)
     subset_name = subset_name.replace('/', '_')  # for collection report
-
+    review_path = os.path.join(data_path, f'{dataset_name}_{subset_name}.jsonl')
+    logger.debug(f'review_path: {review_path}')
+    origin_df = pd.read_json(review_path, lines=True)
+
     ds = []
     for i, item in origin_df.iterrows():
         raw_input = item['raw_input']
-
-
-
-
-
-
-
-
-
-
-
-
-
+        for choice in item['choices']:
+            raw_pred_answer = choice['message']['content']
+            parsed_gold_answer = choice['review']['gold']
+            parsed_pred_answer = choice['review']['pred']
+            score = choice['review']['result']
+            raw_d = {
+                'Input': raw_input,
+                'Generated': raw_pred_answer,
+                'Gold': parsed_gold_answer if parsed_gold_answer != raw_input else '*Same as Input*',
+                'Pred': parsed_pred_answer if parsed_pred_answer != raw_pred_answer else '*Same as Generated*',
+                'Score': score,
+                'NScore': normalize_score(score)
+            }
+            ds.append(raw_d)

     df_subset = pd.DataFrame(ds)
     return df_subset
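The rewritten `get_model_prediction` walks every `choice` of a review record instead of assuming a single answer per sample. Judging only from the keys the loop reads, a line of the `{dataset_name}_{subset_name}.jsonl` review file is shaped roughly like this; the values are illustrative, not taken from a real run:

    review_line = {
        'raw_input': 'What is 2 + 2?',
        'choices': [{
            'message': {'content': 'The answer is 4.'},  # raw model output -> 'Generated'
            'review': {
                'gold': '4',     # parsed gold answer
                'pred': '4',     # parsed predicted answer
                'result': True,  # raw score; normalize_score() maps it into [0, 1]
            },
        }],
    }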
@@ -254,15 +290,18 @@ def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subset_name: str):

 def get_table_data(data_review_df: pd.DataFrame, page: int = 1, rows_per_page: int = 1) -> pd.DataFrame:
     if data_review_df is None:
-        return None
+        return pd.DataFrame(), None

     logger.debug(f'page: {page}, rows_per_page: {rows_per_page}')
     start = (page - 1) * rows_per_page
     end = start + rows_per_page
     df_subset = data_review_df.iloc[start:end].copy()
     df_subset['Input'] = df_subset['Input'].map(process_model_prediction).astype(str)
+    df_subset['Generated'] = df_subset['Generated'].map(process_model_prediction).astype(str)
+    df_subset['Pred'] = df_subset['Pred'].map(process_model_prediction).astype(str)
     df_subset['Score'] = df_subset['Score'].map(process_model_prediction).astype(str)
-
+    styler = style_df(df_subset, columns=['NScore'])
+    return df_subset, styler


 @dataclass
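`get_table_data` also stops returning a bare `None`: an empty DataFrame plus a `None` styler means both of its Gradio outputs always receive something renderable. The paging itself is plain 1-indexed slicing:

    import pandas as pd

    def page_slice(df: pd.DataFrame, page: int = 1, rows_per_page: int = 1) -> pd.DataFrame:
        # page is 1-indexed, matching gr.Number(minimum=1) in the dashboard
        start = (page - 1) * rows_per_page
        return df.iloc[start:start + rows_per_page]

    df = pd.DataFrame({'Score': [1, 0, 1]})
    print(page_slice(df, page=2))  # second single-row page -> the row at index 1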
@@ -272,18 +311,47 @@ class SidebarComponents:
     load_btn: gr.Button


-def create_sidebar():
-
-
-
-
-
+def create_sidebar(outputs_dir: str, lang: str):
+    locale_dict = {
+        'settings': {
+            'zh': '设置',
+            'en': 'Settings'
+        },
+        'report_root_path': {
+            'zh': '报告根路径',
+            'en': 'Report Root Path'
+        },
+        'select_reports': {
+            'zh': '请选择报告',
+            'en': 'Select Reports'
+        },
+        'load_btn': {
+            'zh': '加载并查看',
+            'en': 'Load & View'
+        },
+        'note': {
+            'zh': '请选择报告并点击`加载并查看`来查看数据',
+            'en': 'Please select reports and click `Load & View` to view the data'
+        },
+        'warning': {
+            'zh': '没有找到报告,请检查路径',
+            'en': 'No reports found, please check the path'
+        }
+    }
+
+    gr.Markdown(f'## {locale_dict["settings"][lang]}')
+    root_path = gr.Textbox(
+        label=locale_dict['report_root_path'][lang], value=outputs_dir, placeholder=outputs_dir, lines=1)
+    reports_dropdown = gr.Dropdown(
+        label=locale_dict['select_reports'][lang], choices=[], multiselect=True, interactive=True)
+    load_btn = gr.Button(locale_dict['load_btn'][lang])
+    gr.Markdown(f'### {locale_dict["note"][lang]}')

     @reports_dropdown.focus(inputs=[root_path], outputs=[reports_dropdown])
     def update_dropdown_choices(root_path):
         folders = scan_for_report_folders(root_path)
         if len(folders) == 0:
-            gr.Warning('
+            gr.Warning(locale_dict['warning'][lang], duration=3)
         return gr.update(choices=folders)

     return SidebarComponents(
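Localization follows one pattern throughout: each `create_*` function keeps a local `locale_dict` mapping a UI key to per-language strings, and the `lang` value from the new `--lang` flag selects the column. There is no i18n framework involved, just nested dict lookups:

    locale_dict = {
        'load_btn': {'zh': '加载并查看', 'en': 'Load & View'},
    }
    lang = 'en'  # taken once from args.lang; 'zh' is the CLI default
    print(locale_dict['load_btn'][lang])  # Load & View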
@@ -293,42 +361,132 @@ def create_sidebar():
     )


+@dataclass
+class VisualizationComponents:
+    single_model: gr.Tab
+    multi_model: gr.Tab
+
+
+def create_visualization(sidebar: SidebarComponents, lang: str):
+    locale_dict = {
+        'visualization': {
+            'zh': '可视化',
+            'en': 'Visualization'
+        },
+        'single_model': {
+            'zh': '单模型',
+            'en': 'Single Model'
+        },
+        'multi_model': {
+            'zh': '多模型',
+            'en': 'Multi Model'
+        }
+    }
+    with gr.Column(visible=True):
+        gr.Markdown(f'## {locale_dict["visualization"][lang]}')
+        with gr.Tabs():
+            with gr.Tab(locale_dict['single_model'][lang]):
+                single = create_single_model_tab(sidebar, lang)
+
+            with gr.Tab(locale_dict['multi_model'][lang]):
+                multi = create_multi_model_tab(sidebar, lang)
+    return VisualizationComponents(
+        single_model=single,
+        multi_model=multi,
+    )
+
+
 @dataclass
 class SingleModelComponents:
     report_name: gr.Dropdown


-def create_single_model_tab(sidebar: SidebarComponents):
-
+def create_single_model_tab(sidebar: SidebarComponents, lang: str):
+    locale_dict = {
+        'select_report': {
+            'zh': '选择报告',
+            'en': 'Select Report'
+        },
+        'task_config': {
+            'zh': '任务配置',
+            'en': 'Task Config'
+        },
+        'datasets_overview': {
+            'zh': '数据集概览',
+            'en': 'Datasets Overview'
+        },
+        'dataset_components': {
+            'zh': '数据集组成',
+            'en': 'Dataset Components'
+        },
+        'dataset_scores': {
+            'zh': '数据集分数',
+            'en': 'Dataset Scores'
+        },
+        'dataset_scores_table': {
+            'zh': '数据集分数表',
+            'en': 'Dataset Scores Table'
+        },
+        'dataset_details': {
+            'zh': '数据集详情',
+            'en': 'Dataset Details'
+        },
+        'select_dataset': {
+            'zh': '选择数据集',
+            'en': 'Select Dataset'
+        },
+        'model_prediction': {
+            'zh': '模型预测',
+            'en': 'Model Prediction'
+        },
+        'select_subset': {
+            'zh': '选择子集',
+            'en': 'Select Subset'
+        },
+        'answer_mode': {
+            'zh': '答案模式',
+            'en': 'Answer Mode'
+        },
+        'page': {
+            'zh': '页码',
+            'en': 'Page'
+        }
+    }
+
+    # Update the UI components with localized labels
+    report_name = gr.Dropdown(label=locale_dict['select_report'][lang], choices=[], interactive=True)
     work_dir = gr.State(None)
     model_name = gr.State(None)

-    with gr.Accordion('
+    with gr.Accordion(locale_dict['task_config'][lang], open=False):
         task_config = gr.JSON(value=None)

     report_list = gr.State([])

-    with gr.Tab('
-        gr.Markdown('###
-        sunburst_plot = gr.Plot(value=None, scale=1, label='
-        gr.Markdown('###
-        score_plot = gr.Plot(value=None, scale=1, label='
-        gr.Markdown('###
+    with gr.Tab(locale_dict['datasets_overview'][lang]):
+        gr.Markdown(f'### {locale_dict["dataset_components"][lang]}')
+        sunburst_plot = gr.Plot(value=None, scale=1, label=locale_dict['dataset_components'][lang])
+        gr.Markdown(f'### {locale_dict["dataset_scores"][lang]}')
+        score_plot = gr.Plot(value=None, scale=1, label=locale_dict['dataset_scores'][lang])
+        gr.Markdown(f'### {locale_dict["dataset_scores_table"][lang]}')
         score_table = gr.DataFrame(value=None)

-    with gr.Tab('
-        dataset_radio = gr.Radio(
-
-
-        gr.
+    with gr.Tab(locale_dict['dataset_details'][lang]):
+        dataset_radio = gr.Radio(
+            label=locale_dict['select_dataset'][lang], choices=[], show_label=True, interactive=True)
+        gr.Markdown(f'### {locale_dict["dataset_scores"][lang]}')
+        dataset_plot = gr.Plot(value=None, scale=1, label=locale_dict['dataset_scores'][lang])
+        gr.Markdown(f'### {locale_dict["dataset_scores_table"][lang]}')
         dataset_table = gr.DataFrame(value=None)

-        gr.Markdown('###
-
+        gr.Markdown(f'### {locale_dict["model_prediction"][lang]}')
+        subset_select = gr.Dropdown(
+            label=locale_dict['select_subset'][lang], choices=[], show_label=True, interactive=True)
         with gr.Row():
             answer_mode_radio = gr.Radio(
-                label='
-            page_number = gr.Number(
+                label=locale_dict['answer_mode'][lang], choices=['All', 'Pass', 'Fail'], value='All', interactive=True)
+            page_number = gr.Number(
+                value=1, label=locale_dict['page'][lang], minimum=1, maximum=1, step=1, interactive=True)
         answer_mode_counts = gr.Markdown('', label='Counts')
         data_review_df = gr.State(None)
         filtered_review_df = gr.State(None)
@@ -354,7 +512,7 @@ def create_single_model_tab(sidebar: SidebarComponents):
             'right': '\\]',
             'display': True
         }],
-        max_height=
+        max_height=600)

     @report_name.change(
         inputs=[sidebar.root_path, report_name],
@@ -367,26 +525,28 @@ def create_single_model_tab(sidebar: SidebarComponents):

     @report_list.change(inputs=[report_list], outputs=[score_plot, score_table, sunburst_plot])
     def update_single_report_score(report_list):
-        report_score_df = get_acc_report_df(report_list)
+        report_score_df, styler = get_acc_report_df(report_list)
         report_score_plot = plot_single_report_scores(report_score_df)
         report_sunburst_plot = plot_single_report_sunburst(report_list)
-        return report_score_plot,
+        return report_score_plot, styler, report_sunburst_plot

     @gr.on(
         triggers=[dataset_radio.change, report_list.change],
         inputs=[dataset_radio, report_list],
-        outputs=[dataset_plot, dataset_table,
+        outputs=[dataset_plot, dataset_table, subset_select, data_review_df])
     def update_single_report_dataset(dataset_name, report_list):
         logger.debug(f'Updating single report dataset: {dataset_name}')
         report_df = get_data_frame(report_list)
-        data_score_df =
+        data_score_df, styler = get_single_dataset_df(report_df, dataset_name)
         data_score_plot = plot_single_dataset_scores(data_score_df)
         subsets = data_score_df[ReportKey.subset_name].unique().tolist()
         logger.debug(f'subsets: {subsets}')
-        return data_score_plot,
+        return data_score_plot, styler, gr.update(choices=subsets, value=None), None

-    @
-
+    @gr.on(
+        triggers=[subset_select.change],
+        inputs=[work_dir, model_name, dataset_radio, subset_select],
+        outputs=[data_review_df, page_number])
     def update_single_report_subset(work_dir, model_name, dataset_name, subset_name):
         if not subset_name:
             return gr.skip()
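The subset handler switched from a single component's `.change` decorator to `gr.on`, which binds one callback to several triggers with explicit inputs and outputs. A self-contained miniature of that wiring; the components here are hypothetical stand-ins for the dashboard's:

    import gradio as gr

    with gr.Blocks() as demo:
        subset = gr.Dropdown(choices=['subset_a', 'subset_b'], label='Select Subset')
        page = gr.Number(value=1, minimum=1)
        out = gr.Markdown()

        @gr.on(triggers=[subset.change, page.change], inputs=[subset, page], outputs=[out])
        def refresh(subset_name, page_number):
            if not subset_name:
                return gr.skip()  # leave the output untouched, as the dashboard does
            return f'subset={subset_name}, page={int(page_number)}'

    # demo.launch()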
@@ -424,10 +584,10 @@ def create_single_model_tab(sidebar: SidebarComponents):
         inputs=[filtered_review_df, page_number],
         outputs=[data_review_table])
     def update_table(filtered_df, page_number):
-
-
-
-        return
+        if filtered_df is None:
+            return gr.update(value=None)
+        subset_df, styler = get_table_data(filtered_df, page_number)
+        return styler

     return SingleModelComponents(report_name=report_name)

@@ -437,11 +597,26 @@ class MultiModelComponents:
     multi_report_name: gr.Dropdown


-def create_multi_model_tab(sidebar: SidebarComponents):
-
-
+def create_multi_model_tab(sidebar: SidebarComponents, lang: str):
+    locale_dict = {
+        'select_reports': {
+            'zh': '请选择报告',
+            'en': 'Select Reports'
+        },
+        'model_radar': {
+            'zh': '模型对比雷达',
+            'en': 'Model Comparison Radar'
+        },
+        'model_scores': {
+            'zh': '模型对比分数',
+            'en': 'Model Comparison Scores'
+        }
+    }
+    multi_report_name = gr.Dropdown(
+        label=locale_dict['select_reports'][lang], choices=[], multiselect=True, interactive=True)
+    gr.Markdown(locale_dict['model_radar'][lang])
     radar_plot = gr.Plot(value=None)
-    gr.Markdown('
+    gr.Markdown(locale_dict['model_scores'][lang])
     score_table = gr.DataFrame(value=None)

     @multi_report_name.change(inputs=[sidebar.root_path, multi_report_name], outputs=[radar_plot, score_table])
@@ -449,43 +624,58 @@ def create_multi_model_tab(sidebar: SidebarComponents):
         if not multi_report_name:
             return gr.skip()
         report_list = load_multi_report(root_path, multi_report_name)
-        report_df = get_acc_report_df(report_list)
+        report_df, _ = get_acc_report_df(report_list)
         report_radar_plot = plot_multi_report_radar(report_df)
-
-        return report_radar_plot,
+        _, styler = get_compare_report_df(report_df)
+        return report_radar_plot, styler

     return MultiModelComponents(multi_report_name=multi_report_name)


-def create_app():
+def create_app(args: argparse.Namespace):
+    configure_logging(debug=args.debug)
+    lang = args.lang
+
+    locale_dict = {
+        'title': {
+            'zh': '📈 EvalScope 看板',
+            'en': '📈 Evalscope Dashboard'
+        },
+        'star_beggar': {
+            'zh':
+            '喜欢<a href=\"https://github.com/modelscope/evalscope\" target=\"_blank\">EvalScope</a>就动动手指给我们加个star吧 🥺 ',
+            'en':
+            'If you like <a href=\"https://github.com/modelscope/evalscope\" target=\"_blank\">EvalScope</a>, '
+            'please take a few seconds to star us 🥺 '
+        },
+        'note': {
+            'zh': '请选择报告',
+            'en': 'Please select reports'
+        }
+    }
+
     with gr.Blocks(title='Evalscope Dashboard') as demo:
+        gr.HTML(f'<h1 style="text-align: left;">{locale_dict["title"][lang]} (v{__version__})</h1>')
         with gr.Row():
             with gr.Column(scale=0, min_width=35):
                 toggle_btn = gr.Button('<')
             with gr.Column(scale=1):
-                gr.HTML('<
+                gr.HTML(f'<h3 style="text-align: left;">{locale_dict["star_beggar"][lang]}</h3>')

         with gr.Row():
             with gr.Column(scale=1) as sidebar_column:
                 sidebar_visible = gr.State(True)
-                sidebar = create_sidebar()
+                sidebar = create_sidebar(args.outputs, lang)

             with gr.Column(scale=5):
-
-                with gr.Column(visible=True):
-                    gr.Markdown('## Visualization')
-                    with gr.Tabs():
-                        with gr.Tab('Single Model'):
-                            single = create_single_model_tab(sidebar)
-
-                        with gr.Tab('Multi Model'):
-                            multi = create_multi_model_tab(sidebar)
+                visualization = create_visualization(sidebar, lang)

         @sidebar.load_btn.click(
-            inputs=[sidebar.reports_dropdown],
+            inputs=[sidebar.reports_dropdown],
+            outputs=[visualization.single_model.report_name, visualization.multi_model.multi_report_name])
         def update_displays(reports_dropdown):
             if not reports_dropdown:
-                gr.Warning('
+                gr.Warning(locale_dict['note'][lang], duration=3)
                 return gr.skip()

             return (
@@ -499,8 +689,20 @@ def create_app():
             text = '<' if new_visible else '>'
             return gr.update(visible=new_visible), new_visible, gr.update(value=text)

-    demo.launch()
+    demo.launch(share=args.share, server_name=args.server_name, server_port=args.server_port, debug=args.debug)
+
+
+def add_argument(parser: argparse.ArgumentParser):
+    parser.add_argument('--share', action='store_true', help='Share the app.')
+    parser.add_argument('--server-name', type=str, default='0.0.0.0', help='The server name.')
+    parser.add_argument('--server-port', type=int, default=None, help='The server port.')
+    parser.add_argument('--debug', action='store_true', help='Debug the app.')
+    parser.add_argument('--lang', type=str, default='zh', help='The locale.', choices=['zh', 'en'])
+    parser.add_argument('--outputs', type=str, default='./outputs', help='The outputs dir.')


 if __name__ == '__main__':
-
+    parser = argparse.ArgumentParser()
+    add_argument(parser)
+    args = parser.parse_args()
+    create_app(args)
evalscope/run.py
CHANGED
@@ -46,11 +46,13 @@ def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:

 def setup_work_directory(task_cfg: TaskConfig, run_time: str):
     """Set the working directory for the task."""
+    # use cache
     if task_cfg.use_cache:
         task_cfg.work_dir = task_cfg.use_cache
         logger.info(f'Set resume from {task_cfg.work_dir}')
     # elif are_paths_same(task_cfg.work_dir, DEFAULT_WORK_DIR):
-
+    else:
+        task_cfg.work_dir = os.path.join(task_cfg.work_dir, run_time)

     outputs = OutputsStructure(outputs_dir=task_cfg.work_dir)
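The `setup_work_directory` change makes run-directory resolution explicit: `use_cache` resumes into an existing directory, and every other run now gets a timestamped subdirectory of `work_dir`. The branch in isolation, with `TaskConfig` stubbed by a namespace:

    import os
    from types import SimpleNamespace

    def resolve_work_dir(task_cfg, run_time: str) -> str:
        if task_cfg.use_cache:
            return task_cfg.use_cache  # resume from a previous run
        return os.path.join(task_cfg.work_dir, run_time)  # fresh timestamped run

    cfg = SimpleNamespace(use_cache=None, work_dir='./outputs')
    print(resolve_work_dir(cfg, '20250101_120000'))  # ./outputs/20250101_120000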
@@ -112,7 +114,7 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
     logger.info(task_cfg)

     for evaluator in evaluators:
-        res_dict = evaluator.eval(
+        res_dict = evaluator.eval()
         eval_results[dataset_name] = res_dict

     return eval_results
@@ -124,21 +126,22 @@ def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsSt
     from evalscope.evaluator import Evaluator
     from evalscope.models import initialize_model_adapter

+    benchmark: BenchmarkMeta = Benchmark.get(dataset_name)
+    # Initialize data adapter
+    data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
+
     if dataset_name == DataCollection.NAME:
         # EvaluatorCollection is a collection of evaluators
         from evalscope.collections import EvaluatorCollection
-        return EvaluatorCollection(task_cfg, outputs)
+        return EvaluatorCollection(task_cfg, data_adapter, outputs)

-
-
-    data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
+    # Initialize model adapter
     model_adapter = initialize_model_adapter(task_cfg, benchmark.model_adapter, base_model)

     # update task_cfg.dataset_args
     task_cfg.dataset_args[dataset_name] = benchmark.to_string_dict()

     return Evaluator(
-        dataset_name_or_path=benchmark.dataset_id,
         data_adapter=data_adapter,
         model_adapter=model_adapter,
         outputs=outputs,