evalscope 0.9.0__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. evalscope/arguments.py +1 -0
  2. evalscope/benchmarks/arc/arc_adapter.py +3 -5
  3. evalscope/benchmarks/bbh/bbh_adapter.py +3 -3
  4. evalscope/benchmarks/benchmark.py +1 -1
  5. evalscope/benchmarks/ceval/ceval_adapter.py +5 -82
  6. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +5 -79
  7. evalscope/benchmarks/competition_math/competition_math_adapter.py +4 -4
  8. evalscope/benchmarks/data_adapter.py +69 -70
  9. evalscope/benchmarks/general_qa/general_qa_adapter.py +10 -63
  10. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +4 -5
  11. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +12 -6
  12. evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -4
  13. evalscope/benchmarks/ifeval/__init__.py +0 -0
  14. evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
  15. evalscope/benchmarks/ifeval/instructions.py +1478 -0
  16. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  17. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  18. evalscope/benchmarks/ifeval/utils.py +134 -0
  19. evalscope/benchmarks/iquiz/__init__.py +0 -0
  20. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  21. evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -84
  22. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +2 -2
  23. evalscope/benchmarks/race/race_adapter.py +4 -73
  24. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -6
  25. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -57
  26. evalscope/cli/cli.py +2 -0
  27. evalscope/cli/start_app.py +29 -0
  28. evalscope/collections/evaluator.py +82 -62
  29. evalscope/collections/sampler.py +47 -41
  30. evalscope/collections/schema.py +14 -10
  31. evalscope/constants.py +4 -0
  32. evalscope/evaluator/evaluator.py +22 -13
  33. evalscope/metrics/__init__.py +2 -5
  34. evalscope/metrics/metrics.py +11 -2
  35. evalscope/metrics/named_metrics.py +17 -0
  36. evalscope/models/server_adapter.py +11 -4
  37. evalscope/perf/__init__.py +1 -0
  38. evalscope/perf/main.py +0 -1
  39. evalscope/perf/plugin/api/custom_api.py +1 -1
  40. evalscope/perf/plugin/api/openai_api.py +1 -1
  41. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  42. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  43. evalscope/report/__init__.py +5 -0
  44. evalscope/report/app.py +506 -0
  45. evalscope/report/combinator.py +73 -0
  46. evalscope/report/generator.py +80 -0
  47. evalscope/report/utils.py +133 -0
  48. evalscope/run.py +16 -11
  49. evalscope/summarizer.py +1 -1
  50. evalscope/utils/chat_service.py +1 -1
  51. evalscope/utils/logger.py +1 -0
  52. evalscope/utils/model_utils.py +5 -2
  53. evalscope/version.py +2 -2
  54. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/METADATA +84 -7
  55. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/RECORD +62 -50
  56. tests/cli/test_collection.py +11 -7
  57. tests/cli/test_run.py +13 -4
  58. evalscope/tools/__init__.py +0 -1
  59. evalscope/tools/combine_reports.py +0 -133
  60. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  61. /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
  62. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
  63. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
  64. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
  65. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
evalscope/perf/plugin/api/custom_api.py
@@ -1,5 +1,4 @@
  import json
- from transformers import AutoTokenizer
  from typing import Any, Dict, Iterator, List

  from evalscope.perf.arguments import Arguments
@@ -25,6 +24,7 @@ class CustomPlugin(ApiPluginBase):
          """
          super().__init__(model_path=mode_path)
          if mode_path is not None:
+             from transformers import AutoTokenizer
              self.tokenizer = AutoTokenizer.from_pretrained(mode_path)
          else:
              self.tokenizer = None
evalscope/perf/plugin/api/openai_api.py
@@ -1,6 +1,5 @@
  import json
  import os
- from transformers import AutoTokenizer
  from typing import Any, Dict, Iterator, List, Union

  from evalscope.perf.arguments import Arguments
@@ -25,6 +24,7 @@ class OpenaiPlugin(ApiPluginBase):
          """
          super().__init__(model_path=mode_path)
          if mode_path is not None:
+             from transformers import AutoTokenizer
              self.tokenizer = AutoTokenizer.from_pretrained(mode_path)
          else:
              self.tokenizer = None
evalscope/perf/plugin/datasets/flickr8k.py
@@ -1,6 +1,5 @@
  import base64
  from io import BytesIO
- from modelscope.msdatasets import MsDataset
  from PIL import Image
  from typing import Any, Dict, Iterator, List

@@ -26,6 +25,7 @@ class FlickrDatasetPlugin(DatasetPluginBase):
          super().__init__(query_parameters)

      def build_messages(self) -> Iterator[List[Dict]]:
+         from modelscope.msdatasets import MsDataset
          dataset = MsDataset.load('clip-benchmark/wds_flickr8k', split='test')

          for item in dataset:
evalscope/perf/plugin/datasets/longalpaca.py
@@ -1,4 +1,3 @@
- from modelscope import MsDataset
  from typing import Any, Dict, Iterator, List

  from evalscope.perf.arguments import Arguments
@@ -17,6 +16,7 @@ class LongAlpacaDatasetPlugin(DatasetPluginBase):

      def build_messages(self) -> Iterator[List[Dict]]:
          if not self.query_parameters.dataset_path:
+             from modelscope import MsDataset
              ds = MsDataset.load('AI-ModelScope/LongAlpaca-12k', subset_name='default', split='train')
          else:
              ds = self.dataset_json_list(self.query_parameters.dataset_path)
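
All four perf plugin changes above follow the same pattern: heavy optional dependencies (transformers for the API plugins, modelscope for the dataset plugins) are no longer imported at module import time but only inside the method that actually needs them, so the evalscope.perf plugins can be imported without those packages installed. A minimal sketch of the pattern, using a hypothetical plugin class rather than the actual evalscope API:

    class LazyTokenizerPlugin:
        """Illustrative only: defer a heavy optional import until first use."""

        def __init__(self, model_path=None):
            if model_path is not None:
                # Imported lazily, so loading this module does not require transformers.
                from transformers import AutoTokenizer
                self.tokenizer = AutoTokenizer.from_pretrained(model_path)
            else:
                self.tokenizer = None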
evalscope/report/__init__.py (new file)
@@ -0,0 +1,5 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ from evalscope.report.combinator import gen_table, get_data_frame, get_report_list
+ from evalscope.report.generator import ReportGenerator
+ from evalscope.report.utils import Category, Report, ReportKey, Subset
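
With this __init__.py, evalscope.report becomes the public entry point for report handling; the helpers themselves live in combinator.py (shown below), generator.py, and utils.py. A minimal usage sketch of the exported helpers, assuming reports were produced by an earlier evalscope run; the output path below is hypothetical:

    from evalscope.report import gen_table, get_report_list

    # Hypothetical reports directory from a previous run.
    reports_dir = './outputs/20250101_000000/reports/my-model'

    print(gen_table([reports_dir]))           # grid-formatted score table
    reports = get_report_list([reports_dir])  # List[Report] for further processing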
evalscope/report/app.py (new file)
@@ -0,0 +1,506 @@
+ import glob
+ import gradio as gr
+ import numpy as np
+ import os
+ import pandas as pd
+ import plotly.express as px
+ import plotly.graph_objects as go
+ from dataclasses import dataclass
+ from typing import Any, List, Union
+
+ from evalscope.constants import DataCollection
+ from evalscope.report import Report, ReportKey, get_data_frame, get_report_list
+ from evalscope.utils.io_utils import OutputsStructure, yaml_to_dict
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ def scan_for_report_folders(root_path):
+     """Scan for folders containing reports subdirectories"""
+     logger.debug(f'Scanning for report folders in {root_path}')
+     if not os.path.exists(root_path):
+         return []
+
+     reports = []
+     # Iterate over all folders in the root path
+     for folder in glob.glob(os.path.join(root_path, '*')):
+         # Check if reports folder exists
+         reports_path = os.path.join(folder, OutputsStructure.REPORTS_DIR)
+         if not os.path.exists(reports_path):
+             continue
+
+         # Iterate over all items in reports folder
+         for model_item in glob.glob(os.path.join(reports_path, '*')):
+             if not os.path.isdir(model_item):
+                 continue
+             datasets = []
+             for dataset_item in glob.glob(os.path.join(model_item, '*.json')):
+                 datasets.append(os.path.basename(dataset_item).split('.')[0])
+             datasets = ','.join(datasets)
+             reports.append(f'{os.path.basename(folder)}@{os.path.basename(model_item)}:{datasets}')
+
+     reports = sorted(reports, reverse=True)
+     logger.debug(f'reports: {reports}')
+     return reports
+
+
+ def process_report_name(report_name: str):
+     prefix, report_name = report_name.split('@')
+     model_name, datasets = report_name.split(':')
+     datasets = datasets.split(',')
+     return prefix, model_name, datasets
+
+
+ def load_single_report(root_path: str, report_name: str):
+     prefix, model_name, datasets = process_report_name(report_name)
+     report_path_list = os.path.join(root_path, prefix, OutputsStructure.REPORTS_DIR, model_name)
+     report_list = get_report_list([report_path_list])
+
+     task_cfg_path = glob.glob(os.path.join(root_path, prefix, OutputsStructure.CONFIGS_DIR, '*.yaml'))[0]
+     task_cfg = yaml_to_dict(task_cfg_path)
+     return report_list, datasets, task_cfg
+
+
+ def load_multi_report(root_path: str, report_names: List[str]):
+     report_list = []
+     for report_name in report_names:
+         prefix, model_name, datasets = process_report_name(report_name)
+         report_path_list = os.path.join(root_path, prefix, OutputsStructure.REPORTS_DIR, model_name)
+         reports = get_report_list([report_path_list])
+         report_list.extend(reports)
+     return report_list
+
+
+ def get_acc_report_df(report_list: List[Report]):
+     data_dict = []
+     for report in report_list:
+         if report.name == DataCollection.NAME:
+             for metric in report.metrics:
+                 for category in metric.categories:
+                     item = {
+                         ReportKey.model_name: report.model_name,
+                         ReportKey.dataset_name: '/'.join(category.name),
+                         ReportKey.score: category.score,
+                         ReportKey.num: category.num,
+                     }
+                     data_dict.append(item)
+         else:
+             item = {
+                 ReportKey.model_name: report.model_name,
+                 ReportKey.dataset_name: report.dataset_name,
+                 ReportKey.score: report.score,
+                 ReportKey.num: report.metrics[0].num,
+             }
+             data_dict.append(item)
+     df = pd.DataFrame.from_dict(data_dict, orient='columns')
+     return df
+
+
+ def get_compare_report_df(acc_df: pd.DataFrame):
+     df = acc_df.pivot_table(index=ReportKey.model_name, columns=ReportKey.dataset_name, values=ReportKey.score)
+     df.reset_index(inplace=True)
+     styler = df.style.background_gradient(cmap='RdYlGn', vmin=0.0, vmax=1.0, axis=0)
+     styler.format(precision=4)
+     return styler
+
+
+ def plot_single_report_scores(df: pd.DataFrame):
+     plot = px.bar(
+         df,
+         x=df[ReportKey.dataset_name],
+         y=df[ReportKey.score],
+         color=df[ReportKey.dataset_name],
+         template='plotly_dark')
+     return plot
+
+
+ def plot_single_report_sunburst(report_list: List[Report]):
+     if report_list[0].name == DataCollection.NAME:
+         df = get_data_frame(report_list)
+         categories = sorted([i for i in df.columns if i.startswith(ReportKey.category_prefix)])
+         path = categories + [ReportKey.subset_name]
+     else:
+         df = get_data_frame(report_list, flatten_metrics=False)
+         categories = sorted([i for i in df.columns if i.startswith(ReportKey.category_prefix)])
+         path = [ReportKey.dataset_name] + categories + [ReportKey.subset_name]
+     logger.debug(f'df: {df}')
+     df[categories] = df[categories].fillna('default')  # NOTE: fillna for empty categories
+     plot = px.sunburst(
+         df,
+         path=path,
+         values=ReportKey.num,
+         color=ReportKey.score,
+         color_continuous_scale='RdYlGn',  # see https://plotly.com/python/builtin-colorscales/
+         color_continuous_midpoint=np.average(df[ReportKey.score], weights=df[ReportKey.num]),
+         template='plotly_dark',
+         maxdepth=3)
+     plot.update_traces(insidetextorientation='radial')
+     plot.update_layout(margin=dict(t=10, l=10, r=10, b=10), coloraxis=dict(cmin=0, cmax=1))
+     return plot
+
+
+ def get_single_dataset_data(df: pd.DataFrame, dataset_name: str):
+     return df[df[ReportKey.dataset_name] == dataset_name]
+
+
+ def plot_single_dataset_scores(df: pd.DataFrame):
+     # TODO: add metric radio and relace category name
+     plot = px.bar(
+         df,
+         x=df[ReportKey.metric_name],
+         y=df[ReportKey.score],
+         color=df[ReportKey.subset_name],
+         template='plotly_dark',
+         barmode='group')
+     return plot
+
+
+ def plot_multi_report_radar(df: pd.DataFrame):
+     fig = go.Figure()
+
+     grouped = df.groupby(ReportKey.model_name)
+     common_datasets = set.intersection(*[set(group[ReportKey.dataset_name]) for _, group in grouped])
+
+     for model_name, group in grouped:
+         common_group = group[group[ReportKey.dataset_name].isin(common_datasets)]
+         fig.add_trace(
+             go.Scatterpolar(
+                 r=common_group[ReportKey.score],
+                 theta=common_group[ReportKey.dataset_name],
+                 name=model_name,
+                 fill='toself'))
+
+     fig.update_layout(
+         template='plotly_dark',
+         polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
+         margin=dict(t=20, l=20, r=20, b=20))
+     return fig
+
+
+ def dict_to_markdown(data) -> str:
+     markdown_lines = []
+
+     for key, value in data.items():
+         bold_key = f'**{key}**'
+
+         if isinstance(value, list):
+             value_str = '\n' + '\n'.join([f' - {item}' for item in value])
+         elif isinstance(value, dict):
+             value_str = dict_to_markdown(value)
+         else:
+             value_str = str(value)
+
+         value_str = process_string(value_str)
+         markdown_line = f'{bold_key}: {value_str}'
+         markdown_lines.append(markdown_line)
+
+     return '\n\n'.join(markdown_lines)
+
+
+ def process_string(string: str, max_length: int = 2048) -> str:
+     if len(string) > max_length:
+         return f'{string[:max_length // 2]}......{string[-max_length // 2:]}'
+     return string
+
+
+ def process_model_prediction(item: Any):
+     if isinstance(item, dict):
+         return dict_to_markdown(item)
+     elif isinstance(item, list):
+         return '\n'.join([process_model_prediction(item) for item in item])
+     else:
+         return process_string(str(item))
+
+
+ def normalize_score(score):
+     if isinstance(score, bool):
+         return 1.0 if score else 0.0
+     elif isinstance(score, dict):
+         for key in score:
+             return float(score[key])
+         return 0.0
+     else:
+         try:
+             return float(score)
+         except (ValueError, TypeError):
+             return 0.0
+
+
+ def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subset_name: str):
+     data_path = os.path.join(work_dir, OutputsStructure.REVIEWS_DIR, model_name)
+     subset_name = subset_name.replace('/', '_')  # for collection report
+     origin_df = pd.read_json(os.path.join(data_path, f'{dataset_name}_{subset_name}.jsonl'), lines=True)
+     ds = []
+     for i, item in origin_df.iterrows():
+         raw_input = item['raw_input']
+         raw_pred_answer = item['choices'][0]['message']['content']
+         parsed_gold_answer = item['choices'][0]['review']['gold']
+         parsed_pred_answer = item['choices'][0]['review']['pred']
+         score = item['choices'][0]['review']['result']
+         raw_d = {
+             'Input': raw_input,
+             'Generated': raw_pred_answer,
+             'Gold': parsed_gold_answer if parsed_gold_answer != raw_input else '*Same as Input*',
+             'Pred': parsed_pred_answer if parsed_pred_answer != raw_pred_answer else '*Same as Generated*',
+             'Score': score,
+             'NScore': normalize_score(score)
+         }
+         ds.append(raw_d)
+
+     df_subset = pd.DataFrame(ds)
+     return df_subset
+
+
+ def get_table_data(data_review_df: pd.DataFrame, page: int = 1, rows_per_page: int = 1) -> pd.DataFrame:
+     if data_review_df is None:
+         return None
+
+     logger.debug(f'page: {page}, rows_per_page: {rows_per_page}')
+     start = (page - 1) * rows_per_page
+     end = start + rows_per_page
+     df_subset = data_review_df.iloc[start:end].copy()
+     df_subset['Input'] = df_subset['Input'].map(process_model_prediction).astype(str)
+     df_subset['Score'] = df_subset['Score'].map(process_model_prediction).astype(str)
+     return df_subset
+
+
+ @dataclass
+ class SidebarComponents:
+     root_path: gr.Textbox
+     reports_dropdown: gr.Dropdown
+     load_btn: gr.Button
+
+
+ def create_sidebar():
+     gr.Markdown('## Settings')
+     root_path = gr.Textbox(label='Report(s) Root Path', value='./outputs', placeholder='./outputs', lines=1)
+     reports_dropdown = gr.Dropdown(label='Select Report(s)', choices=[], multiselect=True, interactive=True)
+     load_btn = gr.Button('Load & View')
+     gr.Markdown('### Note: Select report(s) and click `Load & View` to view the data!')
+
+     @reports_dropdown.focus(inputs=[root_path], outputs=[reports_dropdown])
+     def update_dropdown_choices(root_path):
+         folders = scan_for_report_folders(root_path)
+         if len(folders) == 0:
+             gr.Warning('No reports found, please check the path', duration=3)
+         return gr.update(choices=folders)
+
+     return SidebarComponents(
+         root_path=root_path,
+         reports_dropdown=reports_dropdown,
+         load_btn=load_btn,
+     )
+
+
+ @dataclass
+ class SingleModelComponents:
+     report_name: gr.Dropdown
+
+
+ def create_single_model_tab(sidebar: SidebarComponents):
+     report_name = gr.Dropdown(label='Select Report', choices=[], interactive=True)
+     work_dir = gr.State(None)
+     model_name = gr.State(None)
+
+     with gr.Accordion('Task Config', open=False):
+         task_config = gr.JSON(value=None)
+
+     report_list = gr.State([])
+
+     with gr.Tab('Datasets Overview'):
+         gr.Markdown('### Dataset Components')
+         sunburst_plot = gr.Plot(value=None, scale=1, label='Components')
+         gr.Markdown('### Dataset Scores')
+         score_plot = gr.Plot(value=None, scale=1, label='Scores')
+         gr.Markdown('### Dataset Scores Table')
+         score_table = gr.DataFrame(value=None)
+
+     with gr.Tab('Dataset Details'):
+         dataset_radio = gr.Radio(label='Select Dataset', choices=[], show_label=True, interactive=True)
+         gr.Markdown('### Dataset Scores')
+         dataset_plot = gr.Plot(value=None, scale=1, label='Scores')
+         gr.Markdown('### Dataset Scores Table')
+         dataset_table = gr.DataFrame(value=None)
+
+         gr.Markdown('### Model Prediction')
+         subset_radio = gr.Radio(label='Select Subset', choices=[], show_label=True, interactive=True)
+         with gr.Row():
+             answer_mode_radio = gr.Radio(
+                 label='Answer Mode', choices=['All', 'Pass', 'Fail'], value='All', interactive=True)
+             page_number = gr.Number(value=1, label='Page', minimum=1, maximum=1, step=1, interactive=True)
+             answer_mode_counts = gr.Markdown('', label='Counts')
+         data_review_df = gr.State(None)
+         filtered_review_df = gr.State(None)
+         data_review_table = gr.DataFrame(
+             value=None,
+             datatype=['markdown', 'markdown', 'markdown', 'markdown', 'markdown', 'number'],
+             # column_widths=['500px', '500px'],
+             wrap=True,
+             latex_delimiters=[{
+                 'left': '$$',
+                 'right': '$$',
+                 'display': True
+             }, {
+                 'left': '$',
+                 'right': '$',
+                 'display': False
+             }, {
+                 'left': '\\(',
+                 'right': '\\)',
+                 'display': False
+             }, {
+                 'left': '\\[',
+                 'right': '\\]',
+                 'display': True
+             }],
+             max_height=500)
+
+     @report_name.change(
+         inputs=[sidebar.root_path, report_name],
+         outputs=[report_list, task_config, dataset_radio, work_dir, model_name])
+     def update_single_report_data(root_path, report_name):
+         report_list, datasets, task_cfg = load_single_report(root_path, report_name)
+         work_dir = os.path.join(root_path, report_name.split('@')[0])
+         model_name = report_name.split('@')[1].split(':')[0]
+         return (report_list, task_cfg, gr.update(choices=datasets, value=datasets[0]), work_dir, model_name)
+
+     @report_list.change(inputs=[report_list], outputs=[score_plot, score_table, sunburst_plot])
+     def update_single_report_score(report_list):
+         report_score_df = get_acc_report_df(report_list)
+         report_score_plot = plot_single_report_scores(report_score_df)
+         report_sunburst_plot = plot_single_report_sunburst(report_list)
+         return report_score_plot, report_score_df, report_sunburst_plot
+
+     @gr.on(
+         triggers=[dataset_radio.change, report_list.change],
+         inputs=[dataset_radio, report_list],
+         outputs=[dataset_plot, dataset_table, subset_radio])
+     def update_single_report_dataset(dataset_name, report_list):
+         logger.debug(f'Updating single report dataset: {dataset_name}')
+         report_df = get_data_frame(report_list)
+         data_score_df = get_single_dataset_data(report_df, dataset_name)
+         data_score_plot = plot_single_dataset_scores(data_score_df)
+         subsets = data_score_df[ReportKey.subset_name].unique().tolist()
+         logger.debug(f'subsets: {subsets}')
+         return data_score_plot, data_score_df, gr.update(choices=subsets, value=subsets[0])
+
+     @subset_radio.change(
+         inputs=[work_dir, model_name, dataset_radio, subset_radio], outputs=[data_review_df, page_number])
+     def update_single_report_subset(work_dir, model_name, dataset_name, subset_name):
+         if not subset_name:
+             return gr.skip()
+         data_review_df = get_model_prediction(work_dir, model_name, dataset_name, subset_name)
+         return data_review_df, 1
+
+     @gr.on(
+         triggers=[data_review_df.change, answer_mode_radio.change],
+         inputs=[data_review_df, answer_mode_radio],
+         outputs=[filtered_review_df, page_number, answer_mode_counts])
+     def filter_data(data_review_df, answer_mode):
+         if data_review_df is None:
+             return None, gr.update(value=1, maximum=1), ''
+
+         all_count = len(data_review_df)
+         pass_df = data_review_df[data_review_df['NScore'] >= 0.99]
+         pass_count = len(pass_df)
+         fail_count = all_count - pass_count
+
+         counts_text = f'### All: {all_count} | Pass: {pass_count} | Fail: {fail_count}'
+
+         if answer_mode == 'Pass':
+             filtered_df = pass_df
+         elif answer_mode == 'Fail':
+             filtered_df = data_review_df[data_review_df['NScore'] < 0.99]
+         else:
+             filtered_df = data_review_df
+
+         max_page = max(1, len(filtered_df))
+
+         return (filtered_df, gr.update(value=1, maximum=max_page), counts_text)
+
+     @gr.on(
+         triggers=[filtered_review_df.change, page_number.change],
+         inputs=[filtered_review_df, page_number],
+         outputs=[data_review_table])
+     def update_table(filtered_df, page_number):
+         subset_df = get_table_data(filtered_df, page_number)
+         if subset_df is None:
+             return gr.skip()
+         return subset_df
+
+     return SingleModelComponents(report_name=report_name)
+
+
+ @dataclass
+ class MultiModelComponents:
+     multi_report_name: gr.Dropdown
+
+
+ def create_multi_model_tab(sidebar: SidebarComponents):
+     multi_report_name = gr.Dropdown(label='Select Reports', choices=[], multiselect=True, interactive=True)
+     gr.Markdown('### Model Radar')
+     radar_plot = gr.Plot(value=None)
+     gr.Markdown('### Model Scores')
+     score_table = gr.DataFrame(value=None)
+
+     @multi_report_name.change(inputs=[sidebar.root_path, multi_report_name], outputs=[radar_plot, score_table])
+     def update_multi_report_data(root_path, multi_report_name):
+         if not multi_report_name:
+             return gr.skip()
+         report_list = load_multi_report(root_path, multi_report_name)
+         report_df = get_acc_report_df(report_list)
+         report_radar_plot = plot_multi_report_radar(report_df)
+         report_compare_df = get_compare_report_df(report_df)
+         return report_radar_plot, report_compare_df
+
+     return MultiModelComponents(multi_report_name=multi_report_name)
+
+
+ def create_app():
+     with gr.Blocks(title='Evalscope Dashboard') as demo:
+         with gr.Row():
+             with gr.Column(scale=0, min_width=35):
+                 toggle_btn = gr.Button('<')
+             with gr.Column(scale=1):
+                 gr.HTML('<h1 style="text-align: left;">Evalscope Dashboard</h1>')  # 文本列
+
+         with gr.Row():
+             with gr.Column(scale=1) as sidebar_column:
+                 sidebar_visible = gr.State(True)
+                 sidebar = create_sidebar()
+
+             with gr.Column(scale=5):
+
+                 with gr.Column(visible=True):
+                     gr.Markdown('## Visualization')
+                     with gr.Tabs():
+                         with gr.Tab('Single Model'):
+                             single = create_single_model_tab(sidebar)
+
+                         with gr.Tab('Multi Model'):
+                             multi = create_multi_model_tab(sidebar)
+
+         @sidebar.load_btn.click(
+             inputs=[sidebar.reports_dropdown], outputs=[single.report_name, multi.multi_report_name])
+         def update_displays(reports_dropdown):
+             if not reports_dropdown:
+                 gr.Warning('No reports found, please check the path', duration=3)
+                 return gr.skip()
+
+             return (
+                 gr.update(choices=reports_dropdown, value=reports_dropdown[0]),  # update single model dropdown
+                 gr.update(choices=reports_dropdown, value=reports_dropdown)  # update multi model dropdown
+             )
+
+         @toggle_btn.click(inputs=[sidebar_visible], outputs=[sidebar_column, sidebar_visible, toggle_btn])
+         def toggle_sidebar(visible):
+             new_visible = not visible
+             text = '<' if new_visible else '>'
+             return gr.update(visible=new_visible), new_visible, gr.update(value=text)
+
+     demo.launch()
+
+
+ if __name__ == '__main__':
+     create_app()
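
The new app.py assembles the report helpers into a Gradio dashboard: a sidebar that scans an outputs directory for report folders, a single-model tab with dataset overview, per-dataset drill-down and prediction review, and a multi-model tab with a radar chart and comparison table. Because create_app() is guarded behind __main__, the dashboard can also be started programmatically; a minimal launch sketch, assuming evalscope 0.10.0 with gradio installed and reports under ./outputs (the new evalscope/cli/start_app.py listed above presumably wires the same entry point into the CLI):

    # Start the visualization dashboard over locally stored evaluation reports.
    from evalscope.report.app import create_app

    create_app()  # launches a Gradio server; set 'Report(s) Root Path' to ./outputs in the UI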
evalscope/report/combinator.py (new file)
@@ -0,0 +1,73 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ import glob
+ import os
+ import pandas as pd
+ from tabulate import tabulate
+ from typing import List, Tuple
+
+ from evalscope.report.utils import Report
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+ """
+ Combine and generate table for reports of LLMs.
+ """
+
+
+ def get_report_list(reports_path_list: List[str]) -> List[Report]:
+     report_list: List[Report] = []
+     # Iterate over each report path
+     for report_path in reports_path_list:
+         model_report_dir = os.path.normpath(report_path)
+         report_files = glob.glob(os.path.join(model_report_dir, '**', '*.json'), recursive=True)
+         # Iterate over each report file
+         for file_path in report_files:
+             try:
+                 report = Report.from_json(file_path)
+                 report_list.append(report)
+             except Exception as e:
+                 logger.error(f'Error loading report from {file_path}: {e}')
+     report_list = sorted(report_list, key=lambda x: (x.model_name, x.dataset_name))
+     return report_list
+
+
+ def get_data_frame(report_list: List[Report],
+                    flatten_metrics: bool = True,
+                    flatten_categories: bool = True) -> pd.DataFrame:
+     tables = []
+     for report in report_list:
+         df = report.to_dataframe(flatten_metrics=flatten_metrics, flatten_categories=flatten_categories)
+         tables.append(df)
+     return pd.concat(tables, ignore_index=True)
+
+
+ def gen_table(reports_path_list: list) -> str:
+     report_list = get_report_list(reports_path_list)
+     table = get_data_frame(report_list)
+     return tabulate(table, headers=table.columns, tablefmt='grid', showindex=False)
+
+
+ class ReportsRecorder:
+     COMMON_DATASET_PATH = []
+     CUSTOM_DATASET_PATH = []
+
+     def __init__(self, oss_url: str = '', endpoint: str = ''):
+         pass
+
+
+ if __name__ == '__main__':
+     report_dir_1 = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250117_151926'
+     # report_dir_2 = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250107_204445/reports'
+
+     report_table = gen_table([report_dir_1])
+     print(report_table)
+
+     # ALL VALUES ONLY FOR EXAMPLE
+     # +--------------------------+-------------------+-------------+
+     # | Model                    | CompetitionMath   | GSM8K       |
+     # +==========================+===================+=============+
+     # | ZhipuAI_chatglm2-6b-base | 25.0 (acc)        | 30.50 (acc) |
+     # +--------------------------+-------------------+-------------+
+     # | ZhipuAI_chatglm2-6b      | 30.5 (acc)        | 40.50 (acc) |
+     # +--------------------------+-------------------+-------------+