evalscope 0.8.2__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. evalscope/__init__.py +2 -0
  2. evalscope/arguments.py +11 -3
  3. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  4. evalscope/backend/rag_eval/utils/llm.py +1 -1
  5. evalscope/benchmarks/__init__.py +20 -1
  6. evalscope/benchmarks/arc/__init__.py +0 -5
  7. evalscope/benchmarks/arc/arc_adapter.py +24 -102
  8. evalscope/benchmarks/bbh/__init__.py +0 -4
  9. evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
  10. evalscope/benchmarks/benchmark.py +70 -59
  11. evalscope/benchmarks/ceval/__init__.py +0 -5
  12. evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
  13. evalscope/benchmarks/cmmlu/__init__.py +0 -5
  14. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
  15. evalscope/benchmarks/competition_math/__init__.py +0 -5
  16. evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
  17. evalscope/benchmarks/data_adapter.py +115 -87
  18. evalscope/benchmarks/general_qa/__init__.py +0 -5
  19. evalscope/benchmarks/general_qa/general_qa_adapter.py +23 -79
  20. evalscope/benchmarks/gsm8k/__init__.py +0 -4
  21. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +21 -101
  22. evalscope/benchmarks/hellaswag/__init__.py +0 -5
  23. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +32 -99
  24. evalscope/benchmarks/humaneval/__init__.py +0 -4
  25. evalscope/benchmarks/humaneval/humaneval_adapter.py +18 -120
  26. evalscope/benchmarks/ifeval/__init__.py +0 -0
  27. evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
  28. evalscope/benchmarks/ifeval/instructions.py +1478 -0
  29. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  30. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  31. evalscope/benchmarks/ifeval/utils.py +134 -0
  32. evalscope/benchmarks/iquiz/__init__.py +0 -0
  33. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  34. evalscope/benchmarks/mmlu/__init__.py +0 -5
  35. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
  36. evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  37. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  38. evalscope/benchmarks/race/__init__.py +0 -5
  39. evalscope/benchmarks/race/race_adapter.py +26 -123
  40. evalscope/benchmarks/trivia_qa/__init__.py +0 -5
  41. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
  42. evalscope/benchmarks/truthful_qa/__init__.py +0 -5
  43. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
  44. evalscope/cli/cli.py +2 -0
  45. evalscope/cli/start_app.py +29 -0
  46. evalscope/collections/__init__.py +3 -0
  47. evalscope/collections/evaluator.py +198 -0
  48. evalscope/collections/sampler.py +138 -0
  49. evalscope/collections/schema.py +126 -0
  50. evalscope/config.py +7 -5
  51. evalscope/constants.py +9 -26
  52. evalscope/evaluator/evaluator.py +87 -121
  53. evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
  54. evalscope/metrics/__init__.py +3 -0
  55. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  56. evalscope/metrics/math_accuracy.py +193 -50
  57. evalscope/metrics/metrics.py +18 -6
  58. evalscope/metrics/named_metrics.py +17 -0
  59. evalscope/metrics/rouge_metric.py +13 -8
  60. evalscope/models/__init__.py +14 -1
  61. evalscope/models/base_adapter.py +52 -0
  62. evalscope/models/chat_adapter.py +138 -0
  63. evalscope/models/choice_adapter.py +211 -0
  64. evalscope/models/custom_adapter.py +67 -0
  65. evalscope/models/local_model.py +74 -0
  66. evalscope/models/model.py +141 -0
  67. evalscope/models/server_adapter.py +111 -0
  68. evalscope/perf/__init__.py +1 -0
  69. evalscope/perf/main.py +0 -1
  70. evalscope/perf/plugin/api/custom_api.py +1 -1
  71. evalscope/perf/plugin/api/openai_api.py +1 -1
  72. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  73. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  74. evalscope/report/__init__.py +5 -0
  75. evalscope/report/app.py +506 -0
  76. evalscope/report/combinator.py +73 -0
  77. evalscope/report/generator.py +80 -0
  78. evalscope/report/utils.py +133 -0
  79. evalscope/run.py +48 -72
  80. evalscope/run_arena.py +1 -1
  81. evalscope/summarizer.py +1 -1
  82. evalscope/utils/__init__.py +1 -1
  83. evalscope/utils/chat_service.py +5 -4
  84. evalscope/utils/io_utils.py +8 -0
  85. evalscope/utils/logger.py +5 -0
  86. evalscope/utils/model_utils.py +15 -2
  87. evalscope/utils/utils.py +3 -25
  88. evalscope/version.py +2 -2
  89. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/METADATA +115 -21
  90. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/RECORD +99 -78
  91. tests/cli/test_collection.py +57 -0
  92. tests/cli/test_run.py +52 -1
  93. tests/rag/test_mteb.py +3 -2
  94. evalscope/models/api/__init__.py +0 -3
  95. evalscope/models/dummy_chat_model.py +0 -49
  96. evalscope/models/model_adapter.py +0 -525
  97. evalscope/models/openai_model.py +0 -103
  98. evalscope/tools/__init__.py +0 -1
  99. evalscope/tools/combine_reports.py +0 -133
  100. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  101. /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
  102. /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
  103. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
  104. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
  105. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
  106. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/evalscope/report/app.py
@@ -0,0 +1,506 @@
+import glob
+import gradio as gr
+import numpy as np
+import os
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+from dataclasses import dataclass
+from typing import Any, List, Union
+
+from evalscope.constants import DataCollection
+from evalscope.report import Report, ReportKey, get_data_frame, get_report_list
+from evalscope.utils.io_utils import OutputsStructure, yaml_to_dict
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+def scan_for_report_folders(root_path):
+    """Scan for folders containing reports subdirectories"""
+    logger.debug(f'Scanning for report folders in {root_path}')
+    if not os.path.exists(root_path):
+        return []
+
+    reports = []
+    # Iterate over all folders in the root path
+    for folder in glob.glob(os.path.join(root_path, '*')):
+        # Check if reports folder exists
+        reports_path = os.path.join(folder, OutputsStructure.REPORTS_DIR)
+        if not os.path.exists(reports_path):
+            continue
+
+        # Iterate over all items in reports folder
+        for model_item in glob.glob(os.path.join(reports_path, '*')):
+            if not os.path.isdir(model_item):
+                continue
+            datasets = []
+            for dataset_item in glob.glob(os.path.join(model_item, '*.json')):
+                datasets.append(os.path.basename(dataset_item).split('.')[0])
+            datasets = ','.join(datasets)
+            reports.append(f'{os.path.basename(folder)}@{os.path.basename(model_item)}:{datasets}')
+
+    reports = sorted(reports, reverse=True)
+    logger.debug(f'reports: {reports}')
+    return reports
+
+
+def process_report_name(report_name: str):
+    prefix, report_name = report_name.split('@')
+    model_name, datasets = report_name.split(':')
+    datasets = datasets.split(',')
+    return prefix, model_name, datasets
+
+
+def load_single_report(root_path: str, report_name: str):
+    prefix, model_name, datasets = process_report_name(report_name)
+    report_path_list = os.path.join(root_path, prefix, OutputsStructure.REPORTS_DIR, model_name)
+    report_list = get_report_list([report_path_list])
+
+    task_cfg_path = glob.glob(os.path.join(root_path, prefix, OutputsStructure.CONFIGS_DIR, '*.yaml'))[0]
+    task_cfg = yaml_to_dict(task_cfg_path)
+    return report_list, datasets, task_cfg
+
+
+def load_multi_report(root_path: str, report_names: List[str]):
+    report_list = []
+    for report_name in report_names:
+        prefix, model_name, datasets = process_report_name(report_name)
+        report_path_list = os.path.join(root_path, prefix, OutputsStructure.REPORTS_DIR, model_name)
+        reports = get_report_list([report_path_list])
+        report_list.extend(reports)
+    return report_list
+
+
+def get_acc_report_df(report_list: List[Report]):
+    data_dict = []
+    for report in report_list:
+        if report.name == DataCollection.NAME:
+            for metric in report.metrics:
+                for category in metric.categories:
+                    item = {
+                        ReportKey.model_name: report.model_name,
+                        ReportKey.dataset_name: '/'.join(category.name),
+                        ReportKey.score: category.score,
+                        ReportKey.num: category.num,
+                    }
+                    data_dict.append(item)
+        else:
+            item = {
+                ReportKey.model_name: report.model_name,
+                ReportKey.dataset_name: report.dataset_name,
+                ReportKey.score: report.score,
+                ReportKey.num: report.metrics[0].num,
+            }
+            data_dict.append(item)
+    df = pd.DataFrame.from_dict(data_dict, orient='columns')
+    return df
+
+
+def get_compare_report_df(acc_df: pd.DataFrame):
+    df = acc_df.pivot_table(index=ReportKey.model_name, columns=ReportKey.dataset_name, values=ReportKey.score)
+    df.reset_index(inplace=True)
+    styler = df.style.background_gradient(cmap='RdYlGn', vmin=0.0, vmax=1.0, axis=0)
+    styler.format(precision=4)
+    return styler
+
+
+def plot_single_report_scores(df: pd.DataFrame):
+    plot = px.bar(
+        df,
+        x=df[ReportKey.dataset_name],
+        y=df[ReportKey.score],
+        color=df[ReportKey.dataset_name],
+        template='plotly_dark')
+    return plot
+
+
+def plot_single_report_sunburst(report_list: List[Report]):
+    if report_list[0].name == DataCollection.NAME:
+        df = get_data_frame(report_list)
+        categories = sorted([i for i in df.columns if i.startswith(ReportKey.category_prefix)])
+        path = categories + [ReportKey.subset_name]
+    else:
+        df = get_data_frame(report_list, flatten_metrics=False)
+        categories = sorted([i for i in df.columns if i.startswith(ReportKey.category_prefix)])
+        path = [ReportKey.dataset_name] + categories + [ReportKey.subset_name]
+    logger.debug(f'df: {df}')
+    df[categories] = df[categories].fillna('default')  # NOTE: fillna for empty categories
+    plot = px.sunburst(
+        df,
+        path=path,
+        values=ReportKey.num,
+        color=ReportKey.score,
+        color_continuous_scale='RdYlGn',  # see https://plotly.com/python/builtin-colorscales/
+        color_continuous_midpoint=np.average(df[ReportKey.score], weights=df[ReportKey.num]),
+        template='plotly_dark',
+        maxdepth=3)
+    plot.update_traces(insidetextorientation='radial')
+    plot.update_layout(margin=dict(t=10, l=10, r=10, b=10), coloraxis=dict(cmin=0, cmax=1))
+    return plot
+
+
+def get_single_dataset_data(df: pd.DataFrame, dataset_name: str):
+    return df[df[ReportKey.dataset_name] == dataset_name]
+
+
+def plot_single_dataset_scores(df: pd.DataFrame):
+    # TODO: add metric radio and replace category name
+    plot = px.bar(
+        df,
+        x=df[ReportKey.metric_name],
+        y=df[ReportKey.score],
+        color=df[ReportKey.subset_name],
+        template='plotly_dark',
+        barmode='group')
+    return plot
+
+
+def plot_multi_report_radar(df: pd.DataFrame):
+    fig = go.Figure()
+
+    grouped = df.groupby(ReportKey.model_name)
+    common_datasets = set.intersection(*[set(group[ReportKey.dataset_name]) for _, group in grouped])
+
+    for model_name, group in grouped:
+        common_group = group[group[ReportKey.dataset_name].isin(common_datasets)]
+        fig.add_trace(
+            go.Scatterpolar(
+                r=common_group[ReportKey.score],
+                theta=common_group[ReportKey.dataset_name],
+                name=model_name,
+                fill='toself'))
+
+    fig.update_layout(
+        template='plotly_dark',
+        polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
+        margin=dict(t=20, l=20, r=20, b=20))
+    return fig
+
+
+def dict_to_markdown(data) -> str:
+    markdown_lines = []
+
+    for key, value in data.items():
+        bold_key = f'**{key}**'
+
+        if isinstance(value, list):
+            value_str = '\n' + '\n'.join([f' - {item}' for item in value])
+        elif isinstance(value, dict):
+            value_str = dict_to_markdown(value)
+        else:
+            value_str = str(value)
+
+        value_str = process_string(value_str)
+        markdown_line = f'{bold_key}: {value_str}'
+        markdown_lines.append(markdown_line)
+
+    return '\n\n'.join(markdown_lines)
+
+
+def process_string(string: str, max_length: int = 2048) -> str:
+    if len(string) > max_length:
+        return f'{string[:max_length // 2]}......{string[-max_length // 2:]}'
+    return string
+
+
+def process_model_prediction(item: Any):
+    if isinstance(item, dict):
+        return dict_to_markdown(item)
+    elif isinstance(item, list):
+        return '\n'.join([process_model_prediction(item) for item in item])
+    else:
+        return process_string(str(item))
+
+
+def normalize_score(score):
+    if isinstance(score, bool):
+        return 1.0 if score else 0.0
+    elif isinstance(score, dict):
+        for key in score:
+            return float(score[key])
+        return 0.0
+    else:
+        try:
+            return float(score)
+        except (ValueError, TypeError):
+            return 0.0
+
+
+def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subset_name: str):
+    data_path = os.path.join(work_dir, OutputsStructure.REVIEWS_DIR, model_name)
+    subset_name = subset_name.replace('/', '_')  # for collection report
+    origin_df = pd.read_json(os.path.join(data_path, f'{dataset_name}_{subset_name}.jsonl'), lines=True)
+    ds = []
+    for i, item in origin_df.iterrows():
+        raw_input = item['raw_input']
+        raw_pred_answer = item['choices'][0]['message']['content']
+        parsed_gold_answer = item['choices'][0]['review']['gold']
+        parsed_pred_answer = item['choices'][0]['review']['pred']
+        score = item['choices'][0]['review']['result']
+        raw_d = {
+            'Input': raw_input,
+            'Generated': raw_pred_answer,
+            'Gold': parsed_gold_answer if parsed_gold_answer != raw_input else '*Same as Input*',
+            'Pred': parsed_pred_answer if parsed_pred_answer != raw_pred_answer else '*Same as Generated*',
+            'Score': score,
+            'NScore': normalize_score(score)
+        }
+        ds.append(raw_d)
+
+    df_subset = pd.DataFrame(ds)
+    return df_subset
+
+
+def get_table_data(data_review_df: pd.DataFrame, page: int = 1, rows_per_page: int = 1) -> pd.DataFrame:
+    if data_review_df is None:
+        return None
+
+    logger.debug(f'page: {page}, rows_per_page: {rows_per_page}')
+    start = (page - 1) * rows_per_page
+    end = start + rows_per_page
+    df_subset = data_review_df.iloc[start:end].copy()
+    df_subset['Input'] = df_subset['Input'].map(process_model_prediction).astype(str)
+    df_subset['Score'] = df_subset['Score'].map(process_model_prediction).astype(str)
+    return df_subset
+
+
+@dataclass
+class SidebarComponents:
+    root_path: gr.Textbox
+    reports_dropdown: gr.Dropdown
+    load_btn: gr.Button
+
+
+def create_sidebar():
+    gr.Markdown('## Settings')
+    root_path = gr.Textbox(label='Report(s) Root Path', value='./outputs', placeholder='./outputs', lines=1)
+    reports_dropdown = gr.Dropdown(label='Select Report(s)', choices=[], multiselect=True, interactive=True)
+    load_btn = gr.Button('Load & View')
+    gr.Markdown('### Note: Select report(s) and click `Load & View` to view the data!')
+
+    @reports_dropdown.focus(inputs=[root_path], outputs=[reports_dropdown])
+    def update_dropdown_choices(root_path):
+        folders = scan_for_report_folders(root_path)
+        if len(folders) == 0:
+            gr.Warning('No reports found, please check the path', duration=3)
+        return gr.update(choices=folders)
+
+    return SidebarComponents(
+        root_path=root_path,
+        reports_dropdown=reports_dropdown,
+        load_btn=load_btn,
+    )
+
+
+@dataclass
+class SingleModelComponents:
+    report_name: gr.Dropdown
+
+
+def create_single_model_tab(sidebar: SidebarComponents):
+    report_name = gr.Dropdown(label='Select Report', choices=[], interactive=True)
+    work_dir = gr.State(None)
+    model_name = gr.State(None)
+
+    with gr.Accordion('Task Config', open=False):
+        task_config = gr.JSON(value=None)
+
+    report_list = gr.State([])
+
+    with gr.Tab('Datasets Overview'):
+        gr.Markdown('### Dataset Components')
+        sunburst_plot = gr.Plot(value=None, scale=1, label='Components')
+        gr.Markdown('### Dataset Scores')
+        score_plot = gr.Plot(value=None, scale=1, label='Scores')
+        gr.Markdown('### Dataset Scores Table')
+        score_table = gr.DataFrame(value=None)
+
+    with gr.Tab('Dataset Details'):
+        dataset_radio = gr.Radio(label='Select Dataset', choices=[], show_label=True, interactive=True)
+        gr.Markdown('### Dataset Scores')
+        dataset_plot = gr.Plot(value=None, scale=1, label='Scores')
+        gr.Markdown('### Dataset Scores Table')
+        dataset_table = gr.DataFrame(value=None)
+
+        gr.Markdown('### Model Prediction')
+        subset_radio = gr.Radio(label='Select Subset', choices=[], show_label=True, interactive=True)
+        with gr.Row():
+            answer_mode_radio = gr.Radio(
+                label='Answer Mode', choices=['All', 'Pass', 'Fail'], value='All', interactive=True)
+            page_number = gr.Number(value=1, label='Page', minimum=1, maximum=1, step=1, interactive=True)
+            answer_mode_counts = gr.Markdown('', label='Counts')
+        data_review_df = gr.State(None)
+        filtered_review_df = gr.State(None)
+        data_review_table = gr.DataFrame(
+            value=None,
+            datatype=['markdown', 'markdown', 'markdown', 'markdown', 'markdown', 'number'],
+            # column_widths=['500px', '500px'],
+            wrap=True,
+            latex_delimiters=[{
+                'left': '$$',
+                'right': '$$',
+                'display': True
+            }, {
+                'left': '$',
+                'right': '$',
+                'display': False
+            }, {
+                'left': '\\(',
+                'right': '\\)',
+                'display': False
+            }, {
+                'left': '\\[',
+                'right': '\\]',
+                'display': True
+            }],
+            max_height=500)
+
+    @report_name.change(
+        inputs=[sidebar.root_path, report_name],
+        outputs=[report_list, task_config, dataset_radio, work_dir, model_name])
+    def update_single_report_data(root_path, report_name):
+        report_list, datasets, task_cfg = load_single_report(root_path, report_name)
+        work_dir = os.path.join(root_path, report_name.split('@')[0])
+        model_name = report_name.split('@')[1].split(':')[0]
+        return (report_list, task_cfg, gr.update(choices=datasets, value=datasets[0]), work_dir, model_name)
+
+    @report_list.change(inputs=[report_list], outputs=[score_plot, score_table, sunburst_plot])
+    def update_single_report_score(report_list):
+        report_score_df = get_acc_report_df(report_list)
+        report_score_plot = plot_single_report_scores(report_score_df)
+        report_sunburst_plot = plot_single_report_sunburst(report_list)
+        return report_score_plot, report_score_df, report_sunburst_plot
+
+    @gr.on(
+        triggers=[dataset_radio.change, report_list.change],
+        inputs=[dataset_radio, report_list],
+        outputs=[dataset_plot, dataset_table, subset_radio])
+    def update_single_report_dataset(dataset_name, report_list):
+        logger.debug(f'Updating single report dataset: {dataset_name}')
+        report_df = get_data_frame(report_list)
+        data_score_df = get_single_dataset_data(report_df, dataset_name)
+        data_score_plot = plot_single_dataset_scores(data_score_df)
+        subsets = data_score_df[ReportKey.subset_name].unique().tolist()
+        logger.debug(f'subsets: {subsets}')
+        return data_score_plot, data_score_df, gr.update(choices=subsets, value=subsets[0])
+
+    @subset_radio.change(
+        inputs=[work_dir, model_name, dataset_radio, subset_radio], outputs=[data_review_df, page_number])
+    def update_single_report_subset(work_dir, model_name, dataset_name, subset_name):
+        if not subset_name:
+            return gr.skip()
+        data_review_df = get_model_prediction(work_dir, model_name, dataset_name, subset_name)
+        return data_review_df, 1
+
+    @gr.on(
+        triggers=[data_review_df.change, answer_mode_radio.change],
+        inputs=[data_review_df, answer_mode_radio],
+        outputs=[filtered_review_df, page_number, answer_mode_counts])
+    def filter_data(data_review_df, answer_mode):
+        if data_review_df is None:
+            return None, gr.update(value=1, maximum=1), ''
+
+        all_count = len(data_review_df)
+        pass_df = data_review_df[data_review_df['NScore'] >= 0.99]
+        pass_count = len(pass_df)
+        fail_count = all_count - pass_count
+
+        counts_text = f'### All: {all_count} | Pass: {pass_count} | Fail: {fail_count}'
+
+        if answer_mode == 'Pass':
+            filtered_df = pass_df
+        elif answer_mode == 'Fail':
+            filtered_df = data_review_df[data_review_df['NScore'] < 0.99]
+        else:
+            filtered_df = data_review_df
+
+        max_page = max(1, len(filtered_df))
+
+        return (filtered_df, gr.update(value=1, maximum=max_page), counts_text)
+
+    @gr.on(
+        triggers=[filtered_review_df.change, page_number.change],
+        inputs=[filtered_review_df, page_number],
+        outputs=[data_review_table])
+    def update_table(filtered_df, page_number):
+        subset_df = get_table_data(filtered_df, page_number)
+        if subset_df is None:
+            return gr.skip()
+        return subset_df
+
+    return SingleModelComponents(report_name=report_name)
+
+
+@dataclass
+class MultiModelComponents:
+    multi_report_name: gr.Dropdown
+
+
+def create_multi_model_tab(sidebar: SidebarComponents):
+    multi_report_name = gr.Dropdown(label='Select Reports', choices=[], multiselect=True, interactive=True)
+    gr.Markdown('### Model Radar')
+    radar_plot = gr.Plot(value=None)
+    gr.Markdown('### Model Scores')
+    score_table = gr.DataFrame(value=None)
+
+    @multi_report_name.change(inputs=[sidebar.root_path, multi_report_name], outputs=[radar_plot, score_table])
+    def update_multi_report_data(root_path, multi_report_name):
+        if not multi_report_name:
+            return gr.skip()
+        report_list = load_multi_report(root_path, multi_report_name)
+        report_df = get_acc_report_df(report_list)
+        report_radar_plot = plot_multi_report_radar(report_df)
+        report_compare_df = get_compare_report_df(report_df)
+        return report_radar_plot, report_compare_df
+
+    return MultiModelComponents(multi_report_name=multi_report_name)
+
+
+def create_app():
+    with gr.Blocks(title='Evalscope Dashboard') as demo:
+        with gr.Row():
+            with gr.Column(scale=0, min_width=35):
+                toggle_btn = gr.Button('<')
+            with gr.Column(scale=1):
+                gr.HTML('<h1 style="text-align: left;">Evalscope Dashboard</h1>')  # text column
+
+        with gr.Row():
+            with gr.Column(scale=1) as sidebar_column:
+                sidebar_visible = gr.State(True)
+                sidebar = create_sidebar()
+
+            with gr.Column(scale=5):
+
+                with gr.Column(visible=True):
+                    gr.Markdown('## Visualization')
+                    with gr.Tabs():
+                        with gr.Tab('Single Model'):
+                            single = create_single_model_tab(sidebar)
+
+                        with gr.Tab('Multi Model'):
+                            multi = create_multi_model_tab(sidebar)
+
+        @sidebar.load_btn.click(
+            inputs=[sidebar.reports_dropdown], outputs=[single.report_name, multi.multi_report_name])
+        def update_displays(reports_dropdown):
+            if not reports_dropdown:
+                gr.Warning('No reports found, please check the path', duration=3)
+                return gr.skip()
+
+            return (
+                gr.update(choices=reports_dropdown, value=reports_dropdown[0]),  # update single model dropdown
+                gr.update(choices=reports_dropdown, value=reports_dropdown)  # update multi model dropdown
+            )
+
+        @toggle_btn.click(inputs=[sidebar_visible], outputs=[sidebar_column, sidebar_visible, toggle_btn])
+        def toggle_sidebar(visible):
+            new_visible = not visible
+            text = '<' if new_visible else '>'
+            return gr.update(visible=new_visible), new_visible, gr.update(value=text)
+
+    demo.launch()
+
+
+if __name__ == '__main__':
+    create_app()
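
The `create_app` function above builds the Gradio dashboard and launches it. As a rough usage sketch (not part of the diff), the module could also be invoked directly; the import path follows the file location shown in the file list, and the package's own CLI entry point (added in evalscope/cli/start_app.py) is not shown here:

    # Minimal sketch: launch the visualization dashboard added in evalscope/report/app.py.
    from evalscope.report.app import create_app

    create_app()  # builds the gr.Blocks UI and calls demo.launch()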
--- /dev/null
+++ b/evalscope/report/combinator.py
@@ -0,0 +1,73 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import glob
+import os
+import pandas as pd
+from tabulate import tabulate
+from typing import List, Tuple
+
+from evalscope.report.utils import Report
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+"""
+Combine and generate table for reports of LLMs.
+"""
+
+
+def get_report_list(reports_path_list: List[str]) -> List[Report]:
+    report_list: List[Report] = []
+    # Iterate over each report path
+    for report_path in reports_path_list:
+        model_report_dir = os.path.normpath(report_path)
+        report_files = glob.glob(os.path.join(model_report_dir, '**', '*.json'), recursive=True)
+        # Iterate over each report file
+        for file_path in report_files:
+            try:
+                report = Report.from_json(file_path)
+                report_list.append(report)
+            except Exception as e:
+                logger.error(f'Error loading report from {file_path}: {e}')
+    report_list = sorted(report_list, key=lambda x: (x.model_name, x.dataset_name))
+    return report_list
+
+
+def get_data_frame(report_list: List[Report],
+                   flatten_metrics: bool = True,
+                   flatten_categories: bool = True) -> pd.DataFrame:
+    tables = []
+    for report in report_list:
+        df = report.to_dataframe(flatten_metrics=flatten_metrics, flatten_categories=flatten_categories)
+        tables.append(df)
+    return pd.concat(tables, ignore_index=True)
+
+
+def gen_table(reports_path_list: list) -> str:
+    report_list = get_report_list(reports_path_list)
+    table = get_data_frame(report_list)
+    return tabulate(table, headers=table.columns, tablefmt='grid', showindex=False)
+
+
+class ReportsRecorder:
+    COMMON_DATASET_PATH = []
+    CUSTOM_DATASET_PATH = []
+
+    def __init__(self, oss_url: str = '', endpoint: str = ''):
+        pass
+
+
+if __name__ == '__main__':
+    report_dir_1 = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250117_151926'
+    # report_dir_2 = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250107_204445/reports'
+
+    report_table = gen_table([report_dir_1])
+    print(report_table)
+
+    # ALL VALUES ONLY FOR EXAMPLE
+    # +--------------------------+-------------------+-------------+
+    # | Model                    |   CompetitionMath | GSM8K       |
+    # +==========================+===================+=============+
+    # | ZhipuAI_chatglm2-6b-base |        25.0 (acc) | 30.50 (acc) |
+    # +--------------------------+-------------------+-------------+
+    # | ZhipuAI_chatglm2-6b      |        30.5 (acc) | 40.50 (acc) |
+    # +--------------------------+-------------------+-------------+
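
As a rough usage sketch (not part of the diff), the helpers above can also be used programmatically; `get_report_list` and `get_data_frame` are re-exported from `evalscope.report` (see the import in app.py above), and the outputs path below is hypothetical:

    from evalscope.report import get_data_frame, get_report_list

    # Hypothetical layout: <outputs>/<timestamp>/reports/<model_name>/*.json
    reports = get_report_list(['./outputs/20250117_151926/reports/my-model'])
    df = get_data_frame(reports)  # combined per-subset scores; columns come from Report.to_dataframe (report/utils.py, not shown)
    print(df.head())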
--- /dev/null
+++ b/evalscope/report/generator.py
@@ -0,0 +1,80 @@
+import pandas as pd
+from pandas import DataFrame
+
+from evalscope.constants import DataCollection
+from evalscope.report.utils import *
+
+
+class ReportGenerator:
+
+    @staticmethod
+    def gen_report(subset_score_map: dict, report_name: str, **kwargs) -> Report:
+        """
+        Generate report for specific dataset.
+        subset_score_map: e.g. {subset_name: [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}, {'metric_name': 'WeightedAverageAccuracy', 'score': 0.3389, 'num': 100}]}
+        category_map: e.g. {'subset_name': ['category_name1', 'category_name2'], ...}
+        metric_list: e.g. [{'object': AverageAccuracy, 'name': 'AverageAccuracy'}, {'object': 'WeightedAverageAccuracy', 'name': 'WeightedAverageAccuracy'}]
+        """  # noqa: E501
+
+        dataset_name = kwargs.get('dataset_name', None)
+        model_name = kwargs.get('model_name', None)
+        category_map = kwargs.get('category_map', {})
+
+        def flatten_subset() -> DataFrame:
+            """
+            Flatten subset score map to a DataFrame.
+
+            Example:
+                        name  score  num  categories      metric_name
+            0       ARC-Easy    0.5    2   [default]  AverageAccuracy
+            1  ARC-Challenge    0.5    2   [default]  AverageAccuracy
+            """
+            subsets = []
+            for subset_name, scores in subset_score_map.items():
+                for score_item in scores:
+                    categories = category_map.get(subset_name, ['default'])
+                    if isinstance(categories, str):
+                        categories = [categories]
+                    subsets.append(
+                        dict(
+                            name=subset_name,
+                            score=score_item['score'],
+                            num=score_item['num'],
+                            metric_name=score_item['metric_name'],
+                            categories=tuple(categories)))
+            df = pd.DataFrame(subsets)
+            return df
+
+        df = flatten_subset()
+
+        metrics_list = []
+        for metric_name, group_metric in df.groupby('metric_name'):
+            categories = []
+            for category_name, group_category in group_metric.groupby('categories'):
+                subsets = []
+                for _, row in group_category.iterrows():
+                    subsets.append(Subset(name=row['name'], score=row['score'], num=row['num']))
+
+                categories.append(Category(name=category_name, subsets=subsets))
+
+            metrics_list.append(Metric(name=metric_name, categories=categories))
+
+        report = Report(name=report_name, metrics=metrics_list, dataset_name=dataset_name, model_name=model_name)
+        return report
+
+    @staticmethod
+    def gen_collection_report(df: DataFrame, all_dataset_name: str, model_name: str) -> Report:
+        categories = []
+        for category_name, group_category in df.groupby('categories'):
+            subsets = []
+            for (dataset_name, subset_name), group_subset in group_category.groupby(['dataset_name', 'subset_name']):
+                avg_score = group_subset['score'].mean()
+                num = group_subset['score'].count()
+                subsets.append(Subset(name=f'{dataset_name}/{subset_name}', score=float(avg_score), num=int(num)))
+
+            categories.append(Category(name=category_name, subsets=subsets))
+        return Report(
+            name=DataCollection.NAME,
+            metrics=[Metric(name='Average', categories=categories)],
+            dataset_name=all_dataset_name,
+            model_name=model_name)
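
As a rough usage sketch (not part of the diff), `ReportGenerator.gen_report` can be called with a `subset_score_map` in the format described by its docstring; the dataset, model, and score values below are illustrative only:

    from evalscope.report.generator import ReportGenerator

    # Input format taken from the gen_report docstring above; values are made up.
    subset_score_map = {
        'ARC-Easy': [{'metric_name': 'AverageAccuracy', 'score': 0.5, 'num': 2}],
        'ARC-Challenge': [{'metric_name': 'AverageAccuracy', 'score': 0.5, 'num': 2}],
    }
    report = ReportGenerator.gen_report(
        subset_score_map, 'arc', dataset_name='arc', model_name='my-model')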