evalscope 0.10.0__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.

This release has been flagged as potentially problematic.

Files changed (59)
  1. evalscope/arguments.py +1 -0
  2. evalscope/benchmarks/aime24/__init__.py +0 -0
  3. evalscope/benchmarks/aime24/aime24_adapter.py +49 -0
  4. evalscope/benchmarks/arc/arc_adapter.py +5 -7
  5. evalscope/benchmarks/bbh/bbh_adapter.py +17 -9
  6. evalscope/benchmarks/benchmark.py +2 -2
  7. evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
  8. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
  9. evalscope/benchmarks/competition_math/competition_math_adapter.py +34 -23
  10. evalscope/benchmarks/data_adapter.py +18 -12
  11. evalscope/benchmarks/data_collection/__init__.py +0 -0
  12. evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
  13. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  14. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +129 -0
  15. evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -6
  16. evalscope/benchmarks/gpqa/__init__.py +0 -0
  17. evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
  18. evalscope/benchmarks/gpqa/gpqa_adapter.py +121 -0
  19. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +8 -13
  20. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
  21. evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
  22. evalscope/benchmarks/ifeval/ifeval_adapter.py +14 -14
  23. evalscope/benchmarks/ifeval/instructions.py +3 -4
  24. evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
  25. evalscope/benchmarks/math_500/__init__.py +0 -0
  26. evalscope/benchmarks/math_500/math_500_adapter.py +49 -0
  27. evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
  28. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +27 -15
  29. evalscope/benchmarks/race/race_adapter.py +3 -3
  30. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
  31. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -8
  32. evalscope/cli/start_app.py +3 -2
  33. evalscope/collections/evaluator.py +103 -39
  34. evalscope/collections/sampler.py +2 -1
  35. evalscope/collections/schema.py +1 -2
  36. evalscope/config.py +1 -0
  37. evalscope/evaluator/evaluator.py +78 -64
  38. evalscope/metrics/math_parser.py +526 -0
  39. evalscope/metrics/metrics.py +16 -1
  40. evalscope/metrics/named_metrics.py +31 -7
  41. evalscope/models/chat_adapter.py +69 -47
  42. evalscope/models/choice_adapter.py +52 -45
  43. evalscope/models/custom_adapter.py +2 -2
  44. evalscope/models/local_model.py +4 -0
  45. evalscope/models/server_adapter.py +28 -34
  46. evalscope/report/app.py +298 -96
  47. evalscope/run.py +10 -7
  48. evalscope/utils/chat_service.py +2 -2
  49. evalscope/utils/io_utils.py +1 -1
  50. evalscope/version.py +2 -2
  51. {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/METADATA +20 -11
  52. {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/RECORD +57 -47
  53. tests/cli/test_run.py +93 -16
  54. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  55. evalscope/metrics/math_accuracy.py +0 -200
  56. {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/LICENSE +0 -0
  57. {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/WHEEL +0 -0
  58. {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/entry_points.txt +0 -0
  59. {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/top_level.txt +0 -0
evalscope/report/app.py CHANGED
@@ -1,3 +1,4 @@
+import argparse
 import glob
 import gradio as gr
 import numpy as np
@@ -5,16 +6,20 @@ import os
 import pandas as pd
 import plotly.express as px
 import plotly.graph_objects as go
+import re
 from dataclasses import dataclass
 from typing import Any, List, Union

 from evalscope.constants import DataCollection
 from evalscope.report import Report, ReportKey, get_data_frame, get_report_list
 from evalscope.utils.io_utils import OutputsStructure, yaml_to_dict
-from evalscope.utils.logger import get_logger
+from evalscope.utils.logger import configure_logging, get_logger
+from evalscope.version import __version__

 logger = get_logger()

+PLOTLY_THEME = 'plotly_dark'
+

 def scan_for_report_folders(root_path):
     """Scan for folders containing reports subdirectories"""
@@ -94,24 +99,33 @@ def get_acc_report_df(report_list: List[Report]):
         }
         data_dict.append(item)
     df = pd.DataFrame.from_dict(data_dict, orient='columns')
-    return df
+
+    styler = style_df(df, columns=[ReportKey.score])
+    return df, styler
+
+
+def style_df(df: pd.DataFrame, columns: List[str] = None):
+    # Apply background gradient to the specified columns
+    styler = df.style.background_gradient(subset=columns, cmap='RdYlGn', vmin=0.0, vmax=1.0, axis=0)
+    # Format the dataframe with a precision of 4 decimal places
+    styler.format(precision=4)
+    return styler


 def get_compare_report_df(acc_df: pd.DataFrame):
     df = acc_df.pivot_table(index=ReportKey.model_name, columns=ReportKey.dataset_name, values=ReportKey.score)
     df.reset_index(inplace=True)
-    styler = df.style.background_gradient(cmap='RdYlGn', vmin=0.0, vmax=1.0, axis=0)
-    styler.format(precision=4)
-    return styler
+
+    styler = style_df(df)
+    return df, styler


 def plot_single_report_scores(df: pd.DataFrame):
-    plot = px.bar(
-        df,
-        x=df[ReportKey.dataset_name],
-        y=df[ReportKey.score],
-        color=df[ReportKey.dataset_name],
-        template='plotly_dark')
+    plot = px.bar(df, x=df[ReportKey.dataset_name], y=df[ReportKey.score], text=df[ReportKey.score])
+
+    width = 0.2 if len(df[ReportKey.dataset_name]) <= 5 else None
+    plot.update_traces(width=width, texttemplate='%{text:.2f}', textposition='outside')
+    plot.update_layout(uniformtext_minsize=12, uniformtext_mode='hide', yaxis=dict(range=[0, 1]), template=PLOTLY_THEME)
     return plot


@@ -126,6 +140,7 @@ def plot_single_report_sunburst(report_list: List[Report]):
     path = [ReportKey.dataset_name] + categories + [ReportKey.subset_name]
     logger.debug(f'df: {df}')
     df[categories] = df[categories].fillna('default')  # NOTE: fillna for empty categories
+
     plot = px.sunburst(
         df,
         path=path,
@@ -133,15 +148,17 @@ def plot_single_report_sunburst(report_list: List[Report]):
         color=ReportKey.score,
         color_continuous_scale='RdYlGn',  # see https://plotly.com/python/builtin-colorscales/
         color_continuous_midpoint=np.average(df[ReportKey.score], weights=df[ReportKey.num]),
-        template='plotly_dark',
-        maxdepth=3)
+        template=PLOTLY_THEME,
+        maxdepth=4)
     plot.update_traces(insidetextorientation='radial')
-    plot.update_layout(margin=dict(t=10, l=10, r=10, b=10), coloraxis=dict(cmin=0, cmax=1))
+    plot.update_layout(margin=dict(t=10, l=10, r=10, b=10), coloraxis=dict(cmin=0, cmax=1), height=600)
     return plot


-def get_single_dataset_data(df: pd.DataFrame, dataset_name: str):
-    return df[df[ReportKey.dataset_name] == dataset_name]
+def get_single_dataset_df(df: pd.DataFrame, dataset_name: str):
+    df = df[df[ReportKey.dataset_name] == dataset_name]
+    styler = style_df(df, columns=[ReportKey.score])
+    return df, styler


 def plot_single_dataset_scores(df: pd.DataFrame):
@@ -151,8 +168,12 @@ def plot_single_dataset_scores(df: pd.DataFrame):
         x=df[ReportKey.metric_name],
         y=df[ReportKey.score],
         color=df[ReportKey.subset_name],
-        template='plotly_dark',
+        text=df[ReportKey.score],
         barmode='group')
+
+    width = 0.2 if len(df[ReportKey.subset_name]) <= 5 else None
+    plot.update_traces(width=width, texttemplate='%{text:.2f}', textposition='outside')
+    plot.update_layout(uniformtext_minsize=12, uniformtext_mode='hide', yaxis=dict(range=[0, 1]), template=PLOTLY_THEME)
     return plot


@@ -172,7 +193,7 @@ def plot_multi_report_radar(df: pd.DataFrame):
             fill='toself'))

     fig.update_layout(
-        template='plotly_dark',
+        template=PLOTLY_THEME,
         polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
         margin=dict(t=20, l=20, r=20, b=20))
     return fig
@@ -198,7 +219,16 @@ def dict_to_markdown(data) -> str:
     return '\n\n'.join(markdown_lines)


+def convert_html_tags(text):
+    # match begin label
+    text = re.sub(r'<(\w+)>', r'[\1]', text)
+    # match end label
+    text = re.sub(r'</(\w+)>', r'[/\1]', text)
+    return text
+
+
 def process_string(string: str, max_length: int = 2048) -> str:
+    string = convert_html_tags(string)  # for display labels e.g. `<think>`
     if len(string) > max_length:
         return f'{string[:max_length // 2]}......{string[-max_length // 2:]}'
     return string
@@ -206,9 +236,11 @@ def process_string(string: str, max_length: int = 2048) -> str:

 def process_model_prediction(item: Any):
     if isinstance(item, dict):
-        return dict_to_markdown(item)
+        res = dict_to_markdown(item)
+        return process_string(res)
     elif isinstance(item, list):
-        return '\n'.join([process_model_prediction(item) for item in item])
+        res = '\n'.join([process_model_prediction(item) for item in item])
+        return process_string(res)
     else:
         return process_string(str(item))

@@ -230,23 +262,27 @@ def normalize_score(score):
 def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subset_name: str):
     data_path = os.path.join(work_dir, OutputsStructure.REVIEWS_DIR, model_name)
     subset_name = subset_name.replace('/', '_')  # for collection report
-    origin_df = pd.read_json(os.path.join(data_path, f'{dataset_name}_{subset_name}.jsonl'), lines=True)
+    review_path = os.path.join(data_path, f'{dataset_name}_{subset_name}.jsonl')
+    logger.debug(f'review_path: {review_path}')
+    origin_df = pd.read_json(review_path, lines=True)
+
     ds = []
     for i, item in origin_df.iterrows():
         raw_input = item['raw_input']
-        raw_pred_answer = item['choices'][0]['message']['content']
-        parsed_gold_answer = item['choices'][0]['review']['gold']
-        parsed_pred_answer = item['choices'][0]['review']['pred']
-        score = item['choices'][0]['review']['result']
-        raw_d = {
-            'Input': raw_input,
-            'Generated': raw_pred_answer,
-            'Gold': parsed_gold_answer if parsed_gold_answer != raw_input else '*Same as Input*',
-            'Pred': parsed_pred_answer if parsed_pred_answer != raw_pred_answer else '*Same as Generated*',
-            'Score': score,
-            'NScore': normalize_score(score)
-        }
-        ds.append(raw_d)
+        for choice in item['choices']:
+            raw_pred_answer = choice['message']['content']
+            parsed_gold_answer = choice['review']['gold']
+            parsed_pred_answer = choice['review']['pred']
+            score = choice['review']['result']
+            raw_d = {
+                'Input': raw_input,
+                'Generated': raw_pred_answer,
+                'Gold': parsed_gold_answer if parsed_gold_answer != raw_input else '*Same as Input*',
+                'Pred': parsed_pred_answer if parsed_pred_answer != raw_pred_answer else '*Same as Generated*',
+                'Score': score,
+                'NScore': normalize_score(score)
+            }
+            ds.append(raw_d)

     df_subset = pd.DataFrame(ds)
     return df_subset
@@ -254,15 +290,18 @@ def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subs

 def get_table_data(data_review_df: pd.DataFrame, page: int = 1, rows_per_page: int = 1) -> pd.DataFrame:
     if data_review_df is None:
-        return None
+        return pd.DataFrame(), None

     logger.debug(f'page: {page}, rows_per_page: {rows_per_page}')
     start = (page - 1) * rows_per_page
     end = start + rows_per_page
     df_subset = data_review_df.iloc[start:end].copy()
     df_subset['Input'] = df_subset['Input'].map(process_model_prediction).astype(str)
+    df_subset['Generated'] = df_subset['Generated'].map(process_model_prediction).astype(str)
+    df_subset['Pred'] = df_subset['Pred'].map(process_model_prediction).astype(str)
     df_subset['Score'] = df_subset['Score'].map(process_model_prediction).astype(str)
-    return df_subset
+    styler = style_df(df_subset, columns=['NScore'])
+    return df_subset, styler


 @dataclass
@@ -272,18 +311,47 @@ class SidebarComponents:
     load_btn: gr.Button


-def create_sidebar():
-    gr.Markdown('## Settings')
-    root_path = gr.Textbox(label='Report(s) Root Path', value='./outputs', placeholder='./outputs', lines=1)
-    reports_dropdown = gr.Dropdown(label='Select Report(s)', choices=[], multiselect=True, interactive=True)
-    load_btn = gr.Button('Load & View')
-    gr.Markdown('### Note: Select report(s) and click `Load & View` to view the data!')
+def create_sidebar(outputs_dir: str, lang: str):
+    locale_dict = {
+        'settings': {
+            'zh': '设置',
+            'en': 'Settings'
+        },
+        'report_root_path': {
+            'zh': '报告根路径',
+            'en': 'Report Root Path'
+        },
+        'select_reports': {
+            'zh': '请选择报告',
+            'en': 'Select Reports'
+        },
+        'load_btn': {
+            'zh': '加载并查看',
+            'en': 'Load & View'
+        },
+        'note': {
+            'zh': '请选择报告并点击`加载并查看`来查看数据',
+            'en': 'Please select reports and click `Load & View` to view the data'
+        },
+        'warning': {
+            'zh': '没有找到报告,请检查路径',
+            'en': 'No reports found, please check the path'
+        }
+    }
+
+    gr.Markdown(f'## {locale_dict["settings"][lang]}')
+    root_path = gr.Textbox(
+        label=locale_dict['report_root_path'][lang], value=outputs_dir, placeholder=outputs_dir, lines=1)
+    reports_dropdown = gr.Dropdown(
+        label=locale_dict['select_reports'][lang], choices=[], multiselect=True, interactive=True)
+    load_btn = gr.Button(locale_dict['load_btn'][lang])
+    gr.Markdown(f'### {locale_dict["note"][lang]}')

     @reports_dropdown.focus(inputs=[root_path], outputs=[reports_dropdown])
     def update_dropdown_choices(root_path):
         folders = scan_for_report_folders(root_path)
         if len(folders) == 0:
-            gr.Warning('No reports found, please check the path', duration=3)
+            gr.Warning(locale_dict['warning'][lang], duration=3)
         return gr.update(choices=folders)

     return SidebarComponents(
@@ -293,42 +361,132 @@ def create_sidebar():
     )


+@dataclass
+class VisualizationComponents:
+    single_model: gr.Tab
+    multi_model: gr.Tab
+
+
+def create_visualization(sidebar: SidebarComponents, lang: str):
+    locale_dict = {
+        'visualization': {
+            'zh': '可视化',
+            'en': 'Visualization'
+        },
+        'single_model': {
+            'zh': '单模型',
+            'en': 'Single Model'
+        },
+        'multi_model': {
+            'zh': '多模型',
+            'en': 'Multi Model'
+        }
+    }
+    with gr.Column(visible=True):
+        gr.Markdown(f'## {locale_dict["visualization"][lang]}')
+        with gr.Tabs():
+            with gr.Tab(locale_dict['single_model'][lang]):
+                single = create_single_model_tab(sidebar, lang)
+
+            with gr.Tab(locale_dict['multi_model'][lang]):
+                multi = create_multi_model_tab(sidebar, lang)
+    return VisualizationComponents(
+        single_model=single,
+        multi_model=multi,
+    )
+
+
 @dataclass
 class SingleModelComponents:
     report_name: gr.Dropdown


-def create_single_model_tab(sidebar: SidebarComponents):
-    report_name = gr.Dropdown(label='Select Report', choices=[], interactive=True)
+def create_single_model_tab(sidebar: SidebarComponents, lang: str):
+    locale_dict = {
+        'select_report': {
+            'zh': '选择报告',
+            'en': 'Select Report'
+        },
+        'task_config': {
+            'zh': '任务配置',
+            'en': 'Task Config'
+        },
+        'datasets_overview': {
+            'zh': '数据集概览',
+            'en': 'Datasets Overview'
+        },
+        'dataset_components': {
+            'zh': '数据集组成',
+            'en': 'Dataset Components'
+        },
+        'dataset_scores': {
+            'zh': '数据集分数',
+            'en': 'Dataset Scores'
+        },
+        'dataset_scores_table': {
+            'zh': '数据集分数表',
+            'en': 'Dataset Scores Table'
+        },
+        'dataset_details': {
+            'zh': '数据集详情',
+            'en': 'Dataset Details'
+        },
+        'select_dataset': {
+            'zh': '选择数据集',
+            'en': 'Select Dataset'
+        },
+        'model_prediction': {
+            'zh': '模型预测',
+            'en': 'Model Prediction'
+        },
+        'select_subset': {
+            'zh': '选择子集',
+            'en': 'Select Subset'
+        },
+        'answer_mode': {
+            'zh': '答案模式',
+            'en': 'Answer Mode'
+        },
+        'page': {
+            'zh': '页码',
+            'en': 'Page'
+        }
+    }
+
+    # Update the UI components with localized labels
+    report_name = gr.Dropdown(label=locale_dict['select_report'][lang], choices=[], interactive=True)
     work_dir = gr.State(None)
     model_name = gr.State(None)

-    with gr.Accordion('Task Config', open=False):
+    with gr.Accordion(locale_dict['task_config'][lang], open=False):
         task_config = gr.JSON(value=None)

     report_list = gr.State([])

-    with gr.Tab('Datasets Overview'):
-        gr.Markdown('### Dataset Components')
-        sunburst_plot = gr.Plot(value=None, scale=1, label='Components')
-        gr.Markdown('### Dataset Scores')
-        score_plot = gr.Plot(value=None, scale=1, label='Scores')
-        gr.Markdown('### Dataset Scores Table')
+    with gr.Tab(locale_dict['datasets_overview'][lang]):
+        gr.Markdown(f'### {locale_dict["dataset_components"][lang]}')
+        sunburst_plot = gr.Plot(value=None, scale=1, label=locale_dict['dataset_components'][lang])
+        gr.Markdown(f'### {locale_dict["dataset_scores"][lang]}')
+        score_plot = gr.Plot(value=None, scale=1, label=locale_dict['dataset_scores'][lang])
+        gr.Markdown(f'### {locale_dict["dataset_scores_table"][lang]}')
         score_table = gr.DataFrame(value=None)

-    with gr.Tab('Dataset Details'):
-        dataset_radio = gr.Radio(label='Select Dataset', choices=[], show_label=True, interactive=True)
-        gr.Markdown('### Dataset Scores')
-        dataset_plot = gr.Plot(value=None, scale=1, label='Scores')
-        gr.Markdown('### Dataset Scores Table')
+    with gr.Tab(locale_dict['dataset_details'][lang]):
+        dataset_radio = gr.Radio(
+            label=locale_dict['select_dataset'][lang], choices=[], show_label=True, interactive=True)
+        gr.Markdown(f'### {locale_dict["dataset_scores"][lang]}')
+        dataset_plot = gr.Plot(value=None, scale=1, label=locale_dict['dataset_scores'][lang])
+        gr.Markdown(f'### {locale_dict["dataset_scores_table"][lang]}')
         dataset_table = gr.DataFrame(value=None)

-        gr.Markdown('### Model Prediction')
-        subset_radio = gr.Radio(label='Select Subset', choices=[], show_label=True, interactive=True)
+        gr.Markdown(f'### {locale_dict["model_prediction"][lang]}')
+        subset_select = gr.Dropdown(
+            label=locale_dict['select_subset'][lang], choices=[], show_label=True, interactive=True)
         with gr.Row():
             answer_mode_radio = gr.Radio(
-                label='Answer Mode', choices=['All', 'Pass', 'Fail'], value='All', interactive=True)
-            page_number = gr.Number(value=1, label='Page', minimum=1, maximum=1, step=1, interactive=True)
+                label=locale_dict['answer_mode'][lang], choices=['All', 'Pass', 'Fail'], value='All', interactive=True)
+            page_number = gr.Number(
+                value=1, label=locale_dict['page'][lang], minimum=1, maximum=1, step=1, interactive=True)
         answer_mode_counts = gr.Markdown('', label='Counts')
         data_review_df = gr.State(None)
         filtered_review_df = gr.State(None)
@@ -354,7 +512,7 @@ def create_single_model_tab(sidebar: SidebarComponents):
                 'right': '\\]',
                 'display': True
             }],
-            max_height=500)
+            max_height=600)

     @report_name.change(
         inputs=[sidebar.root_path, report_name],
@@ -367,26 +525,28 @@ def create_single_model_tab(sidebar: SidebarComponents):

     @report_list.change(inputs=[report_list], outputs=[score_plot, score_table, sunburst_plot])
     def update_single_report_score(report_list):
-        report_score_df = get_acc_report_df(report_list)
+        report_score_df, styler = get_acc_report_df(report_list)
         report_score_plot = plot_single_report_scores(report_score_df)
         report_sunburst_plot = plot_single_report_sunburst(report_list)
-        return report_score_plot, report_score_df, report_sunburst_plot
+        return report_score_plot, styler, report_sunburst_plot

     @gr.on(
         triggers=[dataset_radio.change, report_list.change],
         inputs=[dataset_radio, report_list],
-        outputs=[dataset_plot, dataset_table, subset_radio])
+        outputs=[dataset_plot, dataset_table, subset_select, data_review_df])
     def update_single_report_dataset(dataset_name, report_list):
         logger.debug(f'Updating single report dataset: {dataset_name}')
         report_df = get_data_frame(report_list)
-        data_score_df = get_single_dataset_data(report_df, dataset_name)
+        data_score_df, styler = get_single_dataset_df(report_df, dataset_name)
         data_score_plot = plot_single_dataset_scores(data_score_df)
         subsets = data_score_df[ReportKey.subset_name].unique().tolist()
         logger.debug(f'subsets: {subsets}')
-        return data_score_plot, data_score_df, gr.update(choices=subsets, value=subsets[0])
+        return data_score_plot, styler, gr.update(choices=subsets, value=None), None

-    @subset_radio.change(
-        inputs=[work_dir, model_name, dataset_radio, subset_radio], outputs=[data_review_df, page_number])
+    @gr.on(
+        triggers=[subset_select.change],
+        inputs=[work_dir, model_name, dataset_radio, subset_select],
+        outputs=[data_review_df, page_number])
     def update_single_report_subset(work_dir, model_name, dataset_name, subset_name):
         if not subset_name:
             return gr.skip()
@@ -424,10 +584,10 @@ def create_single_model_tab(sidebar: SidebarComponents):
         inputs=[filtered_review_df, page_number],
         outputs=[data_review_table])
     def update_table(filtered_df, page_number):
-        subset_df = get_table_data(filtered_df, page_number)
-        if subset_df is None:
-            return gr.skip()
-        return subset_df
+        if filtered_df is None:
+            return gr.update(value=None)
+        subset_df, styler = get_table_data(filtered_df, page_number)
+        return styler

     return SingleModelComponents(report_name=report_name)

@@ -437,11 +597,26 @@ class MultiModelComponents:
     multi_report_name: gr.Dropdown


-def create_multi_model_tab(sidebar: SidebarComponents):
-    multi_report_name = gr.Dropdown(label='Select Reports', choices=[], multiselect=True, interactive=True)
-    gr.Markdown('### Model Radar')
+def create_multi_model_tab(sidebar: SidebarComponents, lang: str):
+    locale_dict = {
+        'select_reports': {
+            'zh': '请选择报告',
+            'en': 'Select Reports'
+        },
+        'model_radar': {
+            'zh': '模型对比雷达',
+            'en': 'Model Comparison Radar'
+        },
+        'model_scores': {
+            'zh': '模型对比分数',
+            'en': 'Model Comparison Scores'
+        }
+    }
+    multi_report_name = gr.Dropdown(
+        label=locale_dict['select_reports'][lang], choices=[], multiselect=True, interactive=True)
+    gr.Markdown(locale_dict['model_radar'][lang])
     radar_plot = gr.Plot(value=None)
-    gr.Markdown('### Model Scores')
+    gr.Markdown(locale_dict['model_scores'][lang])
     score_table = gr.DataFrame(value=None)

     @multi_report_name.change(inputs=[sidebar.root_path, multi_report_name], outputs=[radar_plot, score_table])
@@ -449,43 +624,58 @@ def create_multi_model_tab(sidebar: SidebarComponents):
         if not multi_report_name:
             return gr.skip()
         report_list = load_multi_report(root_path, multi_report_name)
-        report_df = get_acc_report_df(report_list)
+        report_df, _ = get_acc_report_df(report_list)
         report_radar_plot = plot_multi_report_radar(report_df)
-        report_compare_df = get_compare_report_df(report_df)
-        return report_radar_plot, report_compare_df
+        _, styler = get_compare_report_df(report_df)
+        return report_radar_plot, styler

     return MultiModelComponents(multi_report_name=multi_report_name)


-def create_app():
+def create_app(args: argparse.Namespace):
+    configure_logging(debug=args.debug)
+    lang = args.lang
+
+    locale_dict = {
+        'title': {
+            'zh': '📈 EvalScope 看板',
+            'en': '📈 Evalscope Dashboard'
+        },
+        'star_beggar': {
+            'zh':
+            '喜欢<a href=\"https://github.com/modelscope/evalscope\" target=\"_blank\">EvalScope</a>就动动手指给我们加个star吧 🥺 ',
+            'en':
+            'If you like <a href=\"https://github.com/modelscope/evalscope\" target=\"_blank\">EvalScope</a>, '
+            'please take a few seconds to star us 🥺 '
+        },
+        'note': {
+            'zh': '请选择报告',
+            'en': 'Please select reports'
+        }
+    }
+
     with gr.Blocks(title='Evalscope Dashboard') as demo:
+        gr.HTML(f'<h1 style="text-align: left;">{locale_dict["title"][lang]} (v{__version__})</h1>')
         with gr.Row():
             with gr.Column(scale=0, min_width=35):
                 toggle_btn = gr.Button('<')
             with gr.Column(scale=1):
-                gr.HTML('<h1 style="text-align: left;">Evalscope Dashboard</h1>')  # 文本列
+                gr.HTML(f'<h3 style="text-align: left;">{locale_dict["star_beggar"][lang]}</h3>')

         with gr.Row():
             with gr.Column(scale=1) as sidebar_column:
                 sidebar_visible = gr.State(True)
-                sidebar = create_sidebar()
+                sidebar = create_sidebar(args.outputs, lang)

             with gr.Column(scale=5):
-
-                with gr.Column(visible=True):
-                    gr.Markdown('## Visualization')
-                    with gr.Tabs():
-                        with gr.Tab('Single Model'):
-                            single = create_single_model_tab(sidebar)
-
-                        with gr.Tab('Multi Model'):
-                            multi = create_multi_model_tab(sidebar)
+                visualization = create_visualization(sidebar, lang)

         @sidebar.load_btn.click(
-            inputs=[sidebar.reports_dropdown], outputs=[single.report_name, multi.multi_report_name])
+            inputs=[sidebar.reports_dropdown],
+            outputs=[visualization.single_model.report_name, visualization.multi_model.multi_report_name])
         def update_displays(reports_dropdown):
             if not reports_dropdown:
-                gr.Warning('No reports found, please check the path', duration=3)
+                gr.Warning(locale_dict['note'][lang], duration=3)
                 return gr.skip()

             return (
@@ -499,8 +689,20 @@ def create_app():
             text = '<' if new_visible else '>'
             return gr.update(visible=new_visible), new_visible, gr.update(value=text)

-    demo.launch()
+    demo.launch(share=args.share, server_name=args.server_name, server_port=args.server_port, debug=args.debug)
+
+
+def add_argument(parser: argparse.ArgumentParser):
+    parser.add_argument('--share', action='store_true', help='Share the app.')
+    parser.add_argument('--server-name', type=str, default='0.0.0.0', help='The server name.')
+    parser.add_argument('--server-port', type=int, default=None, help='The server port.')
+    parser.add_argument('--debug', action='store_true', help='Debug the app.')
+    parser.add_argument('--lang', type=str, default='zh', help='The locale.', choices=['zh', 'en'])
+    parser.add_argument('--outputs', type=str, default='./outputs', help='The outputs dir.')


 if __name__ == '__main__':
-    create_app()
+    parser = argparse.ArgumentParser()
+    add_argument(parser)
+    args = parser.parse_args()
+    create_app(args)
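
With the argparse entry point added above, app.py can be run directly (python evalscope/report/app.py --lang en --outputs ./outputs) or driven programmatically; evalscope/cli/start_app.py, also touched in this release, presumably forwards the same flags. A minimal sketch of the programmatic route, using only the argument names introduced in this diff (the values shown are illustrative):

import argparse

from evalscope.report.app import create_app

# create_app(args) reads exactly the fields registered in add_argument():
# share, server_name, server_port, debug, lang, outputs.
args = argparse.Namespace(
    share=False,           # do not create a public Gradio share link
    server_name='0.0.0.0',
    server_port=None,      # let Gradio pick a free port
    debug=False,
    lang='en',             # 'zh' or 'en' UI labels
    outputs='./outputs',   # root folder scanned for report directories
)
create_app(args)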
evalscope/run.py CHANGED
@@ -46,11 +46,13 @@ def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:

 def setup_work_directory(task_cfg: TaskConfig, run_time: str):
     """Set the working directory for the task."""
+    # use cache
     if task_cfg.use_cache:
         task_cfg.work_dir = task_cfg.use_cache
         logger.info(f'Set resume from {task_cfg.work_dir}')
     # elif are_paths_same(task_cfg.work_dir, DEFAULT_WORK_DIR):
-    task_cfg.work_dir = os.path.join(task_cfg.work_dir, run_time)
+    else:
+        task_cfg.work_dir = os.path.join(task_cfg.work_dir, run_time)

     outputs = OutputsStructure(outputs_dir=task_cfg.work_dir)

@@ -112,7 +114,7 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
     logger.info(task_cfg)

     for evaluator in evaluators:
-        res_dict = evaluator.eval(infer_cfg=task_cfg.generation_config, debug=task_cfg.debug, limit=task_cfg.limit)
+        res_dict = evaluator.eval()
         eval_results[dataset_name] = res_dict

     return eval_results
@@ -124,21 +126,22 @@ def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsSt
     from evalscope.evaluator import Evaluator
     from evalscope.models import initialize_model_adapter

+    benchmark: BenchmarkMeta = Benchmark.get(dataset_name)
+    # Initialize data adapter
+    data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
+
     if dataset_name == DataCollection.NAME:
         # EvaluatorCollection is a collection of evaluators
         from evalscope.collections import EvaluatorCollection
-        return EvaluatorCollection(task_cfg, outputs)
+        return EvaluatorCollection(task_cfg, data_adapter, outputs)

-    benchmark: BenchmarkMeta = Benchmark.get(dataset_name)
-
-    data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
+    # Initialize model adapter
     model_adapter = initialize_model_adapter(task_cfg, benchmark.model_adapter, base_model)

     # update task_cfg.dataset_args
     task_cfg.dataset_args[dataset_name] = benchmark.to_string_dict()

     return Evaluator(
-        dataset_name_or_path=benchmark.dataset_id,
         data_adapter=data_adapter,
         model_adapter=model_adapter,
         outputs=outputs,
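
The setup_work_directory change above makes use_cache behave as a resume path: when it is set, the previous work_dir is reused as-is, and only fresh runs get a new timestamped subdirectory. A hedged sketch of resuming an earlier run this way (assumes the TaskConfig and run_task entry points from evalscope.config and evalscope.run; the model and dataset values are placeholders):

from evalscope.config import TaskConfig
from evalscope.run import run_task

# Point use_cache at a previous outputs directory so its work_dir is reused
# instead of a new timestamped one being created (see setup_work_directory above).
task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',     # placeholder model id
    datasets=['gsm8k'],                     # placeholder dataset list
    use_cache='./outputs/20250101_120000',  # hypothetical earlier run directory
)
run_task(task_cfg)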