evalscope 0.16.2__py3-none-any.whl → 0.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (117)
  1. evalscope/app/app.py +9 -762
  2. evalscope/app/constants.py +1 -0
  3. evalscope/app/ui/__init__.py +20 -0
  4. evalscope/app/ui/app_ui.py +52 -0
  5. evalscope/app/ui/multi_model.py +323 -0
  6. evalscope/app/ui/sidebar.py +42 -0
  7. evalscope/app/ui/single_model.py +202 -0
  8. evalscope/app/ui/visualization.py +36 -0
  9. evalscope/app/utils/data_utils.py +178 -0
  10. evalscope/app/utils/localization.py +221 -0
  11. evalscope/app/utils/text_utils.py +119 -0
  12. evalscope/app/utils/visualization.py +91 -0
  13. evalscope/backend/opencompass/backend_manager.py +2 -1
  14. evalscope/backend/rag_eval/backend_manager.py +2 -1
  15. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  16. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
  17. evalscope/benchmarks/__init__.py +15 -1
  18. evalscope/benchmarks/aime/aime24_adapter.py +2 -1
  19. evalscope/benchmarks/aime/aime25_adapter.py +2 -1
  20. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
  21. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  22. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
  23. evalscope/benchmarks/arena_hard/utils.py +0 -12
  24. evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
  25. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
  26. evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
  27. evalscope/benchmarks/data_adapter.py +20 -5
  28. evalscope/benchmarks/general_arena/__init__.py +0 -0
  29. evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
  30. evalscope/benchmarks/general_arena/utils.py +226 -0
  31. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  32. evalscope/benchmarks/general_qa/general_qa_adapter.py +42 -29
  33. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  34. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
  35. evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
  36. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
  37. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
  38. evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
  39. evalscope/benchmarks/mmlu/mmlu_adapter.py +1 -1
  40. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  41. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
  42. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  43. evalscope/benchmarks/race/race_adapter.py +1 -1
  44. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
  45. evalscope/benchmarks/utils.py +1 -2
  46. evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
  47. evalscope/config.py +8 -123
  48. evalscope/evaluator/evaluator.py +15 -12
  49. evalscope/metrics/__init__.py +6 -0
  50. evalscope/{utils/utils.py → metrics/completion_parsers.py} +68 -180
  51. evalscope/metrics/llm_judge.py +105 -20
  52. evalscope/metrics/metrics.py +1 -1
  53. evalscope/models/adapters/base_adapter.py +0 -2
  54. evalscope/models/adapters/server_adapter.py +2 -2
  55. evalscope/models/custom/dummy_model.py +3 -3
  56. evalscope/perf/arguments.py +2 -16
  57. evalscope/perf/main.py +1 -1
  58. evalscope/perf/utils/analysis_result.py +24 -23
  59. evalscope/perf/utils/benchmark_util.py +1 -1
  60. evalscope/report/__init__.py +1 -1
  61. evalscope/report/utils.py +34 -15
  62. evalscope/run.py +1 -1
  63. evalscope/summarizer.py +1 -2
  64. evalscope/utils/__init__.py +63 -2
  65. evalscope/utils/argument_utils.py +64 -0
  66. evalscope/utils/import_utils.py +16 -0
  67. evalscope/utils/io_utils.py +45 -4
  68. evalscope/utils/model_utils.py +37 -1
  69. evalscope/version.py +2 -2
  70. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/METADATA +55 -26
  71. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/RECORD +90 -101
  72. tests/aigc/test_t2i.py +1 -1
  73. tests/cli/test_all.py +50 -2
  74. tests/cli/test_collection.py +1 -1
  75. tests/cli/test_custom.py +261 -0
  76. tests/cli/test_run.py +13 -37
  77. tests/perf/test_perf.py +2 -2
  78. tests/rag/test_clip_benchmark.py +2 -1
  79. tests/rag/test_mteb.py +3 -1
  80. tests/rag/test_ragas.py +3 -1
  81. tests/swift/test_run_swift_eval.py +2 -1
  82. tests/swift/test_run_swift_vlm_eval.py +2 -1
  83. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
  84. tests/utils.py +13 -0
  85. tests/vlm/test_vlmeval.py +8 -2
  86. evalscope/evaluator/rating_eval.py +0 -157
  87. evalscope/evaluator/reviewer/__init__.py +0 -1
  88. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  89. evalscope/registry/__init__.py +0 -1
  90. evalscope/registry/config/cfg_arena.yaml +0 -77
  91. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  92. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  93. evalscope/registry/config/cfg_single.yaml +0 -78
  94. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  95. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  96. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  97. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  98. evalscope/registry/data/question.jsonl +0 -80
  99. evalscope/registry/tasks/arc.yaml +0 -28
  100. evalscope/registry/tasks/bbh.yaml +0 -26
  101. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  102. evalscope/registry/tasks/ceval.yaml +0 -27
  103. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  104. evalscope/registry/tasks/cmmlu.yaml +0 -27
  105. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  106. evalscope/registry/tasks/general_qa.yaml +0 -27
  107. evalscope/registry/tasks/gsm8k.yaml +0 -29
  108. evalscope/registry/tasks/mmlu.yaml +0 -29
  109. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  110. evalscope/run_arena.py +0 -202
  111. evalscope/utils/arena_utils.py +0 -217
  112. evalscope/utils/completion_parsers.py +0 -82
  113. /evalscope/{utils → benchmarks}/filters.py +0 -0
  114. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/LICENSE +0 -0
  115. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/WHEEL +0 -0
  116. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/entry_points.txt +0 -0
  117. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/top_level.txt +0 -0
evalscope/app/app.py CHANGED
@@ -1,776 +1,23 @@
+ """
+ Main application module for the Evalscope dashboard.
+ """
  import argparse
- import glob
- import gradio as gr
- import json
- import numpy as np
- import os
- import pandas as pd
- import plotly.express as px
- import plotly.graph_objects as go
- import re
- from dataclasses import dataclass
- from typing import Any, List, Union

- from evalscope.constants import DataCollection
- from evalscope.report import Report, ReportKey, get_data_frame, get_report_list
- from evalscope.utils.io_utils import OutputsStructure, yaml_to_dict
- from evalscope.utils.logger import configure_logging, get_logger
- from evalscope.version import __version__
+ from evalscope.utils.logger import configure_logging

  from .arguments import add_argument
- from .constants import DATASET_TOKEN, LATEX_DELIMITERS, MODEL_TOKEN, PLOTLY_THEME, REPORT_TOKEN
+ from .ui import create_app_ui

- logger = get_logger()

-
- def scan_for_report_folders(root_path):
- """Scan for folders containing reports subdirectories"""
- logger.debug(f'Scanning for report folders in {root_path}')
- if not os.path.exists(root_path):
- return []
-
- reports = []
- # Iterate over all folders in the root path
- for folder in glob.glob(os.path.join(root_path, '*')):
- # Check if reports folder exists
- reports_path = os.path.join(folder, OutputsStructure.REPORTS_DIR)
- if not os.path.exists(reports_path):
- continue
-
- # Iterate over all items in reports folder
- for model_item in glob.glob(os.path.join(reports_path, '*')):
- if not os.path.isdir(model_item):
- continue
- datasets = []
- for dataset_item in glob.glob(os.path.join(model_item, '*.json')):
- datasets.append(os.path.splitext(os.path.basename(dataset_item))[0])
- datasets = DATASET_TOKEN.join(datasets)
- reports.append(
- f'{os.path.basename(folder)}{REPORT_TOKEN}{os.path.basename(model_item)}{MODEL_TOKEN}{datasets}')
-
- reports = sorted(reports, reverse=True)
- logger.debug(f'reports: {reports}')
- return reports
-
-
- def process_report_name(report_name: str):
- prefix, report_name = report_name.split(REPORT_TOKEN)
- model_name, datasets = report_name.split(MODEL_TOKEN)
- datasets = datasets.split(DATASET_TOKEN)
- return prefix, model_name, datasets
-
-
- def load_single_report(root_path: str, report_name: str):
- prefix, model_name, datasets = process_report_name(report_name)
- report_path_list = os.path.join(root_path, prefix, OutputsStructure.REPORTS_DIR, model_name)
- report_list = get_report_list([report_path_list])
-
- task_cfg_path = glob.glob(os.path.join(root_path, prefix, OutputsStructure.CONFIGS_DIR, '*.yaml'))[0]
- task_cfg = yaml_to_dict(task_cfg_path)
- return report_list, datasets, task_cfg
-
-
- def load_multi_report(root_path: str, report_names: List[str]):
- report_list = []
- for report_name in report_names:
- prefix, model_name, datasets = process_report_name(report_name)
- report_path_list = os.path.join(root_path, prefix, OutputsStructure.REPORTS_DIR, model_name)
- reports = get_report_list([report_path_list])
- report_list.extend(reports)
- return report_list
-
-
- def get_acc_report_df(report_list: List[Report]):
- data_dict = []
- for report in report_list:
- if report.name == DataCollection.NAME:
- for metric in report.metrics:
- for category in metric.categories:
- item = {
- ReportKey.model_name: report.model_name,
- ReportKey.dataset_name: '/'.join(category.name),
- ReportKey.score: category.score,
- ReportKey.num: category.num,
- }
- data_dict.append(item)
- else:
- item = {
- ReportKey.model_name: report.model_name,
- ReportKey.dataset_name: report.dataset_name,
- ReportKey.score: report.score,
- ReportKey.num: report.metrics[0].num,
- }
- data_dict.append(item)
- df = pd.DataFrame.from_dict(data_dict, orient='columns')
-
- styler = style_df(df, columns=[ReportKey.score])
- return df, styler
-
-
- def style_df(df: pd.DataFrame, columns: List[str] = None):
- # Apply background gradient to the specified columns
- styler = df.style.background_gradient(subset=columns, cmap='RdYlGn', vmin=0.0, vmax=1.0, axis=0)
- # Format the dataframe with a precision of 4 decimal places
- styler.format(precision=4)
- return styler
-
-
- def get_compare_report_df(acc_df: pd.DataFrame):
- df = acc_df.pivot_table(index=ReportKey.model_name, columns=ReportKey.dataset_name, values=ReportKey.score)
- df.reset_index(inplace=True)
-
- styler = style_df(df)
- return df, styler
-
-
- def plot_single_report_scores(df: pd.DataFrame):
- if df is None:
- return None
- logger.debug(f'df: {df}')
- plot = px.bar(df, x=df[ReportKey.dataset_name], y=df[ReportKey.score], text=df[ReportKey.score])
-
- width = 0.2 if len(df[ReportKey.dataset_name]) <= 5 else None
- plot.update_traces(width=width, texttemplate='%{text:.2f}', textposition='outside')
- plot.update_layout(uniformtext_minsize=12, uniformtext_mode='hide', yaxis=dict(range=[0, 1]), template=PLOTLY_THEME)
- return plot
-
-
- def plot_single_report_sunburst(report_list: List[Report]):
- if report_list[0].name == DataCollection.NAME:
- df = get_data_frame(report_list=report_list)
- categories = sorted([i for i in df.columns if i.startswith(ReportKey.category_prefix)])
- path = categories + [ReportKey.subset_name]
- else:
- df = get_data_frame(report_list=report_list, flatten_metrics=False)
- categories = sorted([i for i in df.columns if i.startswith(ReportKey.category_prefix)])
- path = [ReportKey.dataset_name] + categories + [ReportKey.subset_name]
- logger.debug(f'df: {df}')
- df[categories] = df[categories].fillna('default') # NOTE: fillna for empty categories
-
- plot = px.sunburst(
- df,
- path=path,
- values=ReportKey.num,
- color=ReportKey.score,
- color_continuous_scale='RdYlGn', # see https://plotly.com/python/builtin-colorscales/
- color_continuous_midpoint=np.average(df[ReportKey.score], weights=df[ReportKey.num]),
- template=PLOTLY_THEME,
- maxdepth=4)
- plot.update_traces(insidetextorientation='radial')
- plot.update_layout(margin=dict(t=10, l=10, r=10, b=10), coloraxis=dict(cmin=0, cmax=1), height=600)
- return plot
-
-
- def get_single_dataset_df(df: pd.DataFrame, dataset_name: str):
- df = df[df[ReportKey.dataset_name] == dataset_name]
- styler = style_df(df, columns=[ReportKey.score])
- return df, styler
-
-
- def get_report_analysis(report_list: List[Report], dataset_name: str) -> str:
- for report in report_list:
- if report.dataset_name == dataset_name:
- return report.analysis
- return 'N/A'
-
-
- def plot_single_dataset_scores(df: pd.DataFrame):
- # TODO: add metric radio and relace category name
- plot = px.bar(
- df,
- x=df[ReportKey.metric_name],
- y=df[ReportKey.score],
- color=df[ReportKey.subset_name],
- text=df[ReportKey.score],
- barmode='group')
-
- width = 0.2 if len(df[ReportKey.subset_name]) <= 3 else None
- plot.update_traces(width=width, texttemplate='%{text:.2f}', textposition='outside')
- plot.update_layout(uniformtext_minsize=12, uniformtext_mode='hide', yaxis=dict(range=[0, 1]), template=PLOTLY_THEME)
- return plot
-
-
- def plot_multi_report_radar(df: pd.DataFrame):
- fig = go.Figure()
-
- grouped = df.groupby(ReportKey.model_name)
- common_datasets = set.intersection(*[set(group[ReportKey.dataset_name]) for _, group in grouped])
-
- for model_name, group in grouped:
- common_group = group[group[ReportKey.dataset_name].isin(common_datasets)]
- fig.add_trace(
- go.Scatterpolar(
- r=common_group[ReportKey.score],
- theta=common_group[ReportKey.dataset_name],
- name=model_name,
- fill='toself'))
-
- fig.update_layout(
- template=PLOTLY_THEME,
- polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
- margin=dict(t=20, l=20, r=20, b=20))
- return fig
-
-
- def convert_markdown_image(text):
- if not os.path.isfile(text):
- return text
- # Convert the image path to a markdown image tag
- if text.endswith('.png') or text.endswith('.jpg') or text.endswith('.jpeg'):
- text = os.path.abspath(text)
- image_tag = f'![image](gradio_api/file={text})'
- logger.debug(f'Converting image path to markdown: {text} -> {image_tag}')
- return image_tag
- return text
-
-
- def convert_html_tags(text):
- # match begin label
- text = re.sub(r'<(\w+)>', r'[\1]', text)
- # match end label
- text = re.sub(r'</(\w+)>', r'[/\1]', text)
- return text
-
-
- def process_string(string: str, max_length: int = 2048) -> str:
- string = convert_html_tags(string) # for display labels e.g.
- if max_length and len(string) > max_length:
- return f'{string[:max_length // 2]}...[truncate]...{string[-max_length // 2:]}'
- return string
-
-
- def dict_to_markdown(data) -> str:
- markdown_lines = []
-
- for key, value in data.items():
- bold_key = f'**{key}**'
-
- if isinstance(value, list):
- value_str = '\n' + '\n'.join([f'- {process_model_prediction(item, max_length=None)}' for item in value])
- elif isinstance(value, dict):
- value_str = dict_to_markdown(value)
- else:
- value_str = str(value)
-
- value_str = process_string(value_str, max_length=None) # Convert HTML tags but don't truncate
- markdown_line = f'{bold_key}:\n{value_str}'
- markdown_lines.append(markdown_line)
-
- return '\n\n'.join(markdown_lines)
-
-
- def process_model_prediction_old(item: Any, max_length: int = 2048) -> str:
+ def create_app(args: argparse.Namespace):
  """
- Process model prediction output into a formatted string.
+ Create and launch the Evalscope dashboard application.

  Args:
- item: The item to process. Can be a string, list, or dictionary.
- max_length: The maximum length of the output string.
-
- Returns:
- A formatted string representation of the input.
+ args: Command line arguments.
  """
- if isinstance(item, dict):
- result = dict_to_markdown(item)
- elif isinstance(item, list):
- result = '\n'.join([f'- {process_model_prediction(i, max_length=None)}' for i in item])
- else:
- result = str(item)
-
- # Apply HTML tag conversion and truncation only at the final output
- if max_length is not None:
- return process_string(result, max_length)
- return result
-
-
- def process_model_prediction(item: Any, max_length: int = 4096) -> str:
- if isinstance(item, (dict, list)):
- result = json.dumps(item, ensure_ascii=False, indent=2)
- result = f'```json\n{result}\n```'
- else:
- result = str(item)
-
- # Apply HTML tag conversion and truncation only at the final output
- if max_length is not None:
- return process_string(result, max_length)
-
- return result
-
-
- def normalize_score(score):
- try:
- if isinstance(score, bool):
- return 1.0 if score else 0.0
- elif isinstance(score, dict):
- for key in score:
- return float(score[key])
- return 0.0
- else:
- return float(score)
- except (ValueError, TypeError):
- return 0.0
-
-
- def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subset_name: str):
- data_path = os.path.join(work_dir, OutputsStructure.REVIEWS_DIR, model_name)
- subset_name = subset_name.replace('/', '_') # for collection report
- review_path = os.path.join(data_path, f'{dataset_name}_{subset_name}.jsonl')
- logger.debug(f'review_path: {review_path}')
- origin_df = pd.read_json(review_path, lines=True)
-
- ds = []
- for i, item in origin_df.iterrows():
- raw_input = item['raw_input']
- for choice in item['choices']:
- raw_pred_answer = choice['message']['content']
- parsed_gold_answer = choice['review']['gold']
- parsed_pred_answer = choice['review']['pred']
- score = choice['review']['result']
- raw_d = {
- 'Input': raw_input,
- 'Generated': raw_pred_answer,
- 'Gold': parsed_gold_answer if parsed_gold_answer != raw_input else '*Same as Input*',
- 'Pred': parsed_pred_answer,
- 'Score': score,
- 'NScore': normalize_score(score)
- }
- ds.append(raw_d)
-
- df_subset = pd.DataFrame(ds)
- return df_subset
-
-
- @dataclass
- class SidebarComponents:
- root_path: gr.Textbox
- reports_dropdown: gr.Dropdown
- load_btn: gr.Button
-
-
- def create_sidebar(outputs_dir: str, lang: str):
- locale_dict = {
- 'settings': {
- 'zh': '设置',
- 'en': 'Settings'
- },
- 'report_root_path': {
- 'zh': '报告根路径',
- 'en': 'Report Root Path'
- },
- 'select_reports': {
- 'zh': '请选择报告',
- 'en': 'Select Reports'
- },
- 'load_btn': {
- 'zh': '加载并查看',
- 'en': 'Load & View'
- },
- 'note': {
- 'zh': '请选择报告并点击`加载并查看`来查看数据',
- 'en': 'Please select reports and click `Load & View` to view the data'
- },
- 'warning': {
- 'zh': '没有找到报告,请检查路径',
- 'en': 'No reports found, please check the path'
- }
- }
-
- gr.Markdown(f'## {locale_dict["settings"][lang]}')
- root_path = gr.Textbox(
- label=locale_dict['report_root_path'][lang], value=outputs_dir, placeholder=outputs_dir, lines=1)
- reports_dropdown = gr.Dropdown(
- label=locale_dict['select_reports'][lang], choices=[], multiselect=True, interactive=True)
- load_btn = gr.Button(locale_dict['load_btn'][lang])
- gr.Markdown(f'### {locale_dict["note"][lang]}')
-
- @reports_dropdown.focus(inputs=[root_path], outputs=[reports_dropdown])
- def update_dropdown_choices(root_path):
- folders = scan_for_report_folders(root_path)
- if len(folders) == 0:
- gr.Warning(locale_dict['warning'][lang], duration=3)
- return gr.update(choices=folders)
-
- return SidebarComponents(
- root_path=root_path,
- reports_dropdown=reports_dropdown,
- load_btn=load_btn,
- )
-
-
- @dataclass
- class VisualizationComponents:
- single_model: gr.Tab
- multi_model: gr.Tab
-
-
- def create_visualization(sidebar: SidebarComponents, lang: str):
- locale_dict = {
- 'visualization': {
- 'zh': '可视化',
- 'en': 'Visualization'
- },
- 'single_model': {
- 'zh': '单模型',
- 'en': 'Single Model'
- },
- 'multi_model': {
- 'zh': '多模型',
- 'en': 'Multi Model'
- }
- }
- with gr.Column(visible=True):
- gr.Markdown(f'## {locale_dict["visualization"][lang]}')
- with gr.Tabs():
- with gr.Tab(locale_dict['single_model'][lang]):
- single = create_single_model_tab(sidebar, lang)
-
- with gr.Tab(locale_dict['multi_model'][lang]):
- multi = create_multi_model_tab(sidebar, lang)
- return VisualizationComponents(
- single_model=single,
- multi_model=multi,
- )
-
-
- @dataclass
- class SingleModelComponents:
- report_name: gr.Dropdown
-
-
- def create_single_model_tab(sidebar: SidebarComponents, lang: str):
- locale_dict = {
- 'select_report': {
- 'zh': '选择报告',
- 'en': 'Select Report'
- },
- 'task_config': {
- 'zh': '任务配置',
- 'en': 'Task Config'
- },
- 'datasets_overview': {
- 'zh': '数据集概览',
- 'en': 'Datasets Overview'
- },
- 'dataset_components': {
- 'zh': '数据集组成',
- 'en': 'Dataset Components'
- },
- 'dataset_scores': {
- 'zh': '数据集分数',
- 'en': 'Dataset Scores'
- },
- 'report_analysis': {
- 'zh': '报告智能分析',
- 'en': 'Report Intelligent Analysis'
- },
- 'dataset_scores_table': {
- 'zh': '数据集分数表',
- 'en': 'Dataset Scores Table'
- },
- 'dataset_details': {
- 'zh': '数据集详情',
- 'en': 'Dataset Details'
- },
- 'select_dataset': {
- 'zh': '选择数据集',
- 'en': 'Select Dataset'
- },
- 'model_prediction': {
- 'zh': '模型预测',
- 'en': 'Model Prediction'
- },
- 'select_subset': {
- 'zh': '选择子集',
- 'en': 'Select Subset'
- },
- 'answer_mode': {
- 'zh': '答案模式',
- 'en': 'Answer Mode'
- },
- 'page': {
- 'zh': '页码',
- 'en': 'Page'
- },
- 'score_threshold': {
- 'zh': '分数阈值',
- 'en': 'Score Threshold'
- },
- }
-
- # Update the UI components with localized labels
- report_name = gr.Dropdown(label=locale_dict['select_report'][lang], choices=[], interactive=True)
- work_dir = gr.State(None)
- model_name = gr.State(None)
-
- with gr.Accordion(locale_dict['task_config'][lang], open=False):
- task_config = gr.JSON(value=None)
-
- report_list = gr.State([])
-
- with gr.Tab(locale_dict['datasets_overview'][lang]):
- gr.Markdown(f'### {locale_dict["dataset_components"][lang]}')
- sunburst_plot = gr.Plot(value=None, scale=1, label=locale_dict['dataset_components'][lang])
- gr.Markdown(f'### {locale_dict["dataset_scores"][lang]}')
- score_plot = gr.Plot(value=None, scale=1, label=locale_dict['dataset_scores'][lang])
- gr.Markdown(f'### {locale_dict["dataset_scores_table"][lang]}')
- score_table = gr.DataFrame(value=None)
-
- with gr.Tab(locale_dict['dataset_details'][lang]):
- dataset_radio = gr.Radio(
- label=locale_dict['select_dataset'][lang], choices=[], show_label=True, interactive=True)
- # show dataset details
- with gr.Accordion(locale_dict['report_analysis'][lang], open=True):
- report_analysis = gr.Markdown(value='N/A', show_copy_button=True)
- gr.Markdown(f'### {locale_dict["dataset_scores"][lang]}')
- dataset_plot = gr.Plot(value=None, scale=1, label=locale_dict['dataset_scores'][lang])
- gr.Markdown(f'### {locale_dict["dataset_scores_table"][lang]}')
- dataset_table = gr.DataFrame(value=None)
-
- gr.Markdown(f'### {locale_dict["model_prediction"][lang]}')
- subset_select = gr.Dropdown(
- label=locale_dict['select_subset'][lang], choices=[], show_label=True, interactive=True)
-
- with gr.Row():
- answer_mode_radio = gr.Radio(
- label=locale_dict['answer_mode'][lang], choices=['All', 'Pass', 'Fail'], value='All', interactive=True)
- score_threshold = gr.Number(value=0.99, label=locale_dict['score_threshold'][lang], interactive=True)
-
- data_review_df = gr.State(None)
- filtered_review_df = gr.State(None)
-
- # show statistics
- with gr.Row(variant='panel'):
- with gr.Column():
- gr.Markdown('### *Counts*')
- answer_mode_counts = gr.Markdown('')
- with gr.Column():
- page_number = gr.Number(
- value=1, label=locale_dict['page'][lang], minimum=1, maximum=1, step=1, interactive=True)
-
- # show data review table
- with gr.Row(variant='panel'):
- with gr.Column():
- gr.Markdown('### *Score*')
- score_text = gr.Markdown(
- '', elem_id='score_text', latex_delimiters=LATEX_DELIMITERS, show_copy_button=True)
- with gr.Column():
- gr.Markdown('### *Normalized Score*')
- nscore = gr.Markdown('', elem_id='score_text', latex_delimiters=LATEX_DELIMITERS)
-
- with gr.Row(variant='panel'):
- with gr.Column():
- gr.Markdown('### *Gold*')
- gold_text = gr.Markdown(
- '', elem_id='gold_text', latex_delimiters=LATEX_DELIMITERS, show_copy_button=True)
- with gr.Column():
- gr.Markdown('### *Pred*')
- pred_text = gr.Markdown(
- '', elem_id='pred_text', latex_delimiters=LATEX_DELIMITERS, show_copy_button=True)
-
- with gr.Row(variant='panel'):
- with gr.Column():
- gr.Markdown('### *Input*')
- input_text = gr.Markdown(
- '', elem_id='input_text', latex_delimiters=LATEX_DELIMITERS, show_copy_button=True)
- with gr.Column():
- gr.Markdown('### *Generated*')
- generated_text = gr.Markdown(
- '', elem_id='generated_text', latex_delimiters=LATEX_DELIMITERS, show_copy_button=True)
-
- @report_name.change(
- inputs=[sidebar.root_path, report_name],
- outputs=[report_list, task_config, dataset_radio, work_dir, model_name])
- def update_single_report_data(root_path, report_name):
- report_list, datasets, task_cfg = load_single_report(root_path, report_name)
- work_dir = os.path.join(root_path, report_name.split(REPORT_TOKEN)[0])
- model_name = report_name.split(REPORT_TOKEN)[1].split(MODEL_TOKEN)[0]
- return (report_list, task_cfg, gr.update(choices=datasets, value=datasets[0]), work_dir, model_name)
-
- @report_list.change(inputs=[report_list], outputs=[score_plot, score_table, sunburst_plot])
- def update_single_report_score(report_list):
- report_score_df, styler = get_acc_report_df(report_list)
- report_score_plot = plot_single_report_scores(report_score_df)
- report_sunburst_plot = plot_single_report_sunburst(report_list)
- return report_score_plot, styler, report_sunburst_plot
-
- @gr.on(
- triggers=[dataset_radio.change, report_list.change],
- inputs=[dataset_radio, report_list],
- outputs=[dataset_plot, dataset_table, subset_select, data_review_df, report_analysis])
- def update_single_report_dataset(dataset_name, report_list):
- logger.debug(f'Updating single report dataset: {dataset_name}')
- report_df = get_data_frame(report_list=report_list)
- analysis = get_report_analysis(report_list, dataset_name)
- data_score_df, styler = get_single_dataset_df(report_df, dataset_name)
- data_score_plot = plot_single_dataset_scores(data_score_df)
- subsets = data_score_df[ReportKey.subset_name].unique().tolist()
- logger.debug(f'subsets: {subsets}')
- return data_score_plot, styler, gr.update(choices=subsets, value=None), None, analysis
-
- @gr.on(
- triggers=[subset_select.change],
- inputs=[work_dir, model_name, dataset_radio, subset_select],
- outputs=[data_review_df, page_number])
- def update_single_report_subset(work_dir, model_name, dataset_name, subset_name):
- if not subset_name:
- return gr.skip()
- data_review_df = get_model_prediction(work_dir, model_name, dataset_name, subset_name)
- return data_review_df, 1
-
- @gr.on(
- triggers=[data_review_df.change, answer_mode_radio.change, score_threshold.change],
- inputs=[data_review_df, answer_mode_radio, score_threshold],
- outputs=[filtered_review_df, page_number, answer_mode_counts])
- def filter_data(data_review_df, answer_mode, score_threshold):
- if data_review_df is None:
- return None, gr.update(value=1, maximum=1), ''
-
- all_count = len(data_review_df)
- pass_df = data_review_df[data_review_df['NScore'] >= score_threshold]
- pass_count = len(pass_df)
- fail_count = all_count - pass_count
-
- counts_text = f'### All: {all_count} | Pass: {pass_count} | Fail: {fail_count}'
-
- if answer_mode == 'Pass':
- filtered_df = pass_df
- elif answer_mode == 'Fail':
- filtered_df = data_review_df[data_review_df['NScore'] < score_threshold]
- else:
- filtered_df = data_review_df
-
- max_page = max(1, len(filtered_df))
-
- return (filtered_df, gr.update(value=1, maximum=max_page), counts_text)
-
- @gr.on(
- triggers=[filtered_review_df.change, page_number.change],
- inputs=[filtered_review_df, page_number, score_threshold],
- outputs=[input_text, generated_text, gold_text, pred_text, score_text, nscore])
- def update_table_components(filtered_df, page_number, score_threshold):
- if filtered_df is None or len(filtered_df) == 0:
- return '', '', '', '', '', ''
-
- # Get single row data for the current page
- start = (page_number - 1)
- if start >= len(filtered_df):
- return '', '', '', '', '', ''
-
- row = filtered_df.iloc[start]
-
- # Process the data for display
- input_md = process_model_prediction(row['Input'])
- generated_md = process_model_prediction(row['Generated'])
- gold_md = process_model_prediction(row['Gold'])
- pred_md = convert_markdown_image(process_model_prediction(row['Pred']))
- score_md = process_model_prediction(row['Score'])
- nscore_val = float(row['NScore']) if not pd.isna(row['NScore']) else 0.0
-
- if nscore_val >= score_threshold:
- nscore_val = f'<div style="background-color:rgb(45,104, 62); padding:10px;">{nscore_val}</div>'
- else:
- nscore_val = f'<div style="background-color:rgb(151, 31, 44); padding:10px;">{nscore_val}</div>'
-
- return input_md, generated_md, gold_md, pred_md, score_md, nscore_val
-
- return SingleModelComponents(report_name=report_name)
-
-
- @dataclass
- class MultiModelComponents:
- multi_report_name: gr.Dropdown
-
-
- def create_multi_model_tab(sidebar: SidebarComponents, lang: str):
- locale_dict = {
- 'select_reports': {
- 'zh': '请选择报告',
- 'en': 'Select Reports'
- },
- 'model_radar': {
- 'zh': '模型对比雷达',
- 'en': 'Model Comparison Radar'
- },
- 'model_scores': {
- 'zh': '模型对比分数',
- 'en': 'Model Comparison Scores'
- }
- }
- multi_report_name = gr.Dropdown(
- label=locale_dict['select_reports'][lang], choices=[], multiselect=True, interactive=True)
- gr.Markdown(locale_dict['model_radar'][lang])
- radar_plot = gr.Plot(value=None)
- gr.Markdown(locale_dict['model_scores'][lang])
- score_table = gr.DataFrame(value=None)
-
- @multi_report_name.change(inputs=[sidebar.root_path, multi_report_name], outputs=[radar_plot, score_table])
- def update_multi_report_data(root_path, multi_report_name):
- if not multi_report_name:
- return gr.skip()
- report_list = load_multi_report(root_path, multi_report_name)
- report_df, _ = get_acc_report_df(report_list)
- report_radar_plot = plot_multi_report_radar(report_df)
- _, styler = get_compare_report_df(report_df)
- return report_radar_plot, styler
-
- return MultiModelComponents(multi_report_name=multi_report_name)
-
-
- def create_app(args: argparse.Namespace):
  configure_logging(debug=args.debug)
- lang = args.lang
-
- locale_dict = {
- 'title': {
- 'zh': '📈 EvalScope 看板',
- 'en': '📈 Evalscope Dashboard'
- },
- 'star_beggar': {
- 'zh':
- '喜欢<a href=\"https://github.com/modelscope/evalscope\" target=\"_blank\">EvalScope</a>就动动手指给我们加个star吧 🥺 ',
- 'en':
- 'If you like <a href=\"https://github.com/modelscope/evalscope\" target=\"_blank\">EvalScope</a>, '
- 'please take a few seconds to star us 🥺 '
- },
- 'note': {
- 'zh': '请选择报告',
- 'en': 'Please select reports'
- }
- }
-
- with gr.Blocks(title='Evalscope Dashboard') as demo:
- gr.HTML(f'<h1 style="text-align: left;">{locale_dict["title"][lang]} (v{__version__})</h1>')
- with gr.Row():
- with gr.Column(scale=0, min_width=35):
- toggle_btn = gr.Button('<')
- with gr.Column(scale=1):
- gr.HTML(f'<h3 style="text-align: left;">{locale_dict["star_beggar"][lang]}</h3>')
-
- with gr.Row():
- with gr.Column(scale=1) as sidebar_column:
- sidebar_visible = gr.State(True)
- sidebar = create_sidebar(args.outputs, lang)
-
- with gr.Column(scale=5):
- visualization = create_visualization(sidebar, lang)
-
- @sidebar.load_btn.click(
- inputs=[sidebar.reports_dropdown],
- outputs=[visualization.single_model.report_name, visualization.multi_model.multi_report_name])
- def update_displays(reports_dropdown):
- if not reports_dropdown:
- gr.Warning(locale_dict['note'][lang], duration=3)
- return gr.skip()
-
- return (
- gr.update(choices=reports_dropdown, value=reports_dropdown[0]), # update single model dropdown
- gr.update(choices=reports_dropdown, value=reports_dropdown) # update multi model dropdown
- )

- @toggle_btn.click(inputs=[sidebar_visible], outputs=[sidebar_column, sidebar_visible, toggle_btn])
- def toggle_sidebar(visible):
- new_visible = not visible
- text = '<' if new_visible else '>'
- return gr.update(visible=new_visible), new_visible, gr.update(value=text)
+ demo = create_app_ui(args)

  demo.launch(
  share=args.share,