evalscope 0.16.2__py3-none-any.whl → 0.17.0__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of evalscope might be problematic; see the package's registry page for more details.

Files changed (117)
  1. evalscope/app/app.py +9 -762
  2. evalscope/app/constants.py +1 -0
  3. evalscope/app/ui/__init__.py +20 -0
  4. evalscope/app/ui/app_ui.py +52 -0
  5. evalscope/app/ui/multi_model.py +323 -0
  6. evalscope/app/ui/sidebar.py +42 -0
  7. evalscope/app/ui/single_model.py +202 -0
  8. evalscope/app/ui/visualization.py +36 -0
  9. evalscope/app/utils/data_utils.py +178 -0
  10. evalscope/app/utils/localization.py +221 -0
  11. evalscope/app/utils/text_utils.py +119 -0
  12. evalscope/app/utils/visualization.py +91 -0
  13. evalscope/backend/opencompass/backend_manager.py +2 -1
  14. evalscope/backend/rag_eval/backend_manager.py +2 -1
  15. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  16. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
  17. evalscope/benchmarks/__init__.py +15 -1
  18. evalscope/benchmarks/aime/aime24_adapter.py +2 -1
  19. evalscope/benchmarks/aime/aime25_adapter.py +2 -1
  20. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
  21. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  22. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
  23. evalscope/benchmarks/arena_hard/utils.py +0 -12
  24. evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
  25. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
  26. evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
  27. evalscope/benchmarks/data_adapter.py +20 -5
  28. evalscope/benchmarks/general_arena/__init__.py +0 -0
  29. evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
  30. evalscope/benchmarks/general_arena/utils.py +226 -0
  31. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  32. evalscope/benchmarks/general_qa/general_qa_adapter.py +42 -29
  33. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  34. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
  35. evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
  36. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
  37. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
  38. evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
  39. evalscope/benchmarks/mmlu/mmlu_adapter.py +1 -1
  40. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  41. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
  42. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  43. evalscope/benchmarks/race/race_adapter.py +1 -1
  44. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
  45. evalscope/benchmarks/utils.py +1 -2
  46. evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
  47. evalscope/config.py +8 -123
  48. evalscope/evaluator/evaluator.py +15 -12
  49. evalscope/metrics/__init__.py +6 -0
  50. evalscope/{utils/utils.py → metrics/completion_parsers.py} +68 -180
  51. evalscope/metrics/llm_judge.py +105 -20
  52. evalscope/metrics/metrics.py +1 -1
  53. evalscope/models/adapters/base_adapter.py +0 -2
  54. evalscope/models/adapters/server_adapter.py +2 -2
  55. evalscope/models/custom/dummy_model.py +3 -3
  56. evalscope/perf/arguments.py +2 -16
  57. evalscope/perf/main.py +1 -1
  58. evalscope/perf/utils/analysis_result.py +24 -23
  59. evalscope/perf/utils/benchmark_util.py +1 -1
  60. evalscope/report/__init__.py +1 -1
  61. evalscope/report/utils.py +34 -15
  62. evalscope/run.py +1 -1
  63. evalscope/summarizer.py +1 -2
  64. evalscope/utils/__init__.py +63 -2
  65. evalscope/utils/argument_utils.py +64 -0
  66. evalscope/utils/import_utils.py +16 -0
  67. evalscope/utils/io_utils.py +45 -4
  68. evalscope/utils/model_utils.py +37 -1
  69. evalscope/version.py +2 -2
  70. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/METADATA +55 -26
  71. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/RECORD +90 -101
  72. tests/aigc/test_t2i.py +1 -1
  73. tests/cli/test_all.py +50 -2
  74. tests/cli/test_collection.py +1 -1
  75. tests/cli/test_custom.py +261 -0
  76. tests/cli/test_run.py +13 -37
  77. tests/perf/test_perf.py +2 -2
  78. tests/rag/test_clip_benchmark.py +2 -1
  79. tests/rag/test_mteb.py +3 -1
  80. tests/rag/test_ragas.py +3 -1
  81. tests/swift/test_run_swift_eval.py +2 -1
  82. tests/swift/test_run_swift_vlm_eval.py +2 -1
  83. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
  84. tests/utils.py +13 -0
  85. tests/vlm/test_vlmeval.py +8 -2
  86. evalscope/evaluator/rating_eval.py +0 -157
  87. evalscope/evaluator/reviewer/__init__.py +0 -1
  88. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  89. evalscope/registry/__init__.py +0 -1
  90. evalscope/registry/config/cfg_arena.yaml +0 -77
  91. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  92. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  93. evalscope/registry/config/cfg_single.yaml +0 -78
  94. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  95. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  96. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  97. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  98. evalscope/registry/data/question.jsonl +0 -80
  99. evalscope/registry/tasks/arc.yaml +0 -28
  100. evalscope/registry/tasks/bbh.yaml +0 -26
  101. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  102. evalscope/registry/tasks/ceval.yaml +0 -27
  103. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  104. evalscope/registry/tasks/cmmlu.yaml +0 -27
  105. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  106. evalscope/registry/tasks/general_qa.yaml +0 -27
  107. evalscope/registry/tasks/gsm8k.yaml +0 -29
  108. evalscope/registry/tasks/mmlu.yaml +0 -29
  109. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  110. evalscope/run_arena.py +0 -202
  111. evalscope/utils/arena_utils.py +0 -217
  112. evalscope/utils/completion_parsers.py +0 -82
  113. /evalscope/{utils → benchmarks}/filters.py +0 -0
  114. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/LICENSE +0 -0
  115. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/WHEEL +0 -0
  116. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/entry_points.txt +0 -0
  117. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/top_level.txt +0 -0
evalscope/app/constants.py
@@ -2,6 +2,7 @@ PLOTLY_THEME = 'plotly_dark'
  REPORT_TOKEN = '@@'
  MODEL_TOKEN = '::'
  DATASET_TOKEN = ', '
+ DEFAULT_BAR_WIDTH = 0.2
  LATEX_DELIMITERS = [{
      'left': '$$',
      'right': '$$',
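DEFAULT_BAR_WIDTH is only declared in this hunk; nothing here shows where it is consumed, though the new app/utils/visualization.py helpers (+91 lines in this release) are the plausible consumer. A minimal, purely hypothetical Plotly sketch of how such a constant is typically applied to grouped bar charts (trace names and values below are invented):

# Hypothetical illustration; not taken from the evalscope source.
import plotly.graph_objects as go

DEFAULT_BAR_WIDTH = 0.2  # value added to evalscope/app/constants.py above

fig = go.Figure()
for model, scores in {'model-a': [0.61, 0.73], 'model-b': [0.58, 0.70]}.items():
    # width is in axis units; a fixed width keeps grouped bars readable
    fig.add_trace(go.Bar(name=model, x=['dataset-1', 'dataset-2'], y=scores, width=DEFAULT_BAR_WIDTH))
fig.update_layout(barmode='group', template='plotly_dark')  # matches PLOTLY_THEME in the same module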
evalscope/app/ui/__init__.py
@@ -0,0 +1,20 @@
+ """
+ UI components for the Evalscope dashboard.
+ """
+ from .app_ui import create_app_ui
+ from .multi_model import MultiModelComponents, create_multi_model_tab
+ from .sidebar import SidebarComponents, create_sidebar
+ from .single_model import SingleModelComponents, create_single_model_tab
+ from .visualization import VisualizationComponents, create_visualization
+
+ __all__ = [
+     'create_app_ui',
+     'SidebarComponents',
+     'create_sidebar',
+     'VisualizationComponents',
+     'create_visualization',
+     'SingleModelComponents',
+     'create_single_model_tab',
+     'MultiModelComponents',
+     'create_multi_model_tab',
+ ]
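With these re-exports in place, the dashboard builders can be imported directly from the new subpackage; a short usage sketch based only on the __all__ list above:

# Import surface declared by the new evalscope/app/ui/__init__.py.
from evalscope.app.ui import (
    MultiModelComponents,
    SidebarComponents,
    SingleModelComponents,
    VisualizationComponents,
    create_app_ui,
    create_multi_model_tab,
    create_sidebar,
    create_single_model_tab,
    create_visualization,
)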
evalscope/app/ui/app_ui.py
@@ -0,0 +1,52 @@
+ """
+ Main UI application for the Evalscope dashboard.
+ """
+ import argparse
+ import gradio as gr
+
+ from evalscope.version import __version__
+ from ..utils.localization import get_app_locale
+ from .sidebar import create_sidebar
+ from .visualization import create_visualization
+
+
+ def create_app_ui(args: argparse.Namespace):
+     lang = args.lang
+     locale_dict = get_app_locale(lang)
+
+     with gr.Blocks(title='Evalscope Dashboard') as demo:
+         gr.HTML(f'<h1 style="text-align: left;">{locale_dict["title"]} (v{__version__})</h1>')
+         with gr.Row():
+             with gr.Column(scale=0, min_width=35):
+                 toggle_btn = gr.Button('<')
+             with gr.Column(scale=1):
+                 gr.HTML(f'<h3 style="text-align: left;">{locale_dict["star_beggar"]}</h3>')
+
+         with gr.Row():
+             with gr.Column(scale=1) as sidebar_column:
+                 sidebar_visible = gr.State(True)
+                 sidebar = create_sidebar(args.outputs, lang)
+
+             with gr.Column(scale=5):
+                 visualization = create_visualization(sidebar, lang)
+
+         @sidebar.load_btn.click(
+             inputs=[sidebar.reports_dropdown],
+             outputs=[visualization.single_model.report_name, visualization.multi_model.multi_report_name])
+         def update_displays(reports_dropdown):
+             if not reports_dropdown:
+                 gr.Warning(locale_dict['note'], duration=3)
+                 return gr.skip()
+
+             return (
+                 gr.update(choices=reports_dropdown, value=reports_dropdown[0]), # update single model dropdown
+                 gr.update(choices=reports_dropdown, value=reports_dropdown) # update multi model dropdown
+             )
+
+         @toggle_btn.click(inputs=[sidebar_visible], outputs=[sidebar_column, sidebar_visible, toggle_btn])
+         def toggle_sidebar(visible):
+             new_visible = not visible
+             text = '<' if new_visible else '>'
+             return gr.update(visible=new_visible), new_visible, gr.update(value=text)
+
+     return demo
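A minimal sketch of driving create_app_ui on its own, assuming only what this hunk shows: the function reads args.lang and args.outputs and returns a gr.Blocks instance. The attribute values below are illustrative, and evalscope's own CLI entry point (evalscope/app/app.py) may build the namespace differently:

# Sketch only; the real entry point wires this up itself.
import argparse

from evalscope.app.ui import create_app_ui

args = argparse.Namespace(lang='en', outputs='./outputs')  # illustrative values
demo = create_app_ui(args)
demo.launch()  # standard Gradio Blocks launch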
evalscope/app/ui/multi_model.py
@@ -0,0 +1,323 @@
+ """
+ Multi model components for the Evalscope dashboard.
+ """
+ import gradio as gr
+ import os
+ import pandas as pd
+ from dataclasses import dataclass
+ from typing import TYPE_CHECKING
+
+ from evalscope.report import ReportKey, get_data_frame
+ from evalscope.utils.logger import get_logger
+ from ..constants import LATEX_DELIMITERS, MODEL_TOKEN, REPORT_TOKEN
+ from ..utils.data_utils import (get_acc_report_df, get_compare_report_df, get_model_prediction, get_single_dataset_df,
+                                 load_multi_report, load_single_report)
+ from ..utils.localization import get_multi_model_locale
+ from ..utils.text_utils import convert_markdown_image, process_model_prediction
+ from ..utils.visualization import plot_multi_report_radar
+
+ if TYPE_CHECKING:
+     from .sidebar import SidebarComponents
+
+ logger = get_logger()
+
+
+ @dataclass
+ class MultiModelComponents:
+     multi_report_name: gr.Dropdown
+
+
+ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
+     locale_dict = get_multi_model_locale(lang)
+
+     multi_report_name = gr.Dropdown(label=locale_dict['select_reports'], choices=[], multiselect=True, interactive=True)
+     report_list = gr.State([])
+
+     with gr.Tab(locale_dict['models_overview']):
+         gr.Markdown(locale_dict['model_radar'])
+         radar_plot = gr.Plot(value=None)
+         gr.Markdown(locale_dict['model_scores'])
+         score_table = gr.DataFrame(value=None)
+
+     with gr.Tab(locale_dict['model_comparison_details']):
+         with gr.Row():
+             model_a_select = gr.Dropdown(label=locale_dict['select_model_a'], choices=[], interactive=True)
+             model_b_select = gr.Dropdown(label=locale_dict['select_model_b'], choices=[], interactive=True)
+
+         # States to store selected models' information
+         model_a_report = gr.State(None)
+         model_b_report = gr.State(None)
+         model_a_dir = gr.State(None)
+         model_b_dir = gr.State(None)
+         model_a_name = gr.State(None)
+         model_b_name = gr.State(None)
+
+         dataset_radio = gr.Radio(label=locale_dict['select_dataset'], choices=[], show_label=True, interactive=True)
+
+         gr.Markdown(f"### {locale_dict['model_predictions']}")
+         subset_select = gr.Dropdown(label=locale_dict['select_subset'], choices=[], show_label=True, interactive=True)
+
+         with gr.Row():
+             answer_mode_radio = gr.Radio(
+                 label=locale_dict.get('answer_mode'),
+                 choices=['All', 'Pass A & B', 'Fail A & B', 'Pass A, Fail B', 'Fail A, Pass B'],
+                 value='All',
+                 interactive=True)
+             score_threshold = gr.Number(value=0.99, label=locale_dict['score_threshold'], interactive=True)
+
+         data_comparison_df = gr.State(None)
+         filtered_comparison_df = gr.State(None)
+
+         # Statistics row
+         with gr.Row(variant='panel'):
+             with gr.Column():
+                 gr.Markdown('### *Counts*')
+                 comparison_counts = gr.Markdown('')
+             with gr.Column():
+                 page_number = gr.Number(
+                     value=1, label=locale_dict['page'], minimum=1, maximum=1, step=1, interactive=True)
+
+         # Input and Gold answer sections remain at the top
+         with gr.Row(variant='panel'):
+             with gr.Column():
+                 gr.Markdown('### *Input*')
+                 input_text = gr.Markdown('', elem_id='input_text', latex_delimiters=LATEX_DELIMITERS)
+
+             with gr.Column():
+                 gr.Markdown('### *Gold Answer*')
+                 gold_text = gr.Markdown('', elem_id='gold_text', latex_delimiters=LATEX_DELIMITERS)
+
+         # Table-like layout for direct comparison
+         with gr.Row():
+             # Headers for the two models
+             with gr.Column(scale=1):
+                 gr.Markdown('### *Model A*')
+             with gr.Column(scale=1):
+                 gr.Markdown('### *Model B*')
+
+         # Score comparison row
+         with gr.Row():
+             with gr.Column(scale=1, variant='panel'):
+                 gr.Markdown('### *Score*')
+                 model_a_score = gr.Markdown('', latex_delimiters=LATEX_DELIMITERS)
+             with gr.Column(scale=1, variant='panel'):
+                 gr.Markdown('### *Score*')
+                 model_b_score = gr.Markdown('', latex_delimiters=LATEX_DELIMITERS)
+
+         # Normalized score comparison row
+         with gr.Row():
+             with gr.Column(scale=1, variant='panel'):
+                 gr.Markdown('### *Normalized Score*')
+                 model_a_nscore = gr.Markdown('', latex_delimiters=LATEX_DELIMITERS)
+             with gr.Column(scale=1, variant='panel'):
+                 gr.Markdown('### *Normalized Score*')
+                 model_b_nscore = gr.Markdown('', latex_delimiters=LATEX_DELIMITERS)
+
+         # Prediction comparison row
+         with gr.Row():
+             with gr.Column(scale=1, variant='panel'):
+                 gr.Markdown('### *Prediction*')
+                 model_a_pred = gr.Markdown('', latex_delimiters=LATEX_DELIMITERS)
+             with gr.Column(scale=1, variant='panel'):
+                 gr.Markdown('### *Prediction*')
+                 model_b_pred = gr.Markdown('', latex_delimiters=LATEX_DELIMITERS)
+
+         # Generated output comparison row
+         with gr.Row():
+             with gr.Column(scale=1, variant='panel'):
+                 gr.Markdown('### *Generated*')
+                 model_a_generated = gr.Markdown('', latex_delimiters=LATEX_DELIMITERS)
+             with gr.Column(scale=1, variant='panel'):
+                 gr.Markdown('### *Generated*')
+                 model_b_generated = gr.Markdown('', latex_delimiters=LATEX_DELIMITERS)
+
+     @multi_report_name.change(
+         inputs=[sidebar.root_path, multi_report_name],
+         outputs=[report_list, radar_plot, score_table, model_a_select, model_b_select])
+     def update_multi_report_data(root_path, multi_report_names):
+         if not multi_report_names:
+             return gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip()
+
+         report_list = load_multi_report(root_path, multi_report_names)
+         report_df, _ = get_acc_report_df(report_list)
+         report_radar_plot = plot_multi_report_radar(report_df)
+         _, styler = get_compare_report_df(report_df)
+
+         # Extract model names for dropdowns
+         model_choices = multi_report_names
+
+         return report_list, report_radar_plot, styler, gr.update(
+             choices=model_choices, value=model_choices[0]), gr.update(
+                 choices=model_choices, value=model_choices[1] if len(model_choices) > 1 else None)
+
+     @gr.on(
+         triggers=[model_a_select.change, model_b_select.change],
+         inputs=[sidebar.root_path, model_a_select, model_b_select],
+         outputs=[model_a_report, model_b_report, model_a_dir, model_b_dir, model_a_name, model_b_name, dataset_radio])
+     def update_selected_models(root_path, model_a, model_b):
+         if not model_a or not model_b:
+             return gr.skip()
+
+         # Load individual reports for both models
+         model_a_reports, datasets_a, _ = load_single_report(root_path, model_a)
+         model_b_reports, datasets_b, _ = load_single_report(root_path, model_b)
+
+         # Get common datasets
+         common_datasets = list(set(datasets_a).intersection(set(datasets_b)))
+
+         # Extract work directories and model names
+         model_a_dir = os.path.join(root_path, model_a.split(REPORT_TOKEN)[0])
+         model_b_dir = os.path.join(root_path, model_b.split(REPORT_TOKEN)[0])
+
+         model_a_name = model_a.split(REPORT_TOKEN)[1].split(MODEL_TOKEN)[0]
+         model_b_name = model_b.split(REPORT_TOKEN)[1].split(MODEL_TOKEN)[0]
+
+         return (model_a_reports, model_b_reports, model_a_dir, model_b_dir, model_a_name, model_b_name,
+                 gr.update(choices=common_datasets, value=common_datasets[0] if common_datasets else None))
+
+     @gr.on(
+         triggers=[dataset_radio.change],
+         inputs=[dataset_radio, model_a_report, model_b_report],
+         outputs=[subset_select, data_comparison_df])
+     def update_dataset_comparison(dataset_name, model_a_report, model_b_report):
+         if not dataset_name or model_a_report is None or model_b_report is None:
+             return gr.skip()
+
+         # Get dataframes for both models
+         report_df_a = get_data_frame(report_list=model_a_report)
+         data_score_df_a, _ = get_single_dataset_df(report_df_a, dataset_name)
+
+         report_df_b = get_data_frame(report_list=model_b_report)
+         data_score_df_b, _ = get_single_dataset_df(report_df_b, dataset_name)
+
+         # Get subset choices - should be same for both models
+         subsets = data_score_df_a[ReportKey.subset_name].unique().tolist()
+
+         return gr.update(choices=subsets, value=None), None
+
+     @gr.on(
+         triggers=[subset_select.change],
+         inputs=[model_a_dir, model_b_dir, model_a_name, model_b_name, dataset_radio, subset_select],
+         outputs=[data_comparison_df, page_number])
+     def update_comparison_data(model_a_dir, model_b_dir, model_a_name, model_b_name, dataset_name, subset_name):
+         if not subset_name or not dataset_name:
+             return gr.skip()
+
+         # Get predictions for both models
+         df_a = get_model_prediction(model_a_dir, model_a_name, dataset_name, subset_name)
+         df_b = get_model_prediction(model_b_dir, model_b_name, dataset_name, subset_name)
+
+         # Merge dataframes on Input and Gold columns for comparison
+         if df_a is not None and df_b is not None:
+             # Save the Index column if it exists
+             index_a = df_a['Index'].copy()
+             index_b = df_b['Index'].copy()
+
+             df_a = df_a.add_prefix('A_')
+             df_b = df_b.add_prefix('B_')
+
+             # Restore the Index column
+             df_a['Index'] = index_a
+             df_b['Index'] = index_b
+
+             # Merge on Index
+             comparison_df = pd.merge(df_a, df_b, on='Index')
+
+             return comparison_df, 1
+
+         return None, 1
+
+     @gr.on(
+         triggers=[data_comparison_df.change, answer_mode_radio.change, score_threshold.change],
+         inputs=[data_comparison_df, answer_mode_radio, score_threshold],
+         outputs=[filtered_comparison_df, page_number, comparison_counts])
+     def filter_comparison_data(comparison_df, answer_mode, score_threshold):
+         if comparison_df is None:
+             return None, gr.update(value=1, maximum=1), ''
+
+         all_count = len(comparison_df)
+
+         # Apply filtering based on the selected mode and threshold
+         if answer_mode == 'Pass A & B':
+             filtered_df = comparison_df[(comparison_df['A_NScore'] >= score_threshold)
+                                         & (comparison_df['B_NScore'] >= score_threshold)]
+         elif answer_mode == 'Fail A & B':
+             filtered_df = comparison_df[(comparison_df['A_NScore'] < score_threshold)
+                                         & (comparison_df['B_NScore'] < score_threshold)]
+         elif answer_mode == 'Pass A, Fail B':
+             filtered_df = comparison_df[(comparison_df['A_NScore'] >= score_threshold)
+                                         & (comparison_df['B_NScore'] < score_threshold)]
+         elif answer_mode == 'Fail A, Pass B':
+             filtered_df = comparison_df[(comparison_df['A_NScore'] < score_threshold)
+                                         & (comparison_df['B_NScore'] >= score_threshold)]
+         else:  # All
+             filtered_df = comparison_df
+
+         # Count statistics
+         pass_a_count = len(comparison_df[comparison_df['A_NScore'] >= score_threshold])
+         pass_b_count = len(comparison_df[comparison_df['B_NScore'] >= score_threshold])
+         pass_both_count = len(comparison_df[(comparison_df['A_NScore'] >= score_threshold)
+                                             & (comparison_df['B_NScore'] >= score_threshold)])
+         fail_both_count = len(comparison_df[(comparison_df['A_NScore'] < score_threshold)
+                                             & (comparison_df['B_NScore'] < score_threshold)])
+
+         counts_text = (f'### All: {all_count} | Pass A: {pass_a_count} | Pass B: {pass_b_count} | '
+                        f'Pass Both: {pass_both_count} | Fail Both: {fail_both_count}')
+
+         max_page = max(1, len(filtered_df))
+
+         return filtered_df, gr.update(value=1, maximum=max_page), counts_text
+
+     @gr.on(
+         triggers=[filtered_comparison_df.change, page_number.change, model_a_select.change, model_b_select.change],
+         inputs=[
+             filtered_comparison_df, page_number, score_threshold, model_a_select, model_b_select, model_a_name,
+             model_b_name
+         ],
+         outputs=[
+             input_text, gold_text, model_a_generated, model_a_pred, model_a_score, model_a_nscore, model_b_generated,
+             model_b_pred, model_b_score, model_b_nscore
+         ])
+     def update_comparison_display(filtered_df, page_number, score_threshold, model_a_select, model_b_select,
+                                   model_a_name_val, model_b_name_val):
+         if filtered_df is None or len(filtered_df) == 0:
+             return '', '', '', '', '', '', '', '', '', ''
+
+         # Get the row for the current page
+         start = (page_number - 1)
+         if start >= len(filtered_df):
+             return '', '', '', '', '', '', '', '', '', ''
+
+         row = filtered_df.iloc[start]
+
+         # Process common data
+         input_md = process_model_prediction(row['A_Input'])  # Use A's input (same as B's)
+         gold_md = process_model_prediction(row['A_Gold'])  # Use A's gold (same as B's)
+
+         # Process Model A data
+         a_generated_md = process_model_prediction(row['A_Generated'])
+         a_pred_md = convert_markdown_image(process_model_prediction(row['A_Pred']))
+         a_score_md = process_model_prediction(row['A_Score'])
+         a_nscore_val = float(row['A_NScore']) if not pd.isna(row['A_NScore']) else 0.0
+
+         # Process Model B data
+         b_generated_md = process_model_prediction(row['B_Generated'])
+         b_pred_md = convert_markdown_image(process_model_prediction(row['B_Pred']))
+         b_score_md = process_model_prediction(row['B_Score'])
+         b_nscore_val = float(row['B_NScore']) if not pd.isna(row['B_NScore']) else 0.0
+
+         # Apply visual indicators with backgrounds that make differences more obvious
+         if a_nscore_val >= score_threshold:
+             a_nscore_html = f"<div style='background-color:rgb(45,104, 62); padding:10px;'>{a_nscore_val}</div>"
+         else:
+             a_nscore_html = f"<div style='background-color:rgb(151, 31, 44); padding:10px;'>{a_nscore_val}</div>"
+
+         if b_nscore_val >= score_threshold:
+             b_nscore_html = f"<div style='background-color:rgb(45,104, 62); padding:10px;'>{b_nscore_val}</div>"
+         else:
+             b_nscore_html = f"<div style='background-color:rgb(151, 31, 44); padding:10px;'>{b_nscore_val}</div>"
+
+         return (input_md, gold_md, a_generated_md, a_pred_md, a_score_md, a_nscore_html, b_generated_md, b_pred_md,
+                 b_score_md, b_nscore_html)
+
+     return MultiModelComponents(multi_report_name=multi_report_name)
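The four comparison modes above reduce to two boolean masks over the merged dataframe. A standalone pandas restatement (no Gradio) of the same Pass/Fail semantics, with toy scores, makes the bucketing easy to verify:

# Mirrors filter_comparison_data above: a sample "passes" a model when that
# model's normalized score meets the threshold. The data below is made up;
# the A_NScore/B_NScore column names come from the merged dataframe built
# in update_comparison_data.
import pandas as pd

df = pd.DataFrame({
    'Index': [0, 1, 2, 3],
    'A_NScore': [1.0, 0.0, 1.0, 0.0],
    'B_NScore': [1.0, 1.0, 0.0, 0.0],
})
threshold = 0.99
pass_a = df['A_NScore'] >= threshold
pass_b = df['B_NScore'] >= threshold

buckets = {
    'Pass A & B': df[pass_a & pass_b],       # -> Index [0]
    'Fail A & B': df[~pass_a & ~pass_b],     # -> Index [3]
    'Pass A, Fail B': df[pass_a & ~pass_b],  # -> Index [2]
    'Fail A, Pass B': df[~pass_a & pass_b],  # -> Index [1]
}
for name, subset in buckets.items():
    print(name, subset['Index'].tolist())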
evalscope/app/ui/sidebar.py
@@ -0,0 +1,42 @@
+ """
+ Sidebar components for the Evalscope dashboard.
+ """
+ import gradio as gr
+ import os
+ from dataclasses import dataclass
+
+ from evalscope.utils.logger import get_logger
+ from ..utils.data_utils import scan_for_report_folders
+ from ..utils.localization import get_sidebar_locale
+
+ logger = get_logger()
+
+
+ @dataclass
+ class SidebarComponents:
+     root_path: gr.Textbox
+     reports_dropdown: gr.Dropdown
+     load_btn: gr.Button
+
+
+ def create_sidebar(outputs_dir: str, lang: str):
+     locale_dict = get_sidebar_locale(lang)
+
+     gr.Markdown(f'## {locale_dict["settings"]}')
+     root_path = gr.Textbox(label=locale_dict['report_root_path'], value=outputs_dir, placeholder=outputs_dir, lines=1)
+     reports_dropdown = gr.Dropdown(label=locale_dict['select_reports'], choices=[], multiselect=True, interactive=True)
+     load_btn = gr.Button(locale_dict['load_btn'])
+     gr.Markdown(f'### {locale_dict["note"]}')
+
+     @reports_dropdown.focus(inputs=[root_path], outputs=[reports_dropdown])
+     def update_dropdown_choices(root_path):
+         folders = scan_for_report_folders(root_path)
+         if len(folders) == 0:
+             gr.Warning(locale_dict['warning'], duration=3)
+         return gr.update(choices=folders)
+
+     return SidebarComponents(
+         root_path=root_path,
+         reports_dropdown=reports_dropdown,
+         load_btn=load_btn,
+     )
evalscope/app/ui/single_model.py
@@ -0,0 +1,202 @@
+ """
+ Single model components for the Evalscope dashboard.
+ """
+ import gradio as gr
+ import os
+ import pandas as pd
+ from dataclasses import dataclass
+ from typing import TYPE_CHECKING
+
+ from evalscope.report import Report, ReportKey, get_data_frame
+ from evalscope.utils.logger import get_logger
+ from ..constants import DATASET_TOKEN, LATEX_DELIMITERS, MODEL_TOKEN, REPORT_TOKEN
+ from ..utils.data_utils import (get_acc_report_df, get_model_prediction, get_report_analysis, get_single_dataset_df,
+                                 load_single_report)
+ from ..utils.localization import get_single_model_locale
+ from ..utils.text_utils import convert_markdown_image, process_json_content, process_model_prediction
+ from ..utils.visualization import plot_single_dataset_scores, plot_single_report_scores, plot_single_report_sunburst
+
+ if TYPE_CHECKING:
+     from .sidebar import SidebarComponents
+
+ logger = get_logger()
+
+
+ @dataclass
+ class SingleModelComponents:
+     report_name: gr.Dropdown
+
+
+ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
+     locale_dict = get_single_model_locale(lang)
+
+     # Update the UI components with localized labels
+     report_name = gr.Dropdown(label=locale_dict['select_report'], choices=[], interactive=True)
+     work_dir = gr.State(None)
+     model_name = gr.State(None)
+
+     with gr.Accordion(locale_dict['task_config'], open=False):
+         task_config = gr.JSON(value=None)
+
+     report_list = gr.State([])
+
+     with gr.Tab(locale_dict['datasets_overview']):
+         gr.Markdown(f'### {locale_dict["dataset_components"]}')
+         sunburst_plot = gr.Plot(value=None, scale=1, label=locale_dict['dataset_components'])
+         gr.Markdown(f'### {locale_dict["dataset_scores"]}')
+         score_plot = gr.Plot(value=None, scale=1, label=locale_dict['dataset_scores'])
+         gr.Markdown(f'### {locale_dict["dataset_scores_table"]}')
+         score_table = gr.DataFrame(value=None)
+
+     with gr.Tab(locale_dict['dataset_details']):
+         dataset_radio = gr.Radio(label=locale_dict['select_dataset'], choices=[], show_label=True, interactive=True)
+         # show dataset details
+         with gr.Accordion(locale_dict['report_analysis'], open=True):
+             report_analysis = gr.Markdown(value='N/A')
+         gr.Markdown(f'### {locale_dict["dataset_scores"]}')
+         dataset_plot = gr.Plot(value=None, scale=1, label=locale_dict['dataset_scores'])
+         gr.Markdown(f'### {locale_dict["dataset_scores_table"]}')
+         dataset_table = gr.DataFrame(value=None)
+
+         gr.Markdown(f'### {locale_dict["model_prediction"]}')
+         subset_select = gr.Dropdown(label=locale_dict['select_subset'], choices=[], show_label=True, interactive=True)
+
+         with gr.Row():
+             answer_mode_radio = gr.Radio(
+                 label=locale_dict['answer_mode'], choices=['All', 'Pass', 'Fail'], value='All', interactive=True)
+             score_threshold = gr.Number(value=0.99, label=locale_dict['score_threshold'], interactive=True)
+
+         data_review_df = gr.State(None)
+         filtered_review_df = gr.State(None)
+
+         # show statistics
+         with gr.Row(variant='panel'):
+             with gr.Column():
+                 gr.Markdown('### *Counts*')
+                 answer_mode_counts = gr.Markdown('')
+             with gr.Column():
+                 page_number = gr.Number(
+                     value=1, label=locale_dict['page'], minimum=1, maximum=1, step=1, interactive=True)
+
+         # show data review table
+         with gr.Row(variant='panel'):
+             with gr.Column():
+                 gr.Markdown('### *Score*')
+                 score_text = gr.Code('', elem_id='score_text', language='json', wrap_lines=False)
+             with gr.Column():
+                 gr.Markdown('### *Normalized Score*')
+                 nscore = gr.Markdown('', elem_id='score_text', latex_delimiters=LATEX_DELIMITERS)
+
+         with gr.Row(variant='panel'):
+             with gr.Column():
+                 gr.Markdown('### *Gold*')
+                 gold_text = gr.Markdown('', elem_id='gold_text', latex_delimiters=LATEX_DELIMITERS)
+             with gr.Column():
+                 gr.Markdown('### *Pred*')
+                 pred_text = gr.Markdown('', elem_id='pred_text', latex_delimiters=LATEX_DELIMITERS)
+
+         with gr.Row(variant='panel'):
+             with gr.Column():
+                 gr.Markdown('### *Input*')
+                 input_text = gr.Code('', elem_id='input_text', language='json', wrap_lines=False)
+             with gr.Column():
+                 gr.Markdown('### *Generated*')
+                 generated_text = gr.Markdown('', elem_id='generated_text', latex_delimiters=LATEX_DELIMITERS)
+
+     @report_name.change(
+         inputs=[sidebar.root_path, report_name],
+         outputs=[report_list, task_config, dataset_radio, work_dir, model_name])
+     def update_single_report_data(root_path, report_name):
+         report_list, datasets, task_cfg = load_single_report(root_path, report_name)
+         work_dir = os.path.join(root_path, report_name.split(REPORT_TOKEN)[0])
+         model_name = report_name.split(REPORT_TOKEN)[1].split(MODEL_TOKEN)[0]
+         return (report_list, task_cfg, gr.update(choices=datasets, value=datasets[0]), work_dir, model_name)
+
+     @report_list.change(inputs=[report_list], outputs=[score_plot, score_table, sunburst_plot])
+     def update_single_report_score(report_list):
+         report_score_df, styler = get_acc_report_df(report_list)
+         report_score_plot = plot_single_report_scores(report_score_df)
+         report_sunburst_plot = plot_single_report_sunburst(report_list)
+         return report_score_plot, styler, report_sunburst_plot
+
+     @gr.on(
+         triggers=[dataset_radio.change, report_list.change],
+         inputs=[dataset_radio, report_list],
+         outputs=[dataset_plot, dataset_table, subset_select, data_review_df, report_analysis])
+     def update_single_report_dataset(dataset_name, report_list):
+         logger.debug(f'Updating single report dataset: {dataset_name}')
+         report_df = get_data_frame(report_list=report_list)
+         analysis = get_report_analysis(report_list, dataset_name)
+         data_score_df, styler = get_single_dataset_df(report_df, dataset_name)
+         data_score_plot = plot_single_dataset_scores(data_score_df)
+         subsets = data_score_df[ReportKey.subset_name].unique().tolist()
+         logger.debug(f'subsets: {subsets}')
+         return data_score_plot, styler, gr.update(choices=subsets, value=None), None, analysis
+
+     @gr.on(
+         triggers=[subset_select.change],
+         inputs=[work_dir, model_name, dataset_radio, subset_select],
+         outputs=[data_review_df, page_number])
+     def update_single_report_subset(work_dir, model_name, dataset_name, subset_name):
+         if not subset_name:
+             return gr.skip()
+         data_review_df = get_model_prediction(work_dir, model_name, dataset_name, subset_name)
+         return data_review_df, 1
+
+     @gr.on(
+         triggers=[data_review_df.change, answer_mode_radio.change, score_threshold.change],
+         inputs=[data_review_df, answer_mode_radio, score_threshold],
+         outputs=[filtered_review_df, page_number, answer_mode_counts])
+     def filter_data(data_review_df, answer_mode, score_threshold):
+         if data_review_df is None:
+             return None, gr.update(value=1, maximum=1), ''
+
+         all_count = len(data_review_df)
+         pass_df = data_review_df[data_review_df['NScore'] >= score_threshold]
+         pass_count = len(pass_df)
+         fail_count = all_count - pass_count
+
+         counts_text = f'### All: {all_count} | Pass: {pass_count} | Fail: {fail_count}'
+
+         if answer_mode == 'Pass':
+             filtered_df = pass_df
+         elif answer_mode == 'Fail':
+             filtered_df = data_review_df[data_review_df['NScore'] < score_threshold]
+         else:
+             filtered_df = data_review_df
+
+         max_page = max(1, len(filtered_df))
+
+         return (filtered_df, gr.update(value=1, maximum=max_page), counts_text)
+
+     @gr.on(
+         triggers=[filtered_review_df.change, page_number.change],
+         inputs=[filtered_review_df, page_number, score_threshold],
+         outputs=[input_text, generated_text, gold_text, pred_text, score_text, nscore])
+     def update_table_components(filtered_df, page_number, score_threshold):
+         if filtered_df is None or len(filtered_df) == 0:
+             return '', '', '', '', '', ''
+
+         # Get single row data for the current page
+         start = (page_number - 1)
+         if start >= len(filtered_df):
+             return '', '', '', '', '', ''
+
+         row = filtered_df.iloc[start]
+
+         # Process the data for display
+         input_md = process_json_content(row['Input'])
+         generated_md = process_model_prediction(row['Generated'])
+         gold_md = process_model_prediction(row['Gold'])
+         pred_md = convert_markdown_image(process_model_prediction(row['Pred']))
+         score_md = process_json_content(row['Score'])
+         nscore_val = float(row['NScore']) if not pd.isna(row['NScore']) else 0.0
+
+         if nscore_val >= score_threshold:
+             nscore_val = f'<div style="background-color:rgb(45,104, 62); padding:10px;">{nscore_val}</div>'
+         else:
+             nscore_val = f'<div style="background-color:rgb(151, 31, 44); padding:10px;">{nscore_val}</div>'
+
+         return input_md, generated_md, gold_md, pred_md, score_md, nscore_val
+
+     return SingleModelComponents(report_name=report_name)
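Both the single-model and multi-model tabs recover the work directory and model name by splitting the selected report string on the REPORT_TOKEN ('@@') and MODEL_TOKEN ('::') constants. A small sketch of that parsing; the report-name string here is hypothetical, since real values come from the sidebar's folder scan:

# Mirrors update_single_report_data above; the example report_name is invented.
import os

REPORT_TOKEN = '@@'  # from evalscope/app/constants.py
MODEL_TOKEN = '::'

root_path = './outputs'
report_name = '20250101_120000@@qwen2.5-7b-instruct::mmlu'  # hypothetical selection

work_dir = os.path.join(root_path, report_name.split(REPORT_TOKEN)[0])
model_name = report_name.split(REPORT_TOKEN)[1].split(MODEL_TOKEN)[0]

print(work_dir)    # ./outputs/20250101_120000
print(model_name)  # qwen2.5-7b-instruct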