evalscope 0.8.0__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of evalscope has been flagged as a potentially problematic release.

Files changed (147)
  1. evalscope/__init__.py +2 -0
  2. evalscope/arguments.py +11 -3
  3. evalscope/backend/base.py +1 -1
  4. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  5. evalscope/backend/rag_eval/utils/clip.py +2 -2
  6. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  7. evalscope/backend/rag_eval/utils/llm.py +1 -1
  8. evalscope/benchmarks/__init__.py +20 -1
  9. evalscope/benchmarks/arc/__init__.py +0 -5
  10. evalscope/benchmarks/arc/arc_adapter.py +24 -102
  11. evalscope/benchmarks/bbh/__init__.py +0 -4
  12. evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
  13. evalscope/benchmarks/benchmark.py +70 -59
  14. evalscope/benchmarks/ceval/__init__.py +0 -5
  15. evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
  16. evalscope/benchmarks/cmmlu/__init__.py +0 -5
  17. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
  18. evalscope/benchmarks/competition_math/__init__.py +0 -5
  19. evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
  20. evalscope/benchmarks/data_adapter.py +115 -87
  21. evalscope/benchmarks/general_qa/__init__.py +0 -5
  22. evalscope/benchmarks/general_qa/general_qa_adapter.py +24 -80
  23. evalscope/benchmarks/gpqa/__init__.py +0 -0
  24. evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
  25. evalscope/benchmarks/gpqa/gpqa_adapter.py +103 -0
  26. evalscope/benchmarks/gsm8k/__init__.py +0 -4
  27. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +22 -101
  28. evalscope/benchmarks/hellaswag/__init__.py +0 -5
  29. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +33 -99
  30. evalscope/benchmarks/humaneval/__init__.py +0 -4
  31. evalscope/benchmarks/humaneval/humaneval_adapter.py +93 -9
  32. evalscope/benchmarks/ifeval/__init__.py +0 -0
  33. evalscope/benchmarks/ifeval/ifeval_adapter.py +56 -0
  34. evalscope/benchmarks/ifeval/instructions.py +1477 -0
  35. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  36. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  37. evalscope/benchmarks/ifeval/utils.py +134 -0
  38. evalscope/benchmarks/iquiz/__init__.py +0 -0
  39. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  40. evalscope/benchmarks/mmlu/__init__.py +0 -5
  41. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
  42. evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  43. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  44. evalscope/benchmarks/race/__init__.py +0 -5
  45. evalscope/benchmarks/race/race_adapter.py +27 -123
  46. evalscope/benchmarks/trivia_qa/__init__.py +0 -5
  47. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
  48. evalscope/benchmarks/truthful_qa/__init__.py +0 -5
  49. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
  50. evalscope/cli/cli.py +2 -0
  51. evalscope/cli/start_app.py +30 -0
  52. evalscope/collections/__init__.py +3 -0
  53. evalscope/collections/evaluator.py +198 -0
  54. evalscope/collections/sampler.py +138 -0
  55. evalscope/collections/schema.py +126 -0
  56. evalscope/config.py +45 -7
  57. evalscope/constants.py +7 -38
  58. evalscope/evaluator/__init__.py +0 -1
  59. evalscope/evaluator/evaluator.py +89 -121
  60. evalscope/evaluator/rating_eval.py +1 -1
  61. evalscope/evaluator/reviewer/auto_reviewer.py +14 -5
  62. evalscope/metrics/__init__.py +3 -0
  63. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  64. evalscope/metrics/math_accuracy.py +193 -50
  65. evalscope/metrics/metrics.py +18 -6
  66. evalscope/metrics/named_metrics.py +17 -0
  67. evalscope/metrics/rouge_metric.py +13 -8
  68. evalscope/models/__init__.py +14 -1
  69. evalscope/models/base_adapter.py +52 -0
  70. evalscope/models/chat_adapter.py +140 -0
  71. evalscope/models/choice_adapter.py +211 -0
  72. evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +1 -1
  73. evalscope/models/custom_adapter.py +67 -0
  74. evalscope/models/local_model.py +74 -0
  75. evalscope/models/model.py +141 -0
  76. evalscope/models/server_adapter.py +111 -0
  77. evalscope/perf/__init__.py +1 -0
  78. evalscope/perf/arguments.py +3 -1
  79. evalscope/perf/benchmark.py +3 -3
  80. evalscope/perf/main.py +5 -7
  81. evalscope/perf/plugin/api/custom_api.py +1 -1
  82. evalscope/perf/plugin/api/openai_api.py +54 -50
  83. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  84. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  85. evalscope/perf/plugin/registry.py +3 -3
  86. evalscope/perf/utils/benchmark_util.py +4 -4
  87. evalscope/perf/utils/db_util.py +66 -22
  88. evalscope/perf/utils/local_server.py +4 -1
  89. evalscope/report/__init__.py +5 -0
  90. evalscope/report/app.py +693 -0
  91. evalscope/report/combinator.py +73 -0
  92. evalscope/report/generator.py +80 -0
  93. evalscope/report/utils.py +133 -0
  94. evalscope/run.py +64 -125
  95. evalscope/run_arena.py +3 -2
  96. evalscope/summarizer.py +15 -27
  97. evalscope/third_party/longbench_write/eval.py +2 -1
  98. evalscope/third_party/longbench_write/longbench_write.py +2 -1
  99. evalscope/third_party/longbench_write/tools/data_etl.py +1 -1
  100. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  101. evalscope/utils/__init__.py +1 -0
  102. evalscope/utils/chat_service.py +6 -5
  103. evalscope/utils/io_utils.py +170 -0
  104. evalscope/utils/logger.py +13 -0
  105. evalscope/utils/model_utils.py +15 -2
  106. evalscope/utils/utils.py +3 -200
  107. evalscope/version.py +2 -2
  108. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/METADATA +129 -23
  109. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/RECORD +119 -115
  110. tests/cli/test_collection.py +57 -0
  111. tests/cli/test_run.py +57 -7
  112. tests/perf/test_perf.py +3 -2
  113. tests/rag/test_mteb.py +3 -2
  114. tests/vlm/test_vlmeval.py +3 -2
  115. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
  116. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
  117. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
  118. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
  119. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
  120. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
  121. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
  122. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
  123. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
  124. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  125. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  126. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  127. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  128. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
  129. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
  130. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
  131. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
  132. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  133. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
  134. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
  135. evalscope/evaluator/humaneval_evaluator.py +0 -158
  136. evalscope/models/api/__init__.py +0 -3
  137. evalscope/models/dummy_chat_model.py +0 -49
  138. evalscope/models/model_adapter.py +0 -525
  139. evalscope/models/openai_model.py +0 -103
  140. evalscope/tools/__init__.py +0 -1
  141. evalscope/tools/combine_reports.py +0 -135
  142. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  143. /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
  144. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/LICENSE +0 -0
  145. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/WHEEL +0 -0
  146. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/entry_points.txt +0 -0
  147. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/top_level.txt +0 -0
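
The largest single addition is item 90, evalscope/report/app.py (+693 lines): a new Gradio dashboard for browsing evaluation reports. Its full diff follows.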
evalscope/report/app.py (new file)
@@ -0,0 +1,693 @@
+import argparse
+import glob
+import gradio as gr
+import numpy as np
+import os
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+from dataclasses import dataclass
+from typing import Any, List, Union
+
+from evalscope.constants import DataCollection
+from evalscope.report import Report, ReportKey, get_data_frame, get_report_list
+from evalscope.utils.io_utils import OutputsStructure, yaml_to_dict
+from evalscope.utils.logger import configure_logging, get_logger
+from evalscope.version import __version__
+
+logger = get_logger()
+
+PLOTLY_THEME = 'plotly_dark'
+
+
+def scan_for_report_folders(root_path):
+    """Scan for folders containing reports subdirectories"""
+    logger.debug(f'Scanning for report folders in {root_path}')
+    if not os.path.exists(root_path):
+        return []
+
+    reports = []
+    # Iterate over all folders in the root path
+    for folder in glob.glob(os.path.join(root_path, '*')):
+        # Check if reports folder exists
+        reports_path = os.path.join(folder, OutputsStructure.REPORTS_DIR)
+        if not os.path.exists(reports_path):
+            continue
+
+        # Iterate over all items in reports folder
+        for model_item in glob.glob(os.path.join(reports_path, '*')):
+            if not os.path.isdir(model_item):
+                continue
+            datasets = []
+            for dataset_item in glob.glob(os.path.join(model_item, '*.json')):
+                datasets.append(os.path.basename(dataset_item).split('.')[0])
+            datasets = ','.join(datasets)
+            reports.append(f'{os.path.basename(folder)}@{os.path.basename(model_item)}:{datasets}')
+
+    reports = sorted(reports, reverse=True)
+    logger.debug(f'reports: {reports}')
+    return reports
+
+
+def process_report_name(report_name: str):
+    prefix, report_name = report_name.split('@')
+    model_name, datasets = report_name.split(':')
+    datasets = datasets.split(',')
+    return prefix, model_name, datasets
+
+
+def load_single_report(root_path: str, report_name: str):
+    prefix, model_name, datasets = process_report_name(report_name)
+    report_path_list = os.path.join(root_path, prefix, OutputsStructure.REPORTS_DIR, model_name)
+    report_list = get_report_list([report_path_list])
+
+    task_cfg_path = glob.glob(os.path.join(root_path, prefix, OutputsStructure.CONFIGS_DIR, '*.yaml'))[0]
+    task_cfg = yaml_to_dict(task_cfg_path)
+    return report_list, datasets, task_cfg
+
+
+def load_multi_report(root_path: str, report_names: List[str]):
+    report_list = []
+    for report_name in report_names:
+        prefix, model_name, datasets = process_report_name(report_name)
+        report_path_list = os.path.join(root_path, prefix, OutputsStructure.REPORTS_DIR, model_name)
+        reports = get_report_list([report_path_list])
+        report_list.extend(reports)
+    return report_list
+
+
+def get_acc_report_df(report_list: List[Report]):
+    data_dict = []
+    for report in report_list:
+        if report.name == DataCollection.NAME:
+            for metric in report.metrics:
+                for category in metric.categories:
+                    item = {
+                        ReportKey.model_name: report.model_name,
+                        ReportKey.dataset_name: '/'.join(category.name),
+                        ReportKey.score: category.score,
+                        ReportKey.num: category.num,
+                    }
+                    data_dict.append(item)
+        else:
+            item = {
+                ReportKey.model_name: report.model_name,
+                ReportKey.dataset_name: report.dataset_name,
+                ReportKey.score: report.score,
+                ReportKey.num: report.metrics[0].num,
+            }
+            data_dict.append(item)
+    df = pd.DataFrame.from_dict(data_dict, orient='columns')
+
+    styler = style_df(df, columns=[ReportKey.score])
+    return df, styler
+
+
+def style_df(df: pd.DataFrame, columns: List[str] = None):
+    # Apply background gradient to the specified columns
+    styler = df.style.background_gradient(subset=columns, cmap='RdYlGn', vmin=0.0, vmax=1.0, axis=0)
+    # Format the dataframe with a precision of 4 decimal places
+    styler.format(precision=4)
+    return styler
+
+
+def get_compare_report_df(acc_df: pd.DataFrame):
+    df = acc_df.pivot_table(index=ReportKey.model_name, columns=ReportKey.dataset_name, values=ReportKey.score)
+    df.reset_index(inplace=True)
+
+    styler = style_df(df)
+    return df, styler
+
+
+def plot_single_report_scores(df: pd.DataFrame):
+    plot = px.bar(df, x=df[ReportKey.dataset_name], y=df[ReportKey.score], text=df[ReportKey.score])
+
+    width = 0.2 if len(df[ReportKey.dataset_name]) <= 5 else None
+    plot.update_traces(width=width, texttemplate='%{text:.2f}', textposition='outside')
+    plot.update_layout(uniformtext_minsize=12, uniformtext_mode='hide', yaxis=dict(range=[0, 1]), template=PLOTLY_THEME)
+    return plot
+
+
+def plot_single_report_sunburst(report_list: List[Report]):
+    if report_list[0].name == DataCollection.NAME:
+        df = get_data_frame(report_list)
+        categories = sorted([i for i in df.columns if i.startswith(ReportKey.category_prefix)])
+        path = categories + [ReportKey.subset_name]
+    else:
+        df = get_data_frame(report_list, flatten_metrics=False)
+        categories = sorted([i for i in df.columns if i.startswith(ReportKey.category_prefix)])
+        path = [ReportKey.dataset_name] + categories + [ReportKey.subset_name]
+    logger.debug(f'df: {df}')
+    df[categories] = df[categories].fillna('default')  # NOTE: fillna for empty categories
+
+    plot = px.sunburst(
+        df,
+        path=path,
+        values=ReportKey.num,
+        color=ReportKey.score,
+        color_continuous_scale='RdYlGn',  # see https://plotly.com/python/builtin-colorscales/
+        color_continuous_midpoint=np.average(df[ReportKey.score], weights=df[ReportKey.num]),
+        template=PLOTLY_THEME,
+        maxdepth=4)
+    plot.update_traces(insidetextorientation='radial')
+    plot.update_layout(margin=dict(t=10, l=10, r=10, b=10), coloraxis=dict(cmin=0, cmax=1), height=600)
+    return plot
+
+
+def get_single_dataset_df(df: pd.DataFrame, dataset_name: str):
+    df = df[df[ReportKey.dataset_name] == dataset_name]
+    styler = style_df(df, columns=[ReportKey.score])
+    return df, styler
+
+
+def plot_single_dataset_scores(df: pd.DataFrame):
+    # TODO: add metric radio and replace category name
+    plot = px.bar(
+        df,
+        x=df[ReportKey.metric_name],
+        y=df[ReportKey.score],
+        color=df[ReportKey.subset_name],
+        text=df[ReportKey.score],
+        barmode='group')
+
+    width = 0.2 if len(df[ReportKey.subset_name]) <= 5 else None
+    plot.update_traces(width=width, texttemplate='%{text:.2f}', textposition='outside')
+    plot.update_layout(uniformtext_minsize=12, uniformtext_mode='hide', yaxis=dict(range=[0, 1]), template=PLOTLY_THEME)
+    return plot
+
+
+def plot_multi_report_radar(df: pd.DataFrame):
+    fig = go.Figure()
+
+    grouped = df.groupby(ReportKey.model_name)
+    common_datasets = set.intersection(*[set(group[ReportKey.dataset_name]) for _, group in grouped])
+
+    for model_name, group in grouped:
+        common_group = group[group[ReportKey.dataset_name].isin(common_datasets)]
+        fig.add_trace(
+            go.Scatterpolar(
+                r=common_group[ReportKey.score],
+                theta=common_group[ReportKey.dataset_name],
+                name=model_name,
+                fill='toself'))
+
+    fig.update_layout(
+        template=PLOTLY_THEME,
+        polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
+        margin=dict(t=20, l=20, r=20, b=20))
+    return fig
+
+
+def dict_to_markdown(data) -> str:
+    markdown_lines = []
+
+    for key, value in data.items():
+        bold_key = f'**{key}**'
+
+        if isinstance(value, list):
+            value_str = '\n' + '\n'.join([f' - {item}' for item in value])
+        elif isinstance(value, dict):
+            value_str = dict_to_markdown(value)
+        else:
+            value_str = str(value)
+
+        value_str = process_string(value_str)
+        markdown_line = f'{bold_key}: {value_str}'
+        markdown_lines.append(markdown_line)
+
+    return '\n\n'.join(markdown_lines)
+
+
+def process_string(string: str, max_length: int = 2048) -> str:
+    if len(string) > max_length:
+        return f'{string[:max_length // 2]}......{string[-max_length // 2:]}'
+    return string
+
+
+def process_model_prediction(item: Any):
+    if isinstance(item, dict):
+        return dict_to_markdown(item)
+    elif isinstance(item, list):
+        return '\n'.join([process_model_prediction(item) for item in item])
+    else:
+        return process_string(str(item))
+
+
+def normalize_score(score):
+    if isinstance(score, bool):
+        return 1.0 if score else 0.0
+    elif isinstance(score, dict):
+        for key in score:
+            return float(score[key])
+        return 0.0
+    else:
+        try:
+            return float(score)
+        except (ValueError, TypeError):
+            return 0.0
+
+
+def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subset_name: str):
+    data_path = os.path.join(work_dir, OutputsStructure.REVIEWS_DIR, model_name)
+    subset_name = subset_name.replace('/', '_')  # for collection report
+    review_path = os.path.join(data_path, f'{dataset_name}_{subset_name}.jsonl')
+    logger.debug(f'review_path: {review_path}')
+    origin_df = pd.read_json(review_path, lines=True)
+
+    ds = []
+    for i, item in origin_df.iterrows():
+        raw_input = item['raw_input']
+        raw_pred_answer = item['choices'][0]['message']['content']
+        parsed_gold_answer = item['choices'][0]['review']['gold']
+        parsed_pred_answer = item['choices'][0]['review']['pred']
+        score = item['choices'][0]['review']['result']
+        raw_d = {
+            'Input': raw_input,
+            'Generated': raw_pred_answer,
+            'Gold': parsed_gold_answer if parsed_gold_answer != raw_input else '*Same as Input*',
+            'Pred': parsed_pred_answer if parsed_pred_answer != raw_pred_answer else '*Same as Generated*',
+            'Score': score,
+            'NScore': normalize_score(score)
+        }
+        ds.append(raw_d)
+
+    df_subset = pd.DataFrame(ds)
+    return df_subset
+
+
+def get_table_data(data_review_df: pd.DataFrame, page: int = 1, rows_per_page: int = 1) -> pd.DataFrame:
+    if data_review_df is None:
+        return pd.DataFrame(), None
+
+    logger.debug(f'page: {page}, rows_per_page: {rows_per_page}')
+    start = (page - 1) * rows_per_page
+    end = start + rows_per_page
+    df_subset = data_review_df.iloc[start:end].copy()
+    df_subset['Input'] = df_subset['Input'].map(process_model_prediction).astype(str)
+    df_subset['Score'] = df_subset['Score'].map(process_model_prediction).astype(str)
+    styler = style_df(df_subset, columns=['NScore'])
+    return df_subset, styler
+
+
+@dataclass
+class SidebarComponents:
+    root_path: gr.Textbox
+    reports_dropdown: gr.Dropdown
+    load_btn: gr.Button
+
+
+def create_sidebar(outputs_dir: str, lang: str):
+    locale_dict = {
+        'settings': {
+            'zh': '设置',
+            'en': 'Settings'
+        },
+        'report_root_path': {
+            'zh': '报告根路径',
+            'en': 'Report Root Path'
+        },
+        'select_reports': {
+            'zh': '请选择报告',
+            'en': 'Select Reports'
+        },
+        'load_btn': {
+            'zh': '加载并查看',
+            'en': 'Load & View'
+        },
+        'note': {
+            'zh': '请选择报告并点击`加载并查看`来查看数据',
+            'en': 'Please select reports and click `Load & View` to view the data'
+        },
+        'warning': {
+            'zh': '没有找到报告,请检查路径',
+            'en': 'No reports found, please check the path'
+        }
+    }
+
+    gr.Markdown(f'## {locale_dict["settings"][lang]}')
+    root_path = gr.Textbox(
+        label=locale_dict['report_root_path'][lang], value=outputs_dir, placeholder=outputs_dir, lines=1)
+    reports_dropdown = gr.Dropdown(
+        label=locale_dict['select_reports'][lang], choices=[], multiselect=True, interactive=True)
+    load_btn = gr.Button(locale_dict['load_btn'][lang])
+    gr.Markdown(f'### {locale_dict["note"][lang]}')
+
+    @reports_dropdown.focus(inputs=[root_path], outputs=[reports_dropdown])
+    def update_dropdown_choices(root_path):
+        folders = scan_for_report_folders(root_path)
+        if len(folders) == 0:
+            gr.Warning(locale_dict['warning'][lang], duration=3)
+        return gr.update(choices=folders)
+
+    return SidebarComponents(
+        root_path=root_path,
+        reports_dropdown=reports_dropdown,
+        load_btn=load_btn,
+    )
+
+
+@dataclass
+class VisualizationComponents:
+    single_model: gr.Tab
+    multi_model: gr.Tab
+
+
+def create_visualization(sidebar: SidebarComponents, lang: str):
+    locale_dict = {
+        'visualization': {
+            'zh': '可视化',
+            'en': 'Visualization'
+        },
+        'single_model': {
+            'zh': '单模型',
+            'en': 'Single Model'
+        },
+        'multi_model': {
+            'zh': '多模型',
+            'en': 'Multi Model'
+        }
+    }
+    with gr.Column(visible=True):
+        gr.Markdown(f'## {locale_dict["visualization"][lang]}')
+        with gr.Tabs():
+            with gr.Tab(locale_dict['single_model'][lang]):
+                single = create_single_model_tab(sidebar, lang)
+
+            with gr.Tab(locale_dict['multi_model'][lang]):
+                multi = create_multi_model_tab(sidebar, lang)
+    return VisualizationComponents(
+        single_model=single,
+        multi_model=multi,
+    )
+
+
+@dataclass
+class SingleModelComponents:
+    report_name: gr.Dropdown
+
+
+def create_single_model_tab(sidebar: SidebarComponents, lang: str):
+    locale_dict = {
+        'select_report': {
+            'zh': '选择报告',
+            'en': 'Select Report'
+        },
+        'task_config': {
+            'zh': '任务配置',
+            'en': 'Task Config'
+        },
+        'datasets_overview': {
+            'zh': '数据集概览',
+            'en': 'Datasets Overview'
+        },
+        'dataset_components': {
+            'zh': '数据集组成',
+            'en': 'Dataset Components'
+        },
+        'dataset_scores': {
+            'zh': '数据集分数',
+            'en': 'Dataset Scores'
+        },
+        'dataset_scores_table': {
+            'zh': '数据集分数表',
+            'en': 'Dataset Scores Table'
+        },
+        'dataset_details': {
+            'zh': '数据集详情',
+            'en': 'Dataset Details'
+        },
+        'select_dataset': {
+            'zh': '选择数据集',
+            'en': 'Select Dataset'
+        },
+        'model_prediction': {
+            'zh': '模型预测',
+            'en': 'Model Prediction'
+        },
+        'select_subset': {
+            'zh': '选择子集',
+            'en': 'Select Subset'
+        },
+        'answer_mode': {
+            'zh': '答案模式',
+            'en': 'Answer Mode'
+        },
+        'page': {
+            'zh': '页码',
+            'en': 'Page'
+        }
+    }
+
+    # Update the UI components with localized labels
+    report_name = gr.Dropdown(label=locale_dict['select_report'][lang], choices=[], interactive=True)
+    work_dir = gr.State(None)
+    model_name = gr.State(None)
+
+    with gr.Accordion(locale_dict['task_config'][lang], open=False):
+        task_config = gr.JSON(value=None)
+
+    report_list = gr.State([])
+
+    with gr.Tab(locale_dict['datasets_overview'][lang]):
+        gr.Markdown(f'### {locale_dict["dataset_components"][lang]}')
+        sunburst_plot = gr.Plot(value=None, scale=1, label=locale_dict['dataset_components'][lang])
+        gr.Markdown(f'### {locale_dict["dataset_scores"][lang]}')
+        score_plot = gr.Plot(value=None, scale=1, label=locale_dict['dataset_scores'][lang])
+        gr.Markdown(f'### {locale_dict["dataset_scores_table"][lang]}')
+        score_table = gr.DataFrame(value=None)
+
+    with gr.Tab(locale_dict['dataset_details'][lang]):
+        dataset_radio = gr.Radio(
+            label=locale_dict['select_dataset'][lang], choices=[], show_label=True, interactive=True)
+        gr.Markdown(f'### {locale_dict["dataset_scores"][lang]}')
+        dataset_plot = gr.Plot(value=None, scale=1, label=locale_dict['dataset_scores'][lang])
+        gr.Markdown(f'### {locale_dict["dataset_scores_table"][lang]}')
+        dataset_table = gr.DataFrame(value=None)
+
+        gr.Markdown(f'### {locale_dict["model_prediction"][lang]}')
+        subset_select = gr.Dropdown(
+            label=locale_dict['select_subset'][lang], choices=[], show_label=True, interactive=True)
+        with gr.Row():
+            answer_mode_radio = gr.Radio(
+                label=locale_dict['answer_mode'][lang], choices=['All', 'Pass', 'Fail'], value='All', interactive=True)
+            page_number = gr.Number(
+                value=1, label=locale_dict['page'][lang], minimum=1, maximum=1, step=1, interactive=True)
+            answer_mode_counts = gr.Markdown('', label='Counts')
+        data_review_df = gr.State(None)
+        filtered_review_df = gr.State(None)
+        data_review_table = gr.DataFrame(
+            value=None,
+            datatype=['markdown', 'markdown', 'markdown', 'markdown', 'markdown', 'number'],
+            # column_widths=['500px', '500px'],
+            wrap=True,
+            latex_delimiters=[{
+                'left': '$$',
+                'right': '$$',
+                'display': True
+            }, {
+                'left': '$',
+                'right': '$',
+                'display': False
+            }, {
+                'left': '\\(',
+                'right': '\\)',
+                'display': False
+            }, {
+                'left': '\\[',
+                'right': '\\]',
+                'display': True
+            }],
+            max_height=600)
+
+    @report_name.change(
+        inputs=[sidebar.root_path, report_name],
+        outputs=[report_list, task_config, dataset_radio, work_dir, model_name])
+    def update_single_report_data(root_path, report_name):
+        report_list, datasets, task_cfg = load_single_report(root_path, report_name)
+        work_dir = os.path.join(root_path, report_name.split('@')[0])
+        model_name = report_name.split('@')[1].split(':')[0]
+        return (report_list, task_cfg, gr.update(choices=datasets, value=datasets[0]), work_dir, model_name)
+
+    @report_list.change(inputs=[report_list], outputs=[score_plot, score_table, sunburst_plot])
+    def update_single_report_score(report_list):
+        report_score_df, styler = get_acc_report_df(report_list)
+        report_score_plot = plot_single_report_scores(report_score_df)
+        report_sunburst_plot = plot_single_report_sunburst(report_list)
+        return report_score_plot, styler, report_sunburst_plot
+
+    @gr.on(
+        triggers=[dataset_radio.change, report_list.change],
+        inputs=[dataset_radio, report_list],
+        outputs=[dataset_plot, dataset_table, subset_select, data_review_df])
+    def update_single_report_dataset(dataset_name, report_list):
+        logger.debug(f'Updating single report dataset: {dataset_name}')
+        report_df = get_data_frame(report_list)
+        data_score_df, styler = get_single_dataset_df(report_df, dataset_name)
+        data_score_plot = plot_single_dataset_scores(data_score_df)
+        subsets = data_score_df[ReportKey.subset_name].unique().tolist()
+        logger.debug(f'subsets: {subsets}')
+        return data_score_plot, styler, gr.update(choices=subsets, value=None), None
+
+    @gr.on(
+        triggers=[subset_select.change],
+        inputs=[work_dir, model_name, dataset_radio, subset_select],
+        outputs=[data_review_df, page_number])
+    def update_single_report_subset(work_dir, model_name, dataset_name, subset_name):
+        if not subset_name:
+            return gr.skip()
+        data_review_df = get_model_prediction(work_dir, model_name, dataset_name, subset_name)
+        return data_review_df, 1
+
+    @gr.on(
+        triggers=[data_review_df.change, answer_mode_radio.change],
+        inputs=[data_review_df, answer_mode_radio],
+        outputs=[filtered_review_df, page_number, answer_mode_counts])
+    def filter_data(data_review_df, answer_mode):
+        if data_review_df is None:
+            return None, gr.update(value=1, maximum=1), ''
+
+        all_count = len(data_review_df)
+        pass_df = data_review_df[data_review_df['NScore'] >= 0.99]
+        pass_count = len(pass_df)
+        fail_count = all_count - pass_count
+
+        counts_text = f'### All: {all_count} | Pass: {pass_count} | Fail: {fail_count}'
+
+        if answer_mode == 'Pass':
+            filtered_df = pass_df
+        elif answer_mode == 'Fail':
+            filtered_df = data_review_df[data_review_df['NScore'] < 0.99]
+        else:
+            filtered_df = data_review_df
+
+        max_page = max(1, len(filtered_df))
+
+        return (filtered_df, gr.update(value=1, maximum=max_page), counts_text)
+
+    @gr.on(
+        triggers=[filtered_review_df.change, page_number.change],
+        inputs=[filtered_review_df, page_number],
+        outputs=[data_review_table])
+    def update_table(filtered_df, page_number):
+        if filtered_df is None:
+            return gr.update(value=None)
+        subset_df, styler = get_table_data(filtered_df, page_number)
+        return styler
+
+    return SingleModelComponents(report_name=report_name)
+
+
+@dataclass
+class MultiModelComponents:
+    multi_report_name: gr.Dropdown
+
+
+def create_multi_model_tab(sidebar: SidebarComponents, lang: str):
+    locale_dict = {
+        'select_reports': {
+            'zh': '请选择报告',
+            'en': 'Select Reports'
+        },
+        'model_radar': {
+            'zh': '模型对比雷达',
+            'en': 'Model Comparison Radar'
+        },
+        'model_scores': {
+            'zh': '模型对比分数',
+            'en': 'Model Comparison Scores'
+        }
+    }
+    multi_report_name = gr.Dropdown(
+        label=locale_dict['select_reports'][lang], choices=[], multiselect=True, interactive=True)
+    gr.Markdown(locale_dict['model_radar'][lang])
+    radar_plot = gr.Plot(value=None)
+    gr.Markdown(locale_dict['model_scores'][lang])
+    score_table = gr.DataFrame(value=None)
+
+    @multi_report_name.change(inputs=[sidebar.root_path, multi_report_name], outputs=[radar_plot, score_table])
+    def update_multi_report_data(root_path, multi_report_name):
+        if not multi_report_name:
+            return gr.skip()
+        report_list = load_multi_report(root_path, multi_report_name)
+        report_df, _ = get_acc_report_df(report_list)
+        report_radar_plot = plot_multi_report_radar(report_df)
+        _, styler = get_compare_report_df(report_df)
+        return report_radar_plot, styler
+
+    return MultiModelComponents(multi_report_name=multi_report_name)
+
+
+def create_app(args: argparse.Namespace):
+    configure_logging(debug=args.debug)
+    lang = args.lang
+
+    locale_dict = {
+        'title': {
+            'zh': '📈 EvalScope 看板',
+            'en': '📈 Evalscope Dashboard'
+        },
+        'star_beggar': {
+            'zh':
+            '喜欢<a href=\"https://github.com/modelscope/evalscope\" target=\"_blank\">EvalScope</a>就动动手指给我们加个star吧 🥺 ',
+            'en':
+            'If you like <a href=\"https://github.com/modelscope/evalscope\" target=\"_blank\">EvalScope</a>, '
+            'please take a few seconds to star us 🥺 '
+        },
+        'note': {
+            'zh': '请选择报告',
+            'en': 'Please select reports'
+        }
+    }
+
+    with gr.Blocks(title='Evalscope Dashboard') as demo:
+        gr.HTML(f'<h1 style="text-align: left;">{locale_dict["title"][lang]} (v{__version__})</h1>')
+        with gr.Row():
+            with gr.Column(scale=0, min_width=35):
+                toggle_btn = gr.Button('<')
+            with gr.Column(scale=1):
+                gr.HTML(f'<h3 style="text-align: left;">{locale_dict["star_beggar"][lang]}</h3>')
+
+        with gr.Row():
+            with gr.Column(scale=1) as sidebar_column:
+                sidebar_visible = gr.State(True)
+                sidebar = create_sidebar(args.outputs, lang)
+
+            with gr.Column(scale=5):
+                visualization = create_visualization(sidebar, lang)
+
+        @sidebar.load_btn.click(
+            inputs=[sidebar.reports_dropdown],
+            outputs=[visualization.single_model.report_name, visualization.multi_model.multi_report_name])
+        def update_displays(reports_dropdown):
+            if not reports_dropdown:
+                gr.Warning(locale_dict['note'][lang], duration=3)
+                return gr.skip()
+
+            return (
+                gr.update(choices=reports_dropdown, value=reports_dropdown[0]),  # update single model dropdown
+                gr.update(choices=reports_dropdown, value=reports_dropdown)  # update multi model dropdown
+            )
+
+        @toggle_btn.click(inputs=[sidebar_visible], outputs=[sidebar_column, sidebar_visible, toggle_btn])
+        def toggle_sidebar(visible):
+            new_visible = not visible
+            text = '<' if new_visible else '>'
+            return gr.update(visible=new_visible), new_visible, gr.update(value=text)

+    demo.launch(share=args.share, server_name=args.server_name, server_port=args.server_port, debug=args.debug)
+
+
+def add_argument(parser: argparse.ArgumentParser):
+    parser.add_argument('--share', action='store_true', help='Share the app.')
+    parser.add_argument('--server-name', type=str, default='0.0.0.0', help='The server name.')
+    parser.add_argument('--server-port', type=int, default=None, help='The server port.')
+    parser.add_argument('--debug', action='store_true', help='Debug the app.')
+    parser.add_argument('--lang', type=str, default='zh', help='The locale.', choices=['zh', 'en'])
+    parser.add_argument('--outputs', type=str, default='./outputs', help='The outputs dir.')
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    add_argument(parser)
+    args = parser.parse_args()
+    create_app(args)
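
For orientation, the `__main__` block above is the dashboard's entry point. The snippet below is a minimal sketch of using that entry point programmatically; it assumes evalscope 0.10.1 is installed and that `./outputs` holds evaluation results in the `<run>/reports/<model>/*.json` layout that `scan_for_report_folders` expects. (The new `evalscope/cli/start_app.py` in this release presumably wires the same app into the CLI, but that file's contents are not shown in this diff.)

# Minimal sketch: launch the new report dashboard, mirroring the
# __main__ block of evalscope/report/app.py above. Flag names are
# taken from add_argument(); './outputs' is a placeholder results dir.
import argparse

from evalscope.report.app import add_argument, create_app

parser = argparse.ArgumentParser()
add_argument(parser)
args = parser.parse_args(['--lang', 'en', '--outputs', './outputs'])
create_app(args)  # builds the gr.Blocks UI and calls demo.launch()

The equivalent shell invocation would be `python -m evalscope.report.app --lang en --outputs ./outputs`.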