evalscope-0.16.2-py3-none-any.whl → evalscope-0.17.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

The registry flags this as a potentially problematic release.

Files changed (117)
  1. evalscope/app/app.py +9 -762
  2. evalscope/app/constants.py +1 -0
  3. evalscope/app/ui/__init__.py +20 -0
  4. evalscope/app/ui/app_ui.py +52 -0
  5. evalscope/app/ui/multi_model.py +323 -0
  6. evalscope/app/ui/sidebar.py +42 -0
  7. evalscope/app/ui/single_model.py +202 -0
  8. evalscope/app/ui/visualization.py +36 -0
  9. evalscope/app/utils/data_utils.py +178 -0
  10. evalscope/app/utils/localization.py +221 -0
  11. evalscope/app/utils/text_utils.py +119 -0
  12. evalscope/app/utils/visualization.py +91 -0
  13. evalscope/backend/opencompass/backend_manager.py +2 -1
  14. evalscope/backend/rag_eval/backend_manager.py +2 -1
  15. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  16. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
  17. evalscope/benchmarks/__init__.py +15 -1
  18. evalscope/benchmarks/aime/aime24_adapter.py +2 -1
  19. evalscope/benchmarks/aime/aime25_adapter.py +2 -1
  20. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
  21. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  22. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
  23. evalscope/benchmarks/arena_hard/utils.py +0 -12
  24. evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
  25. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
  26. evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
  27. evalscope/benchmarks/data_adapter.py +20 -5
  28. evalscope/benchmarks/general_arena/__init__.py +0 -0
  29. evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
  30. evalscope/benchmarks/general_arena/utils.py +226 -0
  31. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  32. evalscope/benchmarks/general_qa/general_qa_adapter.py +42 -29
  33. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  34. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
  35. evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
  36. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
  37. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
  38. evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
  39. evalscope/benchmarks/mmlu/mmlu_adapter.py +1 -1
  40. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  41. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
  42. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  43. evalscope/benchmarks/race/race_adapter.py +1 -1
  44. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
  45. evalscope/benchmarks/utils.py +1 -2
  46. evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
  47. evalscope/config.py +8 -123
  48. evalscope/evaluator/evaluator.py +15 -12
  49. evalscope/metrics/__init__.py +6 -0
  50. evalscope/{utils/utils.py → metrics/completion_parsers.py} +68 -180
  51. evalscope/metrics/llm_judge.py +105 -20
  52. evalscope/metrics/metrics.py +1 -1
  53. evalscope/models/adapters/base_adapter.py +0 -2
  54. evalscope/models/adapters/server_adapter.py +2 -2
  55. evalscope/models/custom/dummy_model.py +3 -3
  56. evalscope/perf/arguments.py +2 -16
  57. evalscope/perf/main.py +1 -1
  58. evalscope/perf/utils/analysis_result.py +24 -23
  59. evalscope/perf/utils/benchmark_util.py +1 -1
  60. evalscope/report/__init__.py +1 -1
  61. evalscope/report/utils.py +34 -15
  62. evalscope/run.py +1 -1
  63. evalscope/summarizer.py +1 -2
  64. evalscope/utils/__init__.py +63 -2
  65. evalscope/utils/argument_utils.py +64 -0
  66. evalscope/utils/import_utils.py +16 -0
  67. evalscope/utils/io_utils.py +45 -4
  68. evalscope/utils/model_utils.py +37 -1
  69. evalscope/version.py +2 -2
  70. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/METADATA +55 -26
  71. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/RECORD +90 -101
  72. tests/aigc/test_t2i.py +1 -1
  73. tests/cli/test_all.py +50 -2
  74. tests/cli/test_collection.py +1 -1
  75. tests/cli/test_custom.py +261 -0
  76. tests/cli/test_run.py +13 -37
  77. tests/perf/test_perf.py +2 -2
  78. tests/rag/test_clip_benchmark.py +2 -1
  79. tests/rag/test_mteb.py +3 -1
  80. tests/rag/test_ragas.py +3 -1
  81. tests/swift/test_run_swift_eval.py +2 -1
  82. tests/swift/test_run_swift_vlm_eval.py +2 -1
  83. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
  84. tests/utils.py +13 -0
  85. tests/vlm/test_vlmeval.py +8 -2
  86. evalscope/evaluator/rating_eval.py +0 -157
  87. evalscope/evaluator/reviewer/__init__.py +0 -1
  88. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  89. evalscope/registry/__init__.py +0 -1
  90. evalscope/registry/config/cfg_arena.yaml +0 -77
  91. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  92. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  93. evalscope/registry/config/cfg_single.yaml +0 -78
  94. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  95. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  96. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  97. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  98. evalscope/registry/data/question.jsonl +0 -80
  99. evalscope/registry/tasks/arc.yaml +0 -28
  100. evalscope/registry/tasks/bbh.yaml +0 -26
  101. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  102. evalscope/registry/tasks/ceval.yaml +0 -27
  103. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  104. evalscope/registry/tasks/cmmlu.yaml +0 -27
  105. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  106. evalscope/registry/tasks/general_qa.yaml +0 -27
  107. evalscope/registry/tasks/gsm8k.yaml +0 -29
  108. evalscope/registry/tasks/mmlu.yaml +0 -29
  109. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  110. evalscope/run_arena.py +0 -202
  111. evalscope/utils/arena_utils.py +0 -217
  112. evalscope/utils/completion_parsers.py +0 -82
  113. /evalscope/{utils → benchmarks}/filters.py +0 -0
  114. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/LICENSE +0 -0
  115. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/WHEEL +0 -0
  116. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/entry_points.txt +0 -0
  117. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/top_level.txt +0 -0
evalscope/app/ui/visualization.py
@@ -0,0 +1,36 @@
+"""
+Visualization components for the Evalscope dashboard.
+"""
+import gradio as gr
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+from ..utils.localization import get_visualization_locale
+from .multi_model import MultiModelComponents, create_multi_model_tab
+from .single_model import SingleModelComponents, create_single_model_tab
+
+if TYPE_CHECKING:
+    from .sidebar import SidebarComponents
+
+
+@dataclass
+class VisualizationComponents:
+    single_model: SingleModelComponents
+    multi_model: MultiModelComponents
+
+
+def create_visualization(sidebar: 'SidebarComponents', lang: str):
+    locale_dict = get_visualization_locale(lang)
+
+    with gr.Column(visible=True):
+        gr.Markdown(f'## {locale_dict["visualization"]}')
+        with gr.Tabs():
+            with gr.Tab(locale_dict['single_model']):
+                single = create_single_model_tab(sidebar, lang)
+
+            with gr.Tab(locale_dict['multi_model']):
+                multi = create_multi_model_tab(sidebar, lang)
+    return VisualizationComponents(
+        single_model=single,
+        multi_model=multi,
+    )

evalscope/app/utils/data_utils.py
@@ -0,0 +1,178 @@
+"""
+Data loading and processing utilities for the Evalscope dashboard.
+"""
+import glob
+import numpy as np
+import os
+import pandas as pd
+from typing import Any, Dict, List, Union
+
+from evalscope.constants import DataCollection
+from evalscope.report import Report, ReportKey, get_data_frame, get_report_list
+from evalscope.utils.io_utils import OutputsStructure, yaml_to_dict
+from evalscope.utils.logger import get_logger
+from ..constants import DATASET_TOKEN, MODEL_TOKEN, REPORT_TOKEN
+
+logger = get_logger()
+
+
+def scan_for_report_folders(root_path):
+    """Scan for folders containing reports subdirectories"""
+    logger.debug(f'Scanning for report folders in {root_path}')
+    if not os.path.exists(root_path):
+        return []
+
+    reports = []
+    # Iterate over all folders in the root path
+    for folder in glob.glob(os.path.join(root_path, '*')):
+        # Check if reports folder exists
+        reports_path = os.path.join(folder, OutputsStructure.REPORTS_DIR)
+        if not os.path.exists(reports_path):
+            continue
+
+        # Iterate over all items in reports folder
+        for model_item in glob.glob(os.path.join(reports_path, '*')):
+            if not os.path.isdir(model_item):
+                continue
+            datasets = []
+            for dataset_item in glob.glob(os.path.join(model_item, '*.json')):
+                datasets.append(os.path.splitext(os.path.basename(dataset_item))[0])
+            datasets = DATASET_TOKEN.join(datasets)
+            reports.append(
+                f'{os.path.basename(folder)}{REPORT_TOKEN}{os.path.basename(model_item)}{MODEL_TOKEN}{datasets}')
+
+    reports = sorted(reports, reverse=True)
+    logger.debug(f'reports: {reports}')
+    return reports
+
+
+def process_report_name(report_name: str):
+    prefix, report_name = report_name.split(REPORT_TOKEN)
+    model_name, datasets = report_name.split(MODEL_TOKEN)
+    datasets = datasets.split(DATASET_TOKEN)
+    return prefix, model_name, datasets
+
+
+def load_single_report(root_path: str, report_name: str):
+    prefix, model_name, datasets = process_report_name(report_name)
+    report_path_list = os.path.join(root_path, prefix, OutputsStructure.REPORTS_DIR, model_name)
+    report_list = get_report_list([report_path_list])
+
+    config_files = glob.glob(os.path.join(root_path, prefix, OutputsStructure.CONFIGS_DIR, '*.yaml'))
+    if not config_files:
+        raise FileNotFoundError(
+            f'No configuration files found in {os.path.join(root_path, prefix, OutputsStructure.CONFIGS_DIR)}')
+    task_cfg_path = config_files[0]
+    task_cfg = yaml_to_dict(task_cfg_path)
+    return report_list, datasets, task_cfg
+
+
+def load_multi_report(root_path: str, report_names: List[str]):
+    report_list = []
+    for report_name in report_names:
+        prefix, model_name, datasets = process_report_name(report_name)
+        report_path_list = os.path.join(root_path, prefix, OutputsStructure.REPORTS_DIR, model_name)
+        reports = get_report_list([report_path_list])
+        report_list.extend(reports)
+    return report_list
+
+
+def get_acc_report_df(report_list: List[Report]):
+    data_dict = []
+    for report in report_list:
+        if report.name == DataCollection.NAME:
+            for metric in report.metrics:
+                for category in metric.categories:
+                    item = {
+                        ReportKey.model_name: report.model_name,
+                        ReportKey.dataset_name: '/'.join(category.name),
+                        ReportKey.score: category.score,
+                        ReportKey.num: category.num,
+                    }
+                    data_dict.append(item)
+        else:
+            item = {
+                ReportKey.model_name: report.model_name,
+                ReportKey.dataset_name: report.dataset_name,
+                ReportKey.score: report.score,
+                ReportKey.num: report.metrics[0].num,
+            }
+            data_dict.append(item)
+    df = pd.DataFrame.from_dict(data_dict, orient='columns')
+
+    styler = style_df(df, columns=[ReportKey.score])
+    return df, styler
+
+
+def style_df(df: pd.DataFrame, columns: List[str] = None):
+    # Apply background gradient to the specified columns
+    styler = df.style.background_gradient(subset=columns, cmap='RdYlGn', vmin=0.0, vmax=1.0, axis=0)
+    # Format the dataframe with a precision of 4 decimal places
+    styler.format(precision=4)
+    return styler
+
+
+def get_compare_report_df(acc_df: pd.DataFrame):
+    df = acc_df.pivot_table(index=ReportKey.model_name, columns=ReportKey.dataset_name, values=ReportKey.score)
+    df.reset_index(inplace=True)
+
+    styler = style_df(df)
+    return df, styler
+
+
+def get_single_dataset_df(df: pd.DataFrame, dataset_name: str):
+    df = df[df[ReportKey.dataset_name] == dataset_name]
+    styler = style_df(df, columns=[ReportKey.score])
+    return df, styler
+
+
+def get_report_analysis(report_list: List[Report], dataset_name: str) -> str:
+    for report in report_list:
+        if report.dataset_name == dataset_name:
+            return report.analysis
+    return 'N/A'
+
+
+def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subset_name: str):
+    data_path = os.path.join(work_dir, OutputsStructure.REVIEWS_DIR, model_name)
+    subset_name = subset_name.replace('/', '_')  # for collection report
+    review_path = os.path.join(data_path, f'{dataset_name}_{subset_name}.jsonl')
+    logger.debug(f'review_path: {review_path}')
+    origin_df = pd.read_json(review_path, lines=True)
+
+    ds = []
+    for i, item in origin_df.iterrows():
+        raw_input = item['raw_input']
+        sample_index = item['index']
+        for choice_index, choice in enumerate(item['choices']):
+            raw_pred_answer = choice['message']['content']
+            parsed_gold_answer = choice['review']['gold']
+            parsed_pred_answer = choice['review']['pred']
+            score = choice['review']['result']
+            raw_d = {
+                'Index': f'{sample_index}_{choice_index}',
+                'Input': raw_input,
+                'Generated': raw_pred_answer if raw_pred_answer != parsed_pred_answer else '*Same as Pred*',
+                'Gold': parsed_gold_answer if parsed_gold_answer != raw_input else '*Same as Input*',
+                'Pred': parsed_pred_answer,
+                'Score': score,
+                'NScore': normalize_score(score)
+            }
+            ds.append(raw_d)
+
+    df_subset = pd.DataFrame(ds)
+    return df_subset
+
+
+def normalize_score(score):
+    try:
+        if isinstance(score, bool):
+            return 1.0 if score else 0.0
+        elif isinstance(score, dict):
+            for key in score:
+                return float(score[key])
+            return 0.0
+        else:
+            return float(score)
+    except (ValueError, TypeError):
+        return 0.0

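
The review files read by get_model_prediction store scores as booleans, dicts keyed by metric name, or plain numbers, and normalize_score collapses all of them to a float for the NScore column. A minimal sketch of that behaviour, not part of the diff (the import path is assumed from the new file location):

from evalscope.app.utils.data_utils import normalize_score  # path assumed from the new module layout

assert normalize_score(True) == 1.0            # booleans map to 1.0 / 0.0
assert normalize_score({'acc': 0.75}) == 0.75  # dicts: first value, cast to float
assert normalize_score('0.5') == 0.5           # numeric strings are cast
assert normalize_score('n/a') == 0.0           # unparseable values fall back to 0.0
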
evalscope/app/utils/localization.py
@@ -0,0 +1,221 @@
+"""
+Localization utilities for the Evalscope dashboard.
+"""
+from typing import Any, Dict
+
+
+def get_sidebar_locale(lang: str) -> Dict[str, str]:
+    locale_dict = {
+        'settings': {
+            'zh': '设置',
+            'en': 'Settings'
+        },
+        'report_root_path': {
+            'zh': '报告根路径',
+            'en': 'Report Root Path'
+        },
+        'select_reports': {
+            'zh': '请选择报告',
+            'en': 'Select Reports'
+        },
+        'load_btn': {
+            'zh': '加载并查看',
+            'en': 'Load & View'
+        },
+        'note': {
+            'zh': '请选择报告并点击`加载并查看`来查看数据',
+            'en': 'Please select reports and click `Load & View` to view the data'
+        },
+        'warning': {
+            'zh': '没有找到报告,请检查路径',
+            'en': 'No reports found, please check the path'
+        }
+    }
+    return {k: v[lang] for k, v in locale_dict.items()}
+
+
+def get_visualization_locale(lang: str) -> Dict[str, str]:
+    locale_dict = {
+        'visualization': {
+            'zh': '可视化',
+            'en': 'Visualization'
+        },
+        'single_model': {
+            'zh': '单模型',
+            'en': 'Single Model'
+        },
+        'multi_model': {
+            'zh': '多模型',
+            'en': 'Multi Model'
+        }
+    }
+    return {k: v[lang] for k, v in locale_dict.items()}
+
+
+def get_single_model_locale(lang: str) -> Dict[str, str]:
+    locale_dict = {
+        'select_report': {
+            'zh': '选择报告',
+            'en': 'Select Report'
+        },
+        'task_config': {
+            'zh': '任务配置',
+            'en': 'Task Config'
+        },
+        'datasets_overview': {
+            'zh': '数据集概览',
+            'en': 'Datasets Overview'
+        },
+        'dataset_components': {
+            'zh': '数据集组成',
+            'en': 'Dataset Components'
+        },
+        'dataset_scores': {
+            'zh': '数据集分数',
+            'en': 'Dataset Scores'
+        },
+        'report_analysis': {
+            'zh': '报告智能分析',
+            'en': 'Report Intelligent Analysis'
+        },
+        'dataset_scores_table': {
+            'zh': '数据集分数表',
+            'en': 'Dataset Scores Table'
+        },
+        'dataset_details': {
+            'zh': '数据集详情',
+            'en': 'Dataset Details'
+        },
+        'select_dataset': {
+            'zh': '选择数据集',
+            'en': 'Select Dataset'
+        },
+        'model_prediction': {
+            'zh': '模型预测',
+            'en': 'Model Prediction'
+        },
+        'select_subset': {
+            'zh': '选择子集',
+            'en': 'Select Subset'
+        },
+        'answer_mode': {
+            'zh': '答案模式',
+            'en': 'Answer Mode'
+        },
+        'page': {
+            'zh': '页码',
+            'en': 'Page'
+        },
+        'score_threshold': {
+            'zh': '分数阈值',
+            'en': 'Score Threshold'
+        },
+    }
+    return {k: v[lang] for k, v in locale_dict.items()}
+
+
+def get_multi_model_locale(lang: str) -> Dict[str, str]:
+    locale_dict = {
+        'select_reports': {
+            'zh': '请选择报告',
+            'en': 'Select Reports'
+        },
+        'models_overview': {
+            'zh': '模型概览',
+            'en': 'Models Overview'
+        },
+        'model_radar': {
+            'zh': '模型对比雷达',
+            'en': 'Model Comparison Radar'
+        },
+        'model_scores': {
+            'zh': '模型对比分数',
+            'en': 'Model Comparison Scores'
+        },
+        'model_comparison_details': {
+            'zh': '模型对比详情',
+            'en': 'Model Comparison Details'
+        },
+        'select_model_a': {
+            'zh': '选择模型A',
+            'en': 'Select Model A'
+        },
+        'select_model_b': {
+            'zh': '选择模型B',
+            'en': 'Select Model B'
+        },
+        'select_dataset': {
+            'zh': '选择数据集',
+            'en': 'Select Dataset'
+        },
+        'model_predictions': {
+            'zh': '模型预测',
+            'en': 'Model Predictions'
+        },
+        'select_subset': {
+            'zh': '选择子集',
+            'en': 'Select Subset'
+        },
+        'answer_mode': {
+            'zh': '答案模式',
+            'en': 'Answer Mode'
+        },
+        'score_threshold': {
+            'zh': '分数阈值',
+            'en': 'Score Threshold'
+        },
+        'comparison_counts': {
+            'zh': '对比统计',
+            'en': 'Comparison Counts'
+        },
+        'page': {
+            'zh': '页码',
+            'en': 'Page'
+        },
+        'input': {
+            'zh': '输入',
+            'en': 'Input'
+        },
+        'gold_answer': {
+            'zh': '标准答案',
+            'en': 'Gold Answer'
+        },
+        'score': {
+            'zh': '分数',
+            'en': 'Score'
+        },
+        'normalized_score': {
+            'zh': '归一化分数',
+            'en': 'Normalized Score'
+        },
+        'prediction': {
+            'zh': '预测',
+            'en': 'Prediction'
+        },
+        'generated': {
+            'zh': '生成结果',
+            'en': 'Generated'
+        }
+    }
+    return {k: v[lang] for k, v in locale_dict.items()}
+
+
+def get_app_locale(lang: str) -> Dict[str, str]:
+    locale_dict = {
+        'title': {
+            'zh': '📈 EvalScope 看板',
+            'en': '📈 Evalscope Dashboard'
+        },
+        'star_beggar': {
+            'zh':
+            '喜欢<a href=\"https://github.com/modelscope/evalscope\" target=\"_blank\">EvalScope</a>就动动手指给我们加个star吧 🥺 ',
+            'en':
+            'If you like <a href=\"https://github.com/modelscope/evalscope\" target=\"_blank\">EvalScope</a>, '
+            'please take a few seconds to star us 🥺 '
+        },
+        'note': {
+            'zh': '请选择报告',
+            'en': 'Please select reports'
+        }
+    }
+    return {k: v[lang] for k, v in locale_dict.items()}

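
Each helper above keeps both languages in one nested dict and returns a flat mapping for the requested language, so the UI code only ever sees the selected locale. A small usage sketch, not part of the diff (import path assumed from the new module layout):

from evalscope.app.utils.localization import get_visualization_locale

en = get_visualization_locale('en')  # {'visualization': 'Visualization', 'single_model': 'Single Model', ...}
zh = get_visualization_locale('zh')  # {'visualization': '可视化', ...}
print(en['single_model'])            # -> 'Single Model'
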
evalscope/app/utils/text_utils.py
@@ -0,0 +1,119 @@
+"""
+Text processing utilities for the Evalscope dashboard.
+"""
+import json
+import numpy as np
+import os
+import pandas as pd
+import re
+from typing import Any, Dict, List
+
+from evalscope.utils.logger import get_logger
+from ..constants import LATEX_DELIMITERS
+
+logger = get_logger()
+
+
+def convert_markdown_image(text):
+    if not os.path.isfile(text):
+        return text
+    # Convert the image path to a markdown image tag
+    if text.endswith('.png') or text.endswith('.jpg') or text.endswith('.jpeg'):
+        text = os.path.abspath(text)
+        image_tag = f'![image](gradio_api/file={text})'
+        logger.debug(f'Converting image path to markdown: {text} -> {image_tag}')
+        return image_tag
+    return text
+
+
+def convert_html_tags(text):
+    # match begin label
+    text = re.sub(r'<(\w+)>', r'[\1]', text)
+    # match end label
+    text = re.sub(r'</(\w+)>', r'[/\1]', text)
+    return text
+
+
+def process_string(string: str, max_length: int = 2048) -> str:
+    string = convert_html_tags(string)  # for display labels e.g.
+    if max_length and len(string) > max_length:
+        return f'{string[:max_length // 2]}......{string[-max_length // 2:]}'
+    return string
+
+
+def dict_to_markdown(data) -> str:
+    markdown_lines = []
+
+    for key, value in data.items():
+        bold_key = f'**{key}**'
+
+        if isinstance(value, list):
+            value_str = '\n' + '\n'.join([f'- {process_model_prediction(item, max_length=None)}' for item in value])
+        elif isinstance(value, dict):
+            value_str = dict_to_markdown(value)
+        else:
+            value_str = str(value)
+
+        value_str = process_string(value_str, max_length=None)  # Convert HTML tags but don't truncate
+        markdown_line = f'{bold_key}:\n{value_str}'
+        markdown_lines.append(markdown_line)
+
+    return '\n\n'.join(markdown_lines)
+
+
+def process_model_prediction_old(item: Any, max_length: int = 2048) -> str:
+    """
+    Process model prediction output into a formatted string.
+
+    Args:
+        item: The item to process. Can be a string, list, or dictionary.
+        max_length: The maximum length of the output string.
+
+    Returns:
+        A formatted string representation of the input.
+    """
+    if isinstance(item, dict):
+        result = dict_to_markdown(item)
+    elif isinstance(item, list):
+        result = '\n'.join([f'- {process_model_prediction(i, max_length=None)}' for i in item])
+    else:
+        result = str(item)
+
+    # Apply HTML tag conversion and truncation only at the final output
+    if max_length is not None:
+        return process_string(result, max_length)
+    return result
+
+
+def process_model_prediction(item: Any, max_length: int = 32000) -> str:
+    if isinstance(item, (dict, list)):
+        result = json.dumps(item, ensure_ascii=False, indent=2)
+        result = f'```json\n{result}\n```'
+    else:
+        result = str(item)
+
+    # Apply HTML tag conversion and truncation only at the final output
+    if max_length is not None:
+        return process_string(result, max_length)
+
+    return result
+
+
+def process_json_content(content: Any) -> str:
+    """
+    Process JSON content to convert it into a markdown-friendly format.
+
+    Args:
+        content (str): The JSON content as a string.
+
+    Returns:
+        str: The processed content formatted for markdown display.
+    """
+    if isinstance(content, (np.bool_, np.int_, np.float_)):
+        content = str(content)
+
+    if isinstance(content, str):
+        content = {'content': content}
+
+    content_json = json.dumps(content, ensure_ascii=False, indent=2)
+    return content_json

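
The new process_model_prediction renders dicts and lists as an indented JSON block instead of the older recursive markdown conversion, while process_string still rewrites HTML-like tags to bracketed labels and truncates long strings around the middle. A quick sketch, not part of the diff (import path assumed from the new module layout):

from evalscope.app.utils.text_utils import process_model_prediction, process_string

text = process_model_prediction({'answer': 'A', 'score': 1})
# -> the dict serialized via json.dumps(..., indent=2) and wrapped in a ```json fenced block

print(process_string('<think>step 1</think>'))     # -> '[think]step 1[/think]'
print(process_string('x' * 5000, max_length=100))  # -> first 50 chars + '......' + last 50 chars
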
evalscope/app/utils/visualization.py
@@ -0,0 +1,91 @@
+"""
+Visualization utilities for the Evalscope dashboard.
+"""
+import numpy as np
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+from typing import List
+
+from evalscope.constants import DataCollection
+from evalscope.report import Report, ReportKey, get_data_frame
+from evalscope.utils.logger import get_logger
+from ..constants import DEFAULT_BAR_WIDTH, PLOTLY_THEME
+
+logger = get_logger()
+
+
+def plot_single_report_scores(df: pd.DataFrame):
+    if df is None:
+        return None
+    logger.debug(f'df: {df}')
+    plot = px.bar(df, x=df[ReportKey.dataset_name], y=df[ReportKey.score], text=df[ReportKey.score])
+
+    width = DEFAULT_BAR_WIDTH if len(df[ReportKey.dataset_name]) <= 5 else None
+    plot.update_traces(width=width, texttemplate='%{text:.2f}', textposition='outside')
+    plot.update_layout(uniformtext_minsize=12, uniformtext_mode='hide', yaxis=dict(range=[0, 1]), template=PLOTLY_THEME)
+    return plot
+
+
+def plot_single_report_sunburst(report_list: List[Report]):
+    if report_list[0].name == DataCollection.NAME:
+        df = get_data_frame(report_list=report_list)
+        categories = sorted([i for i in df.columns if i.startswith(ReportKey.category_prefix)])
+        path = categories + [ReportKey.subset_name]
+    else:
+        df = get_data_frame(report_list=report_list, flatten_metrics=False)
+        categories = sorted([i for i in df.columns if i.startswith(ReportKey.category_prefix)])
+        path = [ReportKey.dataset_name] + categories + [ReportKey.subset_name]
+    logger.debug(f'df: {df}')
+    df[categories] = df[categories].fillna('default')  # NOTE: fillna for empty categories
+
+    plot = px.sunburst(
+        df,
+        path=path,
+        values=ReportKey.num,
+        color=ReportKey.score,
+        color_continuous_scale='RdYlGn',  # see https://plotly.com/python/builtin-colorscales/
+        color_continuous_midpoint=np.average(df[ReportKey.score], weights=df[ReportKey.num]),
+        template=PLOTLY_THEME,
+        maxdepth=4)
+    plot.update_traces(insidetextorientation='radial')
+    plot.update_layout(margin=dict(t=10, l=10, r=10, b=10), coloraxis=dict(cmin=0, cmax=1), height=600)
+    return plot
+
+
+def plot_single_dataset_scores(df: pd.DataFrame):
+    # TODO: add metric radio and replace category name
+    plot = px.bar(
+        df,
+        x=df[ReportKey.metric_name],
+        y=df[ReportKey.score],
+        color=df[ReportKey.subset_name],
+        text=df[ReportKey.score],
+        barmode='group')
+
+    width = 0.2 if len(df[ReportKey.subset_name]) <= 3 else None
+    plot.update_traces(width=width, texttemplate='%{text:.2f}', textposition='outside')
+    plot.update_layout(uniformtext_minsize=12, uniformtext_mode='hide', yaxis=dict(range=[0, 1]), template=PLOTLY_THEME)
+    return plot
+
+
+def plot_multi_report_radar(df: pd.DataFrame):
+    fig = go.Figure()
+
+    grouped = df.groupby(ReportKey.model_name)
+    common_datasets = set.intersection(*[set(group[ReportKey.dataset_name]) for _, group in grouped])
+
+    for model_name, group in grouped:
+        common_group = group[group[ReportKey.dataset_name].isin(common_datasets)]
+        fig.add_trace(
+            go.Scatterpolar(
+                r=common_group[ReportKey.score],
+                theta=common_group[ReportKey.dataset_name],
+                name=model_name,
+                fill='toself'))
+
+    fig.update_layout(
+        template=PLOTLY_THEME,
+        polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
+        margin=dict(t=20, l=20, r=20, b=20))
+    return fig

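
plot_multi_report_radar only plots datasets that every selected model has results for, so models evaluated on disjoint benchmarks contribute nothing to the radar. A toy sketch, not part of the diff (model and dataset names are invented; the plotting helper's module path is assumed from the new file layout):

import pandas as pd

from evalscope.app.utils.visualization import plot_multi_report_radar  # path assumed
from evalscope.report import ReportKey  # same column keys the dashboard uses

df = pd.DataFrame({
    ReportKey.model_name: ['model-a', 'model-a', 'model-b', 'model-b'],
    ReportKey.dataset_name: ['gsm8k', 'arc', 'gsm8k', 'arc'],
    ReportKey.score: [0.81, 0.66, 0.74, 0.70],
})
fig = plot_multi_report_radar(df)  # one Scatterpolar trace per model over the shared datasets
fig.show()
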
evalscope/backend/opencompass/backend_manager.py
@@ -8,7 +8,8 @@ from typing import Optional, Union
 
 from evalscope.backend.base import BackendManager
 from evalscope.backend.opencompass.api_meta_template import get_template
-from evalscope.utils import get_module_path, get_valid_list, is_module_installed
+from evalscope.utils.import_utils import get_module_path, is_module_installed
+from evalscope.utils.io_utils import get_valid_list
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()

evalscope/backend/rag_eval/backend_manager.py
@@ -2,7 +2,8 @@ import os
 from typing import Optional, Union
 
 from evalscope.backend.base import BackendManager
-from evalscope.utils import get_valid_list, is_module_installed
+from evalscope.utils.import_utils import is_module_installed
+from evalscope.utils.io_utils import get_valid_list
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()

evalscope/backend/rag_eval/utils/embedding.py
@@ -12,8 +12,8 @@ from typing import Dict, List, Optional, Union
 
 from evalscope.backend.rag_eval.utils.tools import download_model
 from evalscope.constants import HubType
+from evalscope.utils.argument_utils import get_supported_params
 from evalscope.utils.logger import get_logger
-from evalscope.utils.utils import get_supported_params
 
 logger = get_logger()
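
The last three hunks are part of a wider reshuffle of the old evalscope.utils grab-bag into dedicated modules (import_utils, io_utils, argument_utils; see items 64-68 in the file list). Downstream code that imported these helpers from the old locations may need the same one-line changes; the diff does not show whether the old names are still re-exported from evalscope.utils. The migration, mirroring the hunks above:

# 0.16.2-style imports:
#   from evalscope.utils import get_valid_list, is_module_installed
#   from evalscope.utils.utils import get_supported_params

# 0.17.0 equivalents:
from evalscope.utils.import_utils import is_module_installed
from evalscope.utils.io_utils import get_valid_list
from evalscope.utils.argument_utils import get_supported_params
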