evalscope 0.16.3__py3-none-any.whl → 0.17.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/app/app.py +9 -762
- evalscope/app/constants.py +1 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +52 -0
- evalscope/app/ui/multi_model.py +323 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +202 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +178 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +91 -0
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/backend_manager.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
- evalscope/benchmarks/__init__.py +15 -1
- evalscope/benchmarks/aime/aime24_adapter.py +2 -1
- evalscope/benchmarks/aime/aime25_adapter.py +2 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/utils.py +0 -12
- evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
- evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
- evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
- evalscope/benchmarks/data_adapter.py +29 -9
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
- evalscope/benchmarks/general_arena/utils.py +226 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +44 -30
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +118 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
- evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
- evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
- evalscope/benchmarks/mmlu/mmlu_adapter.py +2 -2
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/benchmarks/race/race_adapter.py +1 -1
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
- evalscope/benchmarks/utils.py +2 -2
- evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
- evalscope/config.py +8 -123
- evalscope/constants.py +5 -21
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +20 -15
- evalscope/metrics/__init__.py +9 -1
- evalscope/{utils/utils.py → metrics/completion_parsers.py} +71 -176
- evalscope/metrics/llm_judge.py +106 -20
- evalscope/metrics/metrics.py +20 -8
- evalscope/models/__init__.py +4 -8
- evalscope/models/adapters/__init__.py +4 -9
- evalscope/models/adapters/base_adapter.py +4 -0
- evalscope/models/adapters/bfcl_adapter.py +2 -0
- evalscope/models/adapters/chat_adapter.py +3 -0
- evalscope/models/adapters/choice_adapter.py +4 -0
- evalscope/models/adapters/custom_adapter.py +7 -3
- evalscope/models/adapters/server_adapter.py +4 -2
- evalscope/models/adapters/t2i_adapter.py +3 -0
- evalscope/models/adapters/tau_bench_adapter.py +189 -0
- evalscope/models/custom/dummy_model.py +3 -3
- evalscope/models/register.py +0 -14
- evalscope/perf/arguments.py +15 -16
- evalscope/perf/benchmark.py +38 -39
- evalscope/perf/http_client.py +30 -86
- evalscope/perf/main.py +3 -3
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +22 -4
- evalscope/perf/plugin/api/custom_api.py +212 -55
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +105 -0
- evalscope/perf/plugin/api/openai_api.py +17 -19
- evalscope/perf/plugin/datasets/__init__.py +10 -7
- evalscope/perf/plugin/datasets/base.py +22 -1
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +4 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +2 -1
- evalscope/perf/plugin/datasets/random_dataset.py +15 -4
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +14 -20
- evalscope/perf/utils/db_util.py +79 -61
- evalscope/report/__init__.py +1 -1
- evalscope/report/utils.py +34 -15
- evalscope/run.py +1 -1
- evalscope/summarizer.py +1 -2
- evalscope/utils/__init__.py +63 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/import_utils.py +16 -0
- evalscope/utils/io_utils.py +55 -4
- evalscope/utils/model_utils.py +37 -1
- evalscope/version.py +2 -2
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/METADATA +100 -51
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/RECORD +129 -133
- tests/aigc/test_t2i.py +1 -1
- tests/cli/test_all.py +68 -4
- tests/cli/test_collection.py +1 -1
- tests/cli/test_custom.py +261 -0
- tests/cli/test_run.py +34 -70
- tests/perf/test_perf.py +31 -4
- tests/rag/test_clip_benchmark.py +2 -1
- tests/rag/test_mteb.py +3 -1
- tests/rag/test_ragas.py +3 -1
- tests/swift/test_run_swift_eval.py +2 -1
- tests/swift/test_run_swift_vlm_eval.py +2 -1
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
- tests/utils.py +13 -0
- tests/vlm/test_vlmeval.py +8 -2
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/models/model.py +0 -189
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- /evalscope/{utils → benchmarks}/filters.py +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/LICENSE +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/WHEEL +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/top_level.txt +0 -0
evalscope/app/ui/visualization.py
@@ -0,0 +1,36 @@
+"""
+Visualization components for the Evalscope dashboard.
+"""
+import gradio as gr
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+from ..utils.localization import get_visualization_locale
+from .multi_model import MultiModelComponents, create_multi_model_tab
+from .single_model import SingleModelComponents, create_single_model_tab
+
+if TYPE_CHECKING:
+    from .sidebar import SidebarComponents
+
+
+@dataclass
+class VisualizationComponents:
+    single_model: SingleModelComponents
+    multi_model: MultiModelComponents
+
+
+def create_visualization(sidebar: 'SidebarComponents', lang: str):
+    locale_dict = get_visualization_locale(lang)
+
+    with gr.Column(visible=True):
+        gr.Markdown(f'## {locale_dict["visualization"]}')
+        with gr.Tabs():
+            with gr.Tab(locale_dict['single_model']):
+                single = create_single_model_tab(sidebar, lang)
+
+            with gr.Tab(locale_dict['multi_model']):
+                multi = create_multi_model_tab(sidebar, lang)
+    return VisualizationComponents(
+        single_model=single,
+        multi_model=multi,
+    )
evalscope/app/utils/data_utils.py
@@ -0,0 +1,178 @@
+"""
+Data loading and processing utilities for the Evalscope dashboard.
+"""
+import glob
+import numpy as np
+import os
+import pandas as pd
+from typing import Any, Dict, List, Union
+
+from evalscope.constants import DataCollection
+from evalscope.report import Report, ReportKey, get_data_frame, get_report_list
+from evalscope.utils.io_utils import OutputsStructure, yaml_to_dict
+from evalscope.utils.logger import get_logger
+from ..constants import DATASET_TOKEN, MODEL_TOKEN, REPORT_TOKEN
+
+logger = get_logger()
+
+
+def scan_for_report_folders(root_path):
+    """Scan for folders containing reports subdirectories"""
+    logger.debug(f'Scanning for report folders in {root_path}')
+    if not os.path.exists(root_path):
+        return []
+
+    reports = []
+    # Iterate over all folders in the root path
+    for folder in glob.glob(os.path.join(root_path, '*')):
+        # Check if reports folder exists
+        reports_path = os.path.join(folder, OutputsStructure.REPORTS_DIR)
+        if not os.path.exists(reports_path):
+            continue
+
+        # Iterate over all items in reports folder
+        for model_item in glob.glob(os.path.join(reports_path, '*')):
+            if not os.path.isdir(model_item):
+                continue
+            datasets = []
+            for dataset_item in glob.glob(os.path.join(model_item, '*.json')):
+                datasets.append(os.path.splitext(os.path.basename(dataset_item))[0])
+            datasets = DATASET_TOKEN.join(datasets)
+            reports.append(
+                f'{os.path.basename(folder)}{REPORT_TOKEN}{os.path.basename(model_item)}{MODEL_TOKEN}{datasets}')
+
+    reports = sorted(reports, reverse=True)
+    logger.debug(f'reports: {reports}')
+    return reports
+
+
+def process_report_name(report_name: str):
+    prefix, report_name = report_name.split(REPORT_TOKEN)
+    model_name, datasets = report_name.split(MODEL_TOKEN)
+    datasets = datasets.split(DATASET_TOKEN)
+    return prefix, model_name, datasets
+
+
+def load_single_report(root_path: str, report_name: str):
+    prefix, model_name, datasets = process_report_name(report_name)
+    report_path_list = os.path.join(root_path, prefix, OutputsStructure.REPORTS_DIR, model_name)
+    report_list = get_report_list([report_path_list])
+
+    config_files = glob.glob(os.path.join(root_path, prefix, OutputsStructure.CONFIGS_DIR, '*.yaml'))
+    if not config_files:
+        raise FileNotFoundError(
+            f'No configuration files found in {os.path.join(root_path, prefix, OutputsStructure.CONFIGS_DIR)}')
+    task_cfg_path = config_files[0]
+    task_cfg = yaml_to_dict(task_cfg_path)
+    return report_list, datasets, task_cfg
+
+
+def load_multi_report(root_path: str, report_names: List[str]):
+    report_list = []
+    for report_name in report_names:
+        prefix, model_name, datasets = process_report_name(report_name)
+        report_path_list = os.path.join(root_path, prefix, OutputsStructure.REPORTS_DIR, model_name)
+        reports = get_report_list([report_path_list])
+        report_list.extend(reports)
+    return report_list
+
+
+def get_acc_report_df(report_list: List[Report]):
+    data_dict = []
+    for report in report_list:
+        if report.name == DataCollection.NAME:
+            for metric in report.metrics:
+                for category in metric.categories:
+                    item = {
+                        ReportKey.model_name: report.model_name,
+                        ReportKey.dataset_name: '/'.join(category.name),
+                        ReportKey.score: category.score,
+                        ReportKey.num: category.num,
+                    }
+                    data_dict.append(item)
+        else:
+            item = {
+                ReportKey.model_name: report.model_name,
+                ReportKey.dataset_name: report.dataset_name,
+                ReportKey.score: report.score,
+                ReportKey.num: report.metrics[0].num,
+            }
+            data_dict.append(item)
+    df = pd.DataFrame.from_dict(data_dict, orient='columns')
+
+    styler = style_df(df, columns=[ReportKey.score])
+    return df, styler
+
+
+def style_df(df: pd.DataFrame, columns: List[str] = None):
+    # Apply background gradient to the specified columns
+    styler = df.style.background_gradient(subset=columns, cmap='RdYlGn', vmin=0.0, vmax=1.0, axis=0)
+    # Format the dataframe with a precision of 4 decimal places
+    styler.format(precision=4)
+    return styler
+
+
+def get_compare_report_df(acc_df: pd.DataFrame):
+    df = acc_df.pivot_table(index=ReportKey.model_name, columns=ReportKey.dataset_name, values=ReportKey.score)
+    df.reset_index(inplace=True)
+
+    styler = style_df(df)
+    return df, styler
+
+
+def get_single_dataset_df(df: pd.DataFrame, dataset_name: str):
+    df = df[df[ReportKey.dataset_name] == dataset_name]
+    styler = style_df(df, columns=[ReportKey.score])
+    return df, styler
+
+
+def get_report_analysis(report_list: List[Report], dataset_name: str) -> str:
+    for report in report_list:
+        if report.dataset_name == dataset_name:
+            return report.analysis
+    return 'N/A'
+
+
+def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subset_name: str):
+    data_path = os.path.join(work_dir, OutputsStructure.REVIEWS_DIR, model_name)
+    subset_name = subset_name.replace('/', '_')  # for collection report
+    review_path = os.path.join(data_path, f'{dataset_name}_{subset_name}.jsonl')
+    logger.debug(f'review_path: {review_path}')
+    origin_df = pd.read_json(review_path, lines=True)
+
+    ds = []
+    for i, item in origin_df.iterrows():
+        raw_input = item['raw_input']
+        sample_index = item['index']
+        for choice_index, choice in enumerate(item['choices']):
+            raw_pred_answer = choice['message']['content']
+            parsed_gold_answer = choice['review']['gold']
+            parsed_pred_answer = choice['review']['pred']
+            score = choice['review']['result']
+            raw_d = {
+                'Index': f'{sample_index}_{choice_index}',
+                'Input': raw_input,
+                'Generated': raw_pred_answer if raw_pred_answer != parsed_pred_answer else '*Same as Pred*',
+                'Gold': parsed_gold_answer if parsed_gold_answer != raw_input else '*Same as Input*',
+                'Pred': parsed_pred_answer,
+                'Score': score,
+                'NScore': normalize_score(score)
+            }
+            ds.append(raw_d)
+
+    df_subset = pd.DataFrame(ds)
+    return df_subset
+
+
+def normalize_score(score):
+    try:
+        if isinstance(score, bool):
+            return 1.0 if score else 0.0
+        elif isinstance(score, dict):
+            for key in score:
+                return float(score[key])
+            return 0.0
+        else:
+            return float(score)
+    except (ValueError, TypeError):
+        return 0.0
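For orientation: `get_model_prediction` above feeds each review `result` through `normalize_score`, which collapses booleans, per-metric dicts, and plain numbers into a single float. A minimal behavioural sketch (not part of the diff), assuming the module is importable as `evalscope.app.utils.data_utils`:

```python
# Illustrative sketch only; the import path is inferred from the file list above.
from evalscope.app.utils.data_utils import normalize_score

normalize_score(True)            # 1.0  (booleans map to 1.0 / 0.0)
normalize_score({'acc': 0.75})   # 0.75 (first value of a metric dict)
normalize_score('0.5')           # 0.5  (anything float() accepts)
normalize_score('n/a')           # 0.0  (ValueError/TypeError falls back to 0.0)
```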
evalscope/app/utils/localization.py
@@ -0,0 +1,221 @@
+"""
+Localization utilities for the Evalscope dashboard.
+"""
+from typing import Any, Dict
+
+
+def get_sidebar_locale(lang: str) -> Dict[str, str]:
+    locale_dict = {
+        'settings': {
+            'zh': '设置',
+            'en': 'Settings'
+        },
+        'report_root_path': {
+            'zh': '报告根路径',
+            'en': 'Report Root Path'
+        },
+        'select_reports': {
+            'zh': '请选择报告',
+            'en': 'Select Reports'
+        },
+        'load_btn': {
+            'zh': '加载并查看',
+            'en': 'Load & View'
+        },
+        'note': {
+            'zh': '请选择报告并点击`加载并查看`来查看数据',
+            'en': 'Please select reports and click `Load & View` to view the data'
+        },
+        'warning': {
+            'zh': '没有找到报告,请检查路径',
+            'en': 'No reports found, please check the path'
+        }
+    }
+    return {k: v[lang] for k, v in locale_dict.items()}
+
+
+def get_visualization_locale(lang: str) -> Dict[str, str]:
+    locale_dict = {
+        'visualization': {
+            'zh': '可视化',
+            'en': 'Visualization'
+        },
+        'single_model': {
+            'zh': '单模型',
+            'en': 'Single Model'
+        },
+        'multi_model': {
+            'zh': '多模型',
+            'en': 'Multi Model'
+        }
+    }
+    return {k: v[lang] for k, v in locale_dict.items()}
+
+
+def get_single_model_locale(lang: str) -> Dict[str, str]:
+    locale_dict = {
+        'select_report': {
+            'zh': '选择报告',
+            'en': 'Select Report'
+        },
+        'task_config': {
+            'zh': '任务配置',
+            'en': 'Task Config'
+        },
+        'datasets_overview': {
+            'zh': '数据集概览',
+            'en': 'Datasets Overview'
+        },
+        'dataset_components': {
+            'zh': '数据集组成',
+            'en': 'Dataset Components'
+        },
+        'dataset_scores': {
+            'zh': '数据集分数',
+            'en': 'Dataset Scores'
+        },
+        'report_analysis': {
+            'zh': '报告智能分析',
+            'en': 'Report Intelligent Analysis'
+        },
+        'dataset_scores_table': {
+            'zh': '数据集分数表',
+            'en': 'Dataset Scores Table'
+        },
+        'dataset_details': {
+            'zh': '数据集详情',
+            'en': 'Dataset Details'
+        },
+        'select_dataset': {
+            'zh': '选择数据集',
+            'en': 'Select Dataset'
+        },
+        'model_prediction': {
+            'zh': '模型预测',
+            'en': 'Model Prediction'
+        },
+        'select_subset': {
+            'zh': '选择子集',
+            'en': 'Select Subset'
+        },
+        'answer_mode': {
+            'zh': '答案模式',
+            'en': 'Answer Mode'
+        },
+        'page': {
+            'zh': '页码',
+            'en': 'Page'
+        },
+        'score_threshold': {
+            'zh': '分数阈值',
+            'en': 'Score Threshold'
+        },
+    }
+    return {k: v[lang] for k, v in locale_dict.items()}
+
+
+def get_multi_model_locale(lang: str) -> Dict[str, str]:
+    locale_dict = {
+        'select_reports': {
+            'zh': '请选择报告',
+            'en': 'Select Reports'
+        },
+        'models_overview': {
+            'zh': '模型概览',
+            'en': 'Models Overview'
+        },
+        'model_radar': {
+            'zh': '模型对比雷达',
+            'en': 'Model Comparison Radar'
+        },
+        'model_scores': {
+            'zh': '模型对比分数',
+            'en': 'Model Comparison Scores'
+        },
+        'model_comparison_details': {
+            'zh': '模型对比详情',
+            'en': 'Model Comparison Details'
+        },
+        'select_model_a': {
+            'zh': '选择模型A',
+            'en': 'Select Model A'
+        },
+        'select_model_b': {
+            'zh': '选择模型B',
+            'en': 'Select Model B'
+        },
+        'select_dataset': {
+            'zh': '选择数据集',
+            'en': 'Select Dataset'
+        },
+        'model_predictions': {
+            'zh': '模型预测',
+            'en': 'Model Predictions'
+        },
+        'select_subset': {
+            'zh': '选择子集',
+            'en': 'Select Subset'
+        },
+        'answer_mode': {
+            'zh': '答案模式',
+            'en': 'Answer Mode'
+        },
+        'score_threshold': {
+            'zh': '分数阈值',
+            'en': 'Score Threshold'
+        },
+        'comparison_counts': {
+            'zh': '对比统计',
+            'en': 'Comparison Counts'
+        },
+        'page': {
+            'zh': '页码',
+            'en': 'Page'
+        },
+        'input': {
+            'zh': '输入',
+            'en': 'Input'
+        },
+        'gold_answer': {
+            'zh': '标准答案',
+            'en': 'Gold Answer'
+        },
+        'score': {
+            'zh': '分数',
+            'en': 'Score'
+        },
+        'normalized_score': {
+            'zh': '归一化分数',
+            'en': 'Normalized Score'
+        },
+        'prediction': {
+            'zh': '预测',
+            'en': 'Prediction'
+        },
+        'generated': {
+            'zh': '生成结果',
+            'en': 'Generated'
+        }
+    }
+    return {k: v[lang] for k, v in locale_dict.items()}
+
+
+def get_app_locale(lang: str) -> Dict[str, str]:
+    locale_dict = {
+        'title': {
+            'zh': '📈 EvalScope 看板',
+            'en': '📈 Evalscope Dashboard'
+        },
+        'star_beggar': {
+            'zh':
+            '喜欢<a href=\"https://github.com/modelscope/evalscope\" target=\"_blank\">EvalScope</a>就动动手指给我们加个star吧 🥺 ',
+            'en':
+            'If you like <a href=\"https://github.com/modelscope/evalscope\" target=\"_blank\">EvalScope</a>, '
+            'please take a few seconds to star us 🥺 '
+        },
+        'note': {
+            'zh': '请选择报告',
+            'en': 'Please select reports'
+        }
+    }
+    return {k: v[lang] for k, v in locale_dict.items()}
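Each locale helper above keys a nested dict by UI element and then by language code, and returns the flattened `{key: text}` mapping for the requested language. A minimal usage sketch (not part of the diff), assuming the module is importable as `evalscope.app.utils.localization`:

```python
# Illustrative sketch only; the import path is inferred from the file list above.
from evalscope.app.utils.localization import get_app_locale, get_visualization_locale

get_app_locale('en')['title']                    # '📈 Evalscope Dashboard'
get_visualization_locale('zh')['single_model']   # '单模型'
```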
evalscope/app/utils/text_utils.py
@@ -0,0 +1,119 @@
+"""
+Text processing utilities for the Evalscope dashboard.
+"""
+import json
+import numpy as np
+import os
+import pandas as pd
+import re
+from typing import Any, Dict, List
+
+from evalscope.utils.logger import get_logger
+from ..constants import LATEX_DELIMITERS
+
+logger = get_logger()
+
+
+def convert_markdown_image(text):
+    if not os.path.isfile(text):
+        return text
+    # Convert the image path to a markdown image tag
+    if text.endswith('.png') or text.endswith('.jpg') or text.endswith('.jpeg'):
+        text = os.path.abspath(text)
+        image_tag = f'![image]({text})'
+        logger.debug(f'Converting image path to markdown: {text} -> {image_tag}')
+        return image_tag
+    return text
+
+
+def convert_html_tags(text):
+    # match begin label
+    text = re.sub(r'<(\w+)>', r'[\1]', text)
+    # match end label
+    text = re.sub(r'</(\w+)>', r'[/\1]', text)
+    return text
+
+
+def process_string(string: str, max_length: int = 2048) -> str:
+    string = convert_html_tags(string)  # for display labels e.g.
+    if max_length and len(string) > max_length:
+        return f'{string[:max_length // 2]}......{string[-max_length // 2:]}'
+    return string
+
+
+def dict_to_markdown(data) -> str:
+    markdown_lines = []
+
+    for key, value in data.items():
+        bold_key = f'**{key}**'
+
+        if isinstance(value, list):
+            value_str = '\n' + '\n'.join([f'- {process_model_prediction(item, max_length=None)}' for item in value])
+        elif isinstance(value, dict):
+            value_str = dict_to_markdown(value)
+        else:
+            value_str = str(value)
+
+        value_str = process_string(value_str, max_length=None)  # Convert HTML tags but don't truncate
+        markdown_line = f'{bold_key}:\n{value_str}'
+        markdown_lines.append(markdown_line)
+
+    return '\n\n'.join(markdown_lines)
+
+
+def process_model_prediction_old(item: Any, max_length: int = 2048) -> str:
+    """
+    Process model prediction output into a formatted string.
+
+    Args:
+        item: The item to process. Can be a string, list, or dictionary.
+        max_length: The maximum length of the output string.
+
+    Returns:
+        A formatted string representation of the input.
+    """
+    if isinstance(item, dict):
+        result = dict_to_markdown(item)
+    elif isinstance(item, list):
+        result = '\n'.join([f'- {process_model_prediction(i, max_length=None)}' for i in item])
+    else:
+        result = str(item)
+
+    # Apply HTML tag conversion and truncation only at the final output
+    if max_length is not None:
+        return process_string(result, max_length)
+    return result
+
+
+def process_model_prediction(item: Any, max_length: int = 32000) -> str:
+    if isinstance(item, (dict, list)):
+        result = json.dumps(item, ensure_ascii=False, indent=2)
+        result = f'```json\n{result}\n```'
+    else:
+        result = str(item)
+
+    # Apply HTML tag conversion and truncation only at the final output
+    if max_length is not None:
+        return process_string(result, max_length)
+
+    return result
+
+
+def process_json_content(content: Any) -> str:
+    """
+    Process JSON content to convert it into a markdown-friendly format.
+
+    Args:
+        content (str): The JSON content as a string.
+
+    Returns:
+        str: The processed content formatted for markdown display.
+    """
+    if isinstance(content, (np.bool_, np.int_, np.float_)):
+        content = str(content)
+
+    if isinstance(content, str):
+        content = {'content': content}
+
+    content_json = json.dumps(content, ensure_ascii=False, indent=2)
+    return content_json
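`process_model_prediction` is the final formatting step before a prediction is rendered in the Gradio dashboard: dicts and lists are pretty-printed as a fenced JSON block, and everything is run through `process_string`, which rewrites HTML-like tags as `[tag]` and truncates very long strings around the middle. A small sketch (not part of the diff), assuming the module is importable as `evalscope.app.utils.text_utils`:

```python
# Illustrative sketch only; the import path is inferred from the file list above.
from evalscope.app.utils.text_utils import process_model_prediction, process_string

process_model_prediction({'answer': 'A', 'score': 1})
# -> the dict pretty-printed inside a fenced JSON code block

process_string('<think>' + 'x' * 5000 + '</think>', max_length=64)
# -> tags become [think] / [/think]; the middle of the string is replaced by '......'
```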
evalscope/app/utils/visualization.py
@@ -0,0 +1,91 @@
+"""
+Visualization utilities for the Evalscope dashboard.
+"""
+import numpy as np
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+from typing import List
+
+from evalscope.constants import DataCollection
+from evalscope.report import Report, ReportKey, get_data_frame
+from evalscope.utils.logger import get_logger
+from ..constants import DEFAULT_BAR_WIDTH, PLOTLY_THEME
+
+logger = get_logger()
+
+
+def plot_single_report_scores(df: pd.DataFrame):
+    if df is None:
+        return None
+    logger.debug(f'df: {df}')
+    plot = px.bar(df, x=df[ReportKey.dataset_name], y=df[ReportKey.score], text=df[ReportKey.score])
+
+    width = DEFAULT_BAR_WIDTH if len(df[ReportKey.dataset_name]) <= 5 else None
+    plot.update_traces(width=width, texttemplate='%{text:.2f}', textposition='outside')
+    plot.update_layout(uniformtext_minsize=12, uniformtext_mode='hide', yaxis=dict(range=[0, 1]), template=PLOTLY_THEME)
+    return plot
+
+
+def plot_single_report_sunburst(report_list: List[Report]):
+    if report_list[0].name == DataCollection.NAME:
+        df = get_data_frame(report_list=report_list)
+        categories = sorted([i for i in df.columns if i.startswith(ReportKey.category_prefix)])
+        path = categories + [ReportKey.subset_name]
+    else:
+        df = get_data_frame(report_list=report_list, flatten_metrics=False)
+        categories = sorted([i for i in df.columns if i.startswith(ReportKey.category_prefix)])
+        path = [ReportKey.dataset_name] + categories + [ReportKey.subset_name]
+    logger.debug(f'df: {df}')
+    df[categories] = df[categories].fillna('default')  # NOTE: fillna for empty categories
+
+    plot = px.sunburst(
+        df,
+        path=path,
+        values=ReportKey.num,
+        color=ReportKey.score,
+        color_continuous_scale='RdYlGn',  # see https://plotly.com/python/builtin-colorscales/
+        color_continuous_midpoint=np.average(df[ReportKey.score], weights=df[ReportKey.num]),
+        template=PLOTLY_THEME,
+        maxdepth=4)
+    plot.update_traces(insidetextorientation='radial')
+    plot.update_layout(margin=dict(t=10, l=10, r=10, b=10), coloraxis=dict(cmin=0, cmax=1), height=600)
+    return plot
+
+
+def plot_single_dataset_scores(df: pd.DataFrame):
+    # TODO: add metric radio and replace category name
+    plot = px.bar(
+        df,
+        x=df[ReportKey.metric_name],
+        y=df[ReportKey.score],
+        color=df[ReportKey.subset_name],
+        text=df[ReportKey.score],
+        barmode='group')
+
+    width = 0.2 if len(df[ReportKey.subset_name]) <= 3 else None
+    plot.update_traces(width=width, texttemplate='%{text:.2f}', textposition='outside')
+    plot.update_layout(uniformtext_minsize=12, uniformtext_mode='hide', yaxis=dict(range=[0, 1]), template=PLOTLY_THEME)
+    return plot
+
+
+def plot_multi_report_radar(df: pd.DataFrame):
+    fig = go.Figure()
+
+    grouped = df.groupby(ReportKey.model_name)
+    common_datasets = set.intersection(*[set(group[ReportKey.dataset_name]) for _, group in grouped])
+
+    for model_name, group in grouped:
+        common_group = group[group[ReportKey.dataset_name].isin(common_datasets)]
+        fig.add_trace(
+            go.Scatterpolar(
+                r=common_group[ReportKey.score],
+                theta=common_group[ReportKey.dataset_name],
+                name=model_name,
+                fill='toself'))
+
+    fig.update_layout(
+        template=PLOTLY_THEME,
+        polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
+        margin=dict(t=20, l=20, r=20, b=20))
+    return fig
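These plotting helpers consume the accuracy DataFrames built in `data_utils.py` (e.g. `get_acc_report_df`), whose columns are named via `evalscope.report.ReportKey`. A toy sketch (not part of the diff) of handing such a frame to `plot_single_report_scores`, with made-up scores and an assumed import path:

```python
# Illustrative sketch only; column names come from ReportKey, values are placeholders.
import pandas as pd
from evalscope.report import ReportKey
from evalscope.app.utils.visualization import plot_single_report_scores

df = pd.DataFrame({
    ReportKey.dataset_name: ['gsm8k', 'mmlu'],
    ReportKey.score: [0.82, 0.67],
})
fig = plot_single_report_scores(df)  # Plotly bar chart, y-axis fixed to [0, 1]
fig.show()
```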
evalscope/backend/opencompass/backend_manager.py
@@ -8,7 +8,8 @@ from typing import Optional, Union
 
 from evalscope.backend.base import BackendManager
 from evalscope.backend.opencompass.api_meta_template import get_template
-from evalscope.utils import get_module_path,
+from evalscope.utils.import_utils import get_module_path, is_module_installed
+from evalscope.utils.io_utils import get_valid_list
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
evalscope/backend/rag_eval/backend_manager.py
@@ -2,7 +2,8 @@ import os
 from typing import Optional, Union
 
 from evalscope.backend.base import BackendManager
-from evalscope.utils import
+from evalscope.utils.import_utils import is_module_installed
+from evalscope.utils.io_utils import get_valid_list
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
evalscope/backend/rag_eval/utils/embedding.py
@@ -12,8 +12,8 @@ from typing import Dict, List, Optional, Union
 
 from evalscope.backend.rag_eval.utils.tools import download_model
 from evalscope.constants import HubType
+from evalscope.utils.argument_utils import get_supported_params
 from evalscope.utils.logger import get_logger
-from evalscope.utils.utils import get_supported_params
 
 logger = get_logger()
 
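The three hunks above appear to be part of a wider utilities reorganisation visible in the file list (`evalscope/utils/utils.py` is moved to `evalscope/metrics/completion_parsers.py`, while `utils/argument_utils.py` and `utils/import_utils.py` are new): helpers previously imported from `evalscope.utils` or `evalscope.utils.utils` now live in dedicated submodules. For downstream code, the new locations shown in these diffs are:

```python
# New import locations used by the updated backend managers above
from evalscope.utils.import_utils import get_module_path, is_module_installed
from evalscope.utils.io_utils import get_valid_list
from evalscope.utils.argument_utils import get_supported_params
```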