evalscope 0.16.0__py3-none-any.whl → 0.16.2__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release: this version of evalscope has been flagged as possibly problematic.
- evalscope/app/__init__.py +28 -0
- evalscope/{report → app}/app.py +40 -30
- evalscope/app/constants.py +21 -0
- evalscope/arguments.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +23 -11
- evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/utils/embedding.py +77 -39
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +1 -0
- evalscope/benchmarks/aime/aime24_adapter.py +3 -1
- evalscope/benchmarks/aime/aime25_adapter.py +3 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +5 -0
- evalscope/benchmarks/arc/arc_adapter.py +3 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +7 -3
- evalscope/benchmarks/bbh/bbh_adapter.py +3 -0
- evalscope/benchmarks/benchmark.py +2 -0
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/bfcl_adapter.py +237 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +3 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +4 -1
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +3 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -0
- evalscope/benchmarks/data_adapter.py +99 -16
- evalscope/benchmarks/data_collection/data_collection_adapter.py +1 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +85 -0
- evalscope/benchmarks/docmath/utils.py +220 -0
- evalscope/benchmarks/drop/drop_adapter.py +3 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +91 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +19 -23
- evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +3 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +3 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +3 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +3 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +3 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +3 -0
- evalscope/benchmarks/musr/musr_adapter.py +3 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +348 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +3 -0
- evalscope/benchmarks/race/race_adapter.py +3 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +3 -0
- evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +1 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +21 -3
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +9 -1
- evalscope/benchmarks/tool_bench/utils.py +5 -4
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +3 -0
- evalscope/benchmarks/utils.py +25 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +3 -0
- evalscope/cli/start_app.py +2 -2
- evalscope/collections/__init__.py +35 -3
- evalscope/collections/evaluator.py +68 -34
- evalscope/config.py +8 -2
- evalscope/constants.py +1 -1
- evalscope/evaluator/evaluator.py +40 -28
- evalscope/metrics/__init__.py +3 -1
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/llm_judge.py +12 -5
- evalscope/metrics/math_parser.py +1 -1
- evalscope/metrics/t2v_metrics/__init__.py +9 -23
- evalscope/models/adapters/__init__.py +2 -0
- evalscope/models/adapters/base_adapter.py +31 -27
- evalscope/models/adapters/bfcl_adapter.py +244 -0
- evalscope/models/adapters/server_adapter.py +80 -23
- evalscope/models/custom/custom_model.py +0 -3
- evalscope/models/custom/dummy_model.py +77 -39
- evalscope/models/local_model.py +1 -1
- evalscope/models/register.py +2 -1
- evalscope/perf/arguments.py +4 -2
- evalscope/perf/benchmark.py +16 -12
- evalscope/perf/main.py +7 -0
- evalscope/perf/plugin/api/openai_api.py +2 -0
- evalscope/perf/plugin/datasets/custom.py +15 -0
- evalscope/perf/utils/benchmark_util.py +1 -1
- evalscope/perf/utils/local_server.py +1 -0
- evalscope/perf/utils/log_utils.py +12 -5
- evalscope/perf/utils/rich_display.py +1 -1
- evalscope/report/__init__.py +36 -4
- evalscope/report/combinator.py +40 -6
- evalscope/report/generator.py +33 -9
- evalscope/report/utils.py +84 -4
- evalscope/run.py +12 -0
- evalscope/summarizer.py +1 -1
- evalscope/utils/io_utils.py +59 -2
- evalscope/utils/logger.py +1 -1
- evalscope/utils/utils.py +12 -0
- evalscope/version.py +2 -2
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/METADATA +16 -13
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/RECORD +114 -100
- tests/aigc/test_t2i.py +48 -11
- tests/cli/test_all.py +14 -3
- tests/cli/test_collection.py +6 -4
- tests/cli/test_run.py +50 -25
- tests/rag/test_clip_benchmark.py +5 -1
- tests/rag/test_mteb.py +51 -7
- /evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/LICENSE +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/WHEEL +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/top_level.txt +0 -0
evalscope/app/__init__.py ADDED
```diff
@@ -0,0 +1,28 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from evalscope.utils.import_utils import _LazyModule
+
+if TYPE_CHECKING:
+    from .app import create_app
+    from .arguments import add_argument
+
+else:
+    _import_structure = {
+        'app': [
+            'create_app',
+        ],
+        'arguments': [
+            'add_argument',
+        ],
+    }
+
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
```
evalscope/{report → app}/app.py RENAMED
```diff
@@ -1,6 +1,7 @@
 import argparse
 import glob
 import gradio as gr
+import json
 import numpy as np
 import os
 import pandas as pd
@@ -11,35 +12,15 @@ from dataclasses import dataclass
 from typing import Any, List, Union
 
 from evalscope.constants import DataCollection
-from evalscope.report import Report, ReportKey,
+from evalscope.report import Report, ReportKey, get_data_frame, get_report_list
 from evalscope.utils.io_utils import OutputsStructure, yaml_to_dict
 from evalscope.utils.logger import configure_logging, get_logger
 from evalscope.version import __version__
+from .arguments import add_argument
+from .constants import DATASET_TOKEN, LATEX_DELIMITERS, MODEL_TOKEN, PLOTLY_THEME, REPORT_TOKEN
 
 logger = get_logger()
 
-PLOTLY_THEME = 'plotly_dark'
-REPORT_TOKEN = '@@'
-MODEL_TOKEN = '::'
-DATASET_TOKEN = ', '
-LATEX_DELIMITERS = [{
-    'left': '$$',
-    'right': '$$',
-    'display': True
-}, {
-    'left': '$',
-    'right': '$',
-    'display': False
-}, {
-    'left': '\\(',
-    'right': '\\)',
-    'display': False
-}, {
-    'left': '\\[',
-    'right': '\\]',
-    'display': True
-}]
-
 
 def scan_for_report_folders(root_path):
     """Scan for folders containing reports subdirectories"""
@@ -155,11 +136,11 @@ def plot_single_report_scores(df: pd.DataFrame):
 
 def plot_single_report_sunburst(report_list: List[Report]):
     if report_list[0].name == DataCollection.NAME:
-        df = get_data_frame(report_list)
+        df = get_data_frame(report_list=report_list)
         categories = sorted([i for i in df.columns if i.startswith(ReportKey.category_prefix)])
         path = categories + [ReportKey.subset_name]
     else:
-        df = get_data_frame(report_list, flatten_metrics=False)
+        df = get_data_frame(report_list=report_list, flatten_metrics=False)
         categories = sorted([i for i in df.columns if i.startswith(ReportKey.category_prefix)])
         path = [ReportKey.dataset_name] + categories + [ReportKey.subset_name]
     logger.debug(f'df: {df}')
@@ -185,6 +166,13 @@ def get_single_dataset_df(df: pd.DataFrame, dataset_name: str):
     return df, styler
 
 
+def get_report_analysis(report_list: List[Report], dataset_name: str) -> str:
+    for report in report_list:
+        if report.dataset_name == dataset_name:
+            return report.analysis
+    return 'N/A'
+
+
 def plot_single_dataset_scores(df: pd.DataFrame):
     # TODO: add metric radio and relace category name
     plot = px.bar(
@@ -246,7 +234,7 @@ def convert_html_tags(text):
 def process_string(string: str, max_length: int = 2048) -> str:
     string = convert_html_tags(string)  # for display labels e.g.
     if max_length and len(string) > max_length:
-        return f'{string[:max_length // 2]}
+        return f'{string[:max_length // 2]}...[truncate]...{string[-max_length // 2:]}'
     return string
 
 
@@ -270,7 +258,7 @@ def dict_to_markdown(data) -> str:
     return '\n\n'.join(markdown_lines)
 
 
-def process_model_prediction(item: Any, max_length: int = 2048) -> str:
+def process_model_prediction_old(item: Any, max_length: int = 2048) -> str:
     """
     Process model prediction output into a formatted string.
 
@@ -294,6 +282,20 @@ def process_model_prediction(item: Any, max_length: int = 2048) -> str:
     return result
 
 
+def process_model_prediction(item: Any, max_length: int = 4096) -> str:
+    if isinstance(item, (dict, list)):
+        result = json.dumps(item, ensure_ascii=False, indent=2)
+        result = f'```json\n{result}\n```'
+    else:
+        result = str(item)
+
+    # Apply HTML tag conversion and truncation only at the final output
+    if max_length is not None:
+        return process_string(result, max_length)
+
+    return result
+
+
 def normalize_score(score):
     try:
         if isinstance(score, bool):
@@ -456,6 +458,10 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
            'zh': '数据集分数',
            'en': 'Dataset Scores'
        },
+       'report_analysis': {
+           'zh': '报告智能分析',
+           'en': 'Report Intelligent Analysis'
+       },
        'dataset_scores_table': {
            'zh': '数据集分数表',
            'en': 'Dataset Scores Table'
@@ -511,6 +517,9 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
        with gr.Tab(locale_dict['dataset_details'][lang]):
            dataset_radio = gr.Radio(
                label=locale_dict['select_dataset'][lang], choices=[], show_label=True, interactive=True)
+           # show dataset details
+           with gr.Accordion(locale_dict['report_analysis'][lang], open=True):
+               report_analysis = gr.Markdown(value='N/A', show_copy_button=True)
            gr.Markdown(f'### {locale_dict["dataset_scores"][lang]}')
            dataset_plot = gr.Plot(value=None, scale=1, label=locale_dict['dataset_scores'][lang])
            gr.Markdown(f'### {locale_dict["dataset_scores_table"][lang]}')
@@ -586,15 +595,16 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
    @gr.on(
        triggers=[dataset_radio.change, report_list.change],
        inputs=[dataset_radio, report_list],
-       outputs=[dataset_plot, dataset_table, subset_select, data_review_df])
+       outputs=[dataset_plot, dataset_table, subset_select, data_review_df, report_analysis])
    def update_single_report_dataset(dataset_name, report_list):
        logger.debug(f'Updating single report dataset: {dataset_name}')
-       report_df = get_data_frame(report_list)
+       report_df = get_data_frame(report_list=report_list)
+       analysis = get_report_analysis(report_list, dataset_name)
        data_score_df, styler = get_single_dataset_df(report_df, dataset_name)
        data_score_plot = plot_single_dataset_scores(data_score_df)
        subsets = data_score_df[ReportKey.subset_name].unique().tolist()
        logger.debug(f'subsets: {subsets}')
-       return data_score_plot, styler, gr.update(choices=subsets, value=None), None
+       return data_score_plot, styler, gr.update(choices=subsets, value=None), None, analysis
 
    @gr.on(
        triggers=[subset_select.change],
```
evalscope/app/constants.py ADDED
```diff
@@ -0,0 +1,21 @@
+PLOTLY_THEME = 'plotly_dark'
+REPORT_TOKEN = '@@'
+MODEL_TOKEN = '::'
+DATASET_TOKEN = ', '
+LATEX_DELIMITERS = [{
+    'left': '$$',
+    'right': '$$',
+    'display': True
+}, {
+    'left': '$',
+    'right': '$',
+    'display': False
+}, {
+    'left': '\\(',
+    'right': '\\)',
+    'display': False
+}, {
+    'left': '\\[',
+    'right': '\\]',
+    'display': True
+}]
```
evalscope/arguments.py CHANGED
```diff
@@ -67,7 +67,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--eval-config', type=str, required=False, help='The eval task config file path for evaluation backend.')  # noqa: E501
     parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.',
                         choices=[EvalStage.ALL, EvalStage.INFER, EvalStage.REVIEW])
-    parser.add_argument('--limit', type=
+    parser.add_argument('--limit', type=float, default=None, help='Max evaluation samples num for each subset.')
     parser.add_argument('--eval-batch-size', type=int, default=1, help='The batch size for evaluation.')
 
     # Cache and working directory arguments
@@ -89,6 +89,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--judge-strategy', type=str, default=JudgeStrategy.AUTO, help='The judge strategy.')
     parser.add_argument('--judge-model-args', type=json.loads, default='{}', help='The judge model args, should be a json string.')  # noqa: E501
     parser.add_argument('--judge-worker-num', type=int, default=1, help='The number of workers for the judge model.')
+    parser.add_argument('--analysis-report', action='store_true', default=False, help='Generate analysis report for the evaluation results using judge model.')  # noqa: E501
     # yapf: enable
```
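A minimal standalone sketch of the effect of this change (the parser below is built directly from the two changed `add_argument` calls rather than by importing evalscope): `--limit` now accepts fractional values and `--analysis-report` is an opt-in boolean flag.

```python
import argparse

# The two argument definitions as they appear after this change.
parser = argparse.ArgumentParser()
parser.add_argument('--limit', type=float, default=None,
                    help='Max evaluation samples num for each subset.')
parser.add_argument('--analysis-report', action='store_true', default=False,
                    help='Generate analysis report for the evaluation results using judge model.')

args = parser.parse_args(['--limit', '0.5', '--analysis-report'])
print(args.limit, args.analysis_report)  # 0.5 True
```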
evalscope/backend/opencompass/backend_manager.py CHANGED
```diff
@@ -1,4 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import os
 import subprocess
 import tempfile
 from dataclasses import asdict
@@ -204,7 +205,7 @@ class OpenCompassBackendManager(BackendManager):
                 model_d['meta_template'] = get_template(model_d['meta_template'])
 
             # set the 'abbr' as the 'path' if 'abbr' is not specified
-            model_d['abbr'] = model_d['path']
+            model_d['abbr'] = os.path.basename(model_d['path'])
 
             model_config = ApiModelConfig(**model_d)
             models.append(asdict(model_config))
```
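The `abbr` change only affects how the model is labelled: the full model path is reduced to its final component. For example (illustrative path, not taken from the diff):

```python
import os

model_path = 'Qwen/Qwen2.5-7B-Instruct'  # illustrative model path
print(os.path.basename(model_path))      # Qwen2.5-7B-Instruct
```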
evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py CHANGED
```diff
@@ -1,4 +1,5 @@
 import os
+import posixpath  # For URL path handling
 import torch
 from torch.utils.data import DataLoader
 from torch.utils.data import Dataset as TorchDataset
@@ -186,42 +187,53 @@ def build_wds_dataset(dataset_name, transform, split='test', data_dir='root', ca
 
     Set `cache_dir` to a path to cache the dataset, otherwise, no caching will occur.
     """
+    import requests
    import webdataset as wds
 
    def read_txt(fname):
-        if '://'
-
-
-
+        if fname.startswith(('http://', 'https://')):
+            try:
+                response = requests.get(fname)
+                response.raise_for_status()  # Ensure the HTTP request was successful
+                return response.text
+            except requests.exceptions.RequestException as e:
+                raise FileNotFoundError(f'Failed to read {fname}: {e}')
        else:
            with open(fname, 'r') as file:
-
-
+                return file.read()
+
+    def url_path_join(*parts):
+        """Join URL path parts with forward slashes regardless of platform"""
+        return posixpath.join(*parts)
 
    if not data_dir:
        data_dir = f'https://modelscope.cn/datasets/clip-benchmark/wds_{dataset_name}/resolve/master'
 
    # Git LFS files have a different file path to access the raw data than other files
-
+    is_url = data_dir.startswith(('http://', 'https://'))
+    if is_url and data_dir.startswith('https://modelscope.cn/datasets'):
        *split_url_head, _, url_path = data_dir.split('/', 7)
        url_head = '/'.join(split_url_head)
        metadata_dir = '/'.join([url_head, 'resolve', url_path])
        tardata_dir = '/'.join([url_head, 'resolve', url_path])
    else:
        metadata_dir = tardata_dir = data_dir
+
+    # Use appropriate path joining function based on whether we're dealing with a URL
+    path_join = url_path_join if is_url else os.path.join
+
    # Get number of shards
-    nshards_fname =
+    nshards_fname = path_join(metadata_dir, split, 'nshards.txt')
    nshards = int(read_txt(nshards_fname))  # Do not catch FileNotFound, nshards.txt should be mandatory
 
    # Get dataset type (classification or retrieval)
-    type_fname =
+    type_fname = path_join(metadata_dir, 'dataset_type.txt')
    try:
        dataset_type = read_txt(type_fname).strip().lower()
    except FileNotFoundError:
        dataset_type = 'classification'
 
-    filepattern =
+    filepattern = path_join(tardata_dir, split, '{0..%d}.tar' % (nshards - 1))
    # Load webdataset (support WEBP, PNG, and JPG for now)
    if not cache_dir or not isinstance(cache_dir, str):
        cache_dir = None
```
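The core of this change is choosing a path-join function per data source. A small standalone sketch (the URL below is an illustrative value in the same format the code builds, not taken from the package):

```python
import os
import posixpath

data_dir = 'https://modelscope.cn/datasets/clip-benchmark/wds_mnist/resolve/master'  # illustrative
is_url = data_dir.startswith(('http://', 'https://'))
path_join = posixpath.join if is_url else os.path.join

# posixpath.join always uses '/', so the result stays a valid URL even on Windows,
# where os.path.join would insert backslashes.
print(path_join(data_dir, 'test', 'nshards.txt'))
```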
evalscope/backend/rag_eval/cmteb/arguments.py CHANGED
```diff
@@ -11,7 +11,9 @@ class ModelArguments:
     pooling_mode: Optional[str] = None
     max_seq_length: int = 512  # max sequence length
     # prompt for llm based model
-    prompt: str =
+    prompt: Optional[str] = None
+    # prompts dictionary for different tasks, if prompt is not set
+    prompts: Optional[Dict[str, str]] = None
     # model kwargs
     model_kwargs: dict = field(default_factory=dict)
     # config kwargs
@@ -33,6 +35,7 @@ class ModelArguments:
             'pooling_mode': self.pooling_mode,
             'max_seq_length': self.max_seq_length,
             'prompt': self.prompt,
+            'prompts': self.prompts,
             'model_kwargs': self.model_kwargs,
             'config_kwargs': self.config_kwargs,
             'encode_kwargs': self.encode_kwargs,
```
evalscope/backend/rag_eval/cmteb/task_template.py CHANGED
```diff
@@ -1,6 +1,6 @@
 import mteb
 import os
-from
+from tabulate import tabulate
 
 from evalscope.backend.rag_eval import EmbeddingModel, cmteb
 from evalscope.utils.logger import get_logger
@@ -12,14 +12,27 @@ def show_results(output_folder, model, results):
     model_name = model.mteb_model_meta.model_name_as_path()
     revision = model.mteb_model_meta.revision
 
-
+    data = []
+    for model_res in results:
+        main_res = model_res.only_main_score()
+        for split, score in main_res.scores.items():
+            for sub_score in score:
+                data.append({
+                    'Model': model_name.replace('eval__', ''),
+                    'Revision': revision,
+                    'Task Type': main_res.task_type,
+                    'Task': main_res.task_name,
+                    'Split': split,
+                    'Subset': sub_score['hf_subset'],
+                    'Main Score': sub_score['main_score'],
+                })
 
     save_path = os.path.join(
         output_folder,
         model_name,
         revision,
     )
-    logger.info(f'Evaluation results:\n{
+    logger.info(f'Evaluation results:\n{tabulate(data, headers="keys", tablefmt="grid")}')
     logger.info(f'Evaluation results saved in {os.path.abspath(save_path)}')
 
 
@@ -34,6 +47,7 @@ def one_stage_eval(
     tasks = cmteb.TaskBase.get_tasks(task_names=eval_args['tasks'], dataset_path=custom_dataset_path)
     evaluation = mteb.MTEB(tasks=tasks)
 
+    eval_args['encode_kwargs'] = model_args.get('encode_kwargs', {})
     # run evaluation
     results = evaluation.run(model, **eval_args)
 
@@ -66,6 +80,7 @@ def two_stage_eval(
         overwrite_results=True,
         hub=eval_args['hub'],
         limits=eval_args['limits'],
+        encode_kwargs=model1_args.get('encode_kwargs', {}),
     )
     # stage 2: run cross encoder
     results = evaluation.run(
@@ -77,6 +92,7 @@ def two_stage_eval(
         overwrite_results=True,
         hub=eval_args['hub'],
         limits=eval_args['limits'],
+        encode_kwargs=model2_args.get('encode_kwargs', {}),
     )
 
     # save and log results
```
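`show_results` now flattens each MTEB result into one row per split/subset and logs it with `tabulate`. A standalone sketch with invented sample rows (the real rows come from `only_main_score()` and also carry Model, Revision, and Task Type columns):

```python
from tabulate import tabulate

# Invented sample rows for illustration only.
data = [
    {'Task': 'CustomRetrieval', 'Split': 'test', 'Subset': 'default', 'Main Score': 0.71},
    {'Task': 'CustomRetrieval', 'Split': 'dev', 'Subset': 'default', 'Main Score': 0.69},
]
print(tabulate(data, headers='keys', tablefmt='grid'))
```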
evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py CHANGED
```diff
@@ -9,7 +9,6 @@ class CustomRetrieval(AbsTaskRetrieval):
     ignore_identical_ids: bool = True
 
     def __init__(self, dataset_path: Optional[str] = 'custom_eval/text/retrieval', **kwargs):
-        super().__init__(**kwargs)
         self.metadata = TaskMetadata(
             name='CustomRetrieval',
             description='CustomRetrieval Task',
@@ -34,6 +33,7 @@ class CustomRetrieval(AbsTaskRetrieval):
             bibtex_citation='',
             descriptive_stats={},
         )
+        super().__init__(**kwargs)
 
     def load_data(self, **kwargs):
         if self.data_loaded:
```
evalscope/backend/rag_eval/utils/embedding.py CHANGED
```diff
@@ -2,6 +2,7 @@ import os
 import torch
 from langchain_core.embeddings import Embeddings
 from langchain_openai.embeddings import OpenAIEmbeddings
+from mteb.encoder_interface import PromptType
 from sentence_transformers import models
 from sentence_transformers.cross_encoder import CrossEncoder
 from sentence_transformers.SentenceTransformer import SentenceTransformer
@@ -12,6 +13,7 @@ from typing import Dict, List, Optional, Union
 from evalscope.backend.rag_eval.utils.tools import download_model
 from evalscope.constants import HubType
 from evalscope.utils.logger import get_logger
+from evalscope.utils.utils import get_supported_params
 
 logger = get_logger()
 
@@ -22,14 +24,14 @@ class BaseModel(Embeddings):
         self,
         model_name_or_path: str = '',
         max_seq_length: int = 512,
-        prompt: str =
+        prompt: Optional[str] = None,
+        prompts: Optional[Dict[str, str]] = None,
         revision: Optional[str] = 'master',
         **kwargs,
     ):
         self.model_name_or_path = model_name_or_path
         self.max_seq_length = max_seq_length
         self.model_kwargs = kwargs.pop('model_kwargs', {})
-        self.model_kwargs['trust_remote_code'] = True
 
         self.config_kwargs = kwargs.pop('config_kwargs', {})
         self.config_kwargs['trust_remote_code'] = True
@@ -38,7 +40,9 @@ class BaseModel(Embeddings):
         self.encode_kwargs['convert_to_tensor'] = True
 
         self.prompt = prompt
+        self.prompts = prompts if prompts else {}
         self.revision = revision
+        self.framework = ['PyTorch']
 
     @property
     def mteb_model_meta(self):
@@ -46,10 +50,22 @@ class BaseModel(Embeddings):
         from mteb import ModelMeta
 
         return ModelMeta(
-            name=os.path.basename(self.model_name_or_path),
+            name='eval/' + os.path.basename(self.model_name_or_path),  # Ensure the name contains a slash
             revision=self.revision,
             languages=None,
             release_date=None,
+            n_parameters=None,
+            memory_usage_mb=None,
+            max_tokens=None,
+            embed_dim=None,
+            license=None,
+            open_weights=None,
+            public_training_code=None,
+            public_training_data=None,
+            similarity_fn_name=None,
+            use_instructions=None,
+            training_datasets=None,
+            framework=self.framework,
         )
 
     def embed_documents(self, texts: List[str]) -> List[List[float]]:
@@ -61,7 +77,7 @@ class BaseModel(Embeddings):
         Returns:
             List of embeddings.
         """
-        return self.
+        return self.encode(texts).tolist()
 
     def embed_query(self, text: str) -> List[float]:
         """Embed query text. Compact langchain.
@@ -72,19 +88,17 @@ class BaseModel(Embeddings):
         Returns:
             Embedding.
         """
-        return self.
+        return self.encode(text).tolist()
 
     def encode(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]:
         """Embed text."""
         raise NotImplementedError
 
-    def
-        """
-
-
-
-        """Embed search docs . Compact mteb."""
-        raise NotImplementedError
+    def get_prompt(self, task_name: str) -> Optional[str]:
+        """Get prompt for the given task name."""
+        if self.prompt:
+            return self.prompt
+        return self.prompts.get(task_name, None)
 
 
 class SentenceTransformerModel(BaseModel):
@@ -92,6 +106,9 @@ class SentenceTransformerModel(BaseModel):
     def __init__(self, model_name_or_path: str, pooling_mode: Optional[str] = None, **kwargs):
         super().__init__(model_name_or_path, **kwargs)
 
+        self.framework = ['Sentence Transformers', 'PyTorch']
+
+        self.model_kwargs['trust_remote_code'] = True
         if not pooling_mode:
             self.model = SentenceTransformer(
                 self.model_name_or_path,
@@ -112,43 +129,52 @@ class SentenceTransformerModel(BaseModel):
 
         self.model.max_seq_length = self.max_seq_length
 
-
-
+        self.supported_encode_params = get_supported_params(self.model.encode)
+
+    def encode(self, texts: Union[str, List[str]], **kwargs) -> List[torch.Tensor]:
+        # pop unused kwargs
+        extra_params = {}
+        for key in list(kwargs.keys()):
+            if key not in self.supported_encode_params:
+                extra_params[key] = kwargs.pop(key)
         self.encode_kwargs.update(kwargs)
+
+        # set prompt if provided
+        prompt = None
+        prompt_type = extra_params.pop('prompt_type', '')
+        task_name = extra_params.pop('task_name', '')
+        if prompt_type and prompt_type == PromptType.query:
+            prompt = self.get_prompt(task_name)
+
         embeddings = self.model.encode(texts, prompt=prompt, **self.encode_kwargs)
         assert isinstance(embeddings, Tensor)
         return embeddings.cpu().detach()
 
-    def encode_queries(self, queries, **kwargs):
-        return self.encode(queries, prompt=self.prompt)
-
-    def encode_corpus(self, corpus, **kwargs):
-        if isinstance(corpus[0], dict):
-            input_texts = ['{} {}'.format(doc.get('title', ''), doc['text']).strip() for doc in corpus]
-        else:
-            input_texts = corpus
-        return self.encode(input_texts)
-
 
 class CrossEncoderModel(BaseModel):
 
     def __init__(self, model_name_or_path: str, **kwargs):
         super().__init__(model_name_or_path, **kwargs)
+
+        self.framework = ['Sentence Transformers', 'PyTorch']
+
         self.model = CrossEncoder(
             self.model_name_or_path,
             trust_remote_code=True,
             max_length=self.max_seq_length,
+            automodel_args=self.model_kwargs,
         )
+        self.supported_encode_params = get_supported_params(self.model.predict)
 
     def predict(self, sentences: List[List[str]], **kwargs) -> Tensor:
+        for key in list(kwargs.keys()):
+            if key not in self.supported_encode_params:
+                kwargs.pop(key)
         self.encode_kwargs.update(kwargs)
 
-        if len(sentences[0]) ==
+        if len(sentences[0]) == 2:  # Note: For mteb retrieval task
             processed_sentences = []
-            for query, docs
-                if isinstance(docs, dict):
-                    docs = docs['text']
+            for query, docs in sentences:
                 processed_sentences.append((self.prompt + query, docs))
             sentences = processed_sentences
         embeddings = self.model.predict(sentences, **self.encode_kwargs)
@@ -163,6 +189,7 @@ class APIEmbeddingModel(BaseModel):
         self.openai_api_base = kwargs.get('api_base')
         self.openai_api_key = kwargs.get('api_key')
         self.dimensions = kwargs.get('dimensions')
+        self.framework = ['API']
 
         self.model = OpenAIEmbeddings(
             model=self.model_name,
@@ -175,26 +202,37 @@ class APIEmbeddingModel(BaseModel):
 
         self.batch_size = self.encode_kwargs.get('batch_size', 10)
 
+        self.supported_encode_params = get_supported_params(self.model.embed_documents)
+
     def encode(self, texts: Union[str, List[str]], **kwargs) -> Tensor:
+        # pop unused kwargs
+        extra_params = {}
+        for key in list(kwargs.keys()):
+            if key not in self.supported_encode_params:
+                extra_params[key] = kwargs.pop(key)
+        self.encode_kwargs.update(kwargs)
+
+        # set prompt if provided
+        prompt = None
+        prompt_type = extra_params.pop('prompt_type', '')
+        task_name = extra_params.pop('task_name', '')
+        if prompt_type and prompt_type == PromptType.query:
+            prompt = self.get_prompt(task_name)
+
         if isinstance(texts, str):
             texts = [texts]
 
         embeddings: List[List[float]] = []
         for i in tqdm(range(0, len(texts), self.batch_size)):
-
+            # set prompt if provided
+            if prompt is not None:
+                batch_texts = [prompt + text for text in texts[i:i + self.batch_size]]
+            else:
+                batch_texts = texts[i:i + self.batch_size]
+            response = self.model.embed_documents(batch_texts, chunk_size=self.batch_size)
             embeddings.extend(response)
         return torch.tensor(embeddings)
 
-    def encode_queries(self, queries, **kwargs):
-        return self.encode(queries, **kwargs)
-
-    def encode_corpus(self, corpus, **kwargs):
-        if isinstance(corpus[0], dict):
-            input_texts = ['{} {}'.format(doc.get('title', ''), doc['text']).strip() for doc in corpus]
-        else:
-            input_texts = corpus
-        return self.encode(input_texts, **kwargs)
-
 
 class EmbeddingModel:
     """Custom embeddings"""
```
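The prompt handling that `SentenceTransformerModel.encode` and `APIEmbeddingModel.encode` now share boils down to: filter kwargs down to what the underlying encoder supports, and apply a prompt only to query-side inputs, taking the fixed `prompt` first and falling back to the per-task `prompts` mapping. A standalone mirror of that rule (not the package's code verbatim; the task name and prompt text below are placeholders):

```python
from typing import Dict, Optional


def resolve_query_prompt(prompt: Optional[str], prompts: Dict[str, str],
                         is_query: bool, task_name: str) -> Optional[str]:
    """Mirror of BaseModel.get_prompt plus the PromptType.query check in encode()."""
    if not is_query:
        return None
    if prompt:
        return prompt
    return prompts.get(task_name)


# Only queries get the prefix; corpus documents are encoded as-is.
prompts = {'CustomRetrieval': 'query: '}  # illustrative per-task prompt
print(resolve_query_prompt(None, prompts, True, 'CustomRetrieval'))   # 'query: '
print(resolve_query_prompt(None, prompts, False, 'CustomRetrieval'))  # None
```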
evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py CHANGED
```diff
@@ -69,6 +69,7 @@ class EvalMuseAdapter(T2IBaseAdapter):
             if 'FGA_BLIP2Score' in metric_name and '(' in metric_name:  # FGA_BLIP2Score element score
                 metrics_prefix = metric_name.split(':')[0]
                 category = metric_name.rpartition('(')[-1].split(')')[0]
+                category = category.split('-')[0].lower()  # remove the suffix if exists
                 new_items[f'{metrics_prefix}:{category}'].extend(value_list)
             else:
                 new_items[metric_name].extend(value_list)
```
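The added line normalizes the category parsed from the metric name before grouping, so variants that differ only by a numeric suffix or capitalization fall into the same bucket. A tiny illustration (the metric name below is invented, only its shape matches what the code parses):

```python
metric_name = 'FGA_BLIP2Score:total (Object-1)'  # invented example
category = metric_name.rpartition('(')[-1].split(')')[0]
category = category.split('-')[0].lower()        # remove the suffix if exists
print(category)  # object
```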
evalscope/benchmarks/aime/aime24_adapter.py CHANGED
```diff
@@ -1,5 +1,4 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import OutputType
 from evalscope.metrics import extract_answer, math_equal, strip_answer_string
 from evalscope.utils.logger import get_logger
 
@@ -11,6 +10,9 @@ logger = get_logger()
 @Benchmark.register(
     name='aime24',
     pretty_name='AIME-2024',
+    tags=['Mathematics'],
+    description=
+    'The AIME 2024 benchmark is based on problems from the American Invitational Mathematics Examination, a prestigious high school mathematics competition. This benchmark tests a model’s ability to solve challenging mathematics problems by generating step-by-step solutions and providing the correct final answer.',  # noqa: E501
     dataset_id='HuggingFaceH4/aime_2024',
     subset_list=['default'],
     metric_list=['AveragePass@1'],
```