evalscope 0.16.3__py3-none-any.whl → 0.17.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/app/app.py +9 -762
- evalscope/app/constants.py +1 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +52 -0
- evalscope/app/ui/multi_model.py +323 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +202 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +178 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +91 -0
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/backend_manager.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
- evalscope/benchmarks/__init__.py +15 -1
- evalscope/benchmarks/aime/aime24_adapter.py +2 -1
- evalscope/benchmarks/aime/aime25_adapter.py +2 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/utils.py +0 -12
- evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
- evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
- evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
- evalscope/benchmarks/data_adapter.py +29 -9
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
- evalscope/benchmarks/general_arena/utils.py +226 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +44 -30
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +118 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
- evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
- evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
- evalscope/benchmarks/mmlu/mmlu_adapter.py +2 -2
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/benchmarks/race/race_adapter.py +1 -1
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
- evalscope/benchmarks/utils.py +2 -2
- evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
- evalscope/config.py +8 -123
- evalscope/constants.py +5 -21
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +20 -15
- evalscope/metrics/__init__.py +9 -1
- evalscope/{utils/utils.py → metrics/completion_parsers.py} +71 -176
- evalscope/metrics/llm_judge.py +106 -20
- evalscope/metrics/metrics.py +20 -8
- evalscope/models/__init__.py +4 -8
- evalscope/models/adapters/__init__.py +4 -9
- evalscope/models/adapters/base_adapter.py +4 -0
- evalscope/models/adapters/bfcl_adapter.py +2 -0
- evalscope/models/adapters/chat_adapter.py +3 -0
- evalscope/models/adapters/choice_adapter.py +4 -0
- evalscope/models/adapters/custom_adapter.py +7 -3
- evalscope/models/adapters/server_adapter.py +4 -2
- evalscope/models/adapters/t2i_adapter.py +3 -0
- evalscope/models/adapters/tau_bench_adapter.py +189 -0
- evalscope/models/custom/dummy_model.py +3 -3
- evalscope/models/register.py +0 -14
- evalscope/perf/arguments.py +15 -16
- evalscope/perf/benchmark.py +38 -39
- evalscope/perf/http_client.py +30 -86
- evalscope/perf/main.py +3 -3
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +22 -4
- evalscope/perf/plugin/api/custom_api.py +212 -55
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +105 -0
- evalscope/perf/plugin/api/openai_api.py +17 -19
- evalscope/perf/plugin/datasets/__init__.py +10 -7
- evalscope/perf/plugin/datasets/base.py +22 -1
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +4 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +2 -1
- evalscope/perf/plugin/datasets/random_dataset.py +15 -4
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +14 -20
- evalscope/perf/utils/db_util.py +79 -61
- evalscope/report/__init__.py +1 -1
- evalscope/report/utils.py +34 -15
- evalscope/run.py +1 -1
- evalscope/summarizer.py +1 -2
- evalscope/utils/__init__.py +63 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/import_utils.py +16 -0
- evalscope/utils/io_utils.py +55 -4
- evalscope/utils/model_utils.py +37 -1
- evalscope/version.py +2 -2
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/METADATA +100 -51
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/RECORD +129 -133
- tests/aigc/test_t2i.py +1 -1
- tests/cli/test_all.py +68 -4
- tests/cli/test_collection.py +1 -1
- tests/cli/test_custom.py +261 -0
- tests/cli/test_run.py +34 -70
- tests/perf/test_perf.py +31 -4
- tests/rag/test_clip_benchmark.py +2 -1
- tests/rag/test_mteb.py +3 -1
- tests/rag/test_ragas.py +3 -1
- tests/swift/test_run_swift_eval.py +2 -1
- tests/swift/test_run_swift_vlm_eval.py +2 -1
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
- tests/utils.py +13 -0
- tests/vlm/test_vlmeval.py +8 -2
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/models/model.py +0 -189
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- /evalscope/{utils → benchmarks}/filters.py +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/LICENSE +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/WHEEL +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/top_level.txt +0 -0
evalscope/backend/vlm_eval_kit/backend_manager.py CHANGED

@@ -5,7 +5,8 @@ from functools import partial
 from typing import Optional, Union

 from evalscope.backend.base import BackendManager
-from evalscope.utils import
+from evalscope.utils.import_utils import is_module_installed
+from evalscope.utils.io_utils import get_valid_list
 from evalscope.utils.logger import get_logger

 logger = get_logger()
@@ -68,6 +69,8 @@ class VLMEvalKitBackendManager(BackendManager):
             del remain_cfg['type']  # remove not used args

             norm_model_type = os.path.basename(model_type).replace(':', '-').replace('.', '_')
+            model_cfg['type'] = norm_model_type
+
             self.valid_models.update({norm_model_type: partial(model_class, model=model_type, **remain_cfg)})
             new_model_names.append(norm_model_type)
         else:
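Note: the second hunk also records the normalized name back into model_cfg['type']. A minimal sketch of what that normalization expression produces (the sample model id below is illustrative, not taken from the diff):

```python
import os

def normalize_model_type(model_type: str) -> str:
    # Mirrors the expression above: drop any path prefix, then make the
    # name safe for use as a config key.
    return os.path.basename(model_type).replace(':', '-').replace('.', '_')

# Hypothetical model id, for illustration only.
print(normalize_model_type('qwen/Qwen2.5-VL-7B-Instruct:latest'))
# -> 'Qwen2_5-VL-7B-Instruct-latest'
```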
evalscope/benchmarks/__init__.py CHANGED

@@ -2,6 +2,7 @@
 import glob
 import importlib
 import os
+import time

 from evalscope.benchmarks.benchmark import Benchmark, BenchmarkMeta
 from evalscope.benchmarks.data_adapter import DataAdapter
@@ -13,11 +14,24 @@ logger = get_logger()
 pattern = os.path.join(os.path.dirname(__file__), '*', '**', '*_adapter.py')
 files = glob.glob(pattern, recursive=True)

+import_times = []
+
 for file_path in files:
     if file_path.endswith('.py') and not os.path.basename(file_path).startswith('_'):
         # Convert file path to a module path
         relative_path = os.path.relpath(file_path, os.path.dirname(__file__))
         module_path = relative_path[:-3].replace(os.path.sep, '.')  # strip '.py' and convert to module path
         full_path = f'evalscope.benchmarks.{module_path}'
+
+        start_time = time.perf_counter()
         importlib.import_module(full_path)
-
+        end_time = time.perf_counter()
+
+        import_times.append((full_path, end_time - start_time))
+
+# Sort by import time in descending order
+import_times.sort(key=lambda x: x[1], reverse=True)
+
+# Log the sorted import times
+for module, duration in import_times:
+    logger.debug(f'Module {module} imported in {duration:.6f} seconds')
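The benchmark loader now times each adapter import with time.perf_counter and logs the durations, slowest first, at debug level. A self-contained sketch of the same pattern, applied to an arbitrary module list rather than the benchmark adapters:

```python
import importlib
import logging
import time

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger('import-timing')

# Illustrative module list; the real loader walks evalscope/benchmarks/**/*_adapter.py.
modules = ['json', 'csv', 'collections']

import_times = []
for name in modules:
    start = time.perf_counter()
    importlib.import_module(name)
    import_times.append((name, time.perf_counter() - start))

# Slowest imports first, mirroring the new benchmark loader.
for name, duration in sorted(import_times, key=lambda x: x[1], reverse=True):
    logger.debug('Module %s imported in %.6f seconds', name, duration)
```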
evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py CHANGED

@@ -47,7 +47,7 @@ Evaluate the models based on the quality and relevance of their outputs, and sel
 @Benchmark.register(
     name='alpaca_eval',
     pretty_name='AlpacaEval2.0',
-    tags=['Instruction-Following', '
+    tags=['Instruction-Following', 'Arena'],
     description='Alpaca Eval 2.0 is an enhanced framework for evaluating instruction-following language models, '
     'featuring an improved auto-annotator, updated baselines, and continuous preference calculation to '
     'provide more accurate and cost-effective model assessments. '
evalscope/benchmarks/arc/arc_adapter.py CHANGED

@@ -6,7 +6,7 @@ import os
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser
 from evalscope.utils.logger import get_logger

 # flake8: noqa
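Several adapters in this release now import ResponseParser from evalscope.metrics.completion_parsers (note the renamed file evalscope/{utils/utils.py → metrics/completion_parsers.py} in the listing above). The parser's API is not shown in this diff; a simplified, hypothetical illustration of the kind of answer extraction such a parser performs for multiple-choice benchmarks:

```python
import re

def extract_choice(response: str, choices=('A', 'B', 'C', 'D')) -> str:
    # Hypothetical stand-in for a completion parser: pull the first
    # standalone option letter out of a free-form model response.
    match = re.search(r'\b([A-D])\b', response)
    return match.group(1) if match and match.group(1) in choices else ''

print(extract_choice('The correct answer is B, because ...'))  # -> 'B'
```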
evalscope/benchmarks/arena_hard/arena_hard_adapter.py CHANGED

@@ -17,7 +17,7 @@ GRADER_TEMPLATE = "<|User Prompt|>\n{question}\n\n<|The Start of Assistant A's A
 @Benchmark.register(
     name='arena_hard',
     pretty_name='ArenaHard',
-    tags=['Instruction-Following', '
+    tags=['Instruction-Following', 'Arena'],
     description=
     'ArenaHard is a benchmark designed to evaluate the performance of large language models in a competitive setting, '
     'where models are pitted against each other in a series of tasks to determine their relative strengths and weaknesses. '
evalscope/benchmarks/arena_hard/utils.py CHANGED

@@ -127,18 +127,6 @@ def get_bootstrap_result(battles, func_compute_elo, num_round):
     return df[df.median().sort_values(ascending=False).index]


-def preety_print_two_ratings(ratings_1, ratings_2, column_names):
-    df = (
-        pd.DataFrame(
-            [[n, ratings_1[n], ratings_2[n]] for n in ratings_1.keys()],
-            columns=['Model', column_names[0], column_names[1]],
-        ).sort_values(column_names[0], ascending=False).reset_index(drop=True))
-    df[column_names[0]] = (df[column_names[0]] + 0.5).astype(int)
-    df[column_names[1]] = (df[column_names[1]] + 0.5).astype(int)
-    df.index = df.index + 1
-    return df
-
-
 def predict_win_rate(elo_ratings, SCALE=400, BASE=10, INIT_RATING=1000):
     names = sorted(list(elo_ratings.keys()))
     wins = defaultdict(lambda: defaultdict(lambda: 0))
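The unused preety_print_two_ratings helper is removed; predict_win_rate (body not shown in this hunk) keeps the conventional Elo parameters SCALE=400 and BASE=10. For reference, a sketch of the standard Elo expectation those parameters imply; this mirrors the usual formula, not necessarily the exact body of predict_win_rate:

```python
def elo_win_probability(rating_a: float, rating_b: float,
                        scale: float = 400, base: float = 10) -> float:
    # Standard Elo expectation: probability that player A beats player B.
    return 1.0 / (1.0 + base ** ((rating_b - rating_a) / scale))

print(round(elo_win_probability(1100, 1000), 3))  # -> 0.64
```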
evalscope/benchmarks/bfcl/bfcl_adapter.py CHANGED

@@ -35,7 +35,7 @@ SUBJECT_MAPPING = {
 @Benchmark.register(
     name='bfcl_v3',
     pretty_name='BFCL-v3',
-    tags=['Agent'],
+    tags=['Agent', 'Function Calling'],
     description=
     'Berkeley Function Calling Leaderboard (BFCL), the **first comprehensive and executable function call evaluation** '
     'dedicated to assessing Large Language Models\' (LLMs) ability to invoke functions. Unlike previous evaluations, '
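The tag edits above all pass through the @Benchmark.register(...) decorator. Its real implementation lives elsewhere in evalscope and is not part of this diff; a minimal sketch of how such a registration decorator can work, to give the metadata changes some context (the names below are illustrative, not evalscope's):

```python
# Minimal registry-decorator sketch; evalscope's Benchmark.register has a
# richer signature (pretty_name, description, ...) that is not shown here.
BENCHMARK_REGISTRY = {}

def register(name: str, tags=None, **meta):
    def decorator(adapter_cls):
        BENCHMARK_REGISTRY[name] = {'cls': adapter_cls, 'tags': tags or [], **meta}
        return adapter_cls
    return decorator

@register(name='demo_mcq', tags=['Knowledge', 'MCQ'])
class DemoAdapter:
    pass

print(BENCHMARK_REGISTRY['demo_mcq']['tags'])  # -> ['Knowledge', 'MCQ']
```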
evalscope/benchmarks/ceval/ceval_adapter.py CHANGED

@@ -1,11 +1,13 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import csv
 import os
+from collections import defaultdict

 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser
+from evalscope.utils.io_utils import csv_to_list
 from evalscope.utils.logger import get_logger

 # flake8: noqa
@@ -154,7 +156,7 @@ class CEVALAdapter(DataAdapter):
         self.choices = ['A', 'B', 'C', 'D']

     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-        data_dict =
+        data_dict = defaultdict(dict)
         for subset_name in subset_list:
             for split_name in [self.train_split, self.eval_split]:
                 if os.path.exists(dataset_name_or_path):
@@ -162,20 +164,7 @@ class CEVALAdapter(DataAdapter):
                 else:
                     file_path = os.path.join(work_dir, dataset_name_or_path, f'{subset_name}_{split_name}.csv')
                 if os.path.exists(file_path):
-
-                    rows = []
-                    reader = csv.reader(f)
-                    header = next(reader)
-                    for row in reader:
-                        item = dict(zip(header, row))
-                        item.setdefault('explanation', '')
-                        item.setdefault('answer', '')
-                        rows.append(item)
-
-                    if subset_name in data_dict:
-                        data_dict[subset_name].update({split_name: rows})
-                    else:
-                        data_dict[subset_name] = {split_name: rows}
+                    data_dict[subset_name][split_name] = csv_to_list(file_path)

         return data_dict
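Both the C-Eval and CMMLU loaders now delegate CSV parsing to csv_to_list from evalscope.utils.io_utils (that module grows by 55 lines in this release; its implementation is not shown here). A minimal equivalent, assuming the helper simply maps the header row onto each data row:

```python
import csv
from typing import Dict, List

def csv_to_list(file_path: str) -> List[Dict[str, str]]:
    # Minimal sketch of a csv_to_list-style helper: read a CSV with a header
    # row and return one dict per data row. The real helper in
    # evalscope.utils.io_utils may handle encodings and edge cases differently.
    with open(file_path, newline='', encoding='utf-8') as f:
        return list(csv.DictReader(f))
```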
evalscope/benchmarks/cmmlu/cmmlu_adapter.py CHANGED

@@ -2,11 +2,13 @@

 import csv
 import os
+from collections import defaultdict

 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser
+from evalscope.utils.io_utils import csv_to_list
 from evalscope.utils.logger import get_logger

 # flake8: noqa
@@ -126,29 +128,15 @@ class CMMLUAdapter(DataAdapter):
         self.choices = ['A', 'B', 'C', 'D']

     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-        data_dict =
+        data_dict = defaultdict(dict)
         for subset_name in subset_list:
-            data_dict[subset_name] = {}
             for split_name in [self.train_split, self.eval_split]:
-
+                if os.path.exists(dataset_name_or_path):
+                    file_path = os.path.join(dataset_name_or_path, split_name, f'{subset_name}.csv')
+                else:
+                    file_path = os.path.join(work_dir, dataset_name_or_path, split_name, f'{subset_name}.csv')
                 if os.path.exists(file_path):
-
-                    rows = []
-                    reader = csv.reader(f)
-                    for row in reader:
-                        if len(row) != 7:
-                            logger.error(f'Mismatch len of row: {row}, len of row should be 6. Skip this row.')
-                            continue
-                        rows.append({
-                            'Question': row[1],
-                            'A': row[2],
-                            'B': row[3],
-                            'C': row[4],
-                            'D': row[5],
-                            'Answer': row[6],
-                        })
-
-                    data_dict[subset_name].update({split_name: rows})
+                    data_dict[subset_name][split_name] = csv_to_list(file_path)

         return data_dict
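As in the C-Eval adapter, the manual nested-dict bookkeeping (pre-creating data_dict[subset_name] and, in C-Eval, the 'if subset_name in data_dict' branch) is replaced by defaultdict(dict), so data_dict[subset][split] = rows works without any initialization. A quick sketch of that pattern:

```python
from collections import defaultdict

data_dict = defaultdict(dict)

# No need to create data_dict['agronomy'] first; the inner dict appears on demand.
data_dict['agronomy']['test'] = [{'Question': '...', 'Answer': 'A'}]
data_dict['agronomy']['dev'] = []

print(dict(data_dict))  # {'agronomy': {'test': [...], 'dev': []}}
```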
evalscope/benchmarks/competition_math/competition_math_adapter.py CHANGED

@@ -105,7 +105,8 @@ class CompetitionMathAdapter(DataAdapter):
         return result

     def match(self, gold: str, pred: str) -> float:
-
+        res = math_equal(pred, gold)
+        return 1.0 if res else 0.0

     @classmethod
     def _generate_prompt(cls, input_d: dict, use_fewshot: bool = True) -> str:
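match now turns the boolean from math_equal into a 1.0/0.0 score. math_equal comes from evalscope's metrics and its implementation is not part of this diff; a rough, hypothetical stand-in using sympy captures the idea of value-level (rather than string-level) equality:

```python
import sympy  # assumption: a sympy-style check; the real math_equal may differ

def math_equal_sketch(pred: str, gold: str) -> bool:
    # Hypothetical simplification: answers count as equal if their difference
    # simplifies to zero, with a fallback to normalized string comparison.
    try:
        return sympy.simplify(sympy.sympify(pred) - sympy.sympify(gold)) == 0
    except (sympy.SympifyError, TypeError):
        return pred.strip() == gold.strip()

def match(gold: str, pred: str) -> float:
    return 1.0 if math_equal_sketch(pred, gold) else 0.0

print(match('1/2', '0.5'))  # -> 1.0
```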
evalscope/benchmarks/data_adapter.py CHANGED

@@ -168,7 +168,12 @@ class DataAdapter(ABC):
         If you want to support local dataset, please rewrite this method in xxx_data_adapter.
         Use modelscope.msdatasets.MsDataset.load to load the dataset from local by default.
         """
-
+        # remove dataset_infos.json file if exists, since MsDataset will occur an error if it exists.
+        dataset_infos_path = os.path.join(dataset_name_or_path, 'dataset_infos.json')
+        if os.path.exists(dataset_infos_path):
+            logger.info(f'Removing dataset_infos.json file at {dataset_infos_path} to avoid MsDataset errors.')
+            os.remove(dataset_infos_path)
+        return self.load_from_hub(dataset_name_or_path, subset_list, None, **kwargs)

     def load_with_snapshot(self,
                            file_structure: Dict[str, List[str]],
@@ -382,7 +387,7 @@ class DataAdapter(ABC):
         pass

     def gen_prompt_data(self,
-                        prompt: str,
+                        prompt: str = '',
                         system_prompt: Optional[str] = None,
                         choices: Optional[List[str]] = None,
                         index: Optional[Union[int, str]] = None,
@@ -413,7 +418,8 @@ class DataAdapter(ABC):
             system_prompt=system_prompt or self.system_prompt,
             index=index or 0,
             id=id,
-            messages=messages
+            messages=messages,
+            extra_data=kwargs.get('extra_data', None))
         return prompt_data.to_dict()

     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
@@ -449,7 +455,6 @@ class DataAdapter(ABC):
         """
         raise NotImplementedError

-    @abstractmethod
     def parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> Any:
         """
         Parse the predicted result and extract proper answer.
@@ -462,9 +467,22 @@ class DataAdapter(ABC):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-
+        return result
+
+    def llm_parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> Any:
+        """
+        Parse the predicted result using LLM.
+
+        Args:
+            result (Any): The predicted answer from the model.
+            raw_input_d (dict): The raw input data.
+            eval_type (str): The evaluation type, default is 'checkpoint'.
+
+        Returns:
+            The parsed answer. Usually a string for chat.
+        """
+        return result

-    @abstractmethod
     def match(self, gold: Any, pred: Any) -> Any:
         """
         Match the gold answer and the predicted answer.
@@ -478,7 +496,7 @@ class DataAdapter(ABC):
         Returns:
             The match result. Usually a score (float) for chat/multiple-choice-questions.
         """
-
+        return 1.0 if gold == pred else 0.0

     def llm_match(self, gold: Any, pred: Any, judge: Optional[LLMJudge] = None, **kwargs) -> float:
         """
@@ -504,5 +522,7 @@ class DataAdapter(ABC):

         # Request judge and obtain score
         prompt = judge.build_prompt(pred, gold, question)
-
-
+        judge_response = judge(prompt)
+        score = judge.get_score(judge_response)
+
+        return score
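The DataAdapter changes turn parse_pred_result and match from abstract methods into overridable defaults (identity parse, exact-match scoring), add llm_parse_pred_result, and complete llm_match with the judge round-trip shown in the last hunk. A sketch of that judge protocol with a stub object, showing the call sequence the base class now expects; the stub's scoring logic is illustrative only:

```python
# Sketch of the judge protocol used by llm_match: build_prompt -> __call__ -> get_score.
# This stub fakes the reply; evalscope's LLMJudge calls a real judge model.
class StubJudge:
    def build_prompt(self, pred: str, gold: str, question: str) -> str:
        return (f'Question: {question}\nReference: {gold}\nAnswer: {pred}\n'
                'Reply with "1" if the answer matches the reference, else "0".')

    def __call__(self, prompt: str) -> str:
        return '1' if 'Reference: 4' in prompt and 'Answer: 4' in prompt else '0'

    def get_score(self, judge_response: str) -> float:
        return 1.0 if judge_response.strip() == '1' else 0.0

judge = StubJudge()
prompt = judge.build_prompt(pred='4', gold='4', question='What is 2 + 2?')
print(judge.get_score(judge(prompt)))  # -> 1.0
```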