evalscope 0.16.2__py3-none-any.whl → 0.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/app/app.py +9 -762
- evalscope/app/constants.py +1 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +52 -0
- evalscope/app/ui/multi_model.py +323 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +202 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +178 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +91 -0
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/backend_manager.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
- evalscope/benchmarks/__init__.py +15 -1
- evalscope/benchmarks/aime/aime24_adapter.py +2 -1
- evalscope/benchmarks/aime/aime25_adapter.py +2 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/utils.py +0 -12
- evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
- evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
- evalscope/benchmarks/data_adapter.py +20 -5
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
- evalscope/benchmarks/general_arena/utils.py +226 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +42 -29
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
- evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
- evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
- evalscope/benchmarks/mmlu/mmlu_adapter.py +1 -1
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/benchmarks/race/race_adapter.py +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
- evalscope/benchmarks/utils.py +1 -2
- evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
- evalscope/config.py +8 -123
- evalscope/evaluator/evaluator.py +15 -12
- evalscope/metrics/__init__.py +6 -0
- evalscope/{utils/utils.py → metrics/completion_parsers.py} +68 -180
- evalscope/metrics/llm_judge.py +105 -20
- evalscope/metrics/metrics.py +1 -1
- evalscope/models/adapters/base_adapter.py +0 -2
- evalscope/models/adapters/server_adapter.py +2 -2
- evalscope/models/custom/dummy_model.py +3 -3
- evalscope/perf/arguments.py +2 -16
- evalscope/perf/main.py +1 -1
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +1 -1
- evalscope/report/__init__.py +1 -1
- evalscope/report/utils.py +34 -15
- evalscope/run.py +1 -1
- evalscope/summarizer.py +1 -2
- evalscope/utils/__init__.py +63 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/import_utils.py +16 -0
- evalscope/utils/io_utils.py +45 -4
- evalscope/utils/model_utils.py +37 -1
- evalscope/version.py +2 -2
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/METADATA +55 -26
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/RECORD +90 -101
- tests/aigc/test_t2i.py +1 -1
- tests/cli/test_all.py +50 -2
- tests/cli/test_collection.py +1 -1
- tests/cli/test_custom.py +261 -0
- tests/cli/test_run.py +13 -37
- tests/perf/test_perf.py +2 -2
- tests/rag/test_clip_benchmark.py +2 -1
- tests/rag/test_mteb.py +3 -1
- tests/rag/test_ragas.py +3 -1
- tests/swift/test_run_swift_eval.py +2 -1
- tests/swift/test_run_swift_vlm_eval.py +2 -1
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
- tests/utils.py +13 -0
- tests/vlm/test_vlmeval.py +8 -2
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- /evalscope/{utils → benchmarks}/filters.py +0 -0
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/LICENSE +0 -0
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/WHEEL +0 -0
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/top_level.txt +0 -0
evalscope/{utils/utils.py → metrics/completion_parsers.py}
RENAMED
@@ -1,77 +1,85 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-#
+# flake8: noqa
 
-import
-import hashlib
-import importlib
-import importlib.util
-import numpy as np
-import os
-import random
+import ast
 import re
-import torch
-from inspect import signature
-from typing import Any, Dict, List, Tuple, Union
 
+# from . import utils as ann_utils
+from evalscope.constants import ArenaWinner
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
 
-
+one_score_pattern = re.compile('\[\[(\d+\.?\d*)\]\]')
+one_score_pattern_backup = re.compile('\[(\d+\.?\d*)\]')
 
-# Example: export TEST_LEVEL_LIST=0,1
-TEST_LEVEL_LIST_STR = 'TEST_LEVEL_LIST'
 
+# modified from: https://github.com/lm-sys/FastChat/blob/main/fastchat/eval/eval_gpt_review.py#L47
+# does not work with batched completions
+def lmsys_parser(completion, output_format):
+    if output_format == '[[rating]]':
+        match = re.search(one_score_pattern, completion)
+        if not match:
+            match = re.search(one_score_pattern_backup, completion)
 
-
-
-
-
-
-
-
-
-
-
-
+        if match:
+            rating = ast.literal_eval(match.groups()[0])
+        else:
+            logger.error(f'Content: {completion}\n'
+                         'You must manually fix the score.')
+            rating = -1
+
+        return rating
+    if output_format == '[[rating_a,rating_b]]':
+        try:
+            score_pair = completion.split('\n')[0]
+            score_pair = score_pair.replace(',', ' ')
+            sp = score_pair.split(' ')
+            if len(sp) == 2:
+                score_1 = float(sp[0])
+                score_2 = float(sp[1])
+                if score_1 > score_2:
+                    winner = ArenaWinner.MODEL_A
+                elif score_1 < score_2:
+                    winner = ArenaWinner.MODEL_B
+                else:
+                    if score_1 == score_1 == -1:
+                        winner = ArenaWinner.UNKNOWN
+                    winner = ArenaWinner.TIE
+                return winner, [score_1, score_2]
+            else:
+                raise Exception('Invalid score pair.')
+        except Exception as e:
+            logger.error(f'{e}\nContent: {completion}\nYou must manually fix the score pair.')
+            return ArenaWinner.UNKNOWN, [-1, -1]
+    elif output_format == '[[A]]':
+        if '[[A]]' in completion:
+            winner = ArenaWinner.MODEL_A
+        elif '[[B]]' in completion:
+            winner = ArenaWinner.MODEL_B
+        elif '[[C]]' in completion:
+            winner = ArenaWinner.TIE
+        else:
+            logger.error(f'\nContent: {completion}\nYou must manually fix the score.')
+            winner = ArenaWinner.UNKNOWN
+        return winner
+
+
+def ranking_parser(completion, **kwargs):
     try:
-
-
-
-
-
-        if spliter:
-            for attr in cls_name.split('.'):
-                obj_cls = getattr(obj_cls, attr)
-
-    return functools.partial(obj_cls, *args, **kwargs)
-
-
-def random_seeded_choice(seed: Union[int, str, float], choices, **kwargs):
-    """Random choice with a (potentially string) seed."""
-    return random.Random(seed).choices(choices, k=1, **kwargs)[0]
-
-
-def gen_hash(name: str, bits: int = 32):
-    return hashlib.md5(name.encode(encoding='UTF-8')).hexdigest()[:bits]
-
+        if isinstance(completion, str):
+            ordered_completions = ast.literal_eval(completion)
+        else:
+            ordered_completions = completion
 
-
-
-    Checks whether the passed dictionary and its nested dicts have a *torch_dtype* key and if it's not None,
-    converts torch.dtype to a string of just the type. For example, `torch.float32` get converted into *"float32"*
-    string, which can then be stored in the json format.
+        rank = [c for c in ordered_completions if c['model'] == 'model_a'][0]['rank']
+        assert rank in [1, 2]
 
-
-
-
-
-
-    for value in d.values():
-        if isinstance(value, dict):
-            dict_torch_dtype_to_str(value)
-
-    return d
+        return ArenaWinner.MODEL_A if rank == 1 else ArenaWinner.MODEL_B
+    except Exception as e:
+        logger.error(f'{e}\nContent: {completion}\n'
+                     'You must manually fix the score pair.')
+        return ArenaWinner.UNKNOWN
 
 
 class ResponseParser:
@@ -194,7 +202,6 @@ class ResponseParser:
             return last_capital
         return 'No valid option found'
 
-
     @staticmethod
     def parse_bracketed_answer(text: str, options: list[str]) -> str:
         options = ResponseParser.process_options(options)
@@ -211,122 +218,3 @@ class ResponseParser:
         # Join options into a regex pattern separated by '|', to match any of the options
         options_pattern = '|'.join(escaped_options)
         return options_pattern
-
-def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]:
-    """
-    Normalize score.
-
-    Args:
-        score: input score, could be float or dict. e.g. 0.12345678 or {'acc': 0.12345678, 'f1': 0.12345678}
-        keep_num: number of digits to keep.
-
-    Returns:
-        Union[float, dict]: normalized score. e.g. 0.1234 or {'acc': 0.1234, 'f1': 0.1234}
-    """
-    if isinstance(score, float):
-        score = round(score, keep_num)
-    elif isinstance(score, dict):
-        score = {k: round(v, keep_num) for k, v in score.items()}
-    else:
-        logger.warning(f'Unknown score type: {type(score)}')
-
-    return score
-
-
-def is_module_installed(module_name):
-    try:
-        importlib.import_module(module_name)
-        return True
-    except ImportError:
-        return False
-
-
-def get_module_path(module_name):
-    spec = importlib.util.find_spec(module_name)
-    if spec and spec.origin:
-        return os.path.abspath(spec.origin)
-    else:
-        raise ValueError(f'Cannot find module: {module_name}')
-
-
-def get_valid_list(input_list, candidate_list):
-    """
-    Get the valid and invalid list from input_list based on candidate_list.
-    Args:
-        input_list: The input list.
-        candidate_list: The candidate list.
-
-    Returns:
-        valid_list: The valid list.
-        invalid_list: The invalid list.
-    """
-    return [i for i in input_list if i in candidate_list], \
-        [i for i in input_list if i not in candidate_list]
-
-
-def get_latest_folder_path(work_dir):
-    from datetime import datetime
-
-    # Get all subdirectories in the work_dir
-    folders = [f for f in os.listdir(work_dir) if os.path.isdir(os.path.join(work_dir, f))]
-
-    # Get the timestamp(YYYYMMDD_HHMMSS)
-    timestamp_pattern = re.compile(r'^\d{8}_\d{6}$')
-
-    # Filter out the folders
-    timestamped_folders = [f for f in folders if timestamp_pattern.match(f)]
-
-    if not timestamped_folders:
-        print(f'>> No timestamped folders found in {work_dir}!')
-        return None
-
-    # timestamp parser
-    def parse_timestamp(folder_name):
-        return datetime.strptime(folder_name, '%Y%m%d_%H%M%S')
-
-    # Find the latest folder
-    latest_folder = max(timestamped_folders, key=parse_timestamp)
-
-    return os.path.join(work_dir, latest_folder)
-
-
-def csv_to_list(file_path: str) -> List[dict]:
-    import csv
-
-    with open(file_path, mode='r', newline='', encoding='utf-8') as csv_file:
-        csv_reader = csv.DictReader(csv_file)
-        result = [row for row in csv_reader]
-
-    return result
-
-
-def seed_everything(seed: int):
-    """Set all random seeds to a fixed value for reproducibility.
-
-    Args:
-        seed (int): The seed value.
-    """
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed_all(seed)
-        torch.backends.cudnn.deterministic = True
-        torch.backends.cudnn.benchmark = False
-
-def get_supported_params(func):
-    """Get the supported parameters of a function."""
-    sig = signature(func)
-    return list(sig.parameters.keys())
-
-def parse_int_or_float(num):
-    number = float(num)
-    if number.is_integer():
-        return int(number)
-    return number
-
-if __name__ == '__main__':
-    options = ['A', 'B', 'C', 'D']
-    answers = ['Context .... ANSWER: A', 'answer: A']
-    for answer in answers:
-        print(ResponseParser.parse_first_option(answer, options))
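For orientation, a quick sketch of how the judge-output parsers that moved into this module behave. The import path follows the rename above, and the expected return values are inferred from the parser code shown in this diff:

```python
# Illustrative usage only; return values inferred from the code in this diff.
from evalscope.constants import ArenaWinner
from evalscope.metrics.completion_parsers import lmsys_parser, ranking_parser

# '[[rating]]' format: the [[x.y]] regex extracts a single numeric score.
rating = lmsys_parser('The answer is mostly correct. Rating: [[8.5]]', '[[rating]]')
# rating == 8.5

# '[[rating_a,rating_b]]' format: two scores on the first line decide the winner.
winner, scores = lmsys_parser('7,9\nAssistant B gives more detail.', '[[rating_a,rating_b]]')
# winner == ArenaWinner.MODEL_B, scores == [7.0, 9.0]

# '[[A]]' format: a verdict letter maps directly to a winner.
verdict = lmsys_parser('Comparing both answers, my verdict is [[A]].', '[[A]]')
# verdict == ArenaWinner.MODEL_A

# ranking_parser expects per-model ranks; model_a ranked 1 means model A wins.
ranked = ranking_parser([{'model': 'model_a', 'rank': 1}, {'model': 'model_b', 'rank': 2}])
# ranked == ArenaWinner.MODEL_A
```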
evalscope/metrics/llm_judge.py
CHANGED
@@ -8,11 +8,14 @@ logger = get_logger()
 
 DEFAULT_PROMPT_TEMPLATE = """Your job is to look at a question, a gold target, and a predicted answer, and return a letter "A" or "B" to indicate whether the predicted answer is correct or incorrect.
 
-Question
+[Question]
+{question}
 
-Reference Answer
+[Reference Answer]
+{gold}
 
-
+[Predicted Answer]
+{pred}
 
 Evaluate the model's answer based on correctness compared to the reference answer.
 Grade the predicted answer of this new question as one of:
@@ -22,6 +25,18 @@ B: INCORRECT
 Just return the letters "A" or "B", with no text around it.
 """  # noqa: E501
 
+
+DEFAULT_NUMERIC_SCORE_TEMPLATE = """Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response.
+Begin your evaluation by providing a short explanation. Be as objective as possible.
+After providing your explanation, you must rate the response on a scale of 0 (worst) to 1 (best) by strictly following this format: \"[[rating]]\", for example: \"Rating: [[0.5]]\"
+
+[Question]
+{question}
+
+[Response]
+{pred}
+"""  # noqa: E501
+
 DEFAULT_JUDGE_MODEL = 'Qwen/Qwen3-235B-A22B'
 DEFAULT_API_URL = 'https://api-inference.modelscope.cn/v1/'
 
@@ -31,14 +46,18 @@ class LLMJudge:
     A metric that uses LLM to judge the quality of model predictions by comparing them with reference answers.
     """
 
-    def __init__(
-
-
-
-
-
-
+    def __init__(
+            self,
+            api_key: Optional[str] = None,
+            api_url: Optional[str] = None,
+            model_id: Optional[str] = None,
+            system_prompt: Optional[str] = None,
+            prompt_template: Optional[str] = None,
+            generation_config: Optional[Dict[str, Any]] = None,
+            score_pattern: Optional[str] = None,
+            score_mapping: Optional[Dict[str, float]] = None,
+            score_type: str = 'pattern',  # 'pattern', 'numeric'
+            **kwargs):
         """
         Initialize LLMJudge metric.
 
@@ -49,14 +68,34 @@ class LLMJudge:
             system_prompt (str, optional): System prompt for the judge
             prompt_template (str, optional): Prompt template for the judge
             generation_config (dict, optional): Generation configuration for the judge
+            score_pattern (str, optional): Regex pattern to extract score from LLM response
+            score_mapping (dict, optional): Mapping from extracted score to float value
+            score_type (str, optional): Type of score extraction strategy ('pattern', 'numeric') defaults to 'pattern'.
+                - 'pattern': Use score_pattern and score_mapping to extract categorical scores
+                - 'numeric': Treat the extracted value as a direct numerical score
         """
         self.api_key = api_key or os.environ.get('MODELSCOPE_SDK_TOKEN', 'EMPTY')
         self.api_url = api_url or os.environ.get('MODELSCOPE_API_BASE', DEFAULT_API_URL)
         self.model_id = model_id or os.environ.get('MODELSCOPE_JUDGE_LLM', DEFAULT_JUDGE_MODEL)
         self.system_prompt = system_prompt or os.environ.get('JUDGE_SYSTEM_PROMPT', None)
-        self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
         self.generation_config = generation_config or {}
 
+        # Default score mapping for A/B pattern
+        self.score_type = score_type
+        if self.score_type == 'numeric':
+            self.score_pattern = score_pattern or r'\[\[(\d+(?:\.\d+)?)\]\]'
+            self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE',
+                                                                     DEFAULT_NUMERIC_SCORE_TEMPLATE)
+        elif self.score_type == 'pattern':
+            self.score_pattern = score_pattern or r'(A|B)'
+            self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
+        else:
+            raise ValueError(f"Invalid score_type: {self.score_type}. Must be 'pattern' or 'numeric'.")
+        self.score_mapping = score_mapping or {'A': 1.0, 'B': 0.0}
+
+        self._init_server_adapter()
+
+    def _init_server_adapter(self):
         from evalscope.models import ServerModelAdapter
 
         # Initialize ServerModelAdapter
@@ -95,17 +134,63 @@ class LLMJudge:
     def build_prompt(self, pred: str, gold: str, question: Optional[str] = None):
         if question is None:
             question = 'Not provided'
-
+
+        # check variables in prompt_template
+        prompt = self.prompt_template
+        if '{question}' in self.prompt_template:
+            prompt = prompt.replace('{question}', question)
+        if '{pred}' in self.prompt_template:
+            prompt = prompt.replace('{pred}', pred)
+        if '{gold}' in self.prompt_template:
+            prompt = prompt.replace('{gold}', gold)
+        return prompt
 
     def get_score(self, response: str) -> float:
+        """
+        Extract score from LLM response using the configured pattern and mapping.
+
+        Args:
+            response (str): The response from the LLM
+
+        Returns:
+            float: The numeric score extracted from the response
+        """
         if response is None:
-            return 0
-
+            return 0.0
+
+        # choose extraction method based on score_type
+        if self.score_type == 'numeric':
+            return self._extract_numeric_score(response)
+        elif self.score_type == 'pattern':
+            return self._extract_pattern_score(response)
+
+    def _extract_numeric_score(self, response: str) -> Optional[float]:
+        """extract numeric score from the response using the score_pattern"""
+        match = re.search(self.score_pattern, response)
+
+        if match:
+            # try to convert each captured group to float
+            for group in match.groups():
+                if group is not None:
+                    try:
+                        return float(group)
+                    except (ValueError, TypeError):
+                        continue
+
+            # if not found in groups, try the whole match
+            try:
+                return float(match.group(0))
+            except (ValueError, TypeError):
+                logger.warning(f'Failed to convert any extracted value to float from: {match.group(0)}')
+
+        return None
+
+    def _extract_pattern_score(self, response: str) -> float:
+        """use the score_pattern to extract categorical scores"""
+        match = re.search(self.score_pattern, response)
         if match:
             answer = match.group(0)
-
-                return 1
-            elif answer == 'B':
-                return 0
+            return self.score_mapping.get(answer, 0.0)
         else:
-
+            logger.warning(f"No match found for pattern '{self.score_pattern}' in response: {response}")
+            return 0.0
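The new score_type switch controls how the judge's raw reply becomes a number: 'pattern' keeps the A/B grading mapped through score_mapping, while 'numeric' expects a "Rating: [[0.5]]" style reply and reads the value directly. A standalone sketch of the two default extraction paths, mirroring the regexes added above (this is not the class itself, so no judge endpoint is needed):

```python
import re
from typing import Optional

# Default patterns copied from the LLMJudge.__init__ changes in this diff.
NUMERIC_PATTERN = r'\[\[(\d+(?:\.\d+)?)\]\]'   # score_type='numeric'
AB_PATTERN = r'(A|B)'                          # score_type='pattern'
AB_MAPPING = {'A': 1.0, 'B': 0.0}              # default score_mapping

def extract_numeric_score(response: str) -> Optional[float]:
    # Mirrors _extract_numeric_score: take the bracketed number as the score.
    match = re.search(NUMERIC_PATTERN, response)
    return float(match.group(1)) if match else None

def extract_pattern_score(response: str) -> float:
    # Mirrors _extract_pattern_score: map the first A/B letter through the mapping.
    match = re.search(AB_PATTERN, response)
    return AB_MAPPING.get(match.group(0), 0.0) if match else 0.0

print(extract_numeric_score('The response is partially helpful. Rating: [[0.5]]'))  # 0.5
print(extract_pattern_score('A'))  # 1.0
```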
evalscope/metrics/metrics.py
CHANGED
@@ -223,7 +223,7 @@ def chrf(items):
     Source: https://github.com/m-popovic/chrF
     Paper: https://www.aclweb.org/anthology/W15-3049.pdf
 
-    Higher is better
+    Higher is better
     """
     refs = list(zip(*items))[0]
     preds = list(zip(*items))[1]
evalscope/models/adapters/base_adapter.py
CHANGED
@@ -54,8 +54,6 @@ def initialize_model_adapter(task_cfg: 'TaskConfig', benchmark: 'DataAdapter', b
 
         if 'server' not in model_adapter_cls_str:
             model_adapter_cls_str = 'server'
-            logger.info(
-                f'Using {model_adapter_cls.__name__} for api model evaluation for benchmark {benchmark.name}.')
 
     # init server model adapter
     model_adapter_cls = get_model_adapter(model_adapter_cls_str)
evalscope/models/adapters/server_adapter.py
CHANGED
@@ -5,8 +5,8 @@ from openai.types.chat import ChatCompletion, ChatCompletionChunk
 from openai.types.chat.chat_completion import ChatCompletionMessage, Choice
 from typing import List, Optional, Union
 
+from evalscope.utils.argument_utils import get_supported_params
 from evalscope.utils.logger import get_logger
-from evalscope.utils.utils import get_supported_params
 from .base_adapter import BaseModelAdapter
 
 logger = get_logger()
@@ -29,7 +29,7 @@ class ServerModelAdapter(BaseModelAdapter):
         self.api_key = api_key
 
         self.client = openai.OpenAI(
-            api_key=api_key,
+            api_key=self.api_key,
             base_url=self.api_url,
         )
         self.supported_params = get_supported_params(self.client.chat.completions.create)
evalscope/models/custom/dummy_model.py
CHANGED
@@ -50,14 +50,14 @@ class DummyCustomModel(CustomModel):
         # Must return a list of dicts with the same format as the OpenAI API.
         responses = []
         for input_item in original_inputs:
-            message = self.make_request_messages(input_item)
-            response = f'Dummy response for prompt: {message}'
+            # message = self.make_request_messages(input_item)
+            # response = f'Dummy response for prompt: {message}'
 
             res_d = {
                 'choices': [{
                     'index': 0,
                     'message': {
-                        'content':
+                        'content': '*PlaceHolder*',
                         'role': 'assistant'
                     }
                 }],
evalscope/perf/arguments.py
CHANGED
@@ -6,10 +6,11 @@ from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional, Union
 
 from evalscope.constants import DEFAULT_WORK_DIR
+from evalscope.utils import BaseArgument
 
 
 @dataclass
-class Arguments:
+class Arguments(BaseArgument):
     # Model and API
     model: str  # Model name or path
     model_id: Optional[str] = None  # Model identifier
@@ -69,15 +70,6 @@ class Arguments:
     top_k: Optional[int] = None  # Top-k sampling setting for the response
     extra_args: Optional[Dict[str, Any]] = None  # Extra arguments
 
-    @staticmethod
-    def from_args(args):
-        # Convert Namespace to a dictionary and filter out None values
-        args_dict = {k: v for k, v in vars(args).items() if v is not None}
-
-        if 'func' in args_dict:
-            del args_dict['func']  # Note: compat CLI arguments
-        return Arguments(**args_dict)
-
     def __post_init__(self):
         # Set the default headers
         self.headers = self.headers or {}  # Default to empty dictionary
@@ -108,12 +100,6 @@ class Arguments:
             self.parallel
         ), f'The length of number and parallel should be the same, but got number: {self.number} and parallel: {self.parallel}'  # noqa: E501
 
-    def __str__(self):
-        return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
-
-    def to_dict(self) -> Dict[str, Any]:
-        return self.__dict__
-
 
 class ParseKVAction(argparse.Action):
 
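The removed from_args, __str__, and to_dict helpers are now inherited from the shared BaseArgument class imported from evalscope.utils (implemented in the new evalscope/utils/argument_utils.py, which is not expanded in this diff). A hypothetical sketch of such a base class, reconstructed only from the helpers deleted here:

```python
import json
from dataclasses import dataclass
from typing import Any, Dict, Optional

class BaseArgumentSketch:
    """Hypothetical stand-in for evalscope.utils.BaseArgument; the real
    implementation lives in evalscope/utils/argument_utils.py."""

    @classmethod
    def from_args(cls, args):
        # Convert an argparse Namespace to a dict and drop None values.
        args_dict = {k: v for k, v in vars(args).items() if v is not None}
        args_dict.pop('func', None)  # compat with CLI sub-command dispatch
        return cls(**args_dict)

    def to_dict(self) -> Dict[str, Any]:
        return self.__dict__

    def __str__(self):
        return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)

@dataclass
class DemoArguments(BaseArgumentSketch):
    # Minimal dataclass just to exercise the mixin.
    model: str
    api_url: Optional[str] = None

print(DemoArguments(model='qwen2.5'))  # pretty-printed JSON of the fields
```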
evalscope/perf/main.py
CHANGED
@@ -9,7 +9,7 @@ from argparse import Namespace
 from evalscope.perf.utils.local_server import start_app
 from evalscope.perf.utils.log_utils import init_swanlab, init_wandb
 from evalscope.utils.logger import configure_logging, get_logger
-from evalscope.utils.
+from evalscope.utils.model_utils import seed_everything
 from .arguments import Arguments, parse_args
 from .benchmark import benchmark
 from .utils.db_util import get_output_path
evalscope/perf/utils/analysis_result.py
CHANGED
@@ -3,27 +3,28 @@ import json
 import pickle
 import sqlite3
 
-
-
-
-    FROM result WHERE success='1'"
+db_path = 'your db path'
+conn = sqlite3.connect(db_path)
+cursor = conn.cursor()
 
-#
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+# Get the column names
+cursor.execute('PRAGMA table_info(result)')
+columns = [info[1] for info in cursor.fetchall()]
+print('列名:', columns)
+
+cursor.execute('SELECT * FROM result WHERE success=1 AND first_chunk_latency > 1')
+rows = cursor.fetchall()
+print(f'len(rows): {len(rows)}')
+
+for row in rows:
+    row_dict = dict(zip(columns, row))
+    # Decode the request
+    row_dict['request'] = pickle.loads(base64.b64decode(row_dict['request']))
+    # Decode the response_messages
+    row_dict['response_messages'] = pickle.loads(base64.b64decode(row_dict['response_messages']))
+    # print(row_dict)
+    print(
+        f"request_id: {json.loads(row_dict['response_messages'][0])['id']}, first_chunk_latency: {row_dict['first_chunk_latency']}"  # noqa: E501
+    )
+    # If you only want to inspect one row, break here
+    # break
evalscope/perf/utils/benchmark_util.py
CHANGED
@@ -38,7 +38,7 @@ class BenchmarkData:
             self.first_chunk_latency = self.query_latency
             self.n_chunks = 1
             self.n_chunks_time = self.query_latency
-        self.time_per_output_token = self.n_chunks_time / self.n_chunks
+        self.time_per_output_token = self.n_chunks_time / self.n_chunks if self.n_chunks != 0 else 0.0
 
     def _calculate_tokens(self, api_plugin):
         self.prompt_tokens, self.completion_tokens = \
evalscope/report/__init__.py
CHANGED
@@ -4,7 +4,7 @@ from typing import TYPE_CHECKING
 from evalscope.utils.import_utils import _LazyModule
 
 if TYPE_CHECKING:
-    from .combinator import
+    from .combinator import gen_table, get_data_frame, get_report_list
     from .generator import ReportGenerator
     from .utils import Category, Report, ReportKey, Subset
 