evalscope 0.16.2__py3-none-any.whl → 0.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope has been flagged as potentially problematic.

Files changed (117)
  1. evalscope/app/app.py +9 -762
  2. evalscope/app/constants.py +1 -0
  3. evalscope/app/ui/__init__.py +20 -0
  4. evalscope/app/ui/app_ui.py +52 -0
  5. evalscope/app/ui/multi_model.py +323 -0
  6. evalscope/app/ui/sidebar.py +42 -0
  7. evalscope/app/ui/single_model.py +202 -0
  8. evalscope/app/ui/visualization.py +36 -0
  9. evalscope/app/utils/data_utils.py +178 -0
  10. evalscope/app/utils/localization.py +221 -0
  11. evalscope/app/utils/text_utils.py +119 -0
  12. evalscope/app/utils/visualization.py +91 -0
  13. evalscope/backend/opencompass/backend_manager.py +2 -1
  14. evalscope/backend/rag_eval/backend_manager.py +2 -1
  15. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  16. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
  17. evalscope/benchmarks/__init__.py +15 -1
  18. evalscope/benchmarks/aime/aime24_adapter.py +2 -1
  19. evalscope/benchmarks/aime/aime25_adapter.py +2 -1
  20. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
  21. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  22. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
  23. evalscope/benchmarks/arena_hard/utils.py +0 -12
  24. evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
  25. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
  26. evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
  27. evalscope/benchmarks/data_adapter.py +20 -5
  28. evalscope/benchmarks/general_arena/__init__.py +0 -0
  29. evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
  30. evalscope/benchmarks/general_arena/utils.py +226 -0
  31. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  32. evalscope/benchmarks/general_qa/general_qa_adapter.py +42 -29
  33. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  34. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
  35. evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
  36. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
  37. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
  38. evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
  39. evalscope/benchmarks/mmlu/mmlu_adapter.py +1 -1
  40. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  41. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
  42. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  43. evalscope/benchmarks/race/race_adapter.py +1 -1
  44. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
  45. evalscope/benchmarks/utils.py +1 -2
  46. evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
  47. evalscope/config.py +8 -123
  48. evalscope/evaluator/evaluator.py +15 -12
  49. evalscope/metrics/__init__.py +6 -0
  50. evalscope/{utils/utils.py → metrics/completion_parsers.py} +68 -180
  51. evalscope/metrics/llm_judge.py +105 -20
  52. evalscope/metrics/metrics.py +1 -1
  53. evalscope/models/adapters/base_adapter.py +0 -2
  54. evalscope/models/adapters/server_adapter.py +2 -2
  55. evalscope/models/custom/dummy_model.py +3 -3
  56. evalscope/perf/arguments.py +2 -16
  57. evalscope/perf/main.py +1 -1
  58. evalscope/perf/utils/analysis_result.py +24 -23
  59. evalscope/perf/utils/benchmark_util.py +1 -1
  60. evalscope/report/__init__.py +1 -1
  61. evalscope/report/utils.py +34 -15
  62. evalscope/run.py +1 -1
  63. evalscope/summarizer.py +1 -2
  64. evalscope/utils/__init__.py +63 -2
  65. evalscope/utils/argument_utils.py +64 -0
  66. evalscope/utils/import_utils.py +16 -0
  67. evalscope/utils/io_utils.py +45 -4
  68. evalscope/utils/model_utils.py +37 -1
  69. evalscope/version.py +2 -2
  70. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/METADATA +55 -26
  71. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/RECORD +90 -101
  72. tests/aigc/test_t2i.py +1 -1
  73. tests/cli/test_all.py +50 -2
  74. tests/cli/test_collection.py +1 -1
  75. tests/cli/test_custom.py +261 -0
  76. tests/cli/test_run.py +13 -37
  77. tests/perf/test_perf.py +2 -2
  78. tests/rag/test_clip_benchmark.py +2 -1
  79. tests/rag/test_mteb.py +3 -1
  80. tests/rag/test_ragas.py +3 -1
  81. tests/swift/test_run_swift_eval.py +2 -1
  82. tests/swift/test_run_swift_vlm_eval.py +2 -1
  83. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
  84. tests/utils.py +13 -0
  85. tests/vlm/test_vlmeval.py +8 -2
  86. evalscope/evaluator/rating_eval.py +0 -157
  87. evalscope/evaluator/reviewer/__init__.py +0 -1
  88. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  89. evalscope/registry/__init__.py +0 -1
  90. evalscope/registry/config/cfg_arena.yaml +0 -77
  91. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  92. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  93. evalscope/registry/config/cfg_single.yaml +0 -78
  94. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  95. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  96. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  97. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  98. evalscope/registry/data/question.jsonl +0 -80
  99. evalscope/registry/tasks/arc.yaml +0 -28
  100. evalscope/registry/tasks/bbh.yaml +0 -26
  101. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  102. evalscope/registry/tasks/ceval.yaml +0 -27
  103. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  104. evalscope/registry/tasks/cmmlu.yaml +0 -27
  105. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  106. evalscope/registry/tasks/general_qa.yaml +0 -27
  107. evalscope/registry/tasks/gsm8k.yaml +0 -29
  108. evalscope/registry/tasks/mmlu.yaml +0 -29
  109. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  110. evalscope/run_arena.py +0 -202
  111. evalscope/utils/arena_utils.py +0 -217
  112. evalscope/utils/completion_parsers.py +0 -82
  113. /evalscope/{utils → benchmarks}/filters.py +0 -0
  114. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/LICENSE +0 -0
  115. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/WHEEL +0 -0
  116. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/entry_points.txt +0 -0
  117. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/top_level.txt +0 -0

evalscope/{utils/utils.py → metrics/completion_parsers.py} RENAMED
@@ -1,77 +1,85 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
- # Copyright (c) OpenCompass.
+ # flake8: noqa

- import functools
- import hashlib
- import importlib
- import importlib.util
- import numpy as np
- import os
- import random
+ import ast
  import re
- import torch
- from inspect import signature
- from typing import Any, Dict, List, Tuple, Union

+ # from . import utils as ann_utils
+ from evalscope.constants import ArenaWinner
  from evalscope.utils.logger import get_logger

  logger = get_logger()

- TEST_LEVEL_LIST = [0, 1]
+ one_score_pattern = re.compile('\[\[(\d+\.?\d*)\]\]')
+ one_score_pattern_backup = re.compile('\[(\d+\.?\d*)\]')

- # Example: export TEST_LEVEL_LIST=0,1
- TEST_LEVEL_LIST_STR = 'TEST_LEVEL_LIST'

+ # modified from: https://github.com/lm-sys/FastChat/blob/main/fastchat/eval/eval_gpt_review.py#L47
+ # does not work with batched completions
+ def lmsys_parser(completion, output_format):
+ if output_format == '[[rating]]':
+ match = re.search(one_score_pattern, completion)
+ if not match:
+ match = re.search(one_score_pattern_backup, completion)

- def test_level_list():
- global TEST_LEVEL_LIST
- if TEST_LEVEL_LIST_STR in os.environ:
- TEST_LEVEL_LIST = [int(x) for x in os.environ[TEST_LEVEL_LIST_STR].split(',')]
-
- return TEST_LEVEL_LIST
-
-
- def get_obj_from_cfg(eval_class_ref: Any, *args, **kwargs) -> Any:
- module_name, spliter, cls_name = eval_class_ref.partition(':')
-
+ if match:
+ rating = ast.literal_eval(match.groups()[0])
+ else:
+ logger.error(f'Content: {completion}\n'
+ 'You must manually fix the score.')
+ rating = -1
+
+ return rating
+ if output_format == '[[rating_a,rating_b]]':
+ try:
+ score_pair = completion.split('\n')[0]
+ score_pair = score_pair.replace(',', ' ')
+ sp = score_pair.split(' ')
+ if len(sp) == 2:
+ score_1 = float(sp[0])
+ score_2 = float(sp[1])
+ if score_1 > score_2:
+ winner = ArenaWinner.MODEL_A
+ elif score_1 < score_2:
+ winner = ArenaWinner.MODEL_B
+ else:
+ if score_1 == score_1 == -1:
+ winner = ArenaWinner.UNKNOWN
+ winner = ArenaWinner.TIE
+ return winner, [score_1, score_2]
+ else:
+ raise Exception('Invalid score pair.')
+ except Exception as e:
+ logger.error(f'{e}\nContent: {completion}\nYou must manually fix the score pair.')
+ return ArenaWinner.UNKNOWN, [-1, -1]
+ elif output_format == '[[A]]':
+ if '[[A]]' in completion:
+ winner = ArenaWinner.MODEL_A
+ elif '[[B]]' in completion:
+ winner = ArenaWinner.MODEL_B
+ elif '[[C]]' in completion:
+ winner = ArenaWinner.TIE
+ else:
+ logger.error(f'\nContent: {completion}\nYou must manually fix the score.')
+ winner = ArenaWinner.UNKNOWN
+ return winner
+
+
+ def ranking_parser(completion, **kwargs):
  try:
- obj_cls = importlib.import_module(module_name)
- except ImportError as e:
- logger.error(f'{e}')
- raise e
-
- if spliter:
- for attr in cls_name.split('.'):
- obj_cls = getattr(obj_cls, attr)
-
- return functools.partial(obj_cls, *args, **kwargs)
-
-
- def random_seeded_choice(seed: Union[int, str, float], choices, **kwargs):
- """Random choice with a (potentially string) seed."""
- return random.Random(seed).choices(choices, k=1, **kwargs)[0]
-
-
- def gen_hash(name: str, bits: int = 32):
- return hashlib.md5(name.encode(encoding='UTF-8')).hexdigest()[:bits]
-
+ if isinstance(completion, str):
+ ordered_completions = ast.literal_eval(completion)
+ else:
+ ordered_completions = completion

- def dict_torch_dtype_to_str(d: Dict[str, Any]) -> dict:
- """
- Checks whether the passed dictionary and its nested dicts have a *torch_dtype* key and if it's not None,
- converts torch.dtype to a string of just the type. For example, `torch.float32` get converted into *"float32"*
- string, which can then be stored in the json format.
+ rank = [c for c in ordered_completions if c['model'] == 'model_a'][0]['rank']
+ assert rank in [1, 2]

- Refer to: https://github.com/huggingface/transformers/pull/16065/files for details.
- """
- if d.get('torch_dtype', None) is not None and not isinstance(d['torch_dtype'], str):
- d['torch_dtype'] = str(d['torch_dtype']).split('.')[1]
-
- for value in d.values():
- if isinstance(value, dict):
- dict_torch_dtype_to_str(value)
-
- return d
+ return ArenaWinner.MODEL_A if rank == 1 else ArenaWinner.MODEL_B
+ except Exception as e:
+ logger.error(f'{e}\nContent: {completion}\n'
+ 'You must manually fix the score pair.')
+ return ArenaWinner.UNKNOWN


  class ResponseParser:
@@ -194,7 +202,6 @@ class ResponseParser:
  return last_capital
  return 'No valid option found'

-
  @staticmethod
  def parse_bracketed_answer(text: str, options: list[str]) -> str:
  options = ResponseParser.process_options(options)
@@ -211,122 +218,3 @@ class ResponseParser:
  # Join options into a regex pattern separated by '|', to match any of the options
  options_pattern = '|'.join(escaped_options)
  return options_pattern
-
- def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]:
- """
- Normalize score.
-
- Args:
- score: input score, could be float or dict. e.g. 0.12345678 or {'acc': 0.12345678, 'f1': 0.12345678}
- keep_num: number of digits to keep.
-
- Returns:
- Union[float, dict]: normalized score. e.g. 0.1234 or {'acc': 0.1234, 'f1': 0.1234}
- """
- if isinstance(score, float):
- score = round(score, keep_num)
- elif isinstance(score, dict):
- score = {k: round(v, keep_num) for k, v in score.items()}
- else:
- logger.warning(f'Unknown score type: {type(score)}')
-
- return score
-
-
- def is_module_installed(module_name):
- try:
- importlib.import_module(module_name)
- return True
- except ImportError:
- return False
-
-
- def get_module_path(module_name):
- spec = importlib.util.find_spec(module_name)
- if spec and spec.origin:
- return os.path.abspath(spec.origin)
- else:
- raise ValueError(f'Cannot find module: {module_name}')
-
-
- def get_valid_list(input_list, candidate_list):
- """
- Get the valid and invalid list from input_list based on candidate_list.
- Args:
- input_list: The input list.
- candidate_list: The candidate list.
-
- Returns:
- valid_list: The valid list.
- invalid_list: The invalid list.
- """
- return [i for i in input_list if i in candidate_list], \
- [i for i in input_list if i not in candidate_list]
-
-
- def get_latest_folder_path(work_dir):
- from datetime import datetime
-
- # Get all subdirectories in the work_dir
- folders = [f for f in os.listdir(work_dir) if os.path.isdir(os.path.join(work_dir, f))]
-
- # Get the timestamp(YYYYMMDD_HHMMSS)
- timestamp_pattern = re.compile(r'^\d{8}_\d{6}$')
-
- # Filter out the folders
- timestamped_folders = [f for f in folders if timestamp_pattern.match(f)]
-
- if not timestamped_folders:
- print(f'>> No timestamped folders found in {work_dir}!')
- return None
-
- # timestamp parser
- def parse_timestamp(folder_name):
- return datetime.strptime(folder_name, '%Y%m%d_%H%M%S')
-
- # Find the latest folder
- latest_folder = max(timestamped_folders, key=parse_timestamp)
-
- return os.path.join(work_dir, latest_folder)
-
-
- def csv_to_list(file_path: str) -> List[dict]:
- import csv
-
- with open(file_path, mode='r', newline='', encoding='utf-8') as csv_file:
- csv_reader = csv.DictReader(csv_file)
- result = [row for row in csv_reader]
-
- return result
-
-
- def seed_everything(seed: int):
- """Set all random seeds to a fixed value for reproducibility.
-
- Args:
- seed (int): The seed value.
- """
- random.seed(seed)
- np.random.seed(seed)
- torch.manual_seed(seed)
- if torch.cuda.is_available():
- torch.cuda.manual_seed_all(seed)
- torch.backends.cudnn.deterministic = True
- torch.backends.cudnn.benchmark = False
-
- def get_supported_params(func):
- """Get the supported parameters of a function."""
- sig = signature(func)
- return list(sig.parameters.keys())
-
- def parse_int_or_float(num):
- number = float(num)
- if number.is_integer():
- return int(number)
- return number
-
- if __name__ == '__main__':
- options = ['A', 'B', 'C', 'D']
- answers = ['Context .... ANSWER: A', 'answer: A']
- for answer in answers:
- print(ResponseParser.parse_first_option(answer, options))
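
For orientation, here is a minimal usage sketch of the relocated parsers, assuming the import path implied by the rename above; the completion strings are invented examples of judge output, not data from this release.

```python
# Hedged sketch: import path follows the utils.py -> metrics/completion_parsers.py rename;
# the completion strings below are made-up judge outputs for illustration only.
from evalscope.constants import ArenaWinner
from evalscope.metrics.completion_parsers import lmsys_parser, ranking_parser

# Single-score format: the judge wraps a numeric rating in double brackets.
rating = lmsys_parser('The answer is concise and correct. Rating: [[8.5]]', '[[rating]]')
print(rating)  # 8.5

# Pairwise verdict format: [[A]], [[B]] or [[C]] (tie).
verdict = lmsys_parser('Assistant A is clearly more helpful. [[A]]', '[[A]]')
print(verdict is ArenaWinner.MODEL_A)  # True

# Ranking format: a (possibly stringified) list of {'model': ..., 'rank': ...} dicts.
winner = ranking_parser("[{'model': 'model_a', 'rank': 1}, {'model': 'model_b', 'rank': 2}]")
print(winner is ArenaWinner.MODEL_A)  # True
```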

evalscope/metrics/llm_judge.py CHANGED
@@ -8,11 +8,14 @@ logger = get_logger()

  DEFAULT_PROMPT_TEMPLATE = """Your job is to look at a question, a gold target, and a predicted answer, and return a letter "A" or "B" to indicate whether the predicted answer is correct or incorrect.

- Question: {question}
+ [Question]
+ {question}

- Reference Answer: {gold}
+ [Reference Answer]
+ {gold}

- Model Answer: {pred}
+ [Predicted Answer]
+ {pred}

  Evaluate the model's answer based on correctness compared to the reference answer.
  Grade the predicted answer of this new question as one of:
@@ -22,6 +25,18 @@ B: INCORRECT
  Just return the letters "A" or "B", with no text around it.
  """ # noqa: E501

+
+ DEFAULT_NUMERIC_SCORE_TEMPLATE = """Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response.
+ Begin your evaluation by providing a short explanation. Be as objective as possible.
+ After providing your explanation, you must rate the response on a scale of 0 (worst) to 1 (best) by strictly following this format: \"[[rating]]\", for example: \"Rating: [[0.5]]\"
+
+ [Question]
+ {question}
+
+ [Response]
+ {pred}
+ """ # noqa: E501
+
  DEFAULT_JUDGE_MODEL = 'Qwen/Qwen3-235B-A22B'
  DEFAULT_API_URL = 'https://api-inference.modelscope.cn/v1/'

@@ -31,14 +46,18 @@ class LLMJudge:
  A metric that uses LLM to judge the quality of model predictions by comparing them with reference answers.
  """

- def __init__(self,
- api_key: Optional[str] = None,
- api_url: Optional[str] = None,
- model_id: Optional[str] = None,
- system_prompt: Optional[str] = None,
- prompt_template: Optional[str] = None,
- generation_config: Optional[Dict[str, Any]] = None,
- **kwargs):
+ def __init__(
+ self,
+ api_key: Optional[str] = None,
+ api_url: Optional[str] = None,
+ model_id: Optional[str] = None,
+ system_prompt: Optional[str] = None,
+ prompt_template: Optional[str] = None,
+ generation_config: Optional[Dict[str, Any]] = None,
+ score_pattern: Optional[str] = None,
+ score_mapping: Optional[Dict[str, float]] = None,
+ score_type: str = 'pattern', # 'pattern', 'numeric'
+ **kwargs):
  """
  Initialize LLMJudge metric.

@@ -49,14 +68,34 @@
  system_prompt (str, optional): System prompt for the judge
  prompt_template (str, optional): Prompt template for the judge
  generation_config (dict, optional): Generation configuration for the judge
+ score_pattern (str, optional): Regex pattern to extract score from LLM response
+ score_mapping (dict, optional): Mapping from extracted score to float value
+ score_type (str, optional): Type of score extraction strategy ('pattern', 'numeric') defaults to 'pattern'.
+ - 'pattern': Use score_pattern and score_mapping to extract categorical scores
+ - 'numeric': Treat the extracted value as a direct numerical score
  """
  self.api_key = api_key or os.environ.get('MODELSCOPE_SDK_TOKEN', 'EMPTY')
  self.api_url = api_url or os.environ.get('MODELSCOPE_API_BASE', DEFAULT_API_URL)
  self.model_id = model_id or os.environ.get('MODELSCOPE_JUDGE_LLM', DEFAULT_JUDGE_MODEL)
  self.system_prompt = system_prompt or os.environ.get('JUDGE_SYSTEM_PROMPT', None)
- self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
  self.generation_config = generation_config or {}

+ # Default score mapping for A/B pattern
+ self.score_type = score_type
+ if self.score_type == 'numeric':
+ self.score_pattern = score_pattern or r'\[\[(\d+(?:\.\d+)?)\]\]'
+ self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE',
+ DEFAULT_NUMERIC_SCORE_TEMPLATE)
+ elif self.score_type == 'pattern':
+ self.score_pattern = score_pattern or r'(A|B)'
+ self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
+ else:
+ raise ValueError(f"Invalid score_type: {self.score_type}. Must be 'pattern' or 'numeric'.")
+ self.score_mapping = score_mapping or {'A': 1.0, 'B': 0.0}
+
+ self._init_server_adapter()
+
+ def _init_server_adapter(self):
  from evalscope.models import ServerModelAdapter

  # Initialize ServerModelAdapter
@@ -95,17 +134,63 @@
  def build_prompt(self, pred: str, gold: str, question: Optional[str] = None):
  if question is None:
  question = 'Not provided'
- return self.prompt_template.format(question=question, pred=pred, gold=gold)
+
+ # check variables in prompt_template
+ prompt = self.prompt_template
+ if '{question}' in self.prompt_template:
+ prompt = prompt.replace('{question}', question)
+ if '{pred}' in self.prompt_template:
+ prompt = prompt.replace('{pred}', pred)
+ if '{gold}' in self.prompt_template:
+ prompt = prompt.replace('{gold}', gold)
+ return prompt

  def get_score(self, response: str) -> float:
+ """
+ Extract score from LLM response using the configured pattern and mapping.
+
+ Args:
+ response (str): The response from the LLM
+
+ Returns:
+ float: The numeric score extracted from the response
+ """
  if response is None:
- return 0
- match = re.search(r'(A|B)', response)
+ return 0.0
+
+ # choose extraction method based on score_type
+ if self.score_type == 'numeric':
+ return self._extract_numeric_score(response)
+ elif self.score_type == 'pattern':
+ return self._extract_pattern_score(response)
+
+ def _extract_numeric_score(self, response: str) -> Optional[float]:
+ """extract numeric score from the response using the score_pattern"""
+ match = re.search(self.score_pattern, response)
+
+ if match:
+ # try to convert each captured group to float
+ for group in match.groups():
+ if group is not None:
+ try:
+ return float(group)
+ except (ValueError, TypeError):
+ continue
+
+ # if not found in groups, try the whole match
+ try:
+ return float(match.group(0))
+ except (ValueError, TypeError):
+ logger.warning(f'Failed to convert any extracted value to float from: {match.group(0)}')
+
+ return None
+
+ def _extract_pattern_score(self, response: str) -> float:
+ """use the score_pattern to extract categorical scores"""
+ match = re.search(self.score_pattern, response)
  if match:
  answer = match.group(0)
- if answer == 'A':
- return 1
- elif answer == 'B':
- return 0
+ return self.score_mapping.get(answer, 0.0)
  else:
- return 0
+ logger.warning(f"No match found for pattern '{self.score_pattern}' in response: {response}")
+ return 0.0
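
A rough sketch of how the new numeric judging mode might be exercised; only prompt construction and score extraction are shown, since the request path through ServerModelAdapter is not part of this hunk, and the endpoint, key, and judge reply are placeholders.

```python
# Hedged sketch of the new score_type='numeric' mode; endpoint/key/model values are
# placeholders and the judge reply string is a fabricated example.
from evalscope.metrics.llm_judge import LLMJudge

judge = LLMJudge(
    api_url='https://api-inference.modelscope.cn/v1/',  # DEFAULT_API_URL above
    api_key='EMPTY',
    model_id='Qwen/Qwen3-235B-A22B',                    # DEFAULT_JUDGE_MODEL above
    score_type='numeric',  # selects DEFAULT_NUMERIC_SCORE_TEMPLATE and the [[x.y]] pattern
)

# build_prompt only substitutes the placeholders present in the template,
# so the unused {gold} slot can be passed as an empty string here.
prompt = judge.build_prompt(
    pred='Paris is the capital of France.',
    gold='',
    question='What is the capital of France?',
)

# Extracting the score from a (made-up) judge reply:
print(judge.get_score('The response is accurate and complete. Rating: [[0.9]]'))  # 0.9
```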

evalscope/metrics/metrics.py CHANGED
@@ -223,7 +223,7 @@ def chrf(items):
  Source: https://github.com/m-popovic/chrF
  Paper: https://www.aclweb.org/anthology/W15-3049.pdf

- Higher is better # TODO I think
+ Higher is better
  """
  refs = list(zip(*items))[0]
  preds = list(zip(*items))[1]

evalscope/models/adapters/base_adapter.py CHANGED
@@ -54,8 +54,6 @@ def initialize_model_adapter(task_cfg: 'TaskConfig', benchmark: 'DataAdapter', b

  if 'server' not in model_adapter_cls_str:
  model_adapter_cls_str = 'server'
- logger.info(
- f'Using {model_adapter_cls.__name__} for api model evaluation for benchmark {benchmark.name}.')

  # init server model adapter
  model_adapter_cls = get_model_adapter(model_adapter_cls_str)

evalscope/models/adapters/server_adapter.py CHANGED
@@ -5,8 +5,8 @@ from openai.types.chat import ChatCompletion, ChatCompletionChunk
  from openai.types.chat.chat_completion import ChatCompletionMessage, Choice
  from typing import List, Optional, Union

+ from evalscope.utils.argument_utils import get_supported_params
  from evalscope.utils.logger import get_logger
- from evalscope.utils.utils import get_supported_params
  from .base_adapter import BaseModelAdapter

  logger = get_logger()
@@ -29,7 +29,7 @@ class ServerModelAdapter(BaseModelAdapter):
  self.api_key = api_key

  self.client = openai.OpenAI(
- api_key=api_key,
+ api_key=self.api_key,
  base_url=self.api_url,
  )
  self.supported_params = get_supported_params(self.client.chat.completions.create)
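
The kwargs filtering above relies on get_supported_params, now imported from evalscope.utils.argument_utils. A small self-contained illustration of its behavior, assuming it keeps the signature-based implementation shown in the removed utils.py earlier in this diff:

```python
# Stand-alone illustration; mirrors the implementation removed from utils.py above,
# not necessarily byte-for-byte identical to evalscope.utils.argument_utils.
from inspect import signature


def get_supported_params(func):
    """Return the names of the parameters a callable accepts."""
    return list(signature(func).parameters.keys())


def create(model, messages, temperature=1.0, **kwargs):
    ...


print(get_supported_params(create))  # ['model', 'messages', 'temperature', 'kwargs']
```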

evalscope/models/custom/dummy_model.py CHANGED
@@ -50,14 +50,14 @@ class DummyCustomModel(CustomModel):
  # Must return a list of dicts with the same format as the OpenAI API.
  responses = []
  for input_item in original_inputs:
- message = self.make_request_messages(input_item)
- response = f'Dummy response for prompt: {message}'
+ # message = self.make_request_messages(input_item)
+ # response = f'Dummy response for prompt: {message}'

  res_d = {
  'choices': [{
  'index': 0,
  'message': {
- 'content': response,
+ 'content': '*PlaceHolder*',
  'role': 'assistant'
  }
  }],

evalscope/perf/arguments.py CHANGED
@@ -6,10 +6,11 @@ from dataclasses import dataclass, field
  from typing import Any, Dict, List, Optional, Union

  from evalscope.constants import DEFAULT_WORK_DIR
+ from evalscope.utils import BaseArgument


  @dataclass
- class Arguments:
+ class Arguments(BaseArgument):
  # Model and API
  model: str # Model name or path
  model_id: Optional[str] = None # Model identifier
@@ -69,15 +70,6 @@ class Arguments:
  top_k: Optional[int] = None # Top-k sampling setting for the response
  extra_args: Optional[Dict[str, Any]] = None # Extra arguments

- @staticmethod
- def from_args(args):
- # Convert Namespace to a dictionary and filter out None values
- args_dict = {k: v for k, v in vars(args).items() if v is not None}
-
- if 'func' in args_dict:
- del args_dict['func'] # Note: compat CLI arguments
- return Arguments(**args_dict)
-
  def __post_init__(self):
  # Set the default headers
  self.headers = self.headers or {} # Default to empty dictionary
@@ -108,12 +100,6 @@ class Arguments:
  self.parallel
  ), f'The length of number and parallel should be the same, but got number: {self.number} and parallel: {self.parallel}' # noqa: E501

- def __str__(self):
- return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
-
- def to_dict(self) -> Dict[str, Any]:
- return self.__dict__
-

  class ParseKVAction(argparse.Action):
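
The helpers removed here (from_args, to_dict, __str__) appear to move into the shared BaseArgument introduced in evalscope/utils/argument_utils.py in this release. An illustrative sketch of that pattern follows; the class below is a hypothetical stand-in, not the actual evalscope implementation.

```python
# Hypothetical sketch of a BaseArgument-style mixin; names and behavior mirror the
# removed methods above, but this is not the actual evalscope.utils.BaseArgument.
import argparse
import json
from dataclasses import dataclass
from typing import Any, Dict, Optional


class BaseArgumentSketch:
    """Shared helpers an argument dataclass can inherit instead of re-defining them."""

    @classmethod
    def from_args(cls, args: argparse.Namespace):
        # Drop None values and the argparse 'func' attribute, as the removed code did.
        args_dict = {k: v for k, v in vars(args).items() if v is not None}
        args_dict.pop('func', None)
        return cls(**args_dict)

    def to_dict(self) -> Dict[str, Any]:
        return self.__dict__

    def __str__(self) -> str:
        return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)


@dataclass
class DemoArguments(BaseArgumentSketch):
    model: str
    model_id: Optional[str] = None
    parallel: int = 1


args = DemoArguments.from_args(argparse.Namespace(model='qwen2.5', parallel=4))
print(args)  # pretty-printed JSON of the parsed fields
```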
 
evalscope/perf/main.py CHANGED
@@ -9,7 +9,7 @@ from argparse import Namespace
  from evalscope.perf.utils.local_server import start_app
  from evalscope.perf.utils.log_utils import init_swanlab, init_wandb
  from evalscope.utils.logger import configure_logging, get_logger
- from evalscope.utils.utils import seed_everything
+ from evalscope.utils.model_utils import seed_everything
  from .arguments import Arguments, parse_args
  from .benchmark import benchmark
  from .utils.db_util import get_output_path

evalscope/perf/utils/analysis_result.py CHANGED
@@ -3,27 +3,28 @@ import json
  import pickle
  import sqlite3

- result_db_path = './outputs/qwen2.5_benchmark_20241111_160543.db'
- con = sqlite3.connect(result_db_path)
- query_sql = "SELECT request, response_messages, prompt_tokens, completion_tokens \
- FROM result WHERE success='1'"
+ db_path = 'your db path'
+ conn = sqlite3.connect(db_path)
+ cursor = conn.cursor()

- # how to save base64.b64encode(pickle.dumps(benchmark_data["request"])).decode("ascii"),
- with con:
- rows = con.execute(query_sql).fetchall()
- if len(rows) > 0:
- for row in rows:
- request = row[0]
- responses = row[1]
- request = base64.b64decode(request)
- request = pickle.loads(request)
- responses = base64.b64decode(responses)
- responses = pickle.loads(responses)
- response_content = ''
- for response in responses:
- response = json.loads(response)
- if not response['choices']:
- continue
- response_content += response['choices'][0]['delta']['content']
- print('prompt: %s, tokens: %s, completion: %s, tokens: %s' %
- (request['messages'][0]['content'], row[2], response_content, row[3]))
+ # Get the column names
+ cursor.execute('PRAGMA table_info(result)')
+ columns = [info[1] for info in cursor.fetchall()]
+ print('Column names:', columns)
+
+ cursor.execute('SELECT * FROM result WHERE success=1 AND first_chunk_latency > 1')
+ rows = cursor.fetchall()
+ print(f'len(rows): {len(rows)}')
+
+ for row in rows:
+ row_dict = dict(zip(columns, row))
+ # Decode request
+ row_dict['request'] = pickle.loads(base64.b64decode(row_dict['request']))
+ # Decode response_messages
+ row_dict['response_messages'] = pickle.loads(base64.b64decode(row_dict['response_messages']))
+ # print(row_dict)
+ print(
+ f"request_id: {json.loads(row_dict['response_messages'][0])['id']}, first_chunk_latency: {row_dict['first_chunk_latency']}" # noqa: E501
+ )
+ # If you only want to inspect one row, break here
+ # break

evalscope/perf/utils/benchmark_util.py CHANGED
@@ -38,7 +38,7 @@ class BenchmarkData:
  self.first_chunk_latency = self.query_latency
  self.n_chunks = 1
  self.n_chunks_time = self.query_latency
- self.time_per_output_token = self.n_chunks_time / self.n_chunks
+ self.time_per_output_token = self.n_chunks_time / self.n_chunks if self.n_chunks != 0 else 0.0

  def _calculate_tokens(self, api_plugin):
  self.prompt_tokens, self.completion_tokens = \

evalscope/report/__init__.py CHANGED
@@ -4,7 +4,7 @@ from typing import TYPE_CHECKING
  from evalscope.utils.import_utils import _LazyModule

  if TYPE_CHECKING:
- from .combinator import gen_report_table, gen_table, get_data_frame, get_report_list
+ from .combinator import gen_table, get_data_frame, get_report_list
  from .generator import ReportGenerator
  from .utils import Category, Report, ReportKey, Subset
10