evalscope 0.17.0__py3-none-any.whl → 0.17.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (66)
  1. evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
  2. evalscope/benchmarks/data_adapter.py +9 -4
  3. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -1
  4. evalscope/benchmarks/general_qa/general_qa_adapter.py +2 -1
  5. evalscope/benchmarks/hle/__init__.py +0 -0
  6. evalscope/benchmarks/hle/hle_adapter.py +118 -0
  7. evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
  8. evalscope/benchmarks/mmlu/mmlu_adapter.py +1 -1
  9. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  10. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
  11. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
  12. evalscope/benchmarks/utils.py +1 -0
  13. evalscope/constants.py +5 -21
  14. evalscope/evaluator/__init__.py +1 -1
  15. evalscope/evaluator/evaluator.py +5 -3
  16. evalscope/metrics/__init__.py +3 -1
  17. evalscope/metrics/completion_parsers.py +7 -0
  18. evalscope/metrics/llm_judge.py +6 -5
  19. evalscope/metrics/metrics.py +19 -7
  20. evalscope/models/__init__.py +4 -8
  21. evalscope/models/adapters/__init__.py +4 -9
  22. evalscope/models/adapters/base_adapter.py +4 -0
  23. evalscope/models/adapters/bfcl_adapter.py +2 -0
  24. evalscope/models/adapters/chat_adapter.py +3 -0
  25. evalscope/models/adapters/choice_adapter.py +4 -0
  26. evalscope/models/adapters/custom_adapter.py +7 -3
  27. evalscope/models/adapters/server_adapter.py +2 -0
  28. evalscope/models/adapters/t2i_adapter.py +3 -0
  29. evalscope/models/adapters/tau_bench_adapter.py +189 -0
  30. evalscope/models/register.py +0 -14
  31. evalscope/perf/arguments.py +13 -0
  32. evalscope/perf/benchmark.py +38 -39
  33. evalscope/perf/http_client.py +30 -86
  34. evalscope/perf/main.py +2 -2
  35. evalscope/perf/plugin/__init__.py +3 -2
  36. evalscope/perf/plugin/api/__init__.py +4 -3
  37. evalscope/perf/plugin/api/base.py +22 -4
  38. evalscope/perf/plugin/api/custom_api.py +212 -55
  39. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  40. evalscope/perf/plugin/api/default_api.py +105 -0
  41. evalscope/perf/plugin/api/openai_api.py +17 -19
  42. evalscope/perf/plugin/datasets/__init__.py +10 -7
  43. evalscope/perf/plugin/datasets/base.py +22 -1
  44. evalscope/perf/plugin/datasets/custom.py +2 -1
  45. evalscope/perf/plugin/datasets/flickr8k.py +4 -27
  46. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  47. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  48. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  49. evalscope/perf/plugin/datasets/openqa.py +2 -1
  50. evalscope/perf/plugin/datasets/random_dataset.py +15 -4
  51. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  52. evalscope/perf/plugin/registry.py +36 -16
  53. evalscope/perf/utils/benchmark_util.py +14 -20
  54. evalscope/perf/utils/db_util.py +79 -61
  55. evalscope/utils/io_utils.py +10 -0
  56. evalscope/version.py +2 -2
  57. {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/METADATA +54 -34
  58. {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/RECORD +65 -58
  59. tests/cli/test_all.py +18 -2
  60. tests/cli/test_run.py +25 -37
  61. tests/perf/test_perf.py +29 -2
  62. evalscope/models/model.py +0 -189
  63. {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/LICENSE +0 -0
  64. {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/WHEEL +0 -0
  65. {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/entry_points.txt +0 -0
  66. {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/bfcl/bfcl_adapter.py CHANGED
@@ -35,7 +35,7 @@ SUBJECT_MAPPING = {
 @Benchmark.register(
     name='bfcl_v3',
     pretty_name='BFCL-v3',
-    tags=['Agent'],
+    tags=['Agent', 'Function Calling'],
     description=
     'Berkeley Function Calling Leaderboard (BFCL), the **first comprehensive and executable function call evaluation** '
     'dedicated to assessing Large Language Models\' (LLMs) ability to invoke functions. Unlike previous evaluations, '
evalscope/benchmarks/data_adapter.py CHANGED
@@ -168,6 +168,11 @@ class DataAdapter(ABC):
         If you want to support local dataset, please rewrite this method in xxx_data_adapter.
         Use modelscope.msdatasets.MsDataset.load to load the dataset from local by default.
         """
+        # remove dataset_infos.json file if exists, since MsDataset will occur an error if it exists.
+        dataset_infos_path = os.path.join(dataset_name_or_path, 'dataset_infos.json')
+        if os.path.exists(dataset_infos_path):
+            logger.info(f'Removing dataset_infos.json file at {dataset_infos_path} to avoid MsDataset errors.')
+            os.remove(dataset_infos_path)
         return self.load_from_hub(dataset_name_or_path, subset_list, None, **kwargs)
 
     def load_with_snapshot(self,
@@ -382,7 +387,7 @@
         pass
 
     def gen_prompt_data(self,
-                        prompt: str,
+                        prompt: str = '',
                         system_prompt: Optional[str] = None,
                        choices: Optional[List[str]] = None,
                        index: Optional[Union[int, str]] = None,
@@ -413,7 +418,8 @@
             system_prompt=system_prompt or self.system_prompt,
             index=index or 0,
             id=id,
-            messages=messages)
+            messages=messages,
+            extra_data=kwargs.get('extra_data', None))
         return prompt_data.to_dict()
 
     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
@@ -477,7 +483,6 @@
         """
         return result
 
-    @abstractmethod
     def match(self, gold: Any, pred: Any) -> Any:
         """
         Match the gold answer and the predicted answer.
@@ -491,7 +496,7 @@
         Returns:
             The match result. Usually a score (float) for chat/multiple-choice-questions.
         """
-        raise NotImplementedError
+        return 1.0 if gold == pred else 0.0
 
     def llm_match(self, gold: Any, pred: Any, judge: Optional[LLMJudge] = None, **kwargs) -> float:
         """
evalscope/benchmarks/general_mcq/general_mcq_adapter.py CHANGED
@@ -17,7 +17,8 @@ logger = get_logger()
 @Benchmark.register(
     name='general_mcq',
     pretty_name='General-MCQ',
-    description='A general multiple-choice question answering dataset.',
+    description='A general multiple-choice question answering dataset for custom evaluation. '
+    'For detailed instructions on how to use this benchmark, please refer to the [User Guide](https://evalscope.readthedocs.io/zh-cn/latest/advanced_guides/custom_dataset/llm.html#mcq).',
     tags=['MCQ', 'Custom'],
     dataset_id='general_mcq',
     model_adapter=OutputType.GENERATION,
evalscope/benchmarks/general_qa/general_qa_adapter.py CHANGED
@@ -14,7 +14,8 @@ logger = get_logger()
 @Benchmark.register(
     name='general_qa',
     pretty_name='General-QA',
-    description='General Question Answering dataset',
+    description='A general question answering dataset for custom evaluation. '
+    'For detailed instructions on how to use this benchmark, please refer to the [User Guide](https://evalscope.readthedocs.io/zh-cn/latest/advanced_guides/custom_dataset/llm.html#qa).',  # noqa: E501
     tags=['QA', 'Custom'],
     dataset_id='general_qa',
     subset_list=['default'],
evalscope/benchmarks/hle/__init__.py ADDED (empty file)
evalscope/benchmarks/hle/hle_adapter.py ADDED
@@ -0,0 +1,118 @@
+import re
+from collections import defaultdict
+from typing import Any, List
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.metrics import DEFAULT_PROMPT_TEMPLATE, LLMJudge, exact_match, mean
+from evalscope.utils.logger import get_logger
+
+# flake8: noqa
+
+logger = get_logger()
+
+SUBSET_LIST = [
+    'Biology/Medicine',
+    'Chemistry',
+    'Computer Science/AI',
+    'Engineering',
+    'Humanities/Social Science',
+    'Math',
+    'Physics',
+    'Other',
+]
+
+
+@Benchmark.register(
+    name='hle',
+    pretty_name="Humanity's-Last-Exam",
+    tags=['Knowledge', 'QA'],
+    description=
+    'Humanity\'s Last Exam (HLE) is a language model benchmark consisting of 2,500 questions across a broad range of subjects. It was created jointly by the Center for AI Safety and Scale AI. The benchmark classifies the questions into the following broad subjects: mathematics (41%), physics (9%), biology/medicine (11%), humanities/social science (9%), computer science/artificial intelligence (10%), engineering (4%), chemistry (7%), and other (9%). Around 14% of the questions require the ability to understand both text and images, i.e., multi-modality. 24% of the questions are multiple-choice; the rest are short-answer, exact-match questions.',  # noqa: E501
+    dataset_id='cais/hle',
+    subset_list=SUBSET_LIST,
+    metric_list=['AverageAccuracy'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+    prompt_template='{query}\n\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+)
+class HLEAdapter(DataAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.llm_as_a_judge = True
+
+    def load(self, **kwargs):
+        kwargs['subset_list'] = ['default']
+        data_dict = super().load(**kwargs)
+        return self.reformat_subset(data_dict, subset_key='category', format='{}')
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+        # remove image preview
+        input_d.pop('image_preview', None)
+        input_d.pop('rationale_image', None)
+        # generate prompt
+        question = input_d['question']
+        prompt = self.prompt_template.format(query=question)
+        image = input_d.get('image', None)
+        # build messages for multi-modal input
+        messages = []
+        if self.system_prompt:
+            messages.append({'role': 'system', 'content': self.system_prompt})
+        if image:
+            messages.append({
+                'role':
+                'user',
+                'content': [{
+                    'type': 'text',
+                    'text': prompt
+                }, {
+                    'type': 'image_url',
+                    'image_url': {
+                        'url': image
+                    }
+                }]
+            })
+        else:
+            messages.append({'role': 'user', 'content': prompt})
+        return self.gen_prompt_data(prompt='', messages=messages)
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        return input_d['answer']
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, **kwargs) -> str:
+        # Extract the answer from the model output \boxed{answer}
+        match = re.search(r'\\boxed{([^}]*)}', result)
+        if match:
+            return match.group(1).strip()
+        else:
+            logger.warning(f'No answer found in the model output: {result}')
+            return ''
+
+    def llm_parse_pred_result(self, result, raw_input_d=None, **kwargs) -> str:
+        return result.strip()
+
+    def match(self, gold: str, pred: str) -> dict:
+        # simple match
+        return {
+            'AverageAccuracy': 1.0 if exact_match(gold, pred) else 0.0,
+        }
+
+    def llm_match(self, gold: Any, pred: Any, judge: LLMJudge, **kwargs) -> dict:
+        raw_input = kwargs.get('raw_input', None)
+        question = raw_input['question']
+        # get grading response
+        prompt = judge.build_prompt(pred, gold, question)
+        judge_response = judge(prompt)
+        score = judge.get_score(judge_response)
+        return {
+            'AverageAccuracy': score,
+            'response': judge_response,
+        }
+
+    def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
+        # zip dict answers
+        res_dict = super().compute_dict_metric(review_res_list, **kwargs)
+
+        return super().compute_metric(res_dict, **kwargs)
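The adapter's `parse_pred_result` pulls the final answer out of `\boxed{...}`, matching the prompt template above. A quick standalone check of that regex (the sample output and helper name are mine; nested braces inside the box are not handled):

```python
import re

def extract_boxed_answer(output: str) -> str:
    # Same pattern as HLEAdapter.parse_pred_result; returns '' when no \boxed{} is present.
    match = re.search(r'\\boxed{([^}]*)}', output)
    return match.group(1).strip() if match else ''

print(extract_boxed_answer(r'Step 3: 6 * 7 = 42, so the answer is \boxed{42}.'))  # -> '42'
print(extract_boxed_answer('No boxed answer here.'))                              # -> ''
```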
evalscope/benchmarks/humaneval/humaneval_adapter.py CHANGED
@@ -22,7 +22,8 @@ logger = get_logger()
     few_shot_num=0,
     train_split=None,
     eval_split='test',
-    prompt_template='Complete the following python code:\n{query}',
+    prompt_template=
+    'Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{query}',  # noqa: E501
     extra_params={
         'num_workers': 4,
         'timeout': 4
@@ -76,26 +77,9 @@ class HumanevalAdapter(DataAdapter):
 
     @classmethod
     def _postprocess(cls, text: str) -> str:
-        if '```' in text:
-            blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
-            if len(blocks) == 0:
-                text = text.split('```')[1]  # fall back to default strategy
-            else:
-                text = blocks[0]  # fetch the first code block
-                if not text.startswith('\n'):  # in case starting with ```python
-                    text = text[max(text.find('\n') + 1, 0):]
-        if text.strip().startswith('from') or text.strip().startswith('import'):
-            def_idx = text.find('def')
-            if def_idx != -1:
-                text = text[max(text.find('\n', def_idx) + 1, 0):]
-        text = text.split('\n\n')[0]
-        if text.strip().startswith('def'):
-            text = '\n'.join(text.split('\n')[1:])
-        if not text.startswith('    '):
-            if text.startswith(' '):
-                text = '    ' + text.lstrip()
-            else:
-                text = '\n'.join(['    ' + line for line in text.split('\n')])
+        blocks = re.findall(r'```\w*\n(.*?)```', text, re.DOTALL)
+        if len(blocks) >= 1:
+            text = blocks[0]
         return text
 
     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
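The rewritten `_postprocess` drops the old trimming heuristics and simply takes the body of the first fenced code block, falling back to the raw text when no block is found. A standalone check of that regex (the helper name and sample completion are mine):

```python
import re

def first_code_block(text: str) -> str:
    # Same pattern as the new HumanevalAdapter._postprocess: ```lang\n ... ```
    blocks = re.findall(r'```\w*\n(.*?)```', text, re.DOTALL)
    return blocks[0] if blocks else text

completion = (
    'Here is the function:\n'
    '```python\n'
    'def sort_numbers(numbers):\n'
    '    return sorted(numbers)\n'
    '```\n'
    'Hope this helps!'
)
print(first_code_block(completion))
# def sort_numbers(numbers):
#     return sorted(numbers)
```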
evalscope/benchmarks/mmlu/mmlu_adapter.py CHANGED
@@ -144,7 +144,7 @@ SUBJECT_MAPPING = {
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=SUBSET_LIST,
     metric_list=['AverageAccuracy'],
-    few_shot_num=5,
+    few_shot_num=0,
     train_split='train',
     eval_split='test',
     prompt_template=
evalscope/benchmarks/tau_bench/__init__.py ADDED (empty file)
evalscope/benchmarks/tau_bench/tau_bench_adapter.py ADDED
@@ -0,0 +1,110 @@
+import importlib
+from collections import defaultdict
+from typing import Dict, List
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.metrics import Metric, mean, metric_registry
+from evalscope.utils import get_logger
+
+logger = get_logger()
+
+
+@Benchmark.register(
+    name='tau_bench',
+    pretty_name='τ-bench',
+    tags=['Reasoning', 'Agent', 'Function Calling'],
+    description='A benchmark emulating dynamic conversations between a user (simulated by language models) '
+    'and a language agent provided with domain-specific API tools and policy guidelines. '
+    'Please install it with `pip install git+https://github.com/sierra-research/tau-bench` before evaluating and set a user model. ',  # noqa: E501
+    dataset_id='https://github.com/sierra-research/tau-bench',
+    model_adapter='tau_bench_server',
+    subset_list=['airline', 'retail'],
+    metric_list=['Pass^1'],
+    eval_split='test',
+    extra_params={
+        'user_model': 'qwen-plus',
+        'api_key': 'EMPTY',
+        'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+        'generation_config': {
+            'temperature': 0.7,
+            'max_new_tokens': 1024
+        }
+    })
+class TauBenchAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        spec = importlib.util.find_spec('tau_bench')
+        if spec is None:
+            raise ImportError(
+                '`tau_bench` not found, please install it with `pip install git+https://github.com/sierra-research/tau-bench` before evaluating.'  # noqa: E501
+            )
+
+        metric_registry.register(Metric(name='Pass^1', object=mean))
+
+        # setup user model args
+        extra_params = kwargs.get('extra_params', {})
+        self.user_model = extra_params.get('user_model', 'qwen-plus')
+        self.api_key = extra_params.get('api_key', 'EMPTY')
+        self.api_base = extra_params.get('api_base', 'https://dashscope.aliyuncs.com/compatible-mode/v1')
+        self.generation_config = extra_params.get('generation_config', {'temperature': 0.7, 'max_new_tokens': 1024})
+
+        self._patch_env_completion()
+
+    def _patch_env_completion(self) -> str:
+        from tau_bench.envs.user import LLMUserSimulationEnv
+
+        def new_generate_next_message(self, messages):
+            from evalscope.models import ServerModelAdapter
+
+            user_server = ServerModelAdapter(
+                api_url=adapter_instance.api_base,
+                model_id=adapter_instance.user_model,
+                api_key=adapter_instance.api_key)
+            request_json = user_server.make_request(
+                input_item={'messages': messages}, infer_cfg=adapter_instance.generation_config)
+            res = user_server.send_request(request_json)
+
+            message = res['choices'][0]['message']
+            self.messages.append(message)
+            self.total_cost = 0
+            return message['content']
+
+        # get the current instance of TauBenchAdapter
+        adapter_instance = self
+        LLMUserSimulationEnv.generate_next_message = new_generate_next_message
+
+    def load(self, **kwargs):
+        from tau_bench.envs import get_env
+
+        data_dict = defaultdict(dict)
+        for env_name in self.subset_list:
+            logger.info(f'Loading TauBench environment: {env_name}')
+            env = get_env(
+                env_name=env_name,
+                user_strategy='llm',
+                user_model='dummy',  # Use dummy model to prevent errors
+                user_provider='openai',  # Use dummy provider to prevent errors
+                task_split=self.eval_split,
+            )
+            tasks = []
+            for i in range(len(env.tasks)):
+                tasks.append({
+                    'task_index': i,
+                    'env_name': env_name,
+                })
+            data_dict[env_name][self.eval_split] = tasks
+
+        return data_dict
+
+    def gen_prompt(self, input_d, subset_name, few_shot_list, **kwargs):
+        return self.gen_prompt_data(extra_data=input_d)
+
+    def get_gold_answer(self, input_d):
+        return ''
+
+    def match(self, gold, pred):
+        import json
+        res = json.loads(pred)
+        return res.get('reward', 0.0)
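The subtle part of `_patch_env_completion` is the closure: the replacement method is bound onto τ-bench's `LLMUserSimulationEnv` class, while the adapter's own settings are reached through the captured `adapter_instance` rather than the method's `self`. A toy sketch of the same monkey-patching pattern (`Simulator` and `Adapter` are stand-ins, not evalscope or tau_bench classes):

```python
class Simulator:
    def generate_next_message(self, messages):
        return 'original reply'

class Adapter:
    def __init__(self, user_model: str):
        self.user_model = user_model
        self._patch()

    def _patch(self):
        adapter_instance = self  # captured by the closure below

        def new_generate_next_message(sim_self, messages):
            # sim_self is the patched Simulator; adapter settings come from the closure.
            return f'reply generated with {adapter_instance.user_model}'

        Simulator.generate_next_message = new_generate_next_message

Adapter('qwen-plus')
print(Simulator().generate_next_message([]))  # -> 'reply generated with qwen-plus'
```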
evalscope/benchmarks/tool_bench/tool_bench_adapter.py CHANGED
@@ -1,3 +1,4 @@
+import json
 from typing import Dict, List
 
 from evalscope.benchmarks import Benchmark, DataAdapter
@@ -8,7 +9,7 @@ from evalscope.metrics import Metric, mean, metric_registry
 @Benchmark.register(
     name='tool_bench',
     pretty_name='ToolBench-Static',
-    tags=['Reasoning', 'Agent'],
+    tags=['Reasoning', 'Agent', 'Function Calling'],
     description='ToolBench is a benchmark for evaluating AI models on tool use tasks. '
     'It includes various subsets such as in-domain and out-of-domain, '
     'each with its own set of problems that require step-by-step reasoning to arrive at the correct answer. '
@@ -40,6 +41,11 @@ class ToolBenchAdapter(DataAdapter):
         for message in messages:
             if 'name' in message:
                 del message['name']
+            if 'role' in message:
+                if message['role'] == 'function':
+                    content = json.dumps(message, ensure_ascii=False)
+                    message['role'] = 'user'
+                    message['content'] = content
         return self.gen_prompt_data(prompt='', messages=messages)
 
     def get_gold_answer(self, input_d: dict) -> str:
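The added block rewrites OpenAI-style `function` role messages into plain user turns, serializing the original message as JSON so backends without function-role support can still consume the history. A small standalone version of the same transformation (the helper name and sample message are mine):

```python
import json

def downgrade_function_messages(messages):
    # Mirrors the ToolBenchAdapter change: drop 'name', turn 'function' turns into user turns.
    for message in messages:
        message.pop('name', None)
        if message.get('role') == 'function':
            content = json.dumps(message, ensure_ascii=False)
            message['role'] = 'user'
            message['content'] = content
    return messages

msgs = [{'role': 'function', 'name': 'get_weather', 'content': '{"temp": 21}'}]
converted = downgrade_function_messages(msgs)
print(converted[0]['role'])     # user
print(converted[0]['content'])  # {"role": "function", "content": "{\"temp\": 21}"}
```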
evalscope/benchmarks/utils.py CHANGED
@@ -13,6 +13,7 @@ class PromptData:
     multi_choices: Optional[List[str]] = None
     id: Optional[str] = None
     messages: Optional[List[dict]] = None
+    extra_data: Optional[Dict] = None
 
     def to_dict(self) -> Dict:
         return {k: v for k, v in asdict(self).items() if v is not None}
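Because `to_dict` drops `None`-valued fields, the new `extra_data` slot only appears in serialized prompts that actually set it, which is what the τ-bench adapter relies on. A trimmed-down stand-in of the dataclass to illustrate (the field list here is abridged and partly guessed, not the full PromptData definition):

```python
from dataclasses import asdict, dataclass, field
from typing import Dict, List, Optional

@dataclass
class PromptDataSketch:
    data: List[str] = field(default_factory=list)  # abridged field set, for illustration only
    messages: Optional[List[dict]] = None
    extra_data: Optional[Dict] = None

    def to_dict(self) -> Dict:
        # None-valued fields are dropped, so extra_data shows up only when supplied.
        return {k: v for k, v in asdict(self).items() if v is not None}

print(PromptDataSketch(data=['q1']).to_dict())
# {'data': ['q1']}
print(PromptDataSketch(data=['q1'], extra_data={'task_index': 0, 'env_name': 'airline'}).to_dict())
# {'data': ['q1'], 'extra_data': {'task_index': 0, 'env_name': 'airline'}}
```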
evalscope/constants.py CHANGED
@@ -41,27 +41,6 @@ class MetricsConstant:
     ]
 
 
-class MetricMembers:
-
-    # Math accuracy metric
-    MATH_ACCURACY = 'math_accuracy'
-
-    # Code pass@k metric
-    CODE_PASS_K = 'code_pass_k'
-
-    # Code rouge metric
-    ROUGE = 'rouge'
-
-    # ELO rating system for pairwise comparison
-    ELO = 'elo'
-
-    # Pairwise comparison win/lose and tie(optional)
-    PAIRWISE = 'pairwise'
-
-    # Rating score for single model
-    SCORE = 'score'
-
-
 class ArenaWinner:
 
     MODEL_A = 'model_a'
@@ -172,6 +151,11 @@ class JudgeStrategy:
     LLM_RECALL = 'llm_recall'
 
 
+class JudgeScoreType:
+    NUMERIC = 'numeric'  # numeric score
+    PATTERN = 'pattern'  # pattern matching score
+
+
 class ModelTask:
     TEXT_GENERATION = 'text_generation'
     IMAGE_GENERATION = 'image_generation'
evalscope/evaluator/__init__.py CHANGED
@@ -1,3 +1,3 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-from evalscope.evaluator.evaluator import Evaluator
+from .evaluator import Evaluator
evalscope/evaluator/evaluator.py CHANGED
@@ -7,17 +7,19 @@ from collections import OrderedDict, defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from copy import deepcopy
 from tqdm import tqdm
-from typing import Any, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
 
 from evalscope.benchmarks import DataAdapter
 from evalscope.config import TaskConfig
 from evalscope.constants import AnswerKeys, DumpMode, EvalStage, EvalType, JudgeStrategy, ReviewKeys
-from evalscope.models import BaseModelAdapter
 from evalscope.report import Report, gen_table
 from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, gen_hash, jsonl_to_list
 from evalscope.utils.logger import get_logger
 from evalscope.utils.model_utils import dict_torch_dtype_to_str
 
+if TYPE_CHECKING:
+    from evalscope.models import BaseModelAdapter
+
 logger = get_logger()
 
 
@@ -38,7 +40,7 @@ class Evaluator(object):
 
     def __init__(self,
                  data_adapter: DataAdapter,
-                 model_adapter: BaseModelAdapter,
+                 model_adapter: 'BaseModelAdapter',
                  outputs: OutputsStructure = None,
                  task_cfg: TaskConfig = None,
                  **kwargs):
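Moving the `BaseModelAdapter` import under `TYPE_CHECKING` keeps it visible to type checkers while avoiding the runtime import (and any import cycle with `evalscope.models`); the annotation then has to become a string. A generic sketch of the pattern (`heavy_module` and `HeavyClass` are placeholders, not evalscope names):

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only evaluated by static type checkers; never imported at runtime.
    from heavy_module import HeavyClass

def run(worker: 'HeavyClass') -> None:  # forward reference, resolved lazily
    worker.do_work()
```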
evalscope/metrics/__init__.py CHANGED
@@ -5,7 +5,7 @@ from evalscope.utils.import_utils import _LazyModule
 
 if TYPE_CHECKING:
     from .completion_parsers import ResponseParser, lmsys_parser, ranking_parser
-    from .llm_judge import LLMJudge
+    from .llm_judge import DEFAULT_NUMERIC_SCORE_TEMPLATE, DEFAULT_PROMPT_TEMPLATE, LLMJudge
     from .math_parser import extract_answer, math_equal, strip_answer_string
     from .metrics import (bleu_ngram_one_sample, exact_match, macro_mean, mean, micro_mean, simple_f1_score,
                           weighted_mean)
@@ -34,6 +34,8 @@ else:
         ],
         'llm_judge': [
             'LLMJudge',
+            'DEFAULT_PROMPT_TEMPLATE',
+            'DEFAULT_NUMERIC_SCORE_TEMPLATE',
         ],
         'math_parser': [
             'extract_answer',
evalscope/metrics/completion_parsers.py CHANGED
@@ -218,3 +218,10 @@ class ResponseParser:
         # Join options into a regex pattern separated by '|', to match any of the options
         options_pattern = '|'.join(escaped_options)
         return options_pattern
+
+
+if __name__ == '__main__':
+    result = '**Answer: A **Answer: C**'
+    options = ['A', 'B', 'C', 'D']
+    parsed_result = ResponseParser.parse_first_option(result, options)
+    print(f'Parsed result: {parsed_result}')  # Should print 'C'
evalscope/metrics/llm_judge.py CHANGED
@@ -2,6 +2,7 @@ import os
 import re
 from typing import Any, Dict, List, Optional
 
+from evalscope.constants import JudgeScoreType
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -56,7 +57,7 @@ class LLMJudge:
                  generation_config: Optional[Dict[str, Any]] = None,
                  score_pattern: Optional[str] = None,
                  score_mapping: Optional[Dict[str, float]] = None,
-                 score_type: str = 'pattern',  # 'pattern', 'numeric'
+                 score_type: str = JudgeScoreType.PATTERN,  # 'pattern', 'numeric'
                  **kwargs):
         """
         Initialize LLMJudge metric.
@@ -82,11 +83,11 @@ class LLMJudge:
 
         # Default score mapping for A/B pattern
         self.score_type = score_type
-        if self.score_type == 'numeric':
+        if self.score_type == JudgeScoreType.NUMERIC:
             self.score_pattern = score_pattern or r'\[\[(\d+(?:\.\d+)?)\]\]'
             self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE',
                                                                      DEFAULT_NUMERIC_SCORE_TEMPLATE)
-        elif self.score_type == 'pattern':
+        elif self.score_type == JudgeScoreType.PATTERN:
             self.score_pattern = score_pattern or r'(A|B)'
             self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
         else:
@@ -159,9 +160,9 @@ class LLMJudge:
             return 0.0
 
         # choose extraction method based on score_type
-        if self.score_type == 'numeric':
+        if self.score_type == JudgeScoreType.NUMERIC:
             return self._extract_numeric_score(response)
-        elif self.score_type == 'pattern':
+        elif self.score_type == JudgeScoreType.PATTERN:
             return self._extract_pattern_score(response)
 
     def _extract_numeric_score(self, response: str) -> Optional[float]:
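With `score_type=JudgeScoreType.NUMERIC`, the judge falls back to the `[[score]]` pattern shown in this hunk. A quick check of what that default regex extracts (the judge response text is invented):

```python
import re

NUMERIC_SCORE_PATTERN = r'\[\[(\d+(?:\.\d+)?)\]\]'  # default pattern from the hunk above

judge_response = 'The answer covers the main points but misses one edge case. Rating: [[7.5]]'
match = re.search(NUMERIC_SCORE_PATTERN, judge_response)
print(float(match.group(1)) if match else None)  # -> 7.5
```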
evalscope/metrics/metrics.py CHANGED
@@ -9,7 +9,7 @@ import random
 import sacrebleu
 from collections import defaultdict
 from collections.abc import Iterable
-from typing import TYPE_CHECKING, Dict, List, Union
+from typing import Dict, List, Union
 
 
 def mean(arr: list):
@@ -22,16 +22,28 @@ def mean(arr: list):
 
 
 def pass_at_k(arr: Union[List[int], List[List[int]]], k: int = 1) -> float:
+    """
+    Calculates the pass@k metric using the calculate_pass_at_k function.
+
+    Args:
+        arr: List of binary values (1 for correct, 0 for incorrect) or list of such lists
+        k: Number of attempts allowed
+
+    Returns:
+        The average pass@k score across all problems
+    """
     if not arr:
         return 0.0
+    if not isinstance(arr[0], list):
+        # If arr is a simple list of binary results, convert it to a list of lists
+        arr = [arr]
 
-    def sub_pass_at_k(sub_arr: List[int]) -> float:
-        return 1.0 if any(sub_arr[:k]) else 0.0
+    # For list of lists case, each inner list represents attempts for one problem
+    num_samples = [len(sub_arr) for sub_arr in arr]
+    num_correct = [sum(sub_arr) for sub_arr in arr]
+    pass_at_k_values = calculate_pass_at_k(num_samples, num_correct, k)
 
-    if isinstance(arr[0], list):
-        return sum(sub_pass_at_k(sub_arr) for sub_arr in arr) / len(arr)
-    else:
-        return sum(arr) / len(arr)
+    return float(np.mean(pass_at_k_values))
 
 
 def pop_stddev(arr):
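The rewritten `pass_at_k` normalizes its input to per-problem attempt lists and delegates to `calculate_pass_at_k`, which is not shown in this hunk. Assuming it follows the standard unbiased estimator from the HumanEval paper, each problem with n samples and c correct completions scores pass@k = 1 - C(n-c, k) / C(n, k); a sketch of that estimator (the helper name `estimate_pass_at_k` is mine):

```python
import numpy as np

def estimate_pass_at_k(num_samples, num_correct, k):
    # Unbiased estimator: 1 - C(n - c, k) / C(n, k), computed as a stable running product.
    def single(n: int, c: int) -> float:
        if n - c < k:
            return 1.0
        return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))
    return [single(n, c) for n, c in zip(num_samples, num_correct)]

# Two problems, 5 sampled completions each: 2 correct and 0 correct respectively.
print(estimate_pass_at_k([5, 5], [2, 0], k=1))  # ~= [0.4, 0.0]
```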
evalscope/models/__init__.py CHANGED
@@ -4,12 +4,11 @@ from typing import TYPE_CHECKING
 from evalscope.utils.import_utils import _LazyModule
 
 if TYPE_CHECKING:
-    from .adapters import (BaseModelAdapter, ChatGenerationModelAdapter, ContinuationLogitsModelAdapter,
+    from .adapters import (BaseModelAdapter, BFCLAdapter, ChatGenerationModelAdapter, ContinuationLogitsModelAdapter,
                            CustomModelAdapter, MultiChoiceModelAdapter, ServerModelAdapter, T2IModelAdapter,
-                           initialize_model_adapter)
+                           TauBenchAdapter, initialize_model_adapter)
     from .custom import CustomModel, DummyCustomModel
     from .local_model import LocalModel, get_local_model
-    from .model import BaseModel, ChatBaseModel, OpenAIModel
     from .register import get_model_adapter
 
 else:
@@ -23,6 +22,8 @@ else:
             'CustomModelAdapter',
             'ServerModelAdapter',
             'T2IModelAdapter',
+            'TauBenchAdapter',
+            'BFCLAdapter',
         ],
         'custom': [
             'CustomModel',
@@ -32,11 +33,6 @@ else:
             'LocalModel',
             'get_local_model',
         ],
-        'model': [
-            'BaseModel',
-            'ChatBaseModel',
-            'OpenAIModel',
-        ],
         'register': [
             'get_model_adapter',
         ],
evalscope/models/adapters/__init__.py CHANGED
@@ -5,15 +5,10 @@ from .choice_adapter import ContinuationLogitsModelAdapter, MultiChoiceModelAdapter
 from .custom_adapter import CustomModelAdapter
 from .server_adapter import ServerModelAdapter
 from .t2i_adapter import T2IModelAdapter
+from .tau_bench_adapter import TauBenchAdapter
 
 __all__ = [
-    'initialize_model_adapter',
-    'BaseModelAdapter',
-    'ChatGenerationModelAdapter',
-    'ContinuationLogitsModelAdapter',
-    'MultiChoiceModelAdapter',
-    'CustomModelAdapter',
-    'ServerModelAdapter',
-    'BFCLAdapter',
-    'T2IModelAdapter',
+    'initialize_model_adapter', 'BaseModelAdapter', 'ChatGenerationModelAdapter', 'ContinuationLogitsModelAdapter',
+    'MultiChoiceModelAdapter', 'CustomModelAdapter', 'ServerModelAdapter', 'BFCLAdapter', 'T2IModelAdapter',
+    'TauBenchAdapter'
 ]