evalscope 0.8.2__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. evalscope/__init__.py +2 -0
  2. evalscope/arguments.py +11 -3
  3. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  4. evalscope/backend/rag_eval/utils/llm.py +1 -1
  5. evalscope/benchmarks/__init__.py +20 -1
  6. evalscope/benchmarks/arc/__init__.py +0 -5
  7. evalscope/benchmarks/arc/arc_adapter.py +24 -102
  8. evalscope/benchmarks/bbh/__init__.py +0 -4
  9. evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
  10. evalscope/benchmarks/benchmark.py +70 -59
  11. evalscope/benchmarks/ceval/__init__.py +0 -5
  12. evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
  13. evalscope/benchmarks/cmmlu/__init__.py +0 -5
  14. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
  15. evalscope/benchmarks/competition_math/__init__.py +0 -5
  16. evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
  17. evalscope/benchmarks/data_adapter.py +115 -87
  18. evalscope/benchmarks/general_qa/__init__.py +0 -5
  19. evalscope/benchmarks/general_qa/general_qa_adapter.py +23 -79
  20. evalscope/benchmarks/gsm8k/__init__.py +0 -4
  21. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +21 -101
  22. evalscope/benchmarks/hellaswag/__init__.py +0 -5
  23. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +32 -99
  24. evalscope/benchmarks/humaneval/__init__.py +0 -4
  25. evalscope/benchmarks/humaneval/humaneval_adapter.py +18 -120
  26. evalscope/benchmarks/ifeval/__init__.py +0 -0
  27. evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
  28. evalscope/benchmarks/ifeval/instructions.py +1478 -0
  29. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  30. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  31. evalscope/benchmarks/ifeval/utils.py +134 -0
  32. evalscope/benchmarks/iquiz/__init__.py +0 -0
  33. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  34. evalscope/benchmarks/mmlu/__init__.py +0 -5
  35. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
  36. evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  37. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  38. evalscope/benchmarks/race/__init__.py +0 -5
  39. evalscope/benchmarks/race/race_adapter.py +26 -123
  40. evalscope/benchmarks/trivia_qa/__init__.py +0 -5
  41. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
  42. evalscope/benchmarks/truthful_qa/__init__.py +0 -5
  43. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
  44. evalscope/cli/cli.py +2 -0
  45. evalscope/cli/start_app.py +29 -0
  46. evalscope/collections/__init__.py +3 -0
  47. evalscope/collections/evaluator.py +198 -0
  48. evalscope/collections/sampler.py +138 -0
  49. evalscope/collections/schema.py +126 -0
  50. evalscope/config.py +7 -5
  51. evalscope/constants.py +9 -26
  52. evalscope/evaluator/evaluator.py +87 -121
  53. evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
  54. evalscope/metrics/__init__.py +3 -0
  55. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  56. evalscope/metrics/math_accuracy.py +193 -50
  57. evalscope/metrics/metrics.py +18 -6
  58. evalscope/metrics/named_metrics.py +17 -0
  59. evalscope/metrics/rouge_metric.py +13 -8
  60. evalscope/models/__init__.py +14 -1
  61. evalscope/models/base_adapter.py +52 -0
  62. evalscope/models/chat_adapter.py +138 -0
  63. evalscope/models/choice_adapter.py +211 -0
  64. evalscope/models/custom_adapter.py +67 -0
  65. evalscope/models/local_model.py +74 -0
  66. evalscope/models/model.py +141 -0
  67. evalscope/models/server_adapter.py +111 -0
  68. evalscope/perf/__init__.py +1 -0
  69. evalscope/perf/main.py +0 -1
  70. evalscope/perf/plugin/api/custom_api.py +1 -1
  71. evalscope/perf/plugin/api/openai_api.py +1 -1
  72. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  73. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  74. evalscope/report/__init__.py +5 -0
  75. evalscope/report/app.py +506 -0
  76. evalscope/report/combinator.py +73 -0
  77. evalscope/report/generator.py +80 -0
  78. evalscope/report/utils.py +133 -0
  79. evalscope/run.py +48 -72
  80. evalscope/run_arena.py +1 -1
  81. evalscope/summarizer.py +1 -1
  82. evalscope/utils/__init__.py +1 -1
  83. evalscope/utils/chat_service.py +5 -4
  84. evalscope/utils/io_utils.py +8 -0
  85. evalscope/utils/logger.py +5 -0
  86. evalscope/utils/model_utils.py +15 -2
  87. evalscope/utils/utils.py +3 -25
  88. evalscope/version.py +2 -2
  89. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/METADATA +115 -21
  90. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/RECORD +99 -78
  91. tests/cli/test_collection.py +57 -0
  92. tests/cli/test_run.py +52 -1
  93. tests/rag/test_mteb.py +3 -2
  94. evalscope/models/api/__init__.py +0 -3
  95. evalscope/models/dummy_chat_model.py +0 -49
  96. evalscope/models/model_adapter.py +0 -525
  97. evalscope/models/openai_model.py +0 -103
  98. evalscope/tools/__init__.py +0 -1
  99. evalscope/tools/combine_reports.py +0 -133
  100. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  101. /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
  102. /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
  103. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
  104. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
  105. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
  106. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
evalscope/models/local_model.py ADDED
@@ -0,0 +1,74 @@
+ import torch
+ from typing import TYPE_CHECKING, Optional
+
+ from evalscope.constants import DEFAULT_MODEL_CACHE_DIR, DEFAULT_MODEL_REVISION, EvalType
+ from evalscope.utils.logger import get_logger
+
+ if TYPE_CHECKING:
+     from evalscope.config import TaskConfig
+
+ logger = get_logger()
+
+
+ class LocalModel:
+
+     def __init__(self,
+                  model_id: str,
+                  model_revision: str = DEFAULT_MODEL_REVISION,
+                  device_map: str = 'auto',
+                  torch_dtype: str = 'auto',
+                  cache_dir: str = None,
+                  **kwargs):
+         from modelscope import AutoModelForCausalLM, AutoTokenizer
+
+         model_cache_dir = cache_dir or DEFAULT_MODEL_CACHE_DIR
+
+         if isinstance(torch_dtype, str) and torch_dtype != 'auto':
+             torch_dtype = eval(torch_dtype)
+
+         self.model_id = model_id
+         self.model_revision = model_revision
+         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+         self.tokenizer = AutoTokenizer.from_pretrained(
+             self.model_id,
+             revision=model_revision,
+             trust_remote_code=True,
+             cache_dir=model_cache_dir,
+         )
+
+         self.model = AutoModelForCausalLM.from_pretrained(
+             self.model_id,
+             revision=model_revision,
+             device_map=device_map,
+             trust_remote_code=True,
+             torch_dtype=torch_dtype,
+             cache_dir=model_cache_dir,
+         )
+
+         self.model_cfg = {
+             'model_id': model_id,
+             'device_map': device_map,
+             'torch_dtype': str(torch_dtype),
+         }
+
+
+ def get_local_model(task_cfg: 'TaskConfig') -> Optional[LocalModel]:
+     """Get the base local model for the task. If the task is not checkpoint-based, return None.
+     Avoids loading the model multiple times for different datasets.
+     """
+     if task_cfg.eval_type != EvalType.CHECKPOINT:
+         return None
+     else:
+         device_map = task_cfg.model_args.get('device_map', 'auto')
+         cache_dir = task_cfg.model_args.get('cache_dir', None)
+         model_precision = task_cfg.model_args.get('precision', 'torch.float16')
+         model_revision = task_cfg.model_args.get('revision', DEFAULT_MODEL_REVISION)
+
+         base_model = LocalModel(
+             model_id=task_cfg.model,
+             model_revision=model_revision,
+             device_map=device_map,
+             torch_dtype=model_precision,
+             cache_dir=cache_dir)
+         return base_model
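The `eval(torch_dtype)` branch turns a precision string such as 'torch.float16' into the actual dtype object before loading. A minimal usage sketch of the new helper, assuming `TaskConfig` accepts these fields as keyword arguments; the model id is a placeholder:

from evalscope.config import TaskConfig
from evalscope.constants import EvalType
from evalscope.models.local_model import get_local_model

task_cfg = TaskConfig(
    model='qwen/Qwen2-0.5B-Instruct',  # placeholder model id
    eval_type=EvalType.CHECKPOINT,     # only checkpoint-based tasks load a local model
    model_args={'device_map': 'auto', 'precision': 'torch.float16'},
)

base_model = get_local_model(task_cfg)  # tokenizer and weights are loaded once here
if base_model is not None:
    print(base_model.model_cfg)  # {'model_id': ..., 'device_map': ..., 'torch_dtype': ...}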
evalscope/models/model.py CHANGED
@@ -1,7 +1,14 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
+ import os
+ import random
+ import time
  from abc import ABC, abstractmethod
  from typing import Any

+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+


  class BaseModel(ABC):

@@ -86,3 +93,137 @@ class ChatBaseModel(BaseModel):
          }
          """
          raise NotImplementedError
+
+
+ class OpenAIModel(ChatBaseModel):
+     """
+     APIs of OpenAI models.
+     Available models: gpt-3.5-turbo, gpt-4
+     """
+
+     MAX_RETRIES = 3
+
+     def __init__(self, model_cfg: dict, **kwargs):
+         super(OpenAIModel, self).__init__(model_cfg=model_cfg, **kwargs)
+
+         openai_api_key = os.environ.get('OPENAI_API_KEY', None)
+         self.api_key = self.model_cfg.get('api_key', openai_api_key)
+
+         if not self.api_key:
+             logger.error('OpenAI API key is not provided, please set it in environment variable OPENAI_API_KEY')
+             # raise ValueError(
+             #     'OpenAI API key is not provided, '
+             #     'please set it in environment variable OPENAI_API_KEY')
+
+     def predict(self, model_id: str, inputs: dict, **kwargs) -> dict:
+
+         sys_prompt: str = inputs.get('sys_prompt', '')
+         user_prompt: str = inputs.get('user_prompt', '')
+
+         # model_id: str = kwargs.get('model_id', '')
+         temperature: float = kwargs.pop('temperature', 0.2)
+         max_tokens: int = kwargs.pop('max_tokens', 1024)
+         mode: str = kwargs.pop('mode', 'chat.completion')
+
+         logger.info(f'Using OpenAI model_id: {model_id}')
+
+         res = self._predict(
+             model_id=model_id,
+             sys_prompt=sys_prompt,
+             user_prompt=user_prompt,
+             temperature=temperature,
+             max_tokens=max_tokens,
+             mode=mode)
+
+         return res
+
+     def _predict(
+         self,
+         model_id,
+         sys_prompt,
+         user_prompt,
+         temperature,
+         max_tokens,
+         mode: str = 'chat.completion',
+     ) -> dict:
+         import openai
+
+         res = {}
+         openai.api_key = self.api_key
+
+         for i in range(self.MAX_RETRIES):
+             try:
+                 if mode == 'chat.completion':
+                     resp = openai.ChatCompletion.create(
+                         model=model_id,
+                         messages=[{
+                             'role': 'system',
+                             'content': sys_prompt
+                         }, {
+                             'role': 'user',
+                             'content': user_prompt
+                         }],
+                         temperature=temperature,
+                         max_tokens=max_tokens)
+
+                     if resp:
+                         ans_text = resp['choices'][0]['message']['content']
+                         model_id = resp['model']
+                     else:
+                         logger.warning(f'OpenAI GPT API call failed: got empty response '
+                                        f'for input {sys_prompt} {user_prompt}')
+                         ans_text = ''
+                         model_id = ''
+
+                     res['ans_text'] = ans_text
+                     res['model_id'] = model_id
+                 else:
+                     raise ValueError(f'Invalid mode: {mode}')
+
+                 return res
+
+             except Exception as e:
+                 logger.warning(f'OpenAI API call failed: {e}')
+                 time.sleep(3)
+         logger.error(f'OpenAI API call failed after {self.MAX_RETRIES} retries')
+         return res
+
+
+ class DummyChatModel(ChatBaseModel):
+
+     MODEL_ID = 'dummy_chat_model_0801'
+     REVISION = 'v1.0.0'
+
+     def __init__(self, model_cfg: dict, **kwargs):
+         model_cfg['model_id'] = self.MODEL_ID
+         model_cfg['revision'] = self.REVISION
+         super(DummyChatModel, self).__init__(model_cfg=model_cfg)
+
+     def predict(self, inputs: dict, **kwargs) -> dict:
+
+         debug: bool = False
+         if debug:
+             messages = inputs['messages']
+             history = inputs['history']
+
+             logger.info(f'** messages: {messages}')
+             logger.info(f'** history: {history}')
+
+         choice = random.choice(['A', 'B', 'C', 'D'])
+
+         # Build response
+         res = {
+             'choices': [{
+                 'index': 0,
+                 'message': {
+                     'content': choice,
+                     'role': 'assistant'
+                 }
+             }],
+             'created': time.time(),
+             'model': self.MODEL_ID + '-' + self.REVISION,
+             'object': 'chat.completion',
+             'usage': {}
+         }
+
+         return res
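Note that `OpenAIModel._predict` relies on the legacy pre-1.0 `openai.ChatCompletion.create` call, so it requires `openai<1.0`. A hedged usage sketch; the API key is a placeholder:

import os
from evalscope.models.model import OpenAIModel

os.environ.setdefault('OPENAI_API_KEY', 'sk-...')  # placeholder key

model = OpenAIModel(model_cfg={})
res = model.predict(
    model_id='gpt-3.5-turbo',
    inputs={'sys_prompt': 'You are a helpful assistant.',
            'user_prompt': 'Say hello.'},
    temperature=0.2,
    max_tokens=64,
)
print(res.get('ans_text', ''))  # empty dict if all retries failed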
evalscope/models/server_adapter.py ADDED
@@ -0,0 +1,111 @@
+ import requests
+ import time
+ from typing import Optional, Union
+
+ from evalscope.models.base_adapter import BaseModelAdapter
+ from evalscope.utils.chat_service import ChatMessage
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ class ServerModelAdapter(BaseModelAdapter):
+     """
+     Server model adapter to request remote API model and generate results.
+     """
+
+     def __init__(self, api_url: str, model_id: str, api_key: str = 'EMPTY', **kwargs):
+         """
+         Args:
+             api_url: The URL of the remote API model.
+             model_id: The ID of the remote API model.
+             api_key: The API key of the remote API model.
+         """
+         self.api_url = api_url
+         self.model_id = model_id
+         self.api_key = api_key
+         self.seed = kwargs.get('seed', None)
+         self.model_cfg = {'api_url': api_url, 'model_id': model_id, 'api_key': api_key}
+         super().__init__(model=None, model_cfg=self.model_cfg, **kwargs)
+
+     def predict(self, inputs: Union[str, dict, list], infer_cfg: dict = None) -> dict:
+         """
+         Model prediction func.
+
+         Args:
+             inputs (Union[str, dict, list]): The input data.
+             infer_cfg (dict): Inference configuration.
+
+         Returns:
+             res (dict): The model prediction results.
+         """
+         infer_cfg = infer_cfg or {}
+
+         # Process inputs
+         if isinstance(inputs, str):
+             query = inputs
+             system_prompt = None
+         elif isinstance(inputs, dict):
+             data: list = inputs['data']
+             if isinstance(data[0], tuple):  # for truthful_qa and hellaswag
+                 query = '\n'.join(''.join(item) for item in data)
+                 system_prompt = inputs.get('system_prompt', None)
+             else:
+                 query = data[0]
+                 system_prompt = inputs.get('system_prompt', None)
+         elif isinstance(inputs, list):
+             query = '\n'.join(inputs)
+             system_prompt = None
+         else:
+             raise TypeError(f'Unsupported inputs type: {type(inputs)}')
+
+         content = self.make_request_content(query, system_prompt)
+         request_json = self.make_request(content, infer_cfg)
+         response = self.send_request(request_json)
+         return response
+
+     def make_request_content(self, query: str, system_prompt: Optional[str] = None) -> dict:
+         """
+         Make request content for API.
+         """
+         if system_prompt is not None:
+             messages = [
+                 ChatMessage(role='system', content=system_prompt).model_dump(exclude_unset=True),
+                 ChatMessage(role='user', content=query).model_dump(exclude_unset=True)
+             ]
+         else:
+             messages = [ChatMessage(role='user', content=query).model_dump(exclude_unset=True)]
+         return {'messages': messages}
+
+     def make_request(self, content: dict, infer_cfg: dict = {}) -> dict:
+         """Make request to remote API."""
+         # Format request JSON according to OpenAI API format
+         do_sample = infer_cfg.get('do_sample', False)
+         temperature = infer_cfg.get('temperature', 0.0) if do_sample else 0.0
+
+         request_json = {
+             **content, 'model': self.model_id,
+             'max_tokens': infer_cfg.get('max_tokens', 2048),
+             'temperature': temperature,
+             'top_p': infer_cfg.get('top_p', 1.0),
+             'n': infer_cfg.get('num_return_sequences', 1),
+             'stop': infer_cfg.get('stop', None)
+         }
+         if self.seed is not None:
+             request_json['seed'] = self.seed
+         logger.debug(f'Request to remote API: {request_json}')
+         return request_json
+
+     def send_request(self, request_json: dict, max_retries: int = 3) -> dict:
+         for attempt in range(max_retries):
+             response = requests.post(
+                 self.api_url, json=request_json, headers={'Authorization': f'Bearer {self.api_key}'})
+             if response.status_code == 200:
+                 response_data = response.json()
+                 return response_data
+             logger.warning(f'Failed to request to remote API: {response.status_code} {response.text}')
+             if attempt < max_retries - 1:
+                 time.sleep(5)  # Sleep for 5 seconds before retrying
+             else:
+                 raise RuntimeError(f'Failed to request to remote API after {max_retries} attempts: '
+                                    f'{response.status_code} {response.text}')
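A sketch of driving `ServerModelAdapter` against an OpenAI-compatible chat endpoint; the URL and model id are placeholders, and the printed path assumes the server returns OpenAI-format JSON:

from evalscope.models.server_adapter import ServerModelAdapter

adapter = ServerModelAdapter(
    api_url='http://127.0.0.1:8000/v1/chat/completions',  # placeholder endpoint
    model_id='qwen2-7b-instruct',                         # placeholder model id
    api_key='EMPTY',
    seed=42,  # forwarded into the request body when set
)

# A plain string becomes a single user message; sampling is off by default
# (temperature forced to 0.0 unless infer_cfg sets do_sample=True).
response = adapter.predict('What is 2 + 2?', infer_cfg={'max_tokens': 16})
print(response['choices'][0]['message']['content'])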
evalscope/perf/__init__.py CHANGED
@@ -0,0 +1 @@
+ from evalscope.perf.main import run_perf_benchmark
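This re-export makes the stress-test entry point importable from the subpackage root. Its signature is not shown in this diff; the sketch below assumes it accepts an `Arguments` object from `evalscope.perf.arguments`, and the field names are assumptions:

from evalscope.perf import run_perf_benchmark
from evalscope.perf.arguments import Arguments

args = Arguments(
    url='http://127.0.0.1:8000/v1/chat/completions',  # assumed field: target endpoint
    model='qwen2-7b-instruct',                        # assumed field: model id
    parallel=4,                                       # assumed field: concurrent requests
    number=20,                                        # assumed field: total requests
)
run_perf_benchmark(args)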
evalscope/perf/main.py CHANGED
@@ -1,5 +1,4 @@
  import asyncio
- import logging
  import os
  import platform
  from argparse import Namespace
evalscope/perf/plugin/api/custom_api.py CHANGED
@@ -1,5 +1,4 @@
  import json
- from transformers import AutoTokenizer
  from typing import Any, Dict, Iterator, List

  from evalscope.perf.arguments import Arguments
@@ -25,6 +24,7 @@ class CustomPlugin(ApiPluginBase):
          """
          super().__init__(model_path=mode_path)
          if mode_path is not None:
+             from transformers import AutoTokenizer
              self.tokenizer = AutoTokenizer.from_pretrained(mode_path)
          else:
              self.tokenizer = None
evalscope/perf/plugin/api/openai_api.py CHANGED
@@ -1,6 +1,5 @@
  import json
  import os
- from transformers import AutoTokenizer
  from typing import Any, Dict, Iterator, List, Union

  from evalscope.perf.arguments import Arguments
@@ -25,6 +24,7 @@ class OpenaiPlugin(ApiPluginBase):
          """
          super().__init__(model_path=mode_path)
          if mode_path is not None:
+             from transformers import AutoTokenizer
              self.tokenizer = AutoTokenizer.from_pretrained(mode_path)
          else:
              self.tokenizer = None
evalscope/perf/plugin/datasets/flickr8k.py CHANGED
@@ -1,6 +1,5 @@
  import base64
  from io import BytesIO
- from modelscope.msdatasets import MsDataset
  from PIL import Image
  from typing import Any, Dict, Iterator, List

@@ -26,6 +25,7 @@ class FlickrDatasetPlugin(DatasetPluginBase):
          super().__init__(query_parameters)

      def build_messages(self) -> Iterator[List[Dict]]:
+         from modelscope.msdatasets import MsDataset
          dataset = MsDataset.load('clip-benchmark/wds_flickr8k', split='test')

          for item in dataset:
evalscope/perf/plugin/datasets/longalpaca.py CHANGED
@@ -1,4 +1,3 @@
- from modelscope import MsDataset
  from typing import Any, Dict, Iterator, List

  from evalscope.perf.arguments import Arguments
@@ -17,6 +16,7 @@ class LongAlpacaDatasetPlugin(DatasetPluginBase):

      def build_messages(self) -> Iterator[List[Dict]]:
          if not self.query_parameters.dataset_path:
+             from modelscope import MsDataset
              ds = MsDataset.load('AI-ModelScope/LongAlpaca-12k', subset_name='default', split='train')
          else:
              ds = self.dataset_json_list(self.query_parameters.dataset_path)
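The four hunks above apply the same lazy-import pattern: heavy optional dependencies (`transformers`, `modelscope`) move from module scope into the branch that actually needs them, so importing the plugin no longer pays their startup cost. A generic sketch of the pattern; the class is illustrative, not from the diff:

from typing import Any, Optional

class TokenizerHolder:  # hypothetical class, for illustration only
    def __init__(self, model_path: Optional[str] = None):
        if model_path is not None:
            # Deferred import: transformers is only required when a local
            # tokenizer is actually requested.
            from transformers import AutoTokenizer
            self.tokenizer: Any = AutoTokenizer.from_pretrained(model_path)
        else:
            self.tokenizer = None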
evalscope/report/__init__.py ADDED
@@ -0,0 +1,5 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ from evalscope.report.combinator import gen_table, get_data_frame, get_report_list
+ from evalscope.report.generator import ReportGenerator
+ from evalscope.report.utils import Category, Report, ReportKey, Subset
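`gen_table` and `get_report_list` come from the new `combinator` module, which supersedes the removed `evalscope/tools/combine_reports.py`. Assuming they keep that tool's list-of-report-directories interface, usage might look like:

from evalscope.report import gen_table

reports_dir = './outputs/20250101_000000/reports'  # hypothetical path
print(gen_table([reports_dir]))  # render aggregated scores as a table (assumed interface)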