evalscope 0.13.0__py3-none-any.whl → 0.13.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of evalscope was flagged as potentially problematic by the registry's scanner.

Files changed (45)
  1. evalscope/arguments.py +1 -1
  2. evalscope/backend/rag_eval/utils/llm.py +4 -5
  3. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  4. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +109 -0
  5. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  6. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +120 -0
  7. evalscope/benchmarks/arena_hard/utils.py +162 -0
  8. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +2 -5
  9. evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -1
  10. evalscope/benchmarks/data_adapter.py +26 -2
  11. evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
  12. evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -11
  13. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -5
  14. evalscope/benchmarks/live_code_bench/testing_util.py +3 -3
  15. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  16. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +182 -0
  17. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +2 -5
  18. evalscope/collections/evaluator.py +1 -1
  19. evalscope/config.py +6 -3
  20. evalscope/constants.py +1 -0
  21. evalscope/evaluator/evaluator.py +5 -4
  22. evalscope/metrics/llm_judge.py +1 -1
  23. evalscope/models/chat_adapter.py +32 -11
  24. evalscope/models/custom_adapter.py +1 -1
  25. evalscope/perf/arguments.py +19 -46
  26. evalscope/perf/benchmark.py +64 -90
  27. evalscope/perf/main.py +1 -1
  28. evalscope/perf/plugin/api/openai_api.py +4 -2
  29. evalscope/perf/plugin/datasets/__init__.py +1 -0
  30. evalscope/perf/plugin/datasets/openqa.py +6 -11
  31. evalscope/perf/plugin/datasets/random_dataset.py +51 -0
  32. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  33. evalscope/perf/utils/db_util.py +5 -2
  34. evalscope/run.py +14 -2
  35. evalscope/version.py +2 -2
  36. {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/METADATA +42 -78
  37. {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/RECORD +45 -37
  38. tests/cli/test_all.py +33 -24
  39. tests/cli/test_run.py +69 -22
  40. tests/perf/test_perf.py +23 -0
  41. tests/rag/test_ragas.py +4 -1
  42. {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/LICENSE +0 -0
  43. {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/WHEEL +0 -0
  44. {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/entry_points.txt +0 -0
  45. {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/top_level.txt +0 -0
evalscope/perf/benchmark.py CHANGED
@@ -9,7 +9,7 @@ import threading
  import time
  from http import HTTPStatus
  from tqdm import tqdm
- from typing import List
+ from typing import AsyncGenerator, List

  from evalscope.perf.arguments import Arguments
  from evalscope.perf.http_client import AioHttpClient, test_connection
@@ -21,92 +21,68 @@ from evalscope.perf.utils.local_server import start_app
  from evalscope.utils.logger import get_logger

  logger = get_logger()
- query_send_completed_event = asyncio.Event()
+
  data_process_completed_event = asyncio.Event()


  @exception_handler
- async def dispatch_requests_worker(request_queue: asyncio.Queue, args: Arguments):
+ async def get_requests(args: Arguments) -> AsyncGenerator[dict, None]:
      query_generator_class = ApiRegistry(args.api)
      query_generator = query_generator_class(args.tokenizer_path)

      def load_prompt(prompt_path_or_text):
-         """Load the prompt from a file or directly from the input text."""
          if prompt_path_or_text.startswith('@'):
              with open(prompt_path_or_text[1:], 'r', encoding='utf-8') as file:
                  return file.read()
          return prompt_path_or_text

-     async def dispatch_request(request):
-         """Dispatch a single request with optional rate limiting."""
-         await request_queue.put(request)
-         if args.rate != -1:
-             interval = np.random.exponential(1.0 / args.rate)
-             await asyncio.sleep(interval)
-
-     async def dispatch_requests_from_prompt(messages):
-         """Generate and dispatch requests based on the given prompt."""
+     async def generate_requests_from_prompt(messages):
          request = query_generator.build_request(messages, args)
-         if args.number is None:
-             await dispatch_request(request)
-             return 1
          for _ in range(args.number):
-             await dispatch_request(request)
-         return args.number
+             yield request

-     async def dispatch_requests_from_dataset():
-         """Generate and dispatch requests based on the dataset."""
-         total_query_count = 0
+     async def generate_requests_from_dataset():
          message_generator_class = DatasetRegistry(args.dataset)
          message_generator = message_generator_class(args)

+         count = 0
          for messages in message_generator:
              request = query_generator.build_request(messages, args)
-             if request is None:
-                 continue
-             await dispatch_request(request)
-             total_query_count += 1
-             if args.number and total_query_count >= args.number:
-                 break
+             if request is not None:
+                 yield request
+                 count += 1
+                 if args.number and count >= args.number:
+                     break

-         return total_query_count
-
-     # Load prompt or dataset and dispatch requests accordingly
      if args.prompt:
          prompt = load_prompt(args.prompt)
          messages = [{'role': 'user', 'content': prompt}]
-         total_queries = await dispatch_requests_from_prompt(messages)
+         generator = generate_requests_from_prompt(messages)
      elif args.dataset:
-         total_queries = await dispatch_requests_from_dataset()
+         generator = generate_requests_from_dataset()
      else:
          raise Exception('Either prompt or dataset is required!')

-     return total_queries
+     async for request in generator:
+         yield request
+         if args.rate != -1:
+             interval = np.random.exponential(1.0 / args.rate)
+             await asyncio.sleep(interval)


  @exception_handler
- async def send_requests_worker(
-     task_id,
-     request_queue: asyncio.Queue,
+ async def send_request(
+     semaphore: asyncio.Semaphore,
+     request: dict,
      benchmark_data_queue: asyncio.Queue,
      args: Arguments,
  ):
-     client = AioHttpClient(args)
-     async with client:
-         while not (query_send_completed_event.is_set() and request_queue.empty()):
-             try:
-                 # Attempt to get a request from the queue with a timeout
-                 request = await asyncio.wait_for(request_queue.get(), timeout=0.0001)
-                 request_queue.task_done()
-             except asyncio.TimeoutError:
-                 # If timeout, continue to the next iteration
-                 continue
-
-             # Initialize benchmark data for the current request
+     async with semaphore:
+         client = AioHttpClient(args)
+         async with client:
              benchmark_data = BenchmarkData(request=request)
              collected_messages = []
              try:
-                 # Send the request and process the response
                  async for is_error, state_code, response_data in client.post(request):
                      if is_error or state_code != HTTPStatus.OK:
                          logger.error(f'Request: {request} failed, state_code: {state_code}, data: {response_data}')
@@ -124,7 +100,6 @@ async def send_requests_worker(
                  logger.exception(e)
                  logger.error(f'Request query: {request} exception')
              finally:
-                 # Record completion time and collected messages
                  benchmark_data.completed_time = time.perf_counter()
                  benchmark_data.response_messages = collected_messages
                  await benchmark_data_queue.put(benchmark_data)
@@ -150,39 +125,45 @@ async def statistic_benchmark_metric_worker(benchmark_data_queue: asyncio.Queue,
      name = args.name if args.name else f'{args.model_id}_{current_time}'
      wandb.init(project='perf_benchmark', name=name, config=args.to_dict())

-     with sqlite3.connect(result_db_path) as con:
-         cursor = con.cursor()
-         create_result_table(cursor)
-         with tqdm(desc='Processing') as pbar:
-             while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
-                 try:
-                     # Attempt to get benchmark data from the queue with a timeout
-                     benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=0.01)
-                     benchmark_data_queue.task_done()
-                 except asyncio.TimeoutError:
-                     # If timeout, continue to the next iteration
-                     continue
+     collected_benchmark_data = []
+
+     with tqdm(desc='Processing', total=args.number) as pbar:
+         while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
+             try:
+                 # Attempt to get benchmark data from the queue with a timeout
+                 benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=0.01)
+                 benchmark_data_queue.task_done()
+             except asyncio.TimeoutError:
+                 # If timeout, continue to the next iteration
+                 continue

-                 # Update metrics based on the benchmark data
-                 metrics.update_metrics(benchmark_data, api_plugin)
+             # Update metrics based on the benchmark data
+             metrics.update_metrics(benchmark_data, api_plugin)

-                 # Insert benchmark data into the database and commit the transaction
-                 insert_benchmark_data(cursor, benchmark_data)
-                 con.commit()
+             # Collect benchmark data for later database insertion
+             collected_benchmark_data.append(benchmark_data)

-                 # Create a message with the updated metrics
-                 message = metrics.create_message()
+             # Create a message with the updated metrics
+             message = metrics.create_message()

-                 # Log the message to wandb if the api key is provided
-                 if args.wandb_api_key:
-                     wandb.log(message)
+             # Log the message to wandb if the api key is provided
+             if args.wandb_api_key:
+                 wandb.log(message)

-                 # Log the message to the logger every n queries
-                 if int(metrics.n_total_queries) % args.log_every_n_query == 0:
-                     msg = json.dumps(message, ensure_ascii=False, indent=2)
-                     logger.info(msg)
+             # Log the message to the logger every n queries
+             if int(metrics.n_total_queries) % args.log_every_n_query == 0:
+                 msg = json.dumps(message, ensure_ascii=False, indent=2)
+                 logger.info(msg)

-                 pbar.update(1)  # Update the progress bar
+             pbar.update(1)  # Update the progress bar
+
+     # Now perform database operations after all benchmark data has been processed
+     with sqlite3.connect(result_db_path) as con:
+         cursor = con.cursor()
+         create_result_table(cursor)
+         for benchmark_data in collected_benchmark_data:
+             insert_benchmark_data(cursor, benchmark_data)
+         con.commit()

      return metrics, result_db_path

@@ -199,7 +180,7 @@ async def start_server(args: Arguments) -> bool:
      else:
          args.url = f'http://127.0.0.1:{args.port}/v1/chat/completions'

-     if not await test_connection(args):
+     if (not args.no_test_connection) and (not await test_connection(args)):
          raise TimeoutError('Test connection failed')

@@ -210,39 +191,32 @@ async def benchmark(args: Arguments) -> None:
      add_signal_handlers(loop)

      # init queue
-     request_queue = asyncio.Queue()
      benchmark_data_queue = asyncio.Queue()

      # reset event
-     query_send_completed_event.clear()
      data_process_completed_event.clear()

+     semaphore = asyncio.Semaphore(args.parallel)
+
      async def create_send_request_tasks():
          tasks: List[asyncio.Task] = []
-         for idx in range(args.parallel):
-             task = asyncio.create_task(send_requests_worker(idx, request_queue, benchmark_data_queue, args))
+         async for request in get_requests(args):
+             task = asyncio.create_task(send_request(semaphore, request, benchmark_data_queue, args))
              tasks.append(task)
          return tasks

      async def run_tasks():
          await start_server(args)

-         dispatch_task = asyncio.create_task(dispatch_requests_worker(request_queue, args))
          statistic_benchmark_metric_task = asyncio.create_task(
              statistic_benchmark_metric_worker(benchmark_data_queue, args))
          send_request_tasks = await create_send_request_tasks()

-         expected_number_of_queries = await dispatch_task
-         await request_queue.join()
-         query_send_completed_event.set()
-
          await asyncio.gather(*send_request_tasks, return_exceptions=True)
          await benchmark_data_queue.join()
          data_process_completed_event.set()

          metrics, result_db_path = await statistic_benchmark_metric_task
-         summary_result(args, metrics, expected_number_of_queries, result_db_path)
-
-         await asyncio.sleep(0.250)
+         summary_result(args, metrics, result_db_path)

      await run_tasks()
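The net effect of this refactor: the fixed pool of queue-draining workers is replaced by an async generator that yields requests (optionally paced with exponential inter-arrival times, i.e. a Poisson process), one task per request, with an asyncio.Semaphore bounding how many run concurrently. A minimal, self-contained sketch of that pattern follows; the handle coroutine and the dummy request dicts are placeholders, not evalscope APIs.

    import asyncio
    import numpy as np

    async def get_requests(number: int, rate: float):
        # Yield `number` dummy requests; when a finite rate is given, space them
        # with exponential inter-arrival times (Poisson-style pacing).
        for i in range(number):
            yield {'id': i}
            if rate != -1:
                await asyncio.sleep(np.random.exponential(1.0 / rate))

    async def handle(semaphore: asyncio.Semaphore, request: dict):
        # The semaphore caps how many requests are in flight at once.
        async with semaphore:
            await asyncio.sleep(0.1)  # stand-in for the HTTP call
            return request['id']

    async def main(number: int = 10, parallel: int = 3, rate: float = 5.0):
        semaphore = asyncio.Semaphore(parallel)
        tasks = [asyncio.create_task(handle(semaphore, r))
                 async for r in get_requests(number, rate)]
        print(await asyncio.gather(*tasks, return_exceptions=True))

    asyncio.run(main())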
evalscope/perf/main.py CHANGED
@@ -32,7 +32,7 @@ def run_perf_benchmark(args):
      if platform.system() == 'Windows':
          asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

-     loop = asyncio.get_event_loop()
+     loop = asyncio.new_event_loop()
      if platform.system() != 'Windows':
          add_signal_handlers(loop)

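Calling asyncio.get_event_loop() outside a running coroutine is deprecated in recent Python versions and can return a closed loop on repeated runs, so the entrypoint now creates a fresh loop explicitly. A minimal sketch of that pattern, with a placeholder coroutine standing in for the real benchmark:

    import asyncio

    async def benchmark():
        await asyncio.sleep(0)  # placeholder for the actual benchmark coroutine

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)  # keeps later get_event_loop() calls consistent
    try:
        loop.run_until_complete(benchmark())
    finally:
        loop.close()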
evalscope/perf/plugin/api/openai_api.py CHANGED
@@ -70,7 +70,7 @@ class OpenaiPlugin(ApiPluginBase):
      def __compose_query_from_parameter(self, payload: Dict, param: Arguments):
          payload['model'] = param.model
          if param.max_tokens is not None:
-             payload['max_tokens'] = param.max_tokens
+             payload['max_completion_tokens'] = param.max_tokens
          if param.min_tokens is not None:
              payload['min_tokens'] = param.min_tokens
          if param.frequency_penalty is not None:
@@ -94,9 +94,11 @@ class OpenaiPlugin(ApiPluginBase):
              payload['top_p'] = param.top_p
          if param.top_k is not None:
              payload['top_k'] = param.top_k
+         if param.extra_args is not None:
+             payload.update(param.extra_args)
          return payload

-     def parse_responses(self, responses, request: Any = None, **kwargs) -> Dict:
+     def parse_responses(self, responses, request: Any = None, **kwargs) -> tuple[int, int]:
          """Parser responses and return number of request and response tokens.
          Only one response for non-stream, multiple responses for stream.
          """
evalscope/perf/plugin/datasets/__init__.py CHANGED
@@ -3,4 +3,5 @@ from evalscope.perf.plugin.datasets.flickr8k import FlickrDatasetPlugin
  from evalscope.perf.plugin.datasets.line_by_line import LineByLineDatasetPlugin
  from evalscope.perf.plugin.datasets.longalpaca import LongAlpacaDatasetPlugin
  from evalscope.perf.plugin.datasets.openqa import OpenqaDatasetPlugin
+ from evalscope.perf.plugin.datasets.random_dataset import RandomDatasetPlugin
  from evalscope.perf.plugin.datasets.speed_benchmark import SpeedBenchmarkDatasetPlugin, SpeedBenchmarkLongDatasetPlugin
evalscope/perf/plugin/datasets/openqa.py CHANGED
@@ -1,5 +1,5 @@
  import json
- import subprocess
+ import os
  from typing import Any, Dict, Iterator, List

  from evalscope.perf.arguments import Arguments
@@ -18,16 +18,11 @@ class OpenqaDatasetPlugin(DatasetPluginBase):

      def build_messages(self) -> Iterator[List[Dict]]:
          if not self.query_parameters.dataset_path:
-             subprocess.call([
-                 'modelscope',
-                 'download',
-                 '--dataset',
-                 'AI-ModelScope/HC3-Chinese',
-                 'open_qa.jsonl',
-                 '--local_dir',
-                 './data',
-             ])
-             self.query_parameters.dataset_path = './data/open_qa.jsonl'
+             from modelscope import dataset_snapshot_download
+
+             file_name = 'open_qa.jsonl'
+             local_path = dataset_snapshot_download('AI-ModelScope/HC3-Chinese', allow_patterns=[file_name])
+             self.query_parameters.dataset_path = os.path.join(local_path, file_name)

          for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
              item = json.loads(item)
evalscope/perf/plugin/datasets/random_dataset.py ADDED
@@ -0,0 +1,51 @@
+ import numpy as np
+ from typing import Dict, Iterator, List
+
+ from evalscope.perf.arguments import Arguments
+ from evalscope.perf.plugin.datasets.base import DatasetPluginBase
+ from evalscope.perf.plugin.registry import register_dataset
+
+
+ @register_dataset('random')
+ class RandomDatasetPlugin(DatasetPluginBase):
+     """Read dataset and return prompt.
+     """
+
+     def __init__(self, query_parameters: Arguments):
+         super().__init__(query_parameters)
+         assert self.query_parameters.tokenizer_path, 'Tokenizer path is required for random data generation, please provide it with `--tokenizer_path`.'  # noqa: E501
+
+         from modelscope import AutoTokenizer
+         self.tokenizer = AutoTokenizer.from_pretrained(self.query_parameters.tokenizer_path, trust_remote_code=True)
+         self.prefix_length = self.query_parameters.prefix_length
+         self.prefix_ids = self.get_random_inputs(self.prefix_length)
+         self.template_len = self.get_template_len()
+         self.number = self.query_parameters.number or 1
+
+     def build_messages(self) -> Iterator[List[Dict]]:
+         min_prompt_length = self.query_parameters.min_prompt_length - self.template_len
+         max_prompt_length = self.query_parameters.max_prompt_length - self.template_len + 1
+
+         assert min_prompt_length >= 0, f'min_prompt_length should be greater than or equal to the template length {self.template_len}.'  # noqa: E501
+         assert max_prompt_length >= min_prompt_length, 'max_prompt_length should be greater than or equal to min_prompt_length.'  # noqa: E501
+
+         # refer to https://github.com/vllm-project/vllm/blob/ed6e9075d31e32c8548b480a47d1ffb77da1f54c/benchmarks/benchmark_serving.py#L366C1-L399C1  # noqa: E501
+         input_lens = np.random.randint(min_prompt_length, max_prompt_length, size=self.number)
+         offsets = np.random.randint(0, self.tokenizer.vocab_size, size=self.number)
+
+         for i in range(self.number):
+             prompt_ids = (offsets[i] + i + np.arange(input_lens[i])) % self.tokenizer.vocab_size
+             prompt = self.tokenizer.decode(
+                 self.prefix_ids + prompt_ids.tolist(), skip_special_tokens=False, clean_up_tokenization_spaces=False)
+             yield [{'role': 'user', 'content': prompt}]
+
+     def get_random_inputs(self, length: int) -> List[int]:
+         if length <= 0:
+             return []
+         input_ids = np.random.randint(0, self.tokenizer.vocab_size, size=length).tolist()
+         return input_ids
+
+     def get_template_len(self):
+         empty_message = [{'role': 'user', 'content': ''}]
+         template = self.tokenizer.apply_chat_template(empty_message, tokenize=True, add_generation_prompt=True)
+         return len(template)
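The new random dataset plugin borrows the vLLM serving benchmark's trick for synthesizing a prompt of a target token length: pick a random offset into the vocabulary and take a consecutive run of token ids (wrapped modulo vocab_size), then decode. A standalone sketch of that id-generation step, assuming any Hugging Face-style tokenizer; the gpt2 tokenizer below is only an example:

    import numpy as np
    from transformers import AutoTokenizer  # modelscope's AutoTokenizer behaves the same way

    tokenizer = AutoTokenizer.from_pretrained('gpt2')  # example tokenizer

    number, min_len, max_len = 3, 16, 32
    input_lens = np.random.randint(min_len, max_len + 1, size=number)
    offsets = np.random.randint(0, tokenizer.vocab_size, size=number)

    for i in range(number):
        # Consecutive token ids starting at a random offset, wrapped at vocab_size.
        prompt_ids = (offsets[i] + i + np.arange(input_lens[i])) % tokenizer.vocab_size
        prompt = tokenizer.decode(prompt_ids.tolist(), skip_special_tokens=False,
                                  clean_up_tokenization_spaces=False)
        print(len(prompt_ids), repr(prompt[:40]))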
evalscope/perf/plugin/datasets/speed_benchmark.py CHANGED
@@ -3,6 +3,9 @@ from typing import Dict, Iterator, List, Tuple
  from evalscope.perf.arguments import Arguments
  from evalscope.perf.plugin.datasets.base import DatasetPluginBase
  from evalscope.perf.plugin.registry import register_dataset
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()


  @register_dataset('speed_benchmark')
@@ -18,6 +21,14 @@ class SpeedBenchmarkDatasetPlugin(DatasetPluginBase):
      def __init__(self, query_parameters: Arguments):
          super().__init__(query_parameters)

+         url = self.query_parameters.url
+         if url.endswith('v1/chat/completions'):
+             logger.warning(
+                 'The API URL is not set correctly for `speed_benchmark`. Using `v1/completions` instead of `v1/chat/completions` by system.'  # noqa
+             )
+             url = url.replace('v1/chat/completions', 'v1/completions')
+             self.query_parameters.url = url
+
      def build_messages(self) -> Iterator[List[Dict]]:
          for input_len in self.INPUT_LENGTH:
              for _ in range(self.REPEAT):
evalscope/perf/utils/db_util.py CHANGED
@@ -2,6 +2,7 @@ import base64
  import json
  import os
  import pickle
+ import re
  import sqlite3
  import sys
  from datetime import datetime
@@ -91,6 +92,8 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)
  def get_output_path(args: Arguments) -> str:
      current_time = datetime.now().strftime('%Y%m%d_%H%M%S')
      output_path = os.path.join(args.outputs_dir, current_time, f'{args.name or args.model_id}')
+     # Filter illegal characters
+     output_path = re.sub(r'[<>:"|?*]', '_', output_path)
      if not os.path.exists(output_path):
          os.makedirs(output_path, exist_ok=True)
      logger.info(f'Save the result to: {output_path}')
@@ -191,12 +194,12 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
      return results


- def summary_result(args: Arguments, metrics: BenchmarkMetrics, expected_number_of_queries: int, result_db_path: str):
+ def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: str):
      result_path = os.path.dirname(result_db_path)
      write_json_file(args.to_dict(), os.path.join(result_path, 'benchmark_args.json'))

      data = metrics.create_message()
-     data.update({'Expected number of requests': expected_number_of_queries, 'Result DB path': result_db_path})
+     data.update({'Expected number of requests': args.number, 'Result DB path': result_db_path})
      write_json_file(data, os.path.join(result_path, 'benchmark_summary.json'))

      # Print summary in a table
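The re.sub guard strips the characters that Windows forbids in file and directory names (model ids containing ':' or '?' would otherwise break os.makedirs there); path separators are deliberately left alone. A quick illustration with a made-up path:

    import re

    output_path = 'outputs/20250401_120000/my-model:v1?beta'  # hypothetical model id
    print(re.sub(r'[<>:"|?*]', '_', output_path))
    # -> outputs/20250401_120000/my-model_v1_beta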
evalscope/run.py CHANGED
@@ -39,9 +39,11 @@ def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:
      configure_logging(task_cfg.debug, os.path.join(outputs.logs_dir, 'eval_log.log'))

      if task_cfg.eval_backend != EvalBackend.NATIVE:
-         return run_non_native_backend(task_cfg, outputs)
+         result = run_non_native_backend(task_cfg, outputs)
      else:
-         return evaluate_model(task_cfg, outputs)
+         result = evaluate_model(task_cfg, outputs)
+
+     return result


  def setup_work_directory(task_cfg: TaskConfig, run_time: str):
@@ -117,6 +119,16 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
          res_dict = evaluator.eval()
          eval_results[evaluator.dataset_name] = res_dict

+     # Clean up
+     if base_model is not None:
+         import gc
+         import torch
+
+         del base_model
+         del evaluators
+         torch.cuda.empty_cache()
+         gc.collect()
+
      return eval_results

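The added cleanup in evaluate_model releases GPU memory held by a locally loaded checkpoint once all evaluators have run, which matters when several tasks run in one process. The usual pattern is to drop the Python references first, then collect and empty the CUDA cache. A minimal sketch, assuming a PyTorch model (GPU optional):

    import gc
    import torch

    model = torch.nn.Linear(8, 8)
    if torch.cuda.is_available():
        model = model.cuda()

    # Drop references before collecting, otherwise the tensors stay reachable.
    del model
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()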
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

- __version__ = '0.13.0'
- __release_datetime__ = '2025-03-14 12:00:00'
+ __version__ = '0.13.2'
+ __release_datetime__ = '2025-04-01 20:00:00'