evalscope 0.8.0__py3-none-any.whl → 0.8.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (68)
  1. evalscope/backend/base.py +1 -1
  2. evalscope/backend/rag_eval/utils/clip.py +2 -2
  3. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  4. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  5. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +2 -1
  6. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +2 -1
  7. evalscope/benchmarks/humaneval/humaneval_adapter.py +193 -7
  8. evalscope/benchmarks/race/race_adapter.py +2 -1
  9. evalscope/config.py +38 -2
  10. evalscope/constants.py +24 -38
  11. evalscope/evaluator/__init__.py +0 -1
  12. evalscope/evaluator/evaluator.py +6 -4
  13. evalscope/evaluator/rating_eval.py +1 -1
  14. evalscope/evaluator/reviewer/auto_reviewer.py +2 -1
  15. evalscope/models/model_adapter.py +1 -1
  16. evalscope/perf/arguments.py +3 -1
  17. evalscope/perf/benchmark.py +3 -3
  18. evalscope/perf/main.py +5 -6
  19. evalscope/perf/plugin/api/openai_api.py +53 -49
  20. evalscope/perf/plugin/registry.py +3 -3
  21. evalscope/perf/utils/benchmark_util.py +4 -4
  22. evalscope/perf/utils/db_util.py +66 -22
  23. evalscope/perf/utils/local_server.py +4 -1
  24. evalscope/run.py +45 -82
  25. evalscope/run_arena.py +2 -1
  26. evalscope/summarizer.py +14 -26
  27. evalscope/third_party/longbench_write/eval.py +2 -1
  28. evalscope/third_party/longbench_write/longbench_write.py +2 -1
  29. evalscope/third_party/longbench_write/tools/data_etl.py +1 -1
  30. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  31. evalscope/tools/combine_reports.py +2 -4
  32. evalscope/tools/rewrite_eval_results.py +1 -1
  33. evalscope/utils/__init__.py +1 -0
  34. evalscope/utils/chat_service.py +1 -1
  35. evalscope/utils/io_utils.py +162 -0
  36. evalscope/utils/logger.py +8 -0
  37. evalscope/utils/utils.py +0 -175
  38. evalscope/version.py +2 -2
  39. {evalscope-0.8.0.dist-info → evalscope-0.8.2.dist-info}/METADATA +15 -3
  40. {evalscope-0.8.0.dist-info → evalscope-0.8.2.dist-info}/RECORD +47 -67
  41. tests/cli/test_run.py +11 -12
  42. tests/perf/test_perf.py +3 -2
  43. tests/vlm/test_vlmeval.py +3 -2
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
  52. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
  53. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  54. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  55. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  56. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  57. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
  58. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
  59. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
  60. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
  61. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  62. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
  63. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
  64. evalscope/evaluator/humaneval_evaluator.py +0 -158
  65. {evalscope-0.8.0.dist-info → evalscope-0.8.2.dist-info}/LICENSE +0 -0
  66. {evalscope-0.8.0.dist-info → evalscope-0.8.2.dist-info}/WHEEL +0 -0
  67. {evalscope-0.8.0.dist-info → evalscope-0.8.2.dist-info}/entry_points.txt +0 -0
  68. {evalscope-0.8.0.dist-info → evalscope-0.8.2.dist-info}/top_level.txt +0 -0
evalscope/perf/plugin/api/openai_api.py CHANGED
@@ -1,7 +1,7 @@
 import json
 import os
 from transformers import AutoTokenizer
-from typing import Any, Dict, Iterator, List
+from typing import Any, Dict, Iterator, List, Union
 
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.api.base import ApiPluginBase
@@ -29,7 +29,7 @@ class OpenaiPlugin(ApiPluginBase):
         else:
             self.tokenizer = None
 
-    def build_request(self, messages: List[Dict] | str, param: Arguments) -> Dict:
+    def build_request(self, messages: Union[List[Dict], str], param: Arguments) -> Dict:
         """Build the openai format request based on prompt, dataset
 
         Args:
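Note: the only functional change in this hunk is the annotation. The `List[Dict] | str` form (PEP 604) is evaluated when the method is defined and raises a TypeError on Python 3.9 and earlier unless postponed annotations are enabled, while `typing.Union` imports cleanly on all supported versions. A small standalone illustration (not package code; names are made up):

from typing import Dict, List, Union

def build_request_ok(messages: Union[List[Dict], str]) -> Dict:
    # Annotation style that also works on Python 3.8/3.9.
    return {'messages': messages}

# Without `from __future__ import annotations`, defining this on Python < 3.10 fails at
# import time with "TypeError: unsupported operand type(s) for |":
# def build_request_broken(messages: List[Dict] | str) -> Dict: ...

print(build_request_ok('hello'))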
@@ -96,60 +96,64 @@ class OpenaiPlugin(ApiPluginBase):
 
     def parse_responses(self, responses, request: Any = None, **kwargs) -> Dict:
         """Parser responses and return number of request and response tokens.
-        sample of the output delta:
-        {"id":"4","object":"chat.completion.chunk","created":1714030870,"model":"llama3","choices":[{"index":0,"delta":{"role":"assistant","content":""},"logprobs":null,"finish_reason":null}]}
+        Only one response for non-stream, multiple responses for stream.
+        """
 
+        # when stream, the last response is the full usage
+        # when non-stream, the last response is the first response
+        last_response_js = json.loads(responses[-1])
+        if 'usage' in last_response_js and last_response_js['usage']:
+            input_tokens = last_response_js['usage']['prompt_tokens']
+            output_tokens = last_response_js['usage']['completion_tokens']
+            return input_tokens, output_tokens
 
-        Args:
-            responses (List[bytes]): List of http response body, for stream output,
-                there are multiple responses, for general only one.
-            kwargs: (Any): The command line --parameter content.
-        Returns:
-            Tuple: Return number of prompt token and number of completion tokens.
-        """
-        full_response_content = ''
+        # no usage information in the response, parse the response to get the tokens
         delta_contents = {}
-        input_tokens = None
-        output_tokens = None
         for response in responses:
             js = json.loads(response)
-            if js['object'] == 'chat.completion':
-                for choice in js['choices']:
-                    delta_contents[choice['index']] = [choice['message']['content']]
-                input_tokens = js['usage']['prompt_tokens']
-                output_tokens = js['usage']['completion_tokens']
-            elif js['object'] == 'text_completion':
-                for choice in js['choices']:
-                    delta_contents[choice['index']] = [choice['text']]
-                input_tokens = js['usage']['prompt_tokens']
-                output_tokens = js['usage']['completion_tokens']
-            elif js['object'] == 'chat.completion.chunk':
-                if 'choices' in js:
-                    for choice in js['choices']:
-                        if 'delta' in choice and 'index' in choice:
-                            delta = choice['delta']
-                            idx = choice['index']
-                            if 'content' in delta:
-                                delta_content = delta['content']
-                                if idx in delta_contents:
-                                    delta_contents[idx].append(delta_content)
-                                else:
-                                    delta_contents[idx] = [delta_content]
-            # usage in chunk: {"id":"","object":"chat.completion.chunk","created":1718269986,"model":"llama3",
-            # "choices":[],"usage":{"prompt_tokens":32,"total_tokens":384,"completion_tokens":352}}
-            if 'usage' in js and js['usage']:
-                input_tokens = js['usage']['prompt_tokens']
-                output_tokens = js['usage']['completion_tokens']
-        if (input_tokens is None and output_tokens is None and self.tokenizer is not None):
-            input_tokens = 0
-            output_tokens = 0
+            if 'object' in js:
+                self.__process_response_object(js, delta_contents)
+            else:
+                self.__process_no_object(js, delta_contents)
+
+        input_tokens, output_tokens = self.__calculate_tokens_from_content(request, delta_contents)
+        return input_tokens, output_tokens
+
+    def __process_response_object(self, js, delta_contents):
+        if js['object'] == 'chat.completion':
+            for choice in js['choices']:
+                delta_contents[choice['index']] = [choice['message']['content']]
+        elif js['object'] == 'text_completion':
+            for choice in js['choices']:
+                delta_contents[choice['index']] = [choice['text']]
+        elif js['object'] == 'chat.completion.chunk':
+            for choice in js.get('choices', []):
+                if 'delta' in choice and 'index' in choice:
+                    delta = choice['delta']
+                    idx = choice['index']
+                    if 'content' in delta:
+                        delta_content = delta['content']
+                        delta_contents.setdefault(idx, []).append(delta_content)
+
+    def __process_no_object(self, js, delta_contents):
+        # assume the response is a single choice
+        for choice in js['choices']:
+            if 'delta' in choice:
+                delta = choice['delta']
+                idx = choice['index']
+                if 'content' in delta:
+                    delta_content = delta['content']
+                    delta_contents.setdefault(idx, []).append(delta_content)
+            else:
+                delta_contents[choice['index']] = [choice['message']['content']]
+
+    def __calculate_tokens_from_content(self, request, delta_contents):
+        input_tokens = output_tokens = 0
+        if self.tokenizer is not None:
             for idx, choice_contents in delta_contents.items():
-                full_response_content = ''.join([m for m in choice_contents])
+                full_response_content = ''.join(choice_contents)
                 input_tokens += len(self.tokenizer.encode(request['messages'][0]['content']))
                 output_tokens += len(self.tokenizer.encode(full_response_content))
-        elif input_tokens is None and output_tokens is None:  # no usage info get.
-            input_tokens = 0
-            output_tokens = 0
+        else:
             logger.warning('No usage information found. Please specify `--tokenizer-path` to generate usage details.')
-
         return input_tokens, output_tokens
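For orientation, the refactor above boils down to one token-accounting strategy: trust the `usage` block of the last response when the server reports it, otherwise stitch the streamed deltas back together and re-tokenize. A minimal standalone sketch of that strategy (illustrative only; the function name and toy `encode` callback are not part of the package):

import json

def count_tokens(responses, encode=None):
    # Prefer server-reported usage: non-stream replies and the final stream chunk carry it.
    last = json.loads(responses[-1])
    if last.get('usage'):
        return last['usage']['prompt_tokens'], last['usage']['completion_tokens']

    # Otherwise reassemble the streamed delta content per choice index.
    contents = {}
    for raw in responses:
        for choice in json.loads(raw).get('choices', []):
            delta = choice.get('delta', {})
            if 'content' in delta:
                contents.setdefault(choice['index'], []).append(delta['content'])

    if encode is None:  # mirrors the plugin's warning path when no tokenizer is configured
        return 0, 0
    return 0, sum(len(encode(''.join(parts))) for parts in contents.values())

chunks = [
    '{"choices":[{"index":0,"delta":{"content":"Hello"}}]}',
    '{"choices":[{"index":0,"delta":{"content":" world"}}]}',
]
print(count_tokens(chunks, encode=str.split))  # -> (0, 2)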
evalscope/perf/plugin/registry.py CHANGED
@@ -1,4 +1,4 @@
-from typing import Any, List, Type
+from typing import Any, List, Type, Union
 
 
 class PluginRegistry:
@@ -20,7 +20,7 @@ class PluginRegistry:
         return self.get_class(name)
 
 
-def register_dataset(name: str | List[str]):
+def register_dataset(name: Union[str, List[str]]):
 
     def class_decorator(cls: Type):
         if isinstance(name, str):
@@ -35,7 +35,7 @@ def register_dataset(name: str | List[str]):
     return class_decorator
 
 
-def register_api(name: str | List[str]):
+def register_api(name: Union[str, List[str]]):
 
     def class_decorator(cls: Type):
         if isinstance(name, str):
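The same `Union` substitution as in openai_api.py keeps these decorators importable on Python 3.8/3.9. For readers unfamiliar with the pattern, here is a minimal register-by-name sketch (illustrative names, not the package's registry):

from typing import Dict, List, Type, Union

REGISTRY: Dict[str, Type] = {}

def register(name: Union[str, List[str]]):
    # Register a class under one or more lookup names.
    def class_decorator(cls: Type):
        for key in ([name] if isinstance(name, str) else name):
            REGISTRY[key] = cls
        return cls
    return class_decorator

@register(['demo', 'demo_alias'])
class DemoPlugin:
    pass

print(REGISTRY['demo_alias'] is DemoPlugin)  # True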
evalscope/perf/utils/benchmark_util.py CHANGED
@@ -116,19 +116,19 @@ class BenchmarkMetrics:
 
     def create_message(self, default_ndigits=3):
         message = {
-            'Time taken for tests (senconds)': round(self.total_time, default_ndigits),
+            'Time taken for tests (s)': round(self.total_time, default_ndigits),
             'Number of concurrency': self.concurrency,
             'Total requests': int(self.n_total_queries),
             'Succeed requests': self.n_succeed_queries,
             'Failed requests': self.n_failed_queries,
+            'Throughput(average tokens/s)': round(self.avg_token_per_seconds, default_ndigits),
             'Average QPS': round(self.qps, default_ndigits),
             'Average latency (s)': round(self.avg_latency, default_ndigits),
             'Average time to first token (s)': round(self.avg_first_chunk_latency, default_ndigits),
             'Average time per output token (s)': round(self.avg_time_per_token, 5),
-            'Average package latency (s)': round(self.avg_chunk_time, default_ndigits),
-            'Average package per request': round(self.n_avg_chunks, default_ndigits),
-            'Throughput(average output tokens per second)': round(self.avg_token_per_seconds, default_ndigits),
             'Average input tokens per request': round(self.avg_prompt_tokens, default_ndigits),
             'Average output tokens per request': round(self.avg_completion_tokens, default_ndigits),
+            'Average package latency (s)': round(self.avg_chunk_time, default_ndigits),
+            'Average package per request': round(self.n_avg_chunks, default_ndigits),
         }
         return message
evalscope/perf/utils/db_util.py CHANGED
@@ -6,6 +6,7 @@ import sqlite3
 import sys
 from datetime import datetime
 from tabulate import tabulate
+from typing import Dict, List
 
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
@@ -107,44 +108,87 @@ def get_result_db_path(args: Arguments):
     return result_db_path
 
 
-def get_percentile_results(result_db_path: str):
+def calculate_percentiles(data: List[float], percentiles: List[int]) -> Dict[int, float]:
+    """
+    Calculate the percentiles for a specific list of data.
 
-    def percentile_results(rows, index, percentiles):
-        results = {}
-        n_success_queries = len(rows)
-        for percentile in percentiles:
+    :param data: List of values for a specific metric.
+    :param percentiles: List of percentiles to calculate.
+    :return: Dictionary of calculated percentiles.
+    """
+    results = {}
+    n_success_queries = len(data)
+    data.sort()
+    for percentile in percentiles:
+        try:
             idx = int(n_success_queries * percentile / 100)
-            row = rows[idx]
-            value = row[index] if row[index] is not None else float('inf')
+            value = data[idx] if data[idx] is not None else float('nan')
             results[percentile] = round(value, 4)
-        return results
+        except IndexError:
+            results[percentile] = float('nan')
+    return results
+
+
+def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
+    """
+    Compute and return quantiles for various metrics from the database results.
+
+    :param result_db_path: Path to the SQLite database file.
+    :return: Dictionary of percentiles for various metrics.
+    """
+
+    def inter_token_latencies(chunk_times_json: str) -> List[float]:
+        try:
+            chunk_times = json.loads(chunk_times_json)
+            return [t2 - t1 for t1, t2 in zip(chunk_times[:-1], chunk_times[1:])]
+        except (json.JSONDecodeError, TypeError) as e:
+            logger.error(f'Error parsing chunk times: {e}')
+            return []
 
     query_sql = ('SELECT start_time, chunk_times, success, completed_time, latency, first_chunk_latency, '
                  'n_chunks, chunk_time, prompt_tokens, completion_tokens '
-                 'FROM result WHERE success=1 ORDER BY first_chunk_latency ASC')
+                 'FROM result WHERE success=1')
+
     percentiles = [10, 25, 50, 66, 75, 80, 90, 95, 98, 99]
 
     with sqlite3.connect(result_db_path) as con:
         rows = con.execute(query_sql).fetchall()
 
-    if len(rows) <= len(percentiles):
+    if len(rows) < len(percentiles):
         logger.info('Too little data to calculate quantiles!')
         return {}
 
-    # Calculate percentiles for first chunk latency and latency
-    first_chunk_latency_index = 5
-    latency_index = 4
+    # Define index variables for columns
+    CHUNK_TIMES_INDEX = 1
+    LATENCY_INDEX = 4
+    FIRST_CHUNK_LATENCY_INDEX = 5
+    PROMPT_TOKENS_INDEX = 8
+    COMPLETION_TOKENS_INDEX = 9
+
+    # Prepare data for each metric
+    inter_token_latencies_all = []
+    for row in rows:
+        inter_token_latencies_all.extend(inter_token_latencies(row[CHUNK_TIMES_INDEX]))
+
+    metrics = {
+        'TTFT (s)': [row[FIRST_CHUNK_LATENCY_INDEX] for row in rows],
+        'TPOT (s)': inter_token_latencies_all,
+        'Latency (s)': [row[LATENCY_INDEX] for row in rows],
+        'Input tokens': [row[PROMPT_TOKENS_INDEX] for row in rows],
+        'Output tokens': [row[COMPLETION_TOKENS_INDEX] for row in rows],
+        'Throughput(tokens/s)': [(row[COMPLETION_TOKENS_INDEX] / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan')
+                                 for row in rows]
+    }
 
-    first_chunk_latency_results = percentile_results(rows, first_chunk_latency_index, percentiles)
-    rows.sort(key=lambda x: x[latency_index])
-    latency_results = percentile_results(rows, latency_index, percentiles)
+    # Calculate percentiles for each metric
+    results = {'Percentile': [f'{p}%' for p in percentiles]}
+    for metric_name, data in metrics.items():
+        metric_percentiles = calculate_percentiles(data, percentiles)
+        results[metric_name] = [metric_percentiles[p] for p in percentiles]
 
-    # Prepare data for tabulation
-    return {
-        'Percentile': [f'{p}%' for p in percentiles],
-        'First Chunk Latency (s)': [first_chunk_latency_results[p] for p in percentiles],
-        'Latency (s)': [latency_results[p] for p in percentiles]
-    }
+    return results
 
 
 def summary_result(args: Arguments, metrics: BenchmarkMetrics, expected_number_of_queries: int, result_db_path: str):
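As a quick sanity check of the selection rule used by calculate_percentiles above (sort the samples, then take the element at int(n * p / 100)), here is a hedged standalone example with made-up latency samples:

def pick_percentiles(data, percentiles):
    # Same index rule as above, without the NaN/IndexError handling.
    data = sorted(data)
    n = len(data)
    return {p: round(data[int(n * p / 100)], 4) for p in percentiles}

latencies = [0.8, 1.2, 0.5, 2.0, 1.1, 0.9, 1.5, 0.7, 1.0, 1.3]
print(pick_percentiles(latencies, [50, 90, 99]))  # {50: 1.1, 90: 2.0, 99: 2.0}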
evalscope/perf/utils/local_server.py CHANGED
@@ -102,6 +102,8 @@ def start_app(args: Arguments):
 
     elif args.api == 'local_vllm':
         os.environ['VLLM_USE_MODELSCOPE'] = 'True'
+        os.environ['VLLM_ALLOW_LONG_MAX_MODEL_LEN'] = '1'
+        os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
         # yapf: disable
         proc = subprocess.Popen([
             'python', '-m', 'vllm.entrypoints.openai.api_server',
@@ -111,7 +113,8 @@ def start_app(args: Arguments):
             '--max-model-len', '32768',
             '--gpu-memory-utilization', '0.9',
             '--host', '0.0.0.0',
-            '--port', args.port,
+            '--port', str(args.port),
+            '--trust-remote-code',
             '--disable-log-requests',
             '--disable-log-stats',
         ])
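Casting the port matters because subprocess.Popen expects every element of an argument list to be a string; passing an int raises a TypeError before the server even starts. A tiny illustration with a stand-in command (not the vLLM launch above):

import subprocess

port = 8877
# Passing the int directly would raise: TypeError: expected str, bytes or os.PathLike object, not int
# subprocess.Popen(['python', '-c', 'import sys; print(sys.argv[1:])', '--port', port])
proc = subprocess.Popen(['python', '-c', 'import sys; print(sys.argv[1:])', '--port', str(port)])
proc.wait()  # prints ['--port', '8877']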
evalscope/run.py CHANGED
@@ -10,12 +10,13 @@ from datetime import datetime
 from typing import List, Optional, Union
 
 from evalscope.arguments import parse_args
-from evalscope.config import TaskConfig
-from evalscope.constants import DEFAULT_MODEL_REVISION, DEFAULT_WORK_DIR, EvalBackend, EvalType, OutputsStructure
-from evalscope.evaluator import Evaluator, HumanevalEvaluator
+from evalscope.config import TaskConfig, parse_task_config
+from evalscope.constants import DEFAULT_MODEL_REVISION, DEFAULT_WORK_DIR, EvalBackend, EvalType
+from evalscope.evaluator import Evaluator
 from evalscope.models.custom import CustomModel
 from evalscope.utils import import_module_util, seed_everything
-from evalscope.utils.logger import get_logger
+from evalscope.utils.io_utils import OutputsStructure, are_paths_same
+from evalscope.utils.logger import configure_logging, get_logger
 
 logger = get_logger()
 
@@ -23,15 +24,6 @@ BENCHMARK_PATH_PREFIX = 'evalscope.benchmarks.'
 MEMBERS_TO_IMPORT = ['DATASET_ID', 'SUBSET_LIST', 'DataAdapterClass', 'ModelAdapterClass']
 
 
-def configure_logging(debug: bool, outputs: Optional[OutputsStructure]):
-    """Configure logging level based on the debug flag."""
-    if outputs:
-        log_file = os.path.join(outputs.logs_dir, 'eval_log.log')
-        get_logger(log_file=log_file, force=True)
-    if debug:
-        get_logger(log_level=logging.DEBUG, force=True)
-
-
 def run_task(task_cfg: Union[str, dict, TaskConfig, List[TaskConfig], Namespace]) -> Union[dict, List[dict]]:
     """Run evaluation task(s) based on the provided configuration."""
     run_time = datetime.now().strftime('%Y%m%d_%H%M%S')
@@ -48,35 +40,15 @@ def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:
     """Run a single evaluation task."""
     seed_everything(task_cfg.seed)
     outputs = setup_work_directory(task_cfg, run_time)
-    configure_logging(task_cfg.debug, outputs)
+    configure_logging(task_cfg.debug, os.path.join(outputs.logs_dir, 'eval_log.log'))
 
+    task_cfg.dump_yaml(outputs.configs_dir)
     logger.info(task_cfg)
 
-    return evaluate_model(task_cfg, outputs)
-
-
-def parse_task_config(task_cfg) -> TaskConfig:
-    """Parse task configuration from various formats into a TaskConfig object."""
-    if isinstance(task_cfg, TaskConfig):
-        logger.info('Args: Task config is provided with TaskConfig type.')
-    elif isinstance(task_cfg, dict):
-        logger.info('Args: Task config is provided with dictionary type.')
-        task_cfg = TaskConfig.from_dict(task_cfg)
-    elif isinstance(task_cfg, Namespace):
-        logger.info('Args: Task config is provided with CommandLine type.')
-        task_cfg = TaskConfig.from_args(task_cfg)
-    elif isinstance(task_cfg, str):
-        extension = task_cfg.split('.')[-1]
-        logger.info(f'Args: Task config is provided with {extension} file type.')
-        if extension in ['yaml', 'yml']:
-            task_cfg = TaskConfig.from_yaml(task_cfg)
-        elif extension == 'json':
-            task_cfg = TaskConfig.from_json(task_cfg)
-        else:
-            raise ValueError('Args: Unsupported file extension.')
+    if task_cfg.eval_backend != EvalBackend.NATIVE:
+        return run_non_native_backend(task_cfg)
     else:
-        raise ValueError('Args: Please provide a valid task config.')
-    return task_cfg
+        return evaluate_model(task_cfg, outputs)
 
 
 def setup_work_directory(task_cfg: TaskConfig, run_time: str):
  def setup_work_directory(task_cfg: TaskConfig, run_time: str):
@@ -84,10 +56,15 @@ def setup_work_directory(task_cfg: TaskConfig, run_time: str):
84
56
  if task_cfg.use_cache:
85
57
  task_cfg.work_dir = task_cfg.use_cache
86
58
  logger.info(f'Set resume from {task_cfg.work_dir}')
87
- elif task_cfg.work_dir == DEFAULT_WORK_DIR:
59
+ elif are_paths_same(task_cfg.work_dir, DEFAULT_WORK_DIR):
88
60
  task_cfg.work_dir = os.path.join(task_cfg.work_dir, run_time)
89
61
 
90
62
  outputs = OutputsStructure(outputs_dir=task_cfg.work_dir)
63
+
64
+ if task_cfg.eval_backend == EvalBackend.OPEN_COMPASS:
65
+ task_cfg.eval_config['time_str'] = run_time
66
+ elif task_cfg.eval_backend == EvalBackend.VLM_EVAL_KIT:
67
+ task_cfg.eval_config['work_dir'] = task_cfg.work_dir
91
68
  return outputs
92
69
 
93
70
 
@@ -125,10 +102,6 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
     """Evaluate the model based on the provided task configuration."""
     # Initialize evaluator
     eval_results = {}
-    task_cfg.dump_yaml(outputs.configs_dir)
-
-    if task_cfg.eval_backend != EvalBackend.NATIVE:
-        return run_non_native_backend(task_cfg)
 
     for dataset_name in task_cfg.datasets:
         evaluator = create_evaluator(task_cfg, dataset_name, outputs)
@@ -143,45 +116,35 @@ def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsSt
     imported_modules = import_module_util(BENCHMARK_PATH_PREFIX, dataset_name, MEMBERS_TO_IMPORT)
     model_adapter = initialize_model_adapter(task_cfg, dataset_name, imported_modules)
 
-    if dataset_name == 'humaneval':
-        problem_file = task_cfg.dataset_args.get('humaneval', {}).get('local_path')
-        return HumanevalEvaluator(
-            problem_file=problem_file,
-            model_id=task_cfg.model,
-            model_revision=task_cfg.model_args.get('revision', DEFAULT_MODEL_REVISION),
-            model_adapter=model_adapter,
-            outputs=outputs,
-            is_custom_outputs_dir=False,
-        )
-    else:
-        dataset_config = task_cfg.dataset_args.get(dataset_name, {})
-        dataset_name_or_path = dataset_config.get('local_path') or imported_modules['DATASET_ID']
-        in_prompt_template = dataset_config.get('prompt_template', '')
-        few_shot_num = dataset_config.get('few_shot_num', None)
-        few_shot_random = dataset_config.get('few_shot_random', True)
-
-        data_adapter = imported_modules['DataAdapterClass'](
-            few_shot_num=few_shot_num,
-            few_shot_random=few_shot_random,
-            prompt_template=in_prompt_template,
-        )
-        in_subset_list = dataset_config.get('subset_list', imported_modules['SUBSET_LIST'])
-
-        logger.info(f'Evaluating on subsets for {dataset_name}: {in_subset_list}\n')
-
-        return Evaluator(
-            dataset_name_or_path=dataset_name_or_path,
-            subset_list=in_subset_list,
-            data_adapter=data_adapter,
-            model_adapter=model_adapter,
-            use_cache=task_cfg.use_cache,
-            outputs=outputs,
-            datasets_dir=task_cfg.dataset_dir,
-            datasets_hub=task_cfg.dataset_hub,
-            stage=task_cfg.stage,
-            eval_type=task_cfg.eval_type,
-            overall_task_cfg=task_cfg,
-        )
+    dataset_config = task_cfg.dataset_args.get(dataset_name, {})
+    dataset_name_or_path = dataset_config.get('local_path') or imported_modules['DATASET_ID']
+    in_prompt_template = dataset_config.get('prompt_template', '')
+    few_shot_num = dataset_config.get('few_shot_num', None)
+    few_shot_random = dataset_config.get('few_shot_random', True)
+
+    data_adapter = imported_modules['DataAdapterClass'](
+        few_shot_num=few_shot_num,
+        few_shot_random=few_shot_random,
+        prompt_template=in_prompt_template,
+        outputs=outputs,
+    )
+    in_subset_list = dataset_config.get('subset_list', imported_modules['SUBSET_LIST'])
+
+    logger.info(f'Evaluating on subsets for {dataset_name}: {in_subset_list}\n')
+
+    return Evaluator(
+        dataset_name_or_path=dataset_name_or_path,
+        subset_list=in_subset_list,
+        data_adapter=data_adapter,
+        model_adapter=model_adapter,
+        use_cache=task_cfg.use_cache,
+        outputs=outputs,
+        datasets_dir=task_cfg.dataset_dir,
+        datasets_hub=task_cfg.dataset_hub,
+        stage=task_cfg.stage,
+        eval_type=task_cfg.eval_type,
+        overall_task_cfg=task_cfg,
+    )
 
 
 def initialize_model_adapter(task_cfg: TaskConfig, dataset_name: str, imported_modules):
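With parse_task_config moved into evalscope.config, run_task keeps accepting a TaskConfig, a plain dict, a YAML/JSON path, or an argparse Namespace. A hedged usage sketch follows; the model id and dataset settings are placeholders, not defaults shipped with the package:

from evalscope.run import run_task

task_cfg = {
    'model': 'qwen/Qwen2-0.5B-Instruct',             # placeholder model id
    'datasets': ['gsm8k'],                           # any benchmark from the list above
    'dataset_args': {'gsm8k': {'few_shot_num': 4}},  # keys mirror the dataset_config lookups in the diff
    'debug': False,
}
run_task(task_cfg)  # parsed via parse_task_config, then routed to the native or external backend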
evalscope/run_arena.py CHANGED
@@ -11,7 +11,8 @@ from tqdm import tqdm
 from evalscope.constants import EvalConfigKeys
 from evalscope.evaluator.rating_eval import RatingEvaluate
 from evalscope.models.model_adapter import ChatGenerationModelAdapter
-from evalscope.utils import dump_jsonl_data, get_obj_from_cfg, jsonl_to_list, yaml_to_dict
+from evalscope.utils import get_obj_from_cfg
+from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list, yaml_to_dict
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
evalscope/summarizer.py CHANGED
@@ -4,10 +4,11 @@ import json
 import os
 from typing import List, Union
 
-from evalscope.config import TaskConfig
-from evalscope.constants import EvalBackend, OutputsStructure
+from evalscope.config import TaskConfig, parse_task_config
+from evalscope.constants import EvalBackend
 from evalscope.tools.combine_reports import gen_table
-from evalscope.utils import csv_to_list, get_latest_folder_path, json_to_dict, yaml_to_dict
+from evalscope.utils import csv_to_list, get_latest_folder_path
+from evalscope.utils.io_utils import OutputsStructure, json_to_dict, yaml_to_dict
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -24,7 +25,7 @@ class Summarizer:
         if reports_dir is None:
             raise ValueError(f'No reports directory in {outputs_dir}')
 
-        report_files: list = glob.glob(os.path.join(reports_dir, '*.json'))
+        report_files: list = glob.glob(os.path.join(reports_dir, '**/*.json'))
         for report_file in report_files:
             with open(report_file, 'r') as f:
                 res_list.append(json.load(f))
@@ -47,33 +48,20 @@ class Summarizer:
         A report dict is overall report on a benchmark for specific model.
         """
         final_res_list: List[dict] = []
-        candidate_task_cfgs: List[dict] = []
-
-        if isinstance(task_cfg, dict):
-            candidate_task_cfgs = [task_cfg]
-        elif isinstance(task_cfg, str):
-            task_cfg: dict = yaml_to_dict(task_cfg)
-            candidate_task_cfgs = [task_cfg]
-        elif isinstance(task_cfg, TaskConfig):
-            task_cfg: dict = task_cfg.to_dict()
-            candidate_task_cfgs = [task_cfg]
-        elif isinstance(task_cfg, list):
+        candidate_task_cfgs: List[TaskConfig] = []
+
+        if isinstance(task_cfg, list):
             for task_cfg_item in task_cfg:
-                if isinstance(task_cfg_item, str):
-                    task_cfg_item: dict = yaml_to_dict(task_cfg_item)
-                elif isinstance(task_cfg_item, TaskConfig):
-                    task_cfg_item: dict = task_cfg_item.to_dict()
-                candidate_task_cfgs.append(task_cfg_item)
+                candidate_task_cfgs.append(parse_task_config(task_cfg_item))
         else:
-            raise ValueError(f'Invalid task_cfg: {task_cfg}')
+            candidate_task_cfgs.append(parse_task_config(task_cfg))
 
         for candidate_task in candidate_task_cfgs:
             logger.info(f'**Loading task cfg for summarizer: {candidate_task}')
-            eval_backend = candidate_task.get('eval_backend') or EvalBackend.NATIVE
+            eval_backend = candidate_task.eval_backend
 
             if eval_backend == EvalBackend.NATIVE:
-                outputs_dir: str = candidate_task.get('outputs')
-                outputs_dir: str = os.path.expanduser(outputs_dir)
+                outputs_dir: str = os.path.expanduser(candidate_task.work_dir)
                 if outputs_dir is None:
                     raise ValueError(f'No outputs_dir in {task_cfg}')
                 res_list: list = Summarizer.get_report(outputs_dir=outputs_dir)
@@ -128,8 +116,8 @@ class Summarizer:
         return final_res_list
 
     @staticmethod
-    def parse_eval_config(candidate_task):
-        eval_config: Union[str, dict] = candidate_task.get('eval_config')
+    def parse_eval_config(candidate_task: TaskConfig):
+        eval_config: Union[str, dict] = candidate_task.eval_config
         assert eval_config is not None, 'Please provide eval_config for specific evaluation backend.'
 
         if isinstance(eval_config, str):
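A hedged usage sketch of the summarizer path touched above; the work_dir is a placeholder, and the report keys shown are the ones read by combine_reports below:

from evalscope.summarizer import Summarizer

# Collect the report JSONs written under a finished run's work_dir (placeholder path).
reports = Summarizer.get_report(outputs_dir='./outputs/20250101_000000')
for report in reports:
    print(report.get('dataset_name'), report.get('score'))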
evalscope/third_party/longbench_write/eval.py CHANGED
@@ -10,7 +10,8 @@ import requests
 from concurrent.futures import ThreadPoolExecutor
 from tqdm import tqdm
 
-from evalscope.utils import get_logger, jsonl_to_list
+from evalscope.utils import get_logger
+from evalscope.utils.io_utils import jsonl_to_list
 
 logger = get_logger()
 
evalscope/third_party/longbench_write/longbench_write.py CHANGED
@@ -4,7 +4,8 @@ from typing import Union
 
 from evalscope.third_party.longbench_write.eval import run_eval
 from evalscope.third_party.longbench_write.infer import run_infer
-from evalscope.utils import get_logger, json_to_dict, yaml_to_dict
+from evalscope.utils import get_logger
+from evalscope.utils.io_utils import json_to_dict, yaml_to_dict
 
 logger = get_logger()
 
evalscope/third_party/longbench_write/tools/data_etl.py CHANGED
@@ -6,7 +6,7 @@ from typing import List
 
 from evalscope.third_party.longbench_write.eval import EvalLength
 from evalscope.third_party.longbench_write.utils import chinese_to_arabic, count_words
-from evalscope.utils import jsonl_to_list
+from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
evalscope/third_party/toolbench_static/toolbench_static.py CHANGED
@@ -5,7 +5,8 @@ from typing import Union
 
 from evalscope.third_party.toolbench_static.eval import EvalArgs, run_eval
 from evalscope.third_party.toolbench_static.infer import InferArgs, run_infer
-from evalscope.utils import get_logger, json_to_dict, yaml_to_dict
+from evalscope.utils import get_logger
+from evalscope.utils.io_utils import json_to_dict, yaml_to_dict
 
 logger = get_logger()
 
evalscope/tools/combine_reports.py CHANGED
@@ -19,16 +19,14 @@ def get_report(report_file: str):
     dataset_name = data_d['dataset_name']
     model_name = data_d['model_name']
     score = data_d['score']  # float or dict
+    metric = data_d['metric']
     score_d = {}
     if isinstance(score, dict):
-        # score_d = dict([(k, round(v, 4) * 100) for k, v in score.items()])
         score_d = score
     elif isinstance(score, float):
-        # score_d['acc'] = round(score, 4) * 100
-        score_d['acc'] = score
+        score_d[metric] = score
     else:
         raise ValueError(f'Unknown score type: {type(score)}')
-    # score_str = '\n'.join([str(v) + ' (' + k + ')' for k, v in score_d.items()])
     score_str = '\n'.join(['(' + dataset_name + '/' + k + ') ' + str(v) for k, v in score_d.items()])
 
     return model_name, {'dataset_name': dataset_name, 'score': score_str}