evalscope 0.8.0__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope has been flagged as potentially problematic.

Files changed (47)
  1. evalscope/backend/base.py +1 -1
  2. evalscope/backend/rag_eval/utils/clip.py +2 -2
  3. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  4. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  5. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +2 -1
  6. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +2 -1
  7. evalscope/benchmarks/humaneval/humaneval_adapter.py +193 -7
  8. evalscope/benchmarks/race/race_adapter.py +2 -1
  9. evalscope/config.py +35 -1
  10. evalscope/constants.py +24 -38
  11. evalscope/evaluator/__init__.py +0 -1
  12. evalscope/evaluator/evaluator.py +5 -4
  13. evalscope/evaluator/rating_eval.py +1 -1
  14. evalscope/evaluator/reviewer/auto_reviewer.py +2 -1
  15. evalscope/perf/arguments.py +2 -1
  16. evalscope/perf/benchmark.py +2 -2
  17. evalscope/perf/main.py +2 -5
  18. evalscope/perf/plugin/api/openai_api.py +2 -2
  19. evalscope/perf/plugin/registry.py +3 -3
  20. evalscope/perf/utils/benchmark_util.py +4 -4
  21. evalscope/perf/utils/db_util.py +66 -22
  22. evalscope/perf/utils/local_server.py +3 -1
  23. evalscope/run.py +45 -82
  24. evalscope/run_arena.py +2 -1
  25. evalscope/summarizer.py +14 -26
  26. evalscope/third_party/longbench_write/eval.py +2 -1
  27. evalscope/third_party/longbench_write/longbench_write.py +2 -1
  28. evalscope/third_party/longbench_write/tools/data_etl.py +1 -1
  29. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  30. evalscope/tools/combine_reports.py +2 -4
  31. evalscope/tools/rewrite_eval_results.py +1 -1
  32. evalscope/utils/__init__.py +1 -0
  33. evalscope/utils/chat_service.py +1 -1
  34. evalscope/utils/io_utils.py +162 -0
  35. evalscope/utils/logger.py +8 -0
  36. evalscope/utils/utils.py +0 -175
  37. evalscope/version.py +2 -2
  38. {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/METADATA +1 -1
  39. {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/RECORD +46 -46
  40. tests/cli/test_run.py +11 -12
  41. tests/perf/test_perf.py +2 -1
  42. tests/vlm/test_vlmeval.py +3 -2
  43. evalscope/evaluator/humaneval_evaluator.py +0 -158
  44. {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
  45. {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
  46. {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
  47. {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0
evalscope/backend/base.py CHANGED
@@ -2,7 +2,7 @@
 from typing import Union

 from evalscope.config import TaskConfig
-from evalscope.utils import yaml_to_dict
+from evalscope.utils.io_utils import yaml_to_dict


 class BackendManager:
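
The recurring theme of this release is that file I/O helpers moved from evalscope.utils into a new evalscope.utils.io_utils module. As a rough orientation, a hedged sketch of the new import homes, collected only from the import lines changed in the hunks below (not an exhaustive list):

# Hedged sketch: import homes in 0.8.1, inferred from the import diffs in this release.
from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list, yaml_to_dict
# Hashing and score helpers stay where they were.
from evalscope.utils import gen_hash, normalize_score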
evalscope/backend/rag_eval/utils/clip.py CHANGED
@@ -4,7 +4,7 @@ import torch.nn.functional as F
 from langchain_core.embeddings import Embeddings
 from PIL import Image
 from transformers import AutoModel, AutoProcessor
-from typing import List
+from typing import List, Union

 from evalscope.backend.rag_eval.utils.tools import PIL_to_base64, download_model
 from evalscope.constants import HubType
@@ -86,7 +86,7 @@ class CLIPModel(Embeddings):
         self.transform = self.processor.image_processor
         self.tokenizer = self.processor.tokenizer

-    def encode_text(self, batch_texts: List[str] | List[List[str]]):
+    def encode_text(self, batch_texts: Union[List[str], List[List[str]]]):
         if isinstance(batch_texts[0], list):
             batch_texts = [text for _, texts in enumerate(batch_texts) for text in texts]
         # Ensure that the input texts are within the token limit
evalscope/backend/rag_eval/utils/embedding.py CHANGED
@@ -80,7 +80,7 @@ class BaseModel(Embeddings):
         """Embed query text. Compact mteb."""
         raise NotImplementedError

-    def encode_corpus(self, corpus: List[str] | List[Dict[str, str]], **kwargs) -> list[torch.Tensor]:
+    def encode_corpus(self, corpus: Union[List[str], List[Dict[str, str]]], **kwargs) -> list[torch.Tensor]:
         """Embed search docs . Compact mteb."""
         raise NotImplementedError

evalscope/benchmarks/general_qa/general_qa_adapter.py CHANGED
@@ -8,7 +8,7 @@ from typing import Any, Optional
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.metrics.metrics import bleu_ngram_one_sample, weighted_mean
 from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
-from evalscope.utils import jsonl_to_list
+from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger

 logger = get_logger()
evalscope/benchmarks/gsm8k/gsm8k_adapter.py CHANGED
@@ -6,7 +6,8 @@ import re

 from evalscope.benchmarks import DataAdapter
 from evalscope.metrics.metrics import exact_match, weighted_mean
-from evalscope.utils import jsonl_to_list, normalize_score
+from evalscope.utils import normalize_score
+from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger

 # flake8: noqa
evalscope/benchmarks/hellaswag/hellaswag_adapter.py CHANGED
@@ -5,7 +5,8 @@ import re

 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.metrics.metrics import exact_match, weighted_mean
-from evalscope.utils import jsonl_to_list, normalize_score
+from evalscope.utils import normalize_score
+from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger

 # flake8: noqa
evalscope/benchmarks/humaneval/humaneval_adapter.py CHANGED
@@ -1,20 +1,206 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import json
+import os
+import re
+from tqdm import tqdm
+from typing import List

-# flake8: noqa
+from evalscope.benchmarks.data_adapter import DataAdapter
+from evalscope.metrics.metrics import weighted_mean
+from evalscope.tools.combine_reports import gen_table
+from evalscope.utils import normalize_score
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()

 DATASET_ID = 'modelscope/humaneval'
 SUBSET_LIST = ['openai_humaneval']

-# Note: ONLY FOR CLASS IMPORT, No implementation here.
-
 # Example:
-# {"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": " for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n"}
+# {"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": " for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n"} # noqa


-class HumanevalAdapter:
+class HumanevalAdapter(DataAdapter):
     """
     A placeholder for humaneval adapter, see HumanevalEvaluator for implementation.
     """

-    def __init__(self):
-        ...
+    def __init__(self,
+                 subset_list: list = None,
+                 metric_list: list = None,
+                 few_shot_num: int = None,
+                 train_split: str = None,
+                 eval_split: str = 'test',
+                 prompt_template: str = 'Complete the following python code:\n',
+                 **kwargs):
+        try:
+            from human_eval.data import stream_jsonl, write_jsonl
+            from human_eval.evaluation import check_correctness
+        except ImportError:
+            raise ImportError('Please install human_eval:'
+                              'https://github.com/openai/human-eval/tree/master#installation , '
+                              'Note that you need to enable the execution code in the human_eval/execution.py first.')
+
+        if subset_list is None:
+            subset_list = SUBSET_LIST
+
+        if metric_list is None:
+            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
+
+        self.k = [1]
+        self.num_workers = 4
+        self.timeout = 4.0
+        self.outputs = kwargs.get('outputs', None)
+
+        self.read_problems_func = stream_jsonl
+        self.write_jsonl_func = write_jsonl
+        self.eval_func = check_correctness
+
+        super().__init__(
+            subset_list=subset_list,
+            metric_list=metric_list,
+            few_shot_num=few_shot_num,
+            train_split=train_split,
+            eval_split=eval_split,
+            prompt_template=prompt_template,
+            **kwargs)
+
+    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
+        data_dict = {}
+        for subset_name in subset_list:
+            data_dict[subset_name] = {}
+            # [{'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}, ...]
+            data_dict[subset_name][self.eval_split] = [task for task in self.read_problems_func(dataset_name_or_path)]
+
+        return data_dict
+
+    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
+        """
+        Generate prompt for the model.
+
+        Args:
+            input_d (dict): The raw input. A single data format of the Humaneval:
+            {'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}
+        """
+        full_prompt = input_d['prompt']
+        full_prompt = f'{self.prompt_template}\n{full_prompt}' if self.prompt_template else full_prompt
+
+        return {'data': [full_prompt]}
+
+    def get_answers(self, infer_cfg: dict) -> List[dict]:
+        ans_list: list = []
+        system_prompt: str = ''
+        for task_id, data_d in tqdm(self.problems.items(), total=len(self.problems), desc='Predicting(problems)'):
+            prompt: str = system_prompt + data_d['prompt']
+            inputs: dict = {'data': [prompt]}
+
+            pred_res: dict = self.model_adapter.predict(inputs=inputs, infer_cfg=infer_cfg)
+
+            pred_ans: str = pred_res['choices'][0]['message']['content']
+            pred_ans = self._postprocess(pred_ans)
+
+            ans_list.append({'task_id': task_id, 'completion': pred_ans})
+
+        return ans_list
+
+    def eval(self, infer_cfg: dict, **kwargs):
+
+        # predict
+        ans_list: list = self.get_answers(infer_cfg)
+        ans_out_file: str = os.path.join(self.outputs_structure.predictions_dir, 'human_eval_predictions.jsonl')
+
+        self.write_jsonl_func(filename=ans_out_file, data=ans_list)
+        # logger.info(f'** Dump predictions to {ans_out_file} successfully.')
+        logger.info('** Dump predictions successfully.')
+
+        # evaluate results: e.g. {'pass@1': 0.333, 'pass@10': 0.111}
+        results = self.eval_func(
+            sample_file=ans_out_file,
+            k=self.k,
+            n_workers=self.num_workers,
+            timeout=self.timeout,
+            problem_file=self.problem_file)
+
+        # output: report
+        report_map: dict = self.gen_report(results=results)
+        report_dir: str = self.outputs_structure.reports_dir
+        report_file: str = os.path.join(report_dir, 'human_eval_report.json')
+
+        with open(report_file, 'w') as f:
+            f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
+        # logger.info(f'** Dump report to {report_file} \n')
+        logger.info('** Dump report \n')
+
+        try:
+            # Make table
+            report_table: str = gen_table([report_dir])
+            logger.info(f'** Report table: \n {report_table} \n')
+        except Exception:
+            logger.error('Failed to generate report table.')
+
+    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
+        total_num: int = sum([num for _, num in subset_score_map.values()])
+        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
+        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
+        cate_avg_list = [{
+            'name': subset_name,
+            'score': normalize_score(score=score)
+        } for subset_name, (score, _) in subset_score_map.items()]
+
+        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
+
+        res_map = dict(
+            name=report_name or 'HumanEval',
+            metric='pass@1',
+            score=weighted_avg_acc,
+            category=[category_d],
+            total_num=total_num)
+
+        return res_map
+
+    @classmethod
+    def _postprocess(cls, text: str) -> str:
+        if '```' in text:
+            blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
+            if len(blocks) == 0:
+                text = text.split('```')[1]  # fall back to default strategy
+            else:
+                text = blocks[0]  # fetch the first code block
+                if not text.startswith('\n'):  # in case starting with ```python
+                    text = text[max(text.find('\n') + 1, 0):]
+        if text.strip().startswith('from') or text.strip().startswith('import'):
+            def_idx = text.find('def')
+            if def_idx != -1:
+                text = text[max(text.find('\n', def_idx) + 1, 0):]
+        text = text.split('\n\n')[0]
+        if text.strip().startswith('def'):
+            text = '\n'.join(text.split('\n')[1:])
+        if not text.startswith('    '):
+            if text.startswith(' '):
+                text = '    ' + text.lstrip()
+            else:
+                text = '\n'.join(['    ' + line for line in text.split('\n')])
+        return text
+
+    def compute_metric(self, review_res_list: list) -> float:
+        """
+        Compute evaluation result by specific metric.
+
+        Args:
+            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
+
+        Returns:
+            The metric score.
+        """
+        items = [(score, 1.0) for score in review_res_list]
+        return weighted_mean(items)
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+        return self._postprocess(result)
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        return input_d
+
+    def match(self, gold: str, pred: str) -> float:
+        res = self.eval_func(gold, pred, self.timeout)
+        return float(res['passed'])
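
The bulk of the new adapter is the _postprocess helper, which trims a raw model completion down to a function body that can be appended to the HumanEval prompt. A minimal standalone sketch of that extraction idea (not the adapter itself; extract_completion is a hypothetical name):

import re
import textwrap


def extract_completion(text: str) -> str:
    """Keep only the code that should follow the HumanEval prompt (hypothetical sketch)."""
    blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
    if blocks:
        text = blocks[0]
        if not text.startswith('\n'):  # drop a leading '```python' language tag
            text = text[text.find('\n') + 1:]
    body = text.split('\n\n')[0]  # keep the first top-level chunk
    if body.strip().startswith('def'):  # drop the signature, keep only the body
        body = '\n'.join(body.split('\n')[1:])
    return textwrap.indent(textwrap.dedent(body), '    ')


response = "```python\ndef add(a, b):\n    return a + b\n```"
print(extract_completion(response))  # prints '    return a + b'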
evalscope/benchmarks/race/race_adapter.py CHANGED
@@ -5,7 +5,8 @@ import os

 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.metrics.metrics import exact_match, weighted_mean
-from evalscope.utils import jsonl_to_list, normalize_score
+from evalscope.utils import normalize_score
+from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger

 # flake8: noqa
evalscope/config.py CHANGED
@@ -9,7 +9,8 @@ from typing import Dict, List, Optional, Union

 from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, DEFAULT_WORK_DIR, EvalBackend, EvalStage, EvalType, HubType
 from evalscope.models.custom import CustomModel
-from evalscope.utils import dict_to_yaml, gen_hash, json_to_dict, yaml_to_dict
+from evalscope.utils import gen_hash
+from evalscope.utils.io_utils import dict_to_yaml, json_to_dict, yaml_to_dict
 from evalscope.utils.logger import get_logger

 logger = get_logger()
@@ -31,6 +32,7 @@ DEFAULT_GENERATION_CONFIG = {
 class TaskConfig:
     # Model-related arguments
     model: Union[str, CustomModel, None] = None
+    model_id: Optional[str] = None
     model_args: Optional[Dict] = field(default_factory=lambda: DEFAULT_MODEL_ARGS | {})

     # Template-related arguments
@@ -64,6 +66,13 @@ class TaskConfig:
     dry_run: bool = False
     seed: int = 42

+    def __post_init__(self):
+        if (not self.model_id) and self.model:
+            if isinstance(self.model, CustomModel):
+                self.model_id = type(self.model).__name__
+            else:
+                self.model_id = os.path.basename(self.model).rstrip(os.sep)
+
     def to_dict(self):
         # Note: to avoid serialization error for some model instance
         return self.__dict__
@@ -119,6 +128,7 @@
                 continue

             task.model = custom_model
+            task.model_id = type(custom_model).__name__
             res_list.append(task)

         return res_list
@@ -168,6 +178,30 @@ tasks = ['arc', 'gsm8k', 'mmlu', 'cmmlu', 'ceval', 'bbh', 'general_qa']
 registry_tasks = {task: TaskConfig.from_yaml(os.path.join(cur_path, f'registry/tasks/{task}.yaml')) for task in tasks}


+def parse_task_config(task_cfg) -> TaskConfig:
+    """Parse task configuration from various formats into a TaskConfig object."""
+    if isinstance(task_cfg, TaskConfig):
+        logger.info('Args: Task config is provided with TaskConfig type.')
+    elif isinstance(task_cfg, dict):
+        logger.info('Args: Task config is provided with dictionary type.')
+        task_cfg = TaskConfig.from_dict(task_cfg)
+    elif isinstance(task_cfg, Namespace):
+        logger.info('Args: Task config is provided with CommandLine type.')
+        task_cfg = TaskConfig.from_args(task_cfg)
+    elif isinstance(task_cfg, str):
+        extension = task_cfg.split('.')[-1]
+        logger.info(f'Args: Task config is provided with {extension} file type.')
+        if extension in ['yaml', 'yml']:
+            task_cfg = TaskConfig.from_yaml(task_cfg)
+        elif extension == 'json':
+            task_cfg = TaskConfig.from_json(task_cfg)
+        else:
+            raise ValueError('Args: Unsupported file extension.')
+    else:
+        raise ValueError('Args: Please provide a valid task config.')
+    return task_cfg
+
+
 class TempModel(CustomModel):

     def __init__(self, config: dict):
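
A hedged usage sketch of the new model_id field and parse_task_config shown above; the datasets field is assumed to exist on TaskConfig and the model path is made up:

# Hedged sketch, assuming evalscope 0.8.1 is installed and TaskConfig has a 'datasets' field.
from evalscope.config import TaskConfig, parse_task_config

cfg = parse_task_config(TaskConfig(model='qwen/Qwen2-0.5B-Instruct', datasets=['gsm8k']))
print(cfg.model_id)  # -> 'Qwen2-0.5B-Instruct', filled in by __post_init__ from the model path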
evalscope/constants.py CHANGED
@@ -1,5 +1,4 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os
 from modelscope.utils.constant import DEFAULT_REPOSITORY_REVISION
 from modelscope.utils.file_utils import get_dataset_cache_root, get_model_cache_root

@@ -7,6 +6,7 @@ DEFAULT_WORK_DIR = './outputs'
 DEFAULT_MODEL_REVISION = DEFAULT_REPOSITORY_REVISION  # master
 DEFAULT_MODEL_CACHE_DIR = get_model_cache_root()  # ~/.cache/modelscope/hub
 DEFAULT_DATASET_CACHE_DIR = get_dataset_cache_root()  # ~/.cache/modelscope/datasets
+DEFAULT_ROOT_CACHE_DIR = DEFAULT_DATASET_CACHE_DIR  # compatible with old version


 class HubType:
@@ -76,33 +76,6 @@ class ArenaMode:
     PAIRWISE_BASELINE = 'pairwise_baseline'


-class OutputsStructure:
-    LOGS_DIR = 'logs'
-    PREDICTIONS_DIR = 'predictions'
-    REVIEWS_DIR = 'reviews'
-    REPORTS_DIR = 'reports'
-    CONFIGS_DIR = 'configs'
-
-    def __init__(self, outputs_dir: str, is_make: bool = True):
-        self.outputs_dir = outputs_dir
-        self.logs_dir = os.path.join(outputs_dir, OutputsStructure.LOGS_DIR)
-        self.predictions_dir = os.path.join(outputs_dir, OutputsStructure.PREDICTIONS_DIR)
-        self.reviews_dir = os.path.join(outputs_dir, OutputsStructure.REVIEWS_DIR)
-        self.reports_dir = os.path.join(outputs_dir, OutputsStructure.REPORTS_DIR)
-        self.configs_dir = os.path.join(outputs_dir, OutputsStructure.CONFIGS_DIR)
-
-        if is_make:
-            self.create_directories()
-
-    def create_directories(self):
-        os.makedirs(self.outputs_dir, exist_ok=True)
-        os.makedirs(self.logs_dir, exist_ok=True)
-        os.makedirs(self.predictions_dir, exist_ok=True)
-        os.makedirs(self.reviews_dir, exist_ok=True)
-        os.makedirs(self.reports_dir, exist_ok=True)
-        os.makedirs(self.configs_dir, exist_ok=True)
-
-
 class AnswerKeys:
     ANSWER_ID = 'answer_id'
     RAW_INPUT = 'raw_input'
@@ -166,17 +139,30 @@ class EvalType:


 class EvalBackend:
-    # Use native evaluation pipeline of EvalScope
-    NATIVE = 'Native'

-    # Use OpenCompass framework as the evaluation backend
-    OPEN_COMPASS = 'OpenCompass'
+    class _Backend:
+        # compatible with old version, set 'value'
+
+        def __init__(self, value):
+            self._value = value
+
+        @property
+        def value(self):
+            return self._value
+
+        def __str__(self):
+            return self._value

-    # Use VLM Eval Kit as the multi-modal model evaluation backend
-    VLM_EVAL_KIT = 'VLMEvalKit'
+        def __repr__(self):
+            return f"'{self._value}'"

-    # Use RAGEval as the RAG evaluation backend
-    RAG_EVAL = 'RAGEval'
+        def __eq__(self, other):
+            if isinstance(other, str):
+                return self._value == other
+            return NotImplemented

-    # Use third-party evaluation backend/modules
-    THIRD_PARTY = 'ThirdParty'
+    NATIVE = _Backend('Native')
+    OPEN_COMPASS = _Backend('OpenCompass')
+    VLM_EVAL_KIT = _Backend('VLMEvalKit')
+    RAG_EVAL = _Backend('RAGEval')
+    THIRD_PARTY = _Backend('ThirdParty')
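
The _Backend wrapper above exists for backward compatibility: 0.8.0 code that compared EvalBackend members against plain strings, and enum-style code that read .value, should both keep working. A small sketch of the behaviour implied by the new __eq__, __str__ and value definitions:

from evalscope.constants import EvalBackend

assert EvalBackend.NATIVE == 'Native'  # __eq__ falls back to string comparison
assert str(EvalBackend.OPEN_COMPASS) == 'OpenCompass'  # __str__ returns the raw value
assert EvalBackend.RAG_EVAL.value == 'RAGEval'  # enum-style .value kept for old callers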
evalscope/evaluator/__init__.py CHANGED
@@ -1,4 +1,3 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

 from evalscope.evaluator.evaluator import Evaluator
-from evalscope.evaluator.humaneval_evaluator import HumanevalEvaluator
evalscope/evaluator/evaluator.py CHANGED
@@ -11,10 +11,11 @@ from typing import Any, Dict, List, Optional, Union
 from evalscope.benchmarks import DataAdapter
 from evalscope.config import TaskConfig
 from evalscope.constants import (DEFAULT_DATASET_CACHE_DIR, AnswerKeys, DumpMode, EvalStage, EvalType, HubType,
-                                 OutputsStructure, ReviewKeys)
+                                 ReviewKeys)
 from evalscope.models.model_adapter import BaseModelAdapter, CustomModelAdapter
 from evalscope.tools.combine_reports import gen_table
-from evalscope.utils import dict_torch_dtype_to_str, dump_jsonl_data, gen_hash, jsonl_to_list
+from evalscope.utils import dict_torch_dtype_to_str, gen_hash
+from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
 from evalscope.utils.logger import get_logger

 logger = get_logger()
@@ -56,8 +57,8 @@ class Evaluator(object):
                  **kwargs):

         self.dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
-        self.dataset_name = os.path.basename(self.dataset_name_or_path.rstrip(os.sep))
-        self.model_name = os.path.basename(str(overall_task_cfg.model).rstrip(os.sep))
+        self.dataset_name = os.path.basename(self.dataset_name_or_path.rstrip(os.sep)).split('.')[0]
+        self.model_name = overall_task_cfg.model_id
         self.custom_task_name = f'{self.model_name}_{self.dataset_name}'

         self.datasets_dir = os.path.expanduser(datasets_dir)
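
The two changed lines in Evaluator.__init__ affect how output paths are named: dataset_name now drops the file extension, and model_name comes from the new TaskConfig.model_id rather than the raw model argument. A small sketch of the dataset_name change:

import os

dataset_name_or_path = 'custom_data/gsm8k.jsonl'
# 0.8.0 behaviour: 'gsm8k.jsonl'; 0.8.1 adds .split('.')[0] and yields 'gsm8k'.
dataset_name = os.path.basename(dataset_name_or_path.rstrip(os.sep)).split('.')[0]
print(dataset_name)  # -> gsm8k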
evalscope/evaluator/rating_eval.py CHANGED
@@ -5,8 +5,8 @@ import pyarrow as pa
 from typing import List, Union

 from evalscope.constants import MetricMembers
-from evalscope.utils import jsonl_to_list
 from evalscope.utils.arena_utils import compute_elo
+from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger

 logger = get_logger()
evalscope/evaluator/reviewer/auto_reviewer.py CHANGED
@@ -12,8 +12,9 @@ from typing import Any, List

 from evalscope.constants import ArenaMode, EvalConfigKeys, FnCompletionParser, PositionBiasMitigation
 from evalscope.models.openai_model import OpenAIModel
-from evalscope.utils import completion_parsers, dump_jsonl_data, jsonl_to_list, random_seeded_choice
+from evalscope.utils import completion_parsers, random_seeded_choice
 from evalscope.utils.arena_utils import get_battle_pairs, merge_ques_ans, shuffle_pairwise_preferences
+from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list
 from evalscope.utils.logger import get_logger

 logger = get_logger()
evalscope/perf/arguments.py CHANGED
@@ -16,7 +16,7 @@ class Arguments:
     attn_implementation: Optional[str] = None  # Attention implementaion, only for local inference
     api: str = 'openai'  # API to be used (default: 'openai')
     tokenizer_path: Optional[str] = None  # Path to the tokenizer
-    port: str = '8877'  # Port number for the local API server
+    port: int = 8877  # Port number for the local API server

     # Connection settings
     url: str = 'http://127.0.0.1:8877/v1/chat/completions'  # URL for the API connection
@@ -138,6 +138,7 @@ def add_argument(parser: argparse.ArgumentParser):

     # Connection settings
     parser.add_argument('--url', type=str, default='http://127.0.0.1:8877/v1/chat/completions')
+    parser.add_argument('--port', type=int, default=8877, help='The port for local inference')
    parser.add_argument('--headers', nargs='+', dest='headers', action=ParseKVAction, help='Extra HTTP headers')
    parser.add_argument('--api-key', type=str, required=False, default='EMPTY', help='The API key for authentication')
    parser.add_argument('--connect-timeout', type=int, default=120, help='The network connection timeout')
evalscope/perf/benchmark.py CHANGED
@@ -195,9 +195,9 @@ async def start_server(args: Arguments) -> bool:
     server.start()

     if args.dataset.startswith('speed_benchmark'):
-        args.url = 'http://127.0.0.1:8877/v1/completions'
+        args.url = f'http://127.0.0.1:{args.port}/v1/completions'
     else:
-        args.url = 'http://127.0.0.1:8877/v1/chat/completions'
+        args.url = f'http://127.0.0.1:{args.port}/v1/chat/completions'

     if not await test_connection(args):
         raise TimeoutError('Test connection failed')
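
With port now an integer argument, the locally started server URL follows --port instead of being hard-coded to 8877. A hedged sketch of the URL selection above, decoupled from the Arguments class (local_url is a hypothetical helper):

def local_url(port: int, speed_benchmark: bool) -> str:
    # Mirrors the branch in start_server(): speed benchmarks hit /v1/completions,
    # everything else hits /v1/chat/completions.
    path = 'completions' if speed_benchmark else 'chat/completions'
    return f'http://127.0.0.1:{port}/v1/{path}'


print(local_url(9000, False))  # -> http://127.0.0.1:9000/v1/chat/completions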
evalscope/perf/main.py CHANGED
@@ -8,7 +8,7 @@ from evalscope.perf.arguments import Arguments, parse_args
 from evalscope.perf.benchmark import benchmark
 from evalscope.perf.utils.db_util import get_output_path
 from evalscope.perf.utils.handler import add_signal_handlers
-from evalscope.utils.logger import get_logger
+from evalscope.utils.logger import configure_logging, get_logger
 from evalscope.utils.utils import seed_everything

 logger = get_logger()
@@ -23,10 +23,7 @@ def run_perf_benchmark(args):

     # Setup logger and output
     args.outputs_dir = get_output_path(args)
-    get_logger(log_file=os.path.join(args.outputs_dir, 'benchmark.log'), force=True)
-
-    if args.debug:
-        get_logger(log_level=logging.DEBUG, force=True)
+    configure_logging(args.debug, os.path.join(args.outputs_dir, 'benchmark.log'))

     logger.info('Starting benchmark...')
     logger.info(args)
evalscope/perf/plugin/api/openai_api.py CHANGED
@@ -1,7 +1,7 @@
 import json
 import os
 from transformers import AutoTokenizer
-from typing import Any, Dict, Iterator, List
+from typing import Any, Dict, Iterator, List, Union

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.api.base import ApiPluginBase
@@ -29,7 +29,7 @@ class OpenaiPlugin(ApiPluginBase):
         else:
             self.tokenizer = None

-    def build_request(self, messages: List[Dict] | str, param: Arguments) -> Dict:
+    def build_request(self, messages: Union[List[Dict], str], param: Arguments) -> Dict:
         """Build the openai format request based on prompt, dataset

         Args:
evalscope/perf/plugin/registry.py CHANGED
@@ -1,4 +1,4 @@
-from typing import Any, List, Type
+from typing import Any, List, Type, Union


 class PluginRegistry:
@@ -20,7 +20,7 @@ class PluginRegistry:
         return self.get_class(name)


-def register_dataset(name: str | List[str]):
+def register_dataset(name: Union[str, List[str]]):

     def class_decorator(cls: Type):
         if isinstance(name, str):
@@ -35,7 +35,7 @@ def register_dataset(name: str | List[str]):
     return class_decorator


-def register_api(name: str | List[str]):
+def register_api(name: Union[str, List[str]]):

     def class_decorator(cls: Type):
         if isinstance(name, str):
evalscope/perf/utils/benchmark_util.py CHANGED
@@ -116,19 +116,19 @@ class BenchmarkMetrics:

     def create_message(self, default_ndigits=3):
         message = {
-            'Time taken for tests (senconds)': round(self.total_time, default_ndigits),
+            'Time taken for tests (s)': round(self.total_time, default_ndigits),
             'Number of concurrency': self.concurrency,
             'Total requests': int(self.n_total_queries),
             'Succeed requests': self.n_succeed_queries,
             'Failed requests': self.n_failed_queries,
+            'Throughput(average tokens/s)': round(self.avg_token_per_seconds, default_ndigits),
             'Average QPS': round(self.qps, default_ndigits),
             'Average latency (s)': round(self.avg_latency, default_ndigits),
             'Average time to first token (s)': round(self.avg_first_chunk_latency, default_ndigits),
             'Average time per output token (s)': round(self.avg_time_per_token, 5),
-            'Average package latency (s)': round(self.avg_chunk_time, default_ndigits),
-            'Average package per request': round(self.n_avg_chunks, default_ndigits),
-            'Throughput(average output tokens per second)': round(self.avg_token_per_seconds, default_ndigits),
             'Average input tokens per request': round(self.avg_prompt_tokens, default_ndigits),
             'Average output tokens per request': round(self.avg_completion_tokens, default_ndigits),
+            'Average package latency (s)': round(self.avg_chunk_time, default_ndigits),
+            'Average package per request': round(self.n_avg_chunks, default_ndigits),
         }
         return message