evalscope 0.16.3__py3-none-any.whl → 0.17.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (157)
  1. evalscope/app/app.py +9 -762
  2. evalscope/app/constants.py +1 -0
  3. evalscope/app/ui/__init__.py +20 -0
  4. evalscope/app/ui/app_ui.py +52 -0
  5. evalscope/app/ui/multi_model.py +323 -0
  6. evalscope/app/ui/sidebar.py +42 -0
  7. evalscope/app/ui/single_model.py +202 -0
  8. evalscope/app/ui/visualization.py +36 -0
  9. evalscope/app/utils/data_utils.py +178 -0
  10. evalscope/app/utils/localization.py +221 -0
  11. evalscope/app/utils/text_utils.py +119 -0
  12. evalscope/app/utils/visualization.py +91 -0
  13. evalscope/backend/opencompass/backend_manager.py +2 -1
  14. evalscope/backend/rag_eval/backend_manager.py +2 -1
  15. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  16. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
  17. evalscope/benchmarks/__init__.py +15 -1
  18. evalscope/benchmarks/aime/aime24_adapter.py +2 -1
  19. evalscope/benchmarks/aime/aime25_adapter.py +2 -1
  20. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
  21. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  22. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
  23. evalscope/benchmarks/arena_hard/utils.py +0 -12
  24. evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
  25. evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
  26. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
  27. evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
  28. evalscope/benchmarks/data_adapter.py +29 -9
  29. evalscope/benchmarks/general_arena/__init__.py +0 -0
  30. evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
  31. evalscope/benchmarks/general_arena/utils.py +226 -0
  32. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +3 -2
  33. evalscope/benchmarks/general_qa/general_qa_adapter.py +44 -30
  34. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  35. evalscope/benchmarks/hle/__init__.py +0 -0
  36. evalscope/benchmarks/hle/hle_adapter.py +118 -0
  37. evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
  38. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
  39. evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
  40. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
  41. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
  42. evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
  43. evalscope/benchmarks/mmlu/mmlu_adapter.py +2 -2
  44. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  45. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
  46. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  47. evalscope/benchmarks/race/race_adapter.py +1 -1
  48. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  49. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
  50. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
  51. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
  52. evalscope/benchmarks/utils.py +2 -2
  53. evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
  54. evalscope/config.py +8 -123
  55. evalscope/constants.py +5 -21
  56. evalscope/evaluator/__init__.py +1 -1
  57. evalscope/evaluator/evaluator.py +20 -15
  58. evalscope/metrics/__init__.py +9 -1
  59. evalscope/{utils/utils.py → metrics/completion_parsers.py} +71 -176
  60. evalscope/metrics/llm_judge.py +106 -20
  61. evalscope/metrics/metrics.py +20 -8
  62. evalscope/models/__init__.py +4 -8
  63. evalscope/models/adapters/__init__.py +4 -9
  64. evalscope/models/adapters/base_adapter.py +4 -0
  65. evalscope/models/adapters/bfcl_adapter.py +2 -0
  66. evalscope/models/adapters/chat_adapter.py +3 -0
  67. evalscope/models/adapters/choice_adapter.py +4 -0
  68. evalscope/models/adapters/custom_adapter.py +7 -3
  69. evalscope/models/adapters/server_adapter.py +4 -2
  70. evalscope/models/adapters/t2i_adapter.py +3 -0
  71. evalscope/models/adapters/tau_bench_adapter.py +189 -0
  72. evalscope/models/custom/dummy_model.py +3 -3
  73. evalscope/models/register.py +0 -14
  74. evalscope/perf/arguments.py +15 -16
  75. evalscope/perf/benchmark.py +38 -39
  76. evalscope/perf/http_client.py +30 -86
  77. evalscope/perf/main.py +3 -3
  78. evalscope/perf/plugin/__init__.py +3 -2
  79. evalscope/perf/plugin/api/__init__.py +4 -3
  80. evalscope/perf/plugin/api/base.py +22 -4
  81. evalscope/perf/plugin/api/custom_api.py +212 -55
  82. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  83. evalscope/perf/plugin/api/default_api.py +105 -0
  84. evalscope/perf/plugin/api/openai_api.py +17 -19
  85. evalscope/perf/plugin/datasets/__init__.py +10 -7
  86. evalscope/perf/plugin/datasets/base.py +22 -1
  87. evalscope/perf/plugin/datasets/custom.py +2 -1
  88. evalscope/perf/plugin/datasets/flickr8k.py +4 -27
  89. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  90. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  91. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  92. evalscope/perf/plugin/datasets/openqa.py +2 -1
  93. evalscope/perf/plugin/datasets/random_dataset.py +15 -4
  94. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  95. evalscope/perf/plugin/registry.py +36 -16
  96. evalscope/perf/utils/analysis_result.py +24 -23
  97. evalscope/perf/utils/benchmark_util.py +14 -20
  98. evalscope/perf/utils/db_util.py +79 -61
  99. evalscope/report/__init__.py +1 -1
  100. evalscope/report/utils.py +34 -15
  101. evalscope/run.py +1 -1
  102. evalscope/summarizer.py +1 -2
  103. evalscope/utils/__init__.py +63 -2
  104. evalscope/utils/argument_utils.py +64 -0
  105. evalscope/utils/import_utils.py +16 -0
  106. evalscope/utils/io_utils.py +55 -4
  107. evalscope/utils/model_utils.py +37 -1
  108. evalscope/version.py +2 -2
  109. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/METADATA +100 -51
  110. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/RECORD +129 -133
  111. tests/aigc/test_t2i.py +1 -1
  112. tests/cli/test_all.py +68 -4
  113. tests/cli/test_collection.py +1 -1
  114. tests/cli/test_custom.py +261 -0
  115. tests/cli/test_run.py +34 -70
  116. tests/perf/test_perf.py +31 -4
  117. tests/rag/test_clip_benchmark.py +2 -1
  118. tests/rag/test_mteb.py +3 -1
  119. tests/rag/test_ragas.py +3 -1
  120. tests/swift/test_run_swift_eval.py +2 -1
  121. tests/swift/test_run_swift_vlm_eval.py +2 -1
  122. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
  123. tests/utils.py +13 -0
  124. tests/vlm/test_vlmeval.py +8 -2
  125. evalscope/evaluator/rating_eval.py +0 -157
  126. evalscope/evaluator/reviewer/__init__.py +0 -1
  127. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  128. evalscope/models/model.py +0 -189
  129. evalscope/registry/__init__.py +0 -1
  130. evalscope/registry/config/cfg_arena.yaml +0 -77
  131. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  132. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  133. evalscope/registry/config/cfg_single.yaml +0 -78
  134. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  135. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  136. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  137. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  138. evalscope/registry/data/question.jsonl +0 -80
  139. evalscope/registry/tasks/arc.yaml +0 -28
  140. evalscope/registry/tasks/bbh.yaml +0 -26
  141. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  142. evalscope/registry/tasks/ceval.yaml +0 -27
  143. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  144. evalscope/registry/tasks/cmmlu.yaml +0 -27
  145. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  146. evalscope/registry/tasks/general_qa.yaml +0 -27
  147. evalscope/registry/tasks/gsm8k.yaml +0 -29
  148. evalscope/registry/tasks/mmlu.yaml +0 -29
  149. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  150. evalscope/run_arena.py +0 -202
  151. evalscope/utils/arena_utils.py +0 -217
  152. evalscope/utils/completion_parsers.py +0 -82
  153. /evalscope/{utils → benchmarks}/filters.py +0 -0
  154. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/LICENSE +0 -0
  155. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/WHEEL +0 -0
  156. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/entry_points.txt +0 -0
  157. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/top_level.txt +0 -0
evalscope/config.py CHANGED
@@ -1,7 +1,6 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
 
  import copy
- import json
  import os
  from argparse import Namespace
  from dataclasses import dataclass, field
@@ -10,18 +9,15 @@ from typing import Dict, List, Optional, Union
  from evalscope.constants import (DEFAULT_DATASET_CACHE_DIR, DEFAULT_WORK_DIR, EvalBackend, EvalStage, EvalType, HubType,
                                   JudgeStrategy, ModelTask, OutputType)
  from evalscope.models import CustomModel, DummyCustomModel
- from evalscope.utils import gen_hash
- from evalscope.utils.io_utils import dict_to_yaml, json_to_dict, yaml_to_dict
+ from evalscope.utils.argument_utils import BaseArgument, parse_int_or_float
+ from evalscope.utils.io_utils import dict_to_yaml, gen_hash
  from evalscope.utils.logger import get_logger
- from evalscope.utils.utils import parse_int_or_float
 
  logger = get_logger()
 
- cur_path = os.path.dirname(os.path.abspath(__file__))
-
 
  @dataclass
- class TaskConfig:
+ class TaskConfig(BaseArgument):
      # Model-related arguments
      model: Union[str, 'CustomModel', None] = None
      model_id: Optional[str] = None
@@ -132,15 +128,6 @@ class TaskConfig:
          'precision': 'torch.float16',
      }
 
-     def to_dict(self):
-         result = self.__dict__.copy()
-         if isinstance(self.model, CustomModel):
-             result['model'] = self.model.__class__.__name__
-         return result
-
-     def __str__(self):
-         return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
-
      def update(self, other: Union['TaskConfig', dict]):
          if isinstance(other, TaskConfig):
              other = other.to_dict()
@@ -155,91 +142,11 @@ class TaskConfig:
          except Exception as e:
              logger.warning(f'Failed to dump overall task config: {e}')
 
-     @staticmethod
-     def list():
-         return list(registry_tasks.keys())
-
-     @staticmethod
-     def from_yaml(yaml_file: str):
-         return TaskConfig.from_dict(yaml_to_dict(yaml_file))
-
-     @staticmethod
-     def from_dict(d: dict):
-         return TaskConfig(**d)
-
-     @staticmethod
-     def from_json(json_file: str):
-         return TaskConfig.from_dict(json_to_dict(json_file))
-
-     @staticmethod
-     def from_args(args: Namespace):
-         # Convert Namespace to a dictionary and filter out None values
-         args_dict = {k: v for k, v in vars(args).items() if v is not None}
-
-         if 'func' in args_dict:
-             del args_dict['func'] # Note: compat CLI arguments
-
-         return TaskConfig.from_dict(args_dict)
-
-     @staticmethod
-     def load(custom_model: CustomModel, tasks: List[str]) -> List['TaskConfig']:
-         res_list = []
-         for task_name in tasks:
-             task = registry_tasks.get(task_name, None)
-             if task is None:
-                 logger.error(f'No task found in tasks: {list(registry_tasks.keys())}, got task_name: {task_name}')
-                 continue
-
-             task.model = custom_model
-             task.model_args = custom_model.config
-             task.model_id = type(custom_model).__name__
-             res_list.append(task)
-
-         return res_list
-
-     @staticmethod
-     def registry(name: str, data_pattern: str, dataset_dir: str = None, subset_list: list = None) -> None:
-         """
-         Register a new task (dataset) for evaluation.
-
-         Args:
-             name: str, the dataset name.
-             data_pattern: str, the data pattern for the task.
-                 e.g. `mmlu`, `ceval`, `gsm8k`, ...
-                 refer to task_config.list() for all available datasets.
-             dataset_dir: str, the directory to store multiple datasets files. e.g. /path/to/data,
-                 then your specific custom dataset directory will be /path/to/data/{name}
-             subset_list: list, the subset list for the dataset.
-                 e.g. ['middle_school_politics', 'operating_system']
-                 refer to the mmlu for example. https://github.com/hendrycks/test/blob/master/categories.py
-         """
-         available_datasets = list(registry_tasks.keys())
-         if data_pattern not in available_datasets:
-             logger.error(
-                 f'No dataset found in available datasets: {available_datasets}, got data_pattern: {data_pattern}')
-             return
-
-         # Reuse the existing task config and update the datasets
-         pattern_config = registry_tasks[data_pattern]
-
-         custom_config = copy.deepcopy(pattern_config)
-         custom_config.datasets = [data_pattern]
-         custom_config.dataset_args = {data_pattern: {}}
-         custom_config.eval_type = EvalType.CHECKPOINT
-
-         if dataset_dir is not None:
-             custom_config.dataset_args[data_pattern].update({'local_path': dataset_dir})
-
-         if subset_list is not None:
-             custom_config.dataset_args[data_pattern].update({'subset_list': subset_list})
-
-         registry_tasks.update({name: custom_config})
-         logger.info(f'** Registered task: {name} with data pattern: {data_pattern}')
-
-
- tasks = ['arc', 'gsm8k', 'mmlu', 'cmmlu', 'ceval', 'bbh', 'general_qa']
-
- registry_tasks = {task: TaskConfig.from_yaml(os.path.join(cur_path, f'registry/tasks/{task}.yaml')) for task in tasks}
+     def to_dict(self):
+         result = self.__dict__.copy()
+         if isinstance(self.model, CustomModel):
+             result['model'] = self.model.__class__.__name__
+         return result
 
 
  def parse_task_config(task_cfg) -> TaskConfig:
@@ -264,25 +171,3 @@ def parse_task_config(task_cfg) -> TaskConfig:
      else:
          raise ValueError('Args: Please provide a valid task config.')
      return task_cfg
-
-
- class TempModel(CustomModel):
-
-     def __init__(self, config: dict):
-         super().__init__(config=config)
-
-     def predict(self, prompts: str, **kwargs):
-         return [item + ': response' for item in prompts]
-
-
- if __name__ == '__main__':
-     model = TempModel(config={'model_id': 'test-swift-dummy-model'})
-     task_config = TaskConfig()
-
-     # Register a new task
-     TaskConfig.registry(name='arc_swift', data_pattern='arc', dataset_dir='/path/to/swift_custom_work')
-
-     swift_eval_task: List[TaskConfig] = TaskConfig.load(custom_model=model, tasks=['gsm8k', 'arc', 'arc_swift'])
-     for item in swift_eval_task:
-         print(item)
-         print()
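
Net effect of the config.py change: TaskConfig keeps only to_dict() and update() locally, drops the YAML task registry (TaskConfig.registry/TaskConfig.load and the registry/tasks/*.yaml files removed elsewhere in this release), and now derives from BaseArgument, which presumably supplies the removed from_dict/from_yaml/from_json/from_args constructors. A minimal sketch of 0.17.x usage, with a placeholder model id and assuming the public run_task entry point:

    # Sketch only: model id, dataset and limit are placeholders, not taken from the diff.
    from evalscope.config import TaskConfig
    from evalscope.run import run_task

    task_cfg = TaskConfig(
        model='Qwen/Qwen2.5-0.5B-Instruct',  # placeholder model id
        datasets=['gsm8k'],
        limit=5,  # evaluate only a handful of samples
    )
    run_task(task_cfg=task_cfg)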
evalscope/constants.py CHANGED
@@ -41,27 +41,6 @@ class MetricsConstant:
      ]
 
 
- class MetricMembers:
-
-     # Math accuracy metric
-     MATH_ACCURACY = 'math_accuracy'
-
-     # Code pass@k metric
-     CODE_PASS_K = 'code_pass_k'
-
-     # Code rouge metric
-     ROUGE = 'rouge'
-
-     # ELO rating system for pairwise comparison
-     ELO = 'elo'
-
-     # Pairwise comparison win/lose and tie(optional)
-     PAIRWISE = 'pairwise'
-
-     # Rating score for single model
-     SCORE = 'score'
-
-
  class ArenaWinner:
 
      MODEL_A = 'model_a'
@@ -172,6 +151,11 @@ class JudgeStrategy:
      LLM_RECALL = 'llm_recall'
 
 
+ class JudgeScoreType:
+     NUMERIC = 'numeric' # numeric score
+     PATTERN = 'pattern' # pattern matching score
+
+
  class ModelTask:
      TEXT_GENERATION = 'text_generation'
      IMAGE_GENERATION = 'image_generation'
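
The removed MetricMembers constants have no replacement here; the new JudgeScoreType pairs naturally with the judge prompt templates that evalscope.metrics begins exporting in this release (see the metrics/__init__.py hunk below). How LLMJudge actually selects between them is not shown in this diff, so the following is only an illustration of branching on the constants:

    # Illustration only: the NUMERIC/PATTERN-to-template pairing is an assumption.
    from evalscope.constants import JudgeScoreType
    from evalscope.metrics import DEFAULT_NUMERIC_SCORE_TEMPLATE, DEFAULT_PROMPT_TEMPLATE

    def pick_judge_template(score_type: str) -> str:
        if score_type == JudgeScoreType.NUMERIC:
            return DEFAULT_NUMERIC_SCORE_TEMPLATE  # numeric score prompt
        return DEFAULT_PROMPT_TEMPLATE  # pattern-matching prompt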
evalscope/evaluator/__init__.py CHANGED
@@ -1,3 +1,3 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
 
- from evalscope.evaluator.evaluator import Evaluator
+ from .evaluator import Evaluator
evalscope/evaluator/evaluator.py CHANGED
@@ -7,16 +7,18 @@ from collections import OrderedDict, defaultdict
  from concurrent.futures import ThreadPoolExecutor, as_completed
  from copy import deepcopy
  from tqdm import tqdm
- from typing import Any, Dict, List, Optional, Union
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
 
  from evalscope.benchmarks import DataAdapter
  from evalscope.config import TaskConfig
  from evalscope.constants import AnswerKeys, DumpMode, EvalStage, EvalType, JudgeStrategy, ReviewKeys
- from evalscope.models import BaseModelAdapter
  from evalscope.report import Report, gen_table
- from evalscope.utils import dict_torch_dtype_to_str, gen_hash
- from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
+ from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, gen_hash, jsonl_to_list
  from evalscope.utils.logger import get_logger
+ from evalscope.utils.model_utils import dict_torch_dtype_to_str
+
+ if TYPE_CHECKING:
+     from evalscope.models import BaseModelAdapter
 
  logger = get_logger()
 
@@ -38,7 +40,7 @@ class Evaluator(object):
 
      def __init__(self,
                   data_adapter: DataAdapter,
-                  model_adapter: BaseModelAdapter,
+                  model_adapter: 'BaseModelAdapter',
                   outputs: OutputsStructure = None,
                   task_cfg: TaskConfig = None,
                   **kwargs):
@@ -237,9 +239,10 @@ class Evaluator(object):
          if use_llm:
              # Use LLM as judge
              assert self.judge is not None, f'Judge model is required for LLM judging {self.data_adapter.name}'
+             pred_content = self.data_adapter.llm_parse_pred_result(
+                 result=answer_content, raw_input_d=raw_input_d, eval_type=self.eval_type)
              review_result = self.data_adapter.llm_match(
-                 gold_content, answer_content, self.judge, raw_input=raw_input_d)
-             pred = answer_content
+                 gold_content, pred_content, self.judge, raw_input=raw_input_d)
          else:
              # Use rule-based judging
              pred_content = self.data_adapter.parse_pred_result(
@@ -250,15 +253,14 @@ class Evaluator(object):
              if (self.task_cfg.judge_strategy == JudgeStrategy.LLM_RECALL
                      and isinstance(review_result, (bool, int, float)) and not bool(review_result)):
                  assert self.judge is not None, f'Judge model is required for LLM_RECALL strategy {self.data_adapter.name}' # noqa: E501
+                 pred_content = self.data_adapter.llm_parse_pred_result(
+                     result=answer_content, raw_input_d=raw_input_d, eval_type=self.eval_type)
                  review_result = self.data_adapter.llm_match(
-                     gold_content, answer_content, self.judge, raw_input=raw_input_d)
-                 pred = answer_content
-             else:
-                 pred = pred_content
+                     gold_content, pred_content, self.judge, raw_input=raw_input_d)
 
          choice[ReviewKeys.REVIEW] = {
              ReviewKeys.GOLD: gold_content if gold_content != raw_input_d else '*Same as Input*',
-             ReviewKeys.PRED: pred,
+             ReviewKeys.PRED: pred_content,
              ReviewKeys.RESULT: review_result
          }
          rev_choices.append(choice)
@@ -394,9 +396,6 @@ class Evaluator(object):
          report_map: Report = self.data_adapter.gen_report(
              subset_score_map=reviews_score_all, model_name=self.model_name)
 
-         # Post process report
-         self.data_adapter.post_process_report(report_map, report_path=report_path)
-
          # Make table
          try:
              report_table = gen_table(report_list=[report_map], add_overall_metric=True)
@@ -418,6 +417,12 @@ class Evaluator(object):
          report_map.to_json(report_file)
          logger.info(f'Dump report to: {report_file} \n')
 
+         # Post process report
+         try:
+             self.data_adapter.post_process_report(report_map, report_path=report_path)
+         except Exception as e:
+             logger.error(f'Failed to post process report: {e}')
+
 
          return report_map
      def eval(self, **kwargs) -> dict:
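
Two behavioural changes stand out in evaluator.py: with LLM-as-judge (and with the LLM_RECALL fallback) the model answer is now passed through llm_parse_pred_result() before llm_match(), and the parsed prediction is what gets stored in the review; report post-processing moves after the report dump and is wrapped in try/except so a failing post-process no longer aborts reporting. A distilled sketch of the new review flow follows; the helper name and boolean flags are stand-ins, only the adapter/judge calls mirror the diff:

    # Simplified stand-in for Evaluator's review logic, not the library's actual method.
    def review_one(adapter, judge, gold, answer, raw_input, eval_type, use_llm, llm_recall):
        if use_llm:
            # LLM-as-judge: parse the prediction first, then ask the judge to match it
            pred = adapter.llm_parse_pred_result(result=answer, raw_input_d=raw_input, eval_type=eval_type)
            return pred, adapter.llm_match(gold, pred, judge, raw_input=raw_input)

        # Rule-based judging first
        pred = adapter.parse_pred_result(answer, raw_input, eval_type)
        result = adapter.match(gold, pred)
        # LLM_RECALL: only failed rule-based matches are re-judged by the LLM
        if llm_recall and isinstance(result, (bool, int, float)) and not bool(result):
            pred = adapter.llm_parse_pred_result(result=answer, raw_input_d=raw_input, eval_type=eval_type)
            result = adapter.llm_match(gold, pred, judge, raw_input=raw_input)
        return pred, result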
evalscope/metrics/__init__.py CHANGED
@@ -4,7 +4,8 @@ from typing import TYPE_CHECKING
  from evalscope.utils.import_utils import _LazyModule
 
  if TYPE_CHECKING:
-     from .llm_judge import LLMJudge
+     from .completion_parsers import ResponseParser, lmsys_parser, ranking_parser
+     from .llm_judge import DEFAULT_NUMERIC_SCORE_TEMPLATE, DEFAULT_PROMPT_TEMPLATE, LLMJudge
      from .math_parser import extract_answer, math_equal, strip_answer_string
      from .metrics import (bleu_ngram_one_sample, exact_match, macro_mean, mean, micro_mean, simple_f1_score,
                            weighted_mean)
@@ -33,12 +34,19 @@ else:
          ],
          'llm_judge': [
              'LLMJudge',
+             'DEFAULT_PROMPT_TEMPLATE',
+             'DEFAULT_NUMERIC_SCORE_TEMPLATE',
          ],
          'math_parser': [
              'extract_answer',
              'math_equal',
              'strip_answer_string',
          ],
+         'completion_parsers': [
+             'ResponseParser',
+             'lmsys_parser',
+             'ranking_parser',
+         ],
      }
 
      import sys
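
With the lazy mapping above, the parsers that used to live in evalscope/utils/utils.py become importable from evalscope.metrics; a quick check, assuming evalscope 0.17.x is installed:

    # _LazyModule resolves these names on first access.
    from evalscope.metrics import LLMJudge, ResponseParser, lmsys_parser, ranking_parser

    # parse_first_option(text, options) is exercised in the completion_parsers.py __main__ block below.
    print(ResponseParser.parse_first_option('The answer is B.', ['A', 'B', 'C', 'D']))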
evalscope/{utils/utils.py → metrics/completion_parsers.py} RENAMED
@@ -1,77 +1,85 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
- # Copyright (c) OpenCompass.
+ # flake8: noqa
 
- import functools
- import hashlib
- import importlib
- import importlib.util
- import numpy as np
- import os
- import random
+ import ast
  import re
- import torch
- from inspect import signature
- from typing import Any, Dict, List, Tuple, Union
 
+ # from . import utils as ann_utils
+ from evalscope.constants import ArenaWinner
  from evalscope.utils.logger import get_logger
 
  logger = get_logger()
 
- TEST_LEVEL_LIST = [0, 1]
+ one_score_pattern = re.compile('\[\[(\d+\.?\d*)\]\]')
+ one_score_pattern_backup = re.compile('\[(\d+\.?\d*)\]')
 
- # Example: export TEST_LEVEL_LIST=0,1
- TEST_LEVEL_LIST_STR = 'TEST_LEVEL_LIST'
 
+ # modified from: https://github.com/lm-sys/FastChat/blob/main/fastchat/eval/eval_gpt_review.py#L47
+ # does not work with batched completions
+ def lmsys_parser(completion, output_format):
+     if output_format == '[[rating]]':
+         match = re.search(one_score_pattern, completion)
+         if not match:
+             match = re.search(one_score_pattern_backup, completion)
 
- def test_level_list():
-     global TEST_LEVEL_LIST
-     if TEST_LEVEL_LIST_STR in os.environ:
-         TEST_LEVEL_LIST = [int(x) for x in os.environ[TEST_LEVEL_LIST_STR].split(',')]
-
-     return TEST_LEVEL_LIST
-
-
- def get_obj_from_cfg(eval_class_ref: Any, *args, **kwargs) -> Any:
-     module_name, spliter, cls_name = eval_class_ref.partition(':')
-
+         if match:
+             rating = ast.literal_eval(match.groups()[0])
+         else:
+             logger.error(f'Content: {completion}\n'
+                          'You must manually fix the score.')
+             rating = -1
+
+         return rating
+     if output_format == '[[rating_a,rating_b]]':
+         try:
+             score_pair = completion.split('\n')[0]
+             score_pair = score_pair.replace(',', ' ')
+             sp = score_pair.split(' ')
+             if len(sp) == 2:
+                 score_1 = float(sp[0])
+                 score_2 = float(sp[1])
+                 if score_1 > score_2:
+                     winner = ArenaWinner.MODEL_A
+                 elif score_1 < score_2:
+                     winner = ArenaWinner.MODEL_B
+                 else:
+                     if score_1 == score_1 == -1:
+                         winner = ArenaWinner.UNKNOWN
+                     winner = ArenaWinner.TIE
+                 return winner, [score_1, score_2]
+             else:
+                 raise Exception('Invalid score pair.')
+         except Exception as e:
+             logger.error(f'{e}\nContent: {completion}\nYou must manually fix the score pair.')
+             return ArenaWinner.UNKNOWN, [-1, -1]
+     elif output_format == '[[A]]':
+         if '[[A]]' in completion:
+             winner = ArenaWinner.MODEL_A
+         elif '[[B]]' in completion:
+             winner = ArenaWinner.MODEL_B
+         elif '[[C]]' in completion:
+             winner = ArenaWinner.TIE
+         else:
+             logger.error(f'\nContent: {completion}\nYou must manually fix the score.')
+             winner = ArenaWinner.UNKNOWN
+         return winner
+
+
+ def ranking_parser(completion, **kwargs):
      try:
-         obj_cls = importlib.import_module(module_name)
-     except ImportError as e:
-         logger.error(f'{e}')
-         raise e
-
-     if spliter:
-         for attr in cls_name.split('.'):
-             obj_cls = getattr(obj_cls, attr)
+         if isinstance(completion, str):
+             ordered_completions = ast.literal_eval(completion)
+         else:
+             ordered_completions = completion
 
-     return functools.partial(obj_cls, *args, **kwargs)
+         rank = [c for c in ordered_completions if c['model'] == 'model_a'][0]['rank']
+         assert rank in [1, 2]
 
-
- def random_seeded_choice(seed: Union[int, str, float], choices, **kwargs):
-     """Random choice with a (potentially string) seed."""
-     return random.Random(seed).choices(choices, k=1, **kwargs)[0]
-
-
- def gen_hash(name: str, bits: int = 32):
-     return hashlib.md5(name.encode(encoding='UTF-8')).hexdigest()[:bits]
-
-
- def dict_torch_dtype_to_str(d: Dict[str, Any]) -> dict:
-     """
-     Checks whether the passed dictionary and its nested dicts have a *torch_dtype* key and if it's not None,
-     converts torch.dtype to a string of just the type. For example, `torch.float32` get converted into *"float32"*
-     string, which can then be stored in the json format.
-
-     Refer to: https://github.com/huggingface/transformers/pull/16065/files for details.
-     """
-     if d.get('torch_dtype', None) is not None and not isinstance(d['torch_dtype'], str):
-         d['torch_dtype'] = str(d['torch_dtype']).split('.')[1]
-
-     for value in d.values():
-         if isinstance(value, dict):
-             dict_torch_dtype_to_str(value)
-
-     return d
+         return ArenaWinner.MODEL_A if rank == 1 else ArenaWinner.MODEL_B
+     except Exception as e:
+         logger.error(f'{e}\nContent: {completion}\n'
+                      'You must manually fix the score pair.')
+         return ArenaWinner.UNKNOWN
 
 
  class ResponseParser:
@@ -194,7 +202,6 @@ class ResponseParser:
              return last_capital
          return 'No valid option found'
 
-
      @staticmethod
      def parse_bracketed_answer(text: str, options: list[str]) -> str:
          options = ResponseParser.process_options(options)
@@ -212,121 +219,9 @@ class ResponseParser:
          options_pattern = '|'.join(escaped_options)
          return options_pattern
 
- def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]:
-     """
-     Normalize score.
-
-     Args:
-         score: input score, could be float or dict. e.g. 0.12345678 or {'acc': 0.12345678, 'f1': 0.12345678}
-         keep_num: number of digits to keep.
-
-     Returns:
-         Union[float, dict]: normalized score. e.g. 0.1234 or {'acc': 0.1234, 'f1': 0.1234}
-     """
-     if isinstance(score, float):
-         score = round(score, keep_num)
-     elif isinstance(score, dict):
-         score = {k: round(v, keep_num) for k, v in score.items()}
-     else:
-         logger.warning(f'Unknown score type: {type(score)}')
-
-     return score
-
-
- def is_module_installed(module_name):
-     try:
-         importlib.import_module(module_name)
-         return True
-     except ImportError:
-         return False
-
-
- def get_module_path(module_name):
-     spec = importlib.util.find_spec(module_name)
-     if spec and spec.origin:
-         return os.path.abspath(spec.origin)
-     else:
-         raise ValueError(f'Cannot find module: {module_name}')
-
-
- def get_valid_list(input_list, candidate_list):
-     """
-     Get the valid and invalid list from input_list based on candidate_list.
-     Args:
-         input_list: The input list.
-         candidate_list: The candidate list.
-
-     Returns:
-         valid_list: The valid list.
-         invalid_list: The invalid list.
-     """
-     return [i for i in input_list if i in candidate_list], \
-         [i for i in input_list if i not in candidate_list]
-
-
- def get_latest_folder_path(work_dir):
-     from datetime import datetime
-
-     # Get all subdirectories in the work_dir
-     folders = [f for f in os.listdir(work_dir) if os.path.isdir(os.path.join(work_dir, f))]
-
-     # Get the timestamp(YYYYMMDD_HHMMSS)
-     timestamp_pattern = re.compile(r'^\d{8}_\d{6}$')
-
-     # Filter out the folders
-     timestamped_folders = [f for f in folders if timestamp_pattern.match(f)]
-
-     if not timestamped_folders:
-         print(f'>> No timestamped folders found in {work_dir}!')
-         return None
-
-     # timestamp parser
-     def parse_timestamp(folder_name):
-         return datetime.strptime(folder_name, '%Y%m%d_%H%M%S')
-
-     # Find the latest folder
-     latest_folder = max(timestamped_folders, key=parse_timestamp)
-
-     return os.path.join(work_dir, latest_folder)
-
-
- def csv_to_list(file_path: str) -> List[dict]:
-     import csv
-
-     with open(file_path, mode='r', newline='', encoding='utf-8') as csv_file:
-         csv_reader = csv.DictReader(csv_file)
-         result = [row for row in csv_reader]
-
-     return result
-
-
- def seed_everything(seed: int):
-     """Set all random seeds to a fixed value for reproducibility.
-
-     Args:
-         seed (int): The seed value.
-     """
-     random.seed(seed)
-     np.random.seed(seed)
-     torch.manual_seed(seed)
-     if torch.cuda.is_available():
-         torch.cuda.manual_seed_all(seed)
-     torch.backends.cudnn.deterministic = True
-     torch.backends.cudnn.benchmark = False
-
- def get_supported_params(func):
-     """Get the supported parameters of a function."""
-     sig = signature(func)
-     return list(sig.parameters.keys())
-
- def parse_int_or_float(num):
-     number = float(num)
-     if number.is_integer():
-         return int(number)
-     return number
 
  if __name__ == '__main__':
+     result = '**Answer: A **Answer: C**'
      options = ['A', 'B', 'C', 'D']
-     answers = ['Context .... ANSWER: A', 'answer: A']
-     for answer in answers:
-         print(ResponseParser.parse_first_option(answer, options))
+     parsed_result = ResponseParser.parse_first_option(result, options)
+     print(f'Parsed result: {parsed_result}') # Should print 'C'
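
For reference, typical judge outputs run through the parsers above behave as follows (illustrative calls, assuming the new evalscope.metrics exports):

    # Grounded in the lmsys_parser implementation shown in the hunk above.
    from evalscope.constants import ArenaWinner
    from evalscope.metrics import lmsys_parser

    rating = lmsys_parser('Overall rating: [[8.5]]', '[[rating]]')  # -> 8.5 via the [[x]] pattern
    verdict = lmsys_parser('Assistant A wins: [[A]]', '[[A]]')  # -> ArenaWinner.MODEL_A
    print(rating, verdict == ArenaWinner.MODEL_A)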