evalscope 0.16.3__py3-none-any.whl → 0.17.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (157)
  1. evalscope/app/app.py +9 -762
  2. evalscope/app/constants.py +1 -0
  3. evalscope/app/ui/__init__.py +20 -0
  4. evalscope/app/ui/app_ui.py +52 -0
  5. evalscope/app/ui/multi_model.py +323 -0
  6. evalscope/app/ui/sidebar.py +42 -0
  7. evalscope/app/ui/single_model.py +202 -0
  8. evalscope/app/ui/visualization.py +36 -0
  9. evalscope/app/utils/data_utils.py +178 -0
  10. evalscope/app/utils/localization.py +221 -0
  11. evalscope/app/utils/text_utils.py +119 -0
  12. evalscope/app/utils/visualization.py +91 -0
  13. evalscope/backend/opencompass/backend_manager.py +2 -1
  14. evalscope/backend/rag_eval/backend_manager.py +2 -1
  15. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  16. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
  17. evalscope/benchmarks/__init__.py +15 -1
  18. evalscope/benchmarks/aime/aime24_adapter.py +2 -1
  19. evalscope/benchmarks/aime/aime25_adapter.py +2 -1
  20. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
  21. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  22. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
  23. evalscope/benchmarks/arena_hard/utils.py +0 -12
  24. evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
  25. evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
  26. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
  27. evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
  28. evalscope/benchmarks/data_adapter.py +29 -9
  29. evalscope/benchmarks/general_arena/__init__.py +0 -0
  30. evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
  31. evalscope/benchmarks/general_arena/utils.py +226 -0
  32. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +3 -2
  33. evalscope/benchmarks/general_qa/general_qa_adapter.py +44 -30
  34. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  35. evalscope/benchmarks/hle/__init__.py +0 -0
  36. evalscope/benchmarks/hle/hle_adapter.py +118 -0
  37. evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
  38. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
  39. evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
  40. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
  41. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
  42. evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
  43. evalscope/benchmarks/mmlu/mmlu_adapter.py +2 -2
  44. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  45. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
  46. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  47. evalscope/benchmarks/race/race_adapter.py +1 -1
  48. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  49. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
  50. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
  51. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
  52. evalscope/benchmarks/utils.py +2 -2
  53. evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
  54. evalscope/config.py +8 -123
  55. evalscope/constants.py +5 -21
  56. evalscope/evaluator/__init__.py +1 -1
  57. evalscope/evaluator/evaluator.py +20 -15
  58. evalscope/metrics/__init__.py +9 -1
  59. evalscope/{utils/utils.py → metrics/completion_parsers.py} +71 -176
  60. evalscope/metrics/llm_judge.py +106 -20
  61. evalscope/metrics/metrics.py +20 -8
  62. evalscope/models/__init__.py +4 -8
  63. evalscope/models/adapters/__init__.py +4 -9
  64. evalscope/models/adapters/base_adapter.py +4 -0
  65. evalscope/models/adapters/bfcl_adapter.py +2 -0
  66. evalscope/models/adapters/chat_adapter.py +3 -0
  67. evalscope/models/adapters/choice_adapter.py +4 -0
  68. evalscope/models/adapters/custom_adapter.py +7 -3
  69. evalscope/models/adapters/server_adapter.py +4 -2
  70. evalscope/models/adapters/t2i_adapter.py +3 -0
  71. evalscope/models/adapters/tau_bench_adapter.py +189 -0
  72. evalscope/models/custom/dummy_model.py +3 -3
  73. evalscope/models/register.py +0 -14
  74. evalscope/perf/arguments.py +15 -16
  75. evalscope/perf/benchmark.py +38 -39
  76. evalscope/perf/http_client.py +30 -86
  77. evalscope/perf/main.py +3 -3
  78. evalscope/perf/plugin/__init__.py +3 -2
  79. evalscope/perf/plugin/api/__init__.py +4 -3
  80. evalscope/perf/plugin/api/base.py +22 -4
  81. evalscope/perf/plugin/api/custom_api.py +212 -55
  82. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  83. evalscope/perf/plugin/api/default_api.py +105 -0
  84. evalscope/perf/plugin/api/openai_api.py +17 -19
  85. evalscope/perf/plugin/datasets/__init__.py +10 -7
  86. evalscope/perf/plugin/datasets/base.py +22 -1
  87. evalscope/perf/plugin/datasets/custom.py +2 -1
  88. evalscope/perf/plugin/datasets/flickr8k.py +4 -27
  89. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  90. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  91. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  92. evalscope/perf/plugin/datasets/openqa.py +2 -1
  93. evalscope/perf/plugin/datasets/random_dataset.py +15 -4
  94. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  95. evalscope/perf/plugin/registry.py +36 -16
  96. evalscope/perf/utils/analysis_result.py +24 -23
  97. evalscope/perf/utils/benchmark_util.py +14 -20
  98. evalscope/perf/utils/db_util.py +79 -61
  99. evalscope/report/__init__.py +1 -1
  100. evalscope/report/utils.py +34 -15
  101. evalscope/run.py +1 -1
  102. evalscope/summarizer.py +1 -2
  103. evalscope/utils/__init__.py +63 -2
  104. evalscope/utils/argument_utils.py +64 -0
  105. evalscope/utils/import_utils.py +16 -0
  106. evalscope/utils/io_utils.py +55 -4
  107. evalscope/utils/model_utils.py +37 -1
  108. evalscope/version.py +2 -2
  109. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/METADATA +100 -51
  110. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/RECORD +129 -133
  111. tests/aigc/test_t2i.py +1 -1
  112. tests/cli/test_all.py +68 -4
  113. tests/cli/test_collection.py +1 -1
  114. tests/cli/test_custom.py +261 -0
  115. tests/cli/test_run.py +34 -70
  116. tests/perf/test_perf.py +31 -4
  117. tests/rag/test_clip_benchmark.py +2 -1
  118. tests/rag/test_mteb.py +3 -1
  119. tests/rag/test_ragas.py +3 -1
  120. tests/swift/test_run_swift_eval.py +2 -1
  121. tests/swift/test_run_swift_vlm_eval.py +2 -1
  122. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
  123. tests/utils.py +13 -0
  124. tests/vlm/test_vlmeval.py +8 -2
  125. evalscope/evaluator/rating_eval.py +0 -157
  126. evalscope/evaluator/reviewer/__init__.py +0 -1
  127. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  128. evalscope/models/model.py +0 -189
  129. evalscope/registry/__init__.py +0 -1
  130. evalscope/registry/config/cfg_arena.yaml +0 -77
  131. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  132. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  133. evalscope/registry/config/cfg_single.yaml +0 -78
  134. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  135. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  136. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  137. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  138. evalscope/registry/data/question.jsonl +0 -80
  139. evalscope/registry/tasks/arc.yaml +0 -28
  140. evalscope/registry/tasks/bbh.yaml +0 -26
  141. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  142. evalscope/registry/tasks/ceval.yaml +0 -27
  143. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  144. evalscope/registry/tasks/cmmlu.yaml +0 -27
  145. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  146. evalscope/registry/tasks/general_qa.yaml +0 -27
  147. evalscope/registry/tasks/gsm8k.yaml +0 -29
  148. evalscope/registry/tasks/mmlu.yaml +0 -29
  149. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  150. evalscope/run_arena.py +0 -202
  151. evalscope/utils/arena_utils.py +0 -217
  152. evalscope/utils/completion_parsers.py +0 -82
  153. /evalscope/{utils → benchmarks}/filters.py +0 -0
  154. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/LICENSE +0 -0
  155. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/WHEEL +0 -0
  156. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/entry_points.txt +0 -0
  157. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/top_level.txt +0 -0
@@ -5,7 +5,8 @@ from functools import partial
 from typing import Optional, Union

 from evalscope.backend.base import BackendManager
-from evalscope.utils import get_valid_list, is_module_installed
+from evalscope.utils.import_utils import is_module_installed
+from evalscope.utils.io_utils import get_valid_list
 from evalscope.utils.logger import get_logger

 logger = get_logger()
@@ -68,6 +69,8 @@ class VLMEvalKitBackendManager(BackendManager):
                 del remain_cfg['type']  # remove not used args

                 norm_model_type = os.path.basename(model_type).replace(':', '-').replace('.', '_')
+                model_cfg['type'] = norm_model_type
+
                 self.valid_models.update({norm_model_type: partial(model_class, model=model_type, **remain_cfg)})
                 new_model_names.append(norm_model_type)
             else:
@@ -2,6 +2,7 @@
 import glob
 import importlib
 import os
+import time

 from evalscope.benchmarks.benchmark import Benchmark, BenchmarkMeta
 from evalscope.benchmarks.data_adapter import DataAdapter
@@ -13,11 +14,24 @@ logger = get_logger()
 pattern = os.path.join(os.path.dirname(__file__), '*', '**', '*_adapter.py')
 files = glob.glob(pattern, recursive=True)

+import_times = []
+
 for file_path in files:
     if file_path.endswith('.py') and not os.path.basename(file_path).startswith('_'):
         # Convert file path to a module path
         relative_path = os.path.relpath(file_path, os.path.dirname(__file__))
         module_path = relative_path[:-3].replace(os.path.sep, '.')  # strip '.py' and convert to module path
         full_path = f'evalscope.benchmarks.{module_path}'
+
+        start_time = time.perf_counter()
         importlib.import_module(full_path)
-        # print(f'Importing {full_path}')
+        end_time = time.perf_counter()
+
+        import_times.append((full_path, end_time - start_time))
+
+# Sort by import time in descending order
+import_times.sort(key=lambda x: x[1], reverse=True)
+
+# Log the sorted import times
+for module, duration in import_times:
+    logger.debug(f'Module {module} imported in {duration:.6f} seconds')
@@ -48,4 +48,5 @@ class AIME24Adapter(DataAdapter):
         return result

     def match(self, gold: str, pred: str) -> float:
-        return math_equal(pred, gold)
+        res = math_equal(pred, gold)
+        return 1.0 if res else 0.0
@@ -48,4 +48,5 @@ class AIME25Adapter(DataAdapter):
         return result

     def match(self, gold: str, pred: str) -> float:
-        return math_equal(pred, gold)
+        res = math_equal(pred, gold)
+        return 1.0 if res else 0.0
@@ -47,7 +47,7 @@ Evaluate the models based on the quality and relevance of their outputs, and sel
 @Benchmark.register(
     name='alpaca_eval',
     pretty_name='AlpacaEval2.0',
-    tags=['Instruction-Following', 'Reasoning'],
+    tags=['Instruction-Following', 'Arena'],
     description='Alpaca Eval 2.0 is an enhanced framework for evaluating instruction-following language models, '
     'featuring an improved auto-annotator, updated baselines, and continuous preference calculation to '
     'provide more accurate and cost-effective model assessments. '
@@ -6,7 +6,7 @@ import os
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.utils import ResponseParser
+from evalscope.metrics.completion_parsers import ResponseParser
 from evalscope.utils.logger import get_logger

 # flake8: noqa
@@ -17,7 +17,7 @@ GRADER_TEMPLATE = "<|User Prompt|>\n{question}\n\n<|The Start of Assistant A's A
 @Benchmark.register(
     name='arena_hard',
     pretty_name='ArenaHard',
-    tags=['Instruction-Following', 'Reasoning'],
+    tags=['Instruction-Following', 'Arena'],
     description=
     'ArenaHard is a benchmark designed to evaluate the performance of large language models in a competitive setting, '
     'where models are pitted against each other in a series of tasks to determine their relative strengths and weaknesses. '
@@ -127,18 +127,6 @@ def get_bootstrap_result(battles, func_compute_elo, num_round):
     return df[df.median().sort_values(ascending=False).index]


-def preety_print_two_ratings(ratings_1, ratings_2, column_names):
-    df = (
-        pd.DataFrame(
-            [[n, ratings_1[n], ratings_2[n]] for n in ratings_1.keys()],
-            columns=['Model', column_names[0], column_names[1]],
-        ).sort_values(column_names[0], ascending=False).reset_index(drop=True))
-    df[column_names[0]] = (df[column_names[0]] + 0.5).astype(int)
-    df[column_names[1]] = (df[column_names[1]] + 0.5).astype(int)
-    df.index = df.index + 1
-    return df
-
-
 def predict_win_rate(elo_ratings, SCALE=400, BASE=10, INIT_RATING=1000):
     names = sorted(list(elo_ratings.keys()))
     wins = defaultdict(lambda: defaultdict(lambda: 0))
@@ -35,7 +35,7 @@ SUBJECT_MAPPING = {
 @Benchmark.register(
     name='bfcl_v3',
     pretty_name='BFCL-v3',
-    tags=['Agent'],
+    tags=['Agent', 'Function Calling'],
     description=
     'Berkeley Function Calling Leaderboard (BFCL), the **first comprehensive and executable function call evaluation** '
     'dedicated to assessing Large Language Models\' (LLMs) ability to invoke functions. Unlike previous evaluations, '
@@ -1,11 +1,13 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import csv
 import os
+from collections import defaultdict

 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.utils import ResponseParser
+from evalscope.metrics.completion_parsers import ResponseParser
+from evalscope.utils.io_utils import csv_to_list
 from evalscope.utils.logger import get_logger

 # flake8: noqa
@@ -154,7 +156,7 @@ class CEVALAdapter(DataAdapter):
         self.choices = ['A', 'B', 'C', 'D']

     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-        data_dict = {}
+        data_dict = defaultdict(dict)
         for subset_name in subset_list:
             for split_name in [self.train_split, self.eval_split]:
                 if os.path.exists(dataset_name_or_path):
@@ -162,20 +164,7 @@ class CEVALAdapter(DataAdapter):
                 else:
                     file_path = os.path.join(work_dir, dataset_name_or_path, f'{subset_name}_{split_name}.csv')
                 if os.path.exists(file_path):
-                    with open(file_path, encoding='utf-8') as f:
-                        rows = []
-                        reader = csv.reader(f)
-                        header = next(reader)
-                        for row in reader:
-                            item = dict(zip(header, row))
-                            item.setdefault('explanation', '')
-                            item.setdefault('answer', '')
-                            rows.append(item)
-
-                    if subset_name in data_dict:
-                        data_dict[subset_name].update({split_name: rows})
-                    else:
-                        data_dict[subset_name] = {split_name: rows}
+                    data_dict[subset_name][split_name] = csv_to_list(file_path)

         return data_dict

@@ -2,11 +2,13 @@

 import csv
 import os
+from collections import defaultdict

 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.utils import ResponseParser
+from evalscope.metrics.completion_parsers import ResponseParser
+from evalscope.utils.io_utils import csv_to_list
 from evalscope.utils.logger import get_logger

 # flake8: noqa
@@ -126,29 +128,15 @@ class CMMLUAdapter(DataAdapter):
         self.choices = ['A', 'B', 'C', 'D']

     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-        data_dict = {}
+        data_dict = defaultdict(dict)
         for subset_name in subset_list:
-            data_dict[subset_name] = {}
             for split_name in [self.train_split, self.eval_split]:
-                file_path = os.path.join(work_dir, dataset_name_or_path, split_name, f'{subset_name}.csv')
+                if os.path.exists(dataset_name_or_path):
+                    file_path = os.path.join(dataset_name_or_path, split_name, f'{subset_name}.csv')
+                else:
+                    file_path = os.path.join(work_dir, dataset_name_or_path, split_name, f'{subset_name}.csv')
                 if os.path.exists(file_path):
-                    with open(file_path, encoding='utf-8') as f:
-                        rows = []
-                        reader = csv.reader(f)
-                        for row in reader:
-                            if len(row) != 7:
-                                logger.error(f'Mismatch len of row: {row}, len of row should be 6. Skip this row.')
-                                continue
-                            rows.append({
-                                'Question': row[1],
-                                'A': row[2],
-                                'B': row[3],
-                                'C': row[4],
-                                'D': row[5],
-                                'Answer': row[6],
-                            })
-
-                    data_dict[subset_name].update({split_name: rows})
+                    data_dict[subset_name][split_name] = csv_to_list(file_path)

         return data_dict
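Note: the hand-rolled CSV readers removed from ceval_adapter.py and cmmlu_adapter.py are replaced by the new evalscope.utils.io_utils.csv_to_list helper, whose implementation is not shown in this diff. Judging from the removed CEVAL code (dict(zip(header, row)) per row), it presumably behaves roughly like the sketch below; this is an illustrative approximation, not the actual helper.

import csv

def csv_to_list(file_path: str) -> list:
    # Approximation: read a CSV with a header row and return one dict per data row,
    # keyed by the header columns (what the removed readers built by hand).
    with open(file_path, encoding='utf-8') as f:
        return [dict(row) for row in csv.DictReader(f)]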
 
@@ -105,7 +105,8 @@ class CompetitionMathAdapter(DataAdapter):
         return result

     def match(self, gold: str, pred: str) -> float:
-        return math_equal(pred, gold)
+        res = math_equal(pred, gold)
+        return 1.0 if res else 0.0

     @classmethod
     def _generate_prompt(cls, input_d: dict, use_fewshot: bool = True) -> str:
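The same conversion appears in aime24_adapter.py, aime25_adapter.py and competition_math_adapter.py: math_equal evidently returns a truth value, while match() is annotated to return a float, so the score is now made explicit instead of relying on implicit bool arithmetic. A trivial standalone illustration (not evalscope code):

def to_score(is_correct: bool) -> float:
    # Keep metric aggregation purely numeric: 1.0 for a correct answer, 0.0 otherwise.
    return 1.0 if is_correct else 0.0

assert to_score(True) == 1.0 and to_score(False) == 0.0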
@@ -168,7 +168,12 @@ class DataAdapter(ABC):
         If you want to support local dataset, please rewrite this method in xxx_data_adapter.
         Use modelscope.msdatasets.MsDataset.load to load the dataset from local by default.
         """
-        return self.load_from_hub(dataset_name_or_path, subset_list, work_dir, **kwargs)
+        # remove dataset_infos.json file if exists, since MsDataset will occur an error if it exists.
+        dataset_infos_path = os.path.join(dataset_name_or_path, 'dataset_infos.json')
+        if os.path.exists(dataset_infos_path):
+            logger.info(f'Removing dataset_infos.json file at {dataset_infos_path} to avoid MsDataset errors.')
+            os.remove(dataset_infos_path)
+        return self.load_from_hub(dataset_name_or_path, subset_list, None, **kwargs)

     def load_with_snapshot(self,
                            file_structure: Dict[str, List[str]],
@@ -382,7 +387,7 @@ class DataAdapter(ABC):
         pass

     def gen_prompt_data(self,
-                        prompt: str,
+                        prompt: str = '',
                         system_prompt: Optional[str] = None,
                         choices: Optional[List[str]] = None,
                         index: Optional[Union[int, str]] = None,
@@ -413,7 +418,8 @@ class DataAdapter(ABC):
             system_prompt=system_prompt or self.system_prompt,
             index=index or 0,
             id=id,
-            messages=messages)
+            messages=messages,
+            extra_data=kwargs.get('extra_data', None))
         return prompt_data.to_dict()

     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
@@ -449,7 +455,6 @@ class DataAdapter(ABC):
         """
         raise NotImplementedError

-    @abstractmethod
     def parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> Any:
         """
         Parse the predicted result and extract proper answer.
@@ -462,9 +467,22 @@ class DataAdapter(ABC):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        raise NotImplementedError
+        return result
+
+    def llm_parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> Any:
+        """
+        Parse the predicted result using LLM.
+
+        Args:
+            result (Any): The predicted answer from the model.
+            raw_input_d (dict): The raw input data.
+            eval_type (str): The evaluation type, default is 'checkpoint'.
+
+        Returns:
+            The parsed answer. Usually a string for chat.
+        """
+        return result

-    @abstractmethod
     def match(self, gold: Any, pred: Any) -> Any:
         """
         Match the gold answer and the predicted answer.
@@ -478,7 +496,7 @@ class DataAdapter(ABC):
         Returns:
             The match result. Usually a score (float) for chat/multiple-choice-questions.
         """
-        raise NotImplementedError
+        return 1.0 if gold == pred else 0.0

     def llm_match(self, gold: Any, pred: Any, judge: Optional[LLMJudge] = None, **kwargs) -> float:
         """
@@ -504,5 +522,7 @@

         # Request judge and obtain score
         prompt = judge.build_prompt(pred, gold, question)
-        score = judge(prompt)
-        return judge.get_score(score)
+        judge_response = judge(prompt)
+        score = judge.get_score(judge_response)
+
+        return score