evalscope 0.16.0__py3-none-any.whl → 0.16.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (114)
  1. evalscope/app/__init__.py +28 -0
  2. evalscope/{report → app}/app.py +40 -30
  3. evalscope/app/constants.py +21 -0
  4. evalscope/arguments.py +2 -1
  5. evalscope/backend/opencompass/backend_manager.py +2 -1
  6. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +23 -11
  7. evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
  8. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  9. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  10. evalscope/backend/rag_eval/utils/embedding.py +77 -39
  11. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +1 -0
  12. evalscope/benchmarks/aime/aime24_adapter.py +3 -1
  13. evalscope/benchmarks/aime/aime25_adapter.py +3 -1
  14. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +5 -0
  15. evalscope/benchmarks/arc/arc_adapter.py +3 -0
  16. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +7 -3
  17. evalscope/benchmarks/bbh/bbh_adapter.py +3 -0
  18. evalscope/benchmarks/benchmark.py +2 -0
  19. evalscope/benchmarks/bfcl/__init__.py +0 -0
  20. evalscope/benchmarks/bfcl/bfcl_adapter.py +237 -0
  21. evalscope/benchmarks/ceval/ceval_adapter.py +3 -0
  22. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +4 -1
  23. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +3 -0
  24. evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -0
  25. evalscope/benchmarks/data_adapter.py +99 -16
  26. evalscope/benchmarks/data_collection/data_collection_adapter.py +1 -0
  27. evalscope/benchmarks/docmath/__init__.py +0 -0
  28. evalscope/benchmarks/docmath/docmath_adapter.py +85 -0
  29. evalscope/benchmarks/docmath/utils.py +220 -0
  30. evalscope/benchmarks/drop/drop_adapter.py +3 -0
  31. evalscope/benchmarks/frames/__init__.py +0 -0
  32. evalscope/benchmarks/frames/frames_adapter.py +91 -0
  33. evalscope/benchmarks/frames/utils.py +37 -0
  34. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +19 -23
  35. evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -0
  36. evalscope/benchmarks/gpqa/gpqa_adapter.py +3 -0
  37. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -0
  38. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -0
  39. evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -0
  40. evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -0
  41. evalscope/benchmarks/iquiz/iquiz_adapter.py +3 -0
  42. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
  43. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +3 -0
  44. evalscope/benchmarks/math_500/math_500_adapter.py +3 -0
  45. evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -0
  46. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +3 -0
  47. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +3 -0
  48. evalscope/benchmarks/musr/musr_adapter.py +3 -0
  49. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  50. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +348 -0
  51. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  52. evalscope/benchmarks/process_bench/process_bench_adapter.py +3 -0
  53. evalscope/benchmarks/race/race_adapter.py +3 -0
  54. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +3 -0
  55. evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +1 -0
  56. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +21 -3
  57. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +1 -0
  58. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +9 -1
  59. evalscope/benchmarks/tool_bench/utils.py +5 -4
  60. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -0
  61. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +3 -0
  62. evalscope/benchmarks/utils.py +25 -0
  63. evalscope/benchmarks/winogrande/winogrande_adapter.py +3 -0
  64. evalscope/cli/start_app.py +2 -2
  65. evalscope/collections/__init__.py +35 -3
  66. evalscope/collections/evaluator.py +68 -34
  67. evalscope/config.py +8 -2
  68. evalscope/constants.py +1 -1
  69. evalscope/evaluator/evaluator.py +40 -28
  70. evalscope/metrics/__init__.py +3 -1
  71. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  72. evalscope/metrics/llm_judge.py +12 -5
  73. evalscope/metrics/math_parser.py +1 -1
  74. evalscope/metrics/t2v_metrics/__init__.py +9 -23
  75. evalscope/models/adapters/__init__.py +2 -0
  76. evalscope/models/adapters/base_adapter.py +31 -27
  77. evalscope/models/adapters/bfcl_adapter.py +244 -0
  78. evalscope/models/adapters/server_adapter.py +80 -23
  79. evalscope/models/custom/custom_model.py +0 -3
  80. evalscope/models/custom/dummy_model.py +77 -39
  81. evalscope/models/local_model.py +1 -1
  82. evalscope/models/register.py +2 -1
  83. evalscope/perf/arguments.py +4 -2
  84. evalscope/perf/benchmark.py +16 -12
  85. evalscope/perf/main.py +7 -0
  86. evalscope/perf/plugin/api/openai_api.py +2 -0
  87. evalscope/perf/plugin/datasets/custom.py +15 -0
  88. evalscope/perf/utils/benchmark_util.py +1 -1
  89. evalscope/perf/utils/local_server.py +1 -0
  90. evalscope/perf/utils/log_utils.py +12 -5
  91. evalscope/perf/utils/rich_display.py +1 -1
  92. evalscope/report/__init__.py +36 -4
  93. evalscope/report/combinator.py +40 -6
  94. evalscope/report/generator.py +33 -9
  95. evalscope/report/utils.py +84 -4
  96. evalscope/run.py +12 -0
  97. evalscope/summarizer.py +1 -1
  98. evalscope/utils/io_utils.py +59 -2
  99. evalscope/utils/logger.py +1 -1
  100. evalscope/utils/utils.py +12 -0
  101. evalscope/version.py +2 -2
  102. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/METADATA +16 -13
  103. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/RECORD +114 -100
  104. tests/aigc/test_t2i.py +48 -11
  105. tests/cli/test_all.py +14 -3
  106. tests/cli/test_collection.py +6 -4
  107. tests/cli/test_run.py +50 -25
  108. tests/rag/test_clip_benchmark.py +5 -1
  109. tests/rag/test_mteb.py +51 -7
  110. /evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
  111. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/LICENSE +0 -0
  112. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/WHEEL +0 -0
  113. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/entry_points.txt +0 -0
  114. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/top_level.txt +0 -0
evalscope/app/__init__.py ADDED
@@ -0,0 +1,28 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ from typing import TYPE_CHECKING
+
+ from evalscope.utils.import_utils import _LazyModule
+
+ if TYPE_CHECKING:
+     from .app import create_app
+     from .arguments import add_argument
+
+ else:
+     _import_structure = {
+         'app': [
+             'create_app',
+         ],
+         'arguments': [
+             'add_argument',
+         ],
+     }
+
+     import sys
+
+     sys.modules[__name__] = _LazyModule(
+         __name__,
+         globals()['__file__'],
+         _import_structure,
+         module_spec=__spec__,
+         extra_objects={},
+     )
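The new evalscope/app/__init__.py defers importing the heavy Gradio app module until create_app or add_argument is first accessed, while the TYPE_CHECKING branch keeps IDEs and type checkers aware of both names. As a rough sketch of the general pattern only (evalscope's actual _LazyModule lives in evalscope.utils.import_utils and may differ), a lazy module can be built on __getattr__:

import importlib
import types


class LazyModuleSketch(types.ModuleType):
    """Resolve exported attributes from submodules on first access (illustrative only)."""

    def __init__(self, name, import_structure):
        super().__init__(name)
        # map each exported attribute to the submodule that defines it
        self._attr_to_module = {
            attr: submodule
            for submodule, attrs in import_structure.items()
            for attr in attrs
        }

    def __getattr__(self, attr):
        submodule = self._attr_to_module.get(attr)
        if submodule is None:
            raise AttributeError(f'module {self.__name__!r} has no attribute {attr!r}')
        module = importlib.import_module(f'.{submodule}', self.__name__)
        value = getattr(module, attr)
        setattr(self, attr, value)  # cache so later lookups bypass __getattr__
        return value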
evalscope/{report → app}/app.py RENAMED
@@ -1,6 +1,7 @@
  import argparse
  import glob
  import gradio as gr
+ import json
  import numpy as np
  import os
  import pandas as pd
@@ -11,35 +12,15 @@ from dataclasses import dataclass
  from typing import Any, List, Union

  from evalscope.constants import DataCollection
- from evalscope.report import Report, ReportKey, add_argument, get_data_frame, get_report_list
+ from evalscope.report import Report, ReportKey, get_data_frame, get_report_list
  from evalscope.utils.io_utils import OutputsStructure, yaml_to_dict
  from evalscope.utils.logger import configure_logging, get_logger
  from evalscope.version import __version__
+ from .arguments import add_argument
+ from .constants import DATASET_TOKEN, LATEX_DELIMITERS, MODEL_TOKEN, PLOTLY_THEME, REPORT_TOKEN

  logger = get_logger()

- PLOTLY_THEME = 'plotly_dark'
- REPORT_TOKEN = '@@'
- MODEL_TOKEN = '::'
- DATASET_TOKEN = ', '
- LATEX_DELIMITERS = [{
-     'left': '$$',
-     'right': '$$',
-     'display': True
- }, {
-     'left': '$',
-     'right': '$',
-     'display': False
- }, {
-     'left': '\\(',
-     'right': '\\)',
-     'display': False
- }, {
-     'left': '\\[',
-     'right': '\\]',
-     'display': True
- }]
-

  def scan_for_report_folders(root_path):
      """Scan for folders containing reports subdirectories"""
@@ -155,11 +136,11 @@ def plot_single_report_scores(df: pd.DataFrame):

  def plot_single_report_sunburst(report_list: List[Report]):
      if report_list[0].name == DataCollection.NAME:
-         df = get_data_frame(report_list)
+         df = get_data_frame(report_list=report_list)
          categories = sorted([i for i in df.columns if i.startswith(ReportKey.category_prefix)])
          path = categories + [ReportKey.subset_name]
      else:
-         df = get_data_frame(report_list, flatten_metrics=False)
+         df = get_data_frame(report_list=report_list, flatten_metrics=False)
          categories = sorted([i for i in df.columns if i.startswith(ReportKey.category_prefix)])
          path = [ReportKey.dataset_name] + categories + [ReportKey.subset_name]
      logger.debug(f'df: {df}')
@@ -185,6 +166,13 @@ def get_single_dataset_df(df: pd.DataFrame, dataset_name: str):
      return df, styler


+ def get_report_analysis(report_list: List[Report], dataset_name: str) -> str:
+     for report in report_list:
+         if report.dataset_name == dataset_name:
+             return report.analysis
+     return 'N/A'
+
+
  def plot_single_dataset_scores(df: pd.DataFrame):
      # TODO: add metric radio and relace category name
      plot = px.bar(
@@ -246,7 +234,7 @@ def convert_html_tags(text):
  def process_string(string: str, max_length: int = 2048) -> str:
      string = convert_html_tags(string) # for display labels e.g.
      if max_length and len(string) > max_length:
-         return f'{string[:max_length // 2]}......{string[-max_length // 2:]}'
+         return f'{string[:max_length // 2]}...[truncate]...{string[-max_length // 2:]}'
      return string


@@ -270,7 +258,7 @@ def dict_to_markdown(data) -> str:
      return '\n\n'.join(markdown_lines)


- def process_model_prediction(item: Any, max_length: int = 2048) -> str:
+ def process_model_prediction_old(item: Any, max_length: int = 2048) -> str:
      """
      Process model prediction output into a formatted string.

@@ -294,6 +282,20 @@ def process_model_prediction(item: Any, max_length: int = 2048) -> str:
      return result


+ def process_model_prediction(item: Any, max_length: int = 4096) -> str:
+     if isinstance(item, (dict, list)):
+         result = json.dumps(item, ensure_ascii=False, indent=2)
+         result = f'```json\n{result}\n```'
+     else:
+         result = str(item)
+
+     # Apply HTML tag conversion and truncation only at the final output
+     if max_length is not None:
+         return process_string(result, max_length)
+
+     return result
+
+
  def normalize_score(score):
      try:
          if isinstance(score, bool):
@@ -456,6 +458,10 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
              'zh': '数据集分数',
              'en': 'Dataset Scores'
          },
+         'report_analysis': {
+             'zh': '报告智能分析',
+             'en': 'Report Intelligent Analysis'
+         },
          'dataset_scores_table': {
              'zh': '数据集分数表',
              'en': 'Dataset Scores Table'
@@ -511,6 +517,9 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
          with gr.Tab(locale_dict['dataset_details'][lang]):
              dataset_radio = gr.Radio(
                  label=locale_dict['select_dataset'][lang], choices=[], show_label=True, interactive=True)
+             # show dataset details
+             with gr.Accordion(locale_dict['report_analysis'][lang], open=True):
+                 report_analysis = gr.Markdown(value='N/A', show_copy_button=True)
              gr.Markdown(f'### {locale_dict["dataset_scores"][lang]}')
              dataset_plot = gr.Plot(value=None, scale=1, label=locale_dict['dataset_scores'][lang])
              gr.Markdown(f'### {locale_dict["dataset_scores_table"][lang]}')
@@ -586,15 +595,16 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
      @gr.on(
          triggers=[dataset_radio.change, report_list.change],
          inputs=[dataset_radio, report_list],
-         outputs=[dataset_plot, dataset_table, subset_select, data_review_df])
+         outputs=[dataset_plot, dataset_table, subset_select, data_review_df, report_analysis])
      def update_single_report_dataset(dataset_name, report_list):
          logger.debug(f'Updating single report dataset: {dataset_name}')
-         report_df = get_data_frame(report_list)
+         report_df = get_data_frame(report_list=report_list)
+         analysis = get_report_analysis(report_list, dataset_name)
          data_score_df, styler = get_single_dataset_df(report_df, dataset_name)
          data_score_plot = plot_single_dataset_scores(data_score_df)
          subsets = data_score_df[ReportKey.subset_name].unique().tolist()
          logger.debug(f'subsets: {subsets}')
-         return data_score_plot, styler, gr.update(choices=subsets, value=None), None
+         return data_score_plot, styler, gr.update(choices=subsets, value=None), None, analysis

      @gr.on(
          triggers=[subset_select.change],
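The replacement process_model_prediction renders dict and list predictions as indented JSON and raises the default truncation limit from 2048 to 4096 characters. A small standalone illustration of the structured branch (the final truncation via process_string is left out here; the payload is made up):

import json

item = {'answer': '42', 'steps': ['parse', 'solve']}  # illustrative prediction payload
result = json.dumps(item, ensure_ascii=False, indent=2)
result = f'```json\n{result}\n```'  # wrapped so the app renders it as a JSON block
print(result)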
evalscope/app/constants.py ADDED
@@ -0,0 +1,21 @@
+ PLOTLY_THEME = 'plotly_dark'
+ REPORT_TOKEN = '@@'
+ MODEL_TOKEN = '::'
+ DATASET_TOKEN = ', '
+ LATEX_DELIMITERS = [{
+     'left': '$$',
+     'right': '$$',
+     'display': True
+ }, {
+     'left': '$',
+     'right': '$',
+     'display': False
+ }, {
+     'left': '\\(',
+     'right': '\\)',
+     'display': False
+ }, {
+     'left': '\\[',
+     'right': '\\]',
+     'display': True
+ }]
evalscope/arguments.py CHANGED
@@ -67,7 +67,7 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument('--eval-config', type=str, required=False, help='The eval task config file path for evaluation backend.') # noqa: E501
      parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.',
                          choices=[EvalStage.ALL, EvalStage.INFER, EvalStage.REVIEW])
-     parser.add_argument('--limit', type=int, default=None, help='Max evaluation samples num for each subset.')
+     parser.add_argument('--limit', type=float, default=None, help='Max evaluation samples num for each subset.')
      parser.add_argument('--eval-batch-size', type=int, default=1, help='The batch size for evaluation.')

      # Cache and working directory arguments
@@ -89,6 +89,7 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument('--judge-strategy', type=str, default=JudgeStrategy.AUTO, help='The judge strategy.')
      parser.add_argument('--judge-model-args', type=json.loads, default='{}', help='The judge model args, should be a json string.') # noqa: E501
      parser.add_argument('--judge-worker-num', type=int, default=1, help='The number of workers for the judge model.')
+     parser.add_argument('--analysis-report', action='store_true', default=False, help='Generate analysis report for the evaluation results using judge model.') # noqa: E501
      # yapf: enable


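A quick sketch of how the two updated options parse; both add_argument calls are copied from the diff, while the sample values (and reading a fractional --limit as a subset ratio) are assumptions for illustration only:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--limit', type=float, default=None, help='Max evaluation samples num for each subset.')
parser.add_argument('--analysis-report', action='store_true', default=False, help='Generate analysis report for the evaluation results using judge model.')

args = parser.parse_args(['--limit', '0.5', '--analysis-report'])
assert args.limit == 0.5
assert args.analysis_report is True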
evalscope/backend/opencompass/backend_manager.py CHANGED
@@ -1,4 +1,5 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
+ import os
  import subprocess
  import tempfile
  from dataclasses import asdict
@@ -204,7 +205,7 @@ class OpenCompassBackendManager(BackendManager):
                  model_d['meta_template'] = get_template(model_d['meta_template'])

              # set the 'abbr' as the 'path' if 'abbr' is not specified
-             model_d['abbr'] = model_d['path']
+             model_d['abbr'] = os.path.basename(model_d['path'])

              model_config = ApiModelConfig(**model_d)
              models.append(asdict(model_config))
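The practical effect is that the OpenCompass model abbreviation becomes the last path component rather than the full path; the checkpoint path below is only an example:

import os

assert os.path.basename('/models/Qwen2.5-7B-Instruct') == 'Qwen2.5-7B-Instruct'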
evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py CHANGED
@@ -1,4 +1,5 @@
  import os
+ import posixpath # For URL path handling
  import torch
  from torch.utils.data import DataLoader
  from torch.utils.data import Dataset as TorchDataset
@@ -186,42 +187,53 @@ def build_wds_dataset(dataset_name, transform, split='test', data_dir='root', ca

      Set `cache_dir` to a path to cache the dataset, otherwise, no caching will occur.
      """
+     import requests
      import webdataset as wds

      def read_txt(fname):
-         if '://' in fname:
-             stream = os.popen("curl -L -s --fail '%s'" % fname, 'r')
-             value = stream.read()
-             if stream.close():
-                 raise FileNotFoundError('Failed to retreive data')
+         if fname.startswith(('http://', 'https://')):
+             try:
+                 response = requests.get(fname)
+                 response.raise_for_status() # Ensure the HTTP request was successful
+                 return response.text
+             except requests.exceptions.RequestException as e:
+                 raise FileNotFoundError(f'Failed to read {fname}: {e}')
          else:
              with open(fname, 'r') as file:
-                 value = file.read()
-         return value
+                 return file.read()
+
+     def url_path_join(*parts):
+         """Join URL path parts with forward slashes regardless of platform"""
+         return posixpath.join(*parts)

      if not data_dir:
          data_dir = f'https://modelscope.cn/datasets/clip-benchmark/wds_{dataset_name}/resolve/master'

      # Git LFS files have a different file path to access the raw data than other files
-     if data_dir.startswith('https://modelscope.cn/datasets'):
+     is_url = data_dir.startswith(('http://', 'https://'))
+     if is_url and data_dir.startswith('https://modelscope.cn/datasets'):
          *split_url_head, _, url_path = data_dir.split('/', 7)
          url_head = '/'.join(split_url_head)
          metadata_dir = '/'.join([url_head, 'resolve', url_path])
          tardata_dir = '/'.join([url_head, 'resolve', url_path])
      else:
          metadata_dir = tardata_dir = data_dir
+
+     # Use appropriate path joining function based on whether we're dealing with a URL
+     path_join = url_path_join if is_url else os.path.join
+
      # Get number of shards
-     nshards_fname = os.path.join(metadata_dir, split, 'nshards.txt')
+     nshards_fname = path_join(metadata_dir, split, 'nshards.txt')
      nshards = int(read_txt(nshards_fname)) # Do not catch FileNotFound, nshards.txt should be mandatory

      # Get dataset type (classification or retrieval)
-     type_fname = os.path.join(metadata_dir, 'dataset_type.txt')
+     type_fname = path_join(metadata_dir, 'dataset_type.txt')
      try:
          dataset_type = read_txt(type_fname).strip().lower()
      except FileNotFoundError:
          dataset_type = 'classification'

-     filepattern = os.path.join(tardata_dir, split, '{0..%d}.tar' % (nshards - 1))
+     filepattern = path_join(tardata_dir, split, '{0..%d}.tar' % (nshards - 1))
      # Load webdataset (support WEBP, PNG, and JPG for now)
      if not cache_dir or not isinstance(cache_dir, str):
          cache_dir = None
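The posixpath.join helper keeps URL separators correct on every platform, where os.path.join would splice backslashes into URLs on Windows; the dataset name below is a placeholder:

import posixpath

base = 'https://modelscope.cn/datasets/clip-benchmark/wds_cifar10/resolve/master'
print(posixpath.join(base, 'test', 'nshards.txt'))
# https://modelscope.cn/datasets/clip-benchmark/wds_cifar10/resolve/master/test/nshards.txt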
evalscope/backend/rag_eval/cmteb/arguments.py CHANGED
@@ -11,7 +11,9 @@ class ModelArguments:
      pooling_mode: Optional[str] = None
      max_seq_length: int = 512 # max sequence length
      # prompt for llm based model
-     prompt: str = ''
+     prompt: Optional[str] = None
+     # prompts dictionary for different tasks, if prompt is not set
+     prompts: Optional[Dict[str, str]] = None
      # model kwargs
      model_kwargs: dict = field(default_factory=dict)
      # config kwargs
@@ -33,6 +35,7 @@ class ModelArguments:
          'pooling_mode': self.pooling_mode,
          'max_seq_length': self.max_seq_length,
          'prompt': self.prompt,
+         'prompts': self.prompts,
          'model_kwargs': self.model_kwargs,
          'config_kwargs': self.config_kwargs,
          'encode_kwargs': self.encode_kwargs,
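The new prompts mapping is consumed by the embedding wrappers (see the embedding.py changes further below): an explicit prompt wins, otherwise the per-task entry is looked up, and only query-side encoding receives the instruction. A hedged, standalone restatement of that selection logic (the task name and instruction text are illustrative):

from typing import Dict, Optional


def select_prompt(prompt: Optional[str], prompts: Dict[str, str], task_name: str, prompt_type: str) -> Optional[str]:
    # mirrors BaseModel.get_prompt plus the PromptType.query gate used in encode()
    if prompt_type != 'query':
        return None
    if prompt:
        return prompt
    return prompts.get(task_name)


assert select_prompt(None, {'T2Retrieval': 'Instruct: retrieve passages. Query: '}, 'T2Retrieval', 'query') \
    == 'Instruct: retrieve passages. Query: '
assert select_prompt(None, {'T2Retrieval': 'Instruct: retrieve passages. Query: '}, 'T2Retrieval', 'passage') is None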
evalscope/backend/rag_eval/cmteb/task_template.py CHANGED
@@ -1,6 +1,6 @@
  import mteb
  import os
- from mteb.task_selection import results_to_dataframe
+ from tabulate import tabulate

  from evalscope.backend.rag_eval import EmbeddingModel, cmteb
  from evalscope.utils.logger import get_logger
@@ -12,14 +12,27 @@ def show_results(output_folder, model, results):
      model_name = model.mteb_model_meta.model_name_as_path()
      revision = model.mteb_model_meta.revision

-     results_df = results_to_dataframe({model_name: {revision: results}})
+     data = []
+     for model_res in results:
+         main_res = model_res.only_main_score()
+         for split, score in main_res.scores.items():
+             for sub_score in score:
+                 data.append({
+                     'Model': model_name.replace('eval__', ''),
+                     'Revision': revision,
+                     'Task Type': main_res.task_type,
+                     'Task': main_res.task_name,
+                     'Split': split,
+                     'Subset': sub_score['hf_subset'],
+                     'Main Score': sub_score['main_score'],
+                 })

      save_path = os.path.join(
          output_folder,
          model_name,
          revision,
      )
-     logger.info(f'Evaluation results:\n{results_df.to_markdown()}')
+     logger.info(f'Evaluation results:\n{tabulate(data, headers="keys", tablefmt="grid")}')
      logger.info(f'Evaluation results saved in {os.path.abspath(save_path)}')


@@ -34,6 +47,7 @@ def one_stage_eval(
      tasks = cmteb.TaskBase.get_tasks(task_names=eval_args['tasks'], dataset_path=custom_dataset_path)
      evaluation = mteb.MTEB(tasks=tasks)

+     eval_args['encode_kwargs'] = model_args.get('encode_kwargs', {})
      # run evaluation
      results = evaluation.run(model, **eval_args)

@@ -66,6 +80,7 @@ def two_stage_eval(
          overwrite_results=True,
          hub=eval_args['hub'],
          limits=eval_args['limits'],
+         encode_kwargs=model1_args.get('encode_kwargs', {}),
      )
      # stage 2: run cross encoder
      results = evaluation.run(
@@ -77,6 +92,7 @@ def two_stage_eval(
          overwrite_results=True,
          hub=eval_args['hub'],
          limits=eval_args['limits'],
+         encode_kwargs=model2_args.get('encode_kwargs', {}),
      )

      # save and log results
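With mteb's results_to_dataframe helper gone, results are logged straight through tabulate; a tiny illustration of the same call on a made-up row:

from tabulate import tabulate

data = [{'Task': 'T2Retrieval', 'Split': 'dev', 'Subset': 'default', 'Main Score': 0.71}]  # dummy data
print(tabulate(data, headers='keys', tablefmt='grid'))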
evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py CHANGED
@@ -9,7 +9,6 @@ class CustomRetrieval(AbsTaskRetrieval):
      ignore_identical_ids: bool = True

      def __init__(self, dataset_path: Optional[str] = 'custom_eval/text/retrieval', **kwargs):
-         super().__init__(**kwargs)
          self.metadata = TaskMetadata(
              name='CustomRetrieval',
              description='CustomRetrieval Task',
@@ -34,6 +33,7 @@ class CustomRetrieval(AbsTaskRetrieval):
              bibtex_citation='',
              descriptive_stats={},
          )
+         super().__init__(**kwargs)

      def load_data(self, **kwargs):
          if self.data_loaded:
evalscope/backend/rag_eval/utils/embedding.py CHANGED
@@ -2,6 +2,7 @@ import os
  import torch
  from langchain_core.embeddings import Embeddings
  from langchain_openai.embeddings import OpenAIEmbeddings
+ from mteb.encoder_interface import PromptType
  from sentence_transformers import models
  from sentence_transformers.cross_encoder import CrossEncoder
  from sentence_transformers.SentenceTransformer import SentenceTransformer
@@ -12,6 +13,7 @@ from typing import Dict, List, Optional, Union
  from evalscope.backend.rag_eval.utils.tools import download_model
  from evalscope.constants import HubType
  from evalscope.utils.logger import get_logger
+ from evalscope.utils.utils import get_supported_params

  logger = get_logger()

@@ -22,14 +24,14 @@ class BaseModel(Embeddings):
          self,
          model_name_or_path: str = '',
          max_seq_length: int = 512,
-         prompt: str = '',
+         prompt: Optional[str] = None,
+         prompts: Optional[Dict[str, str]] = None,
          revision: Optional[str] = 'master',
          **kwargs,
      ):
          self.model_name_or_path = model_name_or_path
          self.max_seq_length = max_seq_length
          self.model_kwargs = kwargs.pop('model_kwargs', {})
-         self.model_kwargs['trust_remote_code'] = True

          self.config_kwargs = kwargs.pop('config_kwargs', {})
          self.config_kwargs['trust_remote_code'] = True
@@ -38,7 +40,9 @@ class BaseModel(Embeddings):
          self.encode_kwargs['convert_to_tensor'] = True

          self.prompt = prompt
+         self.prompts = prompts if prompts else {}
          self.revision = revision
+         self.framework = ['PyTorch']

      @property
      def mteb_model_meta(self):
@@ -46,10 +50,22 @@ class BaseModel(Embeddings):
          from mteb import ModelMeta

          return ModelMeta(
-             name=os.path.basename(self.model_name_or_path),
+             name='eval/' + os.path.basename(self.model_name_or_path), # Ensure the name contains a slash
              revision=self.revision,
              languages=None,
              release_date=None,
+             n_parameters=None,
+             memory_usage_mb=None,
+             max_tokens=None,
+             embed_dim=None,
+             license=None,
+             open_weights=None,
+             public_training_code=None,
+             public_training_data=None,
+             similarity_fn_name=None,
+             use_instructions=None,
+             training_datasets=None,
+             framework=self.framework,
          )

      def embed_documents(self, texts: List[str]) -> List[List[float]]:
@@ -61,7 +77,7 @@ class BaseModel(Embeddings):
          Returns:
              List of embeddings.
          """
-         return self.encode_corpus(texts).tolist()
+         return self.encode(texts).tolist()

      def embed_query(self, text: str) -> List[float]:
          """Embed query text. Compact langchain.
@@ -72,19 +88,17 @@ class BaseModel(Embeddings):
          Returns:
              Embedding.
          """
-         return self.encode_queries(text).tolist()
+         return self.encode(text).tolist()

      def encode(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]:
          """Embed text."""
          raise NotImplementedError

-     def encode_queries(self, queries: List[str], **kwargs) -> list[torch.Tensor]:
-         """Embed query text. Compact mteb."""
-         raise NotImplementedError
-
-     def encode_corpus(self, corpus: Union[List[str], List[Dict[str, str]]], **kwargs) -> list[torch.Tensor]:
-         """Embed search docs . Compact mteb."""
-         raise NotImplementedError
+     def get_prompt(self, task_name: str) -> Optional[str]:
+         """Get prompt for the given task name."""
+         if self.prompt:
+             return self.prompt
+         return self.prompts.get(task_name, None)


  class SentenceTransformerModel(BaseModel):
@@ -92,6 +106,9 @@ class SentenceTransformerModel(BaseModel):
      def __init__(self, model_name_or_path: str, pooling_mode: Optional[str] = None, **kwargs):
          super().__init__(model_name_or_path, **kwargs)

+         self.framework = ['Sentence Transformers', 'PyTorch']
+
+         self.model_kwargs['trust_remote_code'] = True
      if not pooling_mode:
          self.model = SentenceTransformer(
              self.model_name_or_path,
@@ -112,43 +129,52 @@ class SentenceTransformerModel(BaseModel):

          self.model.max_seq_length = self.max_seq_length

-     def encode(self, texts: Union[str, List[str]], prompt=None, **kwargs) -> List[torch.Tensor]:
-         kwargs.pop('prompt_name', '') # remove prompt name, use prompt
+         self.supported_encode_params = get_supported_params(self.model.encode)
+
+     def encode(self, texts: Union[str, List[str]], **kwargs) -> List[torch.Tensor]:
+         # pop unused kwargs
+         extra_params = {}
+         for key in list(kwargs.keys()):
+             if key not in self.supported_encode_params:
+                 extra_params[key] = kwargs.pop(key)
          self.encode_kwargs.update(kwargs)

+         # set prompt if provided
+         prompt = None
+         prompt_type = extra_params.pop('prompt_type', '')
+         task_name = extra_params.pop('task_name', '')
+         if prompt_type and prompt_type == PromptType.query:
+             prompt = self.get_prompt(task_name)
+
          embeddings = self.model.encode(texts, prompt=prompt, **self.encode_kwargs)
          assert isinstance(embeddings, Tensor)
          return embeddings.cpu().detach()

-     def encode_queries(self, queries, **kwargs):
-         return self.encode(queries, prompt=self.prompt)
-
-     def encode_corpus(self, corpus, **kwargs):
-         if isinstance(corpus[0], dict):
-             input_texts = ['{} {}'.format(doc.get('title', ''), doc['text']).strip() for doc in corpus]
-         else:
-             input_texts = corpus
-         return self.encode(input_texts)
-


  class CrossEncoderModel(BaseModel):

      def __init__(self, model_name_or_path: str, **kwargs):
          super().__init__(model_name_or_path, **kwargs)
+
+         self.framework = ['Sentence Transformers', 'PyTorch']
+
          self.model = CrossEncoder(
              self.model_name_or_path,
              trust_remote_code=True,
              max_length=self.max_seq_length,
+             automodel_args=self.model_kwargs,
          )
+         self.supported_encode_params = get_supported_params(self.model.predict)

      def predict(self, sentences: List[List[str]], **kwargs) -> Tensor:
+         for key in list(kwargs.keys()):
+             if key not in self.supported_encode_params:
+                 kwargs.pop(key)
          self.encode_kwargs.update(kwargs)

-         if len(sentences[0]) == 3: # Note: For mteb retrieval task
+         if len(sentences[0]) == 2: # Note: For mteb retrieval task
              processed_sentences = []
-             for query, docs, instruction in sentences:
-                 if isinstance(docs, dict):
-                     docs = docs['text']
+             for query, docs in sentences:
                  processed_sentences.append((self.prompt + query, docs))
              sentences = processed_sentences
          embeddings = self.model.predict(sentences, **self.encode_kwargs)
@@ -163,6 +189,7 @@ class APIEmbeddingModel(BaseModel):
          self.openai_api_base = kwargs.get('api_base')
          self.openai_api_key = kwargs.get('api_key')
          self.dimensions = kwargs.get('dimensions')
+         self.framework = ['API']

          self.model = OpenAIEmbeddings(
              model=self.model_name,
@@ -175,26 +202,37 @@ class APIEmbeddingModel(BaseModel):

          self.batch_size = self.encode_kwargs.get('batch_size', 10)

+         self.supported_encode_params = get_supported_params(self.model.embed_documents)
+
      def encode(self, texts: Union[str, List[str]], **kwargs) -> Tensor:
+         # pop unused kwargs
+         extra_params = {}
+         for key in list(kwargs.keys()):
+             if key not in self.supported_encode_params:
+                 extra_params[key] = kwargs.pop(key)
+         self.encode_kwargs.update(kwargs)
+
+         # set prompt if provided
+         prompt = None
+         prompt_type = extra_params.pop('prompt_type', '')
+         task_name = extra_params.pop('task_name', '')
+         if prompt_type and prompt_type == PromptType.query:
+             prompt = self.get_prompt(task_name)
+
          if isinstance(texts, str):
              texts = [texts]

          embeddings: List[List[float]] = []
          for i in tqdm(range(0, len(texts), self.batch_size)):
-             response = self.model.embed_documents(texts[i:i + self.batch_size], chunk_size=self.batch_size)
+             # set prompt if provided
+             if prompt is not None:
+                 batch_texts = [prompt + text for text in texts[i:i + self.batch_size]]
+             else:
+                 batch_texts = texts[i:i + self.batch_size]
+             response = self.model.embed_documents(batch_texts, chunk_size=self.batch_size)
              embeddings.extend(response)
          return torch.tensor(embeddings)

-     def encode_queries(self, queries, **kwargs):
-         return self.encode(queries, **kwargs)
-
-     def encode_corpus(self, corpus, **kwargs):
-         if isinstance(corpus[0], dict):
-             input_texts = ['{} {}'.format(doc.get('title', ''), doc['text']).strip() for doc in corpus]
-         else:
-             input_texts = corpus
-         return self.encode(input_texts, **kwargs)
-

  class EmbeddingModel:
      """Custom embeddings"""
evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py CHANGED
@@ -69,6 +69,7 @@ class EvalMuseAdapter(T2IBaseAdapter):
              if 'FGA_BLIP2Score' in metric_name and '(' in metric_name: # FGA_BLIP2Score element score
                  metrics_prefix = metric_name.split(':')[0]
                  category = metric_name.rpartition('(')[-1].split(')')[0]
+                 category = category.split('-')[0].lower() # remove the suffix if exists
                  new_items[f'{metrics_prefix}:{category}'].extend(value_list)
              else:
                  new_items[metric_name].extend(value_list)
evalscope/benchmarks/aime/aime24_adapter.py CHANGED
@@ -1,5 +1,4 @@
  from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import OutputType
  from evalscope.metrics import extract_answer, math_equal, strip_answer_string
  from evalscope.utils.logger import get_logger

@@ -11,6 +10,9 @@ logger = get_logger()
  @Benchmark.register(
      name='aime24',
      pretty_name='AIME-2024',
+     tags=['Mathematics'],
+     description=
+     'The AIME 2024 benchmark is based on problems from the American Invitational Mathematics Examination, a prestigious high school mathematics competition. This benchmark tests a model’s ability to solve challenging mathematics problems by generating step-by-step solutions and providing the correct final answer.', # noqa: E501
      dataset_id='HuggingFaceH4/aime_2024',
      subset_list=['default'],
      metric_list=['AveragePass@1'],