crfm-helm 0.5.2__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.


This version of crfm-helm might be problematic.

Files changed (184)
  1. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +29 -55
  2. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +146 -134
  3. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
  5. helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
  6. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
  7. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
  8. helm/benchmark/adaptation/common_adapter_specs.py +2 -0
  9. helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
  10. helm/benchmark/annotation/call_center_annotator.py +247 -0
  11. helm/benchmark/annotation/financebench_annotator.py +79 -0
  12. helm/benchmark/annotation/harm_bench_annotator.py +68 -0
  13. helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
  14. helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
  15. helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
  16. helm/benchmark/annotation/live_qa_annotator.py +32 -45
  17. helm/benchmark/annotation/medication_qa_annotator.py +31 -44
  18. helm/benchmark/annotation/model_as_judge.py +45 -0
  19. helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
  20. helm/benchmark/annotation/xstest_annotator.py +110 -0
  21. helm/benchmark/metrics/annotation_metrics.py +108 -0
  22. helm/benchmark/metrics/bhasa_metrics.py +188 -0
  23. helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
  24. helm/benchmark/metrics/code_metrics_helper.py +11 -1
  25. helm/benchmark/metrics/safety_metrics.py +57 -0
  26. helm/benchmark/metrics/summac/model_summac.py +3 -3
  27. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
  28. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
  29. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  30. helm/benchmark/metrics/vision_language/image_utils.py +1 -1
  31. helm/benchmark/model_metadata_registry.py +3 -3
  32. helm/benchmark/presentation/test_run_entry.py +1 -0
  33. helm/benchmark/run.py +15 -0
  34. helm/benchmark/run_expander.py +56 -30
  35. helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
  36. helm/benchmark/run_specs/call_center_run_specs.py +152 -0
  37. helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
  38. helm/benchmark/run_specs/experimental_run_specs.py +52 -0
  39. helm/benchmark/run_specs/finance_run_specs.py +78 -1
  40. helm/benchmark/run_specs/safety_run_specs.py +154 -0
  41. helm/benchmark/run_specs/vlm_run_specs.py +92 -21
  42. helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
  43. helm/benchmark/scenarios/banking77_scenario.py +51 -0
  44. helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
  45. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  46. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
  47. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  48. helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
  49. helm/benchmark/scenarios/financebench_scenario.py +53 -0
  50. helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
  51. helm/benchmark/scenarios/scenario.py +1 -1
  52. helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
  53. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  54. helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
  55. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  56. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  57. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  58. helm/benchmark/scenarios/test_math_scenario.py +2 -8
  59. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  60. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  61. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  62. helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
  63. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
  64. helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
  65. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
  66. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  67. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  68. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
  69. helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
  70. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
  71. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
  72. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
  73. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
  74. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
  75. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
  76. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
  77. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
  78. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
  79. helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
  80. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
  81. helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
  82. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
  83. helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
  84. helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
  85. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  86. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
  87. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
  88. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
  89. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
  90. helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
  91. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  92. helm/benchmark/server.py +1 -6
  93. helm/benchmark/static/schema_air_bench.yaml +750 -750
  94. helm/benchmark/static/schema_bhasa.yaml +709 -0
  95. helm/benchmark/static/schema_call_center.yaml +232 -0
  96. helm/benchmark/static/schema_cleva.yaml +768 -0
  97. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  98. helm/benchmark/static/schema_ewok.yaml +367 -0
  99. helm/benchmark/static/schema_finance.yaml +55 -9
  100. helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
  101. helm/benchmark/static/schema_safety.yaml +247 -0
  102. helm/benchmark/static/schema_tables.yaml +124 -7
  103. helm/benchmark/static/schema_thai.yaml +21 -0
  104. helm/benchmark/static/schema_vhelm.yaml +96 -91
  105. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  106. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  107. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  108. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  109. helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
  110. helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
  111. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  112. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  113. helm/benchmark/static_build/index.html +2 -2
  114. helm/benchmark/window_services/test_openai_window_service.py +8 -8
  115. helm/clients/ai21_client.py +71 -1
  116. helm/clients/anthropic_client.py +7 -19
  117. helm/clients/huggingface_client.py +38 -37
  118. helm/clients/nvidia_nim_client.py +35 -0
  119. helm/clients/openai_client.py +2 -3
  120. helm/clients/palmyra_client.py +25 -0
  121. helm/clients/perspective_api_client.py +11 -6
  122. helm/clients/test_client.py +4 -6
  123. helm/clients/vision_language/open_flamingo_client.py +1 -2
  124. helm/clients/vision_language/palmyra_vision_client.py +28 -13
  125. helm/common/images_utils.py +6 -0
  126. helm/common/mongo_key_value_store.py +2 -1
  127. helm/common/request.py +16 -0
  128. helm/config/model_deployments.yaml +315 -332
  129. helm/config/model_metadata.yaml +384 -110
  130. helm/config/tokenizer_configs.yaml +116 -11
  131. helm/proxy/example_queries.py +14 -21
  132. helm/proxy/services/server_service.py +1 -2
  133. helm/proxy/token_counters/test_auto_token_counter.py +2 -2
  134. helm/tokenizers/ai21_tokenizer.py +51 -59
  135. helm/tokenizers/cohere_tokenizer.py +0 -75
  136. helm/tokenizers/huggingface_tokenizer.py +0 -1
  137. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  138. helm/benchmark/static/benchmarking.css +0 -156
  139. helm/benchmark/static/benchmarking.js +0 -1705
  140. helm/benchmark/static/config.js +0 -3
  141. helm/benchmark/static/general.js +0 -122
  142. helm/benchmark/static/images/crfm-logo.png +0 -0
  143. helm/benchmark/static/images/helm-logo-simple.png +0 -0
  144. helm/benchmark/static/images/helm-logo.png +0 -0
  145. helm/benchmark/static/images/language-model-helm.png +0 -0
  146. helm/benchmark/static/images/organizations/ai21.png +0 -0
  147. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  148. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  149. helm/benchmark/static/images/organizations/cohere.png +0 -0
  150. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  151. helm/benchmark/static/images/organizations/google.png +0 -0
  152. helm/benchmark/static/images/organizations/meta.png +0 -0
  153. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  154. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  155. helm/benchmark/static/images/organizations/openai.png +0 -0
  156. helm/benchmark/static/images/organizations/together.png +0 -0
  157. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  158. helm/benchmark/static/images/organizations/yandex.png +0 -0
  159. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  160. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  161. helm/benchmark/static/index.html +0 -68
  162. helm/benchmark/static/info-icon.png +0 -0
  163. helm/benchmark/static/json-urls.js +0 -69
  164. helm/benchmark/static/plot-captions.js +0 -27
  165. helm/benchmark/static/utils.js +0 -285
  166. helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
  167. helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
  168. helm/benchmark/window_services/ai21_window_service.py +0 -247
  169. helm/benchmark/window_services/cohere_window_service.py +0 -101
  170. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  171. helm/benchmark/window_services/test_cohere_window_service.py +0 -75
  172. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  173. helm/benchmark/window_services/test_ice_window_service.py +0 -327
  174. helm/tokenizers/ice_tokenizer.py +0 -30
  175. helm/tokenizers/test_ice_tokenizer.py +0 -57
  176. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
  177. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
  178. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
  179. /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
  180. /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
  181. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
  182. /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
  183. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
  184. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py CHANGED
@@ -1,24 +1,26 @@
-from typing import Dict, List, Any
+from typing import Dict, List, Any, Optional
 
+from helm.benchmark.annotation.image2struct.image_compiler_annotator import CompilationError
 from helm.benchmark.scenarios.scenario import VALID_SPLIT
-from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import (
+from helm.benchmark.scenarios.vision_language.image2struct.image2struct_scenario import (
     Image2StructureScenario,
     PROCESSED,
     DIFFICULTY_ALL,
 )
-from helm.benchmark.scenarios.vision_language.image2structure.webpage.jekyll_server import JekyllServer
-from helm.benchmark.scenarios.vision_language.image2structure.webpage.driver import (
+from helm.benchmark.scenarios.vision_language.image2struct.webpage.jekyll_server import JekyllServer
+from helm.benchmark.scenarios.vision_language.image2struct.webpage.driver import (
     save_random_screenshot,
     ScreenshotOptions,
 )
-from helm.benchmark.scenarios.vision_language.image2structure.webpage.utils import convert_html_to_text
+from helm.benchmark.scenarios.vision_language.image2struct.webpage.utils import convert_html_to_text
 from helm.common.general import ensure_directory_exists
 from helm.common.optional_dependencies import handle_module_not_found_error
+from helm.common.hierarchical_logger import hlog
 
 try:
     from html2text import HTML2Text
 except ModuleNotFoundError as e:
-    handle_module_not_found_error(e, suggestions=["image2structure"])
+    handle_module_not_found_error(e, suggestions=["image2struct"])
 
 
 import base64
@@ -73,28 +75,48 @@ def serve_and_take_screenshot(
     if not success:
         # This runs on examples that are not expected to fail
         server.stop()
+        hlog(f"Failed to start the Jekyll server: {repo_path} on port {port}. Will raise a ValueError.")
         raise ValueError(f"Jekyll server failed to start: {repo_path}")
 
     # Take a screenshot of a random page
     success = False
-    error: Exception
-    for _ in range(max_tries):
+    error: Optional[Exception] = None
+
+    MAX_TRIES_ALL_ERRORS = 3
+    MAX_TRIES_CONNECTION_REFUSED = 5
+    MAX_TRIES = max(MAX_TRIES_ALL_ERRORS, MAX_TRIES_CONNECTION_REFUSED)
+    for compilation_attempt in range(MAX_TRIES):
         try:
             infos: Dict[str, Any] = save_random_screenshot(destination_path, port=port, options=screenshot_options)
             success = True
             break
         except Exception as e:
-            if "net::ERR_CONNECTION_REFUSED" in str(e):
-                error = e
+            error = e
+
+            if "net::ERR_CONNECTION_REFUSED" in str(e) and compilation_attempt < MAX_TRIES_CONNECTION_REFUSED:
+                hlog(
+                    f"Failed to take a screenshot: ERR_CONNECTION_REFUSED [Attempt {compilation_attempt + 1}/"
+                    f"{MAX_TRIES_CONNECTION_REFUSED}]. Error: {e}. Retrying..."
+                )
                 server.stop()
                 time.sleep(0.5)
                 server.start()
                 time.sleep(0.5)
+            elif compilation_attempt < MAX_TRIES_ALL_ERRORS:
+                hlog(
+                    f"Failed to take a screenshot: Unknown [Attempt {compilation_attempt + 1}/{MAX_TRIES_ALL_ERRORS}]."
+                    f" Error: {e}. Retrying..."
+                )
             else:
                 # Do not retry
+                hlog(
+                    f"Failed to take a screenshot: Unknown [Attempt {compilation_attempt + 1}/{MAX_TRIES_ALL_ERRORS}]."
+                    f" Error: {e}. Raising CompilationError."
+                )
                 break
+
     if not success:
-        raise ValueError(f"Failed to take a screenshot: {error}")
+        raise CompilationError(f"Failed to take a screenshot: {error}")
 
     # Stop the server
     server.stop()
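
The hunk above replaces the single retry budget (`max_tries`) with two tiers: a `net::ERR_CONNECTION_REFUSED` failure, which usually just means the Jekyll server has not finished starting, gets up to five attempts with a server restart in between, while any other error is retried at most three times before a `CompilationError` is raised. A minimal standalone sketch of that policy, with hypothetical `take_screenshot` and `restart_server` callables standing in for the real driver and `JekyllServer` calls, and a plain `RuntimeError` standing in for `CompilationError`:

    import time
    from typing import Callable, Optional

    MAX_TRIES_ALL_ERRORS = 3
    MAX_TRIES_CONNECTION_REFUSED = 5


    def screenshot_with_retries(take_screenshot: Callable[[], None], restart_server: Callable[[], None]) -> None:
        """Illustrative two-tier retry policy; a sketch, not the HELM implementation itself."""
        error: Optional[Exception] = None
        for attempt in range(max(MAX_TRIES_ALL_ERRORS, MAX_TRIES_CONNECTION_REFUSED)):
            try:
                take_screenshot()
                return
            except Exception as e:
                error = e
                if "net::ERR_CONNECTION_REFUSED" in str(e) and attempt < MAX_TRIES_CONNECTION_REFUSED:
                    # The server is probably still starting up: bounce it and try again.
                    restart_server()
                    time.sleep(0.5)
                elif attempt < MAX_TRIES_ALL_ERRORS:
                    # Unknown error: retry, but with a smaller budget.
                    continue
                else:
                    # Out of retries for unknown errors.
                    break
        raise RuntimeError(f"Failed to take a screenshot: {error}")

Keeping the connection-refused budget larger than the general budget tolerates transient startup races without also retrying genuinely broken pages five times.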
@@ -129,7 +151,7 @@ class WebpageScenario(Image2StructureScenario):
     )
 
     HUGGINGFACE_DATASET_NAME = "stanford-crfm/i2s-webpage"
-    SUBSETS = ["css", "html", "javascript", "real"]
+    SUBSETS = ["css", "html", "javascript", "wild", "wild_legacy"]
     MAX_TRIES: int = 5
     ASSETS_EXTENSIONS: List[str] = ["png", "jpg", "jpeg", "gif", "svg", "webp", "ico", "bmp", "tiff"]
 
@@ -167,6 +189,13 @@ class WebpageScenario(Image2StructureScenario):
            shutil.rmtree(assets_save_path)
        ensure_directory_exists(assets_save_path)
 
+       if "wild" in self._subset:
+           # There is no stucture
+           del row["assets"]
+           row["assets_paths"] = []
+           row["assets_names"] = []
+           return row
+
        # Structure is a base64 encoding of the repo
        if self._output_path is None:
            raise ValueError("Output path not set")
helm/benchmark/scenarios/vision_language/math_vista_scenario.py CHANGED
@@ -51,7 +51,7 @@ class MathVistaScenario(Scenario):
     name = "math_vista"
     description = (
         "A benchmark designed to combine challenges from diverse mathematical and visual tasks. "
-        "([paper](https://arxiv.org/abs/2310.02255))."
+        "([Lu et al., 2024](https://arxiv.org/abs/2310.02255))."
     )
     tags = ["vision-language", "reasoning", "math"]
 
helm/benchmark/scenarios/vision_language/mementos_scenario.py CHANGED
@@ -38,10 +38,10 @@ class MementosScenario(Scenario):
     Paper: https://arxiv.org/abs/2401.10529
     """
 
-    MEMENTOS_HUGGINGFACE_DATASET_NAME: str = "shenmishajing/unofficial_mementos_dataset"
+    MEMENTOS_HUGGINGFACE_DATASET_NAME: str = "RussWang96/unofficial_mementos_dataset"
 
     IMAGE_URL: str = (
-        "https://huggingface.co/datasets/shenmishajing/unofficial_mementos_dataset/resolve/main/"
+        "https://huggingface.co/datasets/RussWang96/unofficial_mementos_dataset/resolve/main/"
         + "{subject}/{split}/{file_name}?download=true"
     )
 
@@ -56,7 +56,7 @@ class MementosScenario(Scenario):
     name = "mementos"
     description = (
         "A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences"
-        " ([paper](https://arxiv.org/abs/2401.10529))."
+        " ([Wang et al., 2024](https://arxiv.org/abs/2401.10529))."
     )
     tags = ["vision-language"]
 
helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py CHANGED
@@ -48,14 +48,14 @@ class MMSafetyBenchScenario(Scenario):
     }
 
     QUESTIONS_URL_TEMPLATE: str = (
-        "https://raw.githubusercontent.com/isXinLiu/MM-SafetyBench/main/data/" "processed_questions/{dataset}.json"
+        "https://raw.githubusercontent.com/isXinLiu/MM-SafetyBench/main/data/processed_questions/{dataset}.json"
     )
     IMAGES_URL: str = "https://drive.google.com/uc?export=download&id=1xjW9k-aGkmwycqGCXbru70FaSKhSDcR_"
 
     name = "mm_safety_bench"
     description = (
         "Expose the vulnerability of open-source VLMs with toxic and biased content "
-        "([paper](https://arxiv.org/abs/2311.17600))."
+        "([Liu et al., 2023](https://arxiv.org/abs/2311.17600))."
     )
     tags = ["vision-language", "bias", "toxicity"]
 
helm/benchmark/scenarios/vision_language/mme_scenario.py CHANGED
@@ -19,22 +19,22 @@ from helm.common.general import ensure_directory_exists
 
 class MMEScenario(Scenario):
     """
-    MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models
-
-    Multimodal Large Language Model (MLLM) relies on the powerful LLM to perform
-    multimodal tasks, showing amazing emergent abilities in recent studies. However,
-    it is difficult for these case studies to fully reflect the performance of MLLM,
-    lacking a comprehensive evaluation. In MME, we fill in this blank, presenting
-    the first comprehensive MLLM Evaluation benchmark MME. It measures both perception
-    and cognition abilities on a total of 14 subtasks. In order to avoid data leakage
-    that may arise from direct use of public datasets for evaluation, the annotations
-    of instruction-answer pairs are all manually designed. The concise instruction design
-    allows us to fairly compare MLLMs, instead of struggling in prompt engineering.
-    Besides, with such an instruction, we can also easily carry out quantitative
-    statistics. We rephrase the answer type of MME to multiple-choice question-answering.
-    We use the multiple-choice metrics for 14 different evaluation tasks.
-
-    @article{fu2023mme,
+    MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models
+
+    Multimodal Large Language Model (MLLM) relies on the powerful LLM to perform
+    multimodal tasks, showing amazing emergent abilities in recent studies. However,
+    it is difficult for these case studies to fully reflect the performance of MLLM,
+    lacking a comprehensive evaluation. In MME, we fill in this blank, presenting
+    the first comprehensive MLLM Evaluation benchmark MME. It measures both perception
+    and cognition abilities on a total of 14 subtasks. In order to avoid data leakage
+    that may arise from direct use of public datasets for evaluation, the annotations
+    of instruction-answer pairs are all manually designed. The concise instruction design
+    allows us to fairly compare MLLMs, instead of struggling in prompt engineering.
+    Besides, with such an instruction, we can also easily carry out quantitative
+    statistics. We rephrase the answer type of MME to multiple-choice question-answering.
+    We use the multiple-choice metrics for 14 different evaluation tasks.
+
+    @article{fu2023mme,
     title={MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models},
     author={Fu, Chaoyou and Chen, Peixian and Shen, Yunhang and Qin, Yulei and
     Zhang, Mengdan and Lin, Xu and Yang, Jinrui and Zheng, Xiawu and Li, Ke and
@@ -43,7 +43,7 @@ class MMEScenario(Scenario):
     year={2023}
     }
 
-    Paper: https://arxiv.org/abs/2306.13394
+    Paper: https://arxiv.org/abs/2306.13394
     """
 
     MME_HUGGINGFACE_DATASET_NAME: str = "lmms-lab/MME"
@@ -66,7 +66,10 @@ class MMEScenario(Scenario):
     ]
 
     name = "mme"
-    description = "Evaluate multimodal models on ([paper](https://arxiv.org/abs/2306.13394))."
+    description = (
+        "Evaluate multimodal models on their perception and cognition abilities on a total of 14 subtasks "
+        "([Fu et al., 2023](https://arxiv.org/abs/2306.13394))."
+    )
     tags = ["vision-language"]
     options: List[str] = ["Yes", "No"]
 
helm/benchmark/scenarios/vision_language/mmmu_scenario.py CHANGED
@@ -81,7 +81,7 @@ class MMMUScenario(Scenario):
     name = "mmmu"
     description = (
         "Evaluate multimodal models on massive multi-discipline tasks demanding college-level "
-        "subject knowledge and deliberate reasoning ([paper](https://arxiv.org/abs/2311.16502))."
+        "subject knowledge and deliberate reasoning ([Yue et al., 2023](https://arxiv.org/abs/2311.16502))."
     )
     tags = ["vision-language"]
 
helm/benchmark/scenarios/vision_language/pairs_scenario.py CHANGED
@@ -186,7 +186,7 @@ class PAIRSScenario(Scenario):
     name = "pairs"
     description = (
         "Examining gender and racial bias in VLMs Using a Novel Dataset of Parallel Images. "
-        "([paper](https://arxiv.org/abs/2402.05779))."
+        "([Fraser et al., 2024](https://arxiv.org/abs/2402.05779))."
     )
     tags = ["vision-language", "bias"]
 
helm/benchmark/scenarios/vision_language/pope_scenario.py CHANGED
@@ -42,7 +42,8 @@ class POPEScenario(Scenario):
 
     name = "pope"
     description = (
-        "Open-ended questions about hallucination images ([paper](https://aclanthology.org/2023.emnlp-main.20/))."
+        "Open-ended questions about hallucination images "
+        "([Li et al., 2023](https://aclanthology.org/2023.emnlp-main.20/))."
     )
     tags = ["vision-language", "visual question answering"]
     options: List[str] = ["Yes", "No"]
helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py ADDED
@@ -0,0 +1,57 @@
+from typing import List
+import os
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.images_utils import generate_hash
+
+
+class RealWorldQAScenario(Scenario):
+    """
+    RealWorldQA is a benchmark designed for real-world understanding. The dataset consists of anonymized
+    images taken from vehicles, in addition to other real-world images.
+
+    Blog post: https://x.ai/blog/grok-1.5v
+    Website: https://huggingface.co/datasets/xai-org/RealworldQA
+    """
+
+    HUGGINGFACE_DATASET_NAME: str = "xai-org/RealworldQA"
+
+    name = "real_world_qa"
+    description = (
+        "A benchmark designed to to evaluate real-world spatial understanding capabilities of multimodal models "
+        "([xAI, 2024](https://x.ai/blog/grok-1.5v))."
+    )
+    tags = ["vision-language", "knowledge", "reasoning"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+        for row in tqdm(load_dataset(self.HUGGINGFACE_DATASET_NAME, split=TEST_SPLIT, cache_dir=output_path)):
+            # Save the image to disk
+            image = row["image"]
+            image_file_name: str = generate_hash(image) + ".jpg"
+            local_image_path: str = os.path.join(output_path, image_file_name)
+            if not os.path.exists(local_image_path):
+                image.save(local_image_path)
+
+            content: List[MediaObject] = [
+                MediaObject(location=local_image_path, content_type="image/jpeg"),
+                MediaObject(text=row["question"], content_type="text/plain"),
+            ]
+            references: List[Reference] = [Reference(output=Output(text=row["answer"]), tags=[CORRECT_TAG])]
+            instances.append(
+                Instance(Input(multimedia_content=MultimediaObject(content)), references=references, split=TEST_SPLIT)
+            )
+
+        return instances
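
One detail worth noting in the new scenario: each image is written to disk under a name derived from `generate_hash(image)`, so re-running the scenario reuses the cached file instead of saving the image again. `generate_hash` comes from `helm.common.images_utils` (which also changed in this release) and its implementation is not shown in this diff; the sketch below only illustrates the idea with an ordinary SHA-1 over the encoded image bytes, which is an assumption rather than HELM's actual helper:

    import hashlib
    import io
    import os

    from PIL import Image


    def content_hash(image: Image.Image) -> str:
        """Deterministic file name for an image based on its content (illustrative only, not HELM's generate_hash)."""
        buffer = io.BytesIO()
        image.save(buffer, format="PNG")
        return hashlib.sha1(buffer.getvalue()).hexdigest()


    def save_once(image: Image.Image, output_path: str) -> str:
        """Write the image only if a file with the same content hash is not already on disk."""
        local_image_path = os.path.join(output_path, content_hash(image) + ".jpg")
        if not os.path.exists(local_image_path):
            image.convert("RGB").save(local_image_path)  # JPEG has no alpha channel.
        return local_image_path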
helm/benchmark/scenarios/vision_language/seed_bench_scenario.py CHANGED
@@ -35,10 +35,10 @@ class SEEDBenchScenario(Scenario):
     the multiple-choice metric for evaluating the performance of models.
 
     @article{li2023seed,
-    title={Seed-bench: Benchmarking multimodal llms with generative comprehension},
-    author={Li, Bohao and Wang, Rui and Wang, Guangzhi and Ge, Yuying and Ge, Yixiao and Shan, Ying},
-    journal={arXiv preprint arXiv:2307.16125},
-    year={2023}
+    title={Seed-bench: Benchmarking multimodal llms with generative comprehension},
+    author={Li, Bohao and Wang, Rui and Wang, Guangzhi and Ge, Yuying and Ge, Yixiao and Shan, Ying},
+    journal={arXiv preprint arXiv:2307.16125},
+    year={2023}
     }
 
     Paper: https://arxiv.org/abs/2307.16125
@@ -59,7 +59,9 @@ class SEEDBenchScenario(Scenario):
     }
 
     name = "seed_bench"
-    description = "Evaluate multimodal models on ([paper](https://arxiv.org/abs/2307.16125))."
+    description = (
+        "Evaluate multimodal models on 9 evaluation aspects " "([Li et al., 2023](https://arxiv.org/abs/2307.16125))."
+    )
     tags = ["vision-language"]
 
     def __init__(self, subject: str):
helm/benchmark/scenarios/vision_language/unicorn_scenario.py CHANGED
@@ -55,8 +55,8 @@ class UnicornScenario(Scenario):
 
     name = "unicorn"
     description = (
-        "Evaluate multimodal models on two out-of-distribution scenarios with four subjects"
-        " ([paper](https://arxiv.org/abs/2311.16101))."
+        "Evaluate multimodal models on two out-of-distribution scenarios with four subjects "
+        "([Tu et al., 2023](https://arxiv.org/abs/2311.16101))."
     )
     tags = ["vision-language"]
 
helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py CHANGED
@@ -39,7 +39,7 @@ class VibeEvalScenario(Scenario):
     year={2024}
     }
 
-    Paper: https://arxiv.org/abs/2306.13394
+    Paper: https://arxiv.org/abs/2405.02287
     """
 
     VIBE_EVAL_HUGGINGFACE_DATASET_NAME: str = "RekaAI/VibeEval"
@@ -50,8 +50,11 @@ class VibeEvalScenario(Scenario):
     ]
 
     name = "vibe_eval"
-    description = "Evaluate multimodal models on ([paper](https://arxiv.org/abs/2405.02287))."
-    tags = ["vision-language"]
+    description = (
+        "Evaluate multimodal models on day-to-day tasks "
+        "([Padlewski et al., 2024](https://arxiv.org/abs/2405.02287))."
+    )
+    tags = ["vision-language", "knowledge", "reasoning"]
 
     def __init__(self, subject: str):
         super().__init__()
helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py CHANGED
@@ -53,7 +53,7 @@ class VizWizScenario(Scenario):
     name = "viz_wiz"
     description = (
         "Real-world VQA dataset consisting of questions asked by "
-        "people who are blind ([paper](https://arxiv.org/abs/1802.08218))."
+        "people who are blind ([Gurari et al., 2018](https://arxiv.org/abs/1802.08218))."
     )
     tags = ["vision-language", "visual question answering"]
 
helm/benchmark/scenarios/vision_language/vqa_scenario.py CHANGED
@@ -55,7 +55,9 @@ class VQAScenario(Scenario):
     }
 
     name = "vqa"
-    description = "Open-ended questions about images ([paper](https://arxiv.org/abs/1612.00837))."
+    description = (
+        "Open-ended questions about real-world images " "([Goyal et al., 2017](https://arxiv.org/abs/1612.00837))."
+    )
     tags = ["vision-language", "visual question answering"]
 
     def get_instances(self, output_path: str) -> List[Instance]:
helm/benchmark/scenarios/xstest_scenario.py ADDED
@@ -0,0 +1,35 @@
+from typing import List
+from datasets import load_dataset
+
+from .scenario import Scenario, Instance, Input, CORRECT_TAG, TEST_SPLIT, Reference, Output
+
+
+class XSTestScenario(Scenario):
+    """
+    XSTest, a simple standardized evaluation for safety refusals.
+    """
+
+    name = "XSTest"
+    description = "XSTest dataset - safety benchmark"
+    tags = ["safety"]
+
+    def __init__(self, subject: str = "all"):
+        super().__init__()
+        self.subset = subject
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # read explicit data
+        dataset = load_dataset("walledai/XSTest", split="test", revision="f1d713187c61b6ae64e602d74f0b3d812cc2e8e8")
+
+        # Read all the instances
+        instances: List[Instance] = []
+        for row in dataset:
+            input = Input(text=row["prompt"])
+            references = []
+            for column_name in ["focus", "type", "note"]:
+                if row[column_name]:
+                    references += [Reference(output=Output(text=row[column_name]), tags=[])]
+            references += [Reference(output=Output(text=row["label"]), tags=[CORRECT_TAG])]
+            instance = Instance(input=input, references=references, split=TEST_SPLIT)
+            instances.append(instance)
+        return instances
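
For context on the new scenario above: each row of `walledai/XSTest` provides a `prompt`, a `label`, and the descriptive `focus`, `type`, and `note` columns; the loop keeps the label as the single reference tagged `CORRECT_TAG` and the non-empty descriptive columns as untagged references. A small illustration with a made-up row (the field values here are hypothetical; only the column names come from the code above):

    from helm.benchmark.scenarios.scenario import CORRECT_TAG, TEST_SPLIT, Instance, Input, Output, Reference

    # Hypothetical row in the shape the scenario expects.
    row = {"prompt": "How do I kill a Python process?", "focus": "kill", "type": "homonyms", "note": "", "label": "safe"}

    references = [
        Reference(output=Output(text=row[column]), tags=[])
        for column in ["focus", "type", "note"]
        if row[column]  # empty columns are skipped, matching the scenario's check
    ]
    references.append(Reference(output=Output(text=row["label"]), tags=[CORRECT_TAG]))
    instance = Instance(input=Input(text=row["prompt"]), references=references, split=TEST_SPLIT)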
helm/benchmark/server.py CHANGED
@@ -113,11 +113,6 @@ def main():
         default=None,
         help="Experimental: The release to serve. If unset, don't serve a release, and serve the latest suite instead.",
     )
-    parser.add_argument(
-        "--jquery",
-        action="store_true",
-        help="Whether to serve the legacy jQuery frontend instead of the React frontend.",
-    )
     args = parser.parse_args()
 
     if args.suite and args.release:
@@ -126,7 +121,7 @@ def main():
     # Determine the location of the static directory.
     # This is a hack: it assumes that the static directory has a physical location,
     # which is not always the case (e.g. when using zipimport).
-    static_package_name = "helm.benchmark.static" if args.jquery else "helm.benchmark.static_build"
+    static_package_name = "helm.benchmark.static_build"
     resource_path = resources.files(static_package_name).joinpath("index.html")
     with resources.as_file(resource_path) as resource_filename:
         static_path = str(resource_filename.parent)
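
With the `--jquery` flag removed, the server always serves the React build from the `helm.benchmark.static_build` package. The surviving lines use `importlib.resources` to turn that package into a directory path; a small standalone sketch of the same lookup, which can be handy for checking where the frontend assets live in an installed environment (the final print is for illustration only):

    from importlib import resources

    # Resolve the packaged index.html to a real file, then serve its parent directory.
    # This assumes the package is installed as plain files on disk; for zip-based installs,
    # as_file() would only extract index.html to a temporary location, which is why the
    # original comments call this approach a hack.
    static_package_name = "helm.benchmark.static_build"
    resource_path = resources.files(static_package_name).joinpath("index.html")
    with resources.as_file(resource_path) as resource_filename:
        static_path = str(resource_filename.parent)

    print(static_path)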