crfm-helm 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of crfm-helm was flagged as a potentially problematic release by the registry scanner.
Files changed (236)
  1. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +41 -57
  2. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +197 -152
  3. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +32 -31
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
  6. helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
  7. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
  8. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
  9. helm/benchmark/adaptation/common_adapter_specs.py +2 -0
  10. helm/benchmark/annotation/air_bench_annotator.py +64 -0
  11. helm/benchmark/annotation/annotator_factory.py +6 -0
  12. helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
  13. helm/benchmark/annotation/call_center_annotator.py +247 -0
  14. helm/benchmark/annotation/financebench_annotator.py +79 -0
  15. helm/benchmark/annotation/harm_bench_annotator.py +68 -0
  16. helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
  17. helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
  18. helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
  19. helm/benchmark/annotation/live_qa_annotator.py +71 -0
  20. helm/benchmark/annotation/medication_qa_annotator.py +68 -0
  21. helm/benchmark/annotation/model_as_judge.py +45 -0
  22. helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
  23. helm/benchmark/annotation/xstest_annotator.py +110 -0
  24. helm/benchmark/augmentations/translate_perturbation.py +1 -0
  25. helm/benchmark/huggingface_registration.py +16 -6
  26. helm/benchmark/metrics/air_bench_metrics.py +56 -0
  27. helm/benchmark/metrics/annotation_metrics.py +108 -0
  28. helm/benchmark/metrics/bhasa_metrics.py +188 -0
  29. helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
  30. helm/benchmark/metrics/code_metrics_helper.py +11 -1
  31. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  32. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  33. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  34. helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
  35. helm/benchmark/metrics/live_qa_metrics.py +23 -0
  36. helm/benchmark/metrics/medication_qa_metrics.py +23 -0
  37. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  38. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  39. helm/benchmark/metrics/safety_metrics.py +57 -0
  40. helm/benchmark/metrics/summac/model_summac.py +3 -3
  41. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
  42. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
  43. helm/benchmark/metrics/unitxt_metrics.py +20 -10
  44. helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
  45. helm/benchmark/metrics/vision_language/image_metrics.py +30 -72
  46. helm/benchmark/metrics/vision_language/image_utils.py +1 -1
  47. helm/benchmark/model_metadata_registry.py +3 -3
  48. helm/benchmark/presentation/schema.py +54 -4
  49. helm/benchmark/presentation/test_run_entry.py +1 -0
  50. helm/benchmark/presentation/test_schema.py +11 -0
  51. helm/benchmark/run.py +31 -2
  52. helm/benchmark/run_expander.py +113 -10
  53. helm/benchmark/run_spec_factory.py +4 -0
  54. helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
  55. helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
  56. helm/benchmark/run_specs/call_center_run_specs.py +152 -0
  57. helm/benchmark/run_specs/classic_run_specs.py +15 -11
  58. helm/benchmark/run_specs/decodingtrust_run_specs.py +11 -9
  59. helm/benchmark/run_specs/experimental_run_specs.py +85 -0
  60. helm/benchmark/run_specs/finance_run_specs.py +110 -0
  61. helm/benchmark/run_specs/safety_run_specs.py +154 -0
  62. helm/benchmark/run_specs/vlm_run_specs.py +251 -57
  63. helm/benchmark/scenarios/air_bench_scenario.py +50 -0
  64. helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
  65. helm/benchmark/scenarios/banking77_scenario.py +51 -0
  66. helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
  67. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  68. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  69. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
  70. helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
  71. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  72. helm/benchmark/scenarios/fin_qa_scenario.py +119 -0
  73. helm/benchmark/scenarios/financebench_scenario.py +53 -0
  74. helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
  75. helm/benchmark/scenarios/scenario.py +1 -1
  76. helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
  77. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  78. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  79. helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
  80. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  81. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  82. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  83. helm/benchmark/scenarios/test_math_scenario.py +2 -8
  84. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  85. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  86. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  87. helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
  88. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
  89. helm/benchmark/scenarios/vision_language/bingo_scenario.py +5 -5
  90. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
  91. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  92. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  93. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
  94. helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
  95. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
  96. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
  97. helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +13 -2
  98. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -7
  99. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -5
  100. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
  101. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
  102. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
  103. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +44 -13
  104. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
  105. helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
  106. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
  107. helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
  108. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
  109. helm/benchmark/scenarios/vision_language/pairs_scenario.py +7 -6
  110. helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
  111. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  112. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
  113. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +5 -5
  114. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
  115. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
  116. helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
  117. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  118. helm/benchmark/server.py +1 -6
  119. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  120. helm/benchmark/static/schema_bhasa.yaml +709 -0
  121. helm/benchmark/static/schema_call_center.yaml +232 -0
  122. helm/benchmark/static/schema_classic.yaml +3 -59
  123. helm/benchmark/static/schema_cleva.yaml +768 -0
  124. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  125. helm/benchmark/static/schema_ewok.yaml +367 -0
  126. helm/benchmark/static/schema_finance.yaml +189 -0
  127. helm/benchmark/static/schema_image2struct.yaml +588 -0
  128. helm/benchmark/static/schema_instruction_following.yaml +3 -52
  129. helm/benchmark/static/schema_lite.yaml +3 -61
  130. helm/benchmark/static/schema_medical.yaml +255 -0
  131. helm/benchmark/static/schema_mmlu.yaml +3 -61
  132. helm/benchmark/static/schema_safety.yaml +247 -0
  133. helm/benchmark/static/schema_tables.yaml +317 -0
  134. helm/benchmark/static/schema_thai.yaml +244 -0
  135. helm/benchmark/static/schema_unitxt.yaml +3 -61
  136. helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +304 -298
  137. helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
  138. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  139. helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
  140. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  141. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  142. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  143. helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
  144. helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
  145. helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
  146. helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
  147. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  148. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  149. helm/benchmark/static_build/index.html +2 -2
  150. helm/benchmark/window_services/test_openai_window_service.py +8 -8
  151. helm/clients/ai21_client.py +71 -1
  152. helm/clients/anthropic_client.py +50 -28
  153. helm/clients/auto_client.py +11 -0
  154. helm/clients/client.py +24 -7
  155. helm/clients/cohere_client.py +98 -3
  156. helm/clients/huggingface_client.py +79 -19
  157. helm/clients/nvidia_nim_client.py +35 -0
  158. helm/clients/openai_client.py +11 -5
  159. helm/clients/palmyra_client.py +25 -0
  160. helm/clients/perspective_api_client.py +11 -6
  161. helm/clients/reka_client.py +189 -0
  162. helm/clients/test_client.py +7 -9
  163. helm/clients/test_huggingface_client.py +19 -3
  164. helm/clients/test_together_client.py +72 -2
  165. helm/clients/together_client.py +129 -23
  166. helm/clients/vertexai_client.py +62 -18
  167. helm/clients/vision_language/huggingface_vlm_client.py +1 -0
  168. helm/clients/vision_language/open_flamingo_client.py +1 -2
  169. helm/clients/vision_language/paligemma_client.py +146 -0
  170. helm/clients/vision_language/palmyra_vision_client.py +99 -0
  171. helm/clients/yi_client.py +31 -0
  172. helm/common/critique_request.py +10 -1
  173. helm/common/images_utils.py +25 -0
  174. helm/common/mongo_key_value_store.py +2 -1
  175. helm/common/request.py +16 -0
  176. helm/config/model_deployments.yaml +740 -363
  177. helm/config/model_metadata.yaml +824 -128
  178. helm/config/tokenizer_configs.yaml +207 -10
  179. helm/proxy/critique/model_critique_client.py +32 -4
  180. helm/proxy/example_queries.py +14 -21
  181. helm/proxy/services/server_service.py +2 -3
  182. helm/proxy/token_counters/test_auto_token_counter.py +2 -2
  183. helm/tokenizers/ai21_tokenizer.py +51 -59
  184. helm/tokenizers/auto_tokenizer.py +1 -1
  185. helm/tokenizers/cohere_tokenizer.py +29 -62
  186. helm/tokenizers/huggingface_tokenizer.py +35 -13
  187. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  188. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  189. helm/tokenizers/test_huggingface_tokenizer.py +5 -1
  190. helm/benchmark/static/benchmarking.css +0 -156
  191. helm/benchmark/static/benchmarking.js +0 -1705
  192. helm/benchmark/static/config.js +0 -3
  193. helm/benchmark/static/general.js +0 -122
  194. helm/benchmark/static/images/crfm-logo.png +0 -0
  195. helm/benchmark/static/images/helm-logo-simple.png +0 -0
  196. helm/benchmark/static/images/helm-logo.png +0 -0
  197. helm/benchmark/static/images/language-model-helm.png +0 -0
  198. helm/benchmark/static/images/organizations/ai21.png +0 -0
  199. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  200. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  201. helm/benchmark/static/images/organizations/cohere.png +0 -0
  202. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  203. helm/benchmark/static/images/organizations/google.png +0 -0
  204. helm/benchmark/static/images/organizations/meta.png +0 -0
  205. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  206. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  207. helm/benchmark/static/images/organizations/openai.png +0 -0
  208. helm/benchmark/static/images/organizations/together.png +0 -0
  209. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  210. helm/benchmark/static/images/organizations/yandex.png +0 -0
  211. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  212. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  213. helm/benchmark/static/index.html +0 -68
  214. helm/benchmark/static/info-icon.png +0 -0
  215. helm/benchmark/static/json-urls.js +0 -69
  216. helm/benchmark/static/plot-captions.js +0 -27
  217. helm/benchmark/static/schema_image2structure.yaml +0 -304
  218. helm/benchmark/static/utils.js +0 -285
  219. helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
  220. helm/benchmark/static_build/assets/index-878a1094.css +0 -1
  221. helm/benchmark/window_services/ai21_window_service.py +0 -247
  222. helm/benchmark/window_services/cohere_window_service.py +0 -101
  223. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  224. helm/benchmark/window_services/test_cohere_window_service.py +0 -75
  225. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  226. helm/benchmark/window_services/test_ice_window_service.py +0 -327
  227. helm/tokenizers/ice_tokenizer.py +0 -30
  228. helm/tokenizers/test_ice_tokenizer.py +0 -57
  229. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
  230. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
  231. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
  232. /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
  233. /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
  234. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
  235. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
  236. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0

helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py}
@@ -22,6 +22,10 @@ from helm.common.general import ensure_directory_exists
 from helm.common.hierarchical_logger import hlog
 
 PROCESSED: str = "processed"
+DIFFICULTY_ALL = "all"
+DIFFICULTY_EASY = "easy"
+DIFFICULTY_MEDIUM = "medium"
+DIFFICULTY_HARD = "hard"
 
 
 class Image2StructureScenario(Scenario):
@@ -38,13 +38,16 @@ class Image2StructureScenario(Scenario):
         VALID_SPLIT: "validation",
     }
 
-    def __init__(self, subset: str, recompile_prompt: bool = True, split: str = VALID_SPLIT):
+    def __init__(
+        self, subset: str, recompile_prompt: bool = True, split: str = VALID_SPLIT, difficulty: str = DIFFICULTY_ALL
+    ):
         super().__init__()
         assert subset in self.SUBSETS, f"Invalid subset: {subset}"
         self._subset: str = subset
         self._recompile_prompt: bool = recompile_prompt
         self._split: str = split
         self._output_path: Optional[str] = None
+        self._difficulty: str = difficulty
 
     def preprocess_row(self, row: Dict[str, Any], assets_path: str) -> Dict[str, Any]:
         # By default, there are no assets
@@ -110,6 +117,10 @@ class Image2StructureScenario(Scenario):
                 )
                 continue
 
+            # Filter by difficulty
+            if self._difficulty != DIFFICULTY_ALL and row["difficulty"] != self._difficulty:
+                continue
+
             # Step 1: Preprocess the row
             row = self.preprocess_row(row, assets_path)
 
@@ -158,7 +169,7 @@ class Image2StructureScenario(Scenario):
                 # representing the structure (such as LaTeX code)
                 multimedia_object = MultimediaObject([image_object])
                 reference = Reference(
-                    output=Output(text=row["text"], multimedia_content=multimedia_object),
+                    output=Output(text=row["text"] if "text" in row else "", multimedia_content=multimedia_object),
                     tags=[CORRECT_TAG],
                 )
             else:
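
The hunks above introduce a per-row difficulty filter on Image2StructureScenario. A minimal sketch of the new behavior (keep_row and the sample rows are illustrative, not part of the package; the check mirrors the added code):

DIFFICULTY_ALL = "all"
DIFFICULTY_HARD = "hard"


def keep_row(row: dict, difficulty: str) -> bool:
    # A row is kept when no specific difficulty is requested ("all") or when
    # its "difficulty" field matches the requested one.
    return difficulty == DIFFICULTY_ALL or row.get("difficulty") == difficulty


rows = [{"difficulty": "easy"}, {"difficulty": "hard"}]
assert [keep_row(r, DIFFICULTY_HARD) for r in rows] == [False, True]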

helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py
@@ -1,22 +1,18 @@
-from helm.benchmark.scenarios.scenario import VALID_SPLIT
-from helm.benchmark.scenarios.vision_language.image2structure.utils_latex import (
+from helm.benchmark.scenarios.vision_language.image2struct.utils_latex import (
     latex_to_image,
     strip_unnecessary_latex_parts,
 )
-from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import Image2StructureScenario
+from helm.benchmark.scenarios.vision_language.image2struct.image2struct_scenario import Image2StructureScenario
 
 
 class LatexScenario(Image2StructureScenario):
     BASE_PROMPT = "Please provide the LaTeX code used to generate this image. Only generate the code relevant to what you see. Your code will be surrounded by all the imports necessary as well as the begin and end document delimiters."  # noqa: E501
     HUGGINGFACE_DATASET_NAME = "stanford-crfm/i2s-latex"
-    SUBSETS = ["equation", "table", "plot", "algorithm"]
+    SUBSETS = ["equation", "table", "plot", "algorithm", "wild", "wild_legacy"]
 
     name = "image2latex"
     description = "Evaluate multimodal models on Latex generation to recreate a provided image"
 
-    def __init__(self, subset: str, recompile_prompt: bool = True, split: str = VALID_SPLIT):
-        super().__init__(subset, recompile_prompt, split)
-
     def compile_and_save(self, structure: str, assets_path: str, destination_path: str) -> str:
         image, infos = latex_to_image(structure, assets_path=assets_path, crop=True)
         image.save(destination_path)

helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py
@@ -1,5 +1,4 @@
-from helm.benchmark.scenarios.scenario import VALID_SPLIT
-from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import Image2StructureScenario
+from helm.benchmark.scenarios.vision_language.image2struct.image2struct_scenario import Image2StructureScenario
 
 
 class MusicSheetScenario(Image2StructureScenario):
@@ -13,8 +12,5 @@ class MusicSheetScenario(Image2StructureScenario):
     name = "image2musicsheet"
     description = "Evaluate multimodal models on Lilypond generation to recreate a provided image"
 
-    def __init__(self, subset: str, recompile_prompt: bool = True, split: str = VALID_SPLIT):
-        super().__init__(subset, recompile_prompt, split)
-
     def compile_and_save(self, structure: str, assets_path: str, destination_path: str) -> str:
         raise Exception("Music sheets have no ground truth, compilation is not possible")

helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py
@@ -5,6 +5,7 @@ import os
 import re
 
 from helm.common.optional_dependencies import handle_module_not_found_error, OptionalDependencyNotInstalled
+from helm.common.hierarchical_logger import hlog
 
 try:
     from latex import build_pdf
@@ -12,14 +13,13 @@ try:
     from PIL import ImageOps
     from PIL.Image import Image
 except ModuleNotFoundError as e:
-    handle_module_not_found_error(e, suggestions=["image2structure"])
+    handle_module_not_found_error(e, suggestions=["image2struct"])
 
 # LaTeX preamble
 # Make sure to install "latex-full".
 TEX_INCLUDES = r"""
 \usepackage{amsmath,amssymb,amsfonts}
 \usepackage{graphicx}
-\usepackage{graphicx}
 \usepackage{amsmath}
 \usepackage{xcolor}
 \usepackage{algorithm}
@@ -98,23 +98,19 @@ def pdf_to_image(
 
 def strip_unnecessary_latex_parts(latex_code: str) -> str:
     """Strip unnecessary parts of the LaTeX code."""
-
     # Remove comments
     minimal_latex_code = re.sub(r"%.*?\n", "\n", latex_code)
-
     # Remove \documentclass and any \usepackage lines
-    minimal_latex_code = re.sub(r"\\documentclass\{.*?\}\n", "", latex_code)
-    minimal_latex_code = re.sub(r"\\usepackage(\[.*?\])?\{.*?\}\n", "", minimal_latex_code)
-
+    minimal_latex_code = re.sub(r"\\documentclass(\[.*?\])?\{.*?\}", "", latex_code)
+    minimal_latex_code = re.sub(r"\\documentstyle(\[.*?\])?\{.*?\}", "", minimal_latex_code)
+    minimal_latex_code = re.sub(r"\\usepackage(\[.*?\])?\{.*?\}", "", minimal_latex_code)
     # Remove everything before \begin{document} and including it, and everything after \end{document}
     minimal_latex_code = re.sub(r"\\begin\{document\}\n*", "", minimal_latex_code, flags=re.DOTALL)
     minimal_latex_code = re.sub(r"\\end\{document\}.*", "", minimal_latex_code, flags=re.DOTALL)
-
     # Ensure \begin{...} is followed by a \n
     minimal_latex_code = re.sub(r"(\\begin\{.*?\}(\[.*?\])?)(?!\n)", r"\1\n", minimal_latex_code)
     # Ensure \end{...} has a \n before it
     minimal_latex_code = re.sub(r"(\\end\{.*?\})(?!\n)", r"\1\n", minimal_latex_code)
-
     # Normalize space sequences to a single space globally
     minimal_latex_code = re.sub(r" +", " ", minimal_latex_code)
     # Replace tabs with a single space
@@ -123,7 +119,6 @@ def strip_unnecessary_latex_parts(latex_code: str) -> str:
     minimal_latex_code = re.sub(r"^[ \t]+|[ \t]+$", "", minimal_latex_code, flags=re.MULTILINE)
     # Remove unnecessary whitespace - multiple empty lines and tabulations
     minimal_latex_code = re.sub(r"\n\s*\n", "\n", minimal_latex_code)
-
     return minimal_latex_code.strip()
 
 
@@ -226,25 +221,21 @@ def handle_latex_error(
     # Error format: "LaTeX Error: Environment <env> undefined."
     undefined_search = re.search(r"LaTeX Error: Environment (.*) undefined", str_e)
     if undefined_search:
-        # If a package is missing and this is our first retry, then simply include TEX_INCLUDES
-        if num_try_remaining == MAX_NUM_TRIES:
-            fixed_code = fixed_code.replace(TEX_BEGIN_FILE, TEX_BEGIN_FILE + "\n" + TEX_INCLUDES + "\n")
-        if num_try_remaining < MAX_NUM_TRIES or fixed_code == original_latex_code:
-            # Here we try to manually solve the missing environment.
-            # This is either executed on the second rety or the first if no changements
-            # were made in the first retry.
-            assert TEX_INCLUDES in fixed_code, "TEX_INCLUDES should be present in the code"
-            # TEX_INCLUDES is already present, so we add the missing package
-            # Since we cannot know the name of the package that contains the missing environment,
-            # we simply hope that they are named the same way.
-            env_undefined: str = undefined_search.group(1)
-
-            if f"\\usepackage{{{env_undefined}}}" in fixed_code:
-                # We already tried to include the missing package, but it probably
-                # does not exist, so we raise an error
-                raise RuntimeError(str(e)) from e
-
-            fixed_code = fixed_code.replace(TEX_BEGIN_FILE, TEX_BEGIN_FILE + f"\n\\usepackage{{{env_undefined}}}\n")
+        # Here we try to manually solve the missing environment.
+        # This is either executed on the second rety or the first if no changements
+        # were made in the first retry.
+        assert TEX_INCLUDES in fixed_code, f"TEX_INCLUDES should be present in the code. code={fixed_code}"
+        # TEX_INCLUDES is already present, so we add the missing package
+        # Since we cannot know the name of the package that contains the missing environment,
+        # we simply hope that they are named the same way.
+        env_undefined: str = undefined_search.group(1)
+
+        if f"\\usepackage{{{env_undefined}}}" in fixed_code:
+            # We already tried to include the missing package, but it probably
+            # does not exist, so we raise an error
+            raise RuntimeError(str(e)) from e
+
+        fixed_code = fixed_code.replace(TEX_BEGIN_FILE, TEX_BEGIN_FILE + f"\n\\usepackage{{{env_undefined}}}\n")
 
     # Try again with the fixed code (if the fixed code is different from the original code)
     if fixed_code != original_latex_code:
@@ -313,23 +304,24 @@
 
     # 2. Add preamble
     # 2.1. Remove \documentclass if present to make sure we use our own
-    documentclass_search = re.search(r"\\documentclass\{(.*)\}", original_latex_code)
+    documentclass_search = re.search(r"\\documentclass(\[.*?\])?\{.*?\}", original_latex_code)
+    documentstyle_search = re.search(r"\\documentstyle(\[.*?\])?\{.*?\}", original_latex_code)
     if documentclass_search:
-        documentclass: str = documentclass_search.group(1)
-        original_latex_code = original_latex_code.replace(f"\\documentclass{{{documentclass}}}", TEX_BEGIN_FILE)
+        matching_string = documentclass_search.group()
+        original_latex_code = original_latex_code.replace(matching_string, TEX_BEGIN_FILE)
+    elif documentstyle_search:
+        matching_string = documentstyle_search.group()
+        original_latex_code = original_latex_code.replace(matching_string, TEX_BEGIN_FILE)
     else:
         # If there is no \documentclass, we add our own
         original_latex_code = TEX_BEGIN_FILE + "\n\n" + original_latex_code
 
-    # 2.2. Add includes. In this first step, we only add includes if none are present.
-    # We do this because if some are present, we might define them twice which can cause errors
-    # and this section should not make the original LaTeX code fail if it was compilable.
-    # If there are missing packages, in handle_latex_error, we will add TEX_INCLUDES after the begin document,
-    # which might define some packages twice, but often solves the problem.
-    if not re.search(r"\\usepackage\{.*\}", original_latex_code):
-        original_latex_code = original_latex_code.replace(TEX_BEGIN_FILE, TEX_BEGIN_FILE + "\n" + TEX_INCLUDES + "\n")
+    # 2.2. Add includes. In this ste we remove all includes for the default ones.
+    original_latex_code = re.sub(r"\\usepackage(\[.*?\])?\{.*\}", "", original_latex_code)
+    original_latex_code = original_latex_code.replace(TEX_BEGIN_FILE, TEX_BEGIN_FILE + "\n" + TEX_INCLUDES + "\n")
 
     latex_code: str = original_latex_code
+    hlog(f"Compiling LaTeX code:\n{latex_code}")
     try:
         pdf_stream = latex_to_pdf(latex_code, assets_path=assets_path)
         image = pdf_to_image(pdf_stream, crop=crop, resize_to=resize_to)
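
The loosened preamble-stripping patterns above now also match optional arguments and \documentstyle. A standalone check of the patterns in isolation (the sample input is illustrative, not from the dataset):

import re

# The updated regexes strip \documentclass, \documentstyle, and \usepackage,
# including any optional [...] arguments.
code = r"\documentclass[12pt]{article}\usepackage[utf8]{inputenc}E = mc^2"
code = re.sub(r"\\documentclass(\[.*?\])?\{.*?\}", "", code)
code = re.sub(r"\\documentstyle(\[.*?\])?\{.*?\}", "", code)
code = re.sub(r"\\usepackage(\[.*?\])?\{.*?\}", "", code)
assert code == "E = mc^2"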

helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py
@@ -6,7 +6,7 @@ try:
     from selenium import webdriver
     import selenium.common.exceptions
 except ModuleNotFoundError as e:
-    handle_module_not_found_error(e, suggestions=["image2structure"])
+    handle_module_not_found_error(e, suggestions=["image2struct"])
 
 
 def init_driver(url: str, resolution: Tuple[int, int] = (1920, 1080)) -> webdriver.Chrome:

helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py
@@ -5,7 +5,7 @@ from helm.common.optional_dependencies import handle_module_not_found_error
 try:
     from html2text import HTML2Text
 except ModuleNotFoundError as e:
-    handle_module_not_found_error(e, suggestions=["image2structure"])
+    handle_module_not_found_error(e, suggestions=["image2struct"])
 
 
 def convert_html_to_text(handler: HTML2Text, html: str) -> str:

helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py
@@ -1,23 +1,26 @@
-from typing import Dict, List, Any
+from typing import Dict, List, Any, Optional
 
+from helm.benchmark.annotation.image2struct.image_compiler_annotator import CompilationError
 from helm.benchmark.scenarios.scenario import VALID_SPLIT
-from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import (
+from helm.benchmark.scenarios.vision_language.image2struct.image2struct_scenario import (
     Image2StructureScenario,
     PROCESSED,
+    DIFFICULTY_ALL,
 )
-from helm.benchmark.scenarios.vision_language.image2structure.webpage.jekyll_server import JekyllServer
-from helm.benchmark.scenarios.vision_language.image2structure.webpage.driver import (
+from helm.benchmark.scenarios.vision_language.image2struct.webpage.jekyll_server import JekyllServer
+from helm.benchmark.scenarios.vision_language.image2struct.webpage.driver import (
     save_random_screenshot,
     ScreenshotOptions,
 )
-from helm.benchmark.scenarios.vision_language.image2structure.webpage.utils import convert_html_to_text
+from helm.benchmark.scenarios.vision_language.image2struct.webpage.utils import convert_html_to_text
 from helm.common.general import ensure_directory_exists
 from helm.common.optional_dependencies import handle_module_not_found_error
+from helm.common.hierarchical_logger import hlog
 
 try:
     from html2text import HTML2Text
 except ModuleNotFoundError as e:
-    handle_module_not_found_error(e, suggestions=["image2structure"])
+    handle_module_not_found_error(e, suggestions=["image2struct"])
 
 
 import base64
@@ -72,28 +75,48 @@ def serve_and_take_screenshot(
     if not success:
         # This runs on examples that are not expected to fail
         server.stop()
+        hlog(f"Failed to start the Jekyll server: {repo_path} on port {port}. Will raise a ValueError.")
         raise ValueError(f"Jekyll server failed to start: {repo_path}")
 
     # Take a screenshot of a random page
     success = False
-    error: Exception
-    for _ in range(max_tries):
+    error: Optional[Exception] = None
+
+    MAX_TRIES_ALL_ERRORS = 3
+    MAX_TRIES_CONNECTION_REFUSED = 5
+    MAX_TRIES = max(MAX_TRIES_ALL_ERRORS, MAX_TRIES_CONNECTION_REFUSED)
+    for compilation_attempt in range(MAX_TRIES):
         try:
             infos: Dict[str, Any] = save_random_screenshot(destination_path, port=port, options=screenshot_options)
             success = True
             break
         except Exception as e:
-            if "net::ERR_CONNECTION_REFUSED" in str(e):
-                error = e
+            error = e
+
+            if "net::ERR_CONNECTION_REFUSED" in str(e) and compilation_attempt < MAX_TRIES_CONNECTION_REFUSED:
+                hlog(
+                    f"Failed to take a screenshot: ERR_CONNECTION_REFUSED [Attempt {compilation_attempt + 1}/"
+                    f"{MAX_TRIES_CONNECTION_REFUSED}]. Error: {e}. Retrying..."
+                )
                 server.stop()
                 time.sleep(0.5)
                 server.start()
                 time.sleep(0.5)
+            elif compilation_attempt < MAX_TRIES_ALL_ERRORS:
+                hlog(
+                    f"Failed to take a screenshot: Unknown [Attempt {compilation_attempt + 1}/{MAX_TRIES_ALL_ERRORS}]."
+                    f" Error: {e}. Retrying..."
+                )
             else:
                 # Do not retry
+                hlog(
+                    f"Failed to take a screenshot: Unknown [Attempt {compilation_attempt + 1}/{MAX_TRIES_ALL_ERRORS}]."
+                    f" Error: {e}. Raising CompilationError."
+                )
                 break
+
     if not success:
-        raise ValueError(f"Failed to take a screenshot: {error}")
+        raise CompilationError(f"Failed to take a screenshot: {error}")
 
     # Stop the server
     server.stop()
@@ -128,7 +151,7 @@ class WebpageScenario(Image2StructureScenario):
     )
 
     HUGGINGFACE_DATASET_NAME = "stanford-crfm/i2s-webpage"
-    SUBSETS = ["css", "html", "javascript"]
+    SUBSETS = ["css", "html", "javascript", "wild", "wild_legacy"]
     MAX_TRIES: int = 5
     ASSETS_EXTENSIONS: List[str] = ["png", "jpg", "jpeg", "gif", "svg", "webp", "ico", "bmp", "tiff"]
 
@@ -140,9 +163,10 @@ class WebpageScenario(Image2StructureScenario):
         subset: str,
         recompile_prompt: bool = True,
         split: str = VALID_SPLIT,
+        difficulty: str = DIFFICULTY_ALL,
         screenshot_options: ScreenshotOptions = ScreenshotOptions(),
     ):
-        super().__init__(subset, recompile_prompt, split)
+        super().__init__(subset, recompile_prompt, split, difficulty)
         self._screenshot_options = screenshot_options
         self._html2text = HTML2Text()
         self._html2text.ignore_links = True
@@ -165,6 +189,13 @@ class WebpageScenario(Image2StructureScenario):
             shutil.rmtree(assets_save_path)
         ensure_directory_exists(assets_save_path)
 
+        if "wild" in self._subset:
+            # There is no stucture
+            del row["assets"]
+            row["assets_paths"] = []
+            row["assets_names"] = []
+            return row
+
         # Structure is a base64 encoding of the repo
         if self._output_path is None:
            raise ValueError("Output path not set")
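
The rewritten retry loop in serve_and_take_screenshot above uses two different caps, with the larger one reserved for connection-refused errors. A minimal sketch of that control flow (take_screenshot stands in for save_random_screenshot; the server restart between attempts is elided):

# Sketch of the retry policy: connection-refused errors keep retrying while
# attempt < 5, any other error only while attempt < 3.
MAX_TRIES_ALL_ERRORS = 3
MAX_TRIES_CONNECTION_REFUSED = 5
MAX_TRIES = max(MAX_TRIES_ALL_ERRORS, MAX_TRIES_CONNECTION_REFUSED)


def take_screenshot_with_retries(take_screenshot) -> None:
    error = None
    for attempt in range(MAX_TRIES):
        try:
            take_screenshot()
            return
        except Exception as e:
            error = e
            if "net::ERR_CONNECTION_REFUSED" in str(e) and attempt < MAX_TRIES_CONNECTION_REFUSED:
                continue  # restart the server here, then retry
            elif attempt < MAX_TRIES_ALL_ERRORS:
                continue  # other errors get the smaller retry budget
            else:
                break  # give up and surface the last error
    raise RuntimeError(f"Failed to take a screenshot: {error}")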

helm/benchmark/scenarios/vision_language/math_vista_scenario.py
@@ -51,7 +51,7 @@ class MathVistaScenario(Scenario):
     name = "math_vista"
     description = (
         "A benchmark designed to combine challenges from diverse mathematical and visual tasks. "
-        "([paper](https://arxiv.org/abs/2310.02255))."
+        "([Lu et al., 2024](https://arxiv.org/abs/2310.02255))."
     )
     tags = ["vision-language", "reasoning", "math"]
 

helm/benchmark/scenarios/vision_language/mementos_scenario.py
@@ -38,10 +38,10 @@ class MementosScenario(Scenario):
     Paper: https://arxiv.org/abs/2401.10529
     """
 
-    MEMENTOS_HUGGINGFACE_DATASET_NAME: str = "shenmishajing/unofficial_mementos_dataset"
+    MEMENTOS_HUGGINGFACE_DATASET_NAME: str = "RussWang96/unofficial_mementos_dataset"
 
     IMAGE_URL: str = (
-        "https://huggingface.co/datasets/shenmishajing/unofficial_mementos_dataset/resolve/main/"
+        "https://huggingface.co/datasets/RussWang96/unofficial_mementos_dataset/resolve/main/"
         + "{subject}/{split}/{file_name}?download=true"
     )
 
@@ -56,7 +56,7 @@ class MementosScenario(Scenario):
     name = "mementos"
     description = (
         "A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences"
-        " ([paper](https://arxiv.org/abs/2401.10529))."
+        " ([Wang et al., 2024](https://arxiv.org/abs/2401.10529))."
     )
     tags = ["vision-language"]
 

helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py
@@ -48,14 +48,14 @@ class MMSafetyBenchScenario(Scenario):
     }
 
     QUESTIONS_URL_TEMPLATE: str = (
-        "https://raw.githubusercontent.com/isXinLiu/MM-SafetyBench/main/data/" "processed_questions/{dataset}.json"
+        "https://raw.githubusercontent.com/isXinLiu/MM-SafetyBench/main/data/processed_questions/{dataset}.json"
     )
     IMAGES_URL: str = "https://drive.google.com/uc?export=download&id=1xjW9k-aGkmwycqGCXbru70FaSKhSDcR_"
 
     name = "mm_safety_bench"
     description = (
         "Expose the vulnerability of open-source VLMs with toxic and biased content "
-        "([paper](https://arxiv.org/abs/2311.17600))."
+        "([Liu et al., 2023](https://arxiv.org/abs/2311.17600))."
     )
     tags = ["vision-language", "bias", "toxicity"]
 

helm/benchmark/scenarios/vision_language/mme_scenario.py
@@ -19,22 +19,22 @@ from helm.common.general import ensure_directory_exists
 
 class MMEScenario(Scenario):
     """
-    MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models
-
-    Multimodal Large Language Model (MLLM) relies on the powerful LLM to perform
-    multimodal tasks, showing amazing emergent abilities in recent studies. However,
-    it is difficult for these case studies to fully reflect the performance of MLLM,
-    lacking a comprehensive evaluation. In MME, we fill in this blank, presenting
-    the first comprehensive MLLM Evaluation benchmark MME. It measures both perception
-    and cognition abilities on a total of 14 subtasks. In order to avoid data leakage
-    that may arise from direct use of public datasets for evaluation, the annotations
-    of instruction-answer pairs are all manually designed. The concise instruction design
-    allows us to fairly compare MLLMs, instead of struggling in prompt engineering.
-    Besides, with such an instruction, we can also easily carry out quantitative
-    statistics. We rephrase the answer type of MME to multiple-choice question-answering.
-    We use the multiple-choice metrics for 14 different evaluation tasks.
-
-    @article{fu2023mme,
+    MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models
+
+    Multimodal Large Language Model (MLLM) relies on the powerful LLM to perform
+    multimodal tasks, showing amazing emergent abilities in recent studies. However,
+    it is difficult for these case studies to fully reflect the performance of MLLM,
+    lacking a comprehensive evaluation. In MME, we fill in this blank, presenting
+    the first comprehensive MLLM Evaluation benchmark MME. It measures both perception
+    and cognition abilities on a total of 14 subtasks. In order to avoid data leakage
+    that may arise from direct use of public datasets for evaluation, the annotations
+    of instruction-answer pairs are all manually designed. The concise instruction design
+    allows us to fairly compare MLLMs, instead of struggling in prompt engineering.
+    Besides, with such an instruction, we can also easily carry out quantitative
+    statistics. We rephrase the answer type of MME to multiple-choice question-answering.
+    We use the multiple-choice metrics for 14 different evaluation tasks.
+
+    @article{fu2023mme,
         title={MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models},
         author={Fu, Chaoyou and Chen, Peixian and Shen, Yunhang and Qin, Yulei and
         Zhang, Mengdan and Lin, Xu and Yang, Jinrui and Zheng, Xiawu and Li, Ke and
@@ -43,7 +43,7 @@ class MMEScenario(Scenario):
         year={2023}
     }
 
-    Paper: https://arxiv.org/abs/2306.13394
+    Paper: https://arxiv.org/abs/2306.13394
     """
 
     MME_HUGGINGFACE_DATASET_NAME: str = "lmms-lab/MME"
@@ -66,7 +66,10 @@ class MMEScenario(Scenario):
     ]
 
     name = "mme"
-    description = "Evaluate multimodal models on ([paper](https://arxiv.org/abs/2306.13394))."
+    description = (
+        "Evaluate multimodal models on their perception and cognition abilities on a total of 14 subtasks "
+        "([Fu et al., 2023](https://arxiv.org/abs/2306.13394))."
+    )
     tags = ["vision-language"]
     options: List[str] = ["Yes", "No"]
 

helm/benchmark/scenarios/vision_language/mmmu_scenario.py
@@ -81,7 +81,7 @@ class MMMUScenario(Scenario):
     name = "mmmu"
     description = (
         "Evaluate multimodal models on massive multi-discipline tasks demanding college-level "
-        "subject knowledge and deliberate reasoning ([paper](https://arxiv.org/abs/2311.16502))."
+        "subject knowledge and deliberate reasoning ([Yue et al., 2023](https://arxiv.org/abs/2311.16502))."
     )
     tags = ["vision-language"]
 

helm/benchmark/scenarios/vision_language/pairs_scenario.py
@@ -19,7 +19,7 @@ class PAIRSScenario(Scenario):
     """
     Examining Gender and Racial Bias in Large Vision-Language Models Using a Novel Dataset of Parallel Images.
 
-    Modified to ensure there is no ambiguity regarding the preferred choice for each question.
+    Modified to add an option to opt-out with "unclear" as a choice.
 
     @misc{fraser2024examining,
         title={Examining Gender and Racial Bias in Large Vision-Language Models Using a Novel
@@ -186,7 +186,7 @@ class PAIRSScenario(Scenario):
     name = "pairs"
     description = (
         "Examining gender and racial bias in VLMs Using a Novel Dataset of Parallel Images. "
-        "([paper](https://arxiv.org/abs/2402.05779))."
+        "([Fraser et al., 2024](https://arxiv.org/abs/2402.05779))."
     )
     tags = ["vision-language", "bias"]
 
@@ -232,13 +232,14 @@ class PAIRSScenario(Scenario):
                 MediaObject(location=local_image_path, content_type="image/png"),
                 MediaObject(text=question.text, content_type="text/plain"),
             ]
+            references = [Reference(Output(text=choice), tags=[]) for i, choice in enumerate(question.choices)]
+            # Add the preferred choice "unclear" as the correct answer
+            references.append(Reference(Output(text="unclear"), tags=[CORRECT_TAG]))
+
             instances.append(
                 Instance(
                     Input(multimedia_content=MultimediaObject(content)),
-                    references=[
-                        Reference(Output(text=choice), tags=[CORRECT_TAG] if i == question.preferred_choice else [])
-                        for i, choice in enumerate(question.choices)
-                    ],
+                    references=references,
                     split=TEST_SPLIT,
                 )
             )
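
The change above makes "unclear" the single correct reference for every PAIRS question. A self-contained sketch of the resulting reference layout (the choices below are placeholders, not taken from the dataset):

from helm.benchmark.scenarios.scenario import CORRECT_TAG, Output, Reference

# Original options become untagged references; a new "unclear" reference
# carries the CORRECT_TAG.
choices = ["the person on the left", "the person on the right"]  # illustrative
references = [Reference(Output(text=choice), tags=[]) for choice in choices]
references.append(Reference(Output(text="unclear"), tags=[CORRECT_TAG]))
assert sum(CORRECT_TAG in ref.tags for ref in references) == 1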

helm/benchmark/scenarios/vision_language/pope_scenario.py
@@ -42,7 +42,8 @@ class POPEScenario(Scenario):
 
     name = "pope"
     description = (
-        "Open-ended questions about hallucination images ([paper](https://aclanthology.org/2023.emnlp-main.20/))."
+        "Open-ended questions about hallucination images "
+        "([Li et al., 2023](https://aclanthology.org/2023.emnlp-main.20/))."
     )
     tags = ["vision-language", "visual question answering"]
     options: List[str] = ["Yes", "No"]

helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py
@@ -0,0 +1,57 @@
+from typing import List
+import os
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.images_utils import generate_hash
+
+
+class RealWorldQAScenario(Scenario):
+    """
+    RealWorldQA is a benchmark designed for real-world understanding. The dataset consists of anonymized
+    images taken from vehicles, in addition to other real-world images.
+
+    Blog post: https://x.ai/blog/grok-1.5v
+    Website: https://huggingface.co/datasets/xai-org/RealworldQA
+    """
+
+    HUGGINGFACE_DATASET_NAME: str = "xai-org/RealworldQA"
+
+    name = "real_world_qa"
+    description = (
+        "A benchmark designed to to evaluate real-world spatial understanding capabilities of multimodal models "
+        "([xAI, 2024](https://x.ai/blog/grok-1.5v))."
+    )
+    tags = ["vision-language", "knowledge", "reasoning"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+        for row in tqdm(load_dataset(self.HUGGINGFACE_DATASET_NAME, split=TEST_SPLIT, cache_dir=output_path)):
+            # Save the image to disk
+            image = row["image"]
+            image_file_name: str = generate_hash(image) + ".jpg"
+            local_image_path: str = os.path.join(output_path, image_file_name)
+            if not os.path.exists(local_image_path):
+                image.save(local_image_path)
+
+            content: List[MediaObject] = [
+                MediaObject(location=local_image_path, content_type="image/jpeg"),
+                MediaObject(text=row["question"], content_type="text/plain"),
+            ]
+            references: List[Reference] = [Reference(output=Output(text=row["answer"]), tags=[CORRECT_TAG])]
+            instances.append(
+                Instance(Input(multimedia_content=MultimediaObject(content)), references=references, split=TEST_SPLIT)
+            )
+
+        return instances
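
A short usage sketch for the new scenario (illustrative only; the output directory is a placeholder, and in practice HELM constructs scenarios through run specs rather than directly):

from helm.benchmark.scenarios.vision_language.real_world_qa_scenario import RealWorldQAScenario

# Download the xai-org/RealworldQA dataset, cache images under output_path,
# and build one instance per question with the answer as the correct reference.
scenario = RealWorldQAScenario()
instances = scenario.get_instances(output_path="benchmark_output/scenarios/real_world_qa")
print(len(instances), instances[0].references[0].output.text)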

helm/benchmark/scenarios/vision_language/seed_bench_scenario.py
@@ -35,10 +35,10 @@ class SEEDBenchScenario(Scenario):
     the multiple-choice metric for evaluating the performance of models.
 
     @article{li2023seed,
-        title={Seed-bench: Benchmarking multimodal llms with generative comprehension},
-        author={Li, Bohao and Wang, Rui and Wang, Guangzhi and Ge, Yuying and Ge, Yixiao and Shan, Ying},
-        journal={arXiv preprint arXiv:2307.16125},
-        year={2023}
+        title={Seed-bench: Benchmarking multimodal llms with generative comprehension},
+        author={Li, Bohao and Wang, Rui and Wang, Guangzhi and Ge, Yuying and Ge, Yixiao and Shan, Ying},
+        journal={arXiv preprint arXiv:2307.16125},
+        year={2023}
     }
 
     Paper: https://arxiv.org/abs/2307.16125
@@ -59,7 +59,9 @@ class SEEDBenchScenario(Scenario):
     }
 
     name = "seed_bench"
-    description = "Evaluate multimodal models on ([paper](https://arxiv.org/abs/2307.16125))."
+    description = (
+        "Evaluate multimodal models on 9 evaluation aspects " "([Li et al., 2023](https://arxiv.org/abs/2307.16125))."
+    )
     tags = ["vision-language"]
 
     def __init__(self, subject: str):