crfm-helm 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (209) hide show
  1. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/METADATA +81 -112
  2. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/RECORD +165 -155
  3. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
  5. helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
  6. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
  7. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
  8. helm/benchmark/adaptation/common_adapter_specs.py +2 -0
  9. helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
  10. helm/benchmark/annotation/call_center_annotator.py +258 -0
  11. helm/benchmark/annotation/financebench_annotator.py +79 -0
  12. helm/benchmark/annotation/harm_bench_annotator.py +55 -0
  13. helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
  14. helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
  15. helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
  16. helm/benchmark/annotation/live_qa_annotator.py +37 -45
  17. helm/benchmark/annotation/medication_qa_annotator.py +36 -44
  18. helm/benchmark/annotation/model_as_judge.py +96 -0
  19. helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
  20. helm/benchmark/annotation/xstest_annotator.py +100 -0
  21. helm/benchmark/metrics/annotation_metrics.py +108 -0
  22. helm/benchmark/metrics/bhasa_metrics.py +188 -0
  23. helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
  24. helm/benchmark/metrics/code_metrics_helper.py +11 -1
  25. helm/benchmark/metrics/safety_metrics.py +79 -0
  26. helm/benchmark/metrics/summac/model_summac.py +3 -3
  27. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
  28. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
  29. helm/benchmark/metrics/unitxt_metrics.py +17 -3
  30. helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
  31. helm/benchmark/metrics/vision_language/image_utils.py +1 -1
  32. helm/benchmark/model_metadata_registry.py +3 -3
  33. helm/benchmark/presentation/create_plots.py +1 -1
  34. helm/benchmark/presentation/schema.py +3 -0
  35. helm/benchmark/presentation/summarize.py +106 -256
  36. helm/benchmark/presentation/test_run_entry.py +1 -0
  37. helm/benchmark/presentation/test_summarize.py +145 -3
  38. helm/benchmark/run.py +15 -0
  39. helm/benchmark/run_expander.py +83 -30
  40. helm/benchmark/run_specs/bhasa_run_specs.py +652 -0
  41. helm/benchmark/run_specs/call_center_run_specs.py +152 -0
  42. helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
  43. helm/benchmark/run_specs/experimental_run_specs.py +52 -0
  44. helm/benchmark/run_specs/finance_run_specs.py +82 -1
  45. helm/benchmark/run_specs/safety_run_specs.py +154 -0
  46. helm/benchmark/run_specs/vlm_run_specs.py +100 -24
  47. helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
  48. helm/benchmark/scenarios/banking77_scenario.py +51 -0
  49. helm/benchmark/scenarios/bhasa_scenario.py +1942 -0
  50. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  51. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
  52. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  53. helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
  54. helm/benchmark/scenarios/financebench_scenario.py +53 -0
  55. helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
  56. helm/benchmark/scenarios/raft_scenario.py +1 -1
  57. helm/benchmark/scenarios/scenario.py +1 -1
  58. helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
  59. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  60. helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
  61. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  62. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  63. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  64. helm/benchmark/scenarios/test_math_scenario.py +2 -8
  65. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  66. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  67. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  68. helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
  69. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
  70. helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
  71. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
  72. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  73. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  74. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
  75. helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
  76. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
  77. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
  78. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
  79. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
  80. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
  81. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
  82. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
  83. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
  84. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
  85. helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
  86. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
  87. helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
  88. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
  89. helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
  90. helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
  91. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  92. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
  93. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
  94. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
  95. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
  96. helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
  97. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  98. helm/benchmark/server.py +1 -6
  99. helm/benchmark/static/schema_air_bench.yaml +750 -750
  100. helm/benchmark/static/schema_bhasa.yaml +709 -0
  101. helm/benchmark/static/schema_call_center.yaml +232 -0
  102. helm/benchmark/static/schema_cleva.yaml +768 -0
  103. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  104. helm/benchmark/static/schema_ewok.yaml +367 -0
  105. helm/benchmark/static/schema_finance.yaml +55 -9
  106. helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
  107. helm/benchmark/static/schema_legal.yaml +566 -0
  108. helm/benchmark/static/schema_safety.yaml +266 -0
  109. helm/benchmark/static/schema_tables.yaml +149 -8
  110. helm/benchmark/static/schema_thai.yaml +21 -0
  111. helm/benchmark/static/schema_vhelm.yaml +137 -101
  112. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  113. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  114. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  115. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  116. helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
  117. helm/benchmark/static_build/assets/index-3ee38b3d.js +10 -0
  118. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  119. helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
  120. helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
  121. helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
  122. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  123. helm/benchmark/static_build/index.html +2 -2
  124. helm/benchmark/window_services/test_openai_window_service.py +8 -8
  125. helm/benchmark/window_services/tokenizer_service.py +0 -5
  126. helm/clients/ai21_client.py +71 -1
  127. helm/clients/anthropic_client.py +7 -19
  128. helm/clients/huggingface_client.py +38 -37
  129. helm/clients/nvidia_nim_client.py +35 -0
  130. helm/clients/openai_client.py +18 -4
  131. helm/clients/palmyra_client.py +24 -0
  132. helm/clients/perspective_api_client.py +11 -6
  133. helm/clients/test_client.py +4 -6
  134. helm/clients/together_client.py +22 -0
  135. helm/clients/vision_language/open_flamingo_client.py +1 -2
  136. helm/clients/vision_language/palmyra_vision_client.py +28 -13
  137. helm/common/cache.py +8 -30
  138. helm/common/images_utils.py +6 -0
  139. helm/common/key_value_store.py +9 -9
  140. helm/common/mongo_key_value_store.py +5 -4
  141. helm/common/request.py +16 -0
  142. helm/common/test_cache.py +1 -48
  143. helm/common/tokenization_request.py +0 -9
  144. helm/config/model_deployments.yaml +444 -329
  145. helm/config/model_metadata.yaml +513 -111
  146. helm/config/tokenizer_configs.yaml +140 -11
  147. helm/proxy/example_queries.py +14 -21
  148. helm/proxy/server.py +0 -9
  149. helm/proxy/services/remote_service.py +0 -6
  150. helm/proxy/services/server_service.py +6 -20
  151. helm/proxy/services/service.py +0 -6
  152. helm/proxy/token_counters/test_auto_token_counter.py +2 -2
  153. helm/tokenizers/ai21_tokenizer.py +51 -59
  154. helm/tokenizers/cohere_tokenizer.py +0 -75
  155. helm/tokenizers/huggingface_tokenizer.py +0 -1
  156. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  157. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  158. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  159. helm/benchmark/data_overlap/light_scenario.py +0 -60
  160. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  161. helm/benchmark/static/benchmarking.css +0 -156
  162. helm/benchmark/static/benchmarking.js +0 -1705
  163. helm/benchmark/static/config.js +0 -3
  164. helm/benchmark/static/general.js +0 -122
  165. helm/benchmark/static/images/crfm-logo.png +0 -0
  166. helm/benchmark/static/images/helm-logo-simple.png +0 -0
  167. helm/benchmark/static/images/helm-logo.png +0 -0
  168. helm/benchmark/static/images/language-model-helm.png +0 -0
  169. helm/benchmark/static/images/organizations/ai21.png +0 -0
  170. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  171. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  172. helm/benchmark/static/images/organizations/cohere.png +0 -0
  173. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  174. helm/benchmark/static/images/organizations/google.png +0 -0
  175. helm/benchmark/static/images/organizations/meta.png +0 -0
  176. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  177. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  178. helm/benchmark/static/images/organizations/openai.png +0 -0
  179. helm/benchmark/static/images/organizations/together.png +0 -0
  180. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  181. helm/benchmark/static/images/organizations/yandex.png +0 -0
  182. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  183. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  184. helm/benchmark/static/index.html +0 -68
  185. helm/benchmark/static/info-icon.png +0 -0
  186. helm/benchmark/static/json-urls.js +0 -69
  187. helm/benchmark/static/plot-captions.js +0 -27
  188. helm/benchmark/static/utils.js +0 -285
  189. helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
  190. helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
  191. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  192. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  193. helm/benchmark/window_services/ai21_window_service.py +0 -247
  194. helm/benchmark/window_services/cohere_window_service.py +0 -101
  195. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  196. helm/benchmark/window_services/test_cohere_window_service.py +0 -75
  197. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  198. helm/benchmark/window_services/test_ice_window_service.py +0 -327
  199. helm/tokenizers/ice_tokenizer.py +0 -30
  200. helm/tokenizers/test_ice_tokenizer.py +0 -57
  201. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/LICENSE +0 -0
  202. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/entry_points.txt +0 -0
  203. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/top_level.txt +0 -0
  204. /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
  205. /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
  206. /helm/benchmark/{data_overlap → scenarios/vision_language/image2struct}/__init__.py +0 -0
  207. /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
  208. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct/webpage}/__init__.py +0 -0
  209. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
@@ -0,0 +1,73 @@
1
import pytest

from tempfile import TemporaryDirectory

from helm.benchmark.scenarios.narrativeqa_scenario import NarrativeQAScenario
from helm.benchmark.scenarios.scenario import CORRECT_TAG, Output, Reference


@pytest.mark.scenarios
def test_narrativeqa_scenario():
    """End-to-end smoke test for NarrativeQAScenario.

    Downloads the NarrativeQA data into a temporary directory and checks the
    instance count, the exact text of the first instance (summary + question),
    its references, and its split. Marked with the ``scenarios`` pytest marker
    because it performs a slow network download.

    NOTE(review): the expected text below must match the dataset bytes exactly,
    so dataset-side typos (e.g. "is able is convince", "journey's") are kept
    verbatim. Only the mojibake "Curaรงao" (a TIS-620 misdecode of the UTF-8
    "Curaçao") is restored to the intended character.
    """
    scenario = NarrativeQAScenario()
    with TemporaryDirectory() as tmpdir:
        instances = scenario.get_instances(tmpdir)
        assert len(instances) == 1572
        assert (
            instances[0].input.text
            == "At Madeline Hall, an old mansion-house near Southampton belonging to the wealthy de Versely family, lives"
            " an elderly spinster Miss Delmar, the aunt of the earl de Versely and Captain Delmar. Miss Delmar invites"
            " Arabella Mason, the daughter of a deceased, well-liked steward to stay with her as a lower-class guest in"
            " the house. Captain Delmar is known to visit his aunt at Madeline Hall frequently, accompanied by his"
            " valet Ben Keene, who is also a private marine. Captain Delmar eventually suggests that Ben should propose"
            " to Arabella, and the two marry in secret, to the frustration of Miss Delmar and Arabella's mother. The"
            " captain is able to smooth over the situation with his aunt, even after it is discovered that Arabella was"
            " six months pregnant at the time of the marriage. She later gives birth to a boy, who takes the Captain's"
            " Christian name and Ben's surname--the titular Percival Keene.\nThe family moves to Chatham, after Ben is"
            " ordered back with his detachment. Arabella opens up a successful shop and circulating library below her"
            " house, enlisting the help of her mother and sister, Amelia. Percival becomes well known in town from his"
            " mischievous pranks on officers and other strangers, often encouraged by his aunt Amelia. However,"
            " Percival's mother and grandmother are less fond of his disregard for manners, and insist on sending him"
            " to school after an episode in which he bites his grandmother. Percival reports to the school house of Mr."
            " O'Gallagher, a poor Irish scholar, who rules his class with a system of severe corporal punishment. Mr."
            " O'Gallagher routinely bullies Percival by stealing his lunch, leading Percival to seek revenge by"
            " poisoning his sandwiches with calomel. On Guy Fawkes Day the schoolteacher confiscates all the"
            " schoolboys' fireworks, for which Percival retaliates by setting off the collected fireworks while the"
            " teacher sits above them, leading to the total destruction of the schoolhouse and near death of the"
            " schoolmaster.\nWhen Percival is a young teenager, Captain Delmar reappears and offers him a position"
            " aboard his new navy ship, the H.M. Calliope. While preparing to enter service, Percival overhears gossip"
            " of his illegitimate birth, introducing the idea that Captain Delmar may be his father. He confronts his"
            " mother about his parentage, which she at first harshly denies but later tearfully explains the truth of"
            " her affair. Early in his service in the navy, Percival is captured during a pirate raid along with"
            " others. The pirate crew is entirely black, and the captain explains that they are primarily escaped"
            " slaves from the Americas. Percival is taken in as a cabin boy, and later dyes his skin tan in the"
            " appearance of a mulatto to please the captain who doesn't approve of white skin. The pirates often seek"
            " to take over slave trading vessels, killing every white person on board. During the taking of one such"
            " vessel, Percival is able is convince the captain to spare the lives of a wealthy Dutch merchant and his"
            " young daughter, Minnie. Eventually the H.M. Calliope takes the pirate ship, and Percival--unrecognizable"
            " with his dyed skin--is taken as a prisoner, later to convince his fellow shipman of his true"
            " identity.\nAfter his reappearance aboard the ship, Percival gains esteem among the crew and is welcomed"
            " back by the emotional Captain Delmar. His reputation continues to grow over the course of his service in"
            " conflicts with Dutch and French vessels around the island of Curacao. He also stands in for an ill"
            " Captain Delmar in a duel with a French officer, effectively saving the captain's life. At this point, the"
            " captain receives news that his older brother has died, making him the new Lord de Versely, and before"
            " returning to England he grants Perceval command of his own schooner. After another intense but successful"
            " battle with a French war ship, Percival is promoted to captain. During his service in the Navy, Percival"
            " still partakes in the merry pranks of his youth, and at one point teams up with a mulatto hotel owner in"
            " Curaçao to convince his fellow officers they've been poisoned. He also keeps correspondence with Minnie,"
            " developing a romance with the beautiful heiress.\nNear the end of the story, Percival guides his crew"
            " through a terrible storm in which many of the crew are killed and the ship is heavily damaged. After"
            " being saved by another English vessel, he receives a letter informing him of Lord de Versely's sudden"
            " death from heart complications and learns that he has been left all of his personal property. Percival is"
            " still disappointed that he can not take his father's name. He later journey's with his friend Bob Cross"
            " to Hamburg to reunite with Minnie, but is captured by French troops on the road and sentenced to"
            " execution for spying. During a skirmish between the French and the Cossacks, Percival and Cross are able"
            " to escape and continue on the road. At the end of the novel, Percival proposes to Minnie, and stands to"
            " inherit a great fortune through her father. He also receives a letter from the de Versely attorney"
            " letting him know he has been granted the arms and name of Delmar.\nQuestion: Who did Percival reunited"
            " with?"
        )

        # Both capitalizations are accepted as correct answers.
        assert instances[0].references == [
            Reference(output=Output(text="Minnie"), tags=[CORRECT_TAG]),
            Reference(output=Output(text="minnie"), tags=[CORRECT_TAG]),
        ]
        assert instances[0].split == "train"
@@ -86,9 +86,9 @@ class ThaiExamScenario(Scenario):
86
86
  super().__init__()
87
87
  self.exam = exam
88
88
 
89
- def download_thai_exam(self, path: str):
89
+ def download_thai_exam(self, path: str, revision: str):
90
90
  ensure_file_downloaded(
91
- "https://storage.googleapis.com/thai_dataset/thai_exam.tar.gz",
91
+ f"https://huggingface.co/datasets/scb10x/thai_exam/resolve/{revision}/thai_exam.tar.gz",
92
92
  target_path=path,
93
93
  unpack=True,
94
94
  )
@@ -118,8 +118,8 @@ class ThaiExamScenario(Scenario):
118
118
 
119
119
  def get_instances(self, output_path) -> List[Instance]:
120
120
  data_path: str = os.path.join(output_path, "data")
121
- self.download_thai_exam(data_path)
122
-
121
+ # ThaiExam (v1.0) revision = d78aef04ea3cc5095545e6951cb39e17c64e26a1
122
+ self.download_thai_exam(data_path, revision="d78aef04ea3cc5095545e6951cb39e17c64e26a1")
123
123
  instances: List[Instance] = []
124
124
  splits: Dict[str, str] = {
125
125
  "train": TRAIN_SPLIT,
@@ -42,7 +42,7 @@ class AOKVQAScenario(Scenario):
42
42
  name = "a_okvqa"
43
43
  description = (
44
44
  "A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of "
45
- "commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718))."
45
+ "commonsense and world knowledge to answer ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718))."
46
46
  )
47
47
  tags = ["vision-language", "knowledge", "reasoning"]
48
48
 
@@ -51,8 +51,8 @@ class BingoScenario(Scenario):
51
51
 
52
52
  name = "bingo"
53
53
  description = (
54
- "Evaluate multimodal models on biased and inference-challenging scenarios with five subjects"
55
- " ([paper](https://arxiv.org/abs/2311.03287))."
54
+ "Evaluate multimodal models on biased and inference-challenging scenarios with five subjects "
55
+ "([Cui et al., 2023](https://arxiv.org/abs/2311.03287))."
56
56
  )
57
57
  tags = ["vision-language"]
58
58
 
@@ -75,7 +75,8 @@ class Crossmodal3600Scenario(Scenario):
75
75
  name = "crossmodal_3600"
76
76
  description = (
77
77
  "Crossmodal-3600 dataset (XM3600 in short), a geographically-diverse set of 3600 images annotated "
78
- "with human-generated reference captions in 36 languages. ([paper](https://arxiv.org/abs/2205.12522))."
78
+ "with human-generated reference captions in 36 languages. "
79
+ "([Thapliyal et al., 2022)](https://arxiv.org/abs/2205.12522))."
79
80
  )
80
81
  tags = ["vision-language", "multilinguality"]
81
82
 
@@ -0,0 +1,104 @@
1
+ from typing import List, Set
2
+ import os
3
+
4
+ from datasets import load_dataset
5
+ from tqdm import tqdm
6
+
7
+ from helm.benchmark.scenarios.scenario import (
8
+ CORRECT_TAG,
9
+ TEST_SPLIT,
10
+ TRAIN_SPLIT,
11
+ Instance,
12
+ Input,
13
+ Output,
14
+ Reference,
15
+ Scenario,
16
+ )
17
+ from helm.common.media_object import MediaObject, MultimediaObject
18
+ from helm.common.images_utils import generate_hash
19
+
20
+
21
class ExamsVScenario(Scenario):
    """
    EXAMS-V: A Multi-Discipline Multilingual Multimodal Exam Benchmark for Evaluating Vision Language Models

    A challenging multi-discipline multimodal multilingual exam benchmark for evaluating vision language models.
    It consists of 20,932 multiple-choice questions across 20 school disciplines covering natural science,
    social science, and other miscellaneous studies, e.g., religion, fine arts, business, etc.

    Paper: https://arxiv.org/abs/2403.10378
    Website: https://huggingface.co/datasets/Rocktim/EXAMS-V
    """

    HUGGINGFACE_DATASET_NAME: str = "Rocktim/EXAMS-V"

    # Language labels exactly as they appear in the dataset's "language" column.
    # NOTE(review): "Croation" is presumably the dataset's own (misspelled) label;
    # do not "correct" it here or no rows would match — verify against the dataset.
    VALID_LANGUAGES: Set[str] = {
        "Chinese",
        "Croation",
        "Italian",
        "Hungarian",
        "Arabic",
        "Serbian",
        "Bulgarian",
        "English",
        "German",
        "French",
        "Spanish",
        "Polish",
    }
    VALID_SUBJECT_GROUP: Set[str] = {
        "Natural Science",
        "Social Sciences",
        "Other",
    }
    VALID_TYPES: Set[str] = {"text", "image_text"}

    name = "exams_v"
    description = (
        "Multimodal and Multilingual benchmark to evaluate vision-language models across 20 school disciplines "
        "covering natural science, social science, and other miscellaneous studies "
        "([Das et al., 2024](https://arxiv.org/abs/2403.10378))."
    )
    tags = ["vision-language", "knowledge", "reasoning", "multilingual"]

    def __init__(self, language: str, subject_grouped: str, type: str) -> None:
        """
        Args:
            language: Dataset language label (one of `VALID_LANGUAGES`).
            subject_grouped: Subject group with underscores standing in for spaces
                (e.g. "Natural_Science"), validated against `VALID_SUBJECT_GROUP`.
            type: Question type, either "text" or "image_text".
        """
        super().__init__()

        # Run-spec arguments cannot contain spaces, so they are passed with "_" instead.
        subject_grouped = subject_grouped.replace("_", " ")
        assert subject_grouped in self.VALID_SUBJECT_GROUP, f"Invalid subject_grouped: {subject_grouped}"
        assert type in self.VALID_TYPES, f"Invalid type: {type}"
        assert language in self.VALID_LANGUAGES, f"Invalid language: {language}"

        self._language: str = language
        self._subject_grouped: str = subject_grouped
        self._type: str = type

    def get_instances(self, output_path: str) -> List[Instance]:
        """Download EXAMS-V and return instances matching the configured language, subject and type."""
        instances: List[Instance] = []

        for split in [TRAIN_SPLIT, TEST_SPLIT]:
            for row in tqdm(load_dataset(self.HUGGINGFACE_DATASET_NAME, split=split, cache_dir=output_path)):
                language: str = row["language"]
                subject_grouped: str = row["subject_grouped"]
                question_type: str = row["type"]  # renamed locally to avoid shadowing the builtin `type`

                # Exclude examples that do not match the specified language, subject, and type
                if language != self._language or subject_grouped != self._subject_grouped or question_type != self._type:
                    continue

                # Save the image to disk so it can be referenced by path in the multimodal prompt.
                # The filename is a content hash, so identical images are written only once.
                image = row["image"]
                image_file_name: str = generate_hash(image) + ".jpg"
                local_image_path: str = os.path.join(output_path, image_file_name)
                if not os.path.exists(local_image_path):
                    image.convert("RGB").save(local_image_path)

                content: List[MediaObject] = [
                    MediaObject(location=local_image_path, content_type="image/jpeg"),
                ]
                references: List[Reference] = [Reference(output=Output(text=row["answer_key"]), tags=[CORRECT_TAG])]
                instances.append(
                    Instance(Input(multimedia_content=MultimediaObject(content)), references=references, split=split)
                )

        return instances
@@ -0,0 +1,136 @@
1
+ from typing import List
2
+ import os
3
+
4
+ from datasets import load_dataset
5
+ from tqdm import tqdm
6
+
7
+ from helm.benchmark.scenarios.scenario import (
8
+ CORRECT_TAG,
9
+ VALID_SPLIT,
10
+ TRAIN_SPLIT,
11
+ Instance,
12
+ Input,
13
+ Output,
14
+ Reference,
15
+ Scenario,
16
+ )
17
+ from helm.common.media_object import MediaObject, MultimediaObject
18
+ from helm.common.images_utils import generate_hash
19
+
20
+
21
class FairFaceScenario(Scenario):
    """
    Identify the race, gender or age of a photo of a person.

    Citation: Kimmo Kärkkäinen and Jungseock Joo (2019),
    "FairFace: Face Attribute Dataset for Balanced Race, Gender, and Age",
    arXiv:1908.04913.

    Paper: https://arxiv.org/abs/1908.04913
    Website: https://huggingface.co/datasets/HuggingFaceM4/FairFace
    """

    HUGGINGFACE_DATASET_NAME: str = "HuggingFaceM4/FairFace"

    RACE_ATTRIBUTE: str = "race"
    GENDER_ATTRIBUTE: str = "gender"
    AGE_ATTRIBUTE: str = "age"

    QUESTION_TEMPLATE: str = "Identify the {attribute} of the person in the image."

    RACE_SUBGROUPS: List[str] = [
        "East Asian",
        "Indian",
        "Black",
        "White",
        "Middle Eastern",
        "Latino Hispanic",
        "Southeast Asian",
    ]
    GENDER_SUBGROUPS: List[str] = ["Male", "Female"]
    AGE_SUBGROUPS: List[str] = [
        "0-2 years",
        "3-9 years",
        "10-19 years",
        "20-29 years",
        "30-39 years",
        "40-49 years",
        "50-59 years",
        "60-69 years",
        "Over 70 years",
    ]

    name = "fair_face"
    description = (
        "Identify the race, gender or age of a photo of a person "
        "([Karkkainen et al., 2019](https://arxiv.org/abs/1908.04913))."
    )
    tags = ["vision-language", "fairness"]

    def __init__(self, attribute: str, subgroup: str) -> None:
        """
        Args:
            attribute: Which label to predict ("race", "gender" or "age").
            subgroup: The subgroup to keep, with underscores standing in for
                spaces (e.g. "East_Asian"); must be valid for `attribute`.
        """
        super().__init__()

        # Each attribute has its own fixed list of label choices.
        subgroups_by_attribute = {
            self.RACE_ATTRIBUTE: self.RACE_SUBGROUPS,
            self.GENDER_ATTRIBUTE: self.GENDER_SUBGROUPS,
            self.AGE_ATTRIBUTE: self.AGE_SUBGROUPS,
        }
        if attribute not in subgroups_by_attribute:
            raise ValueError(f"Invalid attribute: {attribute}")
        subgroups: List[str] = subgroups_by_attribute[attribute]

        # Run-spec arguments use "_" in place of spaces.
        subgroup = subgroup.replace("_", " ")
        assert subgroup in subgroups, f"Invalid subgroup for {attribute} attribute: {subgroup}"
        self._subgroup_choices: List[str] = subgroups
        self._correct_subgroup_index: int = subgroups.index(subgroup)

        self._attribute: str = attribute  # Column name used to read the label from each row
        self._question: str = self.QUESTION_TEMPLATE.format(attribute=attribute)  # Prompt text for the model

    def get_instances(self, output_path: str) -> List[Instance]:
        """Download FairFace and return instances for the configured attribute subgroup."""
        instances: List[Instance] = []
        for split in [TRAIN_SPLIT, VALID_SPLIT]:
            # The Hugging Face split is named "validation", not VALID_SPLIT.
            dataset_split_name: str = "validation" if split == VALID_SPLIT else split
            dataset = load_dataset(
                self.HUGGINGFACE_DATASET_NAME,
                "1.25",
                split=dataset_split_name,
                cache_dir=output_path,
            )
            for example in tqdm(dataset):
                # Keep only rows whose label index matches the requested subgroup
                if example[self._attribute] != self._correct_subgroup_index:
                    continue

                # Persist the image so the prompt can reference it by path;
                # the content-hash filename deduplicates identical images.
                image = example["image"]
                image_file_name: str = generate_hash(image) + ".jpg"
                local_image_path: str = os.path.join(output_path, image_file_name)
                if not os.path.exists(local_image_path):
                    image.save(local_image_path)

                content: List[MediaObject] = [
                    MediaObject(location=local_image_path, content_type="image/jpeg"),
                    MediaObject(text=self._question, content_type="text/plain"),
                ]
                # One reference per possible subgroup; only the configured one is tagged correct.
                references: List[Reference] = []
                for index, choice in enumerate(self._subgroup_choices):
                    choice_tags = [CORRECT_TAG] if index == self._correct_subgroup_index else []
                    references.append(Reference(output=Output(text=choice), tags=choice_tags))
                instances.append(
                    Instance(Input(multimedia_content=MultimediaObject(content)), references=references, split=split)
                )

        return instances
@@ -41,7 +41,7 @@ class Flickr30KScenario(Scenario):
41
41
  name = "flickr30k"
42
42
  description = (
43
43
  "An image caption corpus consisting of 158,915 crowd-sourced captions describing 31,783 Flickr "
44
- "images ([paper](https://shannon.cs.illinois.edu/DenotationGraph/TACLDenotationGraph.pdf))."
44
+ "images ([Young et al., 2014](https://shannon.cs.illinois.edu/DenotationGraph/TACLDenotationGraph.pdf))."
45
45
  )
46
46
  tags = ["vision-language"]
47
47
 
@@ -31,7 +31,7 @@ class GQAScenario(Scenario):
31
31
  }
32
32
 
33
33
  Paper: https://arxiv.org/abs/1902.09506
34
- Website: https://github.com/stanford-crfm/helm/issues/1951
34
+ Website: https://cs.stanford.edu/people/dorarad/gqa/about.html
35
35
  """
36
36
 
37
37
  QUESTIONS_URL: str = "https://downloads.cs.stanford.edu/nlp/data/gqa/questions1.2.zip"
@@ -40,7 +40,7 @@ class GQAScenario(Scenario):
40
40
  name = "gqa"
41
41
  description = (
42
42
  "Questions about real-world visual reasoning and compositional QA "
43
- "([paper](https://arxiv.org/abs/1902.09506))."
43
+ "([Hudson and Manning, 2019](https://arxiv.org/abs/1902.09506))."
44
44
  )
45
45
  tags = ["vision-language", "reasoning"]
46
46
 
@@ -49,7 +49,7 @@ class HatefulMemesScenario(Scenario):
49
49
  QUESTION: str = "Is this meme mean?"
50
50
 
51
51
  name = "hateful_memes"
52
- description = "Identify if a meme is hateful or not ([paper](https://arxiv.org/abs/2005.04790))."
52
+ description = "Identify if a meme is hateful or not ([Kiela et al., 2020](https://arxiv.org/abs/2005.04790))."
53
53
  tags = ["vision-language"]
54
54
 
55
55
  def get_instances(self, output_path: str) -> List[Instance]:
@@ -20,7 +20,7 @@ class Chart2CSVScenario(Scenario):
20
20
 
21
21
  name = "chart2csv"
22
22
  description = "Convert a chart to CSV."
23
- tags = ["vision-language", "image2structure"]
23
+ tags = ["vision-language", "image2struct"]
24
24
 
25
25
  def get_instances(self, output_path: str) -> List[Instance]:
26
26
  assert os.path.exists(output_path), f"Dataset does not exist at {output_path}"
@@ -1,14 +1,14 @@
1
- from helm.benchmark.scenarios.vision_language.image2structure.utils_latex import (
1
+ from helm.benchmark.scenarios.vision_language.image2struct.utils_latex import (
2
2
  latex_to_image,
3
3
  strip_unnecessary_latex_parts,
4
4
  )
5
- from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import Image2StructureScenario
5
+ from helm.benchmark.scenarios.vision_language.image2struct.image2struct_scenario import Image2StructureScenario
6
6
 
7
7
 
8
8
  class LatexScenario(Image2StructureScenario):
9
9
  BASE_PROMPT = "Please provide the LaTeX code used to generate this image. Only generate the code relevant to what you see. Your code will be surrounded by all the imports necessary as well as the begin and end document delimiters." # noqa: E501
10
10
  HUGGINGFACE_DATASET_NAME = "stanford-crfm/i2s-latex"
11
- SUBSETS = ["equation", "table", "plot", "algorithm", "real"]
11
+ SUBSETS = ["equation", "table", "plot", "algorithm", "wild", "wild_legacy"]
12
12
 
13
13
  name = "image2latex"
14
14
  description = "Evaluate multimodal models on Latex generation to recreate a provided image"
@@ -1,4 +1,4 @@
1
- from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import Image2StructureScenario
1
+ from helm.benchmark.scenarios.vision_language.image2struct.image2struct_scenario import Image2StructureScenario
2
2
 
3
3
 
4
4
  class MusicSheetScenario(Image2StructureScenario):
@@ -5,6 +5,7 @@ import os
5
5
  import re
6
6
 
7
7
  from helm.common.optional_dependencies import handle_module_not_found_error, OptionalDependencyNotInstalled
8
+ from helm.common.hierarchical_logger import hlog
8
9
 
9
10
  try:
10
11
  from latex import build_pdf
@@ -12,14 +13,13 @@ try:
12
13
  from PIL import ImageOps
13
14
  from PIL.Image import Image
14
15
  except ModuleNotFoundError as e:
15
- handle_module_not_found_error(e, suggestions=["image2structure"])
16
+ handle_module_not_found_error(e, suggestions=["image2struct"])
16
17
 
17
18
  # LaTeX preamble
18
19
  # Make sure to install "latex-full".
19
20
  TEX_INCLUDES = r"""
20
21
  \usepackage{amsmath,amssymb,amsfonts}
21
22
  \usepackage{graphicx}
22
- \usepackage{graphicx}
23
23
  \usepackage{amsmath}
24
24
  \usepackage{xcolor}
25
25
  \usepackage{algorithm}
@@ -98,23 +98,19 @@ def pdf_to_image(
98
98
 
99
99
  def strip_unnecessary_latex_parts(latex_code: str) -> str:
100
100
  """Strip unnecessary parts of the LaTeX code."""
101
-
102
101
  # Remove comments
103
102
  minimal_latex_code = re.sub(r"%.*?\n", "\n", latex_code)
104
-
105
103
  # Remove \documentclass and any \usepackage lines
106
- minimal_latex_code = re.sub(r"\\documentclass\{.*?\}\n", "", latex_code)
107
- minimal_latex_code = re.sub(r"\\usepackage(\[.*?\])?\{.*?\}\n", "", minimal_latex_code)
108
-
104
+ minimal_latex_code = re.sub(r"\\documentclass(\[.*?\])?\{.*?\}", "", latex_code)
105
+ minimal_latex_code = re.sub(r"\\documentstyle(\[.*?\])?\{.*?\}", "", minimal_latex_code)
106
+ minimal_latex_code = re.sub(r"\\usepackage(\[.*?\])?\{.*?\}", "", minimal_latex_code)
109
107
  # Remove everything before \begin{document} and including it, and everything after \end{document}
110
108
  minimal_latex_code = re.sub(r"\\begin\{document\}\n*", "", minimal_latex_code, flags=re.DOTALL)
111
109
  minimal_latex_code = re.sub(r"\\end\{document\}.*", "", minimal_latex_code, flags=re.DOTALL)
112
-
113
110
  # Ensure \begin{...} is followed by a \n
114
111
  minimal_latex_code = re.sub(r"(\\begin\{.*?\}(\[.*?\])?)(?!\n)", r"\1\n", minimal_latex_code)
115
112
  # Ensure \end{...} has a \n before it
116
113
  minimal_latex_code = re.sub(r"(\\end\{.*?\})(?!\n)", r"\1\n", minimal_latex_code)
117
-
118
114
  # Normalize space sequences to a single space globally
119
115
  minimal_latex_code = re.sub(r" +", " ", minimal_latex_code)
120
116
  # Replace tabs with a single space
@@ -123,7 +119,6 @@ def strip_unnecessary_latex_parts(latex_code: str) -> str:
123
119
  minimal_latex_code = re.sub(r"^[ \t]+|[ \t]+$", "", minimal_latex_code, flags=re.MULTILINE)
124
120
  # Remove unnecessary whitespace - multiple empty lines and tabulations
125
121
  minimal_latex_code = re.sub(r"\n\s*\n", "\n", minimal_latex_code)
126
-
127
122
  return minimal_latex_code.strip()
128
123
 
129
124
 
@@ -226,25 +221,21 @@ def handle_latex_error(
226
221
  # Error format: "LaTeX Error: Environment <env> undefined."
227
222
  undefined_search = re.search(r"LaTeX Error: Environment (.*) undefined", str_e)
228
223
  if undefined_search:
229
- # If a package is missing and this is our first retry, then simply include TEX_INCLUDES
230
- if num_try_remaining == MAX_NUM_TRIES:
231
- fixed_code = fixed_code.replace(TEX_BEGIN_FILE, TEX_BEGIN_FILE + "\n" + TEX_INCLUDES + "\n")
232
- if num_try_remaining < MAX_NUM_TRIES or fixed_code == original_latex_code:
233
- # Here we try to manually solve the missing environment.
234
- # This is either executed on the second rety or the first if no changements
235
- # were made in the first retry.
236
- assert TEX_INCLUDES in fixed_code, "TEX_INCLUDES should be present in the code"
237
- # TEX_INCLUDES is already present, so we add the missing package
238
- # Since we cannot know the name of the package that contains the missing environment,
239
- # we simply hope that they are named the same way.
240
- env_undefined: str = undefined_search.group(1)
241
-
242
- if f"\\usepackage{{{env_undefined}}}" in fixed_code:
243
- # We already tried to include the missing package, but it probably
244
- # does not exist, so we raise an error
245
- raise RuntimeError(str(e)) from e
246
-
247
- fixed_code = fixed_code.replace(TEX_BEGIN_FILE, TEX_BEGIN_FILE + f"\n\\usepackage{{{env_undefined}}}\n")
224
+ # Here we try to manually solve the missing environment.
225
+ # This is either executed on the second rety or the first if no changements
226
+ # were made in the first retry.
227
+ assert TEX_INCLUDES in fixed_code, f"TEX_INCLUDES should be present in the code. code={fixed_code}"
228
+ # TEX_INCLUDES is already present, so we add the missing package
229
+ # Since we cannot know the name of the package that contains the missing environment,
230
+ # we simply hope that they are named the same way.
231
+ env_undefined: str = undefined_search.group(1)
232
+
233
+ if f"\\usepackage{{{env_undefined}}}" in fixed_code:
234
+ # We already tried to include the missing package, but it probably
235
+ # does not exist, so we raise an error
236
+ raise RuntimeError(str(e)) from e
237
+
238
+ fixed_code = fixed_code.replace(TEX_BEGIN_FILE, TEX_BEGIN_FILE + f"\n\\usepackage{{{env_undefined}}}\n")
248
239
 
249
240
  # Try again with the fixed code (if the fixed code is different from the original code)
250
241
  if fixed_code != original_latex_code:
@@ -313,23 +304,24 @@ def latex_to_image(
313
304
 
314
305
  # 2. Add preamble
315
306
  # 2.1. Remove \documentclass if present to make sure we use our own
316
- documentclass_search = re.search(r"\\documentclass\{(.*)\}", original_latex_code)
307
+ documentclass_search = re.search(r"\\documentclass(\[.*?\])?\{.*?\}", original_latex_code)
308
+ documentstyle_search = re.search(r"\\documentstyle(\[.*?\])?\{.*?\}", original_latex_code)
317
309
  if documentclass_search:
318
- documentclass: str = documentclass_search.group(1)
319
- original_latex_code = original_latex_code.replace(f"\\documentclass{{{documentclass}}}", TEX_BEGIN_FILE)
310
+ matching_string = documentclass_search.group()
311
+ original_latex_code = original_latex_code.replace(matching_string, TEX_BEGIN_FILE)
312
+ elif documentstyle_search:
313
+ matching_string = documentstyle_search.group()
314
+ original_latex_code = original_latex_code.replace(matching_string, TEX_BEGIN_FILE)
320
315
  else:
321
316
  # If there is no \documentclass, we add our own
322
317
  original_latex_code = TEX_BEGIN_FILE + "\n\n" + original_latex_code
323
318
 
324
- # 2.2. Add includes. In this first step, we only add includes if none are present.
325
- # We do this because if some are present, we might define them twice which can cause errors
326
- # and this section should not make the original LaTeX code fail if it was compilable.
327
- # If there are missing packages, in handle_latex_error, we will add TEX_INCLUDES after the begin document,
328
- # which might define some packages twice, but often solves the problem.
329
- if not re.search(r"\\usepackage\{.*\}", original_latex_code):
330
- original_latex_code = original_latex_code.replace(TEX_BEGIN_FILE, TEX_BEGIN_FILE + "\n" + TEX_INCLUDES + "\n")
319
+ # 2.2. Add includes. In this ste we remove all includes for the default ones.
320
+ original_latex_code = re.sub(r"\\usepackage(\[.*?\])?\{.*\}", "", original_latex_code)
321
+ original_latex_code = original_latex_code.replace(TEX_BEGIN_FILE, TEX_BEGIN_FILE + "\n" + TEX_INCLUDES + "\n")
331
322
 
332
323
  latex_code: str = original_latex_code
324
+ hlog(f"Compiling LaTeX code:\n{latex_code}")
333
325
  try:
334
326
  pdf_stream = latex_to_pdf(latex_code, assets_path=assets_path)
335
327
  image = pdf_to_image(pdf_stream, crop=crop, resize_to=resize_to)
@@ -6,7 +6,7 @@ try:
6
6
  from selenium import webdriver
7
7
  import selenium.common.exceptions
8
8
  except ModuleNotFoundError as e:
9
- handle_module_not_found_error(e, suggestions=["image2structure"])
9
+ handle_module_not_found_error(e, suggestions=["image2struct"])
10
10
 
11
11
 
12
12
  def init_driver(url: str, resolution: Tuple[int, int] = (1920, 1080)) -> webdriver.Chrome:
@@ -5,7 +5,7 @@ from helm.common.optional_dependencies import handle_module_not_found_error
5
5
  try:
6
6
  from html2text import HTML2Text
7
7
  except ModuleNotFoundError as e:
8
- handle_module_not_found_error(e, suggestions=["image2structure"])
8
+ handle_module_not_found_error(e, suggestions=["image2struct"])
9
9
 
10
10
 
11
11
  def convert_html_to_text(handler: HTML2Text, html: str) -> str: