crfm-helm 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (236)
  1. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +41 -57
  2. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +197 -152
  3. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +32 -31
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
  6. helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
  7. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
  8. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
  9. helm/benchmark/adaptation/common_adapter_specs.py +2 -0
  10. helm/benchmark/annotation/air_bench_annotator.py +64 -0
  11. helm/benchmark/annotation/annotator_factory.py +6 -0
  12. helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
  13. helm/benchmark/annotation/call_center_annotator.py +247 -0
  14. helm/benchmark/annotation/financebench_annotator.py +79 -0
  15. helm/benchmark/annotation/harm_bench_annotator.py +68 -0
  16. helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
  17. helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
  18. helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
  19. helm/benchmark/annotation/live_qa_annotator.py +71 -0
  20. helm/benchmark/annotation/medication_qa_annotator.py +68 -0
  21. helm/benchmark/annotation/model_as_judge.py +45 -0
  22. helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
  23. helm/benchmark/annotation/xstest_annotator.py +110 -0
  24. helm/benchmark/augmentations/translate_perturbation.py +1 -0
  25. helm/benchmark/huggingface_registration.py +16 -6
  26. helm/benchmark/metrics/air_bench_metrics.py +56 -0
  27. helm/benchmark/metrics/annotation_metrics.py +108 -0
  28. helm/benchmark/metrics/bhasa_metrics.py +188 -0
  29. helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
  30. helm/benchmark/metrics/code_metrics_helper.py +11 -1
  31. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  32. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  33. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  34. helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
  35. helm/benchmark/metrics/live_qa_metrics.py +23 -0
  36. helm/benchmark/metrics/medication_qa_metrics.py +23 -0
  37. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  38. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  39. helm/benchmark/metrics/safety_metrics.py +57 -0
  40. helm/benchmark/metrics/summac/model_summac.py +3 -3
  41. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
  42. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
  43. helm/benchmark/metrics/unitxt_metrics.py +20 -10
  44. helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
  45. helm/benchmark/metrics/vision_language/image_metrics.py +30 -72
  46. helm/benchmark/metrics/vision_language/image_utils.py +1 -1
  47. helm/benchmark/model_metadata_registry.py +3 -3
  48. helm/benchmark/presentation/schema.py +54 -4
  49. helm/benchmark/presentation/test_run_entry.py +1 -0
  50. helm/benchmark/presentation/test_schema.py +11 -0
  51. helm/benchmark/run.py +31 -2
  52. helm/benchmark/run_expander.py +113 -10
  53. helm/benchmark/run_spec_factory.py +4 -0
  54. helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
  55. helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
  56. helm/benchmark/run_specs/call_center_run_specs.py +152 -0
  57. helm/benchmark/run_specs/classic_run_specs.py +15 -11
  58. helm/benchmark/run_specs/decodingtrust_run_specs.py +11 -9
  59. helm/benchmark/run_specs/experimental_run_specs.py +85 -0
  60. helm/benchmark/run_specs/finance_run_specs.py +110 -0
  61. helm/benchmark/run_specs/safety_run_specs.py +154 -0
  62. helm/benchmark/run_specs/vlm_run_specs.py +251 -57
  63. helm/benchmark/scenarios/air_bench_scenario.py +50 -0
  64. helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
  65. helm/benchmark/scenarios/banking77_scenario.py +51 -0
  66. helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
  67. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  68. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  69. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
  70. helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
  71. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  72. helm/benchmark/scenarios/fin_qa_scenario.py +119 -0
  73. helm/benchmark/scenarios/financebench_scenario.py +53 -0
  74. helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
  75. helm/benchmark/scenarios/scenario.py +1 -1
  76. helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
  77. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  78. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  79. helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
  80. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  81. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  82. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  83. helm/benchmark/scenarios/test_math_scenario.py +2 -8
  84. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  85. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  86. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  87. helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
  88. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
  89. helm/benchmark/scenarios/vision_language/bingo_scenario.py +5 -5
  90. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
  91. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  92. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  93. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
  94. helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
  95. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
  96. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
  97. helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +13 -2
  98. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -7
  99. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -5
  100. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
  101. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
  102. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
  103. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +44 -13
  104. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
  105. helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
  106. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
  107. helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
  108. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
  109. helm/benchmark/scenarios/vision_language/pairs_scenario.py +7 -6
  110. helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
  111. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  112. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
  113. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +5 -5
  114. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
  115. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
  116. helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
  117. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  118. helm/benchmark/server.py +1 -6
  119. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  120. helm/benchmark/static/schema_bhasa.yaml +709 -0
  121. helm/benchmark/static/schema_call_center.yaml +232 -0
  122. helm/benchmark/static/schema_classic.yaml +3 -59
  123. helm/benchmark/static/schema_cleva.yaml +768 -0
  124. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  125. helm/benchmark/static/schema_ewok.yaml +367 -0
  126. helm/benchmark/static/schema_finance.yaml +189 -0
  127. helm/benchmark/static/schema_image2struct.yaml +588 -0
  128. helm/benchmark/static/schema_instruction_following.yaml +3 -52
  129. helm/benchmark/static/schema_lite.yaml +3 -61
  130. helm/benchmark/static/schema_medical.yaml +255 -0
  131. helm/benchmark/static/schema_mmlu.yaml +3 -61
  132. helm/benchmark/static/schema_safety.yaml +247 -0
  133. helm/benchmark/static/schema_tables.yaml +317 -0
  134. helm/benchmark/static/schema_thai.yaml +244 -0
  135. helm/benchmark/static/schema_unitxt.yaml +3 -61
  136. helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +304 -298
  137. helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
  138. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  139. helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
  140. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  141. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  142. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  143. helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
  144. helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
  145. helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
  146. helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
  147. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  148. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  149. helm/benchmark/static_build/index.html +2 -2
  150. helm/benchmark/window_services/test_openai_window_service.py +8 -8
  151. helm/clients/ai21_client.py +71 -1
  152. helm/clients/anthropic_client.py +50 -28
  153. helm/clients/auto_client.py +11 -0
  154. helm/clients/client.py +24 -7
  155. helm/clients/cohere_client.py +98 -3
  156. helm/clients/huggingface_client.py +79 -19
  157. helm/clients/nvidia_nim_client.py +35 -0
  158. helm/clients/openai_client.py +11 -5
  159. helm/clients/palmyra_client.py +25 -0
  160. helm/clients/perspective_api_client.py +11 -6
  161. helm/clients/reka_client.py +189 -0
  162. helm/clients/test_client.py +7 -9
  163. helm/clients/test_huggingface_client.py +19 -3
  164. helm/clients/test_together_client.py +72 -2
  165. helm/clients/together_client.py +129 -23
  166. helm/clients/vertexai_client.py +62 -18
  167. helm/clients/vision_language/huggingface_vlm_client.py +1 -0
  168. helm/clients/vision_language/open_flamingo_client.py +1 -2
  169. helm/clients/vision_language/paligemma_client.py +146 -0
  170. helm/clients/vision_language/palmyra_vision_client.py +99 -0
  171. helm/clients/yi_client.py +31 -0
  172. helm/common/critique_request.py +10 -1
  173. helm/common/images_utils.py +25 -0
  174. helm/common/mongo_key_value_store.py +2 -1
  175. helm/common/request.py +16 -0
  176. helm/config/model_deployments.yaml +740 -363
  177. helm/config/model_metadata.yaml +824 -128
  178. helm/config/tokenizer_configs.yaml +207 -10
  179. helm/proxy/critique/model_critique_client.py +32 -4
  180. helm/proxy/example_queries.py +14 -21
  181. helm/proxy/services/server_service.py +2 -3
  182. helm/proxy/token_counters/test_auto_token_counter.py +2 -2
  183. helm/tokenizers/ai21_tokenizer.py +51 -59
  184. helm/tokenizers/auto_tokenizer.py +1 -1
  185. helm/tokenizers/cohere_tokenizer.py +29 -62
  186. helm/tokenizers/huggingface_tokenizer.py +35 -13
  187. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  188. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  189. helm/tokenizers/test_huggingface_tokenizer.py +5 -1
  190. helm/benchmark/static/benchmarking.css +0 -156
  191. helm/benchmark/static/benchmarking.js +0 -1705
  192. helm/benchmark/static/config.js +0 -3
  193. helm/benchmark/static/general.js +0 -122
  194. helm/benchmark/static/images/crfm-logo.png +0 -0
  195. helm/benchmark/static/images/helm-logo-simple.png +0 -0
  196. helm/benchmark/static/images/helm-logo.png +0 -0
  197. helm/benchmark/static/images/language-model-helm.png +0 -0
  198. helm/benchmark/static/images/organizations/ai21.png +0 -0
  199. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  200. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  201. helm/benchmark/static/images/organizations/cohere.png +0 -0
  202. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  203. helm/benchmark/static/images/organizations/google.png +0 -0
  204. helm/benchmark/static/images/organizations/meta.png +0 -0
  205. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  206. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  207. helm/benchmark/static/images/organizations/openai.png +0 -0
  208. helm/benchmark/static/images/organizations/together.png +0 -0
  209. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  210. helm/benchmark/static/images/organizations/yandex.png +0 -0
  211. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  212. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  213. helm/benchmark/static/index.html +0 -68
  214. helm/benchmark/static/info-icon.png +0 -0
  215. helm/benchmark/static/json-urls.js +0 -69
  216. helm/benchmark/static/plot-captions.js +0 -27
  217. helm/benchmark/static/schema_image2structure.yaml +0 -304
  218. helm/benchmark/static/utils.js +0 -285
  219. helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
  220. helm/benchmark/static_build/assets/index-878a1094.css +0 -1
  221. helm/benchmark/window_services/ai21_window_service.py +0 -247
  222. helm/benchmark/window_services/cohere_window_service.py +0 -101
  223. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  224. helm/benchmark/window_services/test_cohere_window_service.py +0 -75
  225. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  226. helm/benchmark/window_services/test_ice_window_service.py +0 -327
  227. helm/tokenizers/ice_tokenizer.py +0 -30
  228. helm/tokenizers/test_ice_tokenizer.py +0 -57
  229. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
  230. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
  231. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
  232. /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
  233. /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
  234. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
  235. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
  236. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
helm/benchmark/metrics/vision_language/image_metrics.py CHANGED
@@ -28,14 +28,14 @@ from helm.benchmark.metrics.vision_language.image_utils import (
     pixel_similarity,
     sift_similarity,
 )
-from helm.benchmark.metrics.vision_language.emd_utils import compute_emd_recursive, get_most_frequent_color
+from helm.benchmark.metrics.vision_language.emd_utils import compute_emd_recursive, get_most_frequent_color, to_gray
 
 try:
     from torchmetrics.image.lpip import LearnedPerceptualImagePatchSimilarity
     from PIL import Image
     import imagehash
 except ModuleNotFoundError as e:
-    handle_module_not_found_error(e, suggestions=["image2structure"])
+    handle_module_not_found_error(e, suggestions=["image2struct"])
 
 
 def pad(small_image: Image.Image, large_image: Image.Image, axis: int) -> Image.Image:
@@ -78,9 +78,8 @@ class AnnotatedImageMetrics(Metric):
 
     # Metric names
     COMPILE_METRIC: str = "compilation_success"
-    BLOCK_EARTH_MOVER_SIMILARITY_NORM1: str = "block_emd_similarity_white"
-    BLOCK_EARTH_MOVER_SIMILARITY_NORM2: str = "block_emd_similarity_median_color"
-    BLOCK_EARTH_MOVER_SIMILARITY: str = "block_emd_similarity"
+    EARTH_MOVER_SIMILARITY = "earth_mover_similarity"
+    BLOCK_EMD: str = "block_emd"
     PIXEL_SIMILARITY: str = "pixel_similarity"
     SIFT_SIMILARITY: str = "sift_similarity"
     LPIPS_SIMILARITY: str = "lpips_similarity"
@@ -108,12 +107,10 @@ class AnnotatedImageMetrics(Metric):
         metrics: List[AnnotatedMetric] = [
             AnnotatedMetric(self.PIXEL_SIMILARITY, pixel_similarity, "image_np_gray"),
             AnnotatedMetric(self.SIFT_SIMILARITY, sift_similarity, "image_np"),
-            # Raw block EMD
-            AnnotatedMetric(self.BLOCK_EARTH_MOVER_SIMILARITY, self.compute_block_emd_raw, "image_PIL"),
-            # Normalized block EMD against white
-            AnnotatedMetric(self.BLOCK_EARTH_MOVER_SIMILARITY_NORM1, self.compute_block_emd_white, "image_PIL"),
-            # Normalized block EMD against median
-            AnnotatedMetric(self.BLOCK_EARTH_MOVER_SIMILARITY_NORM2, self.compute_block_emd_median, "image_PIL"),
+            AnnotatedMetric(self.BLOCK_EMD, self.compute_block_emd_raw, "image_PIL"),  # Raw block-EMD
+            AnnotatedMetric(
+                self.EARTH_MOVER_SIMILARITY, self.ems, "image_PIL"
+            ),  # Normalized block-EMD against black/white
             AnnotatedMetric(self.LPIPS_SIMILARITY, self.lpips_similarity, "image_PIL"),
             AnnotatedMetric(self.FID_SIMILARITY, self.fid_similarity, "image_PIL"),
             AnnotatedMetric(self.SSIM_SIMILARITY, self.compute_ssim, "image_np_gray"),
@@ -391,9 +388,15 @@ class AnnotatedImageMetrics(Metric):
         features1 = self._get_inception_features(img1_tensor)
         features2 = self._get_inception_features(img2_tensor)
 
-        fid_score = self._calculate_fid(features1, features2)
-        normalize_fid: float = np.exp(-fid_score * self.NORMALIZE_FID_FACTOR)
-        return normalize_fid
+        # TODO: Justify the value of the constant here or remove this code to only keep the cosine similarity.
+        # fid_score = self._calculate_fid(features1, features2)
+        # normalize_fid: float = np.exp(-fid_score * self.NORMALIZE_FID_FACTOR)
+        # return normalize_fid
+
+        # Use the cosine similarity between the features as a proxy for FID
+        # Return a score between 0 and 1, where 1 is the most similar
+        score = 0.5 * (1 + np.dot(features1[0], features2[0]) / (np.linalg.norm(features1) * np.linalg.norm(features2)))
+        return score
 
     def compute_ssim(self, generated_image: np.ndarray, reference_image: np.ndarray) -> float:
         """Compute the Structural Similarity Index (SSIM) between the generated and reference images."""
@@ -414,58 +417,7 @@ class AnnotatedImageMetrics(Metric):
         result = _edit_similarity(completion_tokens, truncated_reference_tokens)
         return result
 
-    def compute_block_emd_white(
-        self,
-        pred_image: Image.Image,
-        ref_image: Image.Image,
-        threshold_most_frequent_color: float = 0.5,
-        patch_size: Tuple[int, int] = (8, 8),
-        max_num_patches: int = 100,
-        weight_most_frequent_color: float = 0.001,
-        use_tqdm: bool = False,
-    ):
-        """Computes the block Earth Moving Distance (EMD). This attempts to
-        speed up EMD for images with huge areas by considering movement/transformatio
-        of blocks of pixels. The score is normalized against EMD against white images
-        """
-
-        def compute_numerator():
-            return self.compute_block_emd_raw_wrapper(
-                pred_image,
-                ref_image,
-                threshold_most_frequent_color,
-                patch_size,
-                max_num_patches,
-                weight_most_frequent_color,
-                use_tqdm,
-            )
-
-        def compute_denominator():
-            constant_image = Image.new("RGB", ref_image.size, (255, 255, 255))  # default color is white
-            value = compute_emd_recursive(
-                constant_image,
-                ref_image,
-                threshold_most_frequent_color,
-                patch_size,
-                max_num_patches,
-                weight_most_frequent_color,
-                use_tqdm,
-            )
-            return {"value": value}
-
-        hash_dict = {
-            "reference_image": str(AnnotatedImageMetrics.HASH_FUNC(ref_image, hash_size=self.HASH_LENGTH)),
-        }
-        cache_key_numerator = {"metric_name": f"intermediate_{self.BLOCK_EARTH_MOVER_SIMILARITY}", **hash_dict}
-        cache_key_denominator = {"metric_name": f"intermediate_{self.BLOCK_EARTH_MOVER_SIMILARITY_NORM1}", **hash_dict}
-
-        assert self._cache is not None
-        emd_raw, _ = self._cache.get(cache_key_numerator, compute_numerator)
-        emd_base, _ = self._cache.get(cache_key_denominator, compute_denominator)
-
-        return 1.0 - emd_raw["value"] / emd_base["value"]
-
-    def compute_block_emd_median(
+    def ems(
         self,
         pred_image: Image.Image,
         ref_image: Image.Image,
@@ -493,9 +445,13 @@ class AnnotatedImageMetrics(Metric):
         def compute_denominator():
             ref_img_np = np.array(ref_image)
             (rgb_most_frequent_color, _) = get_most_frequent_color(ref_img_np)
+            grayscale_most_frequent_color = to_gray(rgb_most_frequent_color)[0]
 
             # Most frequent color as base
-            constant_image = Image.new("RGB", ref_image.size, tuple(rgb_most_frequent_color))  # type: ignore
+            if grayscale_most_frequent_color < 127:
+                constant_image = Image.new("RGB", ref_image.size, (255, 255, 255))  # Make it white
+            else:
+                constant_image = Image.new("RGB", ref_image.size, (0, 0, 0))  # Make it black
             value = compute_emd_recursive(
                 constant_image,
                 ref_image,
@@ -509,9 +465,10 @@ class AnnotatedImageMetrics(Metric):
 
         hash_dict = {
             "reference_image": str(AnnotatedImageMetrics.HASH_FUNC(ref_image, hash_size=self.HASH_LENGTH)),
+            "generated_image": str(AnnotatedImageMetrics.HASH_FUNC(pred_image, hash_size=self.HASH_LENGTH)),
         }
-        cache_key_numerator = {"metric_name": f"intermediate_{self.BLOCK_EARTH_MOVER_SIMILARITY}", **hash_dict}
-        cache_key_denominator = {"metric_name": f"intermediate_{self.BLOCK_EARTH_MOVER_SIMILARITY_NORM2}", **hash_dict}
+        cache_key_numerator = {"metric_name": f"intermediate_{self.BLOCK_EMD}", **hash_dict}
+        cache_key_denominator = {"metric_name": "intermediate_ems_extreme_denominator", **hash_dict}
 
         assert self._cache is not None
         emd_raw, _ = self._cache.get(cache_key_numerator, compute_numerator)
@@ -542,8 +499,9 @@ class AnnotatedImageMetrics(Metric):
 
         hash_dict = {
             "reference_image": str(AnnotatedImageMetrics.HASH_FUNC(ref_image, hash_size=self.HASH_LENGTH)),
+            "generated_image": str(AnnotatedImageMetrics.HASH_FUNC(pred_image, hash_size=self.HASH_LENGTH)),
         }
-        cache_key = {"metric_name": f"intermediate_{self.BLOCK_EARTH_MOVER_SIMILARITY}", **hash_dict}
+        cache_key = {"metric_name": f"intermediate_{self.BLOCK_EMD}", **hash_dict}
         assert self._cache is not None
         emd_raw, _ = self._cache.get(cache_key, compute)
 
@@ -560,8 +518,8 @@ class AnnotatedImageMetrics(Metric):
         use_tqdm: bool = False,
     ):
         """Computes the block Earth Moving Distance (EMD). This attempts to
-        speed up EMD for images with huge areas by considering movement/transformatio
-        of blocks of pixels. The score is normalized against EMD against white images
+        speed up EMD for images with huge areas by considering
+        movement/transformation of blocks of pixels.
         """
         emd_value = compute_emd_recursive(
             pred_image,
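
Note: the renamed earth_mover_similarity (ems) metric normalizes the raw block EMD by the EMD between the reference and a constant extreme-color image (white when the reference is mostly dark, black otherwise). Schematically, with illustrative numbers rather than values computed by the metric:

def earth_mover_similarity(emd_pred_ref: float, emd_extreme_ref: float) -> float:
    # 1.0 = the prediction matches the reference exactly;
    # 0.0 = no better than the constant black/white baseline image.
    return 1.0 - emd_pred_ref / emd_extreme_ref

print(earth_mover_similarity(emd_pred_ref=2.5, emd_extreme_ref=10.0))  # 0.75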
helm/benchmark/metrics/vision_language/image_utils.py CHANGED
@@ -6,7 +6,7 @@ try:
     import cv2
     from PIL.Image import Image
 except ModuleNotFoundError as e:
-    handle_module_not_found_error(e, suggestions=["image2structure"])
+    handle_module_not_found_error(e, suggestions=["image2struct"])
 
 
 def preprocess_image(image: Image) -> np.ndarray:
helm/benchmark/model_metadata_registry.py CHANGED
@@ -22,9 +22,6 @@ CHATML_MODEL_TAG: str = "CHATML_MODEL_TAG"
 # OpenAI Chat format
 OPENAI_CHATGPT_MODEL_TAG: str = "OPENAI_CHATGPT_MODEL_TAG"
 
-# Mistral instruction-following format
-MISTRAL_MODEL_TAG: str = "MISTRAL_MODEL_TAG"
-
 # For Anthropic models
 ANTHROPIC_CLAUDE_1_MODEL_TAG: str = "ANTHROPIC_CLAUDE_1_MODEL_TAG"
 ANTHROPIC_CLAUDE_2_MODEL_TAG: str = "ANTHROPIC_CLAUDE_2_MODEL_TAG"
@@ -69,6 +66,9 @@ OPEN_FLAMINGO_MODEL_TAG: str = "OPEN_FLAMINGO_MODEL_TAG"
 LIMITED_FUNCTIONALITY_VLM_TAG: str = "LIMITED_FUNCTIONALITY_VLM_TAG"
 FULL_FUNCTIONALITY_VLM_TAG: str = "FULL_FUNCTIONALITY_VLM_TAG"
 
+# Deprecated models that are no longer available.
+# These are usually closed API models that have been permanently removed
+DEPRECATED_MODEL_TAG: str = "DEPRECATED_MODEL_TAG"
 
 # Frozen is set to false as the model_deployment_registry.py file
 # might populate the deployment_names field.
helm/benchmark/presentation/schema.py CHANGED
@@ -1,6 +1,9 @@
+import ast
+import dataclasses
 from dataclasses import dataclass, field
 from typing import List, Optional, Dict
 import dacite
+from inspect import cleandoc
 import mako.template
 import yaml
 import importlib_resources as resources
@@ -17,6 +20,11 @@ SCHEMA_YAML_PACKAGE: str = "helm.benchmark.static"
 SCHEMA_CLASSIC_YAML_FILENAME: str = "schema_classic.yaml"
 
 
+_ADAPTER_SPEC_PACKAGE = "helm.benchmark.adaptation"
+_ADAPTER_SPEC_FILENAME = "adapter_spec.py"
+_ADAPTER_SPEC_CLASS_NAME = "AdapterSpec"
+
+
 @dataclass(frozen=True)
 class Field:
     """
@@ -198,9 +206,6 @@ class RunGroup(Field):
 class Schema:
     """Specifies information about what to display on the frontend."""
 
-    # Adapter fields (e.g., temperature)
-    adapter: List[Field]
-
     # Information about each field
     metrics: List[Field]
 
@@ -213,6 +218,11 @@ class Schema:
     # Group the scenarios
     run_groups: List[RunGroup]
 
+    # Adapter fields (e.g., temperature)
+    # Automatically populated from the docstrings in the AdapterSpec class definition.
+    # Should not be specified in the user's YAML file.
+    adapter: Optional[List[Field]] = None
+
     def __post_init__(self):
         self.name_to_metric = {metric.name: metric for metric in self.metrics}
         self.name_to_perturbation = {perturbation.name: perturbation for perturbation in self.perturbations}
@@ -220,6 +230,43 @@ class Schema:
         self.name_to_run_group = {run_group.name: run_group for run_group in self.run_groups}
 
 
+def get_adapter_fields() -> List[Field]:
+    """Generate the adapter fields from the docstrings in the AdapterSpec class definition."""
+    # Unfortunately there is no standard library support for getting docstrings of class fields,
+    # so we have to do the parsing ourselves. Fortunately, the parsing is quite straightforward.
+    adapter_spec_path = resources.files(_ADAPTER_SPEC_PACKAGE).joinpath(_ADAPTER_SPEC_FILENAME)
+    with open(adapter_spec_path, "r") as f:
+        contents = f.read()
+    module_node = ast.parse(contents)
+    adapter_spec_node = [
+        node
+        for node in ast.iter_child_nodes(module_node)
+        if isinstance(node, ast.ClassDef) and node.name == _ADAPTER_SPEC_CLASS_NAME
+    ][0]
+    metadata_fields: List[Field] = []
+    field_name: str = ""
+    for node in ast.iter_child_nodes(adapter_spec_node):
+        if isinstance(node, ast.AnnAssign) and isinstance(node.target, ast.Name):
+            # This node is a field definition.
+            # Save the name of the field for later.
+            field_name = node.target.id
+        else:
+            # If this is a docstring that immediately follows a field definition,
+            # output an adapter field with the name set to the field definition and
+            # the description set to the docstring.
+            if (
+                field_name
+                and isinstance(node, ast.Expr)
+                and isinstance(node.value, ast.Constant)
+                and isinstance(node.value.value, str)
+            ):
+                description = cleandoc(node.value.value).replace("\n", " ")
+                metadata_fields.append(Field(name=field_name, description=description))
+            field_name = ""
+
+    return metadata_fields
+
+
 def get_default_schema_path() -> str:
     return resources.files(SCHEMA_YAML_PACKAGE).joinpath(SCHEMA_CLASSIC_YAML_FILENAME)
 
@@ -229,4 +276,7 @@ def read_schema(schema_path: str) -> Schema:
     hlog(f"Reading schema file {schema_path}...")
     with open(schema_path, "r") as f:
         raw = yaml.safe_load(f)
-    return dacite.from_dict(Schema, raw)
+    schema = dacite.from_dict(Schema, raw)
+    if schema.adapter:
+        hlog(f"WARNING: The `adapter` field is deprecated and should be removed from schema file {schema_path}")
+    return dataclasses.replace(schema, adapter=get_adapter_fields())
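
Note: get_adapter_fields() relies on the convention that a string literal immediately following an annotated assignment in a class body documents that field. A self-contained sketch of the same parsing pattern (the Example class below is hypothetical, for illustration only):

import ast
from inspect import cleandoc

source = '''
class Example:
    method: str = ""
    """The high-level strategy for converting instances into a prompt."""

    temperature: float = 1.0
    """Temperature parameter used in generation."""
'''

class_node = next(
    node for node in ast.iter_child_nodes(ast.parse(source)) if isinstance(node, ast.ClassDef)
)
field_name = ""
for node in ast.iter_child_nodes(class_node):
    if isinstance(node, ast.AnnAssign) and isinstance(node.target, ast.Name):
        field_name = node.target.id  # remember the field; a docstring may follow
    elif field_name and isinstance(node, ast.Expr) and isinstance(node.value, ast.Constant):
        print(field_name, "->", cleandoc(node.value.value))
        field_name = ""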
helm/benchmark/presentation/test_run_entry.py CHANGED
@@ -16,6 +16,7 @@ class TestRunEntry:
 
     @pytest.mark.parametrize("fname", list_fnames())
     def test_read_all_specs(self, fname: str):
+        pytest.skip("Skipping slow tests")
         run_entries = read_run_entries([fname])
         for entry in run_entries.entries:
             construct_run_specs(parse_object_spec(entry.description))
helm/benchmark/presentation/test_schema.py ADDED
@@ -0,0 +1,11 @@
+from helm.benchmark.presentation.schema import get_adapter_fields
+
+
+def test_get_adapter_fields() -> None:
+    adapter_fields = get_adapter_fields()
+    assert adapter_fields
+    assert adapter_fields[0].name == "method"
+    assert (
+        adapter_fields[0].description
+        == "The high-level strategy for converting instances into a prompt for the language model."
+    )
helm/benchmark/run.py CHANGED
@@ -1,9 +1,11 @@
 import argparse
 from dataclasses import replace
 import os
+import re
 from typing import List, Optional
 
 
+from helm.benchmark import model_metadata_registry
 from helm.benchmark.presentation.run_entry import RunEntry, read_run_entries
 from helm.common.cache_backend_config import MongoCacheBackendConfig, SqliteCacheBackendConfig
 from helm.common.general import ensure_directory_exists
@@ -264,6 +266,13 @@ def main():
         default=None,
         help="Full class name of the Runner class to use. If unset, uses the default Runner.",
     )
+    parser.add_argument(
+        "--openvino",
+        action="store_true",
+        default=False,
+        help="Experimental: Apply openvino optimization to Hugging Face AutoModelForCausalLM models "
+        "specified with the --enable-huggingface-models and --enable-local-huggingface-models flags.",
+    )
     add_run_args(parser)
     args = parser.parse_args()
     validate_args(args)
@@ -275,12 +284,19 @@ def main():
         from helm.benchmark.huggingface_registration import register_huggingface_hub_model_from_flag_value
 
         for huggingface_model_name in args.enable_huggingface_models:
-            register_huggingface_hub_model_from_flag_value(huggingface_model_name)
+            if args.openvino:
+                register_huggingface_hub_model_from_flag_value(huggingface_model_name, args.openvino)
+            else:
+                register_huggingface_hub_model_from_flag_value(huggingface_model_name)
+
     if args.enable_local_huggingface_models:
         from helm.benchmark.huggingface_registration import register_huggingface_local_model_from_flag_value
 
         for huggingface_model_path in args.enable_local_huggingface_models:
-            register_huggingface_local_model_from_flag_value(huggingface_model_path)
+            if args.openvino:
+                register_huggingface_local_model_from_flag_value(huggingface_model_path, args.openvino)
+            else:
+                register_huggingface_local_model_from_flag_value(huggingface_model_path)
 
     run_entries: List[RunEntry] = []
     if args.conf_paths:
@@ -300,6 +316,19 @@ def main():
     ensure_directory_exists(args.output_path)
     set_benchmark_output_path(args.output_path)
 
+    # Validate the --models-to-run flag
+    if args.models_to_run:
+        all_models = set(model_metadata_registry.get_all_models())
+        for model_to_run in args.models_to_run:
+            if model_to_run not in all_models:
+                raise Exception(f"Unknown model '{model_to_run}' passed to --models-to-run")
+    else:
+        model_expander_pattern = re.compile(
+            r"\bmodel=(?:all|text_code|text|code|instruction_following|full_functionality_text|limited_functionality_text)\b"  # noqa: E501
+        )
+        if any(model_expander_pattern.search(run_entry.description) for run_entry in run_entries):
+            raise Exception("--models-to-run must be set if the `models=` run expander expands to multiple models")
+
     run_specs = run_entries_to_run_specs(
         run_entries=run_entries,
         max_eval_instances=args.max_eval_instances,
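
Note: the new validation only fires when a run entry uses a `model=` expander that can expand to multiple models. A small demonstration of the guard (the pattern is copied from the diff above; the run-entry strings are illustrative):

import re

pattern = re.compile(
    r"\bmodel=(?:all|text_code|text|code|instruction_following|full_functionality_text|limited_functionality_text)\b"
)
print(bool(pattern.search("mmlu:subject=anatomy,model=text")))         # True -> --models-to-run is required
print(bool(pattern.search("mmlu:subject=anatomy,model=openai/gpt2")))  # False -> no flag needed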
helm/benchmark/run_expander.py CHANGED
@@ -10,6 +10,7 @@ from helm.benchmark.model_metadata_registry import (
     get_all_text_models,
     get_model_metadata,
     get_model_names_with_tag,
+    DEPRECATED_MODEL_TAG,
     FULL_FUNCTIONALITY_TEXT_MODEL_TAG,
     LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
     ABLATION_MODEL_TAG,
@@ -194,6 +195,15 @@ class StopRunExpander(RunExpander):
         self.value = value
 
     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        if self.value == "none":
+            return [
+                replace(
+                    run_spec,
+                    name=f"{run_spec.name},{self.name}={self.value}",
+                    adapter_spec=replace(run_spec.adapter_spec, stop_sequences=[]),
+                ),
+            ]
+
         if self.value == "hash":
             stop = "###"
         elif self.value == "semicolon":
@@ -334,16 +344,6 @@ class AnthropicClaude3RunExpander(RunExpander):
                 run_spec,
                 adapter_spec=replace(run_spec.adapter_spec, stop_sequences=stop_sequences_with_non_whitespace),
             )
-        if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT:
-            instructions = "Answer with only a single letter."
-            if run_spec.adapter_spec.instructions:
-                instructions = f"{instructions}\n\n{run_spec.adapter_spec.instructions}"
-            return [
-                replace(
-                    run_spec,
-                    adapter_spec=replace(run_spec.adapter_spec, instructions=instructions),
-                ),
-            ]
         return [run_spec]
 
 
@@ -601,6 +601,12 @@ class ModelRunExpander(ReplaceValueRunExpander):
             values_dict["ablation"] = models
         else:
            values_dict[family_name] = models
+
+        # For each of the keys above, filter out deprecated models.
+        deprecated_models = set(get_model_names_with_tag(DEPRECATED_MODEL_TAG))
+        for family_name in values_dict.keys():
+            values_dict[family_name] = [model for model in values_dict[family_name] if model not in deprecated_models]
+
         return values_dict
 
 
@@ -1035,6 +1041,7 @@ PERTURBATION_SPECS_DICT: Dict[str, Dict[str, List[PerturbationSpec]]] = {
     "chinese": {"chinese": [translate(language_code="zh-CN")]},
     "hindi": {"hindi": [translate(language_code="hi")]},
     "spanish": {"spanish": [translate(language_code="es")]},
+    "swahili": {"swahili": [translate(language_code="sw")]},
     # Styles
     "art": {
         "art": [
@@ -1380,6 +1387,101 @@ class ChatMLRunExpander(RunExpander):
     ]
 
 
+class OutputFormatInstructions(RunExpander):
+    """Add extra instructions about output formatting to HELM Lite scenarios.
+
+    Many instruction-following models and chat models are tuned to expect conversational prompts
+    and respond in a conversational way. These models occasionally produce outputs that are not
+    in the expected format. This run expander instructs these models to provide the output in
+    the format expected by the scenario.
+
+    The argument should be the name of the scenario."""
+
+    name = "output_format_instructions"
+
+    _SUFFIX_SUFFIX = "_suffix"
+
+    def __init__(self, scenario: str):
+        if scenario.endswith(OutputFormatInstructions._SUFFIX_SUFFIX):
+            self.scenario = scenario[: -len(OutputFormatInstructions._SUFFIX_SUFFIX)]
+            self.suffix = True
+        else:
+            self.scenario = scenario
+            self.suffix = False
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT:
+            if self.scenario == "mmlu_only_last_question":
+                instructions = "Answer only the last question with only a single letter."
+            elif self.scenario == "mmlu":
+                instructions = "Answer with only a single letter."
+            elif self.scenario == "mcqa":
+                instructions = "Answer with only a single letter."
+            else:
+                instructions = "Answer with only a single letter."
+        elif run_spec.adapter_spec.method == ADAPT_GENERATION:
+            output_noun = run_spec.adapter_spec.output_prefix.split(":")[0]
+            if self.scenario == "narrative_qa":
+                instructions = (
+                    "Answer with one word, a few-word phrase, or a short sentence. "
+                    + "Avoid extra, unnecessary information in the answer."
+                )
+            elif self.scenario == "natural_qa":
+                instructions = "Answer with a short answer or a boolean 'yes' or 'no' answer."
+            elif self.scenario == "legalbench":
+                if output_noun != "Answer":
+                    instructions = f"Answer with the {output_noun.lower()}."
+                else:
+                    instructions = "Answer yes or no."
+            elif self.scenario == "legalbench_abercrombie":
+                instructions = "Answer with only 'generic', 'descriptive', 'suggestive', 'arbitrary' or 'fanciful'."
+            elif self.scenario == "legalbench_function_of_decision_section":
+                instructions = "Answer with only 'Facts', 'Procedural History', 'Issue', 'Rule', 'Analysis', 'Conclusion' or 'Decree'."  # noqa: E501
+            elif self.scenario == "legalbench_yes_or_no":
+                instructions = "Answer with only 'Yes' or 'No'."
+            elif self.scenario == "wmt_14":
+                instructions = "Answer with the English translation."
+            elif self.scenario == "wmt_14_only_last_sentence":
+                instructions = "Answer with only the English translation for the last sentence."
+            elif self.scenario == "math":
+                instructions = "Wrap the final answer with the \\boxed{} command."
+            elif self.scenario == "numeric_nlg":
+                instructions = "Answer with only description of the last table as a single paragraph on a single line."
+            elif self.scenario == "tab_fact":
+                instructions = (
+                    "Answer with only the classification of the last statement, either 'refuted' or 'entailed'."
+                )
+            elif self.scenario == "wikitq":
+                instructions = (
+                    "Answer only the last question with a short answer. "
+                    "Avoid extra, unnecessary information in the answer."
+                )
+            else:
+                raise ValueError(f"Unknown scenario {self.scenario}")
+
+        if self.suffix:
+            return [
+                replace(
+                    run_spec,
+                    adapter_spec=replace(
+                        run_spec.adapter_spec,
+                        global_suffix=f"{run_spec.adapter_spec.global_suffix}\n\n{instructions}",
+                    ),
+                ),
+            ]
+
+        if run_spec.adapter_spec.instructions:
+            instructions = f"{instructions}\n\n{run_spec.adapter_spec.instructions}"
+        else:
+            instructions = f"{instructions}\n"
+        return [
+            replace(
+                run_spec,
+                adapter_spec=replace(run_spec.adapter_spec, instructions=instructions),
+            ),
+        ]
+
+
 RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
     InstructionsRunExpander,
     PromptRunExpander,
@@ -1402,6 +1504,7 @@ RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
     NumOutputTokensRunExpander,
     ChatMLRunExpander,
     EvalSplitRunExpander,
+    OutputFormatInstructions,
 ]
 
 
helm/benchmark/run_spec_factory.py CHANGED
@@ -156,6 +156,10 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
         increase_temperature_expander = IncreaseTemperatureRunExpander(value=1e-4)
         run_spec = singleton(increase_temperature_expander.expand(run_spec))
 
+        # MedLM-Large
+        if run_spec.adapter_spec.model == "google/medlm-large":
+            run_spec = singleton(StopRunExpander("none").expand(run_spec))
+
         return run_spec
 
     run_specs = [alter_run_spec(run_spec) for run_spec in run_specs]
helm/benchmark/run_specs/air_bench_run_specs.py ADDED
@@ -0,0 +1,40 @@
+from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
+from helm.benchmark.annotation.annotator import AnnotatorSpec
+from helm.benchmark.metrics.metric import MetricSpec
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("air_bench_2024")
+def get_air_bench_2024_spec() -> RunSpec:
+    adapter_spec = AdapterSpec(
+        method=ADAPT_GENERATION,
+        global_prefix="",
+        global_suffix="",
+        instructions="",
+        input_prefix="",
+        input_suffix="",
+        output_prefix="",
+        output_suffix="",
+        instance_prefix="",
+        max_train_instances=0,
+        num_outputs=1,
+        max_tokens=512,
+        temperature=0.0,
+        stop_sequences=[],
+    )
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.air_bench_scenario.AIRBench2024Scenario")
+    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.air_bench_annotator.AIRBench2024Annotator")]
+    metric_specs = [
+        MetricSpec(class_name="helm.benchmark.metrics.air_bench_metrics.AIRBench2024ScoreMetric"),
+        MetricSpec(class_name="helm.benchmark.metrics.air_bench_metrics.AIRBench2024BasicGenerationMetric"),
+        MetricSpec(class_name="helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric"),
+    ]
+    return RunSpec(
+        name="air_bench_2024",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        annotators=annotator_specs,
+        groups=["air_bench_2024"],
+    )