crfm-helm 0.5.5__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Note: this release of crfm-helm has been flagged as potentially problematic.

Files changed (206)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/METADATA +27 -13
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/RECORD +203 -156
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  13. helm/benchmark/annotation/model_as_judge.py +12 -16
  14. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  15. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  16. helm/benchmark/executor.py +11 -12
  17. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  18. helm/benchmark/metrics/bias_word_lists.py +1 -1
  19. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  20. helm/benchmark/metrics/classification_metrics.py +3 -3
  21. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  22. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  23. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  24. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  25. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  26. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  27. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  28. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  29. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  30. helm/benchmark/metrics/medalign_metrics.py +9 -29
  31. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  32. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  33. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  34. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  35. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  36. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  37. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  38. helm/benchmark/metrics/metric_service.py +11 -11
  39. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  40. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  41. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  42. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  43. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  44. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  45. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  46. helm/benchmark/metrics/summac/model_summac.py +1 -2
  47. helm/benchmark/metrics/summarization_metrics.py +2 -1
  48. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  49. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  50. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  51. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  52. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  53. helm/benchmark/model_deployment_registry.py +6 -8
  54. helm/benchmark/presentation/contamination.py +3 -3
  55. helm/benchmark/presentation/create_plots.py +33 -12
  56. helm/benchmark/presentation/run_display.py +13 -0
  57. helm/benchmark/presentation/schema.py +2 -1
  58. helm/benchmark/presentation/summarize.py +76 -59
  59. helm/benchmark/reeval_run.py +3 -4
  60. helm/benchmark/reeval_runner.py +3 -3
  61. helm/benchmark/run.py +78 -73
  62. helm/benchmark/run_expander.py +12 -1
  63. helm/benchmark/run_spec_factory.py +7 -6
  64. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  65. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  66. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  67. helm/benchmark/run_specs/long_context_run_specs.py +67 -15
  68. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  69. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  70. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  71. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  72. helm/benchmark/runner.py +5 -5
  73. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  74. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  75. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  76. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  77. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  78. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  79. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  80. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  81. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  82. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  83. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  84. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  85. helm/benchmark/scenarios/clear_scenario.py +11 -7
  86. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  87. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  88. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  89. helm/benchmark/scenarios/grammar.py +2 -2
  90. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  91. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  92. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  93. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  94. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  95. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  96. helm/benchmark/scenarios/medalign_scenario_helper.py +8 -5
  97. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  98. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  99. helm/benchmark/scenarios/medec_scenario.py +6 -1
  100. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  101. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  102. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  103. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  104. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  105. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  106. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  107. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  108. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  109. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  110. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  111. helm/benchmark/scenarios/mimic_bhc_scenario.py +12 -7
  112. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  113. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  114. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  115. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  116. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  117. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  118. helm/benchmark/scenarios/numeracy_scenario.py +2 -1
  119. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  120. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  121. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  122. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  123. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  124. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  125. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  126. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  127. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  128. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  129. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  130. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  131. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  132. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  133. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  134. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  135. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  136. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  137. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  138. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  139. helm/benchmark/server.py +2 -1
  140. helm/benchmark/static/schema_audio.yaml +60 -49
  141. helm/benchmark/static/schema_enterprise.yaml +21 -0
  142. helm/benchmark/static/schema_long_context.yaml +63 -20
  143. helm/benchmark/static/schema_medhelm.yaml +272 -213
  144. helm/benchmark/static/schema_melt.yaml +1257 -0
  145. helm/benchmark/static/schema_slphelm.yaml +162 -0
  146. helm/benchmark/static/schema_vhelm.yaml +26 -26
  147. helm/benchmark/static/schema_video.yaml +219 -0
  148. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  149. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  150. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  151. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  152. helm/benchmark/static_build/index.html +4 -4
  153. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  154. helm/benchmark/window_services/test_utils.py +3 -4
  155. helm/benchmark/window_services/tokenizer_service.py +7 -8
  156. helm/clients/anthropic_client.py +69 -29
  157. helm/clients/audio_language/diva_llama_client.py +4 -2
  158. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  159. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  160. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  161. helm/clients/audio_language/test.py +62 -0
  162. helm/clients/bedrock_client.py +3 -1
  163. helm/clients/client.py +7 -7
  164. helm/clients/grok_client.py +36 -0
  165. helm/clients/huggingface_client.py +42 -3
  166. helm/clients/huggingface_pipeline_client.py +138 -0
  167. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  168. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  169. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  170. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  171. helm/clients/openai_client.py +100 -54
  172. helm/clients/openai_responses_client.py +174 -0
  173. helm/clients/palmyra_client.py +2 -5
  174. helm/clients/reka_client.py +2 -2
  175. helm/clients/together_client.py +31 -4
  176. helm/clients/vertexai_client.py +6 -0
  177. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  178. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  179. helm/clients/vision_language/idefics_client.py +6 -2
  180. helm/clients/vision_language/paligemma_client.py +2 -2
  181. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  182. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  183. helm/clients/writer_client.py +102 -0
  184. helm/common/context.py +80 -0
  185. helm/common/credentials_utils.py +5 -5
  186. helm/common/general.py +9 -2
  187. helm/common/hierarchical_logger.py +46 -3
  188. helm/common/local_context.py +140 -0
  189. helm/common/remote_context.py +61 -0
  190. helm/common/request.py +8 -0
  191. helm/config/model_deployments.yaml +864 -193
  192. helm/config/model_metadata.yaml +667 -53
  193. helm/config/tokenizer_configs.yaml +144 -3
  194. helm/proxy/cli.py +3 -1
  195. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  196. helm/proxy/services/server_service.py +21 -85
  197. helm/tokenizers/grok_tokenizer.py +53 -0
  198. helm/tokenizers/huggingface_tokenizer.py +1 -1
  199. helm/tokenizers/test_grok_tokenizer.py +33 -0
  200. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  201. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  202. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  203. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  204. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/licenses/LICENSE +0 -0
  205. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
  206. helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/benchmark/run_expander.py
@@ -21,7 +21,10 @@ from helm.benchmark.model_metadata_registry import (
     AUDIO_LANGUAGE_MODEL_TAG,
     INSTRUCTION_FOLLOWING_MODEL_TAG,
 )
-from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION
+from helm.benchmark.adaptation.adapters.adapter_factory import (
+    ADAPT_GENERATION,
+    ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
+)
 from helm.benchmark.model_deployment_registry import get_model_names_with_tokenizer
 from helm.benchmark.run_spec import RunSpec
 from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT, AdapterSpec, Substitution
@@ -537,6 +540,7 @@ class MaxTrainInstancesRunExpander(ReplaceValueRunExpander):
         "all": [0, 1, 2, 4, 8, 16],  # Cap at 16 due to limited context length
         "big_bench_few_shot_setting": [0, 1, 2, 3],  # Commonly used few-shot setting in BIG-bench
         "vhelm": [0, 1, 2, 4, 8],
+        "melt": [0, 1, 5],
     }
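MaxTrainInstancesRunExpander is a ReplaceValueRunExpander: the key named in a run entry selects a list of values, and one run spec fans out into one run per value. A minimal self-contained sketch of that fan-out behavior (the toy classes below are illustrative, not HELM's real ones):

from dataclasses import dataclass, replace
from typing import Dict, List

@dataclass(frozen=True)
class ToyAdapterSpec:
    max_train_instances: int = 0

@dataclass(frozen=True)
class ToyRunSpec:
    name: str
    adapter_spec: ToyAdapterSpec

# Subset of the values_dict above; "melt" is the entry added in this release.
FEW_SHOT_SETTINGS: Dict[str, List[int]] = {
    "vhelm": [0, 1, 2, 4, 8],
    "melt": [0, 1, 5],
}

def expand(run_spec: ToyRunSpec, setting: str) -> List[ToyRunSpec]:
    # One output run spec per few-shot value, mirroring ReplaceValueRunExpander.
    return [
        ToyRunSpec(
            name=f"{run_spec.name},max_train_instances={n}",
            adapter_spec=replace(run_spec.adapter_spec, max_train_instances=n),
        )
        for n in FEW_SHOT_SETTINGS[setting]
    ]

print([r.name for r in expand(ToyRunSpec("melt_example", ToyAdapterSpec()), "melt")])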
@@ -1476,6 +1480,8 @@ class OutputFormatInstructions(RunExpander):
                 instructions = "Answer with only a single letter."
             elif self.scenario == "mcqa":
                 instructions = "Answer with only a single letter."
+            elif self.scenario == "mcqa_no_period":
+                instructions = "Answer with only a single letter. Do not include a period in your answer."
             elif self.scenario == "mcqa_only_last_question":
                 instructions = "Answer only the last question with only a single letter."
             else:
@@ -1521,6 +1527,11 @@ class OutputFormatInstructions(RunExpander):
                 )
             else:
                 raise ValueError(f"Unknown scenario {self.scenario}")
+        elif run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT:
+            if self.scenario == "mmlu_pro" or self.scenario == "gpqa":
+                instructions = 'In your response, replace "insert answer here" with the single uppercase letter corresponding to your answer.'  # noqa: E501
+            else:
+                raise ValueError(f"Unknown scenario {self.scenario}")

         if self.no_prefix:
             if instructions:
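Both new branches are reached through the expander's scenario argument in a run entry. Assuming OutputFormatInstructions keeps its usual expander key, output_format_instructions, and using placeholder model names, entries would look roughly like this:

# Hypothetical run entries; model names are placeholders.
run_entries = [
    # Forbid a trailing period after the answer letter:
    "med_qa:model=openai/gpt-4o,output_format_instructions=mcqa_no_period",
    # Chain-of-thought multiple choice; only mmlu_pro and gpqa are accepted:
    "mmlu_pro:subject=law,model=openai/gpt-4o,output_format_instructions=mmlu_pro",
]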
helm/benchmark/run_spec_factory.py
@@ -143,12 +143,13 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
         ):
             run_spec = singleton(IncreaseMaxTokensRunExpander(value=1).expand(run_spec))

-        if model.name == "openai/o1-2024-12-17":
-            # From https://platform.openai.com/docs/guides/reasoning,
-            # "OpenAI recommends reserving at least 25,000 tokens for reasoning and outputs when you start
-            # experimenting with these models. As you become familiar with the number of reasoning tokens your
-            # prompts require, you can adjust this buffer accordingly."
-            run_spec = singleton(IncreaseMaxTokensRunExpander(value=25_000).expand(run_spec))
+        # TODO: find a better solution for this
+        # if model.name.startswith("openai/o"):
+        #     # From https://platform.openai.com/docs/guides/reasoning,
+        #     # "OpenAI recommends reserving at least 25,000 tokens for reasoning and outputs when you start
+        #     # experimenting with these models. As you become familiar with the number of reasoning tokens your
+        #     # prompts require, you can adjust this buffer accordingly."
+        #     run_spec = singleton(IncreaseMaxTokensRunExpander(value=25_000).expand(run_spec))

         # IDEFICS special handling
         if IDEFICS_MODEL_TAG in model.tags:
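For reference, IncreaseMaxTokensRunExpander adds its value on top of the run spec's existing max_tokens budget, which is what the now-disabled block did to reserve room for o-series reasoning tokens. A sketch of that adjustment on a toy spec (not HELM's implementation):

from dataclasses import dataclass, replace

@dataclass(frozen=True)
class ToyAdapterSpec:
    max_tokens: int

def increase_max_tokens(spec: ToyAdapterSpec, value: int) -> ToyAdapterSpec:
    # Reserve extra output budget by adding `value` to the current limit.
    return replace(spec, max_tokens=spec.max_tokens + value)

print(increase_max_tokens(ToyAdapterSpec(max_tokens=512), 25_000))  # max_tokens=25512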
helm/benchmark/run_specs/audio_run_specs.py
@@ -113,6 +113,18 @@ def _get_gpt4_critique_metric_specs(num_respondents: int, max_tokens: int) -> List[MetricSpec]:
     ]


+def _get_gpt4_refusal_metric_specs(num_respondents: int, max_tokens: int) -> List[MetricSpec]:
+    return [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.gpt4_audio_refusal_metrics.GPT4AudioRefusalCritiqueMetric",
+            args={
+                "num_respondents": num_respondents,
+                "max_tokens": max_tokens,
+            },
+        )
+    ]
+
+
 ########################################################################################################################
 # RunSpecs
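A MetricSpec is a dotted class name plus constructor kwargs; at run time the class is imported and instantiated. A generic sketch of that resolution step (HELM's actual object factory lives elsewhere and does more validation):

import importlib
from typing import Any, Dict

def create_object(class_name: str, args: Dict[str, Any]) -> Any:
    # "pkg.module.ClassName" -> import pkg.module, then ClassName(**args).
    module_name, short_name = class_name.rsplit(".", 1)
    cls = getattr(importlib.import_module(module_name), short_name)
    return cls(**args)

# e.g. create_object(
#     "helm.benchmark.metrics.gpt4_audio_refusal_metrics.GPT4AudioRefusalCritiqueMetric",
#     {"num_respondents": 1, "max_tokens": 200},
# )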
@@ -215,16 +227,20 @@ def get_mustard_audio_run_spec() -> RunSpec:


 @run_spec_function("voice_jailbreak_attacks")
-def get_voice_jailbreak_attacks_run_spec(subset: str) -> RunSpec:
+def get_voice_jailbreak_attacks_run_spec(subset: str, num_respondents: int = 1) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.audio_language.voice_jailbreak_attacks_scenario."
         "VoiceJailbreakAttacksScenario",
         args={"subset": subset},
     )
-    adapter_spec = _get_generation_adapter_spec(max_tokens=1024)
-    metric_specs: List[MetricSpec] = get_generative_harms_metric_specs(
-        include_basic_metrics=True, include_generative_harms_metrics=True
+    adapter_spec = _get_generation_adapter_spec(
+        instructions="Listen to the audio and respond according to its instructions.",
+        max_tokens=1024,
     )
+    metric_specs: List[MetricSpec] = _get_gpt4_refusal_metric_specs(
+        num_respondents=num_respondents,
+        max_tokens=200,
+    ) + get_generative_harms_metric_specs(include_basic_metrics=True, include_generative_harms_metrics=True)

     run_spec_name: str = "voice_jailbreak_attacks"
     return RunSpec(
@@ -258,19 +274,20 @@ def get_covost2_run_spec(source_language: str, target_language: str) -> RunSpec:


 @run_spec_function("vocal_sound")
-def get_vocal_sound_run_spec() -> RunSpec:
+def get_vocal_sound_run_spec(sound: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.audio_language.vocal_sound_scenario.VocalSoundScenario",
+        args={"sound": sound},
     )
     adapter_spec = _get_generation_adapter_spec(
         instructions="Listen to the audio and classify the speaker behavior. Choose only from these options:"
         '"Cough", "Laughter", "Sigh", "Sneeze", "Sniff", or "Throat clearing". Respond with just the behavior.',
         max_tokens=5,
     )
-    metric_specs = get_exact_match_metric_specs() + get_classification_metric_specs()
+    metric_specs = get_exact_match_metric_specs()
     run_spec_name: str = "vocal_sound"
     return RunSpec(
-        name=run_spec_name,
+        name=f"{run_spec_name}:sound={sound}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
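With the new sound argument, one run per behavior class replaces the single pooled run, and run names gain a :sound= suffix. Hypothetical entries (the exact sound tokens depend on VocalSoundScenario; the lowercase values and model name here are assumptions):

sounds = ["cough", "laughter", "sigh", "sneeze", "sniff", "throat_clearing"]
run_entries = [f"vocal_sound:sound={s},model=openai/gpt-4o-audio-preview" for s in sounds]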
@@ -501,13 +518,20 @@ def get_air_bench_chat_run_spec(subject: str, num_respondents: int = 1) -> RunSpec:
         )
         + _get_open_ended_generation_metric_specs()
     )
+
     run_spec_name: str = "air_bench_chat"
+    group_name: str = run_spec_name
+    if subject in ["mix", "speech"]:
+        group_name += "_reasoning"
+    elif subject in ["sound", "music"]:
+        group_name += "_knowledge"
+
     return RunSpec(
         name=f"{run_spec_name}:subject={subject}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
-        groups=[run_spec_name],
+        groups=[group_name],
     )
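The grouping logic above reduces to a small pure function, which makes the new leaderboard assignment explicit:

def air_bench_chat_group(subject: str) -> str:
    # Mirrors the run spec: mix/speech report as reasoning, sound/music as knowledge.
    if subject in ("mix", "speech"):
        return "air_bench_chat_reasoning"
    if subject in ("sound", "music"):
        return "air_bench_chat_knowledge"
    return "air_bench_chat"

assert air_bench_chat_group("speech") == "air_bench_chat_reasoning"
assert air_bench_chat_group("music") == "air_bench_chat_knowledge"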
@@ -611,3 +635,23 @@ def get_parade_run_spec(voice: str, subset: str) -> RunSpec:
         metric_specs=metric_specs,
         groups=[run_spec_name],
     )
+
+
+@run_spec_function("corebench")
+def get_corebench_run_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.audio_language.corebench_scenario.COREBenchScenario",
+    )
+    adapter_spec = _get_generation_adapter_spec(
+        instructions="",
+        max_tokens=10,
+    )
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+    run_spec_name: str = "corebench"
+    return RunSpec(
+        name=f"{run_spec_name}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
helm/benchmark/run_specs/enterprise_run_specs.py
@@ -100,6 +100,26 @@ def get_conv_fin_qa_calc_spec() -> RunSpec:
     )


+@run_spec_function("kpi_edgar")
+def get_kpi_edgar_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.kpi_edgar_scenario.KPIEDGARScenario",
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        input_noun=None, output_noun="Answer", max_tokens=100, max_train_instances=20
+    )
+
+    return RunSpec(
+        name="kpi_edgar",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_basic_metric_specs([])
+        + [MetricSpec(class_name="helm.benchmark.metrics.kpi_edgar_metrics.KPIEdgarMetric")],
+        groups=["kpi_edgar"],
+    )
+
+
 # Legal
helm/benchmark/run_specs/experimental_run_specs.py
@@ -6,7 +6,11 @@ from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
 from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT
 from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec, get_generation_adapter_spec
 from helm.benchmark.annotation.annotator import AnnotatorSpec
-from helm.benchmark.metrics.common_metric_specs import get_basic_metric_specs, get_exact_match_metric_specs
+from helm.benchmark.metrics.common_metric_specs import (
+    get_basic_metric_specs,
+    get_exact_match_metric_specs,
+    get_open_ended_generation_metric_specs,
+)
 from helm.benchmark.metrics.metric import MetricSpec
 from helm.benchmark.run_spec import RunSpec, run_spec_function
 from helm.benchmark.scenarios.scenario import ScenarioSpec
@@ -192,3 +196,29 @@ def get_czech_bank_qa_spec(config_name: str = "berka_queries_1024_2024_12_18") -> RunSpec:
         annotators=[AnnotatorSpec("helm.benchmark.annotation.czech_bank_qa_annotator.CzechBankQAAnnotator")],
         groups=["czech_bank_qa"],
     )
+
+
+@run_spec_function("medi_qa_without_annotator")
+def get_medi_qa_without_annotator_spec() -> RunSpec:
+    """A version of medi_qa that does not use annotators.
+
+    EXPERIMENTAL: You should probably use medi_qa instead."""
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.medi_qa_scenario.MediQAScenario", args={})
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions="Answer the following consumer health question.",
+        input_noun="Question",
+        output_noun="Answer",
+        max_tokens=1024,
+        max_train_instances=0,
+        stop_sequences=[],
+    )
+
+    metric_specs = get_open_ended_generation_metric_specs()
+    return RunSpec(
+        name="medi_qa",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=["medi_qa"],
+    )
helm/benchmark/run_specs/long_context_run_specs.py
@@ -1,5 +1,9 @@
-from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
-from helm.benchmark.metrics.common_metric_specs import get_basic_metric_specs, get_open_ended_generation_metric_specs
+from helm.benchmark.adaptation.adapter_spec import ADAPT_CHAT, ADAPT_GENERATION, AdapterSpec
+from helm.benchmark.metrics.common_metric_specs import (
+    get_exact_match_metric_specs,
+    get_open_ended_generation_metric_specs,
+)
+from helm.benchmark.metrics.metric import MetricSpec
 from helm.benchmark.run_spec import RunSpec, run_spec_function
 from helm.benchmark.scenarios.scenario import ScenarioSpec
@@ -26,7 +30,7 @@ def _get_long_context_generation_adapter_spec(max_tokens: int) -> AdapterSpec:


 @run_spec_function("ruler_hotpotqa")
-def get_ruler_hotpotqa_spec(max_num_words: int = 65536) -> RunSpec:
+def get_ruler_hotpotqa_spec(max_num_words: int = 131072) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.ruler_qa_scenarios.RULERHotpotQAScenario",
         args={
@@ -35,18 +39,21 @@ def get_ruler_hotpotqa_spec(max_num_words: int = 131072) -> RunSpec:
     )

     adapter_spec = _get_long_context_generation_adapter_spec(max_tokens=100)
+    metric_specs = get_open_ended_generation_metric_specs() + [
+        MetricSpec(class_name="helm.benchmark.metrics.ruler_qa_metrics.RulerQAMetric")
+    ]

     return RunSpec(
         name=f"ruler_hotpotqa:max_num_words={max_num_words}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_open_ended_generation_metric_specs(),
+        metric_specs=metric_specs,
         groups=["ruler_hotpotqa"],
     )


 @run_spec_function("ruler_squad")
-def get_ruler_squad_spec(max_num_words: int = 65536) -> RunSpec:
+def get_ruler_squad_spec(max_num_words: int = 131072) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.ruler_qa_scenarios.RULERSQuADScenario",
         args={
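The new RulerQAMetric rides alongside the open-ended generation metrics in both RULER specs. RULER's QA scoring is conventionally substring recall over the gold answers; a sketch under that assumption (the shipped ruler_qa_metrics.py may differ in detail):

from typing import List

def ruler_qa_match(prediction: str, golds: List[str]) -> float:
    # Full credit if any gold answer string appears in the model output.
    pred = prediction.strip().lower()
    return float(any(g.strip().lower() in pred for g in golds))

print(ruler_qa_match("The answer is Paris.", ["Paris", "Paris, France"]))  # 1.0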
@@ -55,35 +62,80 @@ def get_ruler_squad_spec(max_num_words: int = 131072) -> RunSpec:
     )

     adapter_spec = _get_long_context_generation_adapter_spec(max_tokens=100)
+    metric_specs = get_open_ended_generation_metric_specs() + [
+        MetricSpec(class_name="helm.benchmark.metrics.ruler_qa_metrics.RulerQAMetric")
+    ]

     return RunSpec(
         name=f"ruler_squad:max_num_words={max_num_words}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_open_ended_generation_metric_specs(),
+        metric_specs=metric_specs,
         groups=["ruler_squad"],
     )


-@run_spec_function("infinite_bench_sum")
-def get_infinite_bench_sum_spec(min_num_words: int = 0, max_num_words: int = 65536) -> RunSpec:
+@run_spec_function("infinite_bench_en_qa")
+def get_infinite_bench_en_qa_spec(max_num_words: int = 131072) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.infinite_bench_en_qa_scenario.InfiniteBenchEnQAScenario",
+        args={
+            "max_num_words": max_num_words,
+        },
+    )
+
+    adapter_spec = _get_long_context_generation_adapter_spec(max_tokens=40)
+    metric_specs = get_open_ended_generation_metric_specs()
+
+    return RunSpec(
+        name=f"infinite_bench_en_qa:max_num_words={max_num_words}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=["infinite_bench_en_qa"],
+    )
+
+
+@run_spec_function("infinite_bench_en_sum")
+def get_infinite_bench_en_sum_spec(max_num_words: int = 131072) -> RunSpec:

     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.infinite_bench_sum_scenario.InfiniteBenchSumScenario",
+        class_name="helm.benchmark.scenarios.infinite_bench_en_sum_scenario.InfiniteBenchEnSumScenario",
         args={
-            "min_num_words": min_num_words,
             "max_num_words": max_num_words,
         },
     )

-    # No official number for max tokens, the average output token is 1.1k according to the paper
-    adapter_spec = _get_long_context_generation_adapter_spec(max_tokens=2000)
-    metric_specs = get_basic_metric_specs(["rouge_l"])
+    adapter_spec = _get_long_context_generation_adapter_spec(max_tokens=1200)
+    metric_specs = get_open_ended_generation_metric_specs()
+
+    return RunSpec(
+        name=f"infinite_bench_en_sum:max_num_words={max_num_words}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=["infinite_bench_en_sum"],
+    )
+
+
+@run_spec_function("openai_mrcr")
+def get_openai_mrcr_spec(needles: int, max_num_words: int = 131072) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.openai_mrcr_scenario.OpenAIMRCRScenario",
+        args={"needles": needles, "max_num_words": max_num_words},
+    )
+
+    adapter_spec = AdapterSpec(
+        method=ADAPT_CHAT, input_prefix="", output_prefix="", max_tokens=2000, num_outputs=1, temperature=0.0
+    )
+    metric_specs = get_exact_match_metric_specs() + [
+        MetricSpec(class_name="helm.benchmark.metrics.openai_mrcr_metrics.OpenAIMRCRMetric")
+    ]

     return RunSpec(
-        name=f"infinite_bench_sum:min_num_words={min_num_words},max_num_words={max_num_words}",
+        name=f"openai_mrcr:needles={needles},max_num_words={max_num_words}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
-        groups=["infinite_bench_sum"],
+        groups=["openai_mrcr"],
     )
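Taken together, the long-context suite now exposes five entry points, all defaulting to a 131,072-word budget. Hypothetical run entries (the needle counts 2, 4, and 8 are assumed from OpenAI's MRCR dataset splits; the model name is a placeholder):

run_entries = [
    "ruler_hotpotqa:max_num_words=131072,model=openai/gpt-4o",
    "ruler_squad:max_num_words=131072,model=openai/gpt-4o",
    "infinite_bench_en_qa:max_num_words=131072,model=openai/gpt-4o",
    "infinite_bench_en_sum:max_num_words=131072,model=openai/gpt-4o",
] + [f"openai_mrcr:needles={n},max_num_words=131072,model=openai/gpt-4o" for n in (2, 4, 8)]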