crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Files changed (268)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +2 -2
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  13. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  14. helm/benchmark/annotation/model_as_judge.py +12 -16
  15. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  16. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  17. helm/benchmark/executor.py +11 -12
  18. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  19. helm/benchmark/metrics/bias_word_lists.py +1 -1
  20. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  21. helm/benchmark/metrics/classification_metrics.py +3 -3
  22. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  23. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  24. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  25. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  26. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  27. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  28. helm/benchmark/metrics/comet_metric.py +1 -1
  29. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  30. helm/benchmark/metrics/copyright_metrics.py +1 -1
  31. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  32. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  33. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  34. helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
  35. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  36. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  37. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  38. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  39. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  40. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  41. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  42. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  43. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  44. helm/benchmark/metrics/medalign_metrics.py +9 -29
  45. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  46. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  47. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  48. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  49. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  50. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  51. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  52. helm/benchmark/metrics/metric_service.py +11 -11
  53. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  54. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  55. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  56. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  57. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  58. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  59. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  60. helm/benchmark/metrics/summac/model_summac.py +2 -3
  61. helm/benchmark/metrics/summarization_metrics.py +2 -1
  62. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  63. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  64. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  65. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  66. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  67. helm/benchmark/model_deployment_registry.py +16 -26
  68. helm/benchmark/presentation/contamination.py +3 -3
  69. helm/benchmark/presentation/create_plots.py +43 -13
  70. helm/benchmark/presentation/run_display.py +13 -0
  71. helm/benchmark/presentation/schema.py +7 -1
  72. helm/benchmark/presentation/summarize.py +84 -61
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/reeval_run.py +3 -4
  75. helm/benchmark/reeval_runner.py +3 -3
  76. helm/benchmark/run.py +84 -73
  77. helm/benchmark/run_expander.py +12 -1
  78. helm/benchmark/run_spec_factory.py +7 -6
  79. helm/benchmark/run_specs/arabic_run_specs.py +73 -0
  80. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  81. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  82. helm/benchmark/run_specs/classic_run_specs.py +0 -53
  83. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  84. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  85. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  86. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  87. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  88. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  89. helm/benchmark/run_specs/long_context_run_specs.py +114 -15
  90. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  91. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  92. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  93. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
  94. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  95. helm/benchmark/runner.py +5 -5
  96. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  97. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  98. helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
  99. helm/benchmark/scenarios/aratrust_scenario.py +76 -0
  100. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  101. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  102. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  103. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  104. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  105. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
  106. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  107. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
  108. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
  109. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
  110. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  111. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  112. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  113. helm/benchmark/scenarios/bluex_scenario.py +66 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  115. helm/benchmark/scenarios/clear_scenario.py +11 -7
  116. helm/benchmark/scenarios/cleva_scenario.py +1 -1
  117. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  118. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  119. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  120. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  121. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  122. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  123. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  124. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  125. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  126. helm/benchmark/scenarios/grammar.py +2 -2
  127. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  128. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  129. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  130. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  131. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  132. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  133. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  134. helm/benchmark/scenarios/math_scenario.py +21 -20
  135. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  136. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  137. helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
  138. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  139. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  140. helm/benchmark/scenarios/medec_scenario.py +6 -1
  141. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  142. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  143. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  144. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  145. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  146. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  147. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  148. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  149. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  150. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  151. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  152. helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
  153. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  154. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  155. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  156. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  157. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  158. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  159. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  160. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  161. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  162. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  163. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  164. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  165. helm/benchmark/scenarios/seahelm_scenario.py +2 -2
  166. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  167. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  168. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  169. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  170. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  171. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  172. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  173. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  174. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  175. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  176. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  177. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  178. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  179. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  180. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  181. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  182. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  183. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  184. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  185. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  186. helm/benchmark/server.py +2 -1
  187. helm/benchmark/slurm_jobs.py +1 -2
  188. helm/benchmark/slurm_runner.py +8 -1
  189. helm/benchmark/static/schema_arabic.yaml +228 -0
  190. helm/benchmark/static/schema_audio.yaml +60 -49
  191. helm/benchmark/static/schema_classic.yaml +0 -17
  192. helm/benchmark/static/schema_enterprise.yaml +21 -0
  193. helm/benchmark/static/schema_long_context.yaml +81 -20
  194. helm/benchmark/static/schema_medhelm.yaml +272 -213
  195. helm/benchmark/static/schema_melt.yaml +1257 -0
  196. helm/benchmark/static/schema_slphelm.yaml +162 -0
  197. helm/benchmark/static/schema_vhelm.yaml +26 -26
  198. helm/benchmark/static/schema_video.yaml +219 -0
  199. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  200. helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
  201. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  202. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  203. helm/benchmark/static_build/index.html +4 -4
  204. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  205. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  206. helm/benchmark/window_services/test_utils.py +3 -4
  207. helm/benchmark/window_services/tokenizer_service.py +7 -8
  208. helm/clients/anthropic_client.py +69 -29
  209. helm/clients/audio_language/diva_llama_client.py +4 -2
  210. helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
  211. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  212. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  213. helm/clients/audio_language/test.py +62 -0
  214. helm/clients/bedrock_client.py +3 -1
  215. helm/clients/client.py +7 -7
  216. helm/clients/grok_client.py +36 -0
  217. helm/clients/huggingface_client.py +42 -3
  218. helm/clients/huggingface_pipeline_client.py +138 -0
  219. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  220. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  221. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  222. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  223. helm/clients/openai_client.py +102 -55
  224. helm/clients/openai_responses_client.py +176 -0
  225. helm/clients/palmyra_client.py +2 -5
  226. helm/clients/reka_client.py +2 -2
  227. helm/clients/test_huggingface_client.py +3 -3
  228. helm/clients/together_client.py +31 -6
  229. helm/clients/vertexai_client.py +17 -9
  230. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  231. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  232. helm/clients/vision_language/idefics_client.py +6 -2
  233. helm/clients/vision_language/paligemma_client.py +2 -2
  234. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  235. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  236. helm/clients/vllm_client.py +43 -7
  237. helm/clients/vllm_granite_thinking_client.py +56 -0
  238. helm/clients/writer_client.py +102 -0
  239. helm/common/context.py +80 -0
  240. helm/common/credentials_utils.py +5 -5
  241. helm/common/critique_request.py +0 -1
  242. helm/common/general.py +9 -2
  243. helm/common/hierarchical_logger.py +104 -12
  244. helm/common/local_context.py +140 -0
  245. helm/common/object_spec.py +23 -8
  246. helm/common/remote_context.py +61 -0
  247. helm/common/request.py +8 -0
  248. helm/common/test_logging.py +94 -0
  249. helm/config/model_deployments.yaml +995 -45
  250. helm/config/model_metadata.yaml +780 -59
  251. helm/config/tokenizer_configs.yaml +224 -3
  252. helm/proxy/cli.py +4 -2
  253. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  254. helm/proxy/retry.py +5 -0
  255. helm/proxy/services/server_service.py +21 -85
  256. helm/tokenizers/grok_tokenizer.py +55 -0
  257. helm/tokenizers/huggingface_tokenizer.py +1 -1
  258. helm/tokenizers/test_grok_tokenizer.py +33 -0
  259. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  260. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  261. helm/benchmark/scenarios/numeracy_scenario.py +0 -793
  262. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  263. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  264. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  265. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
  266. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
  267. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
  268. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
@@ -0,0 +1,50 @@
+ from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT
+ from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
+ from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+ from helm.benchmark.run_spec import RunSpec, run_spec_function
+ from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+ @run_spec_function("mmmlu")
+ def get_mmmlu_spec(locale: str, subject: str) -> RunSpec:
+     scenario_spec = ScenarioSpec(
+         class_name="helm.benchmark.scenarios.mmmlu_scenario.MMMLUScenario", args={"locale": locale, "subject": subject}
+     )
+
+     adapter_spec = get_multiple_choice_adapter_spec(
+         method=ADAPT_MULTIPLE_CHOICE_JOINT,
+         instructions=f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')}. Answer the last question. Respond only with only a single letter corresponding to your choice.", # noqa: E501
+         input_noun="Question",
+         output_noun="Answer",
+     )
+
+     return RunSpec(
+         name=f"mmmlu:locale={locale},subject={subject}",
+         scenario_spec=scenario_spec,
+         adapter_spec=adapter_spec,
+         metric_specs=get_exact_match_metric_specs(),
+         groups=["mmmlu", f"mmmlu_{locale}_{subject}"],
+     )
+
+
+ @run_spec_function("exams_multilingual")
+ def get_exams_multilingual_spec(language: str, subject: str) -> RunSpec:
+     scenario_spec = ScenarioSpec(
+         class_name="helm.benchmark.scenarios.exams_multilingual_scenario.EXAMSMultilingualScenario",
+         args={"language": language, "subject": subject},
+     )
+
+     adapter_spec = get_multiple_choice_adapter_spec(
+         method=ADAPT_MULTIPLE_CHOICE_JOINT,
+         instructions=f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')}. Answer the last question. Respond only with only a single letter corresponding to your choice.", # noqa: E501
+         input_noun="Question",
+         output_noun="Answer",
+     )
+
+     return RunSpec(
+         name=f"exams_multilingual:locale={language},subject={subject}",
+         scenario_spec=scenario_spec,
+         adapter_spec=adapter_spec,
+         metric_specs=get_exact_match_metric_specs(),
+         groups=["exams_multilingual", f"exams_multilingual_{language}_{subject}"],
+     )
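The run spec functions above are registered by name and later resolved from run entry strings. For orientation, here is a minimal sketch (not part of the diff) of what calling the mmmlu function directly would produce; the module path and the locale/subject values are illustrative assumptions, not taken from this release.

```python
# Hypothetical direct call to the run spec function added above; the import
# path and the arguments are assumptions for illustration only.
from helm.benchmark.run_specs.multilingual_run_specs import get_mmmlu_spec  # assumed module

run_spec = get_mmmlu_spec(locale="AR_XY", subject="anatomy")
print(run_spec.name)    # mmmlu:locale=AR_XY,subject=anatomy
print(run_spec.groups)  # ['mmmlu', 'mmmlu_AR_XY_anatomy']
```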
@@ -0,0 +1,163 @@
+ from typing import List, Optional
+ from helm.benchmark.adaptation.adapter_spec import (
+     ADAPT_GENERATION_MULTIMODAL,
+     ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL,
+     AdapterSpec,
+ )
+ from helm.benchmark.metrics.common_metric_specs import (
+     get_basic_generation_metric_specs,
+     get_basic_metric_specs,
+     get_multiple_choice_classification_metric_specs,
+ )
+ from helm.benchmark.metrics.metric import MetricSpec
+ from helm.benchmark.run_spec import RunSpec, run_spec_function
+ from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+ def audio_classification_metric_specs() -> List[MetricSpec]:
+     return get_multiple_choice_classification_metric_specs() + get_basic_metric_specs(
+         ["exact_match", "quasi_exact_match"]
+     )
+
+
+ def _get_multiple_choice_joint_adapter_spec(
+     input_noun: Optional[str],
+     output_noun: str,
+     max_train_instances: int = 0,
+     num_outputs: int = 1,
+ ) -> AdapterSpec:
+     return AdapterSpec(
+         method=ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL,
+         global_prefix="",
+         instructions="Answer the multiple choice question by just giving the letter of the correct answer "
+         "and nothing else.",
+         input_prefix=f"{input_noun}: " if input_noun is not None else "",
+         input_suffix="\n",
+         output_prefix=f"{output_noun}: ",
+         output_suffix="\n",
+         instance_prefix="\n",
+         max_train_instances=max_train_instances,
+         num_outputs=num_outputs,
+         max_tokens=1,
+         stop_sequences=["\n"],
+         temperature=0.0,
+         random=None,
+     )
+
+
+ def _get_generation_adapter_spec(
+     max_tokens: int,
+     instructions: str = "",
+     max_train_instances: int = 0,
+     temperature: float = 0.0,
+     stop_sequences: Optional[List[str]] = None,
+ ) -> AdapterSpec:
+     return AdapterSpec(
+         method=ADAPT_GENERATION_MULTIMODAL,
+         instructions=instructions,
+         input_prefix="",
+         input_suffix="",
+         output_prefix="",
+         output_suffix="",
+         instance_prefix="",
+         max_train_instances=max_train_instances,
+         num_outputs=1,
+         max_tokens=max_tokens,
+         temperature=temperature,
+         stop_sequences=stop_sequences if stop_sequences is not None else [],
+     )
+
+
+ @run_spec_function("ultra_suite_classification")
+ def get_ultra_suite_classification_run_spec() -> RunSpec:
+     scenario_spec = ScenarioSpec(
+         class_name="helm.benchmark.scenarios.audio_language.ultra_suite_classification_scenario.UltraSuiteClassificationScenario", # noqa: E501
+     )
+     adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(input_noun=None, output_noun="Answer")
+     metric_specs: List[MetricSpec] = audio_classification_metric_specs()
+     run_spec_name: str = "ultra_suite_classification"
+     return RunSpec(
+         name=f"{run_spec_name}",
+         scenario_spec=scenario_spec,
+         adapter_spec=adapter_spec,
+         metric_specs=metric_specs,
+         groups=[run_spec_name],
+     )
+
+
+ @run_spec_function("ultra_suite_classification_breakdown")
+ def get_ultra_suite_disorder_breakdown_run_spec() -> RunSpec:
+     scenario_spec = ScenarioSpec(
+         class_name="helm.benchmark.scenarios.audio_language.ultra_suite_disorder_breakdown_scenario.UltraSuiteDisorderBreakdownScenario", # noqa: E501
+     )
+     adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(input_noun=None, output_noun="Answer")
+     metric_specs: List[MetricSpec] = audio_classification_metric_specs()
+     run_spec_name: str = "ultra_suite_classification_breakdown"
+     return RunSpec(
+         name=f"{run_spec_name}",
+         scenario_spec=scenario_spec,
+         adapter_spec=adapter_spec,
+         metric_specs=metric_specs,
+         groups=[run_spec_name],
+     )
+
+
+ # Makes the model transcribe the child's speech into text without assuming what the child is supposed to say
+ # if the transcription matches the prompt, then it is classified as typically developing
+ # otherwise, it is classified as having a speech disorder
+ @run_spec_function("ultra_suite_asr_classification")
+ def get_ultra_suite_asr_classification_run_spec() -> RunSpec:
+     scenario_spec = ScenarioSpec(
+         class_name="helm.benchmark.scenarios.audio_language.ultra_suite_asr_classification_scenario.UltraSuiteASRClassificationScenario", # noqa: E501
+     )
+     adapter_spec = _get_generation_adapter_spec(
+         instructions="""You are a highly experienced Speech-Language Pathologist (SLP). An audio recording is provided to you, typically consisting of a speech prompt from a pathologist followed by a child's repetition. Based on your expertise transcribe the child's speech into text. Do not make any assumptions about the words the child is expected to say. Only transcribe based on the words that the child actually says. Only respond with the text transcription, no other text or commentary.""", # noqa: E501
+         max_tokens=10,
+     )
+     metric_specs: List[MetricSpec] = audio_classification_metric_specs()
+     run_spec_name: str = "ultra_suite_asr_classification"
+     return RunSpec(
+         name=run_spec_name,
+         scenario_spec=scenario_spec,
+         adapter_spec=adapter_spec,
+         metric_specs=metric_specs,
+         groups=[run_spec_name],
+     )
+
+
+ # Makes the model transcribe the child's speech into text and is allowed to assume what the child is supposed to say
+ @run_spec_function("ultra_suite_asr_transcription")
+ def get_ultra_suite_asr_transcription_run_spec() -> RunSpec:
+     scenario_spec = ScenarioSpec(
+         class_name="helm.benchmark.scenarios.audio_language.ultra_suite_asr_transcription_scenario.UltraSuiteASRTranscriptionScenario", # noqa: E501
+     )
+     adapter_spec = _get_generation_adapter_spec(
+         instructions="""You are a highly experienced Speech-Language Pathologist (SLP). An audio recording will be provided, typically consisting of a speech prompt from a pathologist followed by a child's repetition. Based on your expertise transcribe the child's speech into text. Try to understand what the child is expected to say. And only respond with the transcription of the child's speech. Not the pathologist's prompt or any other commentary. Only respond with the text transcription, no other text, commentary or punctuations.""", # noqa: E501
+         max_tokens=50,
+     )
+     metric_specs: List[MetricSpec] = get_basic_generation_metric_specs(["wer_score", "mer_score", "wip_score"])
+     run_spec_name: str = "ultra_suite_asr_transcription"
+     return RunSpec(
+         name=run_spec_name,
+         scenario_spec=scenario_spec,
+         adapter_spec=adapter_spec,
+         metric_specs=metric_specs,
+         groups=[run_spec_name],
+     )
+
+
+ @run_spec_function("ultra_suite_disorder_symptoms")
+ def get_ultra_suite_disorder_symptoms_run_spec() -> RunSpec:
+     scenario_spec = ScenarioSpec(
+         class_name="helm.benchmark.scenarios.audio_language.ultra_suite_disorder_symptoms_scenario.UltraSuiteDisorderSymptomsScenario", # noqa: E501
+     )
+     adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(input_noun=None, output_noun="Answer")
+     metric_specs: List[MetricSpec] = audio_classification_metric_specs()
+     run_spec_name: str = "ultra_suite_disorder_symptoms"
+     return RunSpec(
+         name=f"{run_spec_name}",
+         scenario_spec=scenario_spec,
+         adapter_spec=adapter_spec,
+         metric_specs=metric_specs,
+         groups=[run_spec_name],
+     )
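The transcription run spec above scores outputs with wer_score, mer_score, and wip_score. As a point of reference only, the sketch below computes the same word-level quantities with the jiwer package; whether HELM's metric implementation is actually backed by jiwer is an assumption here, not something this diff shows.

```python
# Illustrative only: word error rate, match error rate, and word information
# preserved for a toy reference/hypothesis pair, computed with jiwer.
import jiwer

reference = "the cat sat on the mat"
hypothesis = "the cat sat on mat"

print(jiwer.wer(reference, hypothesis))  # word error rate
print(jiwer.mer(reference, hypothesis))  # match error rate
print(jiwer.wip(reference, hypothesis))  # word information preserved
```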
@@ -1027,3 +1027,31 @@ def get_vqa_rad_spec() -> RunSpec:
          metric_specs=metric_specs,
          groups=[run_spec_name],
      )
+
+
+ ############################################################
+ # Video understanding run specs
+
+
+ @run_spec_function("msr_vtt")
+ def get_msr_vtt_spec() -> RunSpec:
+     scenario_spec = ScenarioSpec(
+         class_name="helm.benchmark.scenarios.vision_language.msr_vtt_scenario.MSRVTTScenario",
+         args={},
+     )
+
+     adapter_spec: AdapterSpec = _get_generation_adapter_spec(
+         instructions="Generate a short caption for the video in plain words. Just give the caption and nothing else.",
+         max_tokens=30,
+         max_train_instances=0,
+     )
+     metric_specs: List[MetricSpec] = _get_open_ended_generation_metric_specs()
+
+     run_spec_name: str = "msr_vtt"
+     return RunSpec(
+         name=run_spec_name,
+         scenario_spec=scenario_spec,
+         adapter_spec=adapter_spec,
+         metric_specs=metric_specs,
+         groups=[run_spec_name],
+     )
helm/benchmark/runner.py CHANGED
@@ -13,7 +13,7 @@ from tqdm import tqdm
 
  from helm.benchmark.adaptation.request_state import RequestState
  from helm.common.general import ensure_directory_exists, write, asdict_without_nones
- from helm.common.hierarchical_logger import hlog, htrack_block
+ from helm.common.hierarchical_logger import hlog, htrack_block, hwarn
  from helm.common.cache import cache_stats
  from helm.benchmark.scenarios.scenario import (
      EVAL_SPLITS,
@@ -82,7 +82,7 @@ def remove_stats_nans(stats: List[Stat]) -> List[Stat]:
      result: List[Stat] = []
      for stat in stats:
          if math.isnan(stat.sum):
-             hlog(f"WARNING: Removing stat {stat.name.name} because its value is NaN")
+             hwarn(f"Removing stat {stat.name.name} because its value is NaN")
              continue
          result.append(stat)
      return result
@@ -164,8 +164,8 @@ class Runner:
              )
          )
          self.dry_run: bool = execution_spec.dry_run
-         self.tokenizer_service = TokenizerService(self.executor.service, execution_spec.auth)
-         self.metric_service = MetricService(self.executor.service, execution_spec.auth)
+         self.tokenizer_service = TokenizerService(self.executor.context)
+         self.metric_service = MetricService(self.executor.context)
          self.skip_instances: bool = skip_instances
          self.cache_instances: bool = cache_instances
          self.cache_instances_only: bool = cache_instances_only
@@ -318,7 +318,7 @@ class Runner:
          metric_counts: typing.Counter[MetricName] = Counter([stat.name for stat in stats])
          for metric_name, count in metric_counts.items():
              if count > 1:
-                 hlog(f"WARNING: duplicate metric name {metric_name}")
+                 hwarn(f"duplicate metric name {metric_name}")
 
          # Print out the number of stats
          hlog(f"Generated {len(stats)} stats.")
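The runner change above replaces hand-prefixed warning messages with a new hwarn helper imported from helm.common.hierarchical_logger (also touched in this release). Its definition is not shown in this diff; a minimal sketch consistent with how it is called would be a thin wrapper that standardizes the prefix, assuming the simplest possible implementation:

```python
# Assumed shape of hwarn (not shown in this diff): a thin wrapper around hlog
# that adds a consistent "WARNING:" prefix to the message.
from helm.common.hierarchical_logger import hlog


def hwarn(message: str) -> None:
    hlog(f"WARNING: {message}")
```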
@@ -72,7 +72,13 @@ class ACIBenchScenario(Scenario):
      ]
 
      name = "aci_bench"
-     description = "A dataset of patient-doctor conversations paired with structured clinical notes."
+     description = (
+         "ACI-Bench is a benchmark of real-world patient-doctor conversations paired with"
+         "structured clinical notes. The benchmark evaluates a model's ability to understand"
+         "spoken medical dialogue and convert it into formal clinical documentation, covering"
+         "sections such as history of present illness, physical exam findings, results, and assessment"
+         "and plan."
+     )
      tags = ["summarization", "medicine"]
 
      def download_json(self, url: str, output_path: str, file_name: str) -> str:
@@ -0,0 +1,126 @@
+ import os
+ from typing import Dict, List
+
+ import datasets
+
+ from helm.common.general import ensure_directory_exists
+ from helm.benchmark.scenarios.scenario import (
+     Scenario,
+     Instance,
+     Reference,
+     TEST_SPLIT,
+     TRAIN_SPLIT,
+     CORRECT_TAG,
+     Input,
+     Output,
+ )
+
+
+ class AlGhafaScenario(Scenario):
+     """AlGhafa Evaluation Benchmark for Arabic Language Models
+
+     EXPERIMENTAL: This scenario may have future reverse incompatible changes.
+
+     Multiple-choice evaluation benchmark for zero- and few-shot evaluation of Arabic LLMs,
+     consisting of
+
+     - https://huggingface.co/datasets/OALL/AlGhafa-Arabic-LLM-Benchmark-Native/
+     - https://aclanthology.org/2023.arabicnlp-1.21/
+
+     Citation:
+
+     ```
+     @inproceedings{almazrouei-etal-2023-alghafa,
+         title = "{A}l{G}hafa Evaluation Benchmark for {A}rabic Language Models",
+         author = "Almazrouei, Ebtesam and
+           Cojocaru, Ruxandra and
+           Baldo, Michele and
+           Malartic, Quentin and
+           Alobeidli, Hamza and
+           Mazzotta, Daniele and
+           Penedo, Guilherme and
+           Campesan, Giulia and
+           Farooq, Mugariya and
+           Alhammadi, Maitha and
+           Launay, Julien and
+           Noune, Badreddine",
+         editor = "Sawaf, Hassan and
+           El-Beltagy, Samhaa and
+           Zaghouani, Wajdi and
+           Magdy, Walid and
+           Abdelali, Ahmed and
+           Tomeh, Nadi and
+           Abu Farha, Ibrahim and
+           Habash, Nizar and
+           Khalifa, Salam and
+           Keleg, Amr and
+           Haddad, Hatem and
+           Zitouni, Imed and
+           Mrini, Khalil and
+           Almatham, Rawan",
+         booktitle = "Proceedings of ArabicNLP 2023",
+         month = dec,
+         year = "2023",
+         address = "Singapore (Hybrid)",
+         publisher = "Association for Computational Linguistics",
+         url = "https://aclanthology.org/2023.arabicnlp-1.21/",
+         doi = "10.18653/v1/2023.arabicnlp-1.21",
+         pages = "244--275",
+         abstract = "Recent advances in the space of Arabic large language models have opened up a wealth of potential practical applications. From optimal training strategies, large scale data acquisition and continuously increasing NLP resources, the Arabic LLM landscape has improved in a very short span of time, despite being plagued by training data scarcity and limited evaluation resources compared to English. In line with contributing towards this ever-growing field, we introduce AlGhafa, a new multiple-choice evaluation benchmark for Arabic LLMs. For showcasing purposes, we train a new suite of models, including a 14 billion parameter model, the largest monolingual Arabic decoder-only model to date. We use a collection of publicly available datasets, as well as a newly introduced HandMade dataset consisting of 8 billion tokens. Finally, we explore the quantitative and qualitative toxicity of several Arabic models, comparing our models to existing public Arabic LLMs."
+     }
+     ```
+     """ # noqa: E501
+
+     name = "alghafa"
+     description = "AlGhafa"
+     tags = ["multiple choice"]
+
+     HF_SPLIT_TO_HELM_SPLIT = {"validation": TRAIN_SPLIT, "test": TEST_SPLIT}
+     REFERENCE_PREFIX = "sol"
+
+     def __init__(self, subset: str):
+         super().__init__()
+         self.subset = subset
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         cache_dir = os.path.join(output_path, "data")
+         ensure_directory_exists(cache_dir)
+         dataset_splits: Dict[str, datasets.Dataset] = datasets.load_dataset(
+             "OALL/AlGhafa-Arabic-LLM-Benchmark-Native",
+             self.subset,
+             revision="a31ebd34ca311d7e0cfc6ad7f458b3435af280f5",
+             cache_dir=cache_dir,
+         )
+
+         # Read all instances
+         instances: List[Instance] = []
+         for split_name, dataset in dataset_splits.items():
+             assert isinstance(dataset, datasets.Dataset)
+             option_indexes = [
+                 int(s.removeprefix(self.REFERENCE_PREFIX))
+                 for s in dataset[0].keys()
+                 if s.startswith(self.REFERENCE_PREFIX)
+             ]
+             for row_index, row in enumerate(dataset):
+                 input = Input(text=row["query"])
+                 references: List[Reference] = []
+                 # Need to add 1 because label is zero-indexed and has a value from 0 to (N - 1),
+                 # but column names are 1 indexed and have values from "sol1" to "solN"
+                 correct_option_index = int(row["label"]) + 1
+                 for option_index in option_indexes:
+                     column_name = f"{self.REFERENCE_PREFIX}{option_index}"
+                     references.append(
+                         Reference(
+                             output=Output(text=row[column_name]),
+                             tags=[CORRECT_TAG] if option_index == correct_option_index else [],
+                         )
+                     )
+                 instance = Instance(
+                     id=f"id{row_index}_{split_name}",
+                     input=input,
+                     references=references,
+                     split=self.HF_SPLIT_TO_HELM_SPLIT[split_name],
+                 )
+                 instances.append(instance)
+
+         return instances
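The AlGhafa scenario above maps a zero-indexed label onto one-indexed "solN" columns. A small standalone illustration of that arithmetic, using a made-up row rather than real dataset content:

```python
# Made-up row for illustration; the real dataset follows the same pattern of
# "sol1".."solN" option columns plus a zero-indexed "label" field.
row = {"query": "...", "sol1": "option 1", "sol2": "option 2", "sol3": "option 3", "label": "1"}

correct_option_index = int(row["label"]) + 1  # label 1 (zero-indexed) -> column "sol2"
option_indexes = [int(key.removeprefix("sol")) for key in row if key.startswith("sol")]
correct_column = f"sol{correct_option_index}"
assert correct_column == "sol2" and row[correct_column] == "option 2"
```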
@@ -0,0 +1,78 @@
+ import os
+ from typing import Dict, List
+
+ import datasets
+
+ from helm.common.general import ensure_directory_exists
+ from helm.benchmark.scenarios.scenario import (
+     Scenario,
+     Instance,
+     Reference,
+     TEST_SPLIT,
+     TRAIN_SPLIT,
+     CORRECT_TAG,
+     Input,
+     Output,
+ )
+
+
+ class ArabicMMLUScenario(Scenario):
+     """ArabicMMLU
+
+     EXPERIMENTAL: This scenario may have future reverse incompatible changes.
+
+     ArabicMMLU is the first multi-task language understanding benchmark
+     for Arabic language, sourced from school exams across diverse educational
+     levels in different countries spanning North Africa, the Levant, and the
+     Gulf regions. The data comprises 40 tasks and 14,575 multiple-choice questions
+     in Modern Standard Arabic (MSA), and is carefully constructed by collaborating
+     with native speakers in the region.
+
+     - https://huggingface.co/datasets/MBZUAI/ArabicMMLU
+     - https://aclanthology.org/2024.findings-acl.334/
+     """
+
+     name = "arabic_mmlu"
+     description = "Arabic Massive Multitask Language Understanding"
+     tags = ["knowledge", "multiple_choice"]
+
+     OPTIONS = ["A", "B", "C", "D"]
+     HF_SPLIT_TO_HELM_SPLIT = {"dev": TRAIN_SPLIT, "test": TEST_SPLIT}
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         cache_dir = os.path.join(output_path, "data")
+         ensure_directory_exists(cache_dir)
+         dataset_splits: Dict[str, datasets.Dataset] = datasets.load_dataset(
+             "MBZUAI/ArabicMMLU",
+             "All",
+             revision="7aa530e2893ac420352b3f5c1a1310c010e9758b",
+             cache_dir=cache_dir,
+         )
+
+         # Read all instances
+         instances: List[Instance] = []
+         for split_name, dataset in dataset_splits.items():
+             assert isinstance(dataset, datasets.Dataset)
+             for row_index, row in enumerate(dataset):
+                 input = Input(text=row["Question"])
+                 references: List[Reference] = []
+                 correct_option_index = ord(row["Answer Key"]) - ord("A") + 1
+                 for option_index in range(1, 6):
+                     column_name = f"Option {option_index}"
+                     if not row[column_name]:
+                         continue
+                     references.append(
+                         Reference(
+                             output=Output(text=row[column_name]),
+                             tags=[CORRECT_TAG] if option_index == correct_option_index else [],
+                         )
+                     )
+                 instance = Instance(
+                     id=f"id{row_index}",
+                     input=input,
+                     references=references,
+                     split=self.HF_SPLIT_TO_HELM_SPLIT[split_name],
+                 )
+                 instances.append(instance)
+
+         return instances
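ArabicMMLU stores its options in 1-indexed columns ("Option 1" through "Option 5") while the answer key is a letter, so the scenario converts between the two with ord arithmetic. A quick worked example with a hypothetical key:

```python
# Hypothetical answer key for illustration; "C" maps to the column "Option 3".
answer_key = "C"
correct_option_index = ord(answer_key) - ord("A") + 1
print(f"Option {correct_option_index}")  # Option 3
```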
@@ -0,0 +1,76 @@
+ import os
+ from typing import List
+
+ import datasets
+
+ from helm.common.general import ensure_directory_exists
+ from helm.benchmark.scenarios.scenario import (
+     Scenario,
+     Instance,
+     Reference,
+     TEST_SPLIT,
+     CORRECT_TAG,
+     Input,
+     Output,
+ )
+
+
+ class AraTrustScenario(Scenario):
+     """AraTrust: An Evaluation of Trustworthiness for LLMs in Arabic
+
+     EXPERIMENTAL: This scenario may have future reverse incompatible changes.
+
+     AraTrust is a comprehensive Trustworthiness benchmark for LLMs in Arabic.
+     AraTrust comprises 522 human-written multiple-choice questions addressing
+     diverse dimensions related to truthfulness, ethics, safety, physical health,
+     mental health, unfairness, illegal activities, privacy, and offensive language.
+
+     - https://huggingface.co/datasets/asas-ai/AraTrust
+     - https://arxiv.org/abs/2403.09017
+
+     Citation:
+
+     ```
+     @misc{alghamdi2024aratrustevaluationtrustworthinessllms,
+         title={AraTrust: An Evaluation of Trustworthiness for LLMs in Arabic},
+         author={Emad A. Alghamdi and Reem I. Masoud and Deema Alnuhait and Afnan Y. Alomairi and Ahmed Ashraf and Mohamed Zaytoon},
+         year={2024},
+         eprint={2403.09017},
+         archivePrefix={arXiv},
+         primaryClass={cs.CL},
+         url={https://arxiv.org/abs/2403.09017},
+     }
+     ```
+     """ # noqa: E501
+
+     name = "aratrust"
+     description = "aratrust"
+     tags = ["trustworthiness"]
+
+     OPTION_KEYS = ["A", "B", "C"]
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         cache_dir = os.path.join(output_path, "data")
+         ensure_directory_exists(cache_dir)
+         dataset: datasets.Dataset = datasets.load_dataset(
+             "asas-ai/AraTrust",
+             revision="d4dd124ed5b90aeb65a7dda7d88e34fb464a31ec",
+             cache_dir=cache_dir,
+             split="test",
+         )
+         instances: List[Instance] = []
+         for row_index, row in enumerate(dataset):
+             question_text = row["Question"]
+             option_texts = [row[option_key] for option_key in self.OPTION_KEYS if row[option_key]]
+             joined_option_texts = "\n".join(option_texts)
+             input = Input(text=f"{question_text}\n\n{joined_option_texts}\n")
+             references = [Reference(output=Output(text=row["Answer"]), tags=[CORRECT_TAG])]
+             instance = Instance(
+                 id=f"id{row_index}",
+                 input=input,
+                 references=references,
+                 split=TEST_SPLIT,
+             )
+             instances.append(instance)
+
+         return instances
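Unlike the two scenarios above, AraTrust folds the options into the input text and keeps only the answer letter as the single correct reference. A sketch of the resulting prompt for a hypothetical row (the field values are invented for illustration):

```python
# Hypothetical row; option C is empty and is therefore dropped, mirroring the
# filtering done in get_instances above.
row = {"Question": "Example question?", "A": "Option A", "B": "Option B", "C": "", "Answer": "A"}

option_texts = [row[key] for key in ["A", "B", "C"] if row[key]]
prompt = f"{row['Question']}\n\n" + "\n".join(option_texts) + "\n"
# prompt == "Example question?\n\nOption A\nOption B\n"
# The single correct reference would be row["Answer"], i.e. "A".
```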
@@ -93,7 +93,8 @@ class AirBenchChatScenario(Scenario):
          meta_data = json.load(open(meta_data_path))
          subject_indices = self._get_subject_indices(meta_data)
          valid_testing_indices = []
-         for _, row in enumerate(subject_indices):
+
+         for _, row in tqdm(enumerate(subject_indices)):
              audio_meda_data = meta_data[row]
              hf_audio_file_path = os.path.join(
                  self.HF_DATA_PATH_PREFIX,
@@ -105,6 +106,7 @@ class AirBenchChatScenario(Scenario):
              ensure_file_downloaded(source_url=hf_audio_file_path, target_path=local_audio_file_path)
              if not is_invalid_audio_file(local_audio_file_path):
                  valid_testing_indices.append(row)
+
          for _, row in enumerate(tqdm(valid_testing_indices)):
              audio_meda_data_valid = meta_data[row]
              local_audio_file_path_valid = os.path.join(
@@ -53,9 +53,9 @@ class AirBenchFoundationScenario(Scenario):
      META_DATA_FILE_PATH = (
          "https://huggingface.co/datasets/qyang1021/AIR-Bench-Dataset/resolve/main/Foundation/Foundation_meta.json"
      )
-     SUJECTS_DICT = {
+     SUBJECTS_DICT = {
          "music_instrument_classification": "Music_Instruments_Classfication",
-         "music_genera_recognition": "Music_Genre_Recognition",
+         "music_genre_recognition": "Music_Genre_Recognition",
          "music_qa": "Music_AQA",
      }
      OPTION_KEYS = ["choice_a", "choice_b", "choice_c", "choice_d"]
@@ -68,15 +68,15 @@ class AirBenchFoundationScenario(Scenario):
      def __init__(self, subject: str) -> None:
          super().__init__()
 
-         if subject not in AirBenchFoundationScenario.SUJECTS_DICT.keys():
-             raise ValueError(f"Invalid subject. Valid subjects are: {AirBenchFoundationScenario.SUJECTS_DICT.keys()}")
+         if subject not in AirBenchFoundationScenario.SUBJECTS_DICT.keys():
+             raise ValueError(f"Invalid subject. Valid subjects are: {AirBenchFoundationScenario.SUBJECTS_DICT.keys()}")
 
          self._subject: str = subject
 
      def _get_subject_indices(self, meta_data) -> List[int]:
          subject_indices = []
          for idx, line in enumerate(meta_data):
-             if line["task_name"] == self.SUJECTS_DICT[self._subject]:
+             if line["task_name"] == self.SUBJECTS_DICT[self._subject]:
                  subject_indices.append(idx)
          return subject_indices
 
@@ -19,7 +19,7 @@ from helm.common.audio_utils import extract_audio
 
 
  class CasualConversations2Scenario(Scenario):
-     """
+     r"""
      Casual Conversation v2 (Porgali et al, 2023) is composed of over 5,567 participants (26,467 videos).
      The videos feature paid individuals who agreed to participate in the project and explicitly provided
      Age, Gender, Language/Dialect, Geo-location, Disability, Physical adornments, Physical attributes labels