crfm-helm 0.5.6__py3-none-any.whl → 0.5.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (311)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +60 -125
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +293 -229
  3. helm/benchmark/adaptation/adapter_spec.py +5 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/classification_metrics.py +19 -1
  27. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  28. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  29. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  30. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  31. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  32. helm/benchmark/metrics/comet_metric.py +1 -1
  33. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  34. helm/benchmark/metrics/copyright_metrics.py +1 -1
  35. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  36. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  37. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  38. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  39. helm/benchmark/metrics/evaluate_reference_metrics.py +300 -1
  40. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  41. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  42. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  43. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  44. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  45. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  46. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  47. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  48. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  49. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  50. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  51. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  52. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  53. helm/benchmark/metrics/medec_metrics.py +25 -2
  54. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  55. helm/benchmark/metrics/metric.py +25 -0
  56. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  57. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  58. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  59. helm/benchmark/metrics/summac/model_summac.py +3 -3
  60. helm/benchmark/metrics/summarization_metrics.py +129 -1
  61. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  62. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  63. helm/benchmark/model_deployment_registry.py +11 -19
  64. helm/benchmark/presentation/create_plots.py +11 -2
  65. helm/benchmark/presentation/schema.py +10 -22
  66. helm/benchmark/presentation/summarize.py +189 -14
  67. helm/benchmark/presentation/taxonomy_info.py +20 -0
  68. helm/benchmark/presentation/test_create_plots.py +4 -1
  69. helm/benchmark/run.py +7 -1
  70. helm/benchmark/run_expander.py +4 -0
  71. helm/benchmark/run_specs/arabic_run_specs.py +191 -0
  72. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  73. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  74. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  75. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  76. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  77. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  78. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  79. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  80. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  81. helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
  82. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  83. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
  84. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  85. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  86. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  87. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  88. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  89. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  90. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  91. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  92. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  93. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  94. helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
  95. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  96. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
  97. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
  98. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
  99. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  100. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  101. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  102. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  103. helm/benchmark/scenarios/bold_scenario.py +15 -0
  104. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  105. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  106. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  107. helm/benchmark/scenarios/clear_scenario.py +23 -0
  108. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  109. helm/benchmark/scenarios/code_scenario.py +28 -0
  110. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  111. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  112. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  113. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  114. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  115. helm/benchmark/scenarios/commonsense_scenario.py +26 -0
  116. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  117. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  118. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  119. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  120. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  121. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  122. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  123. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  124. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  125. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  126. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  127. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  128. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  129. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  130. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  131. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  132. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  133. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  134. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  135. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  136. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  137. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  138. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  139. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  140. helm/benchmark/scenarios/gsm_scenario.py +15 -0
  141. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  142. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  143. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  144. helm/benchmark/scenarios/ice_scenario.py +21 -1
  145. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  146. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  147. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  148. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  149. helm/benchmark/scenarios/koala_scenario.py +21 -1
  150. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  151. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  152. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  153. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  154. helm/benchmark/scenarios/legalbench_scenario.py +20 -0
  155. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  156. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  157. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  158. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  159. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  160. helm/benchmark/scenarios/math_scenario.py +47 -20
  161. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  162. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  163. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  164. helm/benchmark/scenarios/med_qa_scenario.py +14 -0
  165. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  166. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  167. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  168. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  169. helm/benchmark/scenarios/medec_scenario.py +23 -0
  170. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  171. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  172. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  173. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  174. helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
  175. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  176. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  177. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  178. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  179. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  180. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  181. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  182. helm/benchmark/scenarios/mmlu_scenario.py +15 -0
  183. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  184. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  185. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  186. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  187. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  188. helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
  189. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  190. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  191. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  192. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  193. helm/benchmark/scenarios/quac_scenario.py +14 -0
  194. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  195. helm/benchmark/scenarios/raft_scenario.py +15 -0
  196. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  197. helm/benchmark/scenarios/scenario.py +31 -0
  198. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  199. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  200. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  201. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  202. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  203. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  204. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  205. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  206. helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
  207. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  208. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  209. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  210. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  211. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  212. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  213. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  214. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  215. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  216. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  217. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  218. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  219. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  220. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  221. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  222. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  223. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  224. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  225. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  226. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  227. helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
  228. helm/benchmark/slurm_jobs.py +1 -2
  229. helm/benchmark/slurm_runner.py +8 -1
  230. helm/benchmark/static/schema_arabic.yaml +271 -0
  231. helm/benchmark/static/schema_classic.yaml +0 -17
  232. helm/benchmark/static/schema_long_context.yaml +24 -6
  233. helm/benchmark/static/schema_medhelm.yaml +36 -0
  234. helm/benchmark/static/schema_slp.yaml +219 -0
  235. helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
  236. helm/benchmark/static_build/assets/index-9352595e.css +1 -0
  237. helm/benchmark/static_build/index.html +2 -2
  238. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  239. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  240. helm/clients/audio_language/llama_omni/constants.py +9 -0
  241. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  242. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  243. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  244. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  245. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  246. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  247. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  248. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  249. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  250. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  251. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  252. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  253. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  254. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  255. helm/clients/audio_language/llama_omni/utils.py +202 -0
  256. helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
  257. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  258. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  259. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  260. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  261. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  262. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  263. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  264. helm/clients/huggingface_client.py +2 -2
  265. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  266. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  267. helm/clients/openai_client.py +33 -20
  268. helm/clients/openai_responses_client.py +34 -8
  269. helm/clients/openrouter_client.py +31 -0
  270. helm/clients/test_huggingface_client.py +3 -3
  271. helm/clients/test_openrouter_client.py +69 -0
  272. helm/clients/together_client.py +48 -13
  273. helm/clients/vertexai_client.py +19 -11
  274. helm/clients/vllm_client.py +43 -7
  275. helm/clients/vllm_granite_thinking_client.py +56 -0
  276. helm/common/critique_request.py +0 -1
  277. helm/common/hierarchical_logger.py +83 -34
  278. helm/common/object_spec.py +23 -8
  279. helm/common/test_logging.py +94 -0
  280. helm/config/model_deployments.yaml +525 -172
  281. helm/config/model_metadata.yaml +185 -10
  282. helm/config/tokenizer_configs.yaml +100 -2
  283. helm/proxy/cli.py +1 -1
  284. helm/proxy/example_queries.py +8 -8
  285. helm/proxy/retry.py +5 -0
  286. helm/proxy/server.py +2 -1
  287. helm/proxy/static/index.css +4 -0
  288. helm/proxy/static/index.js +7 -1
  289. helm/tokenizers/grok_tokenizer.py +2 -0
  290. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  291. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  292. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  293. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  294. helm/benchmark/metrics/medalign_metrics.py +0 -14
  295. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  296. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  297. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  298. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  299. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  300. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  301. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  302. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  303. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  304. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  305. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  306. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  307. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  308. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
  309. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
  310. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
  311. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0
helm/benchmark/run_specs/bluex_run_specs.py (new file)
@@ -0,0 +1,40 @@
+from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT
+from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("bluex")
+def get_bluex_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bluex_scenario.BLUEXScenario", args={})
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="""
+        Escolha a alternativa correta para as questões de vestibulares (responda apenas com a letra).
+        Exemplo de Pergunta com a resposta:
+        Em um romance narrado em primeira pessoa, o narrador participa dos acontecimentos da trama,
+        relatando suas próprias experiências e sentimentos. Qual alternativa apresenta essa característica?
+
+        (A) Narrador onisciente que conhece os pensamentos de todas as personagens.
+        (B) Narrador que descreve os fatos de forma imparcial, sem envolvimento emocional.
+        (C) Narrador-personagem que vivencia e relata os eventos da história.
+        (D) Narrador observador que apenas registra as ações visíveis.
+        (E) Narrador em segunda pessoa que se dirige constantemente ao leitor.
+
+        Resposta correta: C
+
+        A partir disso, responda:
+        """,
+        input_noun="Pergunta",
+        output_noun="Resposta",
+    )
+
+    return RunSpec(
+        name="bluex",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["bluex"],
+    )
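Note (illustrative, not part of the diff): the file above only registers a run spec function, so it can be exercised directly once crfm-helm 0.5.8 is installed. The expected values follow from the code shown; treat this as a sketch rather than a documented entry point.

# Sketch: resolve the new "bluex" run spec and inspect it (assumes crfm-helm 0.5.8).
from helm.benchmark.run_specs.bluex_run_specs import get_bluex_spec

run_spec = get_bluex_spec()
print(run_spec.name)                      # "bluex"
print(run_spec.scenario_spec.class_name)  # "helm.benchmark.scenarios.bluex_scenario.BLUEXScenario"
print(run_spec.adapter_spec.method)       # the ADAPT_MULTIPLE_CHOICE_JOINT method string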
helm/benchmark/run_specs/classic_run_specs.py
@@ -35,7 +35,6 @@ from helm.benchmark.metrics.common_metric_specs import (
     get_f1_metric_specs,
     get_generative_harms_metric_specs,
     get_language_modeling_metric_specs,
-    get_numeracy_metric_specs,
     get_open_ended_generation_metric_specs,
     get_summarization_metric_specs,
     get_basic_generation_metric_specs,
@@ -381,58 +380,6 @@ def get_raft_spec(subset: str) -> RunSpec:
     )
 
 
-@run_spec_function("numeracy")
-def get_numeracy_spec(
-    relation_type: str = "linear", mode: str = "function", seed: str = "0", run_solver: str = "False"
-) -> RunSpec:
-    from helm.benchmark.scenarios.numeracy_scenario import get_numeracy_adapter_spec, RELTYPE_INFO
-
-    run_solver_bool: bool = True if run_solver.lower() == "true" else False
-    del run_solver
-    random_seed = int(seed)
-    scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.numeracy_scenario.NumeracyScenario",
-        args={"seed": random_seed, "relation_type": relation_type, "mode": mode},
-    )
-
-    if mode in ["example", "standard"]:
-        # Test a model's ability to impute datapoints for a given (example or randomly sampled) relation.
-        adapter_args: Dict[str, Any] = {
-            "max_train_instances": 100,
-            "max_eval_instances": 100,
-            "dim": RELTYPE_INFO[relation_type].num_variables + 1,
-        }
-    elif mode == "function":
-        # Test a model's ability to impute datapoints for randomly sampled relations
-        # (resampled for each evaluation point).
-        adapter_args = {
-            "instructions": "",
-            "max_train_instances": 0,  # Turn off general version of `function` mode because it doesn't cleanly
-            # capture a higher-order version of this task / is a little convoluted
-            # for models, currently.
-            # (In the general version, the model sees other relations of the same class,
-            # and needs to impute a datapoint for the last one. Presumably, inferring
-            # the class - eg. the degree of the relation - would help.)
-            "max_eval_instances": 1000,
-            "dim": RELTYPE_INFO[relation_type].num_variables + 1,
-            "instance_prefix": "\n\n",
-        }
-    else:
-        raise ValueError(f"Invalid mode: {mode}")
-
-    adapter_spec = get_numeracy_adapter_spec(**adapter_args)  # Construct the AdapterSpec using a helper function.
-    # `get_numeracy_adapter_spec` is defined in numeracy_scenario.py
-    # because it is used within the scenario to construct the instances themselves.
-
-    return RunSpec(
-        name=f"numeracy:relation_type={relation_type},mode={mode}",
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        metric_specs=get_numeracy_metric_specs(run_solver_bool),
-        groups=["numeracy"],
-    )
-
-
 @run_spec_function("boolq")
 def get_boolq_spec(only_contrast=False) -> RunSpec:
     scenario_spec = ScenarioSpec(
@@ -806,12 +753,12 @@ def get_xsum_sampled_summarization_spec(temperature: float = 0.3, device: str =
     )
 
     return RunSpec(
-        name=f"summarization_xsum:temperature={temperature},device={device}",
+        name=f"summarization_xsum_sampled:temperature={temperature},device={device}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_summarization_metric_specs({"task": "summarization_xsum_sampled", "device": device})
         + get_generative_harms_metric_specs(),
-        groups=["summarization_xsum"],
+        groups=["summarization_xsum_sampled"],
     )
 
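Note (illustrative): the practical effect of this rename is that the sampled XSum variant no longer shares a run name and results group with the plain XSum spec. Derived from the name= f-strings above:

# Run entry names before (0.5.6) and after (0.5.8) for the sampled-XSum spec:
old_entry = "summarization_xsum:temperature=0.3,device=cpu"
new_entry = "summarization_xsum_sampled:temperature=0.3,device=cpu"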
helm/benchmark/run_specs/codeinsights_run_specs.py (new file)
@@ -0,0 +1,192 @@
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+from helm.benchmark.adaptation.common_adapter_specs import get_generation_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_basic_metric_specs
+from helm.benchmark.metrics.codeinsights_metric_specs import (
+    get_functional_correctness_metric_specs,
+    get_comprehensive_code_evaluation_metric_specs,
+    get_edge_case_metric_specs,
+    get_code_efficiency_metric_specs,
+)
+
+
+@run_spec_function("codeinsights_correct_code")
+def get_codeinsights_correct_code_run_spec(tpr: float = 0.0, num_testcases: int = 1) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.codeinsights_correct_code_scenario.CodeInsightsCorrectCodeScenario",
+        args={"num_testcases": num_testcases},
+    )
+
+    instruction = (
+        "You are a skilled C++ programmer working on a foundational programming course assignment. "
+        "Your task is to write correct, efficient C++ code that solves the given problem. "
+        "Write clean, well-structured code that follows good programming practices. "
+        "Provide ONLY your C++ implementation following the given template, where the answer will replace the {{ STUDENT_ANSWER }} block in the template."
+        "DO NOT reproduce the template part as the generated code would be inserted to the template,"
+        "and make sure the code is compatible with the Unit Test Input"
+        "Ensure your code is correct, efficient, includes any class definition when needed, and handles all edge cases properly."
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=instruction,
+        output_noun="Your code",
+        stop_sequences=[],
+        max_tokens=4000,
+        temperature=tpr,
+    )
+
+    return RunSpec(
+        name=f"codeinsights_correct_code:temperature={adapter_spec.temperature},num_testcases={num_testcases}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_functional_correctness_metric_specs() + get_basic_metric_specs([]),
+        groups=["codeinsights", "codeinsights_correct_code"],
+    )
+
+
+@run_spec_function("codeinsights_student_coding")
+def get_codeinsights_student_coding_run_spec(tpr: float = 0.0, num_testcases: int = 1) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.codeinsights_student_coding_scenario.CodeInsightsStudentCodingScenario",
+        args={"num_testcases": num_testcases},
+    )
+
+    instruction = (
+        "You are the same student who wrote the three examples below in your foundational C++ course. "
+        "Mimic exactly your personal coding style, conventions, and level of proficiency—"
+        "do not over‐optimize or introduce unfamiliar patterns. "
+        "Include the same sort of formatting, variable names, and minor imperfections you demonstrated. "
+        "Provide ONLY your C++ implementation following the given template, where the answer will replace the {{ STUDENT_ANSWER }} block in the template."
+        "DO NOT reproduce the template part as the generated code would be inserted to the template,"
+        "and make sure the code is compatible with the Unit Test Input"
+        "Ensure your code includes any class definition when needed."
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=instruction,
+        output_noun="Your code",
+        stop_sequences=[],
+        max_tokens=4000,
+        temperature=tpr,
+    )
+
+    return RunSpec(
+        name=f"codeinsights_student_coding:temperature={adapter_spec.temperature},num_testcases={num_testcases}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_comprehensive_code_evaluation_metric_specs() + get_basic_metric_specs([]),
+        groups=["codeinsights", "codeinsights_student_coding"],
+    )
+
+
+@run_spec_function("codeinsights_student_mistake")
+def get_codeinsights_student_mistake_run_spec(tpr: float = 0.0, num_testcases: int = 1) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.codeinsights_student_mistake_scenario.CodeInsightsStudentMistakeScenario",
+        args={"num_testcases": num_testcases},
+    )
+
+    instruction = (
+        "You are a C++ student with a consistent personal style, conventions, and proficiency level.\n"
+        "Your task is to attempt the target problem **but introduce realistic mistake** you would typically make—"
+        "Provide ONLY your C++ implementation following the given template, where the answer will replace the {{ STUDENT_ANSWER }} block in the template."
+        "DO NOT reproduce the template part as the generated code would be inserted to the template,"
+        "and make sure the code is compatible with the Unit Test Input"
+        "Ensure your code is includes any class definition when needed."
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=instruction,
+        output_noun="Your code",
+        stop_sequences=[],
+        max_tokens=4000,
+        temperature=tpr,
+    )
+
+    return RunSpec(
+        name=f"codeinsights_student_mistake:temperature={adapter_spec.temperature},num_testcases={num_testcases}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_comprehensive_code_evaluation_metric_specs() + get_basic_metric_specs([]),
+        groups=["codeinsights", "codeinsights_student_mistake"],
+    )
+
+
+@run_spec_function("codeinsights_code_efficiency")
+def get_codeinsights_code_efficiency_run_spec(tpr: float = 0.0, num_testcases: int = 1) -> RunSpec:
+    """
+    Run specification for code efficiency evaluation scenario.
+
+    This scenario evaluates whether LLM-generated code has similar runtime efficiency
+    as the original student code. It focuses on problems where both solutions are
+    functionally correct and measures runtime performance alignment.
+
+    Requires C++ compiler (g++) to be available for actual compilation and execution.
+    """
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.codeinsights_code_efficiency_scenario.CodeInsightsCodeEfficiencyScenario",
+        args={"num_testcases": num_testcases},
+    )
+
+    instruction = (
+        "You are the same student who wrote the three examples below in your foundational C++ course. "
+        "Mimic exactly your personal coding style, conventions, and make sure to generate a correct code. "
+        "Do not over-optimize or introduce unfamiliar patterns. If the code is correct but inefficient, "
+        "imitate the inefficiency. "
+        "If the student writes efficiently, write efficiently too. "
+        "Include the same sort of formatting, variable names, and minor imperfections you demonstrated. "
+        "Provide ONLY your C++ implementation following the given template, where the answer will replace the {{ STUDENT_ANSWER }} block in the template."
+        "DO NOT reproduce the template part as the generated code would be inserted to the template,"
+        "and make sure the code is compatible with the Unit Test Input"
+        "Ensure your code is correct, includes any class definition when needed, and handles all edge cases properly."
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=instruction,
+        output_noun="Your code",
+        stop_sequences=[],
+        max_tokens=4000,
+        temperature=tpr,
+    )
+
+    return RunSpec(
+        name=f"codeinsights_code_efficiency:temperature={adapter_spec.temperature},num_testcases={num_testcases}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_code_efficiency_metric_specs(
+            num_runtime_runs=5,  # Run each solution 5 times for averaging
+            timeout_seconds=10,  # 10 second timeout per execution
+        )
+        + get_basic_metric_specs([]),
+        groups=["codeinsights", "codeinsights_code_efficiency"],
+    )
+
+
+@run_spec_function("codeinsights_edge_case")
+def get_codeinsights_edge_case_run_spec(tpr: float = 0.0, num_testcases: int = 1) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.codeinsights_edge_case_scenario.CodeInsightsEdgeCaseScenario",
+        args={"num_testcases": num_testcases},
+    )
+
+    instruction = (
+        "You are a student studying C++ with a consistent personal style, conventions, and proficiency level.\n"
+        "Your task is to identify which test case you would likely to fail for a given question with unit tests.\n"
+        "Respond only with integer of the unittest number\n\n"
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=instruction,
+        output_noun="Your code",
+        stop_sequences=[],
+        max_tokens=4000,
+        temperature=tpr,
+    )
+
+    return RunSpec(
+        name=f"codeinsights_edge_case:temperature={adapter_spec.temperature},num_testcases={num_testcases}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_edge_case_metric_specs() + get_basic_metric_specs([]),
+        groups=["codeinsights", "codeinsights_edge_case"],
+    )
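Note (illustrative): each of the five functions above takes a tpr argument that is passed through as the adapter temperature and echoed in the run name. A sketch, assuming crfm-helm 0.5.8 is installed:

# Build the correct-code run spec at temperature 0.2 with 3 test cases.
from helm.benchmark.run_specs.codeinsights_run_specs import get_codeinsights_correct_code_run_spec

run_spec = get_codeinsights_correct_code_run_spec(tpr=0.2, num_testcases=3)
print(run_spec.name)  # "codeinsights_correct_code:temperature=0.2,num_testcases=3"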
helm/benchmark/run_specs/healthqa_br_run_specs.py (new file)
@@ -0,0 +1,40 @@
+from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT
+from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("healthqa_br")
+def get_healthqa_br_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.healthqa_br_scenario.HEALTHQA_BR_Scenario", args={}
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="""
+        Escolha a alternativa correta para as questões de medicina (responda apenas com a letra).
+        Exemplo de Pergunta com a resposta:
+        Qual dos seguintes órgãos é responsável pela produção da insulina no corpo humano?
+        A) Fígado
+        B) Rins
+        C) Pâncreas
+        D) Baço
+        E) Coração
+
+        Resposta correta: C
+
+        A partir disso, responda:
+        """,
+        input_noun="Pergunta",
+        output_noun="Resposta",
+    )
+
+    return RunSpec(
+        name="healthqa_br",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["healthqa_br"],
+    )
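Note (illustrative): like bluex above, this spec uses the joint multiple-choice adapter with Portuguese nouns, so each instance is rendered roughly as below. The exact template lives in multiple_choice_joint_adapter.py; this sketch only shows the approximate shape implied by input_noun="Pergunta" and output_noun="Resposta".

# Approximate prompt shape for one instance (not verbatim adapter output):
prompt = (
    "Pergunta: <question text>\n"
    "A. <option 1>\n"
    "B. <option 2>\n"
    "C. <option 3>\n"
    "Resposta:"
)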
helm/benchmark/run_specs/heim_run_specs.py
@@ -60,7 +60,9 @@ def get_core_heim_metric_specs() -> List[MetricSpec]:
             class_name="helm.benchmark.metrics.image_generation.fractal_dimension_metric.FractalDimensionMetric",
             args={},
         ),
-        MetricSpec(class_name="helm.benchmark.metrics.image_generation.nsfw_metrics.NSFWMetric", args={}),
+        # Disabled due to keras issue.
+        # See: https://github.com/stanford-crfm/helm/issues/3741#issuecomment-3109478877
+        # MetricSpec(class_name="helm.benchmark.metrics.image_generation.nsfw_metrics.NSFWMetric", args={}),
         MetricSpec(class_name="helm.benchmark.metrics.image_generation.nudity_metrics.NudityMetric", args={}),
         MetricSpec(class_name="helm.benchmark.metrics.image_generation.watermark_metrics.WatermarkMetric", args={}),
     ] + get_basic_metric_specs(names=[])
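Note (illustrative): if a deployment still needs the NSFW metric despite the keras issue, it can be re-added in user code. The MetricSpec import path below is an assumption based on HELM's layout; verify it before use, and only re-enable if your environment avoids the linked issue.

# Sketch: re-enable the disabled NSFW metric in a custom metric-spec list.
from helm.benchmark.metrics.metric import MetricSpec  # assumed import path
from helm.benchmark.run_specs.heim_run_specs import get_core_heim_metric_specs

nsfw_metric_spec = MetricSpec(
    class_name="helm.benchmark.metrics.image_generation.nsfw_metrics.NSFWMetric",
    args={},
)
custom_metric_specs = get_core_heim_metric_specs() + [nsfw_metric_spec]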
helm/benchmark/run_specs/lmkt_run_specs.py (new file)
@@ -0,0 +1,144 @@
+"""Run spec functions for Vietnam WVS cultural alignment evaluation."""
+
+from helm.benchmark.adaptation.common_adapter_specs import (
+    get_generation_adapter_spec,
+)
+from helm.benchmark.metrics.common_metric_specs import (
+    get_exact_match_metric_specs,
+    get_f1_metric_specs,
+    get_open_ended_generation_metric_specs,
+)
+from helm.benchmark.metrics.lmkt_metric_specs import get_semantic_similarity_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+INSTRUCTIONS = {
+    "cultural_value_understanding_wvs": {
+        "en": {
+            "instructions": "Please respond as the {country} persona described below.",
+            "input_noun": "Question",
+            "output_noun": "Answer",
+        },
+        "vi": {
+            "instructions": "Vui lòng trả lời như một người {country} được mô tả bên dưới.",
+            "input_noun": "Câu hỏi",
+            "output_noun": "Trả lời",
+        },
+    },
+    "social_norm_application_normad": {
+        "en": {
+            "instructions": "Please respond as the {country} persona described below.",
+            "input_noun": "Situation",
+            "output_noun": "Response",
+        },
+        "vi": {
+            "instructions": "Vui lòng trả lời như một người {country} được mô tả bên dưới.",
+            "input_noun": "Tình huống",
+            "output_noun": "Phản hồi",
+        },
+    },
+    "social_norm_explanation_normad": {
+        "en": {
+            "instructions": "Please respond as the {country} persona described below.",
+            "input_noun": "Situation",
+            "output_noun": "Explanation",
+        },
+        "vi": {
+            "instructions": "Vui lòng trả lời như một người {country} được mô tả bên dưới.",
+            "input_noun": "Tình huống",
+            "output_noun": "Giải thích",
+        },
+    },
+}
+
+COUNTRIES = {
+    "US": "United States",
+    "VN": "Vietnam",
+}
+
+
+@run_spec_function("cultural_value_understanding_wvs")
+def get_cultural_value_understanding_wvs_spec(language: str, country: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.lmkt_scenarios.CulturalValueUnderstandingWVSScenario",
+        args={
+            "language": language,
+            "num_personas": 300,
+            "num_question_variants": 4,
+            "include_few_shot_examples": True,
+        },
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=INSTRUCTIONS["cultural_value_understanding_wvs"][language]["instructions"].format(
+            country=COUNTRIES[country]
+        ),
+        input_noun=INSTRUCTIONS["cultural_value_understanding_wvs"][language]["input_noun"],
+        output_noun=INSTRUCTIONS["cultural_value_understanding_wvs"][language]["output_noun"],
+        max_tokens=3,
+        stop_sequences=[],
+    )
+
+    return RunSpec(
+        name="cultural_value_understanding_wvs",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs() + get_f1_metric_specs(),
+        groups=["lmkt", "cultural_value_understanding_wvs"],
+    )
+
+
+@run_spec_function("social_norm_application_normad")
+def get_social_norm_application_normad_spec(language: str, country: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.lmkt_scenarios.SocialNormApplicationNormADScenario",
+        args={
+            "language": language,
+        },
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=INSTRUCTIONS["social_norm_application_normad"][language]["instructions"].format(
+            country=COUNTRIES[country]
+        ),
+        input_noun=INSTRUCTIONS["social_norm_application_normad"][language]["input_noun"],
+        output_noun=INSTRUCTIONS["social_norm_application_normad"][language]["output_noun"],
+        max_tokens=5,
+        stop_sequences=[],
+    )
+
+    return RunSpec(
+        name="social_norm_application_normad",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs() + get_f1_metric_specs(),
+        groups=["lmkt", "social_norm_application_normad"],
+    )
+
+
+@run_spec_function("social_norm_explanation_normad")
+def get_social_norm_explanation_normad_spec(language: str, country: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.lmkt_scenarios.SocialNormExplanationNormADScenario",
+        args={
+            "language": language,
+        },
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=INSTRUCTIONS["social_norm_explanation_normad"][language]["instructions"].format(
+            country=COUNTRIES[country]
+        ),
+        input_noun=INSTRUCTIONS["social_norm_explanation_normad"][language]["input_noun"],
+        output_noun=INSTRUCTIONS["social_norm_explanation_normad"][language]["output_noun"],
+        max_tokens=128,
+        stop_sequences=[],
+    )
+
+    return RunSpec(
+        name="social_norm_explanation_normad",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_open_ended_generation_metric_specs() + get_semantic_similarity_metric_specs(),
+        groups=["lmkt", "social_norm_explanation_normad"],
+    )
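Note (illustrative): the run spec functions above simply combine the INSTRUCTIONS and COUNTRIES tables, so the rendered instruction can be previewed directly. The output string follows from the dictionaries in the diff:

# Format the Vietnamese WVS instruction for the Vietnam persona (country code "VN").
template = "Vui lòng trả lời như một người {country} được mô tả bên dưới."
print(template.format(country="Vietnam"))
# -> "Vui lòng trả lời như một người Vietnam được mô tả bên dưới."
# (Per the "en" entry, this means: "Please respond as the Vietnam persona described below.")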
helm/benchmark/run_specs/long_context_run_specs.py
@@ -1,4 +1,9 @@
-from helm.benchmark.adaptation.adapter_spec import ADAPT_CHAT, ADAPT_GENERATION, AdapterSpec
+from helm.benchmark.adaptation.adapter_spec import (
+    ADAPT_CHAT,
+    ADAPT_GENERATION,
+    ADAPT_MULTIPLE_CHOICE_JOINT,
+    AdapterSpec,
+)
 from helm.benchmark.metrics.common_metric_specs import (
     get_exact_match_metric_specs,
     get_open_ended_generation_metric_specs,
@@ -29,6 +34,27 @@ def _get_long_context_generation_adapter_spec(max_tokens: int) -> AdapterSpec:
     )
 
 
+def _get_long_context_multiple_choice_adapter_spec(max_tokens: int) -> AdapterSpec:
+    return AdapterSpec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        global_prefix="",
+        global_suffix="",
+        instructions="Read the passage and answer the following question. Respond with only a single letter corresponding to your choice. Do not include a period in your answer.\n\n",  # noqa: E501
+        input_prefix="",
+        input_suffix="\n",
+        reference_prefix="A. ",
+        reference_suffix="\n",
+        output_prefix="\nAnswer the question above based on the passage. Respond with only a single letter corresponding to your choice. Do not include a period in your answer.\n",  # noqa: E501
+        output_suffix="",
+        instance_prefix="",
+        max_train_instances=0,
+        num_outputs=1,
+        temperature=0.0,
+        max_tokens=max_tokens,
+        stop_sequences=[],
+    )
+
+
 @run_spec_function("ruler_hotpotqa")
 def get_ruler_hotpotqa_spec(max_num_words: int = 131072) -> RunSpec:
     scenario_spec = ScenarioSpec(
@@ -96,6 +122,27 @@ def get_infinite_bench_en_qa_spec(max_num_words: int = 131072) -> RunSpec:
     )
 
 
+@run_spec_function("infinite_bench_en_mc")
+def get_infinite_bench_en_mc_spec(max_num_words: int = 131072) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.infinite_bench_en_mc_scenario.InfiniteBenchEnMCScenario",
+        args={
+            "max_num_words": max_num_words,
+        },
+    )
+
+    adapter_spec = _get_long_context_multiple_choice_adapter_spec(max_tokens=40)
+    metric_specs = get_exact_match_metric_specs()
+
+    return RunSpec(
+        name=f"infinite_bench_en_mc:max_num_words={max_num_words}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=["infinite_bench_en_mc"],
+    )
+
+
 @run_spec_function("infinite_bench_en_sum")
 def get_infinite_bench_en_sum_spec(max_num_words: int = 131072) -> RunSpec:
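Note (illustrative): the new infinite_bench_en_mc spec is parameterized by max_num_words and reuses the zero-shot multiple-choice adapter defined above. A sketch, assuming crfm-helm 0.5.8 is installed:

# Build the new long-context MC run spec with a smaller context budget.
from helm.benchmark.run_specs.long_context_run_specs import get_infinite_bench_en_mc_spec

spec = get_infinite_bench_en_mc_spec(max_num_words=65536)
print(spec.name)                              # "infinite_bench_en_mc:max_num_words=65536"
print(spec.adapter_spec.max_tokens)           # 40
print(spec.adapter_spec.max_train_instances)  # 0 (zero-shot)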