crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (652)
  1. crfm_helm-0.5.6.dist-info/METADATA +427 -0
  2. crfm_helm-0.5.6.dist-info/RECORD +941 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  22. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  23. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  24. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  25. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  26. helm/benchmark/adaptation/prompt.py +1 -1
  27. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  28. helm/benchmark/annotation/air_bench_annotator.py +21 -6
  29. helm/benchmark/annotation/annotator.py +5 -0
  30. helm/benchmark/annotation/annotator_factory.py +3 -20
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
  36. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  37. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  38. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  39. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  40. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  41. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  42. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  43. helm/benchmark/annotation/medalign_annotator.py +100 -0
  44. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  45. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  46. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  47. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  48. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  49. helm/benchmark/annotation/model_as_judge.py +214 -6
  50. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  51. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  52. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  54. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  55. helm/benchmark/annotation/spider_annotator.py +18 -0
  56. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  57. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  58. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  59. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  60. helm/benchmark/annotation_executor.py +35 -15
  61. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  62. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  63. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  64. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  65. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  66. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  67. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  68. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  69. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  70. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  71. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  72. helm/benchmark/augmentations/perturbation.py +1 -1
  73. helm/benchmark/augmentations/space_perturbation.py +2 -2
  74. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  75. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  76. helm/benchmark/augmentations/test_perturbation.py +16 -13
  77. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  78. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  79. helm/benchmark/data_preprocessor.py +2 -2
  80. helm/benchmark/executor.py +11 -12
  81. helm/benchmark/huggingface_registration.py +2 -7
  82. helm/benchmark/metrics/aci_bench_metrics.py +14 -0
  83. helm/benchmark/metrics/basic_metrics.py +6 -6
  84. helm/benchmark/metrics/bbq_metrics.py +2 -2
  85. helm/benchmark/metrics/bias_metrics.py +12 -3
  86. helm/benchmark/metrics/bias_word_lists.py +1 -1
  87. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  88. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  89. helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
  90. helm/benchmark/metrics/classification_metrics.py +76 -12
  91. helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
  92. helm/benchmark/metrics/code_metrics.py +5 -5
  93. helm/benchmark/metrics/comet_metric.py +125 -0
  94. helm/benchmark/metrics/common_metric_specs.py +9 -2
  95. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  96. helm/benchmark/metrics/copyright_metrics.py +4 -4
  97. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  98. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  99. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  101. helm/benchmark/metrics/dischargeme_metrics.py +14 -0
  102. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  103. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  104. helm/benchmark/metrics/efficiency_metrics.py +6 -6
  105. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  106. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  107. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  108. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  109. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  110. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  113. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  115. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  116. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  117. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  118. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  120. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  122. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  123. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  124. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  125. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  126. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  127. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  128. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  129. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  130. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  131. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  132. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  133. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  134. helm/benchmark/metrics/med_dialog_metrics.py +14 -0
  135. helm/benchmark/metrics/medalign_metrics.py +14 -0
  136. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  137. helm/benchmark/metrics/medec_metrics.py +101 -0
  138. helm/benchmark/metrics/medi_qa_metrics.py +14 -0
  139. helm/benchmark/metrics/medication_qa_metrics.py +10 -19
  140. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  141. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  142. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  143. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  144. helm/benchmark/metrics/mental_health_metrics.py +14 -0
  145. helm/benchmark/metrics/metric.py +3 -3
  146. helm/benchmark/metrics/metric_service.py +11 -11
  147. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  148. helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
  149. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  150. helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
  151. helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
  152. helm/benchmark/metrics/nltk_helper.py +32 -0
  153. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  154. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  155. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  156. helm/benchmark/metrics/output_processing_metric.py +60 -0
  157. helm/benchmark/metrics/output_processors.py +15 -0
  158. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  159. helm/benchmark/metrics/ranking_metrics.py +3 -3
  160. helm/benchmark/metrics/reference_metric.py +3 -3
  161. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  162. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  163. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  164. helm/benchmark/metrics/spider_metrics.py +7 -0
  165. helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
  166. helm/benchmark/metrics/statistic.py +1 -1
  167. helm/benchmark/metrics/summac/model_summac.py +2 -3
  168. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  169. helm/benchmark/metrics/summarization_metrics.py +20 -9
  170. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  171. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  172. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  173. helm/benchmark/metrics/test_metric.py +1 -1
  174. helm/benchmark/metrics/test_statistic.py +2 -2
  175. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  176. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  177. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  178. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  179. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  180. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  181. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  182. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
  183. helm/benchmark/metrics/toxicity_metrics.py +6 -6
  184. helm/benchmark/metrics/unitxt_metrics.py +7 -5
  185. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  186. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  187. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  188. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  189. helm/benchmark/model_deployment_registry.py +6 -8
  190. helm/benchmark/model_metadata_registry.py +16 -0
  191. helm/benchmark/presentation/contamination.py +3 -3
  192. helm/benchmark/presentation/create_plots.py +33 -12
  193. helm/benchmark/presentation/run_display.py +13 -0
  194. helm/benchmark/presentation/schema.py +2 -1
  195. helm/benchmark/presentation/summarize.py +97 -67
  196. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  197. helm/benchmark/reeval_run.py +202 -0
  198. helm/benchmark/reeval_runner.py +355 -0
  199. helm/benchmark/run.py +86 -90
  200. helm/benchmark/run_expander.py +90 -9
  201. helm/benchmark/run_spec_factory.py +13 -0
  202. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  203. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  204. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  205. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  206. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  207. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  208. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  209. helm/benchmark/run_specs/experimental_run_specs.py +142 -3
  210. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  211. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  212. helm/benchmark/run_specs/long_context_run_specs.py +141 -0
  213. helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
  214. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  215. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  216. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  217. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  218. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  219. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  220. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  221. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  222. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  223. helm/benchmark/run_specs/vlm_run_specs.py +103 -2
  224. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  225. helm/benchmark/runner.py +5 -5
  226. helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
  227. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  228. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  229. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  230. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  231. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  232. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  233. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  234. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  235. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  236. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  237. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  238. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  239. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  240. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  241. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  242. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  243. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  244. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  245. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  246. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  247. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  248. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  249. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  250. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  251. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  252. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  253. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  254. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  255. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  256. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  257. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  258. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  259. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  260. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  261. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  262. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  263. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  264. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  265. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  266. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  267. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  268. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  269. helm/benchmark/scenarios/bold_scenario.py +1 -1
  270. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  271. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  272. helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
  273. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  274. helm/benchmark/scenarios/clear_scenario.py +157 -0
  275. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  276. helm/benchmark/scenarios/code_scenario.py +17 -4
  277. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  278. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  279. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  280. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  281. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  282. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  283. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  284. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  285. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  286. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  287. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  288. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  289. helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
  290. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  291. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  292. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  293. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  294. helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
  295. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  296. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  297. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  298. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  299. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  300. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  301. helm/benchmark/scenarios/grammar.py +2 -2
  302. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  303. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  304. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  305. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  306. helm/benchmark/scenarios/headqa_scenario.py +136 -0
  307. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  308. helm/benchmark/scenarios/ice_scenario.py +8 -4
  309. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  310. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  311. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  312. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  313. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
  314. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  315. helm/benchmark/scenarios/koala_scenario.py +1 -1
  316. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  317. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  318. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  319. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  320. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  321. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  322. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  323. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  324. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  325. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  326. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/math_scenario.py +9 -1
  328. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  329. helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
  330. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  331. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  332. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  333. helm/benchmark/scenarios/medalign_scenario.py +94 -0
  334. helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
  335. helm/benchmark/scenarios/medbullets_scenario.py +145 -0
  336. helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
  337. helm/benchmark/scenarios/medec_scenario.py +125 -0
  338. helm/benchmark/scenarios/medhallu_scenario.py +72 -0
  339. helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
  340. helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
  341. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  342. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  343. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  344. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  345. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  346. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  347. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  348. helm/benchmark/scenarios/mental_health_scenario.py +123 -0
  349. helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
  350. helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
  351. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
  352. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  353. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  354. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  355. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  356. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
  357. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
  358. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
  359. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  360. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  361. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  362. helm/benchmark/scenarios/numeracy_scenario.py +12 -2
  363. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  364. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  365. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  366. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  367. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  368. helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
  369. helm/benchmark/scenarios/quac_scenario.py +10 -1
  370. helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
  371. helm/benchmark/scenarios/raft_scenario.py +17 -2
  372. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  373. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  374. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  375. helm/benchmark/scenarios/scenario.py +9 -1
  376. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  377. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  378. helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
  379. helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
  380. helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
  381. helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
  382. helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
  383. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  384. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  385. helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
  386. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  387. helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
  388. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  389. helm/benchmark/scenarios/spider_scenario.py +91 -0
  390. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
  391. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  392. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  393. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  394. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  395. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  396. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  397. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  398. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  399. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  400. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  401. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  402. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  403. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  404. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  405. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  406. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  407. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  408. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  409. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  410. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  411. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  412. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  413. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  414. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  415. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  416. helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
  417. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  418. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  419. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  420. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  421. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  422. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  423. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  424. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  425. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  426. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  427. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  428. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  429. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  430. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  431. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  432. helm/benchmark/server.py +13 -1
  433. helm/benchmark/slurm_runner.py +1 -1
  434. helm/benchmark/static/schema_audio.yaml +763 -0
  435. helm/benchmark/static/schema_autobencher.yaml +150 -0
  436. helm/benchmark/static/schema_call_center.yaml +97 -60
  437. helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
  438. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  439. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  440. helm/benchmark/static/schema_enterprise.yaml +319 -0
  441. helm/benchmark/static/schema_finance.yaml +14 -12
  442. helm/benchmark/static/schema_heim.yaml +1389 -0
  443. helm/benchmark/static/schema_long_context.yaml +283 -0
  444. helm/benchmark/static/schema_medhelm.yaml +1140 -0
  445. helm/benchmark/static/schema_melt.yaml +1257 -0
  446. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  447. helm/benchmark/static/schema_safety.yaml +18 -1
  448. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  449. helm/benchmark/static/schema_slphelm.yaml +162 -0
  450. helm/benchmark/static/schema_social_audio.yaml +224 -0
  451. helm/benchmark/static/schema_sql.yaml +171 -0
  452. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  453. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  454. helm/benchmark/static/schema_vhelm.yaml +129 -56
  455. helm/benchmark/static/schema_video.yaml +219 -0
  456. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  457. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  458. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  459. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  460. helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
  461. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  462. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  463. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
  464. helm/benchmark/static_build/config.js +1 -1
  465. helm/benchmark/static_build/index.html +6 -6
  466. helm/benchmark/window_services/default_window_service.py +1 -1
  467. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
  468. helm/benchmark/window_services/ice_window_service.py +1 -1
  469. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  470. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  471. helm/benchmark/window_services/local_window_service.py +2 -2
  472. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  473. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  474. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  475. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  476. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  477. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  478. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  479. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  480. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  481. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  482. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  483. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  484. helm/benchmark/window_services/test_utils.py +4 -5
  485. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  486. helm/benchmark/window_services/tokenizer_service.py +7 -8
  487. helm/benchmark/window_services/yalm_window_service.py +1 -1
  488. helm/clients/ai21_client.py +3 -3
  489. helm/clients/aleph_alpha_client.py +1 -1
  490. helm/clients/anthropic_client.py +69 -29
  491. helm/clients/audio_language/__init__.py +0 -0
  492. helm/clients/audio_language/diva_llama_client.py +120 -0
  493. helm/clients/audio_language/llama_omni_client.py +198 -0
  494. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  495. helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
  496. helm/clients/audio_language/qwen_audiolm_client.py +152 -0
  497. helm/clients/audio_language/test.py +62 -0
  498. helm/clients/auto_client.py +4 -2
  499. helm/clients/azure_openai_client.py +55 -0
  500. helm/clients/bedrock_client.py +203 -7
  501. helm/clients/bedrock_utils.py +33 -0
  502. helm/clients/client.py +7 -7
  503. helm/clients/clip_scorers/clip_scorer.py +1 -1
  504. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  505. helm/clients/cohere_client.py +3 -3
  506. helm/clients/google_client.py +1 -1
  507. helm/clients/grok_client.py +36 -0
  508. helm/clients/http_model_client.py +1 -1
  509. helm/clients/huggingface_client.py +52 -21
  510. helm/clients/huggingface_pipeline_client.py +138 -0
  511. helm/clients/ibm_client.py +267 -0
  512. helm/clients/image_generation/adobe_vision_client.py +1 -1
  513. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  514. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  515. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  516. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  517. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  518. helm/clients/image_generation/cogview2_client.py +1 -1
  519. helm/clients/image_generation/dalle2_client.py +1 -1
  520. helm/clients/image_generation/dalle3_client.py +2 -2
  521. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  522. helm/clients/image_generation/dalle_mini/data.py +1 -1
  523. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  524. helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
  525. helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
  526. helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
  527. helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
  528. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  529. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  530. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  531. helm/clients/image_generation/dalle_mini_client.py +1 -1
  532. helm/clients/image_generation/deep_floyd_client.py +1 -1
  533. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  534. helm/clients/image_generation/lexica_client.py +1 -1
  535. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  536. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  537. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  538. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  539. helm/clients/image_generation/mindalle_client.py +1 -1
  540. helm/clients/image_generation/together_image_generation_client.py +1 -1
  541. helm/clients/lit_gpt_client.py +2 -2
  542. helm/clients/mistral_client.py +62 -18
  543. helm/clients/nvidia_nim_client.py +0 -3
  544. helm/clients/openai_client.py +308 -43
  545. helm/clients/openai_responses_client.py +174 -0
  546. helm/clients/palmyra_client.py +3 -9
  547. helm/clients/reka_client.py +3 -3
  548. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  549. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  550. helm/clients/stanfordhealthcare_google_client.py +43 -0
  551. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  552. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  553. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  554. helm/clients/test_client.py +1 -1
  555. helm/clients/test_together_client.py +6 -1
  556. helm/clients/together_client.py +76 -9
  557. helm/clients/upstage_client.py +23 -0
  558. helm/clients/vertexai_client.py +45 -13
  559. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  560. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  561. helm/clients/vision_language/idefics_client.py +6 -2
  562. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  563. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  564. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  565. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  566. helm/clients/vision_language/paligemma_client.py +2 -2
  567. helm/clients/vision_language/qwen2_vlm_client.py +188 -0
  568. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  569. helm/clients/vllm_client.py +4 -6
  570. helm/clients/writer_client.py +102 -0
  571. helm/clients/yi_client.py +0 -3
  572. helm/common/audio_utils.py +111 -0
  573. helm/common/context.py +80 -0
  574. helm/common/credentials_utils.py +5 -5
  575. helm/common/file_caches/local_file_cache.py +1 -1
  576. helm/common/file_caches/test_local_file_cache.py +1 -1
  577. helm/common/general.py +9 -2
  578. helm/common/hierarchical_logger.py +46 -3
  579. helm/common/images_utils.py +2 -2
  580. helm/common/local_context.py +140 -0
  581. helm/common/media_object.py +2 -2
  582. helm/common/multimodal_request_utils.py +26 -0
  583. helm/common/reeval_parameters.py +12 -0
  584. helm/common/remote_context.py +61 -0
  585. helm/common/request.py +14 -2
  586. helm/common/response_format.py +18 -0
  587. helm/common/test_media_object.py +1 -1
  588. helm/config/model_deployments.yaml +1792 -28
  589. helm/config/model_metadata.yaml +1606 -51
  590. helm/config/tokenizer_configs.yaml +521 -4
  591. helm/proxy/cli.py +5 -3
  592. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  593. helm/proxy/example_queries.py +1 -1
  594. helm/proxy/server.py +11 -4
  595. helm/proxy/services/remote_service.py +1 -1
  596. helm/proxy/services/server_service.py +22 -86
  597. helm/proxy/services/test_remote_service.py +2 -2
  598. helm/proxy/services/test_service.py +1 -1
  599. helm/proxy/static/general.js +122 -0
  600. helm/proxy/static/help.html +99 -0
  601. helm/proxy/static/index.css +57 -0
  602. helm/proxy/static/index.html +40 -0
  603. helm/proxy/static/index.js +456 -0
  604. helm/proxy/static/info-icon.png +0 -0
  605. helm/proxy/test_retry.py +1 -1
  606. helm/proxy/token_counters/auto_token_counter.py +1 -1
  607. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  608. helm/tokenizers/caching_tokenizer.py +2 -30
  609. helm/tokenizers/grok_tokenizer.py +53 -0
  610. helm/tokenizers/http_model_tokenizer.py +1 -1
  611. helm/tokenizers/huggingface_tokenizer.py +3 -3
  612. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  613. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  614. helm/tokenizers/test_grok_tokenizer.py +33 -0
  615. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  616. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  617. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  618. helm/tokenizers/tokenizer.py +3 -1
  619. helm/tokenizers/yalm_tokenizer.py +3 -3
  620. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  621. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  622. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  623. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  624. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  625. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  626. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  627. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  628. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  629. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  630. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  631. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  632. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  633. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  634. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  635. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  636. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  637. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  638. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  639. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  640. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  641. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  642. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  643. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  644. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  645. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  646. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  647. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  648. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  649. helm/tokenizers/anthropic_tokenizer.py +0 -52
  650. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  651. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
  652. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,342 @@
1
+ from typing import Dict, List, Tuple, Literal
2
+
3
+ import random
4
+ import dataclasses
5
+
6
+ from copy import copy
7
+ from dataclasses import dataclass
8
+ from helm.benchmark.scenarios.scenario import (
9
+ Scenario,
10
+ Instance,
11
+ Reference,
12
+ TRAIN_SPLIT,
13
+ TEST_SPLIT,
14
+ VALID_SPLIT,
15
+ CORRECT_TAG,
16
+ Input,
17
+ Output,
18
+ )
19
+
20
+
21
@dataclass(frozen=True)
class MELTLanguageLogicalStatement:
    """Base class for a logical statement written in Vietnamese.

    Inspired by the HELM implementation of the "Transformers as Soft Reasoners
    over Language" paper: https://arxiv.org/abs/2002.05867
    """

    subject: str  # the individual or group the statement is about
    subject_category: str  # the category the subject belongs to (e.g. "người", "động vật")
    specifier_type: Literal["một", "cái"]  # default specifier placed in front of the subject

    def generate_specified_subject(self, upper=False, specifier_type=None) -> str:
        """Return the subject, prefixed with its specifier where one applies.

        The specifier plays a role similar to English "a" / "the". No specifier
        is added when the subject category is "người" or when the subject itself
        is "người"; the bare subject is returned in that case.

        Examples:
            subject="con mèo", subject_category="động vật", specifier_type="cái", upper=False
                -> "cái con mèo"
            subject="quả táo", subject_category="thực vật", specifier_type="một", upper=True
                -> "Một quả táo"

        :param upper: Whether the first character of the specifier is upper-cased.
        :param specifier_type: Optional override for the instance's ``specifier_type``.
        :return: The (possibly specified) subject string.
        """
        if self.subject_category == "người" or self.subject == "người":
            return self.subject

        specifier = self.specifier_type if specifier_type is None else specifier_type
        initial = specifier[0]
        initial = initial.upper() if upper else initial.lower()
        return f"{initial}{specifier[1:]} {self.subject}"
46
+
47
+
48
@dataclass(frozen=True)
class MELTLanguageRule(MELTLanguageLogicalStatement):
    """A rule stating that a set of attributes of a subject implies another attribute.

    Inspired by the HELM implementation of the "Transformers as Soft Reasoners
    over Language" paper: https://arxiv.org/abs/2002.05867
    """

    condition: List[str]  # attributes that must hold for the rule to apply
    condition_conjunction: Literal["và", "hoặc"]  # how the conditions combine: "và" (and) or "hoặc" (or)
    consequent: str  # the attribute that results from applying the rule

    def __str__(self) -> str:
        """Render the rule as a sentence of the form "if x (and/or y) then z".

        For example, a rule with the fields

            subject='An', subject_category='người', specifier_type='một',
            condition=['đỏ', 'tốt'], condition_conjunction='và', consequent='lạnh'

        renders as: "Nếu An là đỏ và tốt, thì An là lạnh."
        """
        joiner = f" {self.condition_conjunction} "
        antecedent = joiner.join(self.condition)
        # The premise uses the rule's own specifier; the conclusion always uses "cái".
        premise_subject = self.generate_specified_subject()
        conclusion_subject = self.generate_specified_subject(specifier_type="cái")
        return f"Nếu {premise_subject} là {antecedent}, thì {conclusion_subject} là {self.consequent}."
80
+
81
+
82
@dataclass(frozen=True)
class MELTLanguageFact(MELTLanguageLogicalStatement):
    """A statement that a subject has some attributes.

    Inspired by the HELM implementation of the "Transformers as Soft Reasoners
    over Language" paper: https://arxiv.org/abs/2002.05867
    """

    specific_attributes: List[str]  # more specific versions of the attributes
    generic_attributes: List[str]  # the attributes which apply to the subject
    use_specific_attributes: bool  # render the specific attributes instead of the generic ones (hard mode)
    upper: bool = True  # whether the specifier starts with an upper-case character

    def __str__(self) -> str:
        """Render the fact as a sentence listing the subject's attributes.

        e.g. subject="con chó", generic_attributes=["to", "đỏ"], specifier_type="cái"
        renders as "Cái con chó là to và đỏ."

        A fact without generic attributes renders as "Không có gì.".
        """
        if not self.generic_attributes:
            return "Không có gì."

        if self.use_specific_attributes:
            attributes = self.specific_attributes
        else:
            attributes = self.generic_attributes

        subject_phrase = self.generate_specified_subject(upper=self.upper)
        attribute_phrase = " và ".join(attributes)
        return f"{subject_phrase} là {attribute_phrase}."
107
+
108
+
109
def get_vocab() -> Tuple[Dict[str, List[str]], Dict[str, List[str]]]:
    """Return the vocabulary used to build facts and rules.

    The result is a pair ``(attribute_groups, subjects)``:

    * ``subjects`` maps a subject category (e.g. "người", "động vật") to the
      list of subjects belonging to that category.
    * ``attribute_groups`` maps a general attribute to the list of attributes
      that imply it (the general attribute itself plus more specific variants,
      e.g. something "lạnh buốt" is also "lạnh", but not the other way round).

    Any group whose key also appears as a specific variant of a *different*
    group is left out of the returned attribute groups.
    """
    # Subjects, grouped by category.
    subjects: Dict[str, List[str]] = {
        "người": ["An", "Bình", "Cường", "Duy", "Đạt", "Phương"],
        "động vật": [
            "con chó", "con mèo", "con thỏ", "con chuột", "con hổ", "con sư tử", "con gấu",
            "con sóc", "con bò", "con gấu trúc", "con nhím", "con voi", "con hươu cao cổ", "con hà mã",
        ],
        "thực vật": ["hoa anh túc", "hoa bồ công anh", "cây", "hoa hồng", "hoa hướng dương"],
    }

    # General attributes and their more specific variants (the variants are used in hard difficulty).
    attribute_groups: Dict[str, List[str]] = {
        "trẻ": ["trẻ"],
        "mềm": ["mềm"],
        "buồn": ["buồn"],
        "sợ": ["sợ"],
        "lạnh": ["lạnh", "lạnh buốt", "mát mẻ"],
        "nóng": ["nóng", "ấm"],
        "thông minh": ["thông minh", "tài giỏi", "khôn", "sáng trí"],
        "sạch": ["sạch", "ngăn nắp"],
        "nhỏ": ["nhỏ", "bé", "tí nị"],
        "to": ["to", "khổng lồ", "bự", "lớn"],
        "tốt": ["tốt", "tử tế", "tốt bụng"],
        "đẹp": ["đẹp", "xinh"],
        "đỏ": ["đỏ", "đỏ thẫm"],
        "xanh dương": ["xanh dương", "xanh lam"],
        "xanh lục": ["xanh lục", "xanh lá cây"],
        "tím": ["tím", "tím than"],
        "chán": ["chán", "đần"],
        "cũ": ["cũ", "xưa", "cổ"],
        "mạnh": ["mạnh", "mạnh mẽ", "cơ bắp"],
        "yếu": ["yếu", "yếu đuối", "mỏng manh"],
        "nhanh": ["nhanh", "mau"],
        "chậm": ["chậm", "chậm chạp"],
        "xấu": ["xấu", "xấu xa", "ác", "độc ác"],
        "hạnh phúc": ["hạnh phúc", "hân hoan", "vui mừng", "vui vẻ"],
        "tròn": ["tròn", "hình tròn", "hình cầu"],
    }

    # Collect every key that is also listed as a variant of a different group ...
    redundant_keys = set()
    for general, variants in attribute_groups.items():
        for variant in variants:
            if variant != general and variant in attribute_groups:
                redundant_keys.add(variant)

    # ... and keep only the remaining groups, in their original order.
    kept_groups: Dict[str, List[str]] = {
        general: variants for general, variants in attribute_groups.items() if general not in redundant_keys
    }
    return kept_groups, subjects
180
+
181
+
182
def generate_rules(
    attribute_groups: Dict[str, List[str]],
    subject: str,
    subject_category: str,
    max_rules: int = 5,
    specific_category: bool = False,
) -> List[MELTLanguageRule]:
    """Generate a random list of rules about a subject.

    The general attributes (keys of ``attribute_groups``) are shuffled and then
    consumed in chunks of two or three; within a chunk the last attribute is
    the consequent and the others are the conditions. Because every attribute
    is used at most once, rules do not contradict each other and no consequent
    feeds another rule (a single step of reasoning).

    :param attribute_groups: Mapping from general attribute to its variants; only the keys are used.
    :param subject: The specific subject (e.g. "An").
    :param subject_category: The category of the subject (e.g. "người").
    :param max_rules: Upper bound on the number of generated rules.
    :param specific_category: If True every rule is about ``subject``; otherwise each rule is
        randomly about either ``subject_category`` or ``subject``.
    :return: The generated rules.
    """
    remaining = list(attribute_groups)
    random.shuffle(remaining)

    rules: List[MELTLanguageRule] = []
    while len(remaining) > 2 and len(rules) < max_rules:
        if specific_category:
            rule_subject = subject
        else:
            rule_subject = random.choice([subject_category, subject])

        chunk_size = random.randint(2, 3)
        chunk = remaining[:chunk_size]
        remaining = remaining[chunk_size:]

        # The last attribute of the chunk is implied by the ones before it.
        *premises, conclusion = chunk
        conjunction = random.choice(["và", "hoặc"])
        rules.append(
            MELTLanguageRule(
                subject=rule_subject,
                subject_category=subject_category,
                specifier_type="một",
                condition=premises,
                condition_conjunction=conjunction,
                consequent=conclusion,
            )
        )
    return rules
217
+
218
+
219
def generate_test(
    attribute_groups: Dict[str, List[str]],
    subject: str,
    subject_category: str,
    rules: List[MELTLanguageRule],
    use_specific_attributes: bool,
    p_consequenceless=0.1,
) -> Tuple[MELTLanguageFact, List[MELTLanguageRule], MELTLanguageFact]:
    """Generate a test case for a set of rules.

    Two general attributes are sampled for the subject, and every rule that
    fires on them contributes its consequent to the target. When nothing can
    be deduced, the test case is kept only with probability
    ``p_consequenceless``; otherwise a new one is generated.

    :param attribute_groups: Mapping from general attribute to its variants.
    :param subject: The subject of the test fact.
    :param subject_category: The category of the subject.
    :param rules: The rules to evaluate against the sampled attributes.
    :param use_specific_attributes: Whether the test fact is rendered with the specific variants.
    :param p_consequenceless: Probability of keeping a test case from which nothing can be deduced.
    :return: ``(test_fact, rules_used, target_fact)``.
    """
    # General attributes assigned to the subject, and one randomly chosen variant of each.
    sampled_generic: List[str] = random.sample(list(attribute_groups.keys()), 2)
    sampled_specific: List[str] = [random.choice(attribute_groups[attribute]) for attribute in sampled_generic]

    deduced: List[str] = []  # attributes implied by the sampled attributes and the rules
    fired_rules: List[MELTLanguageRule] = []
    for rule in rules:
        # A rule whose consequent is already stated in the fact is ignored.
        if rule.consequent in sampled_generic:
            continue

        premises = set(rule.condition)
        if rule.condition_conjunction == "và":
            fires = premises.issubset(sampled_generic)
        elif rule.condition_conjunction == "hoặc":
            fires = not premises.isdisjoint(sampled_generic)
        else:
            fires = False

        if fires:
            fired_rules.append(rule)
            deduced.append(rule.consequent)

    # Re-roll (with probability 1 - p_consequenceless) when nothing could be deduced.
    if not deduced and random.random() > p_consequenceless:
        return generate_test(
            attribute_groups, subject, subject_category, rules, use_specific_attributes, p_consequenceless
        )

    test_fact: MELTLanguageFact = MELTLanguageFact(
        subject,
        subject_category,
        specifier_type="cái",
        specific_attributes=sampled_specific,
        generic_attributes=sampled_generic,
        use_specific_attributes=use_specific_attributes,
    )
    # The target is the same kind of fact, stating the deduced attributes instead.
    target_fact: MELTLanguageFact = dataclasses.replace(
        test_fact,
        specific_attributes=deduced,
        generic_attributes=deduced,
    )
    return test_fact, fired_rules, target_fact
272
+
273
+
274
class MELTSRNScenario(Scenario):
    """
    Synthetic Reasoning Natural Language benchmark inspired by "Transformers as Soft Reasoners over Language"
    https://arxiv.org/abs/2002.05867
    """

    name = "sythetic_reasoning_natural"
    description = "Language Pattern Matching"
    tags = ["reasoning", "language", "pattern_matching"]

    def __init__(self, difficulty: str, random_seed=42):
        """Set up the vocabulary and the generation settings.

        :param difficulty: "easy" makes every rule talk about the specific subject;
            "hard" renders test facts with the more specific attribute variants.
        :param random_seed: Seed applied to the ``random`` module in ``get_instances``.
        """
        super().__init__()
        self.attribute_groups, self.subjects = get_vocab()

        # Rules always name the specific subject rather than its category.
        self.specific_category: bool = difficulty == "easy"
        # Facts may use a more specific variant in place of the general attribute.
        self.use_specific_attributes: bool = difficulty == "hard"
        self.include_intermediates: bool = False
        self.num_train_instances: int = 1000
        self.num_val_instances: int = 5000
        self.num_test_instances: int = 5000
        self.random_seed = random_seed

    def generate_problem(
        self,
    ) -> Tuple[List[MELTLanguageRule], MELTLanguageFact, List[MELTLanguageRule], MELTLanguageFact]:
        """Sample a subject, a rule set about it, and a test case for those rules.

        :return: ``(rules, test_fact, rules_used, target_fact)``.
        """
        category = random.choice(list(self.subjects.keys()))
        chosen_subject = random.choice(self.subjects[category])
        rules = generate_rules(
            self.attribute_groups, chosen_subject, category, specific_category=self.specific_category
        )
        fact, rules_used, target = generate_test(
            self.attribute_groups, chosen_subject, category, rules, self.use_specific_attributes
        )
        return rules, fact, rules_used, target

    def get_instances(self, output_path: str) -> List[Instance]:
        """Generate the train, validation and test instances, in that order.

        The ``random`` module is re-seeded with ``self.random_seed`` before generation.
        """
        random.seed(self.random_seed)

        # Index boundaries of the splits.
        train_end = self.num_train_instances
        valid_end = train_end + self.num_val_instances
        total = valid_end + self.num_test_instances

        instances: List[Instance] = []
        for sample_idx in range(total):
            rules, test_fact, test_rules_used, target_fact = self.generate_problem()
            subject_phrase = test_fact.generate_specified_subject(upper=False)

            # Prompt layout: rules, the fact, optionally the rules used, then the question.
            parts = ["\n".join(str(rule) for rule in rules) + "\n", f"Sự thật:\n{test_fact}\n"]
            if self.include_intermediates:
                parts.append("Luật đã dùng:\n" + "\n".join(str(used) for used in test_rules_used) + "\n")
            parts.append(f"Những điều sau đây có thể được xác định về {subject_phrase}:")
            question = "".join(parts)

            if sample_idx < train_end:
                split = TRAIN_SPLIT
            elif sample_idx < valid_end:
                split = VALID_SPLIT
            else:
                split = TEST_SPLIT

            instances.append(
                Instance(
                    input=Input(text=question),
                    references=[Reference(Output(text=str(target_fact)), tags=[CORRECT_TAG])],
                    split=split,
                )
            )

        return instances
@@ -0,0 +1,222 @@
1
+ from typing import Dict, List, Tuple
2
+
3
+ import numpy as np
4
+
5
+ from helm.benchmark.scenarios.scenario import (
6
+ Scenario,
7
+ Instance,
8
+ Reference,
9
+ TRAIN_SPLIT,
10
+ TEST_SPLIT,
11
+ VALID_SPLIT,
12
+ CORRECT_TAG,
13
+ Input,
14
+ Output,
15
+ )
16
+
17
+
18
# Vocabulary used to build substitution strings: animal names (all prefixed with "con") ...
ANIMALS = [
    f"con {animal}"
    for animal in (
        "ngựa vằn", "rắn hổ mang", "cò", "chim cánh cụt", "cá mập", "sư tử",
        "trâu", "cá voi", "hải cẩu", "đại bàng", "ngựa", "chuột",
    )
]
# ... and fruit names (all prefixed with "quả").
FRUITS = [
    f"quả {fruit}"
    for fruit in ("táo", "đào", "dưa hấu", "chuối", "nho", "kiwi", "lê", "dâu tây", "việt quất", "mâm xôi")
]
# Placeholder symbols that get substituted, and the operator symbols mixed into patterns.
RULE_SYMBOLS = list("XYZ")
MATH_SYMBOLS = list("+-*=")
46
+
47
+
48
def subst(pattern: List[str], rule_symbol: str, substitute_str: str) -> List[str]:
    """
    Replace every occurrence of one rule symbol in a pattern by a substitution string.

    example:
    pattern = ["A", "+", "B", "=", "B", "+", "A"]
    rule_symbol = "A"
    substitute_str = "quả táo"
    return: ["quả táo", "+", "B", "=", "B", "+", "quả táo"]

    :param pattern: A pattern (list of tokens) representing the rule.
    :param rule_symbol: The rule symbol to replace; it must occur in the pattern.
    :param substitute_str: The substitution string.
    :return: A new list with the substitution applied; the input list is not modified.
    """
    assert rule_symbol in pattern
    return [substitute_str if token == rule_symbol else token for token in pattern]
75
+
76
+
77
def pattern_subst(pattern: List[str], rule_symbols: List[str], substitute_dict: Dict[str, str]) -> List[str]:
    """
    Replace each of the given rule symbols in a pattern using a substitution dictionary.

    example:
    pattern = ["A", "+", "B", "=", "B", "+", "A"]
    rule_symbols = ["A", "B"]
    substitute_dict = {"A": "quả táo", "B": "quả đào"}
    return: ["quả táo", "+", "quả đào", "=", "quả đào", "+", "quả táo"]

    :param pattern: A pattern (list of tokens) representing the rule.
    :param rule_symbols: The rule symbols to replace, applied one after another in this order.
    :param substitute_dict: Maps each rule symbol to its substitution string.
    :return: The result of the substitution.
    """
    substituted = pattern
    # Apply the substitutions one symbol at a time, feeding each result into the next step.
    for rule_symbol in rule_symbols:
        replacement = substitute_dict[rule_symbol]
        substituted = subst(substituted, rule_symbol, replacement)
    return substituted
98
+
99
+
100
class MELTSyntheticReasoningScenario(Scenario):
    """
    Synthetic Reasoning benchmark inspired by
    "LIME: Learning Inductive Bias for Primitives of Mathematical Reasoning"
    https://arxiv.org/abs/2101.06223
    """

    name = "synthetic_reasoning"
    description = "Synthetic reasoning benchmark"
    tags = ["reasoning", "language", "pattern_matching"]

    def __init__(self, mode: str, random_seed=42):
        """Set up the split sizes and the seeded random generator.

        :param mode: One of "variable_substitution", "pattern_match" or "induction".
        :param random_seed: Seed of the ``numpy`` random generator used for all sampling.
        """
        super().__init__()
        self.num_train_instances: int = 1000
        self.num_val_instances: int = 5000
        self.num_test_instances: int = 5000
        self.rng = np.random.RandomState(random_seed)
        self.mode = mode
        assert self.mode in ["variable_substitution", "pattern_match", "induction"], f"Unsupported mode: {self.mode}"

    def gen_subst(self, rule_symbols: List[str], tokens: List[str]) -> Tuple[Dict[str, str], str]:
        """
        Sample, for each rule symbol, a substitution string made of randomly chosen tokens.

        :param rule_symbols: A list of rule symbols.
        :param tokens: Tokens used to construct the substitutions.
        :return: The substitution dictionary and its string representation, in which each
            symbol is rendered as `<symbol> bởi " <substitution> "` and entries are separated by commas.
        """
        substitute_dict = {}
        rendered = []
        for symbol in rule_symbols:
            n_tokens = self.rng.randint(1, 3)
            replacement = " ".join(self.rng.choice(tokens, size=n_tokens))
            substitute_dict[symbol] = replacement
            rendered.extend([symbol, "bởi", '"', replacement, '"', ","])
        # Drop the trailing comma before joining.
        return substitute_dict, " ".join(rendered[:-1])

    def gen_pattern(self, math_symbols: List[str], rule_symbols: List[str]) -> List[str]:
        """
        Generate a pattern: the rule symbols and math symbols together, in random order.

        Example Input: math_symbols: ["+", "="], rule_symbols: ["Y", "Y", "Z"]
        Example Output: ["Y", "Y", "+", "Z", "="]
        """
        shuffled = rule_symbols + math_symbols
        self.rng.shuffle(shuffled)
        return shuffled

    def get_instances(self, output_path: str) -> List[Instance]:
        """Generate the train, validation and test instances, in that order.

        All sampling goes through ``self.rng``, which is seeded once in ``__init__``.
        """
        tokens = ANIMALS + FRUITS

        # Index boundaries of the splits.
        train_end = self.num_train_instances
        valid_end = train_end + self.num_val_instances
        total = valid_end + self.num_test_instances

        instances: List[Instance] = []
        for sample_idx in range(total):
            # Sample rule symbols (with repetition) and the distinct ones among them.
            n_rule_symbols = self.rng.randint(2, 4)
            drawn_rule_symbols = list(self.rng.choice(RULE_SYMBOLS, size=n_rule_symbols))
            distinct_rule_symbols = sorted(set(drawn_rule_symbols))  # sorted to make it deterministic

            # Sample math symbols.
            n_math_symbols = self.rng.randint(2, 4)
            drawn_math_symbols = list(self.rng.choice(MATH_SYMBOLS, size=n_math_symbols))

            pattern = self.gen_pattern(drawn_math_symbols, drawn_rule_symbols)

            # Two independent substitutions of the same pattern; both are always sampled.
            first_map, first_map_str = self.gen_subst(distinct_rule_symbols, tokens)
            first_result = pattern_subst(pattern, distinct_rule_symbols, first_map)
            second_map, _ = self.gen_subst(distinct_rule_symbols, tokens)
            second_result = pattern_subst(pattern, distinct_rule_symbols, second_map)

            result_string = " ".join(first_result)
            pattern_string = " ".join(pattern)

            src: str
            tgt: str
            if self.mode == "induction":
                second_result_string = " ".join(second_result)
                src = f"Hai kết quả: {result_string} | {second_result_string}"
                tgt = f"Quy luật: {pattern_string}"
            elif self.mode == "variable_substitution":
                src = f"Các quy luật: {pattern_string} | Thay thế: {first_map_str}"
                tgt = " ".join(first_result)
            elif self.mode == "pattern_match":
                # Three additionally sampled patterns serve as negatives for pattern matching.
                candidates = [
                    " ".join(self.gen_pattern(drawn_math_symbols, distinct_rule_symbols)) for _ in range(3)
                ]
                candidates = candidates + [pattern_string]
                self.rng.shuffle(candidates)
                candidates_string = " | ".join(candidates)
                src = f"Các quy luật: {candidates_string} | Kết quả: {result_string}"
                tgt = pattern_string
            else:
                raise ValueError(f"Invalid mode: {self.mode}")

            split: str
            if sample_idx < train_end:
                split = TRAIN_SPLIT
            elif sample_idx < valid_end:
                split = VALID_SPLIT
            else:
                split = TEST_SPLIT

            instances.append(
                Instance(
                    input=Input(text=src),
                    references=[Reference(Output(text=tgt), tags=[CORRECT_TAG])],
                    split=split,
                )
            )

        return instances