crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +37 -2
  5. helm/benchmark/adaptation/adapters/adapter.py +4 -42
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
  9. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
  10. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
  11. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
  12. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  13. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  14. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
  15. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
  16. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
  17. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  18. helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
  19. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
  20. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
  21. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  22. helm/benchmark/adaptation/prompt.py +7 -1
  23. helm/benchmark/adaptation/request_state.py +6 -1
  24. helm/benchmark/adaptation/scenario_state.py +6 -2
  25. helm/benchmark/annotation/annotator.py +43 -0
  26. helm/benchmark/annotation/annotator_factory.py +61 -0
  27. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  28. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  29. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  30. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  31. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  32. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  33. helm/benchmark/annotation_executor.py +124 -0
  34. helm/benchmark/augmentations/cleva_perturbation.py +7 -14
  35. helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
  36. helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
  37. helm/benchmark/augmentations/data_augmenter.py +0 -2
  38. helm/benchmark/augmentations/dialect_perturbation.py +2 -2
  39. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  40. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  41. helm/benchmark/augmentations/gender_perturbation.py +3 -3
  42. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  43. helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
  44. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  45. helm/benchmark/augmentations/person_name_perturbation.py +0 -7
  46. helm/benchmark/augmentations/perturbation.py +20 -7
  47. helm/benchmark/augmentations/perturbation_description.py +1 -1
  48. helm/benchmark/augmentations/space_perturbation.py +2 -2
  49. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  50. helm/benchmark/augmentations/synonym_perturbation.py +2 -2
  51. helm/benchmark/augmentations/test_perturbation.py +11 -7
  52. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  53. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  54. helm/benchmark/config_registry.py +38 -0
  55. helm/benchmark/executor.py +46 -16
  56. helm/benchmark/huggingface_registration.py +37 -7
  57. helm/benchmark/metrics/basic_metrics.py +172 -641
  58. helm/benchmark/metrics/bbq_metrics.py +3 -4
  59. helm/benchmark/metrics/bias_metrics.py +6 -6
  60. helm/benchmark/metrics/classification_metrics.py +11 -8
  61. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  62. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  63. helm/benchmark/metrics/code_metrics.py +4 -3
  64. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  65. helm/benchmark/metrics/common_metric_specs.py +167 -0
  66. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  67. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  68. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  69. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  70. helm/benchmark/metrics/disinformation_metrics.py +6 -112
  71. helm/benchmark/metrics/dry_run_metrics.py +5 -3
  72. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  73. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  74. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  75. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  76. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  77. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  78. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  79. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  80. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  81. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  82. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  83. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  84. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  85. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  86. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  87. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  88. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  89. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  90. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  91. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  92. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  93. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  94. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  95. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  96. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  97. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  98. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  99. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  100. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  101. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  102. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  103. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  104. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  105. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  106. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  107. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  108. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  109. helm/benchmark/metrics/machine_translation_metrics.py +5 -5
  110. helm/benchmark/metrics/metric.py +93 -172
  111. helm/benchmark/metrics/metric_name.py +0 -1
  112. helm/benchmark/metrics/metric_service.py +16 -0
  113. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  114. helm/benchmark/metrics/ranking_metrics.py +6 -7
  115. helm/benchmark/metrics/reference_metric.py +148 -0
  116. helm/benchmark/metrics/summac/model_summac.py +0 -2
  117. helm/benchmark/metrics/summarization_metrics.py +8 -8
  118. helm/benchmark/metrics/test_classification_metrics.py +9 -6
  119. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  120. helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
  121. helm/benchmark/metrics/test_metric.py +2 -2
  122. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
  123. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
  124. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  125. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
  126. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
  127. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  128. helm/benchmark/metrics/toxicity_utils.py +23 -0
  129. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  130. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  131. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  132. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  133. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  134. helm/benchmark/model_deployment_registry.py +164 -41
  135. helm/benchmark/model_metadata_registry.py +181 -35
  136. helm/benchmark/multi_gpu_runner.py +133 -0
  137. helm/benchmark/presentation/contamination.py +3 -3
  138. helm/benchmark/presentation/create_plots.py +8 -7
  139. helm/benchmark/presentation/run_display.py +50 -17
  140. helm/benchmark/presentation/schema.py +28 -46
  141. helm/benchmark/presentation/summarize.py +213 -96
  142. helm/benchmark/presentation/table.py +8 -8
  143. helm/benchmark/presentation/test_contamination.py +2 -2
  144. helm/benchmark/presentation/test_run_entry.py +14 -9
  145. helm/benchmark/presentation/test_summarize.py +5 -0
  146. helm/benchmark/run.py +66 -54
  147. helm/benchmark/run_expander.py +342 -31
  148. helm/benchmark/run_spec.py +93 -0
  149. helm/benchmark/run_spec_factory.py +162 -0
  150. helm/benchmark/run_specs/__init__.py +0 -0
  151. helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
  152. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  153. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  154. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  155. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  156. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  157. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  158. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  159. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  160. helm/benchmark/runner.py +116 -69
  161. helm/benchmark/runner_config_registry.py +21 -0
  162. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  163. helm/benchmark/scenarios/bold_scenario.py +2 -2
  164. helm/benchmark/scenarios/cleva_scenario.py +43 -46
  165. helm/benchmark/scenarios/code_scenario.py +3 -2
  166. helm/benchmark/scenarios/commonsense_scenario.py +171 -191
  167. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  168. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  169. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  170. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  171. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  172. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  173. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  174. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  175. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  176. helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
  177. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  178. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  179. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  180. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  181. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  182. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  183. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  184. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  185. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  186. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  187. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  188. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  189. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  190. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  191. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  192. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  193. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  194. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  195. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  196. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  197. helm/benchmark/scenarios/legalbench_scenario.py +123 -0
  198. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  199. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  200. helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
  201. helm/benchmark/scenarios/math_scenario.py +19 -2
  202. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  203. helm/benchmark/scenarios/numeracy_scenario.py +3 -3
  204. helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
  205. helm/benchmark/scenarios/raft_scenario.py +2 -6
  206. helm/benchmark/scenarios/scenario.py +14 -2
  207. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  208. helm/benchmark/scenarios/test_math_scenario.py +22 -0
  209. helm/benchmark/scenarios/test_scenario.py +6 -3
  210. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  211. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  212. helm/benchmark/scenarios/the_pile_scenario.py +6 -7
  213. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  214. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  215. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  216. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  217. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  218. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  219. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  220. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  221. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  222. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  223. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  224. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  225. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  226. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  227. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  228. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  229. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  230. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  231. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  232. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  233. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  234. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  235. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  236. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  237. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
  238. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  239. helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
  240. helm/benchmark/server.py +59 -2
  241. helm/benchmark/slurm_jobs.py +12 -0
  242. helm/benchmark/slurm_runner.py +79 -51
  243. helm/benchmark/static/benchmarking.js +3 -4
  244. helm/benchmark/static/contamination.yaml +1 -1
  245. helm/benchmark/static/images/organizations/together.png +0 -0
  246. helm/benchmark/static/json-urls.js +4 -0
  247. helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
  248. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  249. helm/benchmark/static/schema_lite.yaml +824 -0
  250. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  251. helm/benchmark/static/schema_unitxt.yaml +428 -0
  252. helm/benchmark/static/schema_vlm.yaml +576 -0
  253. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  254. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  255. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  256. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  257. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  258. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  259. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  260. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  261. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  262. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  263. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  264. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  265. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  266. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  267. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  268. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  269. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  270. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  271. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  272. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  273. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  274. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  275. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  276. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  277. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  278. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  279. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  280. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  281. helm/benchmark/static_build/config.js +4 -0
  282. helm/benchmark/static_build/index.html +20 -0
  283. helm/benchmark/test_data_preprocessor.py +3 -3
  284. helm/benchmark/test_model_deployment_definition.py +90 -0
  285. helm/benchmark/test_run_expander.py +1 -1
  286. helm/benchmark/tokenizer_config_registry.py +10 -14
  287. helm/benchmark/window_services/ai21_window_service.py +22 -33
  288. helm/benchmark/window_services/cohere_window_service.py +1 -63
  289. helm/benchmark/window_services/default_window_service.py +2 -35
  290. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  291. helm/benchmark/window_services/ice_window_service.py +0 -34
  292. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  293. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  294. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  295. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  296. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  297. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  298. helm/benchmark/window_services/local_window_service.py +21 -4
  299. helm/benchmark/window_services/no_decoding_window_service.py +32 -0
  300. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  301. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  302. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  303. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  304. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  305. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  306. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  307. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  308. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  309. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  310. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  311. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  312. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  313. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  314. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  315. helm/benchmark/window_services/test_utils.py +3 -2
  316. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  317. helm/benchmark/window_services/window_service.py +42 -0
  318. helm/benchmark/window_services/window_service_factory.py +24 -269
  319. helm/benchmark/window_services/yalm_window_service.py +0 -27
  320. helm/clients/__init__.py +0 -0
  321. helm/{proxy/clients → clients}/ai21_client.py +5 -12
  322. helm/clients/aleph_alpha_client.py +112 -0
  323. helm/{proxy/clients → clients}/anthropic_client.py +213 -24
  324. helm/clients/auto_client.py +215 -0
  325. helm/clients/bedrock_client.py +128 -0
  326. helm/clients/bedrock_utils.py +72 -0
  327. helm/{proxy/clients → clients}/client.py +67 -55
  328. helm/clients/clip_score_client.py +49 -0
  329. helm/clients/clip_scorers/__init__.py +0 -0
  330. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  331. helm/clients/clip_scorers/clip_scorer.py +50 -0
  332. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  333. helm/{proxy/clients → clients}/cohere_client.py +6 -17
  334. helm/clients/gcs_client.py +82 -0
  335. helm/{proxy/clients → clients}/google_client.py +7 -8
  336. helm/clients/google_translate_client.py +35 -0
  337. helm/{proxy/clients → clients}/http_model_client.py +6 -10
  338. helm/{proxy/clients → clients}/huggingface_client.py +134 -92
  339. helm/clients/image_generation/__init__.py +0 -0
  340. helm/clients/image_generation/adobe_vision_client.py +78 -0
  341. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  342. helm/clients/image_generation/cogview2/__init__.py +0 -0
  343. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  344. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  345. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  346. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  347. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  348. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  349. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  350. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  351. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  352. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  353. helm/clients/image_generation/cogview2_client.py +191 -0
  354. helm/clients/image_generation/dalle2_client.py +192 -0
  355. helm/clients/image_generation/dalle3_client.py +108 -0
  356. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  357. helm/clients/image_generation/dalle_mini/data.py +442 -0
  358. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  359. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  360. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  361. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  362. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  363. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  364. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  365. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  366. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  367. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  368. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  369. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  370. helm/clients/image_generation/dalle_mini_client.py +190 -0
  371. helm/clients/image_generation/deep_floyd_client.py +78 -0
  372. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  373. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  374. helm/clients/image_generation/lexica_client.py +86 -0
  375. helm/clients/image_generation/mindalle/__init__.py +0 -0
  376. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  377. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  378. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  379. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  380. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  381. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  382. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  383. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  384. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  385. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  386. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  387. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  388. helm/clients/image_generation/mindalle_client.py +115 -0
  389. helm/clients/image_generation/nudity_check_client.py +64 -0
  390. helm/clients/image_generation/together_image_generation_client.py +111 -0
  391. helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
  392. helm/{proxy/clients → clients}/megatron_client.py +13 -7
  393. helm/clients/mistral_client.py +134 -0
  394. helm/clients/moderation_api_client.py +109 -0
  395. helm/clients/open_lm_client.py +43 -0
  396. helm/clients/openai_client.py +302 -0
  397. helm/{proxy/clients → clients}/palmyra_client.py +15 -12
  398. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  399. helm/clients/simple_client.py +64 -0
  400. helm/{proxy/clients → clients}/test_auto_client.py +15 -15
  401. helm/clients/test_client.py +100 -0
  402. helm/clients/test_huggingface_client.py +70 -0
  403. helm/clients/test_simple_client.py +19 -0
  404. helm/{proxy/clients → clients}/test_together_client.py +23 -12
  405. helm/{proxy/clients → clients}/together_client.py +18 -71
  406. helm/clients/vertexai_client.py +391 -0
  407. helm/clients/vision_language/__init__.py +0 -0
  408. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  409. helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
  410. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  411. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  412. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  413. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  414. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  415. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  416. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  417. helm/clients/vision_language/open_flamingo_client.py +155 -0
  418. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  419. helm/clients/vllm_client.py +46 -0
  420. helm/common/cache.py +24 -179
  421. helm/common/cache_backend_config.py +47 -0
  422. helm/common/clip_score_request.py +41 -0
  423. helm/common/concurrency.py +32 -0
  424. helm/common/credentials_utils.py +28 -0
  425. helm/common/file_caches/__init__.py +0 -0
  426. helm/common/file_caches/file_cache.py +16 -0
  427. helm/common/file_caches/local_file_cache.py +61 -0
  428. helm/common/file_caches/test_local_file_cache.py +25 -0
  429. helm/common/file_upload_request.py +27 -0
  430. helm/common/general.py +29 -10
  431. helm/common/image_generation_parameters.py +25 -0
  432. helm/common/images_utils.py +24 -1
  433. helm/common/key_value_store.py +113 -0
  434. helm/common/media_object.py +13 -0
  435. helm/common/moderations_api_request.py +71 -0
  436. helm/common/mongo_key_value_store.py +88 -0
  437. helm/common/multimodal_request_utils.py +31 -0
  438. helm/common/nudity_check_request.py +29 -0
  439. helm/common/object_spec.py +2 -2
  440. helm/common/request.py +36 -27
  441. helm/common/test_general.py +6 -0
  442. helm/common/tokenization_request.py +6 -3
  443. helm/config/__init__.py +0 -0
  444. helm/config/model_deployments.yaml +1942 -0
  445. helm/config/model_metadata.yaml +2201 -0
  446. helm/config/tokenizer_configs.yaml +362 -0
  447. helm/proxy/accounts.py +31 -4
  448. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  449. helm/proxy/critique/model_critique_client.py +13 -5
  450. helm/proxy/example_queries.py +29 -17
  451. helm/proxy/retry.py +8 -2
  452. helm/proxy/server.py +77 -5
  453. helm/proxy/services/remote_service.py +31 -0
  454. helm/proxy/services/server_service.py +103 -20
  455. helm/proxy/services/service.py +34 -2
  456. helm/proxy/services/test_remote_service.py +7 -6
  457. helm/proxy/services/test_service.py +27 -18
  458. helm/proxy/test_accounts.py +32 -0
  459. helm/proxy/token_counters/auto_token_counter.py +37 -37
  460. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  461. helm/proxy/token_counters/token_counter.py +3 -5
  462. helm/py.typed +0 -0
  463. helm/tokenizers/__init__.py +0 -0
  464. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  465. helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
  466. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
  467. helm/tokenizers/auto_tokenizer.py +93 -0
  468. helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
  469. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  470. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  471. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
  472. helm/tokenizers/simple_tokenizer.py +33 -0
  473. helm/tokenizers/test_anthropic_tokenizer.py +82 -0
  474. helm/tokenizers/test_huggingface_tokenizer.py +136 -0
  475. helm/tokenizers/test_simple_tokenizer.py +33 -0
  476. helm/tokenizers/vertexai_tokenizer.py +97 -0
  477. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  478. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  479. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  480. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  481. crfm_helm-0.3.0.dist-info/RECORD +0 -396
  482. helm/benchmark/vlm_run_specs.py +0 -71
  483. helm/benchmark/window_services/anthropic_window_service.py +0 -68
  484. helm/benchmark/window_services/bloom_window_service.py +0 -35
  485. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  486. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  487. helm/benchmark/window_services/gptj_window_service.py +0 -38
  488. helm/benchmark/window_services/gptneox_window_service.py +0 -41
  489. helm/benchmark/window_services/http_model_window_service.py +0 -28
  490. helm/benchmark/window_services/huggingface_window_service.py +0 -59
  491. helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
  492. helm/benchmark/window_services/llama_window_service.py +0 -28
  493. helm/benchmark/window_services/luminous_window_service.py +0 -67
  494. helm/benchmark/window_services/megatron_window_service.py +0 -10
  495. helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
  496. helm/benchmark/window_services/openai_window_service.py +0 -13
  497. helm/benchmark/window_services/opt_window_service.py +0 -35
  498. helm/benchmark/window_services/palmyra_window_service.py +0 -45
  499. helm/benchmark/window_services/remote_window_service.py +0 -48
  500. helm/benchmark/window_services/santacoder_window_service.py +0 -27
  501. helm/benchmark/window_services/starcoder_window_service.py +0 -27
  502. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  503. helm/benchmark/window_services/t511b_window_service.py +0 -30
  504. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  505. helm/benchmark/window_services/ul2_window_service.py +0 -30
  506. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  507. helm/benchmark/window_services/wider_openai_window_service.py +0 -52
  508. helm/proxy/clients/aleph_alpha_client.py +0 -99
  509. helm/proxy/clients/auto_client.py +0 -461
  510. helm/proxy/clients/goose_ai_client.py +0 -100
  511. helm/proxy/clients/microsoft_client.py +0 -182
  512. helm/proxy/clients/openai_client.py +0 -206
  513. helm/proxy/clients/remote_model_registry.py +0 -28
  514. helm/proxy/clients/simple_client.py +0 -61
  515. helm/proxy/clients/test_anthropic_client.py +0 -63
  516. helm/proxy/clients/test_client.py +0 -31
  517. helm/proxy/clients/test_huggingface_client.py +0 -87
  518. helm/proxy/models.py +0 -963
  519. helm/proxy/test_models.py +0 -27
  520. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  521. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  522. helm/proxy/token_counters/free_token_counter.py +0 -12
  523. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  524. helm/proxy/token_counters/openai_token_counter.py +0 -22
  525. helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
  526. helm/proxy/token_counters/test_openai_token_counter.py +0 -79
  527. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  528. helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
  529. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  530. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  531. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  532. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  533. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  534. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  535. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  536. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  537. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  538. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  539. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  540. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  541. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  542. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  543. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  544. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  545. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  546. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -0,0 +1,185 @@
1
+ import json
2
+ import os
3
+ from typing import List
4
+
5
+ from helm.common.general import ensure_file_downloaded
6
+ from .scenario import CORRECT_TAG, Reference, Scenario, Instance, Input, TEST_SPLIT, Output
7
+
8
+
9
class LMEntryScenario(Scenario):
    """
    The LMentry Benchmark
    https://arxiv.org/pdf/2211.02069.pdf

    The implementation is with reference to the original repo: https://github.com/aviaefrat/lmentry
    The data is also downloaded from the repo.

    LMentry evaluates LM's abilities of performing elementary language tasks. Examples include
    finding which word is shorter, or which word is the last in a sentence.

    Each task maps to one or more subtask JSON files (see `task_to_subtasks`); all examples from
    all subtask files of the selected task are emitted as test instances.
    """

    name = "lm_entry"
    description = "The LMentry benchmark for elementary language tasks"
    tags: List[str] = []
    url_template = "https://raw.githubusercontent.com/aviaefrat/lmentry/main/data/{subtask}.json"
    task_to_subtasks = {
        "all_words_from_category": [
            "all_words_from_category",
            "all_words_from_category_0_distractors",
            "all_words_from_category_1_distractors",
            "all_words_from_category_2_distractors",
        ],
        "any_words_from_category": [
            "any_words_from_category",
            "any_words_from_category_3_distractors",
            "any_words_from_category_4_distractors",
            "any_words_from_category_5_distractors",
        ],
        "bigger_number": ["bigger_number"],
        # "ends_with_letter": ["ends_with_letter"],  # HELM's metrics currently don't support this
        # "ends_with_word": ["ends_with_word"],  # HELM's metrics currently don't support this
        "first_alphabetically": [
            "first_alphabetically",
            "first_alphabetically_consecutive_first_letter",
            "first_alphabetically_different_first_letter",
            "first_alphabetically_far_first_letter",
            "first_alphabetically_same_first_letter",
        ],
        "first_letter": ["first_letter"],
        "first_word": ["first_word"],
        "homophones": ["homophones"],
        "last_letter": ["last_letter"],
        "last_word": ["last_word"],
        "least_associated_word": ["least_associated_word"],
        "less_letters": ["less_letters", "less_letters_length_diff_1", "less_letters_length_diff_3plus"],
        "more_letters": ["more_letters", "more_letters_length_diff_1", "more_letters_length_diff_3plus"],
        "most_associated_word": ["most_associated_word"],
        "rhyming_word": [
            "rhyming_word",
            "rhyming_word_orthographically_different",
            "rhyming_word_orthographically_similar",
        ],
        # "sentence_containing": ["sentence_containing"],  # HELM's metrics currently don't support this
        # "sentence_not_containing": ["sentence_not_containing"],  # HELM's metrics currently don't support this
        "smaller_number": ["smaller_number"],
        # "starts_with_letter": ["starts_with_letter"],  # HELM's metrics currently don't support this
        # "starts_with_word": ["starts_with_word"],  # HELM's metrics currently don't support this
        "word_after": ["word_after"],
        "word_before": ["word_before"],
        # "word_containing": ["word_containing"],  # HELM's metrics currently don't support this
        # "word_not_containing": ["word_not_containing"],  # HELM's metrics currently don't support this
    }

    def __init__(self, task: str):
        """Select the LMentry task to evaluate; `task` must be a key of `task_to_subtasks`."""
        super().__init__()
        assert task in self.task_to_subtasks, f"Unsupported task: {task}"
        self.task: str = task

    def get_instances(self, output_path: str) -> List[Instance]:
        """Download every subtask file of the selected task and convert its examples to instances.

        Args:
            output_path: Directory in which the downloaded `<subtask>.json` files are stored.

        Returns:
            One test-split `Instance` per example, in file order then example order.

        Raises:
            ValueError: If `self.task` has no reference-building rule below.
        """
        # Download the raw data
        data_paths: List[str] = []

        for subtask in self.task_to_subtasks[self.task]:
            data_path: str = os.path.join(output_path, f"{subtask}.json")
            ensure_file_downloaded(
                source_url=self.url_template.format(subtask=subtask),
                target_path=data_path,
                unpack=False,
            )
            data_paths.append(data_path)

        def generate_references_for_multiple_choice_question(
            options: List[str], correct_answer: str
        ) -> List[Reference]:
            # One reference per option; only the option equal to the answer is tagged correct.
            return [
                Reference(Output(text=option), tags=[CORRECT_TAG] if option == correct_answer else [])
                for option in options
            ]

        def generate_references_for_generation_question(correct_answer: str) -> List[Reference]:
            # A generation question is a multiple-choice question whose only option is the answer.
            return generate_references_for_multiple_choice_question(
                options=[correct_answer], correct_answer=correct_answer
            )

        instances: List[Instance] = []

        for data_path in data_paths:
            # Use a context manager so the file handle is closed deterministically,
            # and read as UTF-8 rather than the platform default encoding.
            with open(data_path, encoding="utf-8") as data_file:
                data: dict = json.load(data_file)

            for example in data["examples"].values():
                input_text: str = example["input"]

                # Normalize the input text to the same format
                if input_text.startswith("Q: "):
                    input_text = input_text[3:]

                if input_text.endswith("\n"):
                    input_text = input_text[:-1]
                elif input_text.endswith("\nA:"):
                    input_text = input_text[:-3]

                instance_input = Input(text=input_text)
                references: List[Reference]

                if self.task == "all_words_from_category":
                    correct_answer = "yes" if example["metadata"]["num_distractors"] == 0 else "no"
                    references = generate_references_for_multiple_choice_question(
                        options=["yes", "no"], correct_answer=correct_answer
                    )
                elif self.task == "any_words_from_category":
                    correct_answer = "yes" if len(example["metadata"]["category_words"]) > 0 else "no"
                    references = generate_references_for_multiple_choice_question(
                        options=["yes", "no"], correct_answer=correct_answer
                    )
                elif self.task in ("bigger_number", "smaller_number"):
                    # Numbers are stored as ints in the metadata; references hold text.
                    references = generate_references_for_multiple_choice_question(
                        options=[str(example["metadata"]["n1"]), str(example["metadata"]["n2"])],
                        correct_answer=str(example["metadata"]["answer"]),
                    )
                elif self.task in ("first_alphabetically", "less_letters", "more_letters"):
                    references = generate_references_for_multiple_choice_question(
                        options=[example["metadata"]["word1"], example["metadata"]["word2"]],
                        correct_answer=example["metadata"]["answer"],
                    )
                elif self.task in ("homophones", "rhyming_word"):
                    references = generate_references_for_multiple_choice_question(
                        options=[example["metadata"]["answer"], example["metadata"]["distractor"]],
                        correct_answer=example["metadata"]["answer"],
                    )
                elif self.task in ("least_associated_word", "most_associated_word"):
                    references = generate_references_for_multiple_choice_question(
                        options=[example["metadata"]["answer"]] + example["metadata"]["distractors"],
                        correct_answer=example["metadata"]["answer"],
                    )
                elif self.task in (
                    "first_letter",
                    "last_letter",
                    "first_word",
                    "last_word",
                    "word_before",
                    "word_after",
                ):
                    references = generate_references_for_generation_question(example["metadata"]["answer"])
                else:
                    raise ValueError(f"Unsupported task: {self.task}")

                instances.append(
                    Instance(
                        input=instance_input,
                        references=references,
                        split=TEST_SPLIT,
                    )
                )

        return instances
@@ -101,9 +101,11 @@ class LSATScenario(Scenario):
101
101
 
102
102
  def get_question_types(self, tags: List[str]) -> List[str]:
103
103
  question_type: str = tags[2].replace("grouping (distribution)", "distribution grouping") or "miscellaneous"
104
+ types = [question_type.replace(" ", "_").replace("/", "_")]
104
105
  main_type = self.subtype2type.get(question_type)
105
- assert main_type
106
- return [question_type.replace(" ", "_").replace("/", "_"), main_type]
106
+ if main_type is not None:
107
+ types.append(main_type)
108
+ return types
107
109
 
108
110
  def get_instances(self, output_path: str) -> List[Instance]:
109
111
  data_path = os.path.join(output_path, "data")
@@ -1,9 +1,20 @@
1
1
  import collections
2
+ import os
2
3
  import typing
3
4
  from typing import Dict, List, Optional
4
5
  from datasets import load_dataset, DatasetDict
5
6
 
6
- from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
7
+ from helm.common.general import ensure_directory_exists
8
+ from helm.benchmark.scenarios.scenario import (
9
+ Scenario,
10
+ Instance,
11
+ Reference,
12
+ TRAIN_SPLIT,
13
+ TEST_SPLIT,
14
+ CORRECT_TAG,
15
+ Input,
16
+ Output,
17
+ )
7
18
 
8
19
 
9
20
  def remove_boxed(string: str) -> Optional[str]:
@@ -354,7 +365,13 @@ class MATHScenario(Scenario):
354
365
 
355
366
  def get_instances(self, output_path: str) -> List[Instance]:
356
367
  dataset = {}
357
- data = typing.cast(DatasetDict, load_dataset("competition_math", ignore_verifications=True))
368
+ cache_dir = os.path.join(output_path, "data")
369
+ ensure_directory_exists(cache_dir)
370
+ data = (
371
+ typing.cast(DatasetDict, load_dataset("competition_math", cache_dir=cache_dir))
372
+ .sort("problem")
373
+ .shuffle(seed=42)
374
+ )
358
375
 
359
376
  def group_by_key(dataset_list, key):
360
377
  dataset_per_key = collections.defaultdict(list)
@@ -0,0 +1,60 @@
1
+ import os
2
+ from typing import List
3
+
4
+ import pandas as pd
5
+
6
+ from helm.common.general import ensure_file_downloaded
7
+
8
+ from .scenario import CORRECT_TAG, TEST_SPLIT, Input, Instance, Output, Reference, Scenario
9
+
10
+
11
class MedicationQAScenario(Scenario):
    """
    The gold standard corpus for medication question answering introduced in the MedInfo 2019 paper
    "Bridging the Gap between Consumers’ Medication Questions and Trusted Answers":
    http://ebooks.iospress.nl/publication/51941

    This dataset has consumer questions, as opposed to very clinical questions.

    Paper citation:

        @inproceedings{BenAbacha:MEDINFO19,
            author    = {Asma {Ben Abacha} and Yassine Mrabet and Mark Sharp and
                         Travis Goodwin and Sonya E. Shooshan and Dina Demner{-}Fushman},
            title     = {Bridging the Gap between Consumers’ Medication Questions and Trusted Answers},
            booktitle = {MEDINFO 2019},
            year      = {2019},
        }
    """

    SOURCE_REPO_URL = "https://github.com/abachaa/Medication_QA_MedInfo2019/raw/master/"
    FILENAME = "MedInfo2019-QA-Medications.xlsx"

    name = "medication_qa"
    description = "MedInfo 2019 MedicationQA medication question answering task"
    tags = ["knowledge", "generation", "question_answering", "biomedical"]

    def download_medication_qa(self, path: str):
        """Download the .xlsx spreadsheet containing the question-answer pairs into `path`."""
        # Build the URL with explicit "/" joining: os.path.join follows filesystem
        # path rules (platform-specific separators), which is not what a URL needs.
        source_url = f"{self.SOURCE_REPO_URL.rstrip('/')}/{self.FILENAME}"
        ensure_file_downloaded(
            source_url=source_url,
            target_path=os.path.join(path, self.FILENAME),
            unpack=False,
        )

    def get_instances(self, output_path: str) -> List[Instance]:
        """Load the spreadsheet and return one test instance per answered question.

        Args:
            output_path: Directory where the spreadsheet is downloaded and read from.

        Returns:
            Test-split instances whose single reference is the answer, tagged correct.
        """
        self.download_medication_qa(output_path)
        data_path = os.path.join(output_path, self.FILENAME)

        data = pd.read_excel(data_path)
        data = data[~data.Answer.isna()]  # remove rows missing answers

        return [
            Instance(
                input=Input(text=question),
                references=[Reference(Output(text=answer), tags=[CORRECT_TAG])],
                split=TEST_SPLIT,
            )
            for question, answer in zip(data.Question, data.Answer)
        ]
@@ -3,7 +3,7 @@ from collections import defaultdict
3
3
  from dataclasses import dataclass, field
4
4
  from itertools import combinations_with_replacement, product
5
5
  import math
6
- from math import comb # type: ignore
6
+ from math import comb
7
7
  import numpy as np
8
8
  import numpy.typing as npt
9
9
  import random
@@ -358,7 +358,7 @@ def distance_paraboloid(point: List[int], rel_str: str, TOL: float = 1e-10):
358
358
  sols = []
359
359
  # Try each possible combined solution for x, y, z, λ
360
360
  for sol_xyz, val_λs in zip(sols_xyz, vals_λ):
361
- val_λs = list(set(filter(lambda _: not _.is_symbol, val_λs))) # get distinct values for λ if there are any
361
+ val_λs = tuple(set(filter(lambda _: not _.is_symbol, val_λs))) # get distinct values for λ if there are any
362
362
  if len(val_λs) > 1: # there can be at most one distinct value for λ
363
363
  continue
364
364
  val_λ = val_λs[0] if val_λs else λ
@@ -544,7 +544,7 @@ def get_numeracy_adapter_spec(
544
544
  "max_eval_instances": max_eval_instances,
545
545
  "num_outputs": 1,
546
546
  "num_train_trials": 1,
547
- "model": "openai/davinci",
547
+ "model_deployment": "openai/davinci",
548
548
  "temperature": 0,
549
549
  "stop_sequences": ["\n"],
550
550
  "max_tokens": 20,
@@ -107,18 +107,17 @@ class OpinionsQAScenario(Scenario):
107
107
  self.survey_type: str = survey_type
108
108
  self.context: str = context
109
109
 
110
- def download_data(self):
111
-
112
- output_path: str = os.path.join(output_path, "data")
113
- if not os.path.exists(output_path):
114
- os.makedirs(output_path)
110
+ def download_data(self, output_path: str):
111
+ data_dir: str = os.path.join(output_path, "data")
112
+ if not os.path.exists(data_dir):
113
+ os.makedirs(data_dir)
115
114
 
116
115
  DOWNLOAD_FILENAMES = [self.FILE_NAME.format(wave=wave) for wave in self.PEW_SURVEY_WAVES]
117
116
  DOWNLOAD_FILENAMES += [f"{steer}.csv" for steer in ["steer-qa", "steer-bio", "steer-portray"]]
118
117
  DOWNLOAD_FILENAMES += ["Pew_American_Trends_Panel_disagreement_500.csv"]
119
118
 
120
119
  for filename in DOWNLOAD_FILENAMES:
121
- data_path: str = os.path.join(output_path, filename)
120
+ data_path: str = os.path.join(data_dir, filename)
122
121
 
123
122
  source_url: str = self.CODALAB_URI_TEMPLATE.format(bundle=self.CODALAB_BUNDLE, filename=filename)
124
123
  ensure_file_downloaded(source_url=source_url, target_path=data_path, downloader_executable="gdown")
@@ -129,7 +128,7 @@ class OpinionsQAScenario(Scenario):
129
128
  return df
130
129
 
131
130
  def get_instances(self, output_path: str) -> List[Instance]:
132
- self.download_data()
131
+ self.download_data(output_path)
133
132
 
134
133
  # Read all the instances
135
134
  instances: List[Instance] = []
@@ -150,14 +149,12 @@ class OpinionsQAScenario(Scenario):
150
149
  bios_df = pd.read_csv(bios_path, sep="\t")
151
150
 
152
151
  for split in all_splits:
153
-
154
152
  csv_path: str = csv_dict[split]
155
153
  assert os.path.exists(csv_path)
156
154
 
157
155
  question_df = self.read_survey_questions(csv_path)
158
156
 
159
157
  for qidx, (question, answers) in enumerate(zip(question_df["question"], question_df["options"])):
160
-
161
158
  # Opinions QA test questions have no correct answer and thus we set it to be None by default
162
159
  # for all test instances.
163
160
  # In the case where context = steer-qa, we add demographic information in the form of a
@@ -182,7 +179,6 @@ class OpinionsQAScenario(Scenario):
182
179
  else:
183
180
  # context = "steer-bio"or "steer-portray"
184
181
  for bio in bios_df["question"].values:
185
-
186
182
  context = PassageQuestionInput(passage=bio, question=question + "\n")
187
183
  instance = Instance(
188
184
  context,
@@ -1,7 +1,6 @@
1
1
  import random
2
2
  import os
3
3
  import json
4
- import tempfile
5
4
  import datasets
6
5
  from pathlib import Path
7
6
  from typing import List, Dict
@@ -26,12 +25,9 @@ SUBSETS = [
26
25
  ]
27
26
 
28
27
 
29
- def get_raft_prompt_settings(subset: str, cache_dir=None):
28
+ def get_raft_prompt_settings(subset: str, cache_dir: str):
30
29
  assert subset in SUBSETS, "Unknown subset: {}".format(subset)
31
30
 
32
- if cache_dir is None:
33
- cache_dir = tempfile.gettempdir()
34
-
35
31
  prompt_construction_settings_path = os.path.join(cache_dir, "prompt_construction_settings.json")
36
32
  ensure_directory_exists(cache_dir)
37
33
  ensure_file_downloaded(
@@ -44,7 +40,7 @@ def get_raft_prompt_settings(subset: str, cache_dir=None):
44
40
  return field_ordering[subset], instructions[subset]
45
41
 
46
42
 
47
- def get_raft_instructions(subset: str, cache_dir=None):
43
+ def get_raft_instructions(subset: str, cache_dir: str):
48
44
  return get_raft_prompt_settings(subset, cache_dir)[1]
49
45
 
50
46
 
@@ -1,12 +1,13 @@
1
1
  from abc import ABC, abstractmethod
2
2
  from dataclasses import dataclass, field, replace
3
3
  from typing import List, Optional, Tuple
4
+ import os
4
5
  from pathlib import PurePath
5
6
  import inspect
6
7
 
7
8
  from helm.common.media_object import MultimediaObject
8
9
  from helm.common.object_spec import ObjectSpec, create_object
9
- from helm.common.general import format_text, format_split, format_tags, indent_lines
10
+ from helm.common.general import ensure_directory_exists, format_text, format_split, format_tags, indent_lines
10
11
  from helm.benchmark.augmentations.perturbation_description import PerturbationDescription
11
12
 
12
13
  """ Data splits """
@@ -24,6 +25,10 @@ DEFAULT_TEST_SIZE: int = 1000
24
25
  """ Reference tags """
25
26
  CORRECT_TAG: str = "correct"
26
27
 
28
+ """ Asset tags (used for compiled outputs such as image2structure)"""
29
+ ASSET_NAME_TAG: str = "asset_name"
30
+ ASSET_PATH_TAG: str = "asset_path"
31
+
27
32
  # Reference tag functions for ranking scenarios.
28
33
  # @TODO: (For future) Should there be a base RankingScenario class?
29
34
 
@@ -177,7 +182,7 @@ class Instance:
177
182
 
178
183
 
179
184
  # TODO(#1212): Scenario should not be a dataclass.
180
- @dataclass # type: ignore
185
+ @dataclass
181
186
  class Scenario(ABC):
182
187
  """
183
188
  A scenario represents a (task, data distribution).
@@ -249,3 +254,10 @@ class ScenarioSpec(ObjectSpec):
249
254
  def create_scenario(scenario_spec: ScenarioSpec) -> Scenario:
250
255
  """Construct the scenario and set some fields."""
251
256
  return create_object(scenario_spec)
257
+
258
+
259
def get_scenario_cache_path(benchmark_output_path: str, scenario_name: str):
    """Create (if needed) and return `<benchmark_output_path>/scenarios/<scenario_name>`.

    The returned directory is where a Scenario can cache temporary data.
    """
    cache_dir: str = os.path.join(benchmark_output_path, "scenarios", scenario_name)
    ensure_directory_exists(cache_dir)
    return cache_dir
@@ -1,7 +1,128 @@
1
+ """Simple scenarios for debugging and for tutorials.
2
+
3
+ NOTE: Typically, each scenario should be in its own file,
4
+ but these scenarios are placed in the same module for
5
+ tutorial purposes."""
6
+
1
7
  import random
2
8
  from typing import List
3
9
 
4
- from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
10
+ from helm.benchmark.scenarios.scenario import (
11
+ Scenario,
12
+ Instance,
13
+ Reference,
14
+ TRAIN_SPLIT,
15
+ TEST_SPLIT,
16
+ CORRECT_TAG,
17
+ Input,
18
+ Output,
19
+ )
20
+
21
+
22
class SimpleMCQAScenario(Scenario):
    """Simple multiple-choice question answering scenario for tutorials and debugging.

    The task is to answer questions about whether two-digit numbers are even or odd.

    Example:

        Answer the following questions with a single letter only.

        Question: Is 24 even or odd?
        A. Even
        B. Odd
        Answer: A"""

    name = "simple_mcqa"
    description = "Answer if two-digit numbers are even or odd."
    tags = ["question answering"]

    def get_instances(self, output_path: str) -> List[Instance]:
        """Build one instance per two-digit number; 10..20 are train, the rest are test."""
        instances: List[Instance] = []
        for number in range(10, 100):
            is_even = number % 2 == 0
            # NOTE: Every instance offers the same two options ("Even", "Odd");
            # only the CORRECT_TAG placement differs. In most question answering
            # scenarios the options themselves vary between questions.
            references = [
                Reference(Output(text="Even"), tags=[CORRECT_TAG] if is_even else []),
                Reference(Output(text="Odd"), tags=[] if is_even else [CORRECT_TAG]),
            ]
            instances.append(
                Instance(
                    input=Input(text=f"Is {number} even or odd?"),
                    references=references,
                    split=TRAIN_SPLIT if number <= 20 else TEST_SPLIT,
                )
            )
        return instances
56
+
57
+
58
class SimpleShortAnswerQAScenario(Scenario):
    """Simple short answer question answering scenario for tutorials and debugging.

    The task is to answer questions about whether two-digit numbers are even or odd.

    Example:

        Answer the following questions with a single word only.

        Question: Is 24 even or odd?
        Answer: Even"""

    # Distinct from SimpleMCQAScenario's "simple_mcqa" so the two scenarios
    # do not share one identifier.
    name = "simple_short_answer_qa"
    description = "Answer if two-digit numbers are even or odd."
    tags = ["question answering"]

    def get_instances(self, output_path: str) -> List[Instance]:
        """Build one instance per two-digit number; 10..20 are train, the rest are test."""
        instances: List[Instance] = []
        for i in range(10, 100):
            question = Input(text=f"Is {i} even or odd?")
            correct_answer = "Even" if i % 2 == 0 else "Odd"
            # NOTE: Unlike multiple-choice question answering, only the correct
            # references are needed for short-answer question answering.
            references = [
                Reference(Output(text=correct_answer), tags=[CORRECT_TAG]),
            ]
            split = TRAIN_SPLIT if i <= 20 else TEST_SPLIT
            instances.append(Instance(input=question, references=references, split=split))
        return instances
92
+
93
+
94
class SimpleClassificationScenario(Scenario):
    """Simple classification scenario for tutorials and debugging.

    The task is to classify two-digit numbers as even or odd.

    Example:

        Classify the following numbers by their parity. The classes are "Even" and "Odd".

        Number: 24
        Parity: Even"""

    name = "simple_classification"
    description = "Classify numbers by parity."
    tags = ["classification"]

    def get_instances(self, output_path: str) -> List[Instance]:
        """Build one instance per two-digit number; 10..20 are train, the rest are test."""
        instances: List[Instance] = []
        for i in range(10, 100):
            number_input = Input(text=str(i))
            # NOTE: For classification scenarios, the reference outputs should be the same
            # for all instances, and should include both correct and incorrect classes.
            # HELM only supports single-label classification. Exactly one reference
            # should have the CORRECT_TAG tag.
            references = [
                Reference(Output(text="Even"), tags=[CORRECT_TAG] if i % 2 == 0 else []),
                Reference(Output(text="Odd"), tags=[CORRECT_TAG] if i % 2 == 1 else []),
            ]
            split = TRAIN_SPLIT if i <= 20 else TEST_SPLIT
            instances.append(Instance(input=number_input, references=references, split=split))
        return instances
5
126
 
6
127
 
7
128
  class Simple1Scenario(Scenario):
@@ -0,0 +1,22 @@
1
+ import pytest
2
+ from tempfile import TemporaryDirectory
3
+
4
+ from helm.benchmark.scenarios.math_scenario import MATHScenario
5
+ from helm.benchmark.scenarios.scenario import Input, Output, Reference
6
+
7
+
8
# TODO: Fix the test for newer versions of diffusers: https://github.com/stanford-crfm/helm/issues/2168
@pytest.mark.skip(
    reason="Incompatible with newer versions with diffusers>0.24.0. Fails with "
    '"Loading a dataset cached in a LocalFileSystem is not supported"'
)
def test_math_scenario_get_instances():
    """Check the instance count and the first instance of MATH number_theory level 1."""
    scenario = MATHScenario(subject="number_theory", level="1")
    with TemporaryDirectory() as cache_dir:
        instances = scenario.get_instances(cache_dir)
        assert len(instances) == 77

        first_instance = instances[0]
        expected_input = Input(text="What is the remainder when (99)(101) is divided by 9?")
        expected_references = [Reference(output=Output(text="0", multimedia_content=None), tags=["correct"])]

        assert first_instance.input == expected_input
        assert first_instance.references == expected_references
        assert first_instance.split == "train"
@@ -1,10 +1,13 @@
1
- from helm.benchmark.run_specs import get_scenario_spec_tiny
2
- from helm.benchmark.scenarios.scenario import create_scenario, Scenario, Input, PassageQuestionInput
1
+ from helm.benchmark.scenarios.scenario import ScenarioSpec, create_scenario, Scenario, Input, PassageQuestionInput
3
2
 
4
3
 
5
4
  class TestScenario:
6
5
  def setup_method(self, method):
7
- self.scenario: Scenario = create_scenario(get_scenario_spec_tiny())
6
+ scenario_spec: ScenarioSpec = ScenarioSpec(
7
+ class_name="helm.benchmark.scenarios.simple_scenarios.Simple1Scenario",
8
+ args={"num_input_tokens": 5, "vocab_size": 20, "num_train_instances": 2, "num_test_instances": 2},
9
+ )
10
+ self.scenario: Scenario = create_scenario(scenario_spec)
8
11
 
9
12
  def test_render_lines(self):
10
13
  instances = self.scenario.get_instances(output_path="")