crfm-helm: crfm_helm-0.3.0-py3-none-any.whl → crfm_helm-0.5.0-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +37 -2
  5. helm/benchmark/adaptation/adapters/adapter.py +4 -42
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
  9. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
  10. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
  11. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
  12. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  13. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  14. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
  15. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
  16. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
  17. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  18. helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
  19. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
  20. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
  21. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  22. helm/benchmark/adaptation/prompt.py +7 -1
  23. helm/benchmark/adaptation/request_state.py +6 -1
  24. helm/benchmark/adaptation/scenario_state.py +6 -2
  25. helm/benchmark/annotation/annotator.py +43 -0
  26. helm/benchmark/annotation/annotator_factory.py +61 -0
  27. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  28. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  29. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  30. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  31. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  32. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  33. helm/benchmark/annotation_executor.py +124 -0
  34. helm/benchmark/augmentations/cleva_perturbation.py +7 -14
  35. helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
  36. helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
  37. helm/benchmark/augmentations/data_augmenter.py +0 -2
  38. helm/benchmark/augmentations/dialect_perturbation.py +2 -2
  39. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  40. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  41. helm/benchmark/augmentations/gender_perturbation.py +3 -3
  42. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  43. helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
  44. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  45. helm/benchmark/augmentations/person_name_perturbation.py +0 -7
  46. helm/benchmark/augmentations/perturbation.py +20 -7
  47. helm/benchmark/augmentations/perturbation_description.py +1 -1
  48. helm/benchmark/augmentations/space_perturbation.py +2 -2
  49. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  50. helm/benchmark/augmentations/synonym_perturbation.py +2 -2
  51. helm/benchmark/augmentations/test_perturbation.py +11 -7
  52. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  53. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  54. helm/benchmark/config_registry.py +38 -0
  55. helm/benchmark/executor.py +46 -16
  56. helm/benchmark/huggingface_registration.py +37 -7
  57. helm/benchmark/metrics/basic_metrics.py +172 -641
  58. helm/benchmark/metrics/bbq_metrics.py +3 -4
  59. helm/benchmark/metrics/bias_metrics.py +6 -6
  60. helm/benchmark/metrics/classification_metrics.py +11 -8
  61. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  62. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  63. helm/benchmark/metrics/code_metrics.py +4 -3
  64. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  65. helm/benchmark/metrics/common_metric_specs.py +167 -0
  66. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  67. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  68. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  69. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  70. helm/benchmark/metrics/disinformation_metrics.py +6 -112
  71. helm/benchmark/metrics/dry_run_metrics.py +5 -3
  72. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  73. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  74. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  75. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  76. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  77. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  78. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  79. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  80. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  81. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  82. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  83. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  84. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  85. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  86. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  87. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  88. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  89. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  90. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  91. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  92. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  93. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  94. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  95. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  96. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  97. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  98. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  99. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  100. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  101. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  102. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  103. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  104. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  105. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  106. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  107. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  108. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  109. helm/benchmark/metrics/machine_translation_metrics.py +5 -5
  110. helm/benchmark/metrics/metric.py +93 -172
  111. helm/benchmark/metrics/metric_name.py +0 -1
  112. helm/benchmark/metrics/metric_service.py +16 -0
  113. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  114. helm/benchmark/metrics/ranking_metrics.py +6 -7
  115. helm/benchmark/metrics/reference_metric.py +148 -0
  116. helm/benchmark/metrics/summac/model_summac.py +0 -2
  117. helm/benchmark/metrics/summarization_metrics.py +8 -8
  118. helm/benchmark/metrics/test_classification_metrics.py +9 -6
  119. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  120. helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
  121. helm/benchmark/metrics/test_metric.py +2 -2
  122. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
  123. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
  124. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  125. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
  126. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
  127. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  128. helm/benchmark/metrics/toxicity_utils.py +23 -0
  129. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  130. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  131. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  132. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  133. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  134. helm/benchmark/model_deployment_registry.py +164 -41
  135. helm/benchmark/model_metadata_registry.py +181 -35
  136. helm/benchmark/multi_gpu_runner.py +133 -0
  137. helm/benchmark/presentation/contamination.py +3 -3
  138. helm/benchmark/presentation/create_plots.py +8 -7
  139. helm/benchmark/presentation/run_display.py +50 -17
  140. helm/benchmark/presentation/schema.py +28 -46
  141. helm/benchmark/presentation/summarize.py +213 -96
  142. helm/benchmark/presentation/table.py +8 -8
  143. helm/benchmark/presentation/test_contamination.py +2 -2
  144. helm/benchmark/presentation/test_run_entry.py +14 -9
  145. helm/benchmark/presentation/test_summarize.py +5 -0
  146. helm/benchmark/run.py +66 -54
  147. helm/benchmark/run_expander.py +342 -31
  148. helm/benchmark/run_spec.py +93 -0
  149. helm/benchmark/run_spec_factory.py +162 -0
  150. helm/benchmark/run_specs/__init__.py +0 -0
  151. helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
  152. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  153. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  154. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  155. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  156. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  157. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  158. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  159. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  160. helm/benchmark/runner.py +116 -69
  161. helm/benchmark/runner_config_registry.py +21 -0
  162. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  163. helm/benchmark/scenarios/bold_scenario.py +2 -2
  164. helm/benchmark/scenarios/cleva_scenario.py +43 -46
  165. helm/benchmark/scenarios/code_scenario.py +3 -2
  166. helm/benchmark/scenarios/commonsense_scenario.py +171 -191
  167. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  168. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  169. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  170. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  171. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  172. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  173. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  174. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  175. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  176. helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
  177. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  178. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  179. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  180. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  181. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  182. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  183. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  184. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  185. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  186. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  187. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  188. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  189. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  190. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  191. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  192. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  193. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  194. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  195. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  196. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  197. helm/benchmark/scenarios/legalbench_scenario.py +123 -0
  198. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  199. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  200. helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
  201. helm/benchmark/scenarios/math_scenario.py +19 -2
  202. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  203. helm/benchmark/scenarios/numeracy_scenario.py +3 -3
  204. helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
  205. helm/benchmark/scenarios/raft_scenario.py +2 -6
  206. helm/benchmark/scenarios/scenario.py +14 -2
  207. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  208. helm/benchmark/scenarios/test_math_scenario.py +22 -0
  209. helm/benchmark/scenarios/test_scenario.py +6 -3
  210. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  211. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  212. helm/benchmark/scenarios/the_pile_scenario.py +6 -7
  213. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  214. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  215. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  216. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  217. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  218. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  219. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  220. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  221. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  222. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  223. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  224. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  225. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  226. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  227. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  228. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  229. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  230. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  231. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  232. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  233. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  234. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  235. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  236. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  237. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
  238. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  239. helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
  240. helm/benchmark/server.py +59 -2
  241. helm/benchmark/slurm_jobs.py +12 -0
  242. helm/benchmark/slurm_runner.py +79 -51
  243. helm/benchmark/static/benchmarking.js +3 -4
  244. helm/benchmark/static/contamination.yaml +1 -1
  245. helm/benchmark/static/images/organizations/together.png +0 -0
  246. helm/benchmark/static/json-urls.js +4 -0
  247. helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
  248. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  249. helm/benchmark/static/schema_lite.yaml +824 -0
  250. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  251. helm/benchmark/static/schema_unitxt.yaml +428 -0
  252. helm/benchmark/static/schema_vlm.yaml +576 -0
  253. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  254. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  255. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  256. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  257. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  258. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  259. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  260. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  261. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  262. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  263. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  264. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  265. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  266. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  267. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  268. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  269. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  270. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  271. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  272. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  273. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  274. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  275. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  276. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  277. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  278. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  279. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  280. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  281. helm/benchmark/static_build/config.js +4 -0
  282. helm/benchmark/static_build/index.html +20 -0
  283. helm/benchmark/test_data_preprocessor.py +3 -3
  284. helm/benchmark/test_model_deployment_definition.py +90 -0
  285. helm/benchmark/test_run_expander.py +1 -1
  286. helm/benchmark/tokenizer_config_registry.py +10 -14
  287. helm/benchmark/window_services/ai21_window_service.py +22 -33
  288. helm/benchmark/window_services/cohere_window_service.py +1 -63
  289. helm/benchmark/window_services/default_window_service.py +2 -35
  290. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  291. helm/benchmark/window_services/ice_window_service.py +0 -34
  292. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  293. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  294. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  295. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  296. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  297. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  298. helm/benchmark/window_services/local_window_service.py +21 -4
  299. helm/benchmark/window_services/no_decoding_window_service.py +32 -0
  300. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  301. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  302. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  303. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  304. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  305. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  306. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  307. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  308. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  309. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  310. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  311. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  312. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  313. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  314. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  315. helm/benchmark/window_services/test_utils.py +3 -2
  316. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  317. helm/benchmark/window_services/window_service.py +42 -0
  318. helm/benchmark/window_services/window_service_factory.py +24 -269
  319. helm/benchmark/window_services/yalm_window_service.py +0 -27
  320. helm/clients/__init__.py +0 -0
  321. helm/{proxy/clients → clients}/ai21_client.py +5 -12
  322. helm/clients/aleph_alpha_client.py +112 -0
  323. helm/{proxy/clients → clients}/anthropic_client.py +213 -24
  324. helm/clients/auto_client.py +215 -0
  325. helm/clients/bedrock_client.py +128 -0
  326. helm/clients/bedrock_utils.py +72 -0
  327. helm/{proxy/clients → clients}/client.py +67 -55
  328. helm/clients/clip_score_client.py +49 -0
  329. helm/clients/clip_scorers/__init__.py +0 -0
  330. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  331. helm/clients/clip_scorers/clip_scorer.py +50 -0
  332. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  333. helm/{proxy/clients → clients}/cohere_client.py +6 -17
  334. helm/clients/gcs_client.py +82 -0
  335. helm/{proxy/clients → clients}/google_client.py +7 -8
  336. helm/clients/google_translate_client.py +35 -0
  337. helm/{proxy/clients → clients}/http_model_client.py +6 -10
  338. helm/{proxy/clients → clients}/huggingface_client.py +134 -92
  339. helm/clients/image_generation/__init__.py +0 -0
  340. helm/clients/image_generation/adobe_vision_client.py +78 -0
  341. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  342. helm/clients/image_generation/cogview2/__init__.py +0 -0
  343. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  344. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  345. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  346. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  347. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  348. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  349. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  350. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  351. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  352. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  353. helm/clients/image_generation/cogview2_client.py +191 -0
  354. helm/clients/image_generation/dalle2_client.py +192 -0
  355. helm/clients/image_generation/dalle3_client.py +108 -0
  356. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  357. helm/clients/image_generation/dalle_mini/data.py +442 -0
  358. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  359. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  360. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  361. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  362. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  363. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  364. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  365. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  366. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  367. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  368. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  369. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  370. helm/clients/image_generation/dalle_mini_client.py +190 -0
  371. helm/clients/image_generation/deep_floyd_client.py +78 -0
  372. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  373. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  374. helm/clients/image_generation/lexica_client.py +86 -0
  375. helm/clients/image_generation/mindalle/__init__.py +0 -0
  376. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  377. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  378. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  379. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  380. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  381. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  382. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  383. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  384. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  385. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  386. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  387. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  388. helm/clients/image_generation/mindalle_client.py +115 -0
  389. helm/clients/image_generation/nudity_check_client.py +64 -0
  390. helm/clients/image_generation/together_image_generation_client.py +111 -0
  391. helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
  392. helm/{proxy/clients → clients}/megatron_client.py +13 -7
  393. helm/clients/mistral_client.py +134 -0
  394. helm/clients/moderation_api_client.py +109 -0
  395. helm/clients/open_lm_client.py +43 -0
  396. helm/clients/openai_client.py +302 -0
  397. helm/{proxy/clients → clients}/palmyra_client.py +15 -12
  398. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  399. helm/clients/simple_client.py +64 -0
  400. helm/{proxy/clients → clients}/test_auto_client.py +15 -15
  401. helm/clients/test_client.py +100 -0
  402. helm/clients/test_huggingface_client.py +70 -0
  403. helm/clients/test_simple_client.py +19 -0
  404. helm/{proxy/clients → clients}/test_together_client.py +23 -12
  405. helm/{proxy/clients → clients}/together_client.py +18 -71
  406. helm/clients/vertexai_client.py +391 -0
  407. helm/clients/vision_language/__init__.py +0 -0
  408. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  409. helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
  410. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  411. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  412. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  413. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  414. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  415. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  416. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  417. helm/clients/vision_language/open_flamingo_client.py +155 -0
  418. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  419. helm/clients/vllm_client.py +46 -0
  420. helm/common/cache.py +24 -179
  421. helm/common/cache_backend_config.py +47 -0
  422. helm/common/clip_score_request.py +41 -0
  423. helm/common/concurrency.py +32 -0
  424. helm/common/credentials_utils.py +28 -0
  425. helm/common/file_caches/__init__.py +0 -0
  426. helm/common/file_caches/file_cache.py +16 -0
  427. helm/common/file_caches/local_file_cache.py +61 -0
  428. helm/common/file_caches/test_local_file_cache.py +25 -0
  429. helm/common/file_upload_request.py +27 -0
  430. helm/common/general.py +29 -10
  431. helm/common/image_generation_parameters.py +25 -0
  432. helm/common/images_utils.py +24 -1
  433. helm/common/key_value_store.py +113 -0
  434. helm/common/media_object.py +13 -0
  435. helm/common/moderations_api_request.py +71 -0
  436. helm/common/mongo_key_value_store.py +88 -0
  437. helm/common/multimodal_request_utils.py +31 -0
  438. helm/common/nudity_check_request.py +29 -0
  439. helm/common/object_spec.py +2 -2
  440. helm/common/request.py +36 -27
  441. helm/common/test_general.py +6 -0
  442. helm/common/tokenization_request.py +6 -3
  443. helm/config/__init__.py +0 -0
  444. helm/config/model_deployments.yaml +1942 -0
  445. helm/config/model_metadata.yaml +2201 -0
  446. helm/config/tokenizer_configs.yaml +362 -0
  447. helm/proxy/accounts.py +31 -4
  448. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  449. helm/proxy/critique/model_critique_client.py +13 -5
  450. helm/proxy/example_queries.py +29 -17
  451. helm/proxy/retry.py +8 -2
  452. helm/proxy/server.py +77 -5
  453. helm/proxy/services/remote_service.py +31 -0
  454. helm/proxy/services/server_service.py +103 -20
  455. helm/proxy/services/service.py +34 -2
  456. helm/proxy/services/test_remote_service.py +7 -6
  457. helm/proxy/services/test_service.py +27 -18
  458. helm/proxy/test_accounts.py +32 -0
  459. helm/proxy/token_counters/auto_token_counter.py +37 -37
  460. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  461. helm/proxy/token_counters/token_counter.py +3 -5
  462. helm/py.typed +0 -0
  463. helm/tokenizers/__init__.py +0 -0
  464. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  465. helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
  466. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
  467. helm/tokenizers/auto_tokenizer.py +93 -0
  468. helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
  469. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  470. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  471. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
  472. helm/tokenizers/simple_tokenizer.py +33 -0
  473. helm/tokenizers/test_anthropic_tokenizer.py +82 -0
  474. helm/tokenizers/test_huggingface_tokenizer.py +136 -0
  475. helm/tokenizers/test_simple_tokenizer.py +33 -0
  476. helm/tokenizers/vertexai_tokenizer.py +97 -0
  477. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  478. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  479. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  480. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  481. crfm_helm-0.3.0.dist-info/RECORD +0 -396
  482. helm/benchmark/vlm_run_specs.py +0 -71
  483. helm/benchmark/window_services/anthropic_window_service.py +0 -68
  484. helm/benchmark/window_services/bloom_window_service.py +0 -35
  485. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  486. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  487. helm/benchmark/window_services/gptj_window_service.py +0 -38
  488. helm/benchmark/window_services/gptneox_window_service.py +0 -41
  489. helm/benchmark/window_services/http_model_window_service.py +0 -28
  490. helm/benchmark/window_services/huggingface_window_service.py +0 -59
  491. helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
  492. helm/benchmark/window_services/llama_window_service.py +0 -28
  493. helm/benchmark/window_services/luminous_window_service.py +0 -67
  494. helm/benchmark/window_services/megatron_window_service.py +0 -10
  495. helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
  496. helm/benchmark/window_services/openai_window_service.py +0 -13
  497. helm/benchmark/window_services/opt_window_service.py +0 -35
  498. helm/benchmark/window_services/palmyra_window_service.py +0 -45
  499. helm/benchmark/window_services/remote_window_service.py +0 -48
  500. helm/benchmark/window_services/santacoder_window_service.py +0 -27
  501. helm/benchmark/window_services/starcoder_window_service.py +0 -27
  502. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  503. helm/benchmark/window_services/t511b_window_service.py +0 -30
  504. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  505. helm/benchmark/window_services/ul2_window_service.py +0 -30
  506. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  507. helm/benchmark/window_services/wider_openai_window_service.py +0 -52
  508. helm/proxy/clients/aleph_alpha_client.py +0 -99
  509. helm/proxy/clients/auto_client.py +0 -461
  510. helm/proxy/clients/goose_ai_client.py +0 -100
  511. helm/proxy/clients/microsoft_client.py +0 -182
  512. helm/proxy/clients/openai_client.py +0 -206
  513. helm/proxy/clients/remote_model_registry.py +0 -28
  514. helm/proxy/clients/simple_client.py +0 -61
  515. helm/proxy/clients/test_anthropic_client.py +0 -63
  516. helm/proxy/clients/test_client.py +0 -31
  517. helm/proxy/clients/test_huggingface_client.py +0 -87
  518. helm/proxy/models.py +0 -963
  519. helm/proxy/test_models.py +0 -27
  520. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  521. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  522. helm/proxy/token_counters/free_token_counter.py +0 -12
  523. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  524. helm/proxy/token_counters/openai_token_counter.py +0 -22
  525. helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
  526. helm/proxy/token_counters/test_openai_token_counter.py +0 -79
  527. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  528. helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
  529. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  530. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  531. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  532. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  533. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  534. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  535. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  536. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  537. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  538. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  539. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  540. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  541. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  542. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  543. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  544. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  545. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  546. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -263,13 +263,6 @@ class PersonNamePerturbation(Perturbation):
263
263
  name = rng.choice(list(options))
264
264
  return name
265
265
 
266
- def perturb(self, text: str, rng: Random) -> str:
267
- """
268
- Perturbing the text is handled in `perturb_with_persistency` to ensure that perturbed names
269
- in `Instance`s and `Reference`s match.
270
- """
271
- raise NotImplementedError("Should never be called")
272
-
273
266
  def perturb_with_persistency(
274
267
  self, text: str, rng: Random, name_substitution_mapping: Dict[str, str], skipped_tokens: Set[str]
275
268
  ) -> str:
@@ -10,7 +10,6 @@ from helm.common.object_spec import ObjectSpec, create_object
10
10
 
11
11
 
12
12
  class Perturbation(ABC):
13
-
14
13
  # Unique name to describe perturbation
15
14
  name: str
16
15
 
@@ -28,17 +27,24 @@ class Perturbation(ABC):
28
27
  # If seed exists, use it as part of the random seed
29
28
  return Random(instance.id if seed is None else str(seed) + instance.id)
30
29
 
30
+ @abstractmethod
31
+ def apply(self, instance: Instance, seed: Optional[int] = None) -> Instance:
32
+ """Generate a modified instance from the input instance."""
33
+ pass
34
+
35
+
36
+ class TextPerturbation(Perturbation, ABC):
31
37
  def apply(self, instance: Instance, seed: Optional[int] = None) -> Instance:
32
38
  """
33
- Generates a new Instance by perturbing the input, tagging the Instance and perturbing the References,
34
- if should_perturb_references is true. Initializes a random number generator based on instance_id that gets
35
- passed to perturb and perturb_references.
39
+ Generates a new Instance by applying `perturb` to the input and (if requested) the references.
40
+ Initializes a random number generator based on instance_id that gets
41
+ passed to perturb.
36
42
  """
37
43
  rng: Random = self.get_rng(instance, seed)
38
44
 
39
45
  references: List[Reference] = instance.references
40
46
  if self.should_perturb_references:
41
- references = [self.perturb_reference(reference, rng) for reference in references]
47
+ references = [self._perturb_reference(reference, rng) for reference in references]
42
48
 
43
49
  description = replace(self.description, seed=seed)
44
50
 
@@ -49,11 +55,18 @@ class Perturbation(ABC):
49
55
  input=Input(text=self.perturb(instance.input.text, rng)),
50
56
  references=references,
51
57
  perturbation=description,
58
+ contrast_inputs=[instance.input],
52
59
  )
53
60
 
54
- def perturb_reference(self, reference: Reference, rng: Random) -> Reference:
61
+ def _perturb_reference(self, reference: Reference, rng: Random) -> Reference:
55
62
  """Generates a new Reference by perturbing the output and tagging the Reference."""
56
- return replace(reference, output=Output(text=self.perturb(reference.output.text, rng)), tags=reference.tags)
63
+ return replace(
64
+ reference,
65
+ output=Output(
66
+ text=self.perturb(reference.output.text, rng), multimedia_content=reference.output.multimedia_content
67
+ ),
68
+ tags=reference.tags,
69
+ )
57
70
 
58
71
  @abstractmethod
59
72
  def perturb(self, text: str, rng: Random) -> str:
@@ -23,7 +23,7 @@ class PerturbationDescription:
23
23
  computed_on: str = PERTURBATION_PERTURBED
24
24
  """Which types of Instances we are evaluating, to be populated during metric evaluation. PERTURBATION_PERTURBED
25
25
  (default) means we are evaluating on perturbed instances, PERTURBATION_ORIGINAL means we are evaluating the
26
- unperturbed version of instances where this perturbation appplies, and, PERTURBATION_WORST means the the minimum
26
+ unperturbed version of instances where this perturbation applies, and, PERTURBATION_WORST means the minimum
27
27
  metric between the two."""
28
28
 
29
29
  seed: Optional[int] = None
@@ -2,11 +2,11 @@ from dataclasses import dataclass
2
2
  from random import Random
3
3
  import re
4
4
 
5
- from .perturbation import Perturbation
5
+ from .perturbation import TextPerturbation
6
6
  from .perturbation_description import PerturbationDescription
7
7
 
8
8
 
9
- class SpacePerturbation(Perturbation):
9
+ class SpacePerturbation(TextPerturbation):
10
10
  """
11
11
  A simple perturbation that replaces existing spaces with 0-max_spaces spaces (thus potentially merging words).
12
12
  """
@@ -0,0 +1,29 @@
1
+ from dataclasses import dataclass
2
+ from random import Random
3
+
4
+ from .perturbation import TextPerturbation
5
+ from .perturbation_description import PerturbationDescription
6
+
7
+
8
+ class SuffixPerturbation(TextPerturbation):
9
+ """
10
+ Appends a suffix to the end of the text. Example:
11
+
12
+ A picture of a dog -> A picture of a dog, picasso
13
+ """
14
+
15
+ @dataclass(frozen=True)
16
+ class Description(PerturbationDescription):
17
+ suffix: str = ""
18
+
19
+ name: str = "style"
20
+
21
+ def __init__(self, suffix: str):
22
+ self._suffix: str = suffix
23
+
24
+ @property
25
+ def description(self) -> PerturbationDescription:
26
+ return SuffixPerturbation.Description(name=self.name, suffix=self._suffix)
27
+
28
+ def perturb(self, text: str, rng: Random) -> str:
29
+ return f"{text}, {self._suffix}"
@@ -11,10 +11,10 @@ import spacy
11
11
 
12
12
  from helm.common.general import match_case, ensure_file_downloaded
13
13
  from .perturbation_description import PerturbationDescription
14
- from .perturbation import Perturbation
14
+ from .perturbation import TextPerturbation
15
15
 
16
16
 
17
- class SynonymPerturbation(Perturbation):
17
+ class SynonymPerturbation(TextPerturbation):
18
18
  """
19
19
  Synonyms. For implementation details, see
20
20
  https://github.com/GEM-benchmark/NL-Augmenter/blob/main/nlaugmenter/transformations/synonym_substitution/transformation.py
@@ -15,6 +15,7 @@ from .space_perturbation import SpacePerturbation
15
15
  from .dialect_perturbation import DialectPerturbation
16
16
  from .person_name_perturbation import PersonNamePerturbation
17
17
  from .gender_perturbation import GenderPerturbation
18
+ from .suffix_perturbation import SuffixPerturbation
18
19
 
19
20
 
20
21
  def test_extra_space_perturbation():
@@ -145,7 +146,6 @@ def test_space_perturbation():
145
146
  instance: Instance = Instance(id="id0", input=Input(text="Hello World!\nQuite a day, huh?"), references=[])
146
147
  instances: List[Instance] = data_augmenter.generate([instance], include_original=True)
147
148
 
148
- print(instances)
149
149
  assert len(instances) == 2
150
150
  assert instances[1].perturbation.name == "space"
151
151
  assert instances[1].input.text == "Hello World!\nQuite a day, huh?"
@@ -162,7 +162,6 @@ def test_dialect_perturbation():
162
162
  )
163
163
  instances: List[Instance] = data_augmenter.generate([instance], include_original=True)
164
164
 
165
- print(instances)
166
165
  assert len(instances) == 2
167
166
  assert instances[1].perturbation.name == "dialect"
168
167
  assert instances[1].input.text == "I gon remember dis day to b the best day of mah life."
@@ -188,7 +187,6 @@ def test_person_name_perturbation():
188
187
  )
189
188
  instances: List[Instance] = data_augmenter.generate([instance], include_original=True)
190
189
 
191
- print(instances)
192
190
  assert len(instances) == 2
193
191
  assert instances[1].perturbation.name == "person_name"
194
192
  assert (
@@ -209,7 +207,6 @@ def test_gender_pronoun_perturbation():
209
207
  )
210
208
  instances: List[Instance] = data_augmenter.generate([instance], include_original=True)
211
209
 
212
- print(instances)
213
210
  assert len(instances) == 2
214
211
  assert instances[1].perturbation.mode == "pronouns"
215
212
  assert instances[1].input.text == "Did she mention that she was coming with her parents and their friends?"
@@ -227,13 +224,22 @@ def test_gender_term_perturbation():
227
224
  )
228
225
  instances: List[Instance] = data_augmenter.generate([instance], include_original=True)
229
226
 
230
- print(instances)
231
227
  assert len(instances) == 2
232
228
  assert instances[1].perturbation.mode == "terms"
233
229
  assert instances[1].input.text == "His granddaughters looked a lot like their mom."
234
230
  assert instances[1].references[0].output.text == "How did their mother look like?"
235
231
 
236
232
 
233
+ def test_suffix_perturbation():
234
+ data_augmenter = DataAugmenter(perturbations=[SuffixPerturbation(suffix="pixel art")])
235
+ instance: Instance = Instance(id="id0", input=Input(text="A blue dog"), references=[])
236
+ instances: List[Instance] = data_augmenter.generate([instance], include_original=True)
237
+
238
+ assert len(instances) == 2
239
+ assert instances[1].perturbation.suffix == "pixel art"
240
+ assert instances[1].input.text == "A blue dog, pixel art"
241
+
242
+
237
243
  # TODO(#1958) Fix the logic to re-enable this test
238
244
  @unittest.skip("Currently cannot replace words at either text boundary.")
239
245
  def test_gender_term_perturbation_edge_word():
@@ -247,7 +253,6 @@ def test_gender_term_perturbation_edge_word():
247
253
  )
248
254
  instances: List[Instance] = data_augmenter.generate([instance], include_original=False)
249
255
 
250
- print(instances)
251
256
  assert len(instances) == 1
252
257
  assert instances[0].input.text == "mom said it is okay"
253
258
  assert instances[0].references[0].output.text == "Sure he did daughter"
@@ -266,6 +271,5 @@ def test_gender_term_perturbation_consequtive_words():
266
271
  )
267
272
  instances: List[Instance] = data_augmenter.generate([instance], include_original=False)
268
273
 
269
- print(instances)
270
274
  assert len(instances) == 1
271
275
  assert instances[0].input.text == "I'm a mom mom: my daughter has a daughter."
@@ -0,0 +1,30 @@
1
+ from dataclasses import dataclass
2
+ from random import Random
3
+
4
+ from helm.clients.google_translate_client import GoogleTranslateClient
5
+ from .perturbation import TextPerturbation
6
+ from .perturbation_description import PerturbationDescription
7
+
8
+
9
+ class TranslatePerturbation(TextPerturbation):
10
+ """
11
+ Translates to different languages.
12
+ """
13
+
14
+ @dataclass(frozen=True)
15
+ class Description(PerturbationDescription):
16
+ # Language code to translate to. Needs a default value since we inherit from `PerturbationDescription`
17
+ language_code: str = "zh-CN"
18
+
19
+ name: str = "translate"
20
+
21
+ def __init__(self, language_code: str):
22
+ self.language_code: str = language_code
23
+ self.google_translate_client = GoogleTranslateClient()
24
+
25
+ @property
26
+ def description(self) -> PerturbationDescription:
27
+ return TranslatePerturbation.Description(name=self.name, language_code=self.language_code)
28
+
29
+ def perturb(self, text: str, rng: Random) -> str:
30
+ return self.google_translate_client.translate(text, self.language_code)
@@ -2,10 +2,10 @@ from dataclasses import dataclass
2
2
  from random import Random
3
3
 
4
4
  from .perturbation_description import PerturbationDescription
5
- from .perturbation import Perturbation
5
+ from .perturbation import TextPerturbation
6
6
 
7
7
 
8
- class TyposPerturbation(Perturbation):
8
+ class TyposPerturbation(TextPerturbation):
9
9
  """
10
10
  Typos. For implementation details, see
11
11
  https://github.com/GEM-benchmark/NL-Augmenter/tree/main/transformations/butter_fingers_perturbation
@@ -0,0 +1,38 @@
1
+ import os
2
+ import importlib_resources as resources
3
+
4
+ from helm.benchmark.model_deployment_registry import register_model_deployments_from_path
5
+ from helm.benchmark.model_metadata_registry import register_model_metadata_from_path
6
+ from helm.benchmark.tokenizer_config_registry import register_tokenizer_configs_from_path
7
+ from helm.benchmark.runner_config_registry import register_runner_config_from_path
8
+
9
+
10
+ MODEL_METADATA_FILE: str = "model_metadata.yaml"
11
+ TOKENIZER_CONFIGS_FILE: str = "tokenizer_configs.yaml"
12
+ MODEL_DEPLOYMENTS_FILE: str = "model_deployments.yaml"
13
+ RUNNER_CONFIG_FILE: str = "runner_config.yaml"
14
+
15
+ CONFIG_PACKAGE = "helm.config"
16
+
17
+
18
+ def register_configs_from_directory(dir_path: str) -> None:
19
+ model_metadata_path = os.path.join(dir_path, MODEL_METADATA_FILE)
20
+ if os.path.isfile(model_metadata_path):
21
+ register_model_metadata_from_path(model_metadata_path)
22
+
23
+ tokenizer_configs_path = os.path.join(dir_path, TOKENIZER_CONFIGS_FILE)
24
+ if os.path.isfile(tokenizer_configs_path):
25
+ register_tokenizer_configs_from_path(tokenizer_configs_path)
26
+
27
+ model_deployments_path = os.path.join(dir_path, MODEL_DEPLOYMENTS_FILE)
28
+ if os.path.isfile(model_deployments_path):
29
+ register_model_deployments_from_path(model_deployments_path)
30
+
31
+ runner_config_path = os.path.join(dir_path, RUNNER_CONFIG_FILE)
32
+ if os.path.isfile(runner_config_path):
33
+ register_runner_config_from_path(runner_config_path)
34
+
35
+
36
+ def register_builtin_configs_from_helm_package() -> None:
37
+ package_path = str(resources.files(CONFIG_PACKAGE))
38
+ register_configs_from_directory(package_path)
@@ -1,9 +1,15 @@
1
1
  from typing import Optional
2
2
  from dataclasses import dataclass, replace
3
+ from helm.common.cache_backend_config import (
4
+ CacheBackendConfig,
5
+ BlackHoleCacheBackendConfig,
6
+ MongoCacheBackendConfig,
7
+ SqliteCacheBackendConfig,
8
+ )
3
9
 
4
10
  from helm.common.general import parallel_map
5
11
  from helm.common.hierarchical_logger import htrack, hlog
6
- from helm.common.request import RequestResult, Sequence
12
+ from helm.common.request import RequestResult, GeneratedOutput
7
13
  from helm.common.authentication import Authentication
8
14
  from helm.proxy.services.remote_service import RemoteService
9
15
  from helm.proxy.services.server_service import ServerService
@@ -18,28 +24,36 @@ class ExecutorError(Exception):
18
24
 
19
25
  @dataclass(frozen=True)
20
26
  class ExecutionSpec:
21
- # If non-empty, URL of the proxy server we send requests to (e.g., http://localhost:1959).
27
+
22
28
  url: Optional[str]
29
+ """If non-empty, URL of the proxy server we send requests to (e.g., http://localhost:1959)."""
23
30
 
24
- # Pass into the service
25
31
  auth: Authentication
32
+ """Authentication that will be passed into the local service, if using the local service."""
26
33
 
27
- # Path where API credentials and cache is stored.
28
- # This path is the same as `--base-path` when launching the proxy server (see server.py).
29
- # Required when url is not set.
30
34
  local_path: Optional[str]
35
+ """Path where API credentials and cache are stored.
36
+
37
+ This path is the same as `--base-path` when launching the proxy server (see server.py).
38
+ Required when url is not set."""
31
39
 
32
- # How many threads to have at once
33
40
  parallelism: int
41
+ """How many threads to have at once"""
34
42
 
35
- # Whether to skip execution
36
43
  dry_run: bool = False
44
+ """Whether to skip execution"""
45
+
46
+ sqlite_cache_backend_config: Optional[SqliteCacheBackendConfig] = None
47
+ """If set, SQLite will be used for the cache.
48
+
49
+ This specifies the directory in which the SQLite cache will store files.
50
+ At most one of sqlite_cache_backend_config and mongo_cache_backend_config can be set."""
37
51
 
38
- # URL to the MongoDB database.
39
- # If non-empty, the MongoDB database will be used for caching instead of SQLite.
40
- # Example format: mongodb://[username:password@]host1[:port1]/[dbname]
41
- # For full format, see: https://www.mongodb.com/docs/manual/reference/connection-string/
42
- mongo_uri: str = ""
52
+ mongo_cache_backend_config: Optional[MongoCacheBackendConfig] = None
53
+ """If set, MongoDB will be used for the cache.
54
+
55
+ This specifies the MongoDB database to be used by the MongoDB cache.
56
+ At most one of sqlite_cache_backend_config and mongo_cache_backend_config can be set."""
43
57
 
44
58
 
45
59
  class Executor:
@@ -51,6 +65,16 @@ class Executor:
51
65
  def __init__(self, execution_spec: ExecutionSpec):
52
66
  self.execution_spec = execution_spec
53
67
 
68
+ cache_backend_config: CacheBackendConfig
69
+ if execution_spec.sqlite_cache_backend_config and execution_spec.mongo_cache_backend_config:
70
+ raise ExecutorError("At most one of sqlite_cache_backend_config and mongo_cache_backend_config can be set.")
71
+ elif execution_spec.sqlite_cache_backend_config:
72
+ cache_backend_config = execution_spec.sqlite_cache_backend_config
73
+ elif execution_spec.mongo_cache_backend_config:
74
+ cache_backend_config = execution_spec.mongo_cache_backend_config
75
+ else:
76
+ cache_backend_config = BlackHoleCacheBackendConfig()
77
+
54
78
  self.service: Service
55
79
  if execution_spec.url:
56
80
  hlog(f"Running using remote API proxy server: {execution_spec.url}")
@@ -58,7 +82,9 @@ class Executor:
58
82
  elif execution_spec.local_path:
59
83
  hlog(f"Running in local mode with base path: {execution_spec.local_path}")
60
84
  self.service = ServerService(
61
- base_path=execution_spec.local_path, root_mode=True, mongo_uri=execution_spec.mongo_uri
85
+ base_path=execution_spec.local_path,
86
+ root_mode=True,
87
+ cache_backend_config=cache_backend_config,
62
88
  )
63
89
  else:
64
90
  raise ValueError("Either the proxy server URL or the local path must be set")
@@ -77,7 +103,11 @@ class Executor:
77
103
  )
78
104
 
79
105
  hlog(f"Processed {len(request_states)} requests")
80
- return ScenarioState(scenario_state.adapter_spec, request_states)
106
+ return ScenarioState(
107
+ adapter_spec=scenario_state.adapter_spec,
108
+ request_states=request_states,
109
+ annotator_specs=scenario_state.annotator_specs,
110
+ )
81
111
 
82
112
  def process(self, state: RequestState) -> RequestState:
83
113
  try:
@@ -87,7 +117,7 @@ class Executor:
87
117
  if not result.success:
88
118
  if result.error_flags and not result.error_flags.is_fatal:
89
119
  hlog(f"WARNING: Non-fatal error treated as empty completion: {result.error}")
90
- result.completions = [Sequence(text="", logprob=0, tokens=[])]
120
+ result.completions = [GeneratedOutput(text="", logprob=0, tokens=[])]
91
121
  else:
92
122
  raise ExecutorError(f"{str(result.error)} Request: {state.request}")
93
123
  return replace(state, result=result)
@@ -4,10 +4,16 @@ from typing import Optional
4
4
  from helm.benchmark.model_deployment_registry import (
5
5
  ClientSpec,
6
6
  ModelDeployment,
7
- WindowServiceSpec,
8
7
  register_model_deployment,
9
8
  )
9
+ from helm.benchmark.model_metadata_registry import (
10
+ get_model_metadata,
11
+ get_unknown_model_metadata,
12
+ register_model_metadata,
13
+ )
10
14
  from helm.benchmark.tokenizer_config_registry import TokenizerConfig, TokenizerSpec, register_tokenizer_config
15
+ from helm.common.hierarchical_logger import hlog
16
+ from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
11
17
 
12
18
 
13
19
  def register_huggingface_model(
@@ -17,26 +23,50 @@ def register_huggingface_model(
17
23
  if revision:
18
24
  object_spec_args["revision"] = revision
19
25
 
26
+ # Auto-infer model properties from the tokenizer.
27
+ with HuggingFaceTokenizer.create_tokenizer(**object_spec_args) as tokenizer:
28
+ max_sequence_length = tokenizer.model_max_length
29
+ end_of_text_token = tokenizer.eos_token or ""
30
+ prefix_token = tokenizer.bos_token or ""
31
+ # If the tokenizer config has a model_max_length of 1000000000000000019884624838656
32
+ # it means that model creator did not specify model_max_length.
33
+ if max_sequence_length > 1_000_000:
34
+ raise ValueError(
35
+ f"Could not infer the model_max_length of Hugging Face model {pretrained_model_name_or_path}, so "
36
+ f"--enable-huggingface-models and --enable-local-huggingface-models cannot be used for this model. "
37
+ f"Please configure the model using prod_env/model_deployments.yaml instead."
38
+ )
39
+
20
40
  model_deployment = ModelDeployment(
21
41
  name=helm_model_name,
22
42
  client_spec=ClientSpec(
23
- class_name="helm.proxy.clients.huggingface_client.HuggingFaceClient",
43
+ class_name="helm.clients.huggingface_client.HuggingFaceClient",
24
44
  args=object_spec_args,
25
45
  ),
26
46
  model_name=helm_model_name,
27
47
  tokenizer_name=helm_model_name,
28
- window_service_spec=WindowServiceSpec(
29
- class_name="helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService",
30
- args=object_spec_args,
31
- ),
48
+ max_sequence_length=max_sequence_length,
32
49
  )
50
+
51
+ # We check if the model is already registered because we don't want to
52
+ # overwrite the model metadata if it's already registered.
53
+ # If it's not registered, we register it, as otherwise an error would be thrown
54
+ # when we try to register the model deployment.
55
+ try:
56
+ _ = get_model_metadata(model_name=helm_model_name)
57
+ except ValueError:
58
+ register_model_metadata(get_unknown_model_metadata(helm_model_name))
59
+ hlog(f"Registered default metadata for model {helm_model_name}")
60
+
33
61
  register_model_deployment(model_deployment)
34
62
  tokenizer_config = TokenizerConfig(
35
63
  name=helm_model_name,
36
64
  tokenizer_spec=TokenizerSpec(
37
- class_name="helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer",
65
+ class_name="helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer",
38
66
  args=object_spec_args,
39
67
  ),
68
+ end_of_text_token=end_of_text_token,
69
+ prefix_token=prefix_token,
40
70
  )
41
71
  register_tokenizer_config(tokenizer_config)
42
72