crfm-helm 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (499)
  1. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +138 -31
  2. crfm_helm-0.5.1.dist-info/RECORD +654 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +31 -3
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
  9. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
  10. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  11. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  12. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  13. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
  14. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
  15. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  16. helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
  17. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
  18. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
  19. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  20. helm/benchmark/adaptation/request_state.py +6 -1
  21. helm/benchmark/adaptation/scenario_state.py +6 -2
  22. helm/benchmark/annotation/annotator.py +43 -0
  23. helm/benchmark/annotation/annotator_factory.py +61 -0
  24. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  25. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  26. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  27. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  28. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  29. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  30. helm/benchmark/annotation_executor.py +124 -0
  31. helm/benchmark/augmentations/data_augmenter.py +0 -2
  32. helm/benchmark/augmentations/gender_perturbation.py +1 -1
  33. helm/benchmark/augmentations/perturbation.py +25 -3
  34. helm/benchmark/augmentations/perturbation_description.py +1 -1
  35. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  36. helm/benchmark/augmentations/test_perturbation.py +41 -7
  37. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  38. helm/benchmark/config_registry.py +7 -1
  39. helm/benchmark/executor.py +46 -16
  40. helm/benchmark/huggingface_registration.py +20 -7
  41. helm/benchmark/metrics/basic_metrics.py +169 -664
  42. helm/benchmark/metrics/bbq_metrics.py +3 -4
  43. helm/benchmark/metrics/bias_metrics.py +6 -6
  44. helm/benchmark/metrics/classification_metrics.py +11 -8
  45. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  46. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  47. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  48. helm/benchmark/metrics/common_metric_specs.py +167 -0
  49. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  50. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  51. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  52. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  53. helm/benchmark/metrics/disinformation_metrics.py +4 -110
  54. helm/benchmark/metrics/dry_run_metrics.py +2 -2
  55. helm/benchmark/metrics/efficiency_metrics.py +213 -0
  56. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  57. helm/benchmark/metrics/evaluate_reference_metrics.py +392 -0
  58. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  59. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  60. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  61. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  62. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  63. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  64. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  65. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  66. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  67. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  68. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  69. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  70. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  71. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  72. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  73. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  74. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  75. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  76. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  77. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  78. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  79. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  80. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  81. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  82. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  83. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  84. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  85. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  86. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  87. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  88. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  89. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  90. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  91. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  92. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  93. helm/benchmark/metrics/metric.py +93 -172
  94. helm/benchmark/metrics/metric_name.py +0 -1
  95. helm/benchmark/metrics/metric_service.py +16 -0
  96. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  97. helm/benchmark/metrics/ranking_metrics.py +2 -2
  98. helm/benchmark/metrics/reference_metric.py +148 -0
  99. helm/benchmark/metrics/summac/model_summac.py +0 -2
  100. helm/benchmark/metrics/summarization_metrics.py +2 -2
  101. helm/benchmark/metrics/test_classification_metrics.py +8 -5
  102. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  103. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
  104. helm/benchmark/metrics/test_metric.py +2 -2
  105. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
  106. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  107. helm/benchmark/metrics/toxicity_utils.py +23 -0
  108. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  109. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  110. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  111. helm/benchmark/metrics/vision_language/image_metrics.py +575 -0
  112. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  113. helm/benchmark/model_deployment_registry.py +74 -0
  114. helm/benchmark/model_metadata_registry.py +41 -1
  115. helm/benchmark/multi_gpu_runner.py +133 -0
  116. helm/benchmark/presentation/create_plots.py +8 -7
  117. helm/benchmark/presentation/run_display.py +26 -10
  118. helm/benchmark/presentation/schema.py +15 -40
  119. helm/benchmark/presentation/summarize.py +119 -79
  120. helm/benchmark/presentation/table.py +8 -8
  121. helm/benchmark/presentation/test_contamination.py +2 -2
  122. helm/benchmark/presentation/test_run_entry.py +1 -2
  123. helm/benchmark/presentation/test_summarize.py +3 -3
  124. helm/benchmark/run.py +54 -26
  125. helm/benchmark/run_expander.py +205 -35
  126. helm/benchmark/run_spec.py +93 -0
  127. helm/benchmark/run_spec_factory.py +163 -0
  128. helm/benchmark/run_specs/__init__.py +0 -0
  129. helm/benchmark/run_specs/classic_run_specs.py +1510 -0
  130. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  131. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  132. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  133. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  134. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  135. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  136. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  137. helm/benchmark/run_specs/vlm_run_specs.py +757 -0
  138. helm/benchmark/runner.py +51 -57
  139. helm/benchmark/runner_config_registry.py +21 -0
  140. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  141. helm/benchmark/scenarios/bold_scenario.py +2 -2
  142. helm/benchmark/scenarios/code_scenario.py +1 -0
  143. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  144. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  145. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  146. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  147. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  148. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  149. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  150. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  151. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  152. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  153. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  154. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  155. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  156. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  157. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  158. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  159. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  160. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  161. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  162. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  163. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  164. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  165. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  166. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  167. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  168. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  169. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  170. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  171. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  172. helm/benchmark/scenarios/legalbench_scenario.py +6 -2
  173. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  174. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  175. helm/benchmark/scenarios/math_scenario.py +19 -2
  176. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  177. helm/benchmark/scenarios/numeracy_scenario.py +1 -1
  178. helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
  179. helm/benchmark/scenarios/scenario.py +4 -0
  180. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  181. helm/benchmark/scenarios/test_math_scenario.py +6 -0
  182. helm/benchmark/scenarios/test_scenario.py +6 -3
  183. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  184. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  185. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  186. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  187. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  188. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  189. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  190. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
  191. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  192. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  193. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
  194. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  195. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  196. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  197. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  198. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  199. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  200. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  201. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  202. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  203. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  204. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  205. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  206. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  207. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  208. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  209. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  210. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  211. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  212. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  213. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  214. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  215. helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
  216. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  217. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  218. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  219. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +3 -4
  220. helm/benchmark/scenarios/vision_language/vqa_scenario.py +5 -3
  221. helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
  222. helm/benchmark/server.py +24 -1
  223. helm/benchmark/slurm_runner.py +70 -49
  224. helm/benchmark/static/benchmarking.js +1 -1
  225. helm/benchmark/static/schema_classic.yaml +258 -1066
  226. helm/benchmark/static/schema_image2structure.yaml +304 -0
  227. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  228. helm/benchmark/static/schema_lite.yaml +2 -227
  229. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  230. helm/benchmark/static/schema_unitxt.yaml +428 -0
  231. helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
  232. helm/benchmark/static/schema_vlm.yaml +823 -0
  233. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  234. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  235. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  236. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  237. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  238. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  239. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  240. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  241. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  242. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  243. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  244. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  245. helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
  246. helm/benchmark/static_build/assets/index-878a1094.css +1 -0
  247. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  248. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  249. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  250. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  251. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  252. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  253. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  254. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  255. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  256. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  257. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  258. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  259. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  260. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  261. helm/benchmark/static_build/config.js +4 -0
  262. helm/benchmark/static_build/index.html +20 -0
  263. helm/benchmark/test_data_preprocessor.py +3 -3
  264. helm/benchmark/test_run_expander.py +1 -1
  265. helm/benchmark/window_services/ai21_window_service.py +22 -33
  266. helm/benchmark/window_services/cohere_window_service.py +1 -63
  267. helm/benchmark/window_services/default_window_service.py +2 -44
  268. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  269. helm/benchmark/window_services/ice_window_service.py +0 -34
  270. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  271. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  272. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  273. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  274. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  275. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  276. helm/benchmark/window_services/local_window_service.py +21 -4
  277. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  278. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  279. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  280. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  281. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  282. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  283. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  284. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  285. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  286. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  287. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  288. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  289. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  290. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  291. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  292. helm/benchmark/window_services/test_utils.py +3 -2
  293. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  294. helm/benchmark/window_services/window_service.py +42 -0
  295. helm/benchmark/window_services/window_service_factory.py +4 -1
  296. helm/benchmark/window_services/yalm_window_service.py +0 -27
  297. helm/clients/__init__.py +0 -0
  298. helm/{proxy/clients → clients}/ai21_client.py +3 -9
  299. helm/clients/aleph_alpha_client.py +112 -0
  300. helm/{proxy/clients → clients}/anthropic_client.py +233 -18
  301. helm/{proxy/clients → clients}/auto_client.py +59 -31
  302. helm/clients/bedrock_client.py +128 -0
  303. helm/clients/bedrock_utils.py +72 -0
  304. helm/{proxy/clients → clients}/client.py +65 -7
  305. helm/clients/clip_score_client.py +49 -0
  306. helm/clients/clip_scorers/__init__.py +0 -0
  307. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  308. helm/clients/clip_scorers/clip_scorer.py +50 -0
  309. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  310. helm/{proxy/clients → clients}/cohere_client.py +4 -11
  311. helm/clients/gcs_client.py +82 -0
  312. helm/{proxy/clients → clients}/google_client.py +5 -5
  313. helm/clients/google_translate_client.py +35 -0
  314. helm/{proxy/clients → clients}/http_model_client.py +5 -7
  315. helm/{proxy/clients → clients}/huggingface_client.py +43 -64
  316. helm/clients/image_generation/__init__.py +0 -0
  317. helm/clients/image_generation/adobe_vision_client.py +78 -0
  318. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  319. helm/clients/image_generation/cogview2/__init__.py +0 -0
  320. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  321. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  322. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  323. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  324. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  325. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  326. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  327. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  328. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  329. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  330. helm/clients/image_generation/cogview2_client.py +191 -0
  331. helm/clients/image_generation/dalle2_client.py +192 -0
  332. helm/clients/image_generation/dalle3_client.py +108 -0
  333. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  334. helm/clients/image_generation/dalle_mini/data.py +442 -0
  335. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  336. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  337. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  338. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  339. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  340. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  341. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  342. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  343. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  344. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  345. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  346. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  347. helm/clients/image_generation/dalle_mini_client.py +190 -0
  348. helm/clients/image_generation/deep_floyd_client.py +78 -0
  349. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  350. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  351. helm/clients/image_generation/lexica_client.py +86 -0
  352. helm/clients/image_generation/mindalle/__init__.py +0 -0
  353. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  354. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  355. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  356. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  357. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  358. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  359. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  360. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  361. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  362. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  363. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  364. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  365. helm/clients/image_generation/mindalle_client.py +115 -0
  366. helm/clients/image_generation/nudity_check_client.py +64 -0
  367. helm/clients/image_generation/together_image_generation_client.py +111 -0
  368. helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
  369. helm/{proxy/clients → clients}/megatron_client.py +5 -5
  370. helm/clients/mistral_client.py +134 -0
  371. helm/clients/moderation_api_client.py +109 -0
  372. helm/clients/open_lm_client.py +43 -0
  373. helm/clients/openai_client.py +301 -0
  374. helm/{proxy/clients → clients}/palmyra_client.py +6 -8
  375. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  376. helm/clients/simple_client.py +64 -0
  377. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  378. helm/clients/test_client.py +100 -0
  379. helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
  380. helm/clients/test_simple_client.py +19 -0
  381. helm/{proxy/clients → clients}/test_together_client.py +20 -8
  382. helm/{proxy/clients → clients}/together_client.py +104 -73
  383. helm/clients/vertexai_client.py +400 -0
  384. helm/clients/vision_language/__init__.py +0 -0
  385. helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
  386. helm/clients/vision_language/huggingface_vlm_client.py +111 -0
  387. helm/{proxy/clients → clients}/vision_language/idefics_client.py +54 -49
  388. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  389. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  390. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  391. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  392. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  393. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  394. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  395. helm/clients/vision_language/open_flamingo_client.py +155 -0
  396. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  397. helm/clients/vllm_client.py +46 -0
  398. helm/common/cache.py +16 -4
  399. helm/common/cache_backend_config.py +47 -0
  400. helm/common/clip_score_request.py +41 -0
  401. helm/common/file_caches/__init__.py +0 -0
  402. helm/common/file_caches/file_cache.py +16 -0
  403. helm/common/file_caches/local_file_cache.py +61 -0
  404. helm/common/file_caches/test_local_file_cache.py +25 -0
  405. helm/common/file_upload_request.py +27 -0
  406. helm/common/general.py +1 -1
  407. helm/common/image_generation_parameters.py +25 -0
  408. helm/common/images_utils.py +33 -3
  409. helm/common/key_value_store.py +35 -4
  410. helm/common/media_object.py +13 -0
  411. helm/common/moderations_api_request.py +71 -0
  412. helm/common/mongo_key_value_store.py +3 -3
  413. helm/common/multimodal_request_utils.py +31 -0
  414. helm/common/nudity_check_request.py +29 -0
  415. helm/common/request.py +15 -17
  416. helm/common/test_general.py +6 -0
  417. helm/common/tokenization_request.py +1 -1
  418. helm/config/model_deployments.yaml +1159 -538
  419. helm/config/model_metadata.yaml +868 -41
  420. helm/config/tokenizer_configs.yaml +149 -43
  421. helm/proxy/accounts.py +31 -4
  422. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  423. helm/proxy/critique/model_critique_client.py +8 -6
  424. helm/proxy/example_queries.py +29 -17
  425. helm/proxy/server.py +70 -5
  426. helm/proxy/services/remote_service.py +31 -0
  427. helm/proxy/services/server_service.py +96 -16
  428. helm/proxy/services/service.py +30 -0
  429. helm/proxy/services/test_remote_service.py +4 -3
  430. helm/proxy/services/test_service.py +0 -12
  431. helm/proxy/test_accounts.py +32 -0
  432. helm/proxy/token_counters/auto_token_counter.py +37 -37
  433. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  434. helm/proxy/token_counters/token_counter.py +3 -5
  435. helm/tokenizers/__init__.py +0 -0
  436. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  437. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
  438. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
  439. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  440. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  441. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
  442. helm/tokenizers/simple_tokenizer.py +33 -0
  443. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
  444. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
  445. helm/tokenizers/test_simple_tokenizer.py +33 -0
  446. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  447. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  448. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  449. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  450. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  451. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  452. helm/benchmark/run_specs.py +0 -2762
  453. helm/benchmark/test_model_deployment_definition.py +0 -92
  454. helm/benchmark/test_model_properties.py +0 -1570
  455. helm/benchmark/vlm_run_specs.py +0 -97
  456. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  457. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  458. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  459. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  460. helm/benchmark/window_services/t511b_window_service.py +0 -30
  461. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  462. helm/benchmark/window_services/ul2_window_service.py +0 -30
  463. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  464. helm/common/cache_utils.py +0 -14
  465. helm/proxy/clients/aleph_alpha_client.py +0 -95
  466. helm/proxy/clients/goose_ai_client.py +0 -99
  467. helm/proxy/clients/microsoft_client.py +0 -180
  468. helm/proxy/clients/openai_client.py +0 -206
  469. helm/proxy/clients/simple_client.py +0 -60
  470. helm/proxy/clients/test_client.py +0 -49
  471. helm/proxy/clients/vertexai_client.py +0 -115
  472. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  473. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  474. helm/proxy/token_counters/free_token_counter.py +0 -12
  475. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  476. helm/proxy/token_counters/openai_token_counter.py +0 -22
  477. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  478. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  479. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  480. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
  481. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
  482. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0
  483. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  484. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  485. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  486. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  487. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  488. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  489. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  490. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  491. /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
  492. /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
  493. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  494. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  495. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  496. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  497. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  498. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  499. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
helm/benchmark/metrics/evaluate_reference_metrics.py
@@ -0,0 +1,392 @@
+from dataclasses import replace
+from typing import Callable, Dict, List, Optional, Set, Tuple, cast
+import numpy as np
+from functools import partial
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.cleva_metrics_helper import ChineseTokenizer
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.scenarios.code_scenario import CodeReference
+from helm.benchmark.scenarios.scenario import Reference
+from helm.common.optional_dependencies import handle_module_not_found_error
+from helm.common.request import GeneratedOutput
+from helm.benchmark.scenarios.math_scenario import is_equiv, is_equiv_chain_of_thought
+from nltk.metrics.scores import f_measure
+from nltk.translate.bleu_score import sentence_bleu
+from nltk.tokenize import word_tokenize
+from rouge_score import rouge_scorer
+import re
+import string
+from . import code_metrics_helper
+import nltk
+
+
+try:
+    nltk.data.find("tokenizers/punkt")
+except LookupError:
+    nltk.download("punkt")  # Required for rouge
+
+
+def pass_at_k_estimator(n: int, c: int, k: int) -> float:
+    """Calculates 1 - comb(n - c, k) / comb(n, k).
+
+    Numerically stable version defined in
+    https://arxiv.org/pdf/2107.03374.pdf
+    """
+    if n - c < k:
+        return 1.0
+    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
+
+
+def normalize_text(text: str) -> str:
+    """Lower text and remove punctuation, articles and extra whitespace.
+    Copied from the [QuAC](http://quac.ai/) evaluation script found at
+    https://s3.amazonaws.com/my89public/quac/scorer.py"""
+
+    def remove_articles(text: str) -> str:
+        return re.sub(r"\b(a|an|the)\b", " ", text)
+
+    def white_space_fix(text: str) -> str:
+        return " ".join(text.split())
+
+    def remove_punc(text: str) -> str:
+        exclude = set(string.punctuation)
+        return "".join(ch for ch in text if ch not in exclude)
+
+    def lower(text: str) -> str:
+        return text.lower()
+
+    return white_space_fix(remove_articles(remove_punc(lower(text))))
+
+
+def exact_match(gold: str, pred: str) -> float:
+    if not pred:
+        return 0
+
+    return 1 if gold.strip() == pred.strip() else 0
+
+
+def quasi_exact_match(gold: str, pred: str) -> float:
+    if not pred:
+        return 0
+
+    return 1 if normalize_text(gold) == normalize_text(pred) else 0
+
+
+def prefix_exact_match(gold: str, pred: str) -> float:
+    """
+    The `prefix_exact_match` metric is particularly useful in the zero-shot setting, where the model is
+    not given examples of the expected outputs and tends to output more tokens than it should.
+
+    For example, for this zero-shot prompt from BoolQ,
+
+    Passage: Elmendorf Air Force Base (IATA: EDF, ICAO: PAED, FAA LID: EDF) is a United States military facility
+    in Anchorage, the largest city in Alaska. Originally known as Elmendorf Field, it became Elmendorf Air Force
+    Base after World War II, and in 2010 it merged with nearby Fort Richardson to form Joint Base Elmendorf-Richardson.
+    Question: Is there an air force base in anchorage alaska?
+    Answer:
+
+    the model could output up to `max_tokens` number of tokens "Yes, Elmendorf" instead of just "Yes".
+    """
+    if not pred:
+        return 0
+
+    return 1 if pred.strip().startswith(gold.strip()) else 0
+
+
+def quasi_prefix_exact_match(gold: str, pred: str) -> float:
+    """
+    Same thing as `prefix_exact_match` but we normalize the text before checking if the prefix match.
+    """
+    if not pred:
+        return 0
+
+    return 1 if normalize_text(pred).startswith(normalize_text(gold)) else 0
+
+
+def f1_score(gold: str, pred: str) -> float:
+    ret = f_measure(set(normalize_text(gold).split()), set(normalize_text(pred).split()))
+    if ret is None:  # answer is the empty string after normalizing
+        return 0.0
+
+    return ret
+
+
+def exact_match_indicator(gold: str, pred: str, indicator: str = " ") -> float:
+    """
+    Exact match, allowing for some preceding context.
+    For example, the following two answers are considered matching:
+    - Because of x and y, the answer is ## <answer>
+    - Given reasons y and z, the answer is ## <answer>
+    While the following is considered different from the earlier two
+    - Given reasons x and a, the answer is ## <other answer>
+    """
+    pred = pred.split(indicator)[-1].strip()
+    gold = gold.split(indicator)[-1].strip()
+    return exact_match(gold, pred)
+
+
+def final_number_exact_match(gold: str, pred: str) -> float:
+    """
+    Returns 1 iff the final number in gold and pred match.
+    Similar to exact_match_indicator.
+    Example:
+    - gold = "The answer is 15."
+    - pred = "The answer is 15 eggs."
+    - Returns 1
+    """
+
+    def get_final_number(x: str) -> str:
+        matches = re.findall(r"-?[\d,]+(?:.\d+)?", x)
+        if not matches:
+            return ""
+        return matches[-1].replace(",", "")
+
+    return exact_match(get_final_number(gold), get_final_number(pred))
+
+
+def rouge_score(gold: str, pred: str, rouge_type: str, scorer: rouge_scorer.RougeScorer) -> float:
+    scores = scorer.score(gold, pred)
+    return scores[rouge_type].fmeasure
+
+
+def get_rouge_function(rouge_type: str) -> Callable[[str, str], float]:
+    scorer = rouge_scorer.RougeScorer([rouge_type], use_stemmer=True)
+    return partial(rouge_score, scorer=scorer, rouge_type=rouge_type)
+
+
+def bleu_1(gold: str, pred: str) -> float:
+    return sentence_bleu([word_tokenize(gold)], word_tokenize(pred), weights=(1, 0, 0, 0))
+
+
+def chinese_bleu_1(gold: str, pred: str) -> float:
+    char_tokenizer = ChineseTokenizer()
+    return sentence_bleu([char_tokenizer.tokenize(gold)], char_tokenizer.tokenize(pred), weights=(1, 0, 0, 0))
+
+
+def get_chinese_rouge_function(rouge_type: str) -> Callable[[str, str], float]:
+    char_tokenizer = ChineseTokenizer()
+    scorer = rouge_scorer.RougeScorer([rouge_type], use_stemmer=True, tokenizer=char_tokenizer)
+    return partial(rouge_score, scorer=scorer, rouge_type=rouge_type)
+
+
+def cleva_math_result_match(gold: str, pred: str) -> float:
+    """
+    Exact match that only cares the last math expression.
+    Common math expressions are numbers and fractions.
+    """
+    pattern = r"[-+*/%\.\(\)\d]+"
+    matches = re.findall(pattern, pred)
+    if matches:
+        pred = matches[-1].lstrip(")")
+    # remove space in front or at the end
+    pred = pred.strip()
+    return exact_match(gold, pred)
+
+
+def bleu_4(gold: str, pred: str) -> float:
+    return sentence_bleu([word_tokenize(gold)], word_tokenize(pred), weights=(0, 0, 0, 1))
+
+
+def cider(gold: str, pred: str) -> float:
+    try:
+        from pycocoevalcap.cider.cider import Cider
+    except ModuleNotFoundError as e:
+        handle_module_not_found_error(e, ["vlm"])
+
+    cider_evaluator = Cider()
+    candidate = {"caption": [pred]}
+    reference = {"caption": [gold]}
+    average_score, _ = cider_evaluator.compute_score(reference, candidate)
+    return average_score
+
+
+def extract_set_from_text(
+    set_str: str,
+    set_start_str: str = " is ",
+    set_separator: str = " and ",
+    empty_set_str: str = "Nothing.",
+) -> Set[str]:
+    """
+    Given a string, extract the set of strings implied by that string.
+    set_start_str denotes the start of the set
+    set_separator denotes the string separating set elements
+    empty_set_str is the string which denotes the empty set
+    """
+    if set_str == empty_set_str:
+        return set()
+    set_str = set_str.replace(".", "")
+    extracted_set = set(set_str.split(set_start_str)[-1].split(set_separator))
+    return extracted_set
+
+
+def extract_gold_pred_sets(gold: str, pred: str) -> Tuple[Set[str], Set[str]]:
+    """Extract the set of strings implied by the gold and pred strings"""
+    gold_set = extract_set_from_text(gold)
+    pred_set = extract_set_from_text(pred.split("\n")[0])
+    return gold_set, pred_set
+
+
+def iou_set_match(gold: str, pred: str) -> float:
+    """Compute the intersection over union of the gold and pred sets"""
+    gold_set, pred_set = extract_gold_pred_sets(gold, pred)
+    if len(gold_set) == 0:  # If gold is empty, just check if the pred set is also empty
+        return float(gold_set == pred_set)
+    return len(gold_set.intersection(pred_set)) / len(gold_set.union(pred_set))
+
+
+def f1_set_match(gold: str, pred: str) -> float:
+    """Compute the F1 score of the gold and pred sets"""
+    gold_set, pred_set = extract_gold_pred_sets(gold, pred)
+    if len(gold_set) == 0:  # If gold is empty, just check if the pred set is also empty
+        return float(gold_set == pred_set)
+    true_positives = gold_set.intersection(pred_set)
+    return 2 * len(true_positives) / (len(gold_set) + len(pred_set))
+
+
+def exact_set_match(gold: str, pred: str) -> float:
+    """Compute whether the sets generated exactly match"""
+    gold_set, pred_set = extract_gold_pred_sets(gold, pred)
+    return float(gold_set == pred_set)
+
+
+def absolute_value_difference(gold: str, pred: str) -> float:
+    """Compute the absolute value of the difference between two numbers (provided as strings),
+    or 0.0 if invalid input.
+    """
+
+    def maybe_int(text: str):
+        """Parse int, ignoring commas in numbers."""
+        try:
+            val = int(text.replace(",", ""))
+        except ValueError:
+            return 0.0
+        return val
+
+    gold_val = maybe_int(gold)
+    pred_val = maybe_int(pred)
+    return abs(gold_val - pred_val)
+
+
+def code_eval(gold: Tuple[str, Optional[Dict]], pred: str) -> float:
+    """Evaluate Code Correctness on test examples."""
+    assert gold[1] is not None  # gold[1]["canonical_solution"]
+    # Warning: will execute machine generated code; need to sandbox before executing
+    return float(code_metrics_helper.check_correctness(gold[1], pred, 3.0)["passed"])  # type: ignore
+
+
+# TODO This should probably be made into an implementation of MetricInterface. For now it lives here
+# just to separate it from basic_metrics.py.
+def compute_reference_metrics(
+    names: List[str], adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService
+) -> List[Stat]:
+    """
+    Setup:
+
+    - Gold (correct references): G1 ... Gm
+    - Predictions (completions): P1 ... Pk
+
+    For each pair (G, P), we can define a ${score} (e.g., exact match, F1, BLEU).
+
+    We define the following stats:
+
+    - ${score}: max_i score(Gi, P1)
+    - ${score}@k: max_{i,j} score(Gi, Pj)
+    """
+
+    def compute_metrics_helper(
+        name: MetricName,
+        score_func: Callable,
+        group: Optional[str] = None,
+    ) -> List[Stat]:
+        if name.name == "pass":  # Calculate pass@k for HumanEval from CodeScenario.
+            score_func = cast(Callable[[Tuple[str, Optional[Dict]], str], float], score_func)  # Make mypy happy.
+            code_golds = cast(List[CodeReference], golds)
+            results = [score_func((gold.output.text, gold.test_cases), pred) for gold in code_golds for pred in preds]
+            _len, _sum = len(results), int(sum(results))  # Cast to int to make type match.
+            score_1 = pass_at_k_estimator(_len, _sum, 1)
+            score_k = pass_at_k_estimator(_len, _sum, adapter_spec.num_outputs)
+        elif name.name == "code_eval_acc":
+            score_func = cast(Callable[[Tuple[str, Optional[Dict]], str], float], score_func)  # Make mypy happy.
+            code_golds = cast(List[CodeReference], golds)
+            score_1 = max(score_func((gold.output.text, gold.test_cases), preds[0]) for gold in code_golds)
+            score_k = max(
+                score_func((gold.output.text, gold.test_cases), pred) for gold in code_golds for pred in preds
+            )
+        else:
+            score_func = cast(Callable[[str, str], float], score_func)  # Make mypy happy.
+            score_1 = max(score_func(gold.output.text, preds[0]) for gold in golds)
+            score_k = max(score_func(gold.output.text, pred) for gold in golds for pred in preds)
+
+        metrics = [Stat(name).add(score_1)]  # score_1 corresponds using one prediction
+        if adapter_spec.num_outputs != 1:
+            metrics.append(Stat(replace(name, name=f"{name.name}@{adapter_spec.num_outputs}")).add(score_k))
+        return metrics
+
+    # maps each string metric name to its associated function
+    metric_fn_mapping: Dict[str, Callable] = {
+        "exact_match": exact_match,
+        "quasi_exact_match": quasi_exact_match,
+        "prefix_exact_match": prefix_exact_match,
+        "quasi_prefix_exact_match": quasi_prefix_exact_match,
+        "exact_match_indicator": exact_match_indicator,
+        "final_number_exact_match": final_number_exact_match,
+        "exact_set_match": exact_set_match,
+        "iou_set_match": iou_set_match,
+        "f1_set_match": f1_set_match,
+        "math_equiv": is_equiv,
+        "math_equiv_chain_of_thought": is_equiv_chain_of_thought,
+        "code_eval_acc": code_eval,
+        "pass": code_eval,
+        "cider": cider,
+        "f1_score": f1_score,
+        "rouge_1": get_rouge_function("rouge1"),
+        "rouge_2": get_rouge_function("rouge2"),
+        "rouge_l": get_rouge_function("rougeL"),
+        "bleu_1": bleu_1,
+        "bleu_4": bleu_4,
+        "chinese_bleu_1": chinese_bleu_1,
+        "chinese_rouge_1": get_chinese_rouge_function("rouge1"),
+        "chinese_rouge_2": get_chinese_rouge_function("rouge2"),
+        "cleva_math_result_match": cleva_math_result_match,
+        "absolute_value_difference": absolute_value_difference,
+    }
+
+    stats: List[Stat] = []
+
+    # Gold outputs
+    golds: List[Reference] = [reference for reference in request_state.instance.references if reference.is_correct]
+    assert len(golds) > 0
+
+    # Predicted outputs
+    assert request_state.result is not None
+    sorted_completions: List[GeneratedOutput] = sorted(request_state.result.completions, key=lambda x: -x.logprob)
+    preds: List[str] = [completion.text.strip() for completion in sorted_completions]
+
+    # Apply mapping if exists (e.g., for multiple-choice questions A -> Boston, B -> New York)
+    # Note: If 'A' and 'B' were the only possible choices, smaller language models like GPT-2 would
+    # sometimes predict a random letter like 'M'.
+    if request_state.output_mapping is not None:
+        preds = [request_state.output_mapping.get(pred) for pred in preds]  # type: ignore
+
+    # Compute max_prob, the probability that the model assigns to its generated text.
+    # Use the log prob of sorted_completions[0], which is the completion with the highest
+    # log_prob. We use this since that's what's used for computing metrics like exact_match.
+    # One subtlety is that when computing exact_match, we strip whitespace, so the actual
+    # max_prob is the sum of all the probabilities in the set {x : strip(x) = prediction}.
+    # In practice, we think this may not make much of a difference because models may not place
+    # high probabilities on having additional spaces (should check this). Also, the sum
+    # involves computing the log_prob for many completions which could be intractable.
+    max_prob = np.exp(sorted_completions[0].logprob)
+    stats.append(Stat(MetricName("max_prob")).add(max_prob))

+    # Add other metrics
+    for metric_name in names:
+        if metric_name in metric_fn_mapping:
+            stats.extend(compute_metrics_helper(MetricName(metric_name), metric_fn_mapping[metric_name]))
+        else:
+            raise NameError(f"{metric_name} is not in the list of metric functions.")
+
+    return stats
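
Note on the pass@k estimator above: it avoids large binomial coefficients by using the identity 1 - C(n-c, k)/C(n, k) = 1 - prod_{i=n-c+1..n} (1 - k/i). A minimal sketch of how these helpers behave, assuming crfm-helm 0.5.1 is installed (the module path is taken from the new helm/benchmark/metrics/evaluate_reference_metrics.py in the file list above):

from helm.benchmark.metrics.evaluate_reference_metrics import (
    pass_at_k_estimator,
    quasi_exact_match,
)

# pass@1 with n=5 samples of which c=2 are correct: 1 - C(3,1)/C(5,1) = 0.4
print(pass_at_k_estimator(n=5, c=2, k=1))  # 0.4 (up to float error)

# normalize_text lowercases and strips punctuation, articles, and extra whitespace
print(quasi_exact_match(gold="The Answer.", pred="answer"))  # 1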
helm/benchmark/metrics/image_generation/aesthetics_metrics.py
@@ -0,0 +1,54 @@
+from statistics import mean
+from typing import List, Optional
+
+from helm.common.images_utils import is_blacked_out_image
+from helm.common.request import RequestResult
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from .aesthetics_scorer import AestheticsScorer
+from helm.common.multimodal_request_utils import gather_generated_image_locations
+
+
+class AestheticsMetric(Metric):
+    """
+    Defines metrics for LAION's CLIP-based aesthetics predictor for images
+    (https://github.com/LAION-AI/aesthetic-predictor).
+    """
+
+    def __init__(self):
+        self._aesthetics_scorer: Optional[AestheticsScorer] = None
+
+    def __repr__(self):
+        return "AestheticsMetric()"
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.result is not None
+        request_result: RequestResult = request_state.result
+        image_locations: List[str] = gather_generated_image_locations(request_result)
+        if len(image_locations) == 0:
+            return []
+
+        if self._aesthetics_scorer is None:
+            self._aesthetics_scorer = AestheticsScorer()
+
+        # Compute the aesthetics score for each generated image. Skip blacked out images.
+        scores: List[float] = [
+            self._aesthetics_scorer.compute_aesthetics_score(location)
+            for location in image_locations
+            if not is_blacked_out_image(location)
+        ]
+        stats: List[Stat] = [
+            Stat(MetricName("expected_aesthetics_score")).add(mean(scores) if len(scores) > 0 else 0),
+            Stat(MetricName("max_aesthetics_score")).add(max(scores) if len(scores) > 0 else 0),
+        ]
+        return stats
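
The aggregation above is just the mean ("expected") and max over per-image scores, with blacked-out images skipped and 0 reported when no usable image remains; a trivial illustration with made-up scores:

from statistics import mean

scores = [4.2, 5.7, 5.1]  # illustrative per-image aesthetics scores
print(mean(scores) if len(scores) > 0 else 0)  # expected_aesthetics_score -> 5.0
print(max(scores) if len(scores) > 0 else 0)  # max_aesthetics_score -> 5.7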
helm/benchmark/metrics/image_generation/aesthetics_scorer.py
@@ -0,0 +1,66 @@
+from urllib.request import urlretrieve
+import os
+
+import torch
+
+from helm.common.general import ensure_directory_exists
+from helm.common.gpu_utils import get_torch_device
+from helm.common.images_utils import open_image
+from helm.common.optional_dependencies import handle_module_not_found_error
+from helm.benchmark.runner import get_cached_models_path
+
+
+class AestheticsScorer:
+    """
+    LAION's CLIP-based aesthetics predictor for images (https://github.com/LAION-AI/aesthetic-predictor).
+    Adapted from
+    https://colab.research.google.com/github/LAION-AI/aesthetic-predictor/blob/main/asthetics_predictor.ipynb.
+    """
+
+    MODEL_URL_TEMPLATE: str = (
+        "https://github.com/LAION-AI/aesthetic-predictor/blob/main/sa_0_4_{clip_model}_linear.pth?raw=true"
+    )
+
+    @staticmethod
+    def load_model(clip_model="vit_l_14"):
+        """Load the aesthetics model."""
+        cache_folder: str = os.path.join(get_cached_models_path(), "emb_reader")
+        ensure_directory_exists(cache_folder)
+        model_path: str = os.path.join(cache_folder, f"sa_0_4_{clip_model}_linear.pth")
+
+        if not os.path.exists(model_path):
+            model_url: str = os.path.join(AestheticsScorer.MODEL_URL_TEMPLATE.format(clip_model=clip_model))
+            urlretrieve(model_url, model_path)
+
+        if clip_model == "vit_l_14":
+            m = torch.nn.Linear(768, 1)
+        elif clip_model == "vit_b_32":
+            m = torch.nn.Linear(512, 1)
+        else:
+            raise ValueError(f"Invalid model: {clip_model}")
+
+        s = torch.load(model_path)
+        m.load_state_dict(s)
+        m.eval()
+        return m
+
+    def __init__(self):
+        try:
+            import clip
+        except ModuleNotFoundError as e:
+            handle_module_not_found_error(e, ["heim"])
+
+        # Load the CLIP and aesthetics model
+        self._device: torch.device = get_torch_device()
+        self._model, self._preprocess = clip.load("ViT-L/14", device=self._device)
+        self._aesthetics_model = self.load_model().to(self._device)
+
+    def compute_aesthetics_score(self, image_location: str) -> float:
+        """
+        Compute the aesthetics score. Returns a value between 1 and 10.
+        """
+        image = self._preprocess(open_image(image_location)).unsqueeze(0).to(self._device)
+        with torch.no_grad():
+            image_features = self._model.encode_image(image)
+            image_features /= image_features.norm(dim=-1, keepdim=True)
+        return self._aesthetics_model(image_features.float()).detach().item()
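
A minimal usage sketch for the scorer, assuming the optional "heim" dependencies (torch and OpenAI's clip package) are installed; "generated.png" is a hypothetical local image path:

from helm.benchmark.metrics.image_generation.aesthetics_scorer import AestheticsScorer

scorer = AestheticsScorer()  # loads CLIP ViT-L/14 and fetches the linear head on first use
score = scorer.compute_aesthetics_score("generated.png")  # hypothetical image path
print(f"aesthetics score (roughly 1 to 10): {score:.2f}")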
helm/benchmark/metrics/image_generation/clip_score_metrics.py
@@ -0,0 +1,73 @@
+from statistics import mean
+from typing import List
+
+from helm.common.general import singleton
+from helm.common.request import RequestResult
+from helm.common.clip_score_request import DEFAULT_CLIP_SCORE_MODEL, CLIPScoreResult, CLIPScoreRequest
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
+from helm.common.images_utils import is_blacked_out_image
+from helm.common.multimodal_request_utils import gather_generated_image_locations
+
+
+class CLIPScoreMetric(Metric):
+    """
+    Defines CLIPScore-based metrics (https://arxiv.org/abs/2104.08718).
+    CLIPScore is a reference free metric that can be used to evaluate the correlation between an image
+    caption and the content of the image. It has been found to be highly correlated with human judgement.
+    """
+
+    def __init__(self, multilingual: bool = False):
+        self._multilingual: bool = multilingual
+
+    def __repr__(self):
+        return f"CLIPScoreMetric(multilingual={self._multilingual})"
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        def get_metric_name(base_name: str) -> str:
+            if self._multilingual:
+                base_name = f"{base_name}_multilingual"
+            return base_name
+
+        assert request_state.result is not None
+        request_result: RequestResult = request_state.result
+
+        prompt: str = request_state.request.prompt
+        perturbation_name: str = request_state.instance.perturbation.name if request_state.instance.perturbation else ""
+        if (
+            request_state.instance.contrast_inputs is not None
+            and len(request_state.instance.contrast_inputs) > 0
+            and perturbation_name in ["translate", "dialect", "mild_mix"]
+        ):
+            prompt = singleton(request_state.instance.contrast_inputs).text
+
+        # Truncate the prompt using the CLIP tokenizer before feeding into the CLIP model.
+        # Otherwise, the library will throw an error.
+        model = DEFAULT_CLIP_SCORE_MODEL
+        prompt = WindowServiceFactory.get_window_service(model, metric_service).truncate_from_right(prompt)
+
+        scores: List[float] = []
+        image_locations: List[str] = gather_generated_image_locations(request_result)
+        for location in image_locations:
+            if not is_blacked_out_image(location):
+                result: CLIPScoreResult = metric_service.compute_clip_score(
+                    CLIPScoreRequest(prompt, location, model=model, multilingual=self._multilingual)
+                )
+                scores.append(result.score)
+
+        stats: List[Stat] = [
+            Stat(MetricName(get_metric_name("expected_clip_score"))).add(mean(scores) if len(scores) > 0 else 0),
+            Stat(MetricName(get_metric_name("max_clip_score"))).add(max(scores) if len(scores) > 0 else 0),
+        ]
+        return stats
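
The request construction below mirrors the compute_clip_score call in evaluate_generation above; a sketch assuming an already-initialized MetricService, with a hypothetical caption and image path:

from helm.common.clip_score_request import DEFAULT_CLIP_SCORE_MODEL, CLIPScoreRequest

request = CLIPScoreRequest(
    "a photo of a red bicycle",  # caption; the metric truncates it to the CLIP window first
    "generated.png",  # hypothetical image location
    model=DEFAULT_CLIP_SCORE_MODEL,
    multilingual=False,
)
# result = metric_service.compute_clip_score(request)  # needs a configured MetricService
# print(result.score)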
helm/benchmark/metrics/image_generation/denoised_runtime_metric.py
@@ -0,0 +1,42 @@
+from collections import defaultdict
+from tqdm import tqdm
+from typing import Dict
+import math
+import numpy as np
+
+from helm.common.request import RequestResult
+from helm.benchmark.scenarios.scenario import Instance
+from helm.benchmark.adaptation.scenario_state import ScenarioState
+from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.metrics.metric import MetricInterface, MetricResult
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+
+
+class DenoisedRuntimeMetric(MetricInterface):
+    def __repr__(self):
+        return "DenoisedRuntimeMetric()"
+
+    def evaluate(
+        self,
+        scenario_state: ScenarioState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+        parallelism: int,
+    ) -> MetricResult:
+
+        instance_to_min_request_times: Dict[Instance, float] = defaultdict(lambda: math.inf)
+        for request_state in tqdm(scenario_state.request_states):
+            assert request_state.result is not None
+            request_result: RequestResult = request_state.result
+
+            assert request_result.request_time is not None
+            request_time: float = request_result.request_time
+
+            instance: Instance = request_state.instance
+            instance_to_min_request_times[instance] = min(instance_to_min_request_times[instance], request_time)
+
+        denoised_runtime: float = float(np.mean(list(instance_to_min_request_times.values())))
+        return MetricResult(
+            aggregated_stats=[Stat(MetricName("denoised_runtime")).add(denoised_runtime)], per_instance_stats=[]
+        )
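
The denoising rule above: per instance, keep the fastest observed request time (dropping queueing and network noise across repeated requests), then average over instances. A self-contained sketch with made-up timings:

import math
from collections import defaultdict
from typing import DefaultDict

timings = [("q1", 1.9), ("q1", 0.8), ("q2", 1.1), ("q2", 1.4)]  # (instance, seconds), illustrative
fastest: DefaultDict[str, float] = defaultdict(lambda: math.inf)
for instance, seconds in timings:
    fastest[instance] = min(fastest[instance], seconds)

print(sum(fastest.values()) / len(fastest))  # (0.8 + 1.1) / 2 = 0.95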
helm/benchmark/metrics/image_generation/detection_metrics.py
@@ -0,0 +1,57 @@
+from typing import List, Dict, Any
+import json
+from statistics import mean
+
+from helm.common.request import RequestResult
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.common.multimodal_request_utils import gather_generated_image_locations
+from .detectors.vitdet import ViTDetDetector
+
+
+class DetectionMetric(Metric):
+    """
+    Define metrics following DALL-EVAL (https://arxiv.org/abs/2202.04053),
+    which measure whether generated images contain the correct objects, counts, and relations
+    as specified in input text prompts.
+    """
+
+    def __init__(self):
+        self._detection_model = None
+
+    def __repr__(self):
+        return "DetectionMetric()"
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.result is not None
+        request_result: RequestResult = request_state.result
+        image_locations: List[str] = gather_generated_image_locations(request_result)
+        if len(image_locations) == 0:
+            return []
+
+        if self._detection_model is None:
+            self._detection_model = ViTDetDetector()
+
+        instance = request_state.instance
+        references: Dict[str, Any] = {**json.loads(instance.references[0].output.text), "skill": instance.sub_split}
+
+        prompt: str = request_state.request.prompt
+        scores: List[float] = []
+        for image_location in image_locations:
+            score: float = self._detection_model.compute_score(prompt, image_location, references)
+            scores.append(score)
+
+        stats: List[Stat] = [
+            Stat(MetricName("detection_correct_frac")).add(mean(scores) if len(scores) > 0 else 0),
+        ]
+        return stats