crfm-helm 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
Files changed (482)
  1. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +134 -31
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +31 -3
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
  9. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
  10. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  11. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
  12. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  13. helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
  14. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
  15. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
  16. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  17. helm/benchmark/adaptation/request_state.py +6 -1
  18. helm/benchmark/adaptation/scenario_state.py +6 -2
  19. helm/benchmark/annotation/annotator.py +43 -0
  20. helm/benchmark/annotation/annotator_factory.py +61 -0
  21. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  22. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  23. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  24. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  25. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  26. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  27. helm/benchmark/annotation_executor.py +124 -0
  28. helm/benchmark/augmentations/data_augmenter.py +0 -2
  29. helm/benchmark/augmentations/gender_perturbation.py +1 -1
  30. helm/benchmark/augmentations/perturbation.py +8 -2
  31. helm/benchmark/augmentations/perturbation_description.py +1 -1
  32. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  33. helm/benchmark/augmentations/test_perturbation.py +11 -7
  34. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  35. helm/benchmark/config_registry.py +7 -1
  36. helm/benchmark/executor.py +46 -16
  37. helm/benchmark/huggingface_registration.py +20 -7
  38. helm/benchmark/metrics/basic_metrics.py +169 -664
  39. helm/benchmark/metrics/bbq_metrics.py +3 -4
  40. helm/benchmark/metrics/bias_metrics.py +6 -6
  41. helm/benchmark/metrics/classification_metrics.py +11 -8
  42. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  43. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  44. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  45. helm/benchmark/metrics/common_metric_specs.py +167 -0
  46. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  47. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  48. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  49. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  50. helm/benchmark/metrics/disinformation_metrics.py +4 -110
  51. helm/benchmark/metrics/dry_run_metrics.py +2 -2
  52. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  53. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  54. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  55. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  56. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  57. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  58. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  59. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  60. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  61. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  62. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  63. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  64. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  65. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  66. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  67. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  68. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  69. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  70. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  71. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  72. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  73. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  74. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  75. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  76. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  77. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  78. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  79. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  80. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  81. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  82. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  83. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  84. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  85. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  86. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  87. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  88. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  89. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  90. helm/benchmark/metrics/metric.py +93 -172
  91. helm/benchmark/metrics/metric_name.py +0 -1
  92. helm/benchmark/metrics/metric_service.py +16 -0
  93. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  94. helm/benchmark/metrics/ranking_metrics.py +2 -2
  95. helm/benchmark/metrics/reference_metric.py +148 -0
  96. helm/benchmark/metrics/summac/model_summac.py +0 -2
  97. helm/benchmark/metrics/summarization_metrics.py +2 -2
  98. helm/benchmark/metrics/test_classification_metrics.py +8 -5
  99. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  100. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
  101. helm/benchmark/metrics/test_metric.py +2 -2
  102. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
  103. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  104. helm/benchmark/metrics/toxicity_utils.py +23 -0
  105. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  106. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  107. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  108. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  109. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  110. helm/benchmark/model_deployment_registry.py +74 -0
  111. helm/benchmark/model_metadata_registry.py +36 -0
  112. helm/benchmark/multi_gpu_runner.py +133 -0
  113. helm/benchmark/presentation/create_plots.py +8 -7
  114. helm/benchmark/presentation/run_display.py +26 -10
  115. helm/benchmark/presentation/schema.py +15 -40
  116. helm/benchmark/presentation/summarize.py +119 -79
  117. helm/benchmark/presentation/table.py +8 -8
  118. helm/benchmark/presentation/test_contamination.py +2 -2
  119. helm/benchmark/presentation/test_run_entry.py +1 -2
  120. helm/benchmark/presentation/test_summarize.py +3 -3
  121. helm/benchmark/run.py +54 -26
  122. helm/benchmark/run_expander.py +214 -16
  123. helm/benchmark/run_spec.py +93 -0
  124. helm/benchmark/run_spec_factory.py +162 -0
  125. helm/benchmark/run_specs/__init__.py +0 -0
  126. helm/benchmark/run_specs/classic_run_specs.py +1510 -0
  127. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  128. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  129. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  130. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  131. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  132. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  133. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  134. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  135. helm/benchmark/runner.py +51 -57
  136. helm/benchmark/runner_config_registry.py +21 -0
  137. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  138. helm/benchmark/scenarios/bold_scenario.py +2 -2
  139. helm/benchmark/scenarios/code_scenario.py +1 -0
  140. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  141. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  142. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  143. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  144. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  145. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  146. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  147. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  148. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  149. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  150. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  151. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  152. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  153. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  154. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  155. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  156. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  157. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  158. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  159. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  160. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  161. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  162. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  163. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  164. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  165. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  166. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  167. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  168. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  169. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  170. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  171. helm/benchmark/scenarios/math_scenario.py +19 -2
  172. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  173. helm/benchmark/scenarios/numeracy_scenario.py +1 -1
  174. helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
  175. helm/benchmark/scenarios/scenario.py +4 -0
  176. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  177. helm/benchmark/scenarios/test_math_scenario.py +6 -0
  178. helm/benchmark/scenarios/test_scenario.py +6 -3
  179. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  180. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  181. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  182. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  183. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  184. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  185. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  186. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  187. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  188. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  189. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  190. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  191. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  192. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  193. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  194. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  195. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  196. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  197. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  198. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  199. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  200. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  201. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  202. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  203. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  204. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  205. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -2
  206. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  207. helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
  208. helm/benchmark/server.py +24 -1
  209. helm/benchmark/slurm_runner.py +70 -49
  210. helm/benchmark/static/benchmarking.js +1 -1
  211. helm/benchmark/static/schema_classic.yaml +258 -1066
  212. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  213. helm/benchmark/static/schema_lite.yaml +2 -227
  214. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  215. helm/benchmark/static/schema_unitxt.yaml +428 -0
  216. helm/benchmark/static/schema_vlm.yaml +576 -0
  217. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  218. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  219. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  220. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  221. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  222. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  223. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  224. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  225. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  226. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  227. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  228. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  229. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  230. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  231. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  232. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  233. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  234. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  235. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  236. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  237. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  238. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  239. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  240. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  241. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  242. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  243. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  244. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  245. helm/benchmark/static_build/config.js +4 -0
  246. helm/benchmark/static_build/index.html +20 -0
  247. helm/benchmark/test_data_preprocessor.py +3 -3
  248. helm/benchmark/test_model_deployment_definition.py +14 -16
  249. helm/benchmark/test_run_expander.py +1 -1
  250. helm/benchmark/window_services/ai21_window_service.py +22 -33
  251. helm/benchmark/window_services/cohere_window_service.py +1 -63
  252. helm/benchmark/window_services/default_window_service.py +2 -44
  253. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  254. helm/benchmark/window_services/ice_window_service.py +0 -34
  255. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  256. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  257. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  258. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  259. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  260. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  261. helm/benchmark/window_services/local_window_service.py +21 -4
  262. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  263. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  264. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  265. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  266. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  267. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  268. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  269. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  270. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  271. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  272. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  273. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  274. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  275. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  276. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  277. helm/benchmark/window_services/test_utils.py +3 -2
  278. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  279. helm/benchmark/window_services/window_service.py +42 -0
  280. helm/benchmark/window_services/window_service_factory.py +4 -1
  281. helm/benchmark/window_services/yalm_window_service.py +0 -27
  282. helm/clients/__init__.py +0 -0
  283. helm/{proxy/clients → clients}/ai21_client.py +3 -9
  284. helm/clients/aleph_alpha_client.py +112 -0
  285. helm/{proxy/clients → clients}/anthropic_client.py +203 -18
  286. helm/{proxy/clients → clients}/auto_client.py +59 -31
  287. helm/clients/bedrock_client.py +128 -0
  288. helm/clients/bedrock_utils.py +72 -0
  289. helm/{proxy/clients → clients}/client.py +65 -7
  290. helm/clients/clip_score_client.py +49 -0
  291. helm/clients/clip_scorers/__init__.py +0 -0
  292. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  293. helm/clients/clip_scorers/clip_scorer.py +50 -0
  294. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  295. helm/{proxy/clients → clients}/cohere_client.py +4 -11
  296. helm/clients/gcs_client.py +82 -0
  297. helm/{proxy/clients → clients}/google_client.py +5 -5
  298. helm/clients/google_translate_client.py +35 -0
  299. helm/{proxy/clients → clients}/http_model_client.py +5 -7
  300. helm/{proxy/clients → clients}/huggingface_client.py +43 -64
  301. helm/clients/image_generation/__init__.py +0 -0
  302. helm/clients/image_generation/adobe_vision_client.py +78 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  304. helm/clients/image_generation/cogview2/__init__.py +0 -0
  305. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  306. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  307. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  308. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  309. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  310. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  311. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  312. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  313. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  314. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  315. helm/clients/image_generation/cogview2_client.py +191 -0
  316. helm/clients/image_generation/dalle2_client.py +192 -0
  317. helm/clients/image_generation/dalle3_client.py +108 -0
  318. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  319. helm/clients/image_generation/dalle_mini/data.py +442 -0
  320. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  321. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  322. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  323. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  324. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  325. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  326. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  327. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  328. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  329. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  330. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  331. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  332. helm/clients/image_generation/dalle_mini_client.py +190 -0
  333. helm/clients/image_generation/deep_floyd_client.py +78 -0
  334. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  335. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  336. helm/clients/image_generation/lexica_client.py +86 -0
  337. helm/clients/image_generation/mindalle/__init__.py +0 -0
  338. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  339. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  340. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  341. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  342. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  343. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  344. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  345. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  346. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  347. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  348. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  349. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  350. helm/clients/image_generation/mindalle_client.py +115 -0
  351. helm/clients/image_generation/nudity_check_client.py +64 -0
  352. helm/clients/image_generation/together_image_generation_client.py +111 -0
  353. helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
  354. helm/{proxy/clients → clients}/megatron_client.py +5 -5
  355. helm/clients/mistral_client.py +134 -0
  356. helm/clients/moderation_api_client.py +109 -0
  357. helm/clients/open_lm_client.py +43 -0
  358. helm/clients/openai_client.py +302 -0
  359. helm/{proxy/clients → clients}/palmyra_client.py +6 -8
  360. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  361. helm/clients/simple_client.py +64 -0
  362. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  363. helm/clients/test_client.py +100 -0
  364. helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
  365. helm/clients/test_simple_client.py +19 -0
  366. helm/{proxy/clients → clients}/test_together_client.py +20 -8
  367. helm/{proxy/clients → clients}/together_client.py +12 -72
  368. helm/clients/vertexai_client.py +391 -0
  369. helm/clients/vision_language/__init__.py +0 -0
  370. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  371. helm/{proxy/clients → clients}/vision_language/idefics_client.py +53 -48
  372. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  373. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  374. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  375. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  376. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  377. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  378. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  379. helm/clients/vision_language/open_flamingo_client.py +155 -0
  380. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  381. helm/clients/vllm_client.py +46 -0
  382. helm/common/cache.py +16 -4
  383. helm/common/cache_backend_config.py +47 -0
  384. helm/common/clip_score_request.py +41 -0
  385. helm/common/file_caches/__init__.py +0 -0
  386. helm/common/file_caches/file_cache.py +16 -0
  387. helm/common/file_caches/local_file_cache.py +61 -0
  388. helm/common/file_caches/test_local_file_cache.py +25 -0
  389. helm/common/file_upload_request.py +27 -0
  390. helm/common/general.py +1 -1
  391. helm/common/image_generation_parameters.py +25 -0
  392. helm/common/images_utils.py +24 -1
  393. helm/common/key_value_store.py +35 -4
  394. helm/common/media_object.py +13 -0
  395. helm/common/moderations_api_request.py +71 -0
  396. helm/common/mongo_key_value_store.py +3 -3
  397. helm/common/multimodal_request_utils.py +31 -0
  398. helm/common/nudity_check_request.py +29 -0
  399. helm/common/request.py +15 -17
  400. helm/common/test_general.py +6 -0
  401. helm/common/tokenization_request.py +1 -1
  402. helm/config/model_deployments.yaml +1069 -546
  403. helm/config/model_metadata.yaml +753 -31
  404. helm/config/tokenizer_configs.yaml +142 -43
  405. helm/proxy/accounts.py +31 -4
  406. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  407. helm/proxy/critique/model_critique_client.py +8 -6
  408. helm/proxy/example_queries.py +29 -17
  409. helm/proxy/server.py +70 -5
  410. helm/proxy/services/remote_service.py +31 -0
  411. helm/proxy/services/server_service.py +96 -16
  412. helm/proxy/services/service.py +30 -0
  413. helm/proxy/services/test_remote_service.py +4 -3
  414. helm/proxy/services/test_service.py +0 -12
  415. helm/proxy/test_accounts.py +32 -0
  416. helm/proxy/token_counters/auto_token_counter.py +37 -37
  417. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  418. helm/proxy/token_counters/token_counter.py +3 -5
  419. helm/tokenizers/__init__.py +0 -0
  420. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  421. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
  422. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
  423. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  424. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  425. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
  426. helm/tokenizers/simple_tokenizer.py +33 -0
  427. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
  428. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
  429. helm/tokenizers/test_simple_tokenizer.py +33 -0
  430. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  431. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  432. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  433. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  434. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  435. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  436. helm/benchmark/run_specs.py +0 -2762
  437. helm/benchmark/test_model_properties.py +0 -1570
  438. helm/benchmark/vlm_run_specs.py +0 -97
  439. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  440. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  441. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  442. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  443. helm/benchmark/window_services/t511b_window_service.py +0 -30
  444. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  445. helm/benchmark/window_services/ul2_window_service.py +0 -30
  446. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  447. helm/common/cache_utils.py +0 -14
  448. helm/proxy/clients/aleph_alpha_client.py +0 -95
  449. helm/proxy/clients/goose_ai_client.py +0 -99
  450. helm/proxy/clients/microsoft_client.py +0 -180
  451. helm/proxy/clients/openai_client.py +0 -206
  452. helm/proxy/clients/simple_client.py +0 -60
  453. helm/proxy/clients/test_client.py +0 -49
  454. helm/proxy/clients/vertexai_client.py +0 -115
  455. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  456. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  457. helm/proxy/token_counters/free_token_counter.py +0 -12
  458. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  459. helm/proxy/token_counters/openai_token_counter.py +0 -22
  460. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  461. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  462. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  463. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  464. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  465. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  466. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  467. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  468. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  469. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  470. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  471. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  472. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  473. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  474. /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
  475. /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
  476. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  477. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  478. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  479. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  480. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  481. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  482. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
helm/benchmark/metrics/metric.py

@@ -1,7 +1,7 @@
-from abc import ABC
+from abc import ABC, abstractmethod
 from dataclasses import dataclass, replace
 from collections import defaultdict
-from typing import List, Dict, Tuple, Optional, Iterable, Set
+from typing import List, Dict, Tuple, Optional, Iterable

 from helm.common.object_spec import ObjectSpec, create_object
 from helm.common.general import singleton, parallel_map
@@ -67,35 +67,36 @@ class Processor:
     eval_cache_path: str
     adapter_spec: AdapterSpec

-    def process(self, request_state_set: RequestStateSet) -> List[Stat]:
+    def process(self, generation_states: List[RequestState]) -> List[Stat]:
         instance_stats: List[Stat] = []

         # Evaluate generated request_state
-        generation_states = request_state_set.generation_states
-        if len(generation_states) != 0:
-            instance_stats.extend(
-                self.metric.evaluate_generation(
-                    self.adapter_spec, singleton(generation_states), self.metric_service, self.eval_cache_path
-                )
-            )
-
-        # Evaluate the references
-        references_states = request_state_set.references_states
-        if len(references_states) != 0:
-            instance_stats.extend(
-                self.metric.evaluate_references(
-                    self.adapter_spec, references_states, self.metric_service, self.eval_cache_path
-                )
+        if len(generation_states) == 0:
+            return instance_stats
+        instance_stats.extend(
+            self.metric.evaluate_generation(
+                self.adapter_spec, singleton(generation_states), self.metric_service, self.eval_cache_path
             )
+        )

         # Add instance-related context (e.g., split, perturbation) to the metrics
         for i, stat in enumerate(instance_stats):
-            instance_stats[i] = add_context(stat, MetricContext.from_instance(request_state_set.instance))
+            instance_stats[i] = add_context(stat, MetricContext.from_instance(generation_states[0].instance))

         return instance_stats


-class Metric(ABC):
+class MetricInterface(ABC):
+    """Interface for all Metrics."""
+
+    @abstractmethod
+    def evaluate(
+        self, scenario_state: ScenarioState, metric_service: MetricService, eval_cache_path: str, parallelism: int
+    ) -> MetricResult:
+        pass
+
+
+class Metric(MetricInterface, ABC):
     """
     A `Metric` takes the results of execution and produces `Stat`s for a
     scenario.
@@ -116,8 +117,11 @@ class Metric(ABC):
         Any logic that doesn't decompose along instances should go here, such
         as robustness.
         """
-        if scenario_state.adapter_spec.method == ADAPT_LANGUAGE_MODELING:
-            return self.evaluate_language_modeling(scenario_state, metric_service, eval_cache_path)
+        assert scenario_state.adapter_spec.method != ADAPT_LANGUAGE_MODELING, (
+            "Metric no longer knows how to handle the language modeling adapter. "
+            + "All run_specs with that adapter should use LanguageModelingMetric. "
+            + "If you are seeing this issue, please file a Github issue."
+        )

         adapter_spec = scenario_state.adapter_spec
         global_stats: Dict[MetricName, Stat] = {}
@@ -125,18 +129,9 @@ class Metric(ABC):

         for train_trial_index in range(adapter_spec.num_train_trials):
             # Construct inputs
-            request_state_sets: List[RequestStateSet] = []
+            generation_state_sets: List[List[RequestState]] = []
             for instance in scenario_state.instances:
-                generation_states = scenario_state.get_request_states(train_trial_index, instance, None)
-                references_states = []
-                for reference_index in range(len(instance.references)):
-                    references_states.extend(
-                        scenario_state.get_request_states(train_trial_index, instance, reference_index)
-                    )
-                request_state_set = RequestStateSet(
-                    instance=instance, generation_states=generation_states, references_states=references_states
-                )
-                request_state_sets.append(request_state_set)
+                generation_state_sets.append(scenario_state.get_request_states(train_trial_index, instance, None))

             # Do it!
             processor = Processor(
@@ -147,7 +142,7 @@ class Metric(ABC):
             )
             results: List[List[Stat]] = parallel_map(
                 processor.process,
-                request_state_sets,
+                generation_state_sets,
                 parallelism=parallelism,
             )
@@ -194,22 +189,10 @@ class Metric(ABC):
                 for stat in self.derive_per_instance_stats(instance_dict):
                     merge_stat(trial_stats, add_context(stat, context))

-            # Compute statistics that depend on all the `RequestStates` (e.g., bias metrics).
-            # Aggregate request states and call evaluate_instances in case the metric needs it.
-            grouped_request_states: Dict[MetricContext, List[RequestState]] = defaultdict(list)
-            for instance in scenario_state.instances:
-                # TODO: do we need to support reference_index that is not None?
-                grouped_request_states[MetricContext.from_instance(instance)].extend(
-                    scenario_state.get_request_states(train_trial_index, instance, None)
-                )
-            for context, request_states in grouped_request_states.items():
-                for stat in self.evaluate_instances(request_states):
-                    merge_stat(trial_stats, add_context(stat, context))
-
             # Compute worst-case metrics.
             # This is here since we want these stats for all metrics and they
             # aggregate across contexts (perturbations).
-            worst_case_stats = self.compute_worst_case_metrics(dict(zip(scenario_state.instances, results)))
+            worst_case_stats = compute_worst_case_metrics(dict(zip(scenario_state.instances, results)))
             for stat in worst_case_stats:
                 merge_stat(trial_stats, stat)
@@ -222,6 +205,7 @@ class Metric(ABC):
         # Wrap aggregated and per-instance stats in a MetricResult.
         return MetricResult(list(global_stats.values()), all_per_instance_stats)

+    @abstractmethod
     def evaluate_generation(
         self,
         adapter_spec: AdapterSpec,
@@ -230,21 +214,7 @@ class Metric(ABC):
         eval_cache_path: str,
     ) -> List[Stat]:
         """Evaluate free-form generation. Override me!"""
-        return []
-
-    def evaluate_references(
-        self,
-        adapter_spec: AdapterSpec,
-        reference_request_states: List[RequestState],
-        metric_service: MetricService,
-        eval_cache_path: str,
-    ) -> List[Stat]:
-        """Evaluate the references. Override me!"""
-        return []
-
-    def evaluate_instances(self, request_states: List[RequestState]) -> List[Stat]:
-        """Evaluate all request states directly. Use only if nothing else works. Override me!"""
-        return []
+        pass

     def derive_stats(self, stats_dict: Dict[MetricName, Stat]) -> List[Stat]:
         """Derive stats based on existing stats, e.g., for perplexity. Override me!"""
@@ -254,119 +224,70 @@ class Metric(ABC):
         """Derive stats based on existing per-instance stats, e.g., for calibration. Override me!"""
         return []

-    def evaluate_language_modeling(
-        self, scenario_state: ScenarioState, metric_service: MetricService, eval_cache_path: str
-    ) -> MetricResult:
-        global_stats: Dict[MetricName, Stat] = {}
-        # The first and only trial
-        trial_stats: Dict[MetricName, Stat] = {}
-        # Per-instance stats
-        all_per_instance_stats: List[PerInstanceStats] = []
-        instance_ids_per_context: Dict[MetricContext, Set[str]] = defaultdict(set)
-
-        for request_state in scenario_state.request_states:
-            # Evaluate request_state
-            request_stats = self.evaluate_generation(
-                scenario_state.adapter_spec, request_state, metric_service, eval_cache_path
-            )
-
-            # Add instance-related context (e.g., split, perturbation) to the metrics
-            for i, stat in enumerate(request_stats):
-                context = MetricContext.from_instance(request_state.instance)
-                request_stats[i] = add_context(stat, context)
-                assert request_state.instance.id is not None
-                instance_ids_per_context[context].add(request_state.instance.id)
-
-            # Use trial index of 0 here since we run only one trial for LM
-            assert request_state.instance.id is not None
-            all_per_instance_stats.append(
-                PerInstanceStats(request_state.instance.id, request_state.instance.perturbation, 0, request_stats)
-            )
-
-            for stat in request_stats:
-                merge_stat(trial_stats, stat)
-
-        # group stats according to the context (e.g., split, perturbation) and call derive_stats on each grouping
-        grouped_trial_stats: Dict[MetricContext, Dict[MetricName, Stat]] = defaultdict(dict)
-        for metric_name, stat in trial_stats.items():
-            grouped_trial_stats[MetricContext.from_metric_name(metric_name)][metric_name] = stat  # group by context
-
-        for context, stats_dict in grouped_trial_stats.items():
-            for stat in self.derive_stats(stats_dict):
-                merge_stat(trial_stats, add_context(stat, context))
-            # keep track of how many instances are in each subset
-            num_instances_stat = Stat(MetricName("num_instances")).add(len(instance_ids_per_context[context]))
-            merge_stat(trial_stats, add_context(num_instances_stat, context))

-        for stat in trial_stats.values():
-            merge_stat(global_stats, stat.take_mean())
-        return MetricResult(list(global_stats.values()), all_per_instance_stats)
-
-    def compute_worst_case_metrics(self, per_instance_stats: Dict[Instance, List[Stat]]) -> List[Stat]:
-        """
-        For each instance, we compute the worst case perfomance between each perturbation and the non-perturbed input
-        (perturbation=None). This allows us to reason about the invariances of a model as opposed to just looking
-        at its performance on perturbed inputs. We also compute the worst case performance across all robustness-related
-        and fairness-related perturbations (including the original input in both).
+def compute_worst_case_metrics(per_instance_stats: Dict[Instance, List[Stat]]) -> List[Stat]:
+    """
+    For each instance, we compute the worst case perfomance between each perturbation and the non-perturbed input
+    (perturbation=None). This allows us to reason about the invariances of a model as opposed to just looking
+    at its performance on perturbed inputs. We also compute the worst case performance across all robustness-related
+    and fairness-related perturbations (including the original input in both).

-        For each such worst-case metric, we record a `before_` metric that aggregates the performance on the
-        non-perturbed version of the corresponding inputs.
+    For each such worst-case metric, we record a `before_` metric that aggregates the performance on the
+    non-perturbed version of the corresponding inputs.

-        We return the aggregate metrics across instances. Note that none of these metrics make a lot of sense if the
-        original, un-perturbed version of an Instance is not included in a scenario (i.e., we want
-        `include_original=True`).
-        """
-        # Collect statistics per input-metric pair across perturbations
-        per_instance_perturbation_stats: Dict[Tuple[MetricName, str], List[Stat]] = defaultdict(list)
-        for instance, stats in per_instance_stats.items():
-            for stat in stats:
-                assert instance.id is not None
-                # Group all perturbations for a specific metric name together
-                per_instance_perturbation_stats[(replace(stat.name, perturbation=None), instance.id)].append(stat)
-
-        # Compute worst perturbation stats
-        derived_stats_dict: Dict[MetricName, Stat] = {}
-        for (metric_name, instance_id), stats in per_instance_perturbation_stats.items():
-            original_stat: Optional[Stat] = None
-            robustness_stat = Stat(
-                replace(metric_name, perturbation=PerturbationDescription(name="robustness", robustness=True))
-            )
-            fairness_stat = Stat(
-                replace(metric_name, perturbation=PerturbationDescription(name="fairness", fairness=True))
-            )
-            individual_perturbation_stats: Dict[PerturbationDescription, Stat] = {}
-
-            for stat in stats:  # go through all the perturbations of the instance and merge relevant stats
-                perturbation = stat.name.perturbation
-                if perturbation is None:
-                    assert original_stat is None  # we should only have one original stat
-                    original_stat = stat
-                else:
-                    if perturbation.robustness:
-                        robustness_stat.merge(stat)
-                    if perturbation.fairness:
-                        fairness_stat.merge(stat)
-                    assert perturbation not in individual_perturbation_stats, perturbation
-                    individual_perturbation_stats[perturbation] = Stat(stat.name).merge(stat)  # copy
-
-            for stat in [robustness_stat, fairness_stat, *individual_perturbation_stats.values()]:
-                perturbation = stat.name.perturbation
-                assert perturbation is not None
-
-                if original_stat is not None:
-                    stat.merge(original_stat)
-                    if perturbation.name not in ["robustness", "fairness"]:
-                        before = replace(perturbation, computed_on=PERTURBATION_ORIGINAL)
-                        merge_stat(
-                            derived_stats_dict, Stat(replace(stat.name, perturbation=before)).merge(original_stat)
-                        )
-
-                # keep the minimum performance for each input
-                worst = replace(perturbation, computed_on=PERTURBATION_WORST)
-                if stat.count > 0:
-                    # TODO: take stat.max if lower_is_better = True
-                    merge_stat(derived_stats_dict, Stat(replace(stat.name, perturbation=worst)).add(stat.min))
-        return list(derived_stats_dict.values())
+    We return the aggregate metrics across instances. Note that none of these metrics make a lot of sense if the
+    original, un-perturbed version of an Instance is not included in a scenario (i.e., we want
+    `include_original=True`).
+    """
+    # Collect statistics per input-metric pair across perturbations
+    per_instance_perturbation_stats: Dict[Tuple[MetricName, str], List[Stat]] = defaultdict(list)
+    for instance, stats in per_instance_stats.items():
+        for stat in stats:
+            assert instance.id is not None
+            # Group all perturbations for a specific metric name together
+            per_instance_perturbation_stats[(replace(stat.name, perturbation=None), instance.id)].append(stat)
+
+    # Compute worst perturbation stats
+    derived_stats_dict: Dict[MetricName, Stat] = {}
+    for (metric_name, instance_id), stats in per_instance_perturbation_stats.items():
+        original_stat: Optional[Stat] = None
+        robustness_stat = Stat(
+            replace(metric_name, perturbation=PerturbationDescription(name="robustness", robustness=True))
+        )
+        fairness_stat = Stat(replace(metric_name, perturbation=PerturbationDescription(name="fairness", fairness=True)))
+        individual_perturbation_stats: Dict[PerturbationDescription, Stat] = {}
+
+        for stat in stats:  # go through all the perturbations of the instance and merge relevant stats
+            perturbation = stat.name.perturbation
+            if perturbation is None:
+                assert (
+                    original_stat is None
+                ), f"For {metric_name} got both {original_stat} and {stat}"  # we should only have one original stat
+                original_stat = stat
+            else:
+                if perturbation.robustness:
+                    robustness_stat.merge(stat)
+                if perturbation.fairness:
+                    fairness_stat.merge(stat)
+                assert perturbation not in individual_perturbation_stats, perturbation
+                individual_perturbation_stats[perturbation] = Stat(stat.name).merge(stat)  # copy
+
+        for stat in [robustness_stat, fairness_stat, *individual_perturbation_stats.values()]:
+            perturbation = stat.name.perturbation
+            assert perturbation is not None
+
+            if original_stat is not None:
+                stat.merge(original_stat)
+                if perturbation.name not in ["robustness", "fairness"]:
+                    before = replace(perturbation, computed_on=PERTURBATION_ORIGINAL)
+                    merge_stat(derived_stats_dict, Stat(replace(stat.name, perturbation=before)).merge(original_stat))

+            # keep the minimum performance for each input
+            worst = replace(perturbation, computed_on=PERTURBATION_WORST)
+            if stat.count > 0:
+                # TODO: take stat.max if lower_is_better = True
+                merge_stat(derived_stats_dict, Stat(replace(stat.name, perturbation=worst)).add(stat.min))
+    return list(derived_stats_dict.values())


 class MetricSpec(ObjectSpec):
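Note (not part of the diff): with `evaluate_generation` now declared `@abstractmethod`, every per-instance `Metric` subclass must implement it rather than inherit a no-op. A minimal sketch of the post-refactor pattern; the `CompletionLengthMetric` class and its `completion_length` stat are hypothetical, and the middle parameters and `request_state.result.completions` access assume HELM's standard `RequestState`/`RequestResult` types.

from typing import List

from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.metrics.metric import Metric
from helm.benchmark.metrics.metric_name import MetricName
from helm.benchmark.metrics.metric_service import MetricService
from helm.benchmark.metrics.statistic import Stat


class CompletionLengthMetric(Metric):
    """Hypothetical metric: records the character length of each completion."""

    def evaluate_generation(
        self,
        adapter_spec: AdapterSpec,
        request_state: RequestState,
        metric_service: MetricService,
        eval_cache_path: str,
    ) -> List[Stat]:
        completions = request_state.result.completions if request_state.result else []
        # One Stat per completion; the framework aggregates them across instances.
        return [Stat(MetricName("completion_length")).add(len(completion.text)) for completion in completions]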
helm/benchmark/metrics/metric_name.py

@@ -7,7 +7,6 @@ from helm.benchmark.scenarios.scenario import Instance

 @dataclass(frozen=True, eq=True)
 class MetricName:
-
     # The name of the metric
     name: str

helm/benchmark/metrics/metric_service.py

@@ -2,9 +2,13 @@ from typing import Optional

 from helm.common.authentication import Authentication
 from helm.common.critique_request import CritiqueRequest, CritiqueRequestResult
+from helm.common.file_upload_request import FileUploadResult, FileUploadRequest
+from helm.common.nudity_check_request import NudityCheckRequest, NudityCheckResult
+from helm.common.clip_score_request import CLIPScoreRequest, CLIPScoreResult
 from helm.common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
 from helm.benchmark.window_services.tokenizer_service import TokenizerService
 from helm.proxy.services.service import Service
+from helm.common.cache import Cache


 class MetricService(TokenizerService):
@@ -15,8 +19,20 @@ class MetricService(TokenizerService):
     def __init__(self, service: Service, auth: Authentication):
         super().__init__(service, auth)

+    def check_nudity(self, request: NudityCheckRequest) -> NudityCheckResult:
+        return self._service.check_nudity(self._auth, request)
+
+    def compute_clip_score(self, request: CLIPScoreRequest) -> CLIPScoreResult:
+        return self._service.compute_clip_score(self._auth, request)
+
+    def upload(self, request: FileUploadRequest) -> FileUploadResult:
+        return self._service.upload(self._auth, request)
+
     def get_toxicity_scores(self, request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
         return self._service.get_toxicity_scores(self._auth, request)

     def make_critique_request(self, request: CritiqueRequest) -> Optional[CritiqueRequestResult]:
         return self._service.make_critique_request(self._auth, request)
+
+    def get_cache(self, shard_name: str) -> Cache:
+        return Cache(self._service.get_cache_config(shard_name))
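Note (not part of the diff): the new `get_cache` pass-through lets a metric memoize expensive computations in its own cache shard. A rough usage sketch, assuming HELM's `Cache.get(request, compute)` memoization API and a hypothetical shard name:

from helm.benchmark.metrics.metric_service import MetricService


def cached_score(metric_service: MetricService, text: str) -> float:
    # "example_scorer" is a hypothetical shard name chosen for illustration.
    cache = metric_service.get_cache("example_scorer")

    def compute() -> dict:
        # Stand-in for an expensive computation whose result we want cached.
        return {"score": float(len(text))}

    response, _cached = cache.get({"text": text}, compute)
    return float(response["score"])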
helm/benchmark/metrics/paraphrase_generation_metrics.py

@@ -1,13 +1,13 @@
 from typing import List

 from helm.benchmark.adaptation.request_state import RequestState
-from .metric import Metric
+from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
 from .metric_name import MetricName
 from .statistic import Stat
 from nltk.translate.bleu_score import corpus_bleu


-class CLEVAParaphraseGenerationMetric(Metric):
+class CLEVAParaphraseGenerationMetric(EvaluateInstancesMetric):
     """
     Compute the Chinese iBLEU score for Paraphrase Generation scenarios of CLEVA benchmark.
     This implementation allows variable number of references (i.e., golds).
@@ -21,8 +21,7 @@ class CLEVAParaphraseGenerationMetric(Metric):
     def __init__(self, alpha: float = 0.8):  # calculate iBLEU_0.8 by default
         self.alpha = alpha

-    def evaluate_instances(self, request_states: List[RequestState]) -> List[Stat]:
-
+    def evaluate_instances(self, request_states: List[RequestState], eval_cache_path: str) -> List[Stat]:
         inputs: List = []
         preds: List = []
         golds: List[List[str]] = []
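Note (not part of the diff): corpus-level metrics like the one above now subclass `EvaluateInstancesMetric` (see helm/benchmark/metrics/evaluate_instances_metric.py in the file list) and receive an `eval_cache_path` argument, matching the updated signature. A minimal sketch of the pattern; the class name and `mean_prediction_length` stat are hypothetical, and the completion access assumes HELM's standard `RequestResult` shape.

from typing import List

from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
from helm.benchmark.metrics.metric_name import MetricName
from helm.benchmark.metrics.statistic import Stat


class MeanPredictionLengthMetric(EvaluateInstancesMetric):
    """Hypothetical metric computed over all request states at once."""

    def evaluate_instances(self, request_states: List[RequestState], eval_cache_path: str) -> List[Stat]:
        stat = Stat(MetricName("mean_prediction_length"))
        for state in request_states:
            # Skip request states with no model output.
            if state.result and state.result.completions:
                stat.add(len(state.result.completions[0].text))
        return [stat]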
helm/benchmark/metrics/ranking_metrics.py

@@ -4,11 +4,11 @@ from typing import Callable, Dict, List, Tuple, Optional
 from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_RANKING_BINARY
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.metrics.reference_metric import ReferenceMetric
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.benchmark.scenarios.scenario import unpack_tag, CORRECT_TAG, Reference
 from helm.common.request import RequestResult
 from helm.common.general import assert_present, binarize_dict
-from .metric import Metric
 from .metric_name import MetricName
 from .metric_service import MetricService
 from .statistic import Stat
@@ -58,7 +58,7 @@ class RankingObject:
     model_relevance: Optional[int] = None


-class RankingMetric(Metric):
+class RankingMetric(ReferenceMetric):
     """Ranking metric."""

     """ Methods supported by this metric.
helm/benchmark/metrics/reference_metric.py (new file)

@@ -0,0 +1,148 @@
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import List, Dict
+from helm.benchmark.metrics.metric import (
+    MetricInterface,
+    MetricResult,
+    PerInstanceStats,
+    add_context,
+    compute_worst_case_metrics,
+)
+
+from helm.common.general import parallel_map
+from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_LANGUAGE_MODELING
+from helm.benchmark.adaptation.scenario_state import ScenarioState
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from .metric_name import MetricName, MetricContext
+from .metric_service import MetricService
+from .statistic import Stat, merge_stat
+
+
+@dataclass(frozen=True)
+class Processor:
+    """Evaluates an instance."""
+
+    # TODO: not ideal that we have circular dependencies; subclasses of Metric
+    # should override the Processor rather than the Metric.
+    metric: "ReferenceMetric"
+    metric_service: MetricService
+    eval_cache_path: str
+    adapter_spec: AdapterSpec
+
+    def process(self, references_states: List[RequestState]) -> List[Stat]:
+        instance_stats: List[Stat] = []
+
+        # Evaluate the references
+        if len(references_states) == 0:
+            return instance_stats
+        instance_stats.extend(
+            self.metric.evaluate_references(
+                self.adapter_spec, references_states, self.metric_service, self.eval_cache_path
+            )
+        )
+
+        # Add instance-related context (e.g., split, perturbation) to the metrics
+        for i, stat in enumerate(instance_stats):
+            instance_stats[i] = add_context(stat, MetricContext.from_instance(references_states[0].instance))
+
+        return instance_stats
+
+
+class ReferenceMetric(MetricInterface, ABC):
+    """
+    A `Metric` takes the results of execution and produces `Stat`s for a
+    scenario.
+
+    Note: `Metric` actually right now is a bit of misnomer because it produces many
+    `Stat`s, that might be distinct but are computed together. Eventually we
+    might move to a world where there is one (or very few metrics that are domain-independent).
+    """
+
+    def evaluate(
+        self, scenario_state: ScenarioState, metric_service: MetricService, eval_cache_path: str, parallelism: int
+    ) -> MetricResult:
+        """
+        Main entry point for a `Metric`. This function groups the single
+        list of `RequestState` by training trial and instance, and invokes
+        other functions to process those. This should serve most purposes.
+
+        Any logic that doesn't decompose along instances should go here, such
+        as robustness.
+        """
+        assert scenario_state.adapter_spec.method != ADAPT_LANGUAGE_MODELING, (
+            "Metric no longer knows how to handle the language modeling adapter. "
+            + "All run_specs with that adapter should use LanguageModelingMetric. "
+            + "If you are seeing this issue, please file a Github issue."
+        )
+
+        adapter_spec = scenario_state.adapter_spec
+        global_stats: Dict[MetricName, Stat] = {}
+        all_per_instance_stats: List[PerInstanceStats] = []
+
+        for train_trial_index in range(adapter_spec.num_train_trials):
+            # Construct inputs
+            request_state_sets: List[List[RequestState]] = []
+            for instance in scenario_state.instances:
+                references_states = []
+                for reference_index in range(len(instance.references)):
+                    references_states.extend(
+                        scenario_state.get_request_states(train_trial_index, instance, reference_index)
+                    )
+                request_state_sets.append(references_states)
+
+            # Do it!
+            processor = Processor(
+                metric=self,
+                metric_service=metric_service,
+                eval_cache_path=eval_cache_path,
+                adapter_spec=scenario_state.adapter_spec,
+            )
+            results: List[List[Stat]] = parallel_map(
+                processor.process,
+                request_state_sets,
+                parallelism=parallelism,
+            )
+
+            # Compute per-instance stats
+            per_instance_stats: List[PerInstanceStats] = []
+            for instance, stats in zip(scenario_state.instances, results):
+                assert instance.id is not None, f"id was none for instance: {instance}"
+                # Sometimes a metric (e.g., BiasMetric) doesn't produce any statistics
+                if len(stats) > 0:
+                    per_instance_stats.append(
+                        PerInstanceStats(instance.id, instance.perturbation, train_trial_index, stats)
+                    )
+
+            # Aggregate these stats
+            trial_stats: Dict[MetricName, Stat] = {}  # Statistics just for this trial
+            for instance_stats in results:
+                for stat in instance_stats:
+                    merge_stat(trial_stats, stat)
+
+            # Compute worst-case metrics.
+            # This is here since we want these stats for all metrics and they
+            # aggregate across contexts (perturbations).
+            worst_case_stats = compute_worst_case_metrics(dict(zip(scenario_state.instances, results)))
+            for stat in worst_case_stats:
+                merge_stat(trial_stats, stat)
+
+            # We take the mean value for each trial.
+            for stat in trial_stats.values():
+                merge_stat(global_stats, stat.take_mean())
+
+            all_per_instance_stats.extend(per_instance_stats)
+
+        # Wrap aggregated and per-instance stats in a MetricResult.
+        return MetricResult(list(global_stats.values()), all_per_instance_stats)
+
+    @abstractmethod
+    def evaluate_references(
+        self,
+        adapter_spec: AdapterSpec,
+        reference_request_states: List[RequestState],
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        """Evaluate the references. Override me!"""
+        pass
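Note (not part of the diff): reference-based metrics such as `RankingMetric` above now subclass `ReferenceMetric` and implement the abstract `evaluate_references`. A minimal sketch using the signature from the new file; the `ReferenceCountMetric` class and its stat are hypothetical.

from typing import List

from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.metrics.metric_name import MetricName
from helm.benchmark.metrics.metric_service import MetricService
from helm.benchmark.metrics.reference_metric import ReferenceMetric
from helm.benchmark.metrics.statistic import Stat


class ReferenceCountMetric(ReferenceMetric):
    """Hypothetical metric that counts the reference request states per instance."""

    def evaluate_references(
        self,
        adapter_spec: AdapterSpec,
        reference_request_states: List[RequestState],
        metric_service: MetricService,
        eval_cache_path: str,
    ) -> List[Stat]:
        # One Stat per instance; `evaluate` aggregates these across instances and trials.
        return [Stat(MetricName("num_reference_states")).add(len(reference_request_states))]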
helm/benchmark/metrics/summac/model_summac.py

@@ -50,7 +50,6 @@ class SummaCImager:
     def __init__(
         self, model_name="mnli", granularity="paragraph", use_cache=True, max_doc_sents=100, device="cuda", **kwargs
     ):
-
         self.grans = granularity.split("-")

         assert (
@@ -154,7 +153,6 @@ class SummaCImager:
             for j in range(N_gen)
         ]
         for batch in utils_misc.batcher(dataset, batch_size=20):
-
             if self.model_name == "decomp":
                 batch_evids, batch_conts, batch_neuts = [], [], []
                 batch_json = [{"premise": d["premise"], "hypothesis": d["hypothesis"]} for d in batch]
helm/benchmark/metrics/summarization_metrics.py

@@ -12,13 +12,13 @@ from collections import defaultdict
 from helm.benchmark.adaptation.scenario_state import ScenarioState
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.metrics.evaluate_reference_metrics import get_rouge_function
 from helm.common.hierarchical_logger import hlog
 from helm.common.general import ensure_file_downloaded
 from helm.common.optional_dependencies import handle_module_not_found_error
 from .metric import Metric, MetricResult
 from .metric_name import MetricName
 from .metric_service import MetricService
-from .basic_metrics import get_rouge_function
 from .statistic import Stat
 from .summac.model_summac import SummaCZS
 from bert_score import BERTScorer
@@ -58,7 +58,7 @@ class SummarizationMetric(Metric):
         try:
             from summ_eval.data_stats_metric import DataStatsMetric
         except ModuleNotFoundError as e:
-            handle_module_not_found_error(e, ["metrics"])
+            handle_module_not_found_error(e, ["summarization"])

         self.data_stats_metric = DataStatsMetric()
         self.task: str = task