crfm-helm 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (499) hide show
  1. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +138 -31
  2. crfm_helm-0.5.1.dist-info/RECORD +654 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +31 -3
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
  9. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
  10. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  11. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  12. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  13. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
  14. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
  15. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  16. helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
  17. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
  18. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
  19. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  20. helm/benchmark/adaptation/request_state.py +6 -1
  21. helm/benchmark/adaptation/scenario_state.py +6 -2
  22. helm/benchmark/annotation/annotator.py +43 -0
  23. helm/benchmark/annotation/annotator_factory.py +61 -0
  24. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  25. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  26. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  27. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  28. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  29. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  30. helm/benchmark/annotation_executor.py +124 -0
  31. helm/benchmark/augmentations/data_augmenter.py +0 -2
  32. helm/benchmark/augmentations/gender_perturbation.py +1 -1
  33. helm/benchmark/augmentations/perturbation.py +25 -3
  34. helm/benchmark/augmentations/perturbation_description.py +1 -1
  35. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  36. helm/benchmark/augmentations/test_perturbation.py +41 -7
  37. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  38. helm/benchmark/config_registry.py +7 -1
  39. helm/benchmark/executor.py +46 -16
  40. helm/benchmark/huggingface_registration.py +20 -7
  41. helm/benchmark/metrics/basic_metrics.py +169 -664
  42. helm/benchmark/metrics/bbq_metrics.py +3 -4
  43. helm/benchmark/metrics/bias_metrics.py +6 -6
  44. helm/benchmark/metrics/classification_metrics.py +11 -8
  45. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  46. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  47. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  48. helm/benchmark/metrics/common_metric_specs.py +167 -0
  49. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  50. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  51. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  52. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  53. helm/benchmark/metrics/disinformation_metrics.py +4 -110
  54. helm/benchmark/metrics/dry_run_metrics.py +2 -2
  55. helm/benchmark/metrics/efficiency_metrics.py +213 -0
  56. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  57. helm/benchmark/metrics/evaluate_reference_metrics.py +392 -0
  58. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  59. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  60. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  61. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  62. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  63. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  64. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  65. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  66. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  67. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  68. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  69. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  70. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  71. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  72. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  73. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  74. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  75. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  76. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  77. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  78. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  79. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  80. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  81. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  82. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  83. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  84. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  85. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  86. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  87. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  88. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  89. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  90. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  91. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  92. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  93. helm/benchmark/metrics/metric.py +93 -172
  94. helm/benchmark/metrics/metric_name.py +0 -1
  95. helm/benchmark/metrics/metric_service.py +16 -0
  96. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  97. helm/benchmark/metrics/ranking_metrics.py +2 -2
  98. helm/benchmark/metrics/reference_metric.py +148 -0
  99. helm/benchmark/metrics/summac/model_summac.py +0 -2
  100. helm/benchmark/metrics/summarization_metrics.py +2 -2
  101. helm/benchmark/metrics/test_classification_metrics.py +8 -5
  102. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  103. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
  104. helm/benchmark/metrics/test_metric.py +2 -2
  105. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
  106. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  107. helm/benchmark/metrics/toxicity_utils.py +23 -0
  108. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  109. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  110. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  111. helm/benchmark/metrics/vision_language/image_metrics.py +575 -0
  112. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  113. helm/benchmark/model_deployment_registry.py +74 -0
  114. helm/benchmark/model_metadata_registry.py +41 -1
  115. helm/benchmark/multi_gpu_runner.py +133 -0
  116. helm/benchmark/presentation/create_plots.py +8 -7
  117. helm/benchmark/presentation/run_display.py +26 -10
  118. helm/benchmark/presentation/schema.py +15 -40
  119. helm/benchmark/presentation/summarize.py +119 -79
  120. helm/benchmark/presentation/table.py +8 -8
  121. helm/benchmark/presentation/test_contamination.py +2 -2
  122. helm/benchmark/presentation/test_run_entry.py +1 -2
  123. helm/benchmark/presentation/test_summarize.py +3 -3
  124. helm/benchmark/run.py +54 -26
  125. helm/benchmark/run_expander.py +205 -35
  126. helm/benchmark/run_spec.py +93 -0
  127. helm/benchmark/run_spec_factory.py +163 -0
  128. helm/benchmark/run_specs/__init__.py +0 -0
  129. helm/benchmark/run_specs/classic_run_specs.py +1510 -0
  130. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  131. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  132. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  133. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  134. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  135. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  136. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  137. helm/benchmark/run_specs/vlm_run_specs.py +757 -0
  138. helm/benchmark/runner.py +51 -57
  139. helm/benchmark/runner_config_registry.py +21 -0
  140. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  141. helm/benchmark/scenarios/bold_scenario.py +2 -2
  142. helm/benchmark/scenarios/code_scenario.py +1 -0
  143. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  144. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  145. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  146. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  147. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  148. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  149. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  150. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  151. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  152. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  153. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  154. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  155. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  156. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  157. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  158. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  159. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  160. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  161. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  162. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  163. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  164. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  165. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  166. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  167. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  168. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  169. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  170. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  171. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  172. helm/benchmark/scenarios/legalbench_scenario.py +6 -2
  173. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  174. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  175. helm/benchmark/scenarios/math_scenario.py +19 -2
  176. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  177. helm/benchmark/scenarios/numeracy_scenario.py +1 -1
  178. helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
  179. helm/benchmark/scenarios/scenario.py +4 -0
  180. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  181. helm/benchmark/scenarios/test_math_scenario.py +6 -0
  182. helm/benchmark/scenarios/test_scenario.py +6 -3
  183. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  184. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  185. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  186. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  187. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  188. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  189. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  190. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
  191. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  192. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  193. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
  194. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  195. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  196. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  197. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  198. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  199. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  200. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  201. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  202. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  203. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  204. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  205. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  206. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  207. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  208. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  209. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  210. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  211. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  212. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  213. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  214. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  215. helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
  216. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  217. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  218. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  219. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +3 -4
  220. helm/benchmark/scenarios/vision_language/vqa_scenario.py +5 -3
  221. helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
  222. helm/benchmark/server.py +24 -1
  223. helm/benchmark/slurm_runner.py +70 -49
  224. helm/benchmark/static/benchmarking.js +1 -1
  225. helm/benchmark/static/schema_classic.yaml +258 -1066
  226. helm/benchmark/static/schema_image2structure.yaml +304 -0
  227. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  228. helm/benchmark/static/schema_lite.yaml +2 -227
  229. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  230. helm/benchmark/static/schema_unitxt.yaml +428 -0
  231. helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
  232. helm/benchmark/static/schema_vlm.yaml +823 -0
  233. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  234. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  235. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  236. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  237. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  238. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  239. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  240. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  241. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  242. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  243. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  244. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  245. helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
  246. helm/benchmark/static_build/assets/index-878a1094.css +1 -0
  247. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  248. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  249. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  250. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  251. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  252. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  253. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  254. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  255. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  256. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  257. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  258. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  259. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  260. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  261. helm/benchmark/static_build/config.js +4 -0
  262. helm/benchmark/static_build/index.html +20 -0
  263. helm/benchmark/test_data_preprocessor.py +3 -3
  264. helm/benchmark/test_run_expander.py +1 -1
  265. helm/benchmark/window_services/ai21_window_service.py +22 -33
  266. helm/benchmark/window_services/cohere_window_service.py +1 -63
  267. helm/benchmark/window_services/default_window_service.py +2 -44
  268. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  269. helm/benchmark/window_services/ice_window_service.py +0 -34
  270. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  271. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  272. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  273. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  274. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  275. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  276. helm/benchmark/window_services/local_window_service.py +21 -4
  277. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  278. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  279. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  280. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  281. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  282. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  283. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  284. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  285. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  286. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  287. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  288. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  289. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  290. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  291. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  292. helm/benchmark/window_services/test_utils.py +3 -2
  293. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  294. helm/benchmark/window_services/window_service.py +42 -0
  295. helm/benchmark/window_services/window_service_factory.py +4 -1
  296. helm/benchmark/window_services/yalm_window_service.py +0 -27
  297. helm/clients/__init__.py +0 -0
  298. helm/{proxy/clients → clients}/ai21_client.py +3 -9
  299. helm/clients/aleph_alpha_client.py +112 -0
  300. helm/{proxy/clients → clients}/anthropic_client.py +233 -18
  301. helm/{proxy/clients → clients}/auto_client.py +59 -31
  302. helm/clients/bedrock_client.py +128 -0
  303. helm/clients/bedrock_utils.py +72 -0
  304. helm/{proxy/clients → clients}/client.py +65 -7
  305. helm/clients/clip_score_client.py +49 -0
  306. helm/clients/clip_scorers/__init__.py +0 -0
  307. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  308. helm/clients/clip_scorers/clip_scorer.py +50 -0
  309. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  310. helm/{proxy/clients → clients}/cohere_client.py +4 -11
  311. helm/clients/gcs_client.py +82 -0
  312. helm/{proxy/clients → clients}/google_client.py +5 -5
  313. helm/clients/google_translate_client.py +35 -0
  314. helm/{proxy/clients → clients}/http_model_client.py +5 -7
  315. helm/{proxy/clients → clients}/huggingface_client.py +43 -64
  316. helm/clients/image_generation/__init__.py +0 -0
  317. helm/clients/image_generation/adobe_vision_client.py +78 -0
  318. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  319. helm/clients/image_generation/cogview2/__init__.py +0 -0
  320. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  321. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  322. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  323. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  324. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  325. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  326. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  327. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  328. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  329. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  330. helm/clients/image_generation/cogview2_client.py +191 -0
  331. helm/clients/image_generation/dalle2_client.py +192 -0
  332. helm/clients/image_generation/dalle3_client.py +108 -0
  333. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  334. helm/clients/image_generation/dalle_mini/data.py +442 -0
  335. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  336. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  337. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  338. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  339. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  340. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  341. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  342. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  343. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  344. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  345. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  346. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  347. helm/clients/image_generation/dalle_mini_client.py +190 -0
  348. helm/clients/image_generation/deep_floyd_client.py +78 -0
  349. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  350. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  351. helm/clients/image_generation/lexica_client.py +86 -0
  352. helm/clients/image_generation/mindalle/__init__.py +0 -0
  353. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  354. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  355. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  356. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  357. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  358. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  359. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  360. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  361. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  362. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  363. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  364. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  365. helm/clients/image_generation/mindalle_client.py +115 -0
  366. helm/clients/image_generation/nudity_check_client.py +64 -0
  367. helm/clients/image_generation/together_image_generation_client.py +111 -0
  368. helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
  369. helm/{proxy/clients → clients}/megatron_client.py +5 -5
  370. helm/clients/mistral_client.py +134 -0
  371. helm/clients/moderation_api_client.py +109 -0
  372. helm/clients/open_lm_client.py +43 -0
  373. helm/clients/openai_client.py +301 -0
  374. helm/{proxy/clients → clients}/palmyra_client.py +6 -8
  375. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  376. helm/clients/simple_client.py +64 -0
  377. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  378. helm/clients/test_client.py +100 -0
  379. helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
  380. helm/clients/test_simple_client.py +19 -0
  381. helm/{proxy/clients → clients}/test_together_client.py +20 -8
  382. helm/{proxy/clients → clients}/together_client.py +104 -73
  383. helm/clients/vertexai_client.py +400 -0
  384. helm/clients/vision_language/__init__.py +0 -0
  385. helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
  386. helm/clients/vision_language/huggingface_vlm_client.py +111 -0
  387. helm/{proxy/clients → clients}/vision_language/idefics_client.py +54 -49
  388. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  389. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  390. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  391. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  392. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  393. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  394. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  395. helm/clients/vision_language/open_flamingo_client.py +155 -0
  396. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  397. helm/clients/vllm_client.py +46 -0
  398. helm/common/cache.py +16 -4
  399. helm/common/cache_backend_config.py +47 -0
  400. helm/common/clip_score_request.py +41 -0
  401. helm/common/file_caches/__init__.py +0 -0
  402. helm/common/file_caches/file_cache.py +16 -0
  403. helm/common/file_caches/local_file_cache.py +61 -0
  404. helm/common/file_caches/test_local_file_cache.py +25 -0
  405. helm/common/file_upload_request.py +27 -0
  406. helm/common/general.py +1 -1
  407. helm/common/image_generation_parameters.py +25 -0
  408. helm/common/images_utils.py +33 -3
  409. helm/common/key_value_store.py +35 -4
  410. helm/common/media_object.py +13 -0
  411. helm/common/moderations_api_request.py +71 -0
  412. helm/common/mongo_key_value_store.py +3 -3
  413. helm/common/multimodal_request_utils.py +31 -0
  414. helm/common/nudity_check_request.py +29 -0
  415. helm/common/request.py +15 -17
  416. helm/common/test_general.py +6 -0
  417. helm/common/tokenization_request.py +1 -1
  418. helm/config/model_deployments.yaml +1159 -538
  419. helm/config/model_metadata.yaml +868 -41
  420. helm/config/tokenizer_configs.yaml +149 -43
  421. helm/proxy/accounts.py +31 -4
  422. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  423. helm/proxy/critique/model_critique_client.py +8 -6
  424. helm/proxy/example_queries.py +29 -17
  425. helm/proxy/server.py +70 -5
  426. helm/proxy/services/remote_service.py +31 -0
  427. helm/proxy/services/server_service.py +96 -16
  428. helm/proxy/services/service.py +30 -0
  429. helm/proxy/services/test_remote_service.py +4 -3
  430. helm/proxy/services/test_service.py +0 -12
  431. helm/proxy/test_accounts.py +32 -0
  432. helm/proxy/token_counters/auto_token_counter.py +37 -37
  433. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  434. helm/proxy/token_counters/token_counter.py +3 -5
  435. helm/tokenizers/__init__.py +0 -0
  436. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  437. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
  438. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
  439. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  440. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  441. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
  442. helm/tokenizers/simple_tokenizer.py +33 -0
  443. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
  444. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
  445. helm/tokenizers/test_simple_tokenizer.py +33 -0
  446. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  447. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  448. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  449. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  450. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  451. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  452. helm/benchmark/run_specs.py +0 -2762
  453. helm/benchmark/test_model_deployment_definition.py +0 -92
  454. helm/benchmark/test_model_properties.py +0 -1570
  455. helm/benchmark/vlm_run_specs.py +0 -97
  456. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  457. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  458. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  459. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  460. helm/benchmark/window_services/t511b_window_service.py +0 -30
  461. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  462. helm/benchmark/window_services/ul2_window_service.py +0 -30
  463. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  464. helm/common/cache_utils.py +0 -14
  465. helm/proxy/clients/aleph_alpha_client.py +0 -95
  466. helm/proxy/clients/goose_ai_client.py +0 -99
  467. helm/proxy/clients/microsoft_client.py +0 -180
  468. helm/proxy/clients/openai_client.py +0 -206
  469. helm/proxy/clients/simple_client.py +0 -60
  470. helm/proxy/clients/test_client.py +0 -49
  471. helm/proxy/clients/vertexai_client.py +0 -115
  472. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  473. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  474. helm/proxy/token_counters/free_token_counter.py +0 -12
  475. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  476. helm/proxy/token_counters/openai_token_counter.py +0 -22
  477. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  478. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  479. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  480. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
  481. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
  482. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0
  483. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  484. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  485. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  486. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  487. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  488. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  489. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  490. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  491. /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
  492. /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
  493. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  494. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  495. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  496. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  497. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  498. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  499. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -0,0 +1,304 @@
1
+ ---
2
+ ############################################################
3
+ adapter:
4
+ - name: method
5
+ description: The high-level strategy for converting instances into a prompt for the language model.
6
+ values:
7
+ - name: generation
8
+ description: Given the input, the model generates the output free-form.
9
+ - name: generation_multimodal
10
+ description: Given the multimodal input, the model generates the output free-form.
11
+ - name: multiple_choice_joint
12
+ description: Given the input, the model selects from multiple-choice options (A., B., C., D., E.).
13
+ - name: multiple_choice_separate_original
14
+ description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability.
15
+ - name: multiple_choice_separate_calibrated
16
+ description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability when calibrated by answer choice probability.
17
+ - name: language_modeling
18
+ description: Given the input, the model assigns the sequence a probability.
19
+ - name: instructions
20
+ description: The description of the task that is included at the very beginning of the prompt.
21
+ - name: global_prefix
22
+ description: The string that is prepended to the prompt.
23
+ - name: global_suffix
24
+ description: The string that is appended to the prompt.
25
+ - name: instance_prefix
26
+ description: The string that is included before each instance (e.g., '\n\n').
27
+ - name: input_prefix
28
+ description: The string that is included before each input (e.g., 'Question:').
29
+ - name: input_suffix
30
+ description: The string that is included after each input (e.g., '\n').
31
+ - name: reference_prefix
32
+ description: The string that is included before each reference (for multiple-choice questions).
33
+ - name: reference_suffix
34
+ description: The string that is included after each reference (for multiple-choice questions).
35
+ - name: output_prefix
36
+ description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
37
+ - name: output_suffix
38
+ description: The string that is included after the correct answer/predicted output (e.g., '\n').
39
+ - name: substitutions
40
+ description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
41
+ - name: max_train_instances
42
+ description: Maximum number of training instances to include in the prompt (currently by randomly sampling).
43
+ - name: max_eval_instances
44
+ description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
45
+ - name: num_outputs
46
+ description: Maximum number of possible outputs to generate by sampling multiple outputs.
47
+ - name: num_train_trials
48
+ description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
49
+ - name: sample_train
50
+ description: If true, randomly sample N training examples; if false, select N consecutive training examples.
51
+ - name: model
52
+ description: Name of the language model (<creator_organization>/<model name>) to send requests to.
53
+ - name: model_deployment
54
+ description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
55
+ - name: temperature
56
+ description: Temperature parameter used in generation.
57
+ - name: max_tokens
58
+ description: Maximum number of tokens to generate.
59
+ - name: stop_sequences
60
+ description: List of sequences, where we stop generation if we encounter any of them.
61
+ - name: random
62
+ description: Random seed (string), which guarantees reproducibility.
63
+ - name: multi_label
64
+ description: If true, for instances with multiple correct references, the gold answer should be considered to be all of the correct references rather than any of the correct references.
65
+
66
+ ############################################################
67
+ metrics:
68
+ # Infrastructure metrics:
69
+ - name: num_perplexity_tokens
70
+ display_name: '# tokens'
71
+ description: Average number of tokens in the predicted output (for language modeling, the input too).
72
+ - name: num_bytes
73
+ display_name: '# bytes'
74
+ description: Average number of bytes in the predicted output (for language modeling, the input too).
75
+
76
+ - name: num_references
77
+ display_name: '# ref'
78
+ description: Number of references.
79
+ - name: num_train_trials
80
+ display_name: '# trials'
81
+ description: Number of trials, where in each trial we choose an independent, random set of training instances.
82
+ - name: estimated_num_tokens_cost
83
+ display_name: 'cost'
84
+ description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
85
+ - name: num_prompt_tokens
86
+ display_name: '# prompt tokens'
87
+ description: Number of tokens in the prompt.
88
+ - name: num_prompt_characters
89
+ display_name: '# prompt chars'
90
+ description: Number of characters in the prompt.
91
+ - name: num_completion_tokens
92
+ display_name: '# completion tokens'
93
+ description: Actual number of completion tokens (over all completions).
94
+ - name: num_output_tokens
95
+ display_name: '# output tokens'
96
+ description: Actual number of output tokens.
97
+ - name: max_num_output_tokens
98
+ display_name: 'Max output tokens'
99
+ description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
100
+ - name: num_requests
101
+ display_name: '# requests'
102
+ description: Number of distinct API requests.
103
+ - name: num_instances
104
+ display_name: '# eval'
105
+ description: Number of evaluation instances.
106
+ - name: num_train_instances
107
+ display_name: '# train'
108
+ description: Number of training instances (e.g., in-context examples).
109
+ - name: prompt_truncated
110
+ display_name: truncated
111
+ description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
112
+ - name: finish_reason_length
113
+ display_name: finish b/c length
114
+ description: Fraction of instances where the output was terminated because of the max tokens limit.
115
+ - name: finish_reason_stop
116
+ display_name: finish b/c stop
117
+ description: Fraction of instances where the output was terminated because of the stop sequences.
118
+ - name: finish_reason_endoftext
119
+ display_name: finish b/c endoftext
120
+ description: Fraction of instances where the output was terminated because the end of text token was generated.
121
+ - name: finish_reason_unknown
122
+ display_name: finish b/c unknown
123
+ description: Fraction of instances where the output was terminated for unknown reasons.
124
+ - name: num_completions
125
+ display_name: '# completions'
126
+ description: Number of completions.
127
+ - name: predicted_index
128
+ display_name: Predicted index
129
+ description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
130
+
131
+ # Vision Language metrics [text]:
132
+ - name: edit_similarity
133
+ display_name: Edit similarity (Levenshtein)
134
+ short_display_name: Edit sim.
135
+ lower_is_better: false
136
+ description: Average Levenshtein edit similarity (1 - distance normalized by length of longer sequence) between model generation and reference.
137
+
138
+ # Vision Language metrics [image]:
139
+ - name: block_emd_similarity
140
+ display_name: Block Earth Mover Similarity
141
+ short_display_name: Block EMS
142
+ description: Block Earth Mover Similarity
143
+ lower_is_better: false
144
+ - name: block_emd_similarity_white
145
+ display_name: Block Earth Mover Similarity (white)
146
+ short_display_name: Block EMS (white)
147
+ description: Block Earth Mover Similarity (white)
148
+ lower_is_better: false
149
+ - name: block_emd_similarity_median_color
150
+ display_name: Block Earth Mover Similarity (median)
151
+ short_display_name: Block EMS (median)
152
+ description: Block Earth Mover Similarity (median)
153
+ lower_is_better: false
154
+ - name: pixel_similarity
155
+ display_name: Pixel Similarity
156
+ short_display_name: PS
157
+ description: Pixel Similarity between an image generated by the model and the target image.
158
+ lower_is_better: false
159
+ - name: sift_similarity
160
+ display_name: SIFT Similarity
161
+ short_display_name: SIFT
162
+ description: SIFT Similarity (Scale-Invariant Feature Transform) [(Lowe, 1999)](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=790410) between an image generated by the model and the target image.
163
+ lower_is_better: false
164
+ - name: compilation_success
165
+ display_name: Compilation success
166
+ description: Fraction of instances where the generated code compiles successfully.
167
+ lower_is_better: false
168
+ - name: lpips_similarity
169
+ display_name: LPIPS similarity
170
+ short_display_name: LPIPS
171
+ description: LPIPS similarity (Learned Perceptual Image Patch Similarity) [(Zhang et al., 2018)](https://arxiv.org/abs/1801.03924) between an image generated by the model and the target image.
172
+ lower_is_better: false
173
+ - name: fid_similarity
174
+ display_name: FID similarity
175
+ short_display_name: FID
176
+ description: FID similarity (Fréchet Inception Distance) [(Heusel et al., 2017)](https://arxiv.org/abs/1706.08500) between an image generated by the model and the target image.
177
+ lower_is_better: false
178
+ - name: ssim_similarity
179
+ display_name: SSIM
180
+ short_display_name: SSIM
181
+ description: SSIM similarity (Structural Similarity Index) [(Wang et al., 2004)](https://www.cns.nyu.edu/pub/eero/wang03-reprint.pdf) between an image generated by the model and the target image.
182
+ lower_is_better: false
183
+
184
+ # Accuracy metrics:
185
+ - name: exact_match
186
+ display_name: Exact match
187
+ short_display_name: EM
188
+ description: Fraction of instances where the predicted output matches a correct reference exactly.
189
+ lower_is_better: false
190
+ - name: quasi_exact_match
191
+ display_name: Quasi-exact match
192
+ short_display_name: EM
193
+ description: Fraction of instances where the predicted output matches a correct reference up to light processing.
194
+ lower_is_better: false
195
+ - name: prefix_exact_match
196
+ display_name: Prefix exact match
197
+ short_display_name: PEM
198
+ description: Fraction of instances where the predicted output matches the prefix of a correct reference exactly.
199
+ lower_is_better: false
200
+ - name: quasi_prefix_exact_match
201
+ # TODO: should call this prefix_quasi_exact_match
202
+ display_name: Prefix quasi-exact match
203
+ short_display_name: PEM
204
+ description: Fraction of instances where the predicted output matches the prefix of a correct reference up to light processing.
205
+ lower_is_better: false
206
+
207
+ ############################################################
208
+ perturbations:
209
+ - name: robustness
210
+ display_name: Robustness
211
+ description: Computes worst case over different robustness perturbations (misspellings, formatting, contrast sets).
212
+
213
+ ############################################################
214
+ metric_groups:
215
+ - name: accuracy
216
+ display_name: Compilation Rate and Earth Mover Similarity
217
+ metrics:
218
+ - name: ${main_name}
219
+ split: ${main_split}
220
+ - name: compilation_success
221
+ split: ${main_split}
222
+
223
+ - name: generation_image
224
+ display_name: Generation (image)
225
+ metrics:
226
+ - name: pixel_similarity
227
+ split: ${main_split}
228
+ - name: compilation_success
229
+ split: ${main_split}
230
+ - name: fid_similarity
231
+ split: ${main_split}
232
+ - name: block_emd_similarity
233
+ split: ${main_split}
234
+ - name: block_emd_similarity_white
235
+ split: ${main_split}
236
+ - name: block_emd_similarity_median_color
237
+ split: ${main_split}
238
+
239
+ - name: generation_text
240
+ display_name: Generation (text)
241
+ metrics:
242
+ - name: edit_similarity
243
+ split: ${main_split}
244
+
245
+ ############################################################
246
+ run_groups:
247
+ - name: core_scenarios
248
+ display_name: Image2Structure
249
+ description: Scenarios for evaluating the ability of Vision-Language models to generate structured outputs from images.
250
+ category: All scenarios
251
+ subgroups:
252
+ - image2latex
253
+ - image2webpage
254
+ - image2musicsheet
255
+
256
+ - name: image2latex
257
+ display_name: Image2LaTeX
258
+ description: The Image2LaTeX benchmark for converting images of mathematical equations, tables, algorithms and tikz to LaTeX.
259
+ metric_groups:
260
+ - accuracy
261
+ - generation_image
262
+ - generation_text
263
+ environment:
264
+ main_name: block_emd_similarity
265
+ main_split: valid
266
+ taxonomy:
267
+ task: image-to-text
268
+ what: mathematical equations, tables, algorithms, tikz
269
+ who: n/a
270
+ when: "2024"
271
+ language: English
272
+
273
+ - name: image2webpage
274
+ display_name: Image2webpage
275
+ description: The Image2webpage benchmark for converting images of webpages to HTML/CSS/Javascript.
276
+ metric_groups:
277
+ - accuracy
278
+ - generation_image
279
+ - generation_text
280
+ environment:
281
+ main_name: block_emd_similarity
282
+ main_split: valid
283
+ taxonomy:
284
+ task: image-to-text
285
+ what: css, html, javascript
286
+ who: n/a
287
+ when: "2024"
288
+ language: English
289
+
290
+ - name: image2musicsheet
291
+ display_name: Image2musicsheet
292
+ description: The Image2musicsheet benchmark for converting images of music sheets to LilyPond.
293
+ metric_groups:
294
+ - accuracy
295
+ - generation_image
296
+ environment:
297
+ main_name: block_emd_similarity
298
+ main_split: valid
299
+ taxonomy:
300
+ task: image-to-text
301
+ what: music sheets
302
+ who: n/a
303
+ when: "2024"
304
+ language: English
@@ -0,0 +1,210 @@
1
+ ---
2
+ ############################################################
3
+ perturbations: []
4
+ adapter:
5
+ - name: method
6
+ description: The high-level strategy for converting instances into a prompt for the language model.
7
+ values:
8
+ - name: generation
9
+ description: Given the input, the model generates the output free-form.
10
+ - name: instructions
11
+ description: The description of the task that is included at the very beginning of the prompt.
12
+ - name: global_prefix
13
+ description: The string that is prepended to the prompt.
14
+ - name: instance_prefix
15
+ description: The string that is included before each instance (e.g., '\n\n').
16
+ - name: input_prefix
17
+ description: The string that is included before each input (e.g., 'Question:').
18
+ - name: input_suffix
19
+ description: The string that is included after each input (e.g., '\n').
20
+ - name: reference_prefix
21
+ description: The string that is included before each reference (for multiple-choice questions).
22
+ - name: reference_suffix
23
+ description: The string that is included after each reference (for multiple-choice questions).
24
+ - name: output_prefix
25
+ description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
26
+ - name: output_suffix
27
+ description: The string that is included after the correct answer/predicted output (e.g., '\n').
28
+ - name: substitutions
29
+ description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
30
+ - name: max_train_instances
31
+ description: Maximum number of training instances to include in the prompt (currently by randomly sampling).
32
+ - name: max_eval_instances
33
+ description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
34
+ - name: num_outputs
35
+ description: Maximum number of possible outputs to generate by sampling multiple outputs.
36
+ - name: num_train_trials
37
+ description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
38
+ - name: sample_train
39
+ description: If true, randomly sample N training examples; if false, select N consecutive training examples.
40
+ - name: model
41
+ description: Name of the language model (<creator_organization>/<model name>) to send requests to.
42
+ - name: model_deployment
43
+ description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
44
+ - name: temperature
45
+ description: Temperature parameter used in generation.
46
+ - name: max_tokens
47
+ description: Maximum number of tokens to generate.
48
+ - name: stop_sequences
49
+ description: List of sequences, where we stop generation if we encounter any of them.
50
+ - name: random
51
+ description: Random seed (string), which guarantees reproducibility.
52
+ - name: multi_label
53
+ description: If true, for instances with multiple correct references, the gold answer should be considered to be all of the correct references rather than any of the correct references.
54
+
55
+ ############################################################
56
+ metrics:
57
+ - name: Helpfulness
58
+ display_name: Helpfulness
59
+ short_display_name: Helpfulness
60
+ description: Whether the model appears to do what it is instructed to.
61
+ lower_is_better: false
62
+ - name: Understandability
63
+ display_name: Understandability
64
+ short_display_name: Understandability
65
+ description: Whether the answer is easy to comprehend.
66
+ lower_is_better: false
67
+ - name: Completeness
68
+ display_name: Completeness
69
+ short_display_name: Completeness
70
+ description: Whether the response provides a comprehensive answer to the question or request.
71
+ lower_is_better: false
72
+ - name: Conciseness
73
+ display_name: Conciseness
74
+ short_display_name: Conciseness
75
+ description: Whether the response is concise and to the point, without unnecessary repetition or elaboration.
76
+ lower_is_better: false
77
+ - name: Harmlessness
78
+ display_name: Harmlessness
79
+ short_display_name: Harmlessness
80
+ description: Whether the response avoids using toxic language or helping the user with harmful goals.
81
+ lower_is_better: false
82
+
83
+ ############################################################
84
+ metric_groups:
85
+ - name: instruction_following_metrics
86
+ display_name: Instruction Following
87
+ metrics:
88
+ - name: Helpfulness
89
+ split: ${main_split}
90
+ - name: Understandability
91
+ split: ${main_split}
92
+ - name: Completeness
93
+ split: ${main_split}
94
+ - name: Conciseness
95
+ split: ${main_split}
96
+ - name: Harmlessness
97
+ split: ${main_split}
98
+
99
+ ############################################################
100
+ run_groups:
101
+ - name: instruction_following
102
+ display_name: Instruction Following
103
+ description: Given an open-ended instruction in natural language, the goal is to produce a text response that is helpful, understandable, complete, concise and harmless.
104
+ subgroups:
105
+ - anthropic_hh_rlhf
106
+ - grammar
107
+ - koala
108
+ - open_assistant
109
+ - self_instruct
110
+ - vicuna
111
+
112
+ - name: anthropic_hh_rlhf
113
+ display_name: Anthropic RLHF dataset
114
+ short_display_name: Anthropic RLHF dataset
115
+ description: The dialogue datasets released by Anthropic to facilitate research in model helpfulness and harmlessness ([Bai et al., 2022](https://arxiv.org/pdf/2204.05862.pdf); [Ganguli et al., 2022](https://arxiv.org/pdf/2209.07858.pdf)). We only use the first utterance of each dialogue.
116
+ metric_groups:
117
+ - instruction_following_metrics
118
+ environment:
119
+ main_name: Helpfulness
120
+ main_split: test
121
+ taxonomy:
122
+ task: open-ended instruction following
123
+ what: "Human-LM dialogues and preference labels"
124
+ who: "Workers from MTurk and Upwork, language models from Anthropic"
125
+ when: "2022"
126
+ language: English
127
+
128
+ # Ideally, the name should be "best_chatgpt_prompts".
129
+ # But unfortunately the group name in the results is "grammar",
130
+ # so the schema has to match the same group name.
131
+ # TODO: Change the group name in the "grammar" run spec, and then change this group name.
132
+ - name: grammar
133
+ display_name: Best ChatGPT Prompts
134
+ short_display_name: Best ChatGPT Prompts
135
+ description: A list of “best ChatGPT prompts to power your workflow” summarized by [GRIDFITI](https://gridfiti.com/best-chatgpt-prompts/).
136
+ metric_groups:
137
+ - instruction_following_metrics
138
+ environment:
139
+ main_name: Helpfulness
140
+ main_split: test
141
+ taxonomy:
142
+ task: open-ended instruction following
143
+ what: "Instructions for LLMs"
144
+ who: "Gridfiti Staff"
145
+ when: "2023"
146
+ language: English
147
+
148
+ - name: koala
149
+ display_name: Koala test dataset
150
+ short_display_name: Koala test dataset
151
+ description: The test dataset from the [Koala paper](https://bair.berkeley.edu/blog/2023/04/03/koala/) for evaluating instruction-following models.
152
+ metric_groups:
153
+ - instruction_following_metrics
154
+ environment:
155
+ main_name: Helpfulness
156
+ main_split: test
157
+ taxonomy:
158
+ task: open-ended instruction following
159
+ what: "Instructions for LLMs"
160
+ who: "Web users"
161
+ when: "Before 2023"
162
+ language: English
163
+
164
+ - name: open_assistant
165
+ display_name: Open Assistant
166
+ short_display_name: Open Assistant
167
+ description: LAION’s OpenAssistant Conversations Dataset (OASST1) that consists of 66,497 conversation trees ([Köpf et al., 2023](https://openreview.net/forum?id=VSJotgbPHF)). We only use the initial prompt in each conversation.
168
+ metric_groups:
169
+ - instruction_following_metrics
170
+ environment:
171
+ main_name: Helpfulness
172
+ main_split: valid
173
+ taxonomy:
174
+ task: open-ended instruction following
175
+ what: "Human-written dialogues and response rankings"
176
+ who: "Open Assistant participants"
177
+ when: "2023"
178
+ language: "35 languages"
179
+
180
+ - name: self_instruct
181
+ display_name: Self Instruct
182
+ short_display_name: Self Instruct
183
+ description: The manually-curated instructions from the Self-Instruct paper ([Wang et al., 2023](https://aclanthology.org/2023.acl-long.754.pdf)).
184
+ metric_groups:
185
+ - instruction_following_metrics
186
+ environment:
187
+ main_name: Helpfulness
188
+ main_split: test
189
+ taxonomy:
190
+ task: open-ended instruction following
191
+ what: "Instructions for LLMs"
192
+ who: "Authors of the research paper"
193
+ when: "2022"
194
+ language: English
195
+
196
+ - name: vicuna
197
+ display_name: Vicuna
198
+ short_display_name: Vicuna
199
+ description: The set of prompts used by the [Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) team to evaluate instruction-following models.
200
+ metric_groups:
201
+ - instruction_following_metrics
202
+ environment:
203
+ main_name: Helpfulness
204
+ main_split: test
205
+ taxonomy:
206
+ task: open-ended instruction following
207
+ what: "Instructions for LLMs"
208
+ who: "Unknown"
209
+ when: "Before 2023"
210
+ language: English