crfm-helm 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic; see the registry's advisory details for more information.

Files changed (499)
  1. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +138 -31
  2. crfm_helm-0.5.1.dist-info/RECORD +654 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +31 -3
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
  9. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
  10. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  11. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  12. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  13. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
  14. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
  15. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  16. helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
  17. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
  18. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
  19. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  20. helm/benchmark/adaptation/request_state.py +6 -1
  21. helm/benchmark/adaptation/scenario_state.py +6 -2
  22. helm/benchmark/annotation/annotator.py +43 -0
  23. helm/benchmark/annotation/annotator_factory.py +61 -0
  24. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  25. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  26. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  27. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  28. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  29. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  30. helm/benchmark/annotation_executor.py +124 -0
  31. helm/benchmark/augmentations/data_augmenter.py +0 -2
  32. helm/benchmark/augmentations/gender_perturbation.py +1 -1
  33. helm/benchmark/augmentations/perturbation.py +25 -3
  34. helm/benchmark/augmentations/perturbation_description.py +1 -1
  35. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  36. helm/benchmark/augmentations/test_perturbation.py +41 -7
  37. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  38. helm/benchmark/config_registry.py +7 -1
  39. helm/benchmark/executor.py +46 -16
  40. helm/benchmark/huggingface_registration.py +20 -7
  41. helm/benchmark/metrics/basic_metrics.py +169 -664
  42. helm/benchmark/metrics/bbq_metrics.py +3 -4
  43. helm/benchmark/metrics/bias_metrics.py +6 -6
  44. helm/benchmark/metrics/classification_metrics.py +11 -8
  45. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  46. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  47. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  48. helm/benchmark/metrics/common_metric_specs.py +167 -0
  49. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  50. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  51. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  52. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  53. helm/benchmark/metrics/disinformation_metrics.py +4 -110
  54. helm/benchmark/metrics/dry_run_metrics.py +2 -2
  55. helm/benchmark/metrics/efficiency_metrics.py +213 -0
  56. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  57. helm/benchmark/metrics/evaluate_reference_metrics.py +392 -0
  58. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  59. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  60. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  61. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  62. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  63. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  64. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  65. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  66. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  67. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  68. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  69. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  70. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  71. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  72. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  73. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  74. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  75. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  76. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  77. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  78. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  79. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  80. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  81. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  82. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  83. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  84. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  85. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  86. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  87. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  88. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  89. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  90. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  91. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  92. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  93. helm/benchmark/metrics/metric.py +93 -172
  94. helm/benchmark/metrics/metric_name.py +0 -1
  95. helm/benchmark/metrics/metric_service.py +16 -0
  96. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  97. helm/benchmark/metrics/ranking_metrics.py +2 -2
  98. helm/benchmark/metrics/reference_metric.py +148 -0
  99. helm/benchmark/metrics/summac/model_summac.py +0 -2
  100. helm/benchmark/metrics/summarization_metrics.py +2 -2
  101. helm/benchmark/metrics/test_classification_metrics.py +8 -5
  102. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  103. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
  104. helm/benchmark/metrics/test_metric.py +2 -2
  105. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
  106. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  107. helm/benchmark/metrics/toxicity_utils.py +23 -0
  108. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  109. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  110. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  111. helm/benchmark/metrics/vision_language/image_metrics.py +575 -0
  112. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  113. helm/benchmark/model_deployment_registry.py +74 -0
  114. helm/benchmark/model_metadata_registry.py +41 -1
  115. helm/benchmark/multi_gpu_runner.py +133 -0
  116. helm/benchmark/presentation/create_plots.py +8 -7
  117. helm/benchmark/presentation/run_display.py +26 -10
  118. helm/benchmark/presentation/schema.py +15 -40
  119. helm/benchmark/presentation/summarize.py +119 -79
  120. helm/benchmark/presentation/table.py +8 -8
  121. helm/benchmark/presentation/test_contamination.py +2 -2
  122. helm/benchmark/presentation/test_run_entry.py +1 -2
  123. helm/benchmark/presentation/test_summarize.py +3 -3
  124. helm/benchmark/run.py +54 -26
  125. helm/benchmark/run_expander.py +205 -35
  126. helm/benchmark/run_spec.py +93 -0
  127. helm/benchmark/run_spec_factory.py +163 -0
  128. helm/benchmark/run_specs/__init__.py +0 -0
  129. helm/benchmark/run_specs/classic_run_specs.py +1510 -0
  130. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  131. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  132. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  133. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  134. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  135. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  136. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  137. helm/benchmark/run_specs/vlm_run_specs.py +757 -0
  138. helm/benchmark/runner.py +51 -57
  139. helm/benchmark/runner_config_registry.py +21 -0
  140. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  141. helm/benchmark/scenarios/bold_scenario.py +2 -2
  142. helm/benchmark/scenarios/code_scenario.py +1 -0
  143. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  144. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  145. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  146. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  147. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  148. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  149. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  150. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  151. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  152. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  153. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  154. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  155. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  156. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  157. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  158. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  159. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  160. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  161. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  162. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  163. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  164. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  165. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  166. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  167. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  168. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  169. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  170. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  171. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  172. helm/benchmark/scenarios/legalbench_scenario.py +6 -2
  173. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  174. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  175. helm/benchmark/scenarios/math_scenario.py +19 -2
  176. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  177. helm/benchmark/scenarios/numeracy_scenario.py +1 -1
  178. helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
  179. helm/benchmark/scenarios/scenario.py +4 -0
  180. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  181. helm/benchmark/scenarios/test_math_scenario.py +6 -0
  182. helm/benchmark/scenarios/test_scenario.py +6 -3
  183. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  184. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  185. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  186. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  187. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  188. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  189. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  190. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
  191. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  192. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  193. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
  194. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  195. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  196. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  197. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  198. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  199. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  200. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  201. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  202. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  203. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  204. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  205. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  206. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  207. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  208. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  209. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  210. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  211. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  212. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  213. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  214. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  215. helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
  216. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  217. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  218. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  219. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +3 -4
  220. helm/benchmark/scenarios/vision_language/vqa_scenario.py +5 -3
  221. helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
  222. helm/benchmark/server.py +24 -1
  223. helm/benchmark/slurm_runner.py +70 -49
  224. helm/benchmark/static/benchmarking.js +1 -1
  225. helm/benchmark/static/schema_classic.yaml +258 -1066
  226. helm/benchmark/static/schema_image2structure.yaml +304 -0
  227. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  228. helm/benchmark/static/schema_lite.yaml +2 -227
  229. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  230. helm/benchmark/static/schema_unitxt.yaml +428 -0
  231. helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
  232. helm/benchmark/static/schema_vlm.yaml +823 -0
  233. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  234. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  235. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  236. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  237. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  238. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  239. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  240. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  241. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  242. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  243. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  244. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  245. helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
  246. helm/benchmark/static_build/assets/index-878a1094.css +1 -0
  247. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  248. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  249. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  250. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  251. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  252. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  253. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  254. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  255. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  256. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  257. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  258. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  259. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  260. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  261. helm/benchmark/static_build/config.js +4 -0
  262. helm/benchmark/static_build/index.html +20 -0
  263. helm/benchmark/test_data_preprocessor.py +3 -3
  264. helm/benchmark/test_run_expander.py +1 -1
  265. helm/benchmark/window_services/ai21_window_service.py +22 -33
  266. helm/benchmark/window_services/cohere_window_service.py +1 -63
  267. helm/benchmark/window_services/default_window_service.py +2 -44
  268. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  269. helm/benchmark/window_services/ice_window_service.py +0 -34
  270. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  271. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  272. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  273. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  274. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  275. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  276. helm/benchmark/window_services/local_window_service.py +21 -4
  277. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  278. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  279. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  280. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  281. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  282. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  283. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  284. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  285. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  286. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  287. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  288. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  289. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  290. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  291. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  292. helm/benchmark/window_services/test_utils.py +3 -2
  293. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  294. helm/benchmark/window_services/window_service.py +42 -0
  295. helm/benchmark/window_services/window_service_factory.py +4 -1
  296. helm/benchmark/window_services/yalm_window_service.py +0 -27
  297. helm/clients/__init__.py +0 -0
  298. helm/{proxy/clients → clients}/ai21_client.py +3 -9
  299. helm/clients/aleph_alpha_client.py +112 -0
  300. helm/{proxy/clients → clients}/anthropic_client.py +233 -18
  301. helm/{proxy/clients → clients}/auto_client.py +59 -31
  302. helm/clients/bedrock_client.py +128 -0
  303. helm/clients/bedrock_utils.py +72 -0
  304. helm/{proxy/clients → clients}/client.py +65 -7
  305. helm/clients/clip_score_client.py +49 -0
  306. helm/clients/clip_scorers/__init__.py +0 -0
  307. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  308. helm/clients/clip_scorers/clip_scorer.py +50 -0
  309. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  310. helm/{proxy/clients → clients}/cohere_client.py +4 -11
  311. helm/clients/gcs_client.py +82 -0
  312. helm/{proxy/clients → clients}/google_client.py +5 -5
  313. helm/clients/google_translate_client.py +35 -0
  314. helm/{proxy/clients → clients}/http_model_client.py +5 -7
  315. helm/{proxy/clients → clients}/huggingface_client.py +43 -64
  316. helm/clients/image_generation/__init__.py +0 -0
  317. helm/clients/image_generation/adobe_vision_client.py +78 -0
  318. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  319. helm/clients/image_generation/cogview2/__init__.py +0 -0
  320. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  321. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  322. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  323. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  324. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  325. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  326. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  327. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  328. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  329. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  330. helm/clients/image_generation/cogview2_client.py +191 -0
  331. helm/clients/image_generation/dalle2_client.py +192 -0
  332. helm/clients/image_generation/dalle3_client.py +108 -0
  333. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  334. helm/clients/image_generation/dalle_mini/data.py +442 -0
  335. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  336. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  337. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  338. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  339. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  340. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  341. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  342. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  343. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  344. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  345. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  346. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  347. helm/clients/image_generation/dalle_mini_client.py +190 -0
  348. helm/clients/image_generation/deep_floyd_client.py +78 -0
  349. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  350. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  351. helm/clients/image_generation/lexica_client.py +86 -0
  352. helm/clients/image_generation/mindalle/__init__.py +0 -0
  353. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  354. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  355. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  356. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  357. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  358. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  359. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  360. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  361. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  362. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  363. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  364. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  365. helm/clients/image_generation/mindalle_client.py +115 -0
  366. helm/clients/image_generation/nudity_check_client.py +64 -0
  367. helm/clients/image_generation/together_image_generation_client.py +111 -0
  368. helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
  369. helm/{proxy/clients → clients}/megatron_client.py +5 -5
  370. helm/clients/mistral_client.py +134 -0
  371. helm/clients/moderation_api_client.py +109 -0
  372. helm/clients/open_lm_client.py +43 -0
  373. helm/clients/openai_client.py +301 -0
  374. helm/{proxy/clients → clients}/palmyra_client.py +6 -8
  375. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  376. helm/clients/simple_client.py +64 -0
  377. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  378. helm/clients/test_client.py +100 -0
  379. helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
  380. helm/clients/test_simple_client.py +19 -0
  381. helm/{proxy/clients → clients}/test_together_client.py +20 -8
  382. helm/{proxy/clients → clients}/together_client.py +104 -73
  383. helm/clients/vertexai_client.py +400 -0
  384. helm/clients/vision_language/__init__.py +0 -0
  385. helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
  386. helm/clients/vision_language/huggingface_vlm_client.py +111 -0
  387. helm/{proxy/clients → clients}/vision_language/idefics_client.py +54 -49
  388. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  389. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  390. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  391. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  392. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  393. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  394. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  395. helm/clients/vision_language/open_flamingo_client.py +155 -0
  396. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  397. helm/clients/vllm_client.py +46 -0
  398. helm/common/cache.py +16 -4
  399. helm/common/cache_backend_config.py +47 -0
  400. helm/common/clip_score_request.py +41 -0
  401. helm/common/file_caches/__init__.py +0 -0
  402. helm/common/file_caches/file_cache.py +16 -0
  403. helm/common/file_caches/local_file_cache.py +61 -0
  404. helm/common/file_caches/test_local_file_cache.py +25 -0
  405. helm/common/file_upload_request.py +27 -0
  406. helm/common/general.py +1 -1
  407. helm/common/image_generation_parameters.py +25 -0
  408. helm/common/images_utils.py +33 -3
  409. helm/common/key_value_store.py +35 -4
  410. helm/common/media_object.py +13 -0
  411. helm/common/moderations_api_request.py +71 -0
  412. helm/common/mongo_key_value_store.py +3 -3
  413. helm/common/multimodal_request_utils.py +31 -0
  414. helm/common/nudity_check_request.py +29 -0
  415. helm/common/request.py +15 -17
  416. helm/common/test_general.py +6 -0
  417. helm/common/tokenization_request.py +1 -1
  418. helm/config/model_deployments.yaml +1159 -538
  419. helm/config/model_metadata.yaml +868 -41
  420. helm/config/tokenizer_configs.yaml +149 -43
  421. helm/proxy/accounts.py +31 -4
  422. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  423. helm/proxy/critique/model_critique_client.py +8 -6
  424. helm/proxy/example_queries.py +29 -17
  425. helm/proxy/server.py +70 -5
  426. helm/proxy/services/remote_service.py +31 -0
  427. helm/proxy/services/server_service.py +96 -16
  428. helm/proxy/services/service.py +30 -0
  429. helm/proxy/services/test_remote_service.py +4 -3
  430. helm/proxy/services/test_service.py +0 -12
  431. helm/proxy/test_accounts.py +32 -0
  432. helm/proxy/token_counters/auto_token_counter.py +37 -37
  433. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  434. helm/proxy/token_counters/token_counter.py +3 -5
  435. helm/tokenizers/__init__.py +0 -0
  436. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  437. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
  438. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
  439. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  440. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  441. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
  442. helm/tokenizers/simple_tokenizer.py +33 -0
  443. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
  444. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
  445. helm/tokenizers/test_simple_tokenizer.py +33 -0
  446. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  447. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  448. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  449. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  450. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  451. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  452. helm/benchmark/run_specs.py +0 -2762
  453. helm/benchmark/test_model_deployment_definition.py +0 -92
  454. helm/benchmark/test_model_properties.py +0 -1570
  455. helm/benchmark/vlm_run_specs.py +0 -97
  456. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  457. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  458. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  459. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  460. helm/benchmark/window_services/t511b_window_service.py +0 -30
  461. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  462. helm/benchmark/window_services/ul2_window_service.py +0 -30
  463. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  464. helm/common/cache_utils.py +0 -14
  465. helm/proxy/clients/aleph_alpha_client.py +0 -95
  466. helm/proxy/clients/goose_ai_client.py +0 -99
  467. helm/proxy/clients/microsoft_client.py +0 -180
  468. helm/proxy/clients/openai_client.py +0 -206
  469. helm/proxy/clients/simple_client.py +0 -60
  470. helm/proxy/clients/test_client.py +0 -49
  471. helm/proxy/clients/vertexai_client.py +0 -115
  472. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  473. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  474. helm/proxy/token_counters/free_token_counter.py +0 -12
  475. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  476. helm/proxy/token_counters/openai_token_counter.py +0 -22
  477. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  478. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  479. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  480. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
  481. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
  482. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0
  483. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  484. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  485. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  486. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  487. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  488. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  489. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  490. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  491. /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
  492. /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
  493. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  494. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  495. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  496. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  497. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  498. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  499. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -0,0 +1,428 @@
1
+ ---
2
+ ############################################################
3
+ adapter:
4
+ - name: method
5
+ description: The high-level strategy for converting instances into a prompt for the language model.
6
+ values:
7
+ - name: generation
8
+ description: Given the input, the model generates the output free-form.
9
+ - name: multiple_choice_joint
10
+ description: Given the input, the model selects from multiple-choice options (A., B., C., D., E.).
11
+ - name: multiple_choice_separate_original
12
+ description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability.
13
+ - name: multiple_choice_separate_calibrated
14
+ description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability when calibrated by answer choice probability.
15
+ - name: language_modeling
16
+ description: Given the input, the model assigns the sequence a probability.
17
+ - name: instructions
18
+ description: The description of the task that is included at the very beginning of the prompt.
19
+ - name: global_prefix
20
+ description: The string that is prepended to the prompt.
21
+ - name: global_suffix
22
+ description: The string that is appended to the prompt.
23
+ - name: instance_prefix
24
+ description: The string that is included before each instance (e.g., '\n\n').
25
+ - name: input_prefix
26
+ description: The string that is included before each input (e.g., 'Question:').
27
+ - name: input_suffix
28
+ description: The string that is included after each input (e.g., '\n').
29
+ - name: reference_prefix
30
+ description: The string that is included before each reference (for multiple-choice questions).
31
+ - name: reference_suffix
32
+ description: The string that is included after each reference (for multiple-choice questions).
33
+ - name: output_prefix
34
+ description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
35
+ - name: output_suffix
36
+ description: The string that is included after the correct answer/predicted output (e.g., '\n').
37
+ - name: substitutions
38
+ description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
39
+ - name: max_train_instances
40
+ description: Maximum number of training instances to include in the prompt (currently by randomly sampling).
41
+ - name: max_eval_instances
42
+ description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
43
+ - name: num_outputs
44
+ description: Maximum number of possible outputs to generate by sampling multiple outputs.
45
+ - name: num_train_trials
46
+ description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
47
+ - name: sample_train
48
+ description: If true, randomly sample N training examples; if false, select N consecutive training examples
49
+ - name: model
50
+ description: Name of the language model (<creator_organization>/<model name>) to send requests to.
51
+ - name: model_deployment
52
+ description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
53
+ - name: temperature
54
+ description: Temperature parameter used in generation.
55
+ - name: max_tokens
56
+ description: Maximum number of tokens to generate.
57
+ - name: stop_sequences
58
+ description: List of sequences, where we stop generation if we encounter any of them.
59
+ - name: random
60
+ description: Random seed (string), which guarantees reproducibility.
61
+ - name: multi_label
62
+ description: If true, for instances with multiple correct references, the gold answer should be considered to be all of the correct references rather than any of the correct references.
63
+
64
+ ############################################################
65
+ metrics:
66
+ # Infrastructure metrics:
67
+ - name: num_perplexity_tokens
68
+ display_name: '# tokens'
69
+ description: Average number of tokens in the predicted output (for language modeling, the input too).
70
+ - name: num_bytes
71
+ display_name: '# bytes'
72
+ description: Average number of bytes in the predicted output (for language modeling, the input too).
73
+ - name: num_references
74
+ display_name: '# ref'
75
+ description: Number of references.
76
+ - name: num_train_trials
77
+ display_name: '# trials'
78
+ description: Number of trials, where in each trial we choose an independent, random set of training instances.
79
+ - name: estimated_num_tokens_cost
80
+ display_name: 'cost'
81
+ description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
82
+ - name: num_prompt_tokens
83
+ display_name: '# prompt tokens'
84
+ description: Number of tokens in the prompt.
85
+ - name: num_prompt_characters
86
+ display_name: '# prompt chars'
87
+ description: Number of characters in the prompt.
88
+ - name: num_completion_tokens
89
+ display_name: '# completion tokens'
90
+ description: Actual number of completion tokens (over all completions).
91
+ - name: num_output_tokens
92
+ display_name: '# output tokens'
93
+ description: Actual number of output tokens.
94
+ - name: max_num_output_tokens
95
+ display_name: 'Max output tokens'
96
+ description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
97
+ - name: num_requests
98
+ display_name: '# requests'
99
+ description: Number of distinct API requests.
100
+ - name: num_instances
101
+ display_name: '# eval'
102
+ description: Number of evaluation instances.
103
+ - name: num_train_instances
104
+ display_name: '# train'
105
+ description: Number of training instances (e.g., in-context examples).
106
+ - name: prompt_truncated
107
+ display_name: truncated
108
+ description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
109
+ - name: finish_reason_length
110
+ display_name: finish b/c length
111
+ description: Fraction of instances where the output was terminated because of the max tokens limit.
112
+ - name: finish_reason_stop
113
+ display_name: finish b/c stop
114
+ description: Fraction of instances where the output was terminated because of the stop sequences.
115
+ - name: finish_reason_endoftext
116
+ display_name: finish b/c endoftext
117
+ description: Fraction of instances where the output was terminated because the end of text token was generated.
118
+ - name: finish_reason_unknown
119
+ display_name: finish b/c unknown
120
+ description: Fraction of instances where the output was terminated for unknown reasons.
121
+ - name: num_completions
122
+ display_name: '# completions'
123
+ description: Number of completions.
124
+ - name: predicted_index
125
+ display_name: Predicted index
126
+ description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
127
+
128
+ # Efficiency metrics:
129
+ - name: training_co2_cost
130
+ display_name: Estimated training emissions (kg CO2)
131
+ short_display_name: Training emissions (kg CO2)
132
+ lower_is_better: true
133
+ description: Estimate of the CO2 emissions from training the model.
134
+ - name: training_energy_cost
135
+ display_name: Estimated training energy cost (MWh)
136
+ short_display_name: Training energy (MWh)
137
+ lower_is_better: true
138
+ description: Estimate of the amount of energy used to train the model.
139
+ - name: inference_runtime
140
+ display_name: Observed inference runtime (s)
141
+ short_display_name: Observed inference time (s)
142
+ lower_is_better: true
143
+ description: Average observed time to process a request to the model (via an API, and thus depends on particular deployment).
144
+ - name: inference_idealized_runtime
145
+ display_name: Idealized inference runtime (s)
146
+ short_display_name: Idealized inference time (s)
147
+ lower_is_better: true
148
+ description: Average time to process a request to the model based solely on the model architecture (using Megatron-LM).
149
+ - name: inference_denoised_runtime
150
+ display_name: Denoised inference runtime (s)
151
+ short_display_name: Denoised inference time (s)
152
+ lower_is_better: true
153
+ description: Average time to process a request to the model minus performance contention by using profiled runtimes from multiple trials of SyntheticEfficiencyScenario.
154
+ - name: batch_size
155
+ display_name: Batch size
156
+ description: For batch jobs, how many requests are in a batch.
157
+
158
+ # Unitxt Metrics
159
+ - name: accuracy
160
+ display_name: accuracy
161
+ short_display_name: accuracy
162
+ description: accuracy
163
+ - name: accuracy_ci_high
164
+ display_name: accuracy_ci_high
165
+ short_display_name: accuracy_ci_high
166
+ description: accuracy_ci_high
167
+ - name: accuracy_ci_low
168
+ display_name: accuracy_ci_low
169
+ short_display_name: accuracy_ci_low
170
+ description: accuracy_ci_low
171
+ - name: f1_audio_volume_up
172
+ display_name: f1_audio_volume_up
173
+ short_display_name: f1_audio_volume_up
174
+ description: f1_audio_volume_up
175
+ - name: f1_calendar_remove
176
+ display_name: f1_calendar_remove
177
+ short_display_name: f1_calendar_remove
178
+ description: f1_calendar_remove
179
+ - name: f1_contradiction
180
+ display_name: f1_contradiction
181
+ short_display_name: f1_contradiction
182
+ description: f1_contradiction
183
+ - name: f1_cooking_recipe
184
+ display_name: f1_cooking_recipe
185
+ short_display_name: f1_cooking_recipe
186
+ description: f1_cooking_recipe
187
+ - name: f1_datetime_query
188
+ display_name: f1_datetime_query
189
+ short_display_name: f1_datetime_query
190
+ description: f1_datetime_query
191
+ - name: f1_entailment
192
+ display_name: f1_entailment
193
+ short_display_name: f1_entailment
194
+ description: f1_entailment
195
+ - name: f1_lists_remove
196
+ display_name: f1_lists_remove
197
+ short_display_name: f1_lists_remove
198
+ description: f1_lists_remove
199
+ - name: f1_macro
200
+ display_name: f1_macro
201
+ short_display_name: f1_macro
202
+ description: f1_macro
203
+ - name: f1_macro_ci_high
204
+ display_name: f1_macro_ci_high
205
+ short_display_name: f1_macro_ci_high
206
+ description: f1_macro_ci_high
207
+ - name: f1_macro_ci_low
208
+ display_name: f1_macro_ci_low
209
+ short_display_name: f1_macro_ci_low
210
+ description: f1_macro_ci_low
211
+ - name: f1_micro
212
+ display_name: f1_micro
213
+ short_display_name: f1_micro
214
+ description: f1_micro
215
+ - name: f1_micro_ci_high
216
+ display_name: f1_micro_ci_high
217
+ short_display_name: f1_micro_ci_high
218
+ description: f1_micro_ci_high
219
+ - name: f1_micro_ci_low
220
+ display_name: f1_micro_ci_low
221
+ short_display_name: f1_micro_ci_low
222
+ description: f1_micro_ci_low
223
+ - name: f1_music_query
224
+ display_name: f1_music_query
225
+ short_display_name: f1_music_query
226
+ description: f1_music_query
227
+ - name: f1_neutral
228
+ display_name: f1_neutral
229
+ short_display_name: f1_neutral
230
+ description: f1_neutral
231
+ - name: f1_news_query
232
+ display_name: f1_news_query
233
+ short_display_name: f1_news_query
234
+ description: f1_news_query
235
+ - name: f1_play_game
236
+ display_name: f1_play_game
237
+ short_display_name: f1_play_game
238
+ description: f1_play_game
239
+ - name: f1_play_music
240
+ display_name: f1_play_music
241
+ short_display_name: f1_play_music
242
+ description: f1_play_music
243
+ - name: fairness
244
+ display_name: fairness
245
+ short_display_name: fairness
246
+ description: fairness
247
+ - name: groups_mean_score
248
+ display_name: groups_mean_score
249
+ short_display_name: groups_mean_score
250
+ description: groups_mean_score
251
+ - name: max_prob
252
+ display_name: max_prob
253
+ short_display_name: max_prob
254
+ description: max_prob
255
+ - name: perplexity
256
+ display_name: perplexity
257
+ short_display_name: perplexity
258
+ description: perplexity
259
+ - name: robustness
260
+ display_name: robustness
261
+ short_display_name: robustness
262
+ description: robustness
263
+ - name: rouge1
264
+ display_name: rouge1
265
+ short_display_name: rouge1
266
+ description: rouge1
267
+ - name: rouge2
268
+ display_name: rouge2
269
+ short_display_name: rouge2
270
+ description: rouge2
271
+ - name: rougeL
272
+ display_name: rougeL
273
+ short_display_name: rougeL
274
+ description: rougeL
275
+ - name: rougeLsum
276
+ display_name: rougeLsum
277
+ short_display_name: rougeLsum
278
+ description: rougeLsum
279
+ - name: score_ci_high
280
+ display_name: score_ci_high
281
+ short_display_name: score_ci_high
282
+ description: score_ci_high
283
+ - name: score_ci_low
284
+ display_name: score_ci_low
285
+ short_display_name: score_ci_low
286
+ description: score_ci_low
287
+
288
+ perturbations: []
289
+
290
+ metric_groups:
291
+ - name: main_score
292
+ display_name: Main Score
293
+ metrics:
294
+ - name: ${main_name}
295
+ split: __all__
296
+
297
+ - name: efficiency
298
+ display_name: Efficiency
299
+ metrics:
300
+ - name: inference_runtime
301
+ split: ${main_split}
302
+
303
+ - name: general_information
304
+ display_name: General information
305
+ metrics:
306
+ - name: num_instances
307
+ split: ${main_split}
308
+ - name: num_train_instances
309
+ split: ${main_split}
310
+ - name: prompt_truncated
311
+ split: ${main_split}
312
+ - name: num_prompt_tokens
313
+ split: ${main_split}
314
+ - name: num_output_tokens
315
+ split: ${main_split}
316
+
317
+ - name: classification_metrics
318
+ display_name: Main Score
319
+ metrics:
320
+ # Not included because already in main_score
321
+ # - name: accuracy
322
+ # split: __all__
323
+ - name: f1_macro
324
+ split: __all__
325
+ - name: f1_micro
326
+ split: __all__
327
+
328
+ - name: summarization_metrics
329
+ display_name: Main Score
330
+ metrics:
331
+ - name: rouge1
332
+ split: __all__
333
+ - name: rouge2
334
+ split: __all__
335
+ # Not included because already in main_score
336
+ # - name: rougeL
337
+ # split: __all__
338
+ - name: rougeLsum
339
+ split: __all__
340
+
341
+ run_groups:
342
+ - name: spanish_scenarios
343
+ display_name: Spanish Scenarios
344
+ description: Spanish Scenarios
345
+ category: Multi-lingual Scenarios
346
+ subgroups:
347
+ - unitxt_cards.amazon_mass.es_ES
348
+ - unitxt_cards.xnli.es
349
+ - unitxt_cards.xlsum.spanish
350
+ - unitxt_cards.mlsum.es
351
+
352
+ - name: unitxt_cards.amazon_mass.es_ES
353
+ display_name: Amazon MASS
354
+ short_display_name: Amazon MASS
355
+ description: Amazon MASS
356
+ metric_groups:
357
+ - main_score
358
+ - classification_metrics
359
+ - efficiency
360
+ - general_information
361
+ environment:
362
+ # TODO: f1_macro instead?
363
+ main_name: accuracy
364
+ main_split: test
365
+ taxonomy:
366
+ task: "?"
367
+ what: "?"
368
+ who: "?"
369
+ when: "?"
370
+ language: Spanish
371
+
372
+ - name: unitxt_cards.xnli.es
373
+ display_name: XNLI
374
+ short_display_name: XNLI
375
+ description: XNLI
376
+ metric_groups:
377
+ - main_score
378
+ - classification_metrics
379
+ - efficiency
380
+ - general_information
381
+ environment:
382
+ # TODO: f1_macro instead?
383
+ main_name: accuracy
384
+ main_split: test
385
+ taxonomy:
386
+ task: "?"
387
+ what: "?"
388
+ who: "?"
389
+ when: "?"
390
+ language: Spanish
391
+
392
+ - name: unitxt_cards.xlsum.spanish
393
+ display_name: XL-Sum
394
+ short_display_name: XL-Sum
395
+ description: XL-Sum
396
+ metric_groups:
397
+ - main_score
398
+ - summarization_metrics
399
+ - efficiency
400
+ - general_information
401
+ environment:
402
+ main_name: rougeL
403
+ main_split: test
404
+ taxonomy:
405
+ task: "?"
406
+ what: "?"
407
+ who: "?"
408
+ when: "?"
409
+ language: Spanish
410
+
411
+ - name: unitxt_cards.mlsum.es
412
+ display_name: MLSUM
413
+ short_display_name: MLSUM
414
+ description: MLSUM
415
+ metric_groups:
416
+ - main_score
417
+ - summarization_metrics
418
+ - efficiency
419
+ - general_information
420
+ environment:
421
+ main_name: rougeL
422
+ main_split: test
423
+ taxonomy:
424
+ task: "?"
425
+ what: "?"
426
+ who: "?"
427
+ when: "?"
428
+ language: Spanish
@@ -0,0 +1,164 @@
1
+ ---
2
+ ############################################################
3
+ adapter:
4
+ - name: method
5
+ description: The high-level strategy for converting instances into a prompt for the language model.
6
+ values:
7
+ - name: generation
8
+ description: Given the input, the model generates the output free-form.
9
+ - name: generation_multimodal
10
+ description: Given the multimodal input, the model generates the output free-form.
11
+ - name: multiple_choice_joint
12
+ description: Given the input, the model selects from multiple-choice options (A., B., C., D., E.).
13
+ - name: multiple_choice_joint_multimodal
14
+ description: Given the multimodal input, the model selects from multiple-choice options (A., B., C., D., E.).
15
+ - name: instructions
16
+ description: The description of the task that is included at the very beginning of the prompt.
17
+ - name: global_prefix
18
+ description: The string that is prepended to the prompt.
19
+ - name: global_suffix
20
+ description: The string that is appended to the prompt.
21
+ - name: instance_prefix
22
+ description: The string that is included before each instance (e.g., '\n\n').
23
+ - name: input_prefix
24
+ description: The string that is included before each input (e.g., 'Question:').
25
+ - name: input_suffix
26
+ description: The string that is included after each input (e.g., '\n').
27
+ - name: reference_prefix
28
+ description: The string that is included before each reference (for multiple-choice questions).
29
+ - name: reference_suffix
30
+ description: The string that is included after each reference (for multiple-choice questions).
31
+ - name: output_prefix
32
+ description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
33
+ - name: output_suffix
34
+ description: The string that is included after the correct answer/predicted output (e.g., '\n').
35
+ - name: substitutions
36
+ description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
37
+ - name: max_train_instances
38
+ description: Maximum number of training instances to include in the prompt (currently by randomly sampling).
39
+ - name: max_eval_instances
40
+ description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
41
+ - name: num_outputs
42
+ description: Maximum number of possible outputs to generate by sampling multiple outputs.
43
+ - name: num_train_trials
44
+ description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
45
+ - name: sample_train
46
+ description: If true, randomly sample N training examples; if false, select N consecutive training examples
47
+ - name: model
48
+ description: Name of the language model (<creator_organization>/<model name>) to send requests to.
49
+ - name: model_deployment
50
+ description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
51
+ - name: temperature
52
+ description: Temperature parameter used in generation.
53
+ - name: max_tokens
54
+ description: Maximum number of tokens to generate.
55
+ - name: stop_sequences
56
+ description: List of sequences, where we stop generation if we encounter any of them.
57
+ - name: random
58
+ description: Random seed (string), which guarantees reproducibility.
59
+ - name: multi_label
60
+ description: If true, for instances with multiple correct reference, the gold answer should be considered to be all of the correct references rather than any of the correct references.
61
+
62
+ perturbations:
63
+ - name: robustness
64
+ display_name: Robustness
65
+ description: Computes worst case over different robustness perturbations (misspellings, formatting, contrast sets).
66
+
67
+ ############################################################
68
+ metrics:
69
+ # Accuracy metrics:
70
+ - name: exact_match
71
+ display_name: Exact match
72
+ short_display_name: EM
73
+ description: Fraction of instances that the predicted output matches a correct reference exactly.
74
+ lower_is_better: false
75
+ - name: quasi_exact_match
76
+ display_name: Quasi-exact match
77
+ short_display_name: EM
78
+ description: Fraction of instances that the predicted output matches a correct reference up to light processing.
79
+ lower_is_better: false
80
+ - name: prefix_exact_match
81
+ display_name: Prefix exact match
82
+ short_display_name: PEM
83
+ description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
84
+ lower_is_better: false
85
+ - name: quasi_prefix_exact_match
86
+ # TODO: should call this prefix_quasi_exact_match
87
+ display_name: Prefix quasi-exact match
88
+ short_display_name: PEM
89
+ description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
90
+ lower_is_better: false
91
+
92
+ - name: f1_score
93
+ display_name: F1
94
+ description: Average F1 score in terms of word overlap between the model output and correct reference.
95
+ lower_is_better: false
96
+ - name: cider
97
+ display_name: CIDEr
98
+ description: Evaluates the quality of generated caption by measuring the weighted similarity of n-grams between the captions and a set of human-written reference captions, emphasizing informativeness and consensus.
99
+ lower_is_better: false
100
+
101
+
102
+ ############################################################
103
+ metric_groups:
104
+ - name: accuracy
105
+ display_name: Accuracy
106
+ metrics:
107
+ - name: ${main_name}
108
+ split: ${main_split}
109
+
110
+ ############################################################
111
+ run_groups:
112
+ - name: core_scenarios
113
+ display_name: Core scenarios
114
+ description: The scenarios where we evaluate all the models.
115
+ category: All scenarios
116
+ subgroups:
117
+ - viz_wiz
118
+ - vqa
119
+ - mmmu
120
+
121
+ - name: viz_wiz
122
+ display_name: VizWiz
123
+ description: A benchmark for visual question answering with images and questions created by visually impaired people [(Gurari et al., 2018)](https://arxiv.org/abs/1802.08218).
124
+ metric_groups:
125
+ - accuracy
126
+ environment:
127
+ main_name: quasi_exact_match
128
+ main_split: valid
129
+ taxonomy:
130
+ task: multimodal short answer question answering
131
+ what: Real-world images
132
+ who: Visually impaired people
133
+ when: "2018"
134
+ language: English
135
+
136
+ - name: vqa
137
+ display_name: VQAv2
138
+ description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
139
+ metric_groups:
140
+ - accuracy
141
+ environment:
142
+ main_name: quasi_exact_match
143
+ main_split: valid
144
+ taxonomy:
145
+ task: multimodal short answer question answering
146
+ what: Real-world images
147
+ who: Human experts
148
+ when: "2017"
149
+ language: English
150
+
151
+ - name: mmmu
152
+ display_name: MMMU
153
+ description: A benchmark designed to evaluate multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning [(Yue et al., 2023)](https://arxiv.org/abs/2311.16502).
154
+ metric_groups:
155
+ - accuracy
156
+ environment:
157
+ main_name: exact_match
158
+ main_split: valid
159
+ taxonomy:
160
+ task: multimodal multiple-choice question answering
161
+ what: Art & Design, Business, Science, Health & Medicine, Humanities & Social Science, and Tech & Engineering
162
+ who: Human experts
163
+ when: "2023"
164
+ language: English