crfm-helm 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482) hide show
  1. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +134 -31
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +31 -3
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
  9. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
  10. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  11. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
  12. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  13. helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
  14. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
  15. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
  16. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  17. helm/benchmark/adaptation/request_state.py +6 -1
  18. helm/benchmark/adaptation/scenario_state.py +6 -2
  19. helm/benchmark/annotation/annotator.py +43 -0
  20. helm/benchmark/annotation/annotator_factory.py +61 -0
  21. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  22. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  23. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  24. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  25. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  26. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  27. helm/benchmark/annotation_executor.py +124 -0
  28. helm/benchmark/augmentations/data_augmenter.py +0 -2
  29. helm/benchmark/augmentations/gender_perturbation.py +1 -1
  30. helm/benchmark/augmentations/perturbation.py +8 -2
  31. helm/benchmark/augmentations/perturbation_description.py +1 -1
  32. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  33. helm/benchmark/augmentations/test_perturbation.py +11 -7
  34. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  35. helm/benchmark/config_registry.py +7 -1
  36. helm/benchmark/executor.py +46 -16
  37. helm/benchmark/huggingface_registration.py +20 -7
  38. helm/benchmark/metrics/basic_metrics.py +169 -664
  39. helm/benchmark/metrics/bbq_metrics.py +3 -4
  40. helm/benchmark/metrics/bias_metrics.py +6 -6
  41. helm/benchmark/metrics/classification_metrics.py +11 -8
  42. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  43. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  44. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  45. helm/benchmark/metrics/common_metric_specs.py +167 -0
  46. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  47. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  48. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  49. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  50. helm/benchmark/metrics/disinformation_metrics.py +4 -110
  51. helm/benchmark/metrics/dry_run_metrics.py +2 -2
  52. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  53. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  54. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  55. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  56. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  57. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  58. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  59. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  60. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  61. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  62. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  63. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  64. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  65. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  66. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  67. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  68. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  69. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  70. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  71. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  72. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  73. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  74. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  75. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  76. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  77. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  78. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  79. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  80. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  81. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  82. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  83. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  84. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  85. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  86. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  87. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  88. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  89. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  90. helm/benchmark/metrics/metric.py +93 -172
  91. helm/benchmark/metrics/metric_name.py +0 -1
  92. helm/benchmark/metrics/metric_service.py +16 -0
  93. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  94. helm/benchmark/metrics/ranking_metrics.py +2 -2
  95. helm/benchmark/metrics/reference_metric.py +148 -0
  96. helm/benchmark/metrics/summac/model_summac.py +0 -2
  97. helm/benchmark/metrics/summarization_metrics.py +2 -2
  98. helm/benchmark/metrics/test_classification_metrics.py +8 -5
  99. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  100. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
  101. helm/benchmark/metrics/test_metric.py +2 -2
  102. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
  103. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  104. helm/benchmark/metrics/toxicity_utils.py +23 -0
  105. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  106. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  107. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  108. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  109. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  110. helm/benchmark/model_deployment_registry.py +74 -0
  111. helm/benchmark/model_metadata_registry.py +36 -0
  112. helm/benchmark/multi_gpu_runner.py +133 -0
  113. helm/benchmark/presentation/create_plots.py +8 -7
  114. helm/benchmark/presentation/run_display.py +26 -10
  115. helm/benchmark/presentation/schema.py +15 -40
  116. helm/benchmark/presentation/summarize.py +119 -79
  117. helm/benchmark/presentation/table.py +8 -8
  118. helm/benchmark/presentation/test_contamination.py +2 -2
  119. helm/benchmark/presentation/test_run_entry.py +1 -2
  120. helm/benchmark/presentation/test_summarize.py +3 -3
  121. helm/benchmark/run.py +54 -26
  122. helm/benchmark/run_expander.py +214 -16
  123. helm/benchmark/run_spec.py +93 -0
  124. helm/benchmark/run_spec_factory.py +162 -0
  125. helm/benchmark/run_specs/__init__.py +0 -0
  126. helm/benchmark/run_specs/classic_run_specs.py +1510 -0
  127. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  128. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  129. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  130. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  131. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  132. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  133. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  134. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  135. helm/benchmark/runner.py +51 -57
  136. helm/benchmark/runner_config_registry.py +21 -0
  137. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  138. helm/benchmark/scenarios/bold_scenario.py +2 -2
  139. helm/benchmark/scenarios/code_scenario.py +1 -0
  140. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  141. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  142. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  143. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  144. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  145. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  146. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  147. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  148. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  149. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  150. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  151. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  152. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  153. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  154. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  155. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  156. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  157. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  158. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  159. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  160. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  161. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  162. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  163. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  164. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  165. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  166. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  167. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  168. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  169. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  170. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  171. helm/benchmark/scenarios/math_scenario.py +19 -2
  172. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  173. helm/benchmark/scenarios/numeracy_scenario.py +1 -1
  174. helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
  175. helm/benchmark/scenarios/scenario.py +4 -0
  176. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  177. helm/benchmark/scenarios/test_math_scenario.py +6 -0
  178. helm/benchmark/scenarios/test_scenario.py +6 -3
  179. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  180. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  181. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  182. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  183. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  184. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  185. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  186. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  187. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  188. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  189. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  190. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  191. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  192. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  193. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  194. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  195. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  196. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  197. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  198. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  199. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  200. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  201. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  202. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  203. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  204. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  205. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -2
  206. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  207. helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
  208. helm/benchmark/server.py +24 -1
  209. helm/benchmark/slurm_runner.py +70 -49
  210. helm/benchmark/static/benchmarking.js +1 -1
  211. helm/benchmark/static/schema_classic.yaml +258 -1066
  212. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  213. helm/benchmark/static/schema_lite.yaml +2 -227
  214. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  215. helm/benchmark/static/schema_unitxt.yaml +428 -0
  216. helm/benchmark/static/schema_vlm.yaml +576 -0
  217. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  218. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  219. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  220. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  221. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  222. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  223. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  224. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  225. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  226. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  227. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  228. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  229. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  230. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  231. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  232. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  233. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  234. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  235. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  236. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  237. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  238. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  239. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  240. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  241. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  242. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  243. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  244. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  245. helm/benchmark/static_build/config.js +4 -0
  246. helm/benchmark/static_build/index.html +20 -0
  247. helm/benchmark/test_data_preprocessor.py +3 -3
  248. helm/benchmark/test_model_deployment_definition.py +14 -16
  249. helm/benchmark/test_run_expander.py +1 -1
  250. helm/benchmark/window_services/ai21_window_service.py +22 -33
  251. helm/benchmark/window_services/cohere_window_service.py +1 -63
  252. helm/benchmark/window_services/default_window_service.py +2 -44
  253. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  254. helm/benchmark/window_services/ice_window_service.py +0 -34
  255. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  256. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  257. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  258. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  259. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  260. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  261. helm/benchmark/window_services/local_window_service.py +21 -4
  262. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  263. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  264. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  265. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  266. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  267. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  268. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  269. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  270. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  271. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  272. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  273. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  274. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  275. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  276. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  277. helm/benchmark/window_services/test_utils.py +3 -2
  278. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  279. helm/benchmark/window_services/window_service.py +42 -0
  280. helm/benchmark/window_services/window_service_factory.py +4 -1
  281. helm/benchmark/window_services/yalm_window_service.py +0 -27
  282. helm/clients/__init__.py +0 -0
  283. helm/{proxy/clients → clients}/ai21_client.py +3 -9
  284. helm/clients/aleph_alpha_client.py +112 -0
  285. helm/{proxy/clients → clients}/anthropic_client.py +203 -18
  286. helm/{proxy/clients → clients}/auto_client.py +59 -31
  287. helm/clients/bedrock_client.py +128 -0
  288. helm/clients/bedrock_utils.py +72 -0
  289. helm/{proxy/clients → clients}/client.py +65 -7
  290. helm/clients/clip_score_client.py +49 -0
  291. helm/clients/clip_scorers/__init__.py +0 -0
  292. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  293. helm/clients/clip_scorers/clip_scorer.py +50 -0
  294. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  295. helm/{proxy/clients → clients}/cohere_client.py +4 -11
  296. helm/clients/gcs_client.py +82 -0
  297. helm/{proxy/clients → clients}/google_client.py +5 -5
  298. helm/clients/google_translate_client.py +35 -0
  299. helm/{proxy/clients → clients}/http_model_client.py +5 -7
  300. helm/{proxy/clients → clients}/huggingface_client.py +43 -64
  301. helm/clients/image_generation/__init__.py +0 -0
  302. helm/clients/image_generation/adobe_vision_client.py +78 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  304. helm/clients/image_generation/cogview2/__init__.py +0 -0
  305. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  306. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  307. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  308. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  309. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  310. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  311. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  312. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  313. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  314. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  315. helm/clients/image_generation/cogview2_client.py +191 -0
  316. helm/clients/image_generation/dalle2_client.py +192 -0
  317. helm/clients/image_generation/dalle3_client.py +108 -0
  318. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  319. helm/clients/image_generation/dalle_mini/data.py +442 -0
  320. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  321. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  322. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  323. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  324. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  325. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  326. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  327. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  328. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  329. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  330. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  331. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  332. helm/clients/image_generation/dalle_mini_client.py +190 -0
  333. helm/clients/image_generation/deep_floyd_client.py +78 -0
  334. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  335. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  336. helm/clients/image_generation/lexica_client.py +86 -0
  337. helm/clients/image_generation/mindalle/__init__.py +0 -0
  338. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  339. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  340. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  341. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  342. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  343. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  344. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  345. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  346. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  347. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  348. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  349. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  350. helm/clients/image_generation/mindalle_client.py +115 -0
  351. helm/clients/image_generation/nudity_check_client.py +64 -0
  352. helm/clients/image_generation/together_image_generation_client.py +111 -0
  353. helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
  354. helm/{proxy/clients → clients}/megatron_client.py +5 -5
  355. helm/clients/mistral_client.py +134 -0
  356. helm/clients/moderation_api_client.py +109 -0
  357. helm/clients/open_lm_client.py +43 -0
  358. helm/clients/openai_client.py +302 -0
  359. helm/{proxy/clients → clients}/palmyra_client.py +6 -8
  360. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  361. helm/clients/simple_client.py +64 -0
  362. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  363. helm/clients/test_client.py +100 -0
  364. helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
  365. helm/clients/test_simple_client.py +19 -0
  366. helm/{proxy/clients → clients}/test_together_client.py +20 -8
  367. helm/{proxy/clients → clients}/together_client.py +12 -72
  368. helm/clients/vertexai_client.py +391 -0
  369. helm/clients/vision_language/__init__.py +0 -0
  370. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  371. helm/{proxy/clients → clients}/vision_language/idefics_client.py +53 -48
  372. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  373. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  374. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  375. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  376. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  377. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  378. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  379. helm/clients/vision_language/open_flamingo_client.py +155 -0
  380. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  381. helm/clients/vllm_client.py +46 -0
  382. helm/common/cache.py +16 -4
  383. helm/common/cache_backend_config.py +47 -0
  384. helm/common/clip_score_request.py +41 -0
  385. helm/common/file_caches/__init__.py +0 -0
  386. helm/common/file_caches/file_cache.py +16 -0
  387. helm/common/file_caches/local_file_cache.py +61 -0
  388. helm/common/file_caches/test_local_file_cache.py +25 -0
  389. helm/common/file_upload_request.py +27 -0
  390. helm/common/general.py +1 -1
  391. helm/common/image_generation_parameters.py +25 -0
  392. helm/common/images_utils.py +24 -1
  393. helm/common/key_value_store.py +35 -4
  394. helm/common/media_object.py +13 -0
  395. helm/common/moderations_api_request.py +71 -0
  396. helm/common/mongo_key_value_store.py +3 -3
  397. helm/common/multimodal_request_utils.py +31 -0
  398. helm/common/nudity_check_request.py +29 -0
  399. helm/common/request.py +15 -17
  400. helm/common/test_general.py +6 -0
  401. helm/common/tokenization_request.py +1 -1
  402. helm/config/model_deployments.yaml +1069 -546
  403. helm/config/model_metadata.yaml +753 -31
  404. helm/config/tokenizer_configs.yaml +142 -43
  405. helm/proxy/accounts.py +31 -4
  406. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  407. helm/proxy/critique/model_critique_client.py +8 -6
  408. helm/proxy/example_queries.py +29 -17
  409. helm/proxy/server.py +70 -5
  410. helm/proxy/services/remote_service.py +31 -0
  411. helm/proxy/services/server_service.py +96 -16
  412. helm/proxy/services/service.py +30 -0
  413. helm/proxy/services/test_remote_service.py +4 -3
  414. helm/proxy/services/test_service.py +0 -12
  415. helm/proxy/test_accounts.py +32 -0
  416. helm/proxy/token_counters/auto_token_counter.py +37 -37
  417. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  418. helm/proxy/token_counters/token_counter.py +3 -5
  419. helm/tokenizers/__init__.py +0 -0
  420. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  421. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
  422. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
  423. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  424. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  425. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
  426. helm/tokenizers/simple_tokenizer.py +33 -0
  427. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
  428. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
  429. helm/tokenizers/test_simple_tokenizer.py +33 -0
  430. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  431. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  432. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  433. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  434. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  435. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  436. helm/benchmark/run_specs.py +0 -2762
  437. helm/benchmark/test_model_properties.py +0 -1570
  438. helm/benchmark/vlm_run_specs.py +0 -97
  439. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  440. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  441. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  442. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  443. helm/benchmark/window_services/t511b_window_service.py +0 -30
  444. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  445. helm/benchmark/window_services/ul2_window_service.py +0 -30
  446. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  447. helm/common/cache_utils.py +0 -14
  448. helm/proxy/clients/aleph_alpha_client.py +0 -95
  449. helm/proxy/clients/goose_ai_client.py +0 -99
  450. helm/proxy/clients/microsoft_client.py +0 -180
  451. helm/proxy/clients/openai_client.py +0 -206
  452. helm/proxy/clients/simple_client.py +0 -60
  453. helm/proxy/clients/test_client.py +0 -49
  454. helm/proxy/clients/vertexai_client.py +0 -115
  455. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  456. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  457. helm/proxy/token_counters/free_token_counter.py +0 -12
  458. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  459. helm/proxy/token_counters/openai_token_counter.py +0 -22
  460. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  461. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  462. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  463. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  464. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  465. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  466. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  467. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  468. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  469. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  470. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  471. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  472. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  473. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  474. /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
  475. /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
  476. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  477. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  478. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  479. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  480. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  481. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  482. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -0,0 +1,428 @@
1
+ ---
2
+ ############################################################
3
+ adapter:
4
+ - name: method
5
+ description: The high-level strategy for converting instances into a prompt for the language model.
6
+ values:
7
+ - name: generation
8
+ description: Given the input, the model generates the output free-form.
9
+ - name: multiple_choice_joint
10
+ description: Given the input, the model selects from multiple-choice options (A., B., C., D., E.).
11
+ - name: multiple_choice_separate_original
12
+ description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability.
13
+ - name: multiple_choice_separate_calibrated
14
+ description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability when calibrated by answer choice probability.
15
+ - name: language_modeling
16
+ description: Given the input, the model assigns the sequence a probability.
17
+ - name: instructions
18
+ description: The description of the task that is included at the very beginning of the prompt.
19
+ - name: global_prefix
20
+ description: The string that is prepended to the prompt.
21
+ - name: global_suffix
22
+ description: The string that is appended to the prompt.
23
+ - name: instance_prefix
24
+ description: The string that is included before each instance (e.g., '\n\n').
25
+ - name: input_prefix
26
+ description: The string that is included before each input (e.g., 'Question:').
27
+ - name: input_suffix
28
+ description: The string that is included after each input (e.g., '\n').
29
+ - name: reference_prefix
30
+ description: The string that is included before each reference (for multiple-choice questions).
31
+ - name: reference_suffix
32
+ description: The string that is included after each reference (for multiple-choice questions).
33
+ - name: output_prefix
34
+ description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
35
+ - name: output_suffix
36
+ description: The string that is included after the correct answer/predicted output (e.g., '\n').
37
+ - name: substitutions
38
+ description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
39
+ - name: max_train_instances
40
+ description: Maximum number of training instances to include in the prompt (currently by randomly sampling).
41
+ - name: max_eval_instances
42
+ description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
43
+ - name: num_outputs
44
+ description: Maximum number of possible outputs to generate by sampling multiple outputs.
45
+ - name: num_train_trials
46
+ description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
47
+ - name: sample_train
48
+ description: If true, randomly sample N training examples; if false, select N consecutive training examples
49
+ - name: model
50
+ description: Name of the language model (<creator_organization>/<model name>) to send requests to.
51
+ - name: model_deployment
52
+ description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
53
+ - name: temperature
54
+ description: Temperature parameter used in generation.
55
+ - name: max_tokens
56
+ description: Maximum number of tokens to generate.
57
+ - name: stop_sequences
58
+ description: List of sequences, where we stop generation if we encounter any of them.
59
+ - name: random
60
+ description: Random seed (string), which guarantees reproducibility.
61
+ - name: multi_label
62
+ description: If true, for instances with multiple correct reference, the gold answer should be considered to be all of the correct references rather than any of the correct references.
63
+
64
+ ############################################################
65
+ metrics:
66
+ # Infrastructure metrics:
67
+ - name: num_perplexity_tokens
68
+ display_name: '# tokens'
69
+ description: Average number of tokens in the predicted output (for language modeling, the input too).
70
+ - name: num_bytes
71
+ display_name: '# bytes'
72
+ description: Average number of bytes in the predicted output (for language modeling, the input too).
73
+ - name: num_references
74
+ display_name: '# ref'
75
+ description: Number of references.
76
+ - name: num_train_trials
77
+ display_name: '# trials'
78
+ description: Number of trials, where in each trial we choose an independent, random set of training instances.
79
+ - name: estimated_num_tokens_cost
80
+ display_name: 'cost'
81
+ description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
82
+ - name: num_prompt_tokens
83
+ display_name: '# prompt tokens'
84
+ description: Number of tokens in the prompt.
85
+ - name: num_prompt_characters
86
+ display_name: '# prompt chars'
87
+ description: Number of characters in the prompt.
88
+ - name: num_completion_tokens
89
+ display_name: '# completion tokens'
90
+ description: Actual number of completion tokens (over all completions).
91
+ - name: num_output_tokens
92
+ display_name: '# output tokens'
93
+ description: Actual number of output tokens.
94
+ - name: max_num_output_tokens
95
+ display_name: 'Max output tokens'
96
+ description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
97
+ - name: num_requests
98
+ display_name: '# requests'
99
+ description: Number of distinct API requests.
100
+ - name: num_instances
101
+ display_name: '# eval'
102
+ description: Number of evaluation instances.
103
+ - name: num_train_instances
104
+ display_name: '# train'
105
+ description: Number of training instances (e.g., in-context examples).
106
+ - name: prompt_truncated
107
+ display_name: truncated
108
+ description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
109
+ - name: finish_reason_length
110
+ display_name: finish b/c length
111
+ description: Fraction of instances where the the output was terminated because of the max tokens limit.
112
+ - name: finish_reason_stop
113
+ display_name: finish b/c stop
114
+ description: Fraction of instances where the the output was terminated because of the stop sequences.
115
+ - name: finish_reason_endoftext
116
+ display_name: finish b/c endoftext
117
+ description: Fraction of instances where the the output was terminated because the end of text token was generated.
118
+ - name: finish_reason_unknown
119
+ display_name: finish b/c unknown
120
+ description: Fraction of instances where the the output was terminated for unknown reasons.
121
+ - name: num_completions
122
+ display_name: '# completions'
123
+ description: Number of completions.
124
+ - name: predicted_index
125
+ display_name: Predicted index
126
+ description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
127
+
128
+ # Efficiency metrics:
129
+ - name: training_co2_cost
130
+ display_name: Estimated training emissions (kg CO2)
131
+ short_display_name: Training emissions (kg CO2)
132
+ lower_is_better: true
133
+ description: Estimate of the CO2 emissions from training the model.
134
+ - name: training_energy_cost
135
+ display_name: Estimated training energy cost (MWh)
136
+ short_display_name: Training energy (MWh)
137
+ lower_is_better: true
138
+ description: Estimate of the amount of energy used to train the model.
139
+ - name: inference_runtime
140
+ display_name: Observed inference runtime (s)
141
+ short_display_name: Observed inference time (s)
142
+ lower_is_better: true
143
+ description: Average observed time to process a request to the model (via an API, and thus depends on particular deployment).
144
+ - name: inference_idealized_runtime
145
+ display_name: Idealized inference runtime (s)
146
+ short_display_name: Idealized inference time (s)
147
+ lower_is_better: true
148
+ description: Average time to process a request to the model based solely on the model architecture (using Megatron-LM).
149
+ - name: inference_denoised_runtime
150
+ display_name: Denoised inference runtime (s)
151
+ short_display_name: Denoised inference time (s)
152
+ lower_is_better: true
153
+ description: Average time to process a request to the model minus performance contention by using profiled runtimes from multiple trials of SyntheticEfficiencyScenario.
154
+ - name: batch_size
155
+ display_name: Batch size
156
+ description: For batch jobs, how many requests are in a batch.
157
+
158
+ # Unitxt Metrics
159
+ - name: accuracy
160
+ display_name: accuracy
161
+ short_display_name: accuracy
162
+ description: accuracy
163
+ - name: accuracy_ci_high
164
+ display_name: accuracy_ci_high
165
+ short_display_name: accuracy_ci_high
166
+ description: accuracy_ci_high
167
+ - name: accuracy_ci_low
168
+ display_name: accuracy_ci_low
169
+ short_display_name: accuracy_ci_low
170
+ description: accuracy_ci_low
171
+ - name: f1_audio_volume_up
172
+ display_name: f1_audio_volume_up
173
+ short_display_name: f1_audio_volume_up
174
+ description: f1_audio_volume_up
175
+ - name: f1_calendar_remove
176
+ display_name: f1_calendar_remove
177
+ short_display_name: f1_calendar_remove
178
+ description: f1_calendar_remove
179
+ - name: f1_contradiction
180
+ display_name: f1_contradiction
181
+ short_display_name: f1_contradiction
182
+ description: f1_contradiction
183
+ - name: f1_cooking_recipe
184
+ display_name: f1_cooking_recipe
185
+ short_display_name: f1_cooking_recipe
186
+ description: f1_cooking_recipe
187
+ - name: f1_datetime_query
188
+ display_name: f1_datetime_query
189
+ short_display_name: f1_datetime_query
190
+ description: f1_datetime_query
191
+ - name: f1_entailment
192
+ display_name: f1_entailment
193
+ short_display_name: f1_entailment
194
+ description: f1_entailment
195
+ - name: f1_lists_remove
196
+ display_name: f1_lists_remove
197
+ short_display_name: f1_lists_remove
198
+ description: f1_lists_remove
199
+ - name: f1_macro
200
+ display_name: f1_macro
201
+ short_display_name: f1_macro
202
+ description: f1_macro
203
+ - name: f1_macro_ci_high
204
+ display_name: f1_macro_ci_high
205
+ short_display_name: f1_macro_ci_high
206
+ description: f1_macro_ci_high
207
+ - name: f1_macro_ci_low
208
+ display_name: f1_macro_ci_low
209
+ short_display_name: f1_macro_ci_low
210
+ description: f1_macro_ci_low
211
+ - name: f1_micro
212
+ display_name: f1_micro
213
+ short_display_name: f1_micro
214
+ description: f1_micro
215
+ - name: f1_micro_ci_high
216
+ display_name: f1_micro_ci_high
217
+ short_display_name: f1_micro_ci_high
218
+ description: f1_micro_ci_high
219
+ - name: f1_micro_ci_low
220
+ display_name: f1_micro_ci_low
221
+ short_display_name: f1_micro_ci_low
222
+ description: f1_micro_ci_low
223
+ - name: f1_music_query
224
+ display_name: f1_music_query
225
+ short_display_name: f1_music_query
226
+ description: f1_music_query
227
+ - name: f1_neutral
228
+ display_name: f1_neutral
229
+ short_display_name: f1_neutral
230
+ description: f1_neutral
231
+ - name: f1_news_query
232
+ display_name: f1_news_query
233
+ short_display_name: f1_news_query
234
+ description: f1_news_query
235
+ - name: f1_play_game
236
+ display_name: f1_play_game
237
+ short_display_name: f1_play_game
238
+ description: f1_play_game
239
+ - name: f1_play_music
240
+ display_name: f1_play_music
241
+ short_display_name: f1_play_music
242
+ description: f1_play_music
243
+ - name: fairness
244
+ display_name: fairness
245
+ short_display_name: fairness
246
+ description: fairness
247
+ - name: groups_mean_score
248
+ display_name: groups_mean_score
249
+ short_display_name: groups_mean_score
250
+ description: groups_mean_score
251
+ - name: max_prob
252
+ display_name: max_prob
253
+ short_display_name: max_prob
254
+ description: max_prob
255
+ - name: perplexity
256
+ display_name: perplexity
257
+ short_display_name: perplexity
258
+ description: perplexity
259
+ - name: robustness
260
+ display_name: robustness
261
+ short_display_name: robustness
262
+ description: robustness
263
+ - name: rouge1
264
+ display_name: rouge1
265
+ short_display_name: rouge1
266
+ description: rouge1
267
+ - name: rouge2
268
+ display_name: rouge2
269
+ short_display_name: rouge2
270
+ description: rouge2
271
+ - name: rougeL
272
+ display_name: rougeL
273
+ short_display_name: rougeL
274
+ description: rougeL
275
+ - name: rougeLsum
276
+ display_name: rougeLsum
277
+ short_display_name: rougeLsum
278
+ description: rougeLsum
279
+ - name: score_ci_high
280
+ display_name: score_ci_high
281
+ short_display_name: score_ci_high
282
+ description: score_ci_high
283
+ - name: score_ci_low
284
+ display_name: score_ci_low
285
+ short_display_name: score_ci_low
286
+ description: score_ci_low
287
+
288
+ perturbations: []
289
+
290
+ metric_groups:
291
+ - name: main_score
292
+ display_name: Main Score
293
+ metrics:
294
+ - name: ${main_name}
295
+ split: __all__
296
+
297
+ - name: efficiency
298
+ display_name: Efficiency
299
+ metrics:
300
+ - name: inference_runtime
301
+ split: ${main_split}
302
+
303
+ - name: general_information
304
+ display_name: General information
305
+ metrics:
306
+ - name: num_instances
307
+ split: ${main_split}
308
+ - name: num_train_instances
309
+ split: ${main_split}
310
+ - name: prompt_truncated
311
+ split: ${main_split}
312
+ - name: num_prompt_tokens
313
+ split: ${main_split}
314
+ - name: num_output_tokens
315
+ split: ${main_split}
316
+
317
+ - name: classification_metrics
318
+ display_name: Main Score
319
+ metrics:
320
+ # Not included because already in main_score
321
+ # - name: accuracy
322
+ # split: __all__
323
+ - name: f1_macro
324
+ split: __all__
325
+ - name: f1_micro
326
+ split: __all__
327
+
328
+ - name: summarization_metrics
329
+ display_name: Main Score
330
+ metrics:
331
+ - name: rouge1
332
+ split: __all__
333
+ - name: rouge2
334
+ split: __all__
335
+ # Not included because already in main_score
336
+ # - name: rougeL
337
+ # split: __all__
338
+ - name: rougeLsum
339
+ split: __all__
340
+
341
+ run_groups:
342
+ - name: spanish_scenarios
343
+ display_name: Spanish Scenarios
344
+ description: Spanish Scenarios
345
+ category: Multi-lingual Scenarios
346
+ subgroups:
347
+ - unitxt_cards.amazon_mass.es_ES
348
+ - unitxt_cards.xnli.es
349
+ - unitxt_cards.xlsum.spanish
350
+ - unitxt_cards.mlsum.es
351
+
352
+ - name: unitxt_cards.amazon_mass.es_ES
353
+ display_name: Amazon MASS
354
+ short_display_name: Amazon MASS
355
+ description: Amazon MASS
356
+ metric_groups:
357
+ - main_score
358
+ - classification_metrics
359
+ - efficiency
360
+ - general_information
361
+ environment:
362
+ # TODO: f1_macro instead?
363
+ main_name: accuracy
364
+ main_split: test
365
+ taxonomy:
366
+ task: "?"
367
+ what: "?"
368
+ who: "?"
369
+ when: "?"
370
+ language: Spanish
371
+
372
+ - name: unitxt_cards.xnli.es
373
+ display_name: XNLI
374
+ short_display_name: XNLI
375
+ description: XNLI
376
+ metric_groups:
377
+ - main_score
378
+ - classification_metrics
379
+ - efficiency
380
+ - general_information
381
+ environment:
382
+ # TODO: f1_macro instead?
383
+ main_name: accuracy
384
+ main_split: test
385
+ taxonomy:
386
+ task: "?"
387
+ what: "?"
388
+ who: "?"
389
+ when: "?"
390
+ language: Spanish
391
+
392
+ - name: unitxt_cards.xlsum.spanish
393
+ display_name: XL-Sum
394
+ short_display_name: XL-Sum
395
+ description: XL-Sum
396
+ metric_groups:
397
+ - main_score
398
+ - summarization_metrics
399
+ - efficiency
400
+ - general_information
401
+ environment:
402
+ main_name: rougeL
403
+ main_split: test
404
+ taxonomy:
405
+ task: "?"
406
+ what: "?"
407
+ who: "?"
408
+ when: "?"
409
+ language: Spanish
410
+
411
+ - name: unitxt_cards.mlsum.es
412
+ display_name: MLSUM
413
+ short_display_name: MLSUM
414
+ description: MLSUM
415
+ metric_groups:
416
+ - main_score
417
+ - summarization_metrics
418
+ - efficiency
419
+ - general_information
420
+ environment:
421
+ main_name: rougeL
422
+ main_split: test
423
+ taxonomy:
424
+ task: "?"
425
+ what: "?"
426
+ who: "?"
427
+ when: "?"
428
+ language: Spanish