crfm-helm 0.4.0__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic; consult the package registry's advisory page for details before installing.

Files changed (1033):
  1. crfm_helm-0.5.10.dist-info/METADATA +369 -0
  2. crfm_helm-0.5.10.dist-info/RECORD +1008 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +80 -29
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +39 -28
  7. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  8. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  9. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  10. helm/benchmark/adaptation/adapters/generation_adapter.py +2 -1
  11. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +24 -8
  12. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +3 -4
  13. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  14. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +2 -1
  15. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  16. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +112 -0
  17. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +6 -3
  18. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +3 -1
  19. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  20. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +18 -8
  21. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  22. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  23. helm/benchmark/adaptation/adapters/test_adapter.py +5 -4
  24. helm/benchmark/adaptation/adapters/test_generation_adapter.py +46 -22
  25. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +17 -29
  26. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +138 -16
  27. helm/benchmark/adaptation/common_adapter_specs.py +443 -0
  28. helm/benchmark/adaptation/prompt.py +1 -1
  29. helm/benchmark/adaptation/request_state.py +6 -1
  30. helm/benchmark/adaptation/scenario_state.py +6 -2
  31. helm/benchmark/annotation/aci_bench_annotator.py +84 -0
  32. helm/benchmark/annotation/air_bench_annotator.py +79 -0
  33. helm/benchmark/annotation/alrage_annotator.py +90 -0
  34. helm/benchmark/annotation/annotator.py +48 -0
  35. helm/benchmark/annotation/annotator_factory.py +50 -0
  36. helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
  37. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  38. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  39. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  40. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  41. helm/benchmark/annotation/call_center_annotator.py +258 -0
  42. helm/benchmark/annotation/chw_care_plan_annotator.py +82 -0
  43. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  44. helm/benchmark/annotation/dischargeme_annotator.py +96 -0
  45. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  46. helm/benchmark/annotation/financebench_annotator.py +79 -0
  47. helm/benchmark/annotation/harm_bench_annotator.py +55 -0
  48. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  49. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +93 -0
  50. helm/benchmark/annotation/image2struct/latex_compiler_annotator.py +59 -0
  51. helm/benchmark/annotation/image2struct/lilypond_compiler_annotator.py +86 -0
  52. helm/benchmark/annotation/image2struct/webpage_compiler_annotator.py +132 -0
  53. helm/benchmark/annotation/live_qa_annotator.py +76 -0
  54. helm/benchmark/annotation/med_dialog_annotator.py +88 -0
  55. helm/benchmark/annotation/medalign_annotator.py +89 -0
  56. helm/benchmark/annotation/medi_qa_annotator.py +87 -0
  57. helm/benchmark/annotation/medication_qa_annotator.py +86 -0
  58. helm/benchmark/annotation/mental_health_annotator.py +87 -0
  59. helm/benchmark/annotation/mimic_bhc_annotator.py +89 -0
  60. helm/benchmark/annotation/mimic_rrs_annotator.py +89 -0
  61. helm/benchmark/annotation/model_as_judge.py +309 -0
  62. helm/benchmark/annotation/mtsamples_procedures_annotator.py +87 -0
  63. helm/benchmark/annotation/mtsamples_replicate_annotator.py +90 -0
  64. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  65. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  66. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  67. helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
  68. helm/benchmark/annotation/spider_annotator.py +18 -0
  69. helm/benchmark/annotation/starr_patient_instructions_annotator.py +87 -0
  70. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  71. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  72. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  73. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  74. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  75. helm/benchmark/annotation/xstest_annotator.py +100 -0
  76. helm/benchmark/annotation_executor.py +144 -0
  77. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  78. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  79. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  80. helm/benchmark/augmentations/data_augmenter.py +0 -2
  81. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  82. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  83. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  84. helm/benchmark/augmentations/gender_perturbation.py +3 -3
  85. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  86. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  87. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  88. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  89. helm/benchmark/augmentations/perturbation.py +26 -4
  90. helm/benchmark/augmentations/perturbation_description.py +1 -1
  91. helm/benchmark/augmentations/space_perturbation.py +2 -2
  92. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  93. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  94. helm/benchmark/augmentations/test_perturbation.py +56 -19
  95. helm/benchmark/augmentations/translate_perturbation.py +31 -0
  96. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  97. helm/benchmark/config_registry.py +7 -1
  98. helm/benchmark/data_preprocessor.py +2 -2
  99. helm/benchmark/executor.py +54 -25
  100. helm/benchmark/huggingface_registration.py +28 -10
  101. helm/benchmark/metrics/air_bench_metrics.py +3212 -0
  102. helm/benchmark/metrics/alrage_metric.py +35 -0
  103. helm/benchmark/metrics/annotation_metrics.py +108 -0
  104. helm/benchmark/metrics/basic_metrics.py +437 -667
  105. helm/benchmark/metrics/bbq_metrics.py +17 -6
  106. helm/benchmark/metrics/bias_metrics.py +18 -9
  107. helm/benchmark/metrics/bias_word_lists.py +1 -1
  108. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  109. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  110. helm/benchmark/metrics/classification_metrics.py +107 -22
  111. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  112. helm/benchmark/metrics/cleva_harms_metrics.py +12 -11
  113. helm/benchmark/metrics/code_metrics.py +5 -5
  114. helm/benchmark/metrics/code_metrics_helper.py +11 -3
  115. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  116. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  117. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  118. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  119. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  120. helm/benchmark/metrics/comet_metric.py +125 -0
  121. helm/benchmark/metrics/common_metric_specs.py +174 -0
  122. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +83 -0
  123. helm/benchmark/metrics/copyright_metrics.py +5 -5
  124. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  125. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  126. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  127. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  128. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  129. helm/benchmark/metrics/disinformation_metrics.py +8 -114
  130. helm/benchmark/metrics/dry_run_metrics.py +35 -6
  131. helm/benchmark/metrics/efficiency_metrics.py +287 -0
  132. helm/benchmark/metrics/ehr_sql_metrics.py +159 -0
  133. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  134. helm/benchmark/metrics/evaluate_reference_metrics.py +831 -0
  135. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  136. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  137. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +115 -0
  138. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  139. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  140. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  141. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +48 -0
  142. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  143. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  144. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  145. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  146. helm/benchmark/metrics/ifeval_metrics.py +67 -0
  147. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  148. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  149. helm/benchmark/metrics/image_generation/clip_score_metrics.py +84 -0
  150. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  151. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  152. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  153. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  154. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  155. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  156. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  157. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  158. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  159. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  160. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  161. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  162. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  163. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  164. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  165. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  166. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  167. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  168. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  169. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  170. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  171. helm/benchmark/metrics/image_generation/q16/test_q16.py +20 -0
  172. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  173. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  174. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  175. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  176. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  177. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  178. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  179. helm/benchmark/metrics/instruction_following_critique_metrics.py +48 -5
  180. helm/benchmark/metrics/kpi_edgar_metrics.py +142 -0
  181. helm/benchmark/metrics/language_modeling_metrics.py +111 -0
  182. helm/benchmark/metrics/live_qa_metrics.py +35 -0
  183. helm/benchmark/metrics/llm_jury_metrics.py +58 -0
  184. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  185. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  186. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  187. helm/benchmark/metrics/medcalc_bench_metrics.py +137 -0
  188. helm/benchmark/metrics/medec_metrics.py +124 -0
  189. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  190. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  191. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  192. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  193. helm/benchmark/metrics/metric.py +121 -175
  194. helm/benchmark/metrics/metric_name.py +0 -1
  195. helm/benchmark/metrics/metric_service.py +23 -7
  196. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +127 -0
  197. helm/benchmark/metrics/nltk_helper.py +32 -0
  198. helm/benchmark/metrics/omni_math_metrics.py +44 -0
  199. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  200. helm/benchmark/metrics/output_processing_metric.py +60 -0
  201. helm/benchmark/metrics/output_processors.py +15 -0
  202. helm/benchmark/metrics/paraphrase_generation_metrics.py +5 -6
  203. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  204. helm/benchmark/metrics/ranking_metrics.py +5 -5
  205. helm/benchmark/metrics/reference_metric.py +148 -0
  206. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  207. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  208. helm/benchmark/metrics/safety_metrics.py +91 -0
  209. helm/benchmark/metrics/seahelm_metrics.py +201 -0
  210. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  211. helm/benchmark/metrics/spider_metrics.py +7 -0
  212. helm/benchmark/metrics/statistic.py +1 -1
  213. helm/benchmark/metrics/summac/model_summac.py +8 -11
  214. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  215. helm/benchmark/metrics/summarization_metrics.py +150 -11
  216. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  217. helm/benchmark/metrics/test_classification_metrics.py +145 -70
  218. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  219. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +20 -1
  220. helm/benchmark/metrics/test_metric.py +3 -3
  221. helm/benchmark/metrics/test_statistic.py +2 -2
  222. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  223. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  224. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  225. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  226. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +11 -3
  227. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  228. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +3 -3
  229. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +7 -7
  230. helm/benchmark/metrics/toxicity_metrics.py +37 -7
  231. helm/benchmark/metrics/toxicity_utils.py +23 -0
  232. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  233. helm/benchmark/metrics/unitxt_metrics.py +107 -0
  234. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  235. helm/benchmark/metrics/vision_language/emd_utils.py +347 -0
  236. helm/benchmark/metrics/vision_language/image_metrics.py +537 -0
  237. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  238. helm/benchmark/metrics/wildbench_metrics.py +54 -0
  239. helm/benchmark/model_deployment_registry.py +69 -5
  240. helm/benchmark/model_metadata_registry.py +58 -2
  241. helm/benchmark/multi_gpu_runner.py +133 -0
  242. helm/benchmark/presentation/contamination.py +3 -3
  243. helm/benchmark/presentation/create_plots.py +51 -20
  244. helm/benchmark/presentation/run_display.py +51 -12
  245. helm/benchmark/presentation/run_entry.py +2 -2
  246. helm/benchmark/presentation/schema.py +83 -66
  247. helm/benchmark/presentation/summarize.py +483 -388
  248. helm/benchmark/presentation/table.py +8 -8
  249. helm/benchmark/presentation/taxonomy_info.py +20 -0
  250. helm/benchmark/presentation/test_contamination.py +2 -2
  251. helm/benchmark/presentation/test_create_plots.py +4 -1
  252. helm/benchmark/presentation/test_run_entry.py +2 -2
  253. helm/benchmark/presentation/test_schema.py +11 -0
  254. helm/benchmark/presentation/test_summarize.py +148 -6
  255. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  256. helm/benchmark/reeval_run.py +202 -0
  257. helm/benchmark/reeval_runner.py +355 -0
  258. helm/benchmark/run.py +151 -87
  259. helm/benchmark/run_expander.py +418 -33
  260. helm/benchmark/run_spec.py +93 -0
  261. helm/benchmark/run_spec_factory.py +180 -0
  262. helm/benchmark/run_specs/__init__.py +0 -0
  263. helm/benchmark/run_specs/air_bench_run_specs.py +58 -0
  264. helm/benchmark/run_specs/arabic_run_specs.py +197 -0
  265. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  266. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  267. helm/benchmark/run_specs/call_center_run_specs.py +201 -0
  268. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  269. helm/benchmark/run_specs/classic_run_specs.py +1393 -0
  270. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  271. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  272. helm/benchmark/run_specs/decodingtrust_run_specs.py +316 -0
  273. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  274. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  275. helm/benchmark/run_specs/experimental_run_specs.py +224 -0
  276. helm/benchmark/run_specs/finance_run_specs.py +114 -0
  277. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  278. helm/benchmark/run_specs/heim_run_specs.py +625 -0
  279. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  280. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  281. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  282. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  283. helm/benchmark/run_specs/long_context_run_specs.py +188 -0
  284. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  285. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  286. helm/benchmark/run_specs/medhelm_run_specs.py +1570 -0
  287. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  288. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  289. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  290. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  291. helm/benchmark/run_specs/safety_run_specs.py +191 -0
  292. helm/benchmark/run_specs/seahelm_run_specs.py +652 -0
  293. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  294. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +167 -0
  295. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  296. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  297. helm/benchmark/run_specs/unitxt_run_specs.py +51 -0
  298. helm/benchmark/run_specs/vlm_run_specs.py +1057 -0
  299. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  300. helm/benchmark/runner.py +63 -62
  301. helm/benchmark/runner_config_registry.py +21 -0
  302. helm/benchmark/scenarios/aci_bench_scenario.py +149 -0
  303. helm/benchmark/scenarios/air_bench_scenario.py +76 -0
  304. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  305. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  306. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +27 -3
  307. helm/benchmark/scenarios/anthropic_red_team_scenario.py +82 -0
  308. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  309. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  310. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  311. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  312. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  313. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  314. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  315. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  316. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  317. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  318. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  319. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  320. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  321. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  322. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  323. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  324. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  325. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  326. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  327. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  328. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  329. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  330. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  331. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  332. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  333. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
  334. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
  335. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +79 -0
  336. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  337. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +78 -0
  338. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  339. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  340. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  341. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  342. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  343. helm/benchmark/scenarios/babi_qa_scenario.py +16 -1
  344. helm/benchmark/scenarios/banking77_scenario.py +77 -0
  345. helm/benchmark/scenarios/bbq_scenario.py +17 -2
  346. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  347. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  348. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  349. helm/benchmark/scenarios/bird_sql_scenario.py +112 -0
  350. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  351. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  352. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  353. helm/benchmark/scenarios/bold_scenario.py +18 -3
  354. helm/benchmark/scenarios/boolq_scenario.py +21 -1
  355. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  356. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  357. helm/benchmark/scenarios/chw_care_plan_scenario.py +129 -0
  358. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  359. helm/benchmark/scenarios/civil_comments_scenario.py +14 -1
  360. helm/benchmark/scenarios/clear_scenario.py +180 -0
  361. helm/benchmark/scenarios/cleva_scenario.py +482 -3
  362. helm/benchmark/scenarios/code_scenario.py +46 -4
  363. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  364. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  365. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  366. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  367. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  368. helm/benchmark/scenarios/commonsense_scenario.py +33 -1
  369. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  370. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +118 -0
  371. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  372. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  373. helm/benchmark/scenarios/cti_to_mitre_scenario.py +261 -0
  374. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  375. helm/benchmark/scenarios/czech_bank_qa_scenario.py +148 -0
  376. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +190 -0
  377. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +143 -0
  378. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +98 -0
  379. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +344 -0
  380. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +217 -0
  381. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +571 -0
  382. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +80 -0
  383. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +90 -0
  384. helm/benchmark/scenarios/dialogue_scenarios.py +13 -3
  385. helm/benchmark/scenarios/dischargeme_scenario.py +196 -0
  386. helm/benchmark/scenarios/disinformation_scenario.py +32 -1
  387. helm/benchmark/scenarios/dyck_language_scenario.py +25 -1
  388. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  389. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  390. helm/benchmark/scenarios/ehrshot_scenario.py +1541 -0
  391. helm/benchmark/scenarios/enem_challenge_scenario.py +77 -0
  392. helm/benchmark/scenarios/entity_data_imputation_scenario.py +33 -3
  393. helm/benchmark/scenarios/entity_matching_scenario.py +26 -2
  394. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  395. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  396. helm/benchmark/scenarios/fin_qa_scenario.py +139 -0
  397. helm/benchmark/scenarios/financebench_scenario.py +74 -0
  398. helm/benchmark/scenarios/financial_phrasebank_scenario.py +115 -0
  399. helm/benchmark/scenarios/gold_commodity_news_scenario.py +145 -0
  400. helm/benchmark/scenarios/gpqa_scenario.py +98 -0
  401. helm/benchmark/scenarios/grammar.py +2 -2
  402. helm/benchmark/scenarios/grammar_scenario.py +21 -2
  403. helm/benchmark/scenarios/gsm_scenario.py +31 -1
  404. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +61 -0
  405. helm/benchmark/scenarios/harm_bench_scenario.py +70 -0
  406. helm/benchmark/scenarios/headqa_scenario.py +158 -0
  407. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  408. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +50 -0
  409. helm/benchmark/scenarios/ice_scenario.py +28 -4
  410. helm/benchmark/scenarios/ifeval_scenario.py +71 -0
  411. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  412. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  413. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  414. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  415. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  416. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  417. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  418. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  419. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  420. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  421. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  422. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  423. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  424. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  425. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  426. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  427. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  428. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  429. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  430. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  431. helm/benchmark/scenarios/imdb_scenario.py +26 -3
  432. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
  433. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  434. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +98 -0
  435. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  436. helm/benchmark/scenarios/koala_scenario.py +21 -1
  437. helm/benchmark/scenarios/kpi_edgar_scenario.py +172 -0
  438. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +149 -0
  439. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  440. helm/benchmark/scenarios/legal_summarization_scenario.py +61 -1
  441. helm/benchmark/scenarios/legal_support_scenario.py +24 -1
  442. helm/benchmark/scenarios/legalbench_scenario.py +45 -3
  443. helm/benchmark/scenarios/lex_glue_scenario.py +23 -2
  444. helm/benchmark/scenarios/lextreme_scenario.py +22 -1
  445. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  446. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  447. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  448. helm/benchmark/scenarios/lsat_qa_scenario.py +15 -1
  449. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  450. helm/benchmark/scenarios/math_scenario.py +81 -22
  451. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  452. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  453. helm/benchmark/scenarios/med_dialog_scenario.py +56 -22
  454. helm/benchmark/scenarios/med_mcqa_scenario.py +24 -1
  455. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  456. helm/benchmark/scenarios/med_qa_scenario.py +30 -1
  457. helm/benchmark/scenarios/medalign_scenario.py +117 -0
  458. helm/benchmark/scenarios/medalign_scenario_helper.py +326 -0
  459. helm/benchmark/scenarios/medbullets_scenario.py +167 -0
  460. helm/benchmark/scenarios/medcalc_bench_scenario.py +149 -0
  461. helm/benchmark/scenarios/medec_scenario.py +148 -0
  462. helm/benchmark/scenarios/medhallu_scenario.py +95 -0
  463. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  464. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  465. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  466. helm/benchmark/scenarios/medi_qa_scenario.py +134 -0
  467. helm/benchmark/scenarios/medication_qa_scenario.py +96 -0
  468. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  469. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  470. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  471. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  472. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  473. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  474. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  475. helm/benchmark/scenarios/mental_health_scenario.py +146 -0
  476. helm/benchmark/scenarios/mimic_bhc_scenario.py +127 -0
  477. helm/benchmark/scenarios/mimic_rrs_scenario.py +121 -0
  478. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +99 -0
  479. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  480. helm/benchmark/scenarios/mmlu_pro_scenario.py +113 -0
  481. helm/benchmark/scenarios/mmlu_scenario.py +32 -1
  482. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  483. helm/benchmark/scenarios/msmarco_scenario.py +31 -1
  484. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +166 -0
  485. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +164 -0
  486. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +297 -0
  487. helm/benchmark/scenarios/narrativeqa_scenario.py +20 -1
  488. helm/benchmark/scenarios/natural_qa_scenario.py +33 -1
  489. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  490. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  491. helm/benchmark/scenarios/omni_math_scenario.py +71 -0
  492. helm/benchmark/scenarios/open_assistant_scenario.py +33 -2
  493. helm/benchmark/scenarios/openai_mrcr_scenario.py +94 -0
  494. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -5
  495. helm/benchmark/scenarios/pubmed_qa_scenario.py +81 -43
  496. helm/benchmark/scenarios/quac_scenario.py +24 -1
  497. helm/benchmark/scenarios/race_based_med_scenario.py +175 -0
  498. helm/benchmark/scenarios/raft_scenario.py +33 -3
  499. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  500. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  501. helm/benchmark/scenarios/ruler_qa_scenarios.py +128 -0
  502. helm/benchmark/scenarios/scenario.py +44 -1
  503. helm/benchmark/scenarios/seahelm_scenario.py +2295 -0
  504. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  505. helm/benchmark/scenarios/shc_bmt_scenario.py +97 -0
  506. helm/benchmark/scenarios/shc_cdi_scenario.py +95 -0
  507. helm/benchmark/scenarios/shc_conf_scenario.py +99 -0
  508. helm/benchmark/scenarios/shc_ent_scenario.py +98 -0
  509. helm/benchmark/scenarios/shc_gip_scenario.py +94 -0
  510. helm/benchmark/scenarios/shc_privacy_scenario.py +100 -0
  511. helm/benchmark/scenarios/shc_proxy_scenario.py +98 -0
  512. helm/benchmark/scenarios/shc_ptbm_scenario.py +104 -0
  513. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  514. helm/benchmark/scenarios/shc_sequoia_scenario.py +98 -0
  515. helm/benchmark/scenarios/simple_safety_tests_scenario.py +44 -0
  516. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  517. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  518. helm/benchmark/scenarios/spider_scenario.py +109 -0
  519. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +119 -0
  520. helm/benchmark/scenarios/summarization_scenario.py +48 -1
  521. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  522. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  523. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +24 -1
  524. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  525. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  526. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  527. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  528. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  529. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  530. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  531. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  532. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  533. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  534. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  535. helm/benchmark/scenarios/test_ewok_scenario.py +29 -0
  536. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  537. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  538. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  539. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  540. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  541. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  542. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  543. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  544. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  545. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  546. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  547. helm/benchmark/scenarios/test_math_scenario.py +4 -3
  548. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  549. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  550. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  551. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  552. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  553. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  554. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  555. helm/benchmark/scenarios/test_scenario.py +6 -3
  556. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  557. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  558. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  559. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  560. helm/benchmark/scenarios/thai_exam_scenario.py +239 -0
  561. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  562. helm/benchmark/scenarios/truthful_qa_scenario.py +26 -2
  563. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  564. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  565. helm/benchmark/scenarios/unitxt_scenario.py +62 -0
  566. helm/benchmark/scenarios/verifiability_judgment_scenario.py +4 -2
  567. helm/benchmark/scenarios/vicuna_scenario.py +22 -2
  568. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  569. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  570. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  571. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +135 -0
  572. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  573. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  574. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  575. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  576. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
  577. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  578. helm/benchmark/scenarios/vision_language/image2struct/__init__.py +0 -0
  579. helm/benchmark/scenarios/vision_language/image2struct/chart2csv_scenario.py +55 -0
  580. helm/benchmark/scenarios/vision_language/image2struct/image2struct_scenario.py +225 -0
  581. helm/benchmark/scenarios/vision_language/image2struct/latex_scenario.py +21 -0
  582. helm/benchmark/scenarios/vision_language/image2struct/musicsheet_scenario.py +16 -0
  583. helm/benchmark/scenarios/vision_language/image2struct/utils_latex.py +339 -0
  584. helm/benchmark/scenarios/vision_language/image2struct/webpage/__init__.py +0 -0
  585. helm/benchmark/scenarios/vision_language/image2struct/webpage/driver.py +84 -0
  586. helm/benchmark/scenarios/vision_language/image2struct/webpage/jekyll_server.py +182 -0
  587. helm/benchmark/scenarios/vision_language/image2struct/webpage/utils.py +31 -0
  588. helm/benchmark/scenarios/vision_language/image2struct/webpage_scenario.py +256 -0
  589. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  590. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  591. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  592. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  593. helm/benchmark/scenarios/vision_language/mme_scenario.py +148 -0
  594. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  595. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  596. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  597. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  598. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  599. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  600. helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
  601. helm/benchmark/scenarios/vision_language/pope_scenario.py +105 -0
  602. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  603. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +131 -0
  604. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  605. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
  606. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +4 -5
  607. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  608. helm/benchmark/scenarios/vision_language/vqa_scenario.py +8 -4
  609. helm/benchmark/scenarios/wikifact_scenario.py +31 -1
  610. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  611. helm/benchmark/scenarios/wildbench_scenario.py +101 -0
  612. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  613. helm/benchmark/scenarios/wmt_14_scenario.py +33 -2
  614. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  615. helm/benchmark/server.py +32 -2
  616. helm/benchmark/slurm_jobs.py +1 -2
  617. helm/benchmark/slurm_runner.py +78 -50
  618. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  619. helm/benchmark/static/schema_arabic.yaml +271 -0
  620. helm/benchmark/static/schema_audio.yaml +763 -0
  621. helm/benchmark/static/schema_autobencher.yaml +150 -0
  622. helm/benchmark/static/schema_call_center.yaml +269 -0
  623. helm/benchmark/static/schema_capabilities.yaml +254 -0
  624. helm/benchmark/static/schema_classic.yaml +259 -1140
  625. helm/benchmark/static/schema_cleva.yaml +768 -0
  626. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  627. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  628. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  629. helm/benchmark/static/schema_enterprise.yaml +319 -0
  630. helm/benchmark/static/schema_ewok.yaml +367 -0
  631. helm/benchmark/static/schema_finance.yaml +191 -0
  632. helm/benchmark/static/schema_heim.yaml +1389 -0
  633. helm/benchmark/static/schema_image2struct.yaml +588 -0
  634. helm/benchmark/static/schema_instruction_following.yaml +161 -0
  635. helm/benchmark/static/schema_legal.yaml +566 -0
  636. helm/benchmark/static/schema_lite.yaml +3 -286
  637. helm/benchmark/static/schema_long_context.yaml +282 -0
  638. helm/benchmark/static/schema_medhelm.yaml +1176 -0
  639. helm/benchmark/static/schema_melt.yaml +1257 -0
  640. helm/benchmark/static/schema_mmlu.yaml +1449 -0
  641. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  642. helm/benchmark/static/schema_safety.yaml +283 -0
  643. helm/benchmark/static/schema_seahelm.yaml +723 -0
  644. helm/benchmark/static/schema_slp.yaml +219 -0
  645. helm/benchmark/static/schema_slphelm.yaml +162 -0
  646. helm/benchmark/static/schema_social_audio.yaml +224 -0
  647. helm/benchmark/static/schema_sql.yaml +171 -0
  648. helm/benchmark/static/schema_thai.yaml +244 -0
  649. helm/benchmark/static/schema_torr.yaml +474 -0
  650. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  651. helm/benchmark/static/schema_unitxt.yaml +370 -0
  652. helm/benchmark/static/schema_vhelm.yaml +933 -0
  653. helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
  654. helm/benchmark/static/schema_video.yaml +219 -0
  655. helm/benchmark/static_build/assets/air-overview-DpBbyagA.png +0 -0
  656. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  657. helm/benchmark/static_build/assets/heim-logo-BJtQlEbV.png +0 -0
  658. helm/benchmark/static_build/assets/helm-safety-COfndXuS.png +0 -0
  659. helm/benchmark/static_build/assets/helmhero-D9TvmJsp.png +0 -0
  660. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  661. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  662. helm/benchmark/static_build/assets/medhelm-overview-CND0EIsy.png +0 -0
  663. helm/benchmark/static_build/assets/medhelm-v1-overview-Cu2tphBB.png +0 -0
  664. helm/benchmark/static_build/assets/overview-BwypNWnk.png +0 -0
  665. helm/benchmark/static_build/assets/process-flow-DWDJC733.png +0 -0
  666. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  667. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  668. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  669. helm/benchmark/static_build/assets/vhelm-aspects-NiDQofvP.png +0 -0
  670. helm/benchmark/static_build/assets/vhelm-framework-NxJE4fdA.png +0 -0
  671. helm/benchmark/static_build/assets/vhelm-model-ypCL5Yvq.png +0 -0
  672. helm/benchmark/static_build/config.js +4 -0
  673. helm/benchmark/static_build/index.html +19 -0
  674. helm/benchmark/test_data_preprocessor.py +3 -3
  675. helm/benchmark/test_run_expander.py +1 -1
  676. helm/benchmark/window_services/default_window_service.py +3 -45
  677. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -15
  678. helm/benchmark/window_services/ice_window_service.py +1 -35
  679. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  680. helm/benchmark/window_services/image_generation/clip_window_service.py +13 -0
  681. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  682. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  683. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  684. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  685. helm/benchmark/window_services/local_window_service.py +22 -5
  686. helm/benchmark/window_services/test_anthropic_window_service.py +5 -4
  687. helm/benchmark/window_services/test_bloom_window_service.py +5 -4
  688. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  689. helm/benchmark/window_services/test_gpt2_window_service.py +9 -4
  690. helm/benchmark/window_services/test_gpt4_window_service.py +10 -4
  691. helm/benchmark/window_services/test_gptj_window_service.py +11 -5
  692. helm/benchmark/window_services/test_gptneox_window_service.py +6 -5
  693. helm/benchmark/window_services/test_openai_window_service.py +18 -12
  694. helm/benchmark/window_services/test_opt_window_service.py +6 -5
  695. helm/benchmark/window_services/test_palmyra_window_service.py +5 -4
  696. helm/benchmark/window_services/test_t0pp_window_service.py +5 -4
  697. helm/benchmark/window_services/test_t511b_window_service.py +5 -4
  698. helm/benchmark/window_services/test_ul2_window_service.py +5 -4
  699. helm/benchmark/window_services/test_utils.py +6 -6
  700. helm/benchmark/window_services/test_yalm_window_service.py +5 -4
  701. helm/benchmark/window_services/tokenizer_service.py +7 -13
  702. helm/benchmark/window_services/window_service.py +42 -0
  703. helm/benchmark/window_services/window_service_factory.py +4 -1
  704. helm/benchmark/window_services/yalm_window_service.py +1 -28
  705. helm/clients/__init__.py +0 -0
  706. helm/{proxy/clients → clients}/ai21_client.py +78 -12
  707. helm/clients/aleph_alpha_client.py +114 -0
  708. helm/{proxy/clients → clients}/anthropic_client.py +304 -21
  709. helm/clients/audio_language/__init__.py +0 -0
  710. helm/clients/audio_language/diva_llama_client.py +122 -0
  711. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  712. helm/clients/audio_language/llama_omni/constants.py +9 -0
  713. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  714. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  715. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  716. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  717. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  718. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  719. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  720. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  721. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  722. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  723. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  724. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  725. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  726. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  727. helm/clients/audio_language/llama_omni/utils.py +202 -0
  728. helm/clients/audio_language/llama_omni_client.py +199 -0
  729. helm/clients/audio_language/qwen2_5_omni_client.py +210 -0
  730. helm/clients/audio_language/qwen2_audiolm_client.py +191 -0
  731. helm/clients/audio_language/qwen_audiolm_client.py +153 -0
  732. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  733. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  734. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  735. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  736. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  737. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  738. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  739. helm/clients/audio_language/test.py +62 -0
  740. helm/{proxy/clients → clients}/auto_client.py +72 -31
  741. helm/clients/azure_openai_client.py +55 -0
  742. helm/clients/bedrock_client.py +381 -0
  743. helm/clients/bedrock_utils.py +105 -0
  744. helm/{proxy/clients → clients}/client.py +92 -17
  745. helm/clients/clip_score_client.py +49 -0
  746. helm/clients/clip_scorers/__init__.py +0 -0
  747. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  748. helm/clients/clip_scorers/clip_scorer.py +50 -0
  749. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  750. helm/{proxy/clients → clients}/cohere_client.py +105 -14
  751. helm/clients/dspy_client.py +135 -0
  752. helm/clients/gcs_client.py +82 -0
  753. helm/{proxy/clients → clients}/google_client.py +8 -6
  754. helm/clients/google_translate_client.py +35 -0
  755. helm/clients/grok_client.py +36 -0
  756. helm/{proxy/clients → clients}/http_model_client.py +8 -8
  757. helm/{proxy/clients → clients}/huggingface_client.py +157 -86
  758. helm/clients/huggingface_pipeline_client.py +138 -0
  759. helm/clients/ibm_client.py +269 -0
  760. helm/clients/image_generation/__init__.py +0 -0
  761. helm/clients/image_generation/adobe_vision_client.py +80 -0
  762. helm/clients/image_generation/aleph_alpha_image_generation_client.py +100 -0
  763. helm/clients/image_generation/cogview2/__init__.py +0 -0
  764. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  765. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  766. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  767. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +99 -0
  768. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  769. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  770. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +144 -0
  771. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  772. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  773. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  774. helm/clients/image_generation/cogview2_client.py +192 -0
  775. helm/clients/image_generation/dalle2_client.py +194 -0
  776. helm/clients/image_generation/dalle3_client.py +108 -0
  777. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  778. helm/clients/image_generation/dalle_mini/data.py +442 -0
  779. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  780. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  781. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  782. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  783. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  784. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  785. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  786. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  787. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  788. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  789. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  790. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  791. helm/clients/image_generation/dalle_mini_client.py +191 -0
  792. helm/clients/image_generation/deep_floyd_client.py +80 -0
  793. helm/clients/image_generation/huggingface_diffusers_client.py +250 -0
  794. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  795. helm/clients/image_generation/lexica_client.py +88 -0
  796. helm/clients/image_generation/mindalle/__init__.py +0 -0
  797. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  798. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  799. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  800. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  801. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  802. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  803. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  804. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  805. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  806. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  807. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  808. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  809. helm/clients/image_generation/mindalle_client.py +116 -0
  810. helm/clients/image_generation/nudity_check_client.py +64 -0
  811. helm/clients/image_generation/together_image_generation_client.py +113 -0
  812. helm/{proxy/clients → clients}/lit_gpt_client.py +6 -6
  813. helm/{proxy/clients → clients}/megatron_client.py +7 -5
  814. helm/clients/mistral_client.py +180 -0
  815. helm/clients/moderation_api_client.py +111 -0
  816. helm/clients/nvidia_nim_client.py +32 -0
  817. helm/clients/open_lm_client.py +43 -0
  818. helm/clients/openai_client.py +604 -0
  819. helm/clients/openai_responses_client.py +200 -0
  820. helm/clients/openrouter_client.py +31 -0
  821. helm/{proxy/clients → clients}/palmyra_client.py +31 -14
  822. helm/{proxy/clients → clients}/perspective_api_client.py +18 -14
  823. helm/clients/reka_client.py +190 -0
  824. helm/clients/simple_client.py +64 -0
  825. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  826. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  827. helm/clients/stanfordhealthcare_google_client.py +43 -0
  828. helm/clients/stanfordhealthcare_http_model_client.py +95 -0
  829. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  830. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  831. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  832. helm/clients/test_client.py +98 -0
  833. helm/{proxy/clients → clients}/test_huggingface_client.py +31 -16
  834. helm/clients/test_openrouter_client.py +69 -0
  835. helm/clients/test_simple_client.py +19 -0
  836. helm/clients/test_together_client.py +184 -0
  837. helm/clients/together_client.py +599 -0
  838. helm/clients/upstage_client.py +23 -0
  839. helm/clients/vertexai_client.py +488 -0
  840. helm/clients/vision_language/__init__.py +0 -0
  841. helm/clients/vision_language/huggingface_vision2seq_client.py +148 -0
  842. helm/clients/vision_language/huggingface_vlm_client.py +114 -0
  843. helm/{proxy/clients → clients}/vision_language/idefics_client.py +61 -51
  844. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  845. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  846. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  847. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  848. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  849. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  850. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  851. helm/clients/vision_language/open_flamingo_client.py +155 -0
  852. helm/clients/vision_language/paligemma_client.py +147 -0
  853. helm/clients/vision_language/palmyra_vision_client.py +101 -0
  854. helm/clients/vision_language/qwen2_vlm_client.py +189 -0
  855. helm/clients/vision_language/qwen_vlm_client.py +174 -0
  856. helm/clients/vllm_client.py +80 -0
  857. helm/clients/vllm_granite_thinking_client.py +56 -0
  858. helm/clients/writer_client.py +105 -0
  859. helm/clients/yi_client.py +28 -0
  860. helm/common/audio_utils.py +111 -0
  861. helm/common/cache.py +23 -33
  862. helm/common/cache_backend_config.py +47 -0
  863. helm/common/clip_score_request.py +41 -0
  864. helm/common/context.py +80 -0
  865. helm/common/credentials_utils.py +5 -5
  866. helm/common/critique_request.py +10 -2
  867. helm/common/file_caches/__init__.py +0 -0
  868. helm/common/file_caches/file_cache.py +16 -0
  869. helm/common/file_caches/local_file_cache.py +61 -0
  870. helm/common/file_caches/test_local_file_cache.py +25 -0
  871. helm/common/file_upload_request.py +27 -0
  872. helm/common/general.py +10 -3
  873. helm/common/hierarchical_logger.py +124 -12
  874. helm/common/image_generation_parameters.py +25 -0
  875. helm/common/images_utils.py +60 -5
  876. helm/common/key_value_store.py +41 -10
  877. helm/common/local_context.py +140 -0
  878. helm/common/media_object.py +14 -1
  879. helm/common/moderations_api_request.py +71 -0
  880. helm/common/mongo_key_value_store.py +8 -7
  881. helm/common/multimodal_request_utils.py +57 -0
  882. helm/common/nudity_check_request.py +29 -0
  883. helm/common/object_spec.py +23 -8
  884. helm/common/optional_dependencies.py +1 -1
  885. helm/common/reeval_parameters.py +12 -0
  886. helm/common/remote_context.py +61 -0
  887. helm/common/request.py +45 -19
  888. helm/common/response_format.py +18 -0
  889. helm/common/test_cache.py +1 -48
  890. helm/common/test_general.py +10 -0
  891. helm/common/test_logging.py +94 -0
  892. helm/common/test_media_object.py +1 -1
  893. helm/common/tokenization_request.py +1 -10
  894. helm/config/model_deployments.yaml +4713 -1005
  895. helm/config/model_metadata.yaml +4045 -255
  896. helm/config/tokenizer_configs.yaml +1091 -50
  897. helm/proxy/accounts.py +31 -4
  898. helm/proxy/cli.py +6 -4
  899. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  900. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  901. helm/proxy/critique/model_critique_client.py +40 -10
  902. helm/proxy/example_queries.py +33 -28
  903. helm/proxy/retry.py +5 -0
  904. helm/proxy/server.py +82 -18
  905. helm/proxy/services/remote_service.py +32 -7
  906. helm/proxy/services/server_service.py +71 -69
  907. helm/proxy/services/service.py +30 -6
  908. helm/proxy/services/test_remote_service.py +6 -5
  909. helm/proxy/services/test_service.py +1 -13
  910. helm/proxy/static/help.html +99 -0
  911. helm/proxy/static/index.css +61 -0
  912. helm/proxy/static/index.html +40 -0
  913. helm/proxy/static/index.js +462 -0
  914. helm/proxy/test_accounts.py +32 -0
  915. helm/proxy/test_retry.py +1 -1
  916. helm/proxy/token_counters/auto_token_counter.py +37 -37
  917. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  918. helm/proxy/token_counters/token_counter.py +3 -5
  919. helm/tokenizers/__init__.py +0 -0
  920. helm/tokenizers/ai21_tokenizer.py +52 -0
  921. helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +1 -1
  922. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +9 -12
  923. helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +2 -30
  924. helm/tokenizers/cohere_tokenizer.py +50 -0
  925. helm/tokenizers/grok_tokenizer.py +55 -0
  926. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +4 -4
  927. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +44 -41
  928. helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +1 -1
  929. helm/tokenizers/simple_tokenizer.py +33 -0
  930. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  931. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +6 -2
  932. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  933. helm/tokenizers/test_grok_tokenizer.py +33 -0
  934. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +9 -2
  935. helm/tokenizers/test_simple_tokenizer.py +33 -0
  936. helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +1 -1
  937. helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +1 -1
  938. helm/{proxy/tokenizers → tokenizers}/tokenizer.py +3 -1
  939. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  940. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +8 -6
  941. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  942. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  943. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  944. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  945. crfm_helm-0.4.0.dist-info/METADATA +0 -264
  946. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  947. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  948. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  949. helm/benchmark/data_overlap/light_scenario.py +0 -60
  950. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  951. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  952. helm/benchmark/run_specs.py +0 -2762
  953. helm/benchmark/scenarios/numeracy_scenario.py +0 -784
  954. helm/benchmark/static/benchmarking.css +0 -156
  955. helm/benchmark/static/benchmarking.js +0 -1705
  956. helm/benchmark/static/config.js +0 -3
  957. helm/benchmark/static/images/helm-logo.png +0 -0
  958. helm/benchmark/static/images/language-model-helm.png +0 -0
  959. helm/benchmark/static/images/organizations/ai21.png +0 -0
  960. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  961. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  962. helm/benchmark/static/images/organizations/cohere.png +0 -0
  963. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  964. helm/benchmark/static/images/organizations/google.png +0 -0
  965. helm/benchmark/static/images/organizations/meta.png +0 -0
  966. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  967. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  968. helm/benchmark/static/images/organizations/openai.png +0 -0
  969. helm/benchmark/static/images/organizations/together.png +0 -0
  970. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  971. helm/benchmark/static/images/organizations/yandex.png +0 -0
  972. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  973. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  974. helm/benchmark/static/index.html +0 -68
  975. helm/benchmark/static/json-urls.js +0 -69
  976. helm/benchmark/static/plot-captions.js +0 -27
  977. helm/benchmark/static/utils.js +0 -285
  978. helm/benchmark/test_model_deployment_definition.py +0 -92
  979. helm/benchmark/test_model_properties.py +0 -1570
  980. helm/benchmark/vlm_run_specs.py +0 -97
  981. helm/benchmark/window_services/ai21_window_service.py +0 -258
  982. helm/benchmark/window_services/cohere_window_service.py +0 -163
  983. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  984. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  985. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  986. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  987. helm/benchmark/window_services/t511b_window_service.py +0 -30
  988. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  989. helm/benchmark/window_services/test_cohere_window_service.py +0 -74
  990. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  991. helm/benchmark/window_services/test_ice_window_service.py +0 -326
  992. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  993. helm/benchmark/window_services/ul2_window_service.py +0 -30
  994. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  995. helm/common/cache_utils.py +0 -14
  996. helm/proxy/clients/aleph_alpha_client.py +0 -95
  997. helm/proxy/clients/goose_ai_client.py +0 -99
  998. helm/proxy/clients/microsoft_client.py +0 -180
  999. helm/proxy/clients/openai_client.py +0 -206
  1000. helm/proxy/clients/simple_client.py +0 -60
  1001. helm/proxy/clients/test_client.py +0 -49
  1002. helm/proxy/clients/test_together_client.py +0 -97
  1003. helm/proxy/clients/together_client.py +0 -334
  1004. helm/proxy/clients/vertexai_client.py +0 -115
  1005. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  1006. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  1007. helm/proxy/token_counters/free_token_counter.py +0 -12
  1008. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  1009. helm/proxy/token_counters/openai_token_counter.py +0 -22
  1010. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  1011. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  1012. helm/proxy/tokenizers/ai21_tokenizer.py +0 -60
  1013. helm/proxy/tokenizers/anthropic_tokenizer.py +0 -52
  1014. helm/proxy/tokenizers/cohere_tokenizer.py +0 -83
  1015. helm/proxy/tokenizers/ice_tokenizer.py +0 -30
  1016. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  1017. helm/proxy/tokenizers/test_ice_tokenizer.py +0 -57
  1018. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
  1019. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info/licenses}/LICENSE +0 -0
  1020. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
  1021. /helm/benchmark/{data_overlap → annotation}/__init__.py +0 -0
  1022. /helm/{proxy/clients → benchmark/annotation/image2struct}/__init__.py +0 -0
  1023. /helm/{proxy/clients/vision_language → benchmark/metrics/ifeval}/__init__.py +0 -0
  1024. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  1025. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  1026. /helm/benchmark/{static/images/crfm-logo.png → static_build/assets/crfm-logo-Du4T1uWZ.png} +0 -0
  1027. /helm/benchmark/{static/images/helm-logo-simple.png → static_build/assets/helm-logo-simple-DzOhNN41.png} +0 -0
  1028. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  1029. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  1030. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  1031. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  1032. /helm/{benchmark → proxy}/static/general.js +0 -0
  1033. /helm/{benchmark → proxy}/static/info-icon.png +0 -0
@@ -1,1128 +1,8 @@
1
1
  ---
2
2
  ############################################################
3
- models:
4
- # AI21 Labs
5
- - name: ai21/j1-jumbo
6
- display_name: J1-Jumbo v1 (178B)
7
- description: Jurassic-1 Jumbo (178B parameters) ([docs](https://studio.ai21.com/docs/jurassic1-language-models/), [tech report](https://uploads-ssl.webflow.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf)).
8
- creator_organization: AI21 Labs
9
- access: limited
10
- num_parameters: 178000000000
11
- release_date: 2021-08-11
12
- - name: ai21/j1-large
13
- display_name: J1-Large v1 (7.5B)
14
- description: Jurassic-1 Large (7.5B parameters) ([docs](https://studio.ai21.com/docs/jurassic1-language-models/), [tech report](https://uploads-ssl.webflow.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf)).
15
- creator_organization: AI21 Labs
16
- access: limited
17
- num_parameters: 7500000000
18
- release_date: 2021-08-11
19
- - name: ai21/j1-grande
20
- display_name: J1-Grande v1 (17B)
21
- description: Jurassic-1 Grande (17B parameters) with a "few tweaks" to the training process ([docs](https://studio.ai21.com/docs/jurassic1-language-models/), [tech report](https://uploads-ssl.webflow.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf)).
22
- creator_organization: AI21 Labs
23
- access: limited
24
- num_parameters: 17000000000
25
- release_date: 2022-05-03
26
- - name: ai21/j1-grande-v2-beta
27
- display_name: J1-Grande v2 beta (17B)
28
- description: Jurassic-1 Grande v2 beta (17B parameters)
29
- creator_organization: AI21 Labs
30
- access: limited
31
- num_parameters: 17000000000
32
- release_date: 2022-10-28
33
- - name: ai21/j2-jumbo
34
- display_name: Jurassic-2 Jumbo (178B)
35
- description: Jurassic-2 Jumbo (178B parameters) ([docs](https://www.ai21.com/blog/introducing-j2))
36
- creator_organization: AI21 Labs
37
- access: limited
38
- num_parameters: 178000000000
39
- release_date: 2023-03-09
40
- - name: ai21/j2-grande
41
- display_name: Jurassic-2 Grande (17B)
42
- description: Jurassic-2 Grande (17B parameters) ([docs](https://www.ai21.com/blog/introducing-j2))
43
- creator_organization: AI21 Labs
44
- access: limited
45
- num_parameters: 17000000000
46
- release_date: 2023-03-09
47
- - name: ai21/j2-large
48
- display_name: Jurassic-2 Large (7.5B)
49
- description: Jurassic-2 Large (7.5B parameters) ([docs](https://www.ai21.com/blog/introducing-j2))
50
- creator_organization: AI21 Labs
51
- access: limited
52
- num_parameters: 7500000000
53
- release_date: 2023-03-09
54
-
55
- # Aleph Alpha
56
- # TODO: add Luminous World when it's released
57
- - name: AlephAlpha/luminous-base
58
- display_name: Luminous Base (13B)
59
- description: Luminous Base (13B parameters) ([docs](https://docs.aleph-alpha.com/docs/introduction/luminous/))
60
- creator_organization: Aleph Alpha
61
- access: limited
62
- num_parameters: 13000000000
63
- # TODO: get exact release date
64
- release_date: 2022-01-01
65
- - name: AlephAlpha/luminous-extended
66
- display_name: Luminous Extended (30B)
67
- description: Luminous Extended (30B parameters) ([docs](https://docs.aleph-alpha.com/docs/introduction/luminous/))
68
- creator_organization: Aleph Alpha
69
- access: limited
70
- num_parameters: 30000000000
71
- release_date: 2022-01-01
72
- - name: AlephAlpha/luminous-supreme
73
- display_name: Luminous Supreme (70B)
74
- description: Luminous Supreme (70B parameters) ([docs](https://docs.aleph-alpha.com/docs/introduction/luminous/))
75
- creator_organization: Aleph Alpha
76
- access: limited
77
- num_parameters: 70000000000
78
- release_date: 2022-01-01
79
-
80
- # TODO: Remove Once we have configurable model names
81
- - name: neurips/local
82
- display_name: Local service
83
- description: Local competition service
84
- creator_organization: neurips
85
- access: open
86
- num_parameters: 1
87
- release_date: 2021-12-01
88
-
89
-
90
- # Anthropic
91
- - name: anthropic/stanford-online-all-v4-s3
92
- display_name: Anthropic-LM v4-s3 (52B)
93
- description: A 52B parameter language model, trained using reinforcement learning from human feedback [paper](https://arxiv.org/pdf/2204.05862.pdf).
94
- creator_organization: Anthropic
95
- access: closed
96
- num_parameters: 52000000000
97
- release_date: 2021-12-01
98
- - name: anthropic/claude-2.0
99
- display_name: Anthropic Claude 2.0
100
- description: Claude 2.0 is a general purpose large language model developed by Anthropic. It uses a transformer architecture and is trained via unsupervised learning, RLHF, and Constitutional AI (including both a supervised and Reinforcement Learning (RL) phase). ([model card](https://efficient-manatee.files.svdcdn.com/production/images/Model-Card-Claude-2.pdf))
101
- creator_organization: Anthropic
102
- access: limited
103
- release_date: 2023-07-11
104
- - name: anthropic/claude-2.1
105
- display_name: Anthropic Claude 2.1
106
- description: Claude 2.1 is a general purpose large language model developed by Anthropic. It uses a transformer architecture and is trained via unsupervised learning, RLHF, and Constitutional AI (including both a supervised and Reinforcement Learning (RL) phase). ([model card](https://efficient-manatee.files.svdcdn.com/production/images/Model-Card-Claude-2.pdf))
107
- creator_organization: Anthropic
108
- access: limited
109
- release_date: 2023-11-21
110
- - name: anthropic/claude-v1.3
111
- display_name: Anthropic Claude v1.3
112
- description: A model trained using reinforcement learning from human feedback ([docs](https://www.anthropic.com/index/introducing-claude)).
113
- creator_organization: Anthropic
114
- access: limited
115
- release_date: 2023-03-17
116
- - name: anthropic/claude-instant-v1
117
- display_name: Anthropic Claude Instant V1
118
- description: A lightweight version of Claude, a model trained using reinforcement learning from human feedback ([docs](https://www.anthropic.com/index/introducing-claude)).
119
- creator_organization: Anthropic
120
- access: limited
121
- release_date: 2023-03-17
122
- - name: anthropic/claude-instant-1.2
123
- display_name: Anthropic Claude Instant 1.2
124
- description: A lightweight version of Claude, a model trained using reinforcement learning from human feedback ([docs](https://www.anthropic.com/index/introducing-claude)).
125
- creator_organization: Anthropic
126
- access: limited
127
- release_date: 2023-08-09
128
-
129
- # Berkeley
130
- - name: together/koala-13b
131
- display_name: Koala (13B)
132
- description: Koala (13B) is a chatbot fine-tuned from Llama (13B) on dialogue data gathered from the web. ([blog post](https://bair.berkeley.edu/blog/2023/04/03/koala/))
133
- creator_organization: UC Berkeley
134
- access: open
135
- num_parameters: 13000000000
136
- release_date: 2022-04-03
137
- todo: true
138
-
139
- # BigScience
140
- - name: together/bloom
141
- display_name: BLOOM (176B)
142
- description: BLOOM (176B parameters) is an autoregressive model trained on 46 natural languages and 13 programming languages ([paper](https://arxiv.org/pdf/2211.05100.pdf)).
143
- creator_organization: BigScience
144
- access: open
145
- num_parameters: 176000000000
146
- release_date: 2022-06-28
147
- - name: together/bloomz
148
- display_name: BLOOMZ (176B)
149
- description: BLOOMZ (176B parameters) is BLOOM that has been fine-tuned on natural language instructions ([details](https://huggingface.co/bigscience/bloomz)).
150
- creator_organization: BigScience
151
- access: open
152
- num_parameters: 176000000000
153
- release_date: 2022-11-03
154
- todo: true
155
- - name: together/t0pp
156
- display_name: T0pp (11B)
157
- description: T0pp (11B parameters) is an encoder-decoder model trained on a large set of different tasks specified in natural language prompts ([paper](https://arxiv.org/pdf/2110.08207.pdf)).
158
- creator_organization: BigScience
159
- access: open
160
- num_parameters: 11000000000
161
- release_date: 2021-10-15
162
-
163
- # BigCode
164
- - name: huggingface/santacoder
165
- display_name: SantaCoder (1.1B)
166
- description: SantaCoder (1.1B parameters) model trained on the Python, Java, and JavaScript subset of The Stack (v1.1) ([model card](https://huggingface.co/bigcode/santacoder)).
167
- creator_organization: BigCode
168
- access: open
169
- - name: huggingface/starcoder
170
- display_name: StarCoder (15.5B)
171
- description: The StarCoder (15.5B parameter) model trained on 80+ programming languages from The Stack (v1.2) ([model card](https://huggingface.co/bigcode/starcoder)).
172
- creator_organization: BigCode
173
- access: open
174
-
175
- # Hugging Face
176
- - name: huggingface/gpt2
177
- display_name: GPT-2 (124M)
178
- description: GPT-2 is a transformers model pretrained on a very large corpus of English data in a self-supervised fashion. This means it was pretrained on the raw texts only, with no humans labelling them in any way (which is why it can use lots of publicly available data) with an automatic process to generate inputs and labels from those texts.
179
- creator_organization: OpenAI
180
- access: open
181
- num_parameters: 124000000
182
- - name: huggingface/gpt2-medium
183
- display_name: GPT-2 Medium (355M)
184
- description: GPT-2 Medium is the 355M parameter version of GPT-2, a transformer-based language model created and released by OpenAI. The model is a pretrained model on English language using a causal language modeling (CLM) objective.
185
- creator_organization: OpenAI
186
- access: open
187
- num_parameters: 355000000
188
- - name: huggingface/gpt2-large
189
- display_name: GPT-2 Large (774M)
190
- description: GPT-2 Large is the 774M parameter version of GPT-2, a transformer-based language model created and released by OpenAI. The model is a pretrained model on English language using a causal language modeling (CLM) objective.
191
- creator_organization: OpenAI
192
- access: open
193
- num_parameters: 774000000
194
- - name: huggingface/gpt2-xl
195
- display_name: GPT-2 XL (1.5B)
196
- description: GPT-2 XL is the 1.5B parameter version of GPT-2, a transformer-based language model created and released by OpenAI. The model is a pretrained model on English language using a causal language modeling (CLM) objective.
197
- creator_organization: OpenAI
198
- access: open
199
- num_parameters: 1500000000
200
-
201
- # HuggignfaceM4
202
- - name: HuggingFaceM4/idefics-9b
203
- display_name: IDEFICS (9B)
204
- description: IDEFICS (9B parameters) is an open-source model based on DeepMind's Flamingo. ([blog](https://huggingface.co/blog/idefics))
205
- creator_organization: HuggingFace
206
- access: open
207
- num_parameters: 9000000000
208
- release_date: 2023-08-22
209
- - name: HuggingFaceM4/idefics-9b-instruct
210
- display_name: IDEFICS instruct (9B)
211
- description: IDEFICS instruct (9B parameters) is an open-source model based on DeepMind's Flamingo. ([blog](https://huggingface.co/blog/idefics))
212
- creator_organization: HuggingFace
213
- access: open
214
- num_parameters: 9000000000
215
- release_date: 2023-08-22
216
- - name: HuggingFaceM4/idefics-80b
217
- display_name: IDEFICS (80B)
218
- description: IDEFICS (80B parameters) is an open-source model based on DeepMind's Flamingo. ([blog](https://huggingface.co/blog/idefics))
219
- creator_organization: HuggingFace
220
- access: open
221
- num_parameters: 80000000000
222
- release_date: 2023-08-22
223
- - name: HuggingFaceM4/idefics-80b-instruct
224
- display_name: IDEFICS instruct (80B)
225
- description: IDEFICS instruct (80B parameters) is an open-source model based on DeepMind's Flamingo. ([blog](https://huggingface.co/blog/idefics))
226
- creator_organization: HuggingFace
227
- access: open
228
- num_parameters: 80000000000
229
- release_date: 2023-08-22
230
-
231
- # Cerebras Systems
232
- - name: together/cerebras-gpt-6.7b
233
- display_name: Cerebras GPT (6.7B)
234
- description: Cerebras GPT is a family of open compute-optimal language models scaled from 111M to 13B parameters trained on the Eleuther Pile. ([paper](https://arxiv.org/pdf/2304.03208.pdf))
235
- creator_organization: Cerebras
236
- access: limited
237
- num_parameters: 6700000000
238
- release_date: 2023-04-06
239
- todo: true
240
- - name: together/cerebras-gpt-13b
241
- display_name: Cerebras GPT (13B)
242
- description: Cerebras GPT is a family of open compute-optimal language models scaled from 111M to 13B parameters trained on the Eleuther Pile. ([paper](https://arxiv.org/pdf/2304.03208.pdf))
243
- creator_organization: Cerebras
244
- access: limited
245
- num_parameters: 13000000000
246
- release_date: 2023-04-06
247
- todo: true
248
-
249
- # Cohere
250
- - name: cohere/xlarge-20220609
251
- display_name: Cohere xlarge v20220609 (52.4B)
252
- description: Cohere xlarge v20220609 (52.4B parameters)
253
- creator_organization: Cohere
254
- access: limited
255
- num_parameters: 52400000000
256
- release_date: 2022-06-09
257
- - name: cohere/large-20220720
258
- display_name: Cohere large v20220720 (13.1B)
259
- description: Cohere large v20220720 (13.1B parameters), which is deprecated by Cohere as of December 2, 2022.
260
- creator_organization: Cohere
261
- access: limited
262
- num_parameters: 13100000000
263
- release_date: 2022-07-20
264
- - name: cohere/medium-20220720
265
- display_name: Cohere medium v20220720 (6.1B)
266
- description: Cohere medium v20220720 (6.1B parameters)
267
- creator_organization: Cohere
268
- access: limited
269
- num_parameters: 6100000000
270
- release_date: 2022-07-20
271
- - name: cohere/small-20220720
272
- display_name: Cohere small v20220720 (410M)
273
- description: Cohere small v20220720 (410M parameters), which is deprecated by Cohere as of December 2, 2022.
274
- creator_organization: Cohere
275
- access: limited
276
- num_parameters: 410000000
277
- release_date: 2022-07-20
278
- - name: cohere/xlarge-20221108
279
- display_name: Cohere xlarge v20221108 (52.4B)
280
- description: Cohere xlarge v20221108 (52.4B parameters)
281
- creator_organization: Cohere
282
- access: limited
283
- num_parameters: 52400000000
284
- release_date: 2022-11-08
285
- - name: cohere/medium-20221108
286
- display_name: Cohere medium v20221108 (6.1B)
287
- description: Cohere medium v20221108 (6.1B parameters)
288
- creator_organization: Cohere
289
- access: limited
290
- num_parameters: 6100000000
291
- release_date: 2022-11-08
292
- - name: cohere/command-medium-beta
293
- display_name: Cohere Command beta (6.1B)
294
- description: Cohere Command beta (6.1B parameters) is fine-tuned from the medium model to respond well with instruction-like prompts ([details](https://docs.cohere.ai/docs/command-beta)).
295
- creator_organization: Cohere
296
- access: limited
297
- num_parameters: 6100000000
298
- release_date: 2022-11-08
299
- - name: cohere/command-xlarge-beta
300
- display_name: Cohere Command beta (52.4B)
301
- description: Cohere Command beta (52.4B parameters) is fine-tuned from the XL model to respond well with instruction-like prompts ([details](https://docs.cohere.ai/docs/command-beta)).
302
- creator_organization: Cohere
303
- access: limited
304
- num_parameters: 52400000000
305
- release_date: 2022-11-08
306
- - name: cohere/command
307
- display_name: Cohere Command
308
- description: Command is Cohere’s flagship text generation model. It is trained to follow user commands and to be instantly useful in practical business applications. [docs](https://docs.cohere.com/reference/generate) and [changelog](https://docs.cohere.com/changelog)
309
- creator_organization: Cohere
310
- access: limited
311
- release_date: 2023-09-29
312
- - name: cohere/command-light
313
- display_name: Cohere Command Light
314
- description: Command is Cohere’s flagship text generation model. It is trained to follow user commands and to be instantly useful in practical business applications. [docs](https://docs.cohere.com/reference/generate) and [changelog](https://docs.cohere.com/changelog)
315
- creator_organization: Cohere
316
- access: limited
317
- release_date: 2023-09-29
318
-
319
- # Databricks
320
- - name: databricks/dolly-v2-3b
321
- display_name: Dolly V2 (3B)
322
- description: Dolly V2 (3B) is an instruction-following large language model trained on the Databricks machine learning platform. It is based on pythia-12b.
323
- creator_organization: Databricks
324
- access: open
325
- num_parameters: 2517652480
326
- release_date: 2023-04-12
327
- todo: true
328
- - name: databricks/dolly-v2-7b
329
- display_name: Dolly V2 (7B)
330
- description: Dolly V2 (7B) is an instruction-following large language model trained on the Databricks machine learning platform. It is based on pythia-12b.
331
- creator_organization: Databricks
332
- access: open
333
- num_parameters: 6444163072
334
- release_date: 2023-04-12
335
- todo: true
336
- - name: databricks/dolly-v2-12b
337
- display_name: Dolly V2 (12B)
338
- description: Dolly V2 (12B) is an instruction-following large language model trained on the Databricks machine learning platform. It is based on pythia-12b.
339
- creator_organization: Databricks
340
- access: open
341
- num_parameters: 11327027200
342
- release_date: 2023-04-12
343
- todo: true
344
-
345
- # DeepMind
346
- - name: deepmind/gopher
347
- display_name: Gopher (280B)
348
- description: Gopher (540B parameters) ([paper](https://arxiv.org/pdf/2112.11446.pdf)).
349
- creator_organization: DeepMind
350
- access: closed
351
- todo: true
352
- - name: deepmind/chinchilla
353
- display_name: Chinchilla (70B)
354
- description: Chinchilla (70B parameters) ([paper](https://arxiv.org/pdf/2203.15556.pdf)).
355
- creator_organization: DeepMind
356
- access: closed
357
- todo: true
358
-
359
- # EleutherAI
360
- - name: together/gpt-j-6b
361
- display_name: GPT-J (6B)
362
- description: GPT-J (6B parameters) autoregressive language model trained on The Pile ([details](https://arankomatsuzaki.wordpress.com/2021/06/04/gpt-j/)).
363
- creator_organization: EleutherAI
364
- access: open
365
- num_parameters: 6000000000
366
- release_date: 2021-06-04
367
- - name: together/gpt-neox-20b
368
- display_name: GPT-NeoX (20B)
369
- description: GPT-NeoX (20B parameters) autoregressive language model trained on The Pile ([paper](https://arxiv.org/pdf/2204.06745.pdf)).
370
- creator_organization: EleutherAI
371
- access: open
372
- num_parameters: 20000000000
373
- release_date: 2022-02-02
374
- - name: eleutherai/pythia-1b-v0
375
- display_name: Pythia (1B)
376
- description: Pythia (1B parameters). The Pythia project combines interpretability analysis and scaling laws to understand how knowledge develops and evolves during training in autoregressive transformers.
377
- creator_organization: EleutherAI
378
- access: open
379
- num_parameters: 805736448
380
- release_date: 2023-02-13
381
- todo: true
382
- - name: eleutherai/pythia-2.8b-v0
383
- display_name: Pythia (2.8B)
384
- description: Pythia (2.8B parameters). The Pythia project combines interpretability analysis and scaling laws to understand how knowledge develops and evolves during training in autoregressive transformers.
385
- creator_organization: EleutherAI
386
- access: open
387
- num_parameters: 2517652480
388
- release_date: 2023-02-13
389
- todo: true
390
- - name: eleutherai/pythia-6.9b
391
- display_name: Pythia (6.9B)
392
- description: Pythia (6.9B parameters). The Pythia project combines interpretability analysis and scaling laws to understand how knowledge develops and evolves during training in autoregressive transformers.
393
- creator_organization: EleutherAI
394
- access: open
395
- num_parameters: 6444163072
396
- release_date: 2023-02-13
397
- - name: eleutherai/pythia-12b-v0
398
- display_name: Pythia (12B)
399
- description: Pythia (12B parameters). The Pythia project combines interpretability analysis and scaling laws to understand how knowledge develops and evolves during training in autoregressive transformers.
400
- creator_organization: EleutherAI
401
- access: open
402
- num_parameters: 11327027200
403
- release_date: 2023-02-13
404
-
405
- # Google
406
- - name: together/t5-11b
407
- display_name: T5 (11B)
408
- description: T5 (11B parameters) is an encoder-decoder model trained on a multi-task mixture, where each task is converted into a text-to-text format ([paper](https://arxiv.org/pdf/1910.10683.pdf)).
409
- creator_organization: Google
410
- access: open
411
- num_parameters: 11000000000
412
- release_date: 2019-10-23
413
- - name: together/ul2
414
- display_name: UL2 (20B)
415
- description: UL2 (20B parameters) is an encoder-decoder model trained on the C4 corpus. It's similar to T5 but trained with a different objective and slightly different scaling knobs ([paper](https://arxiv.org/pdf/2205.05131.pdf)).
416
- creator_organization: Google
417
- access: open
418
- num_parameters: 20000000000
419
- release_date: 2022-05-10
420
- - name: together/flan-t5-xxl
421
- display_name: Flan-T5 (11B)
422
- description: Flan-T5 (11B parameters) is T5 fine-tuned on 1.8K tasks ([paper](https://arxiv.org/pdf/2210.11416.pdf)).
423
- creator_organization: Google
424
- access: open
425
- - name: google/palm
426
- display_name: PaLM (540B)
427
- description: Pathways Language Model (540B parameters) is trained using 6144 TPU v4 chips ([paper](https://arxiv.org/pdf/2204.02311.pdf)).
428
- creator_organization: Google
429
- access: closed
430
- todo: true
431
- ## PaLM 2
432
- - name: google/text-bison@001
433
- display_name: PaLM-2 (Bison)
434
- description: The best value PaLM model. PaLM 2 (Pathways Language Model) is a Transformer-based model trained using a mixture of objectives that was evaluated on English and multilingual language, and reasoning tasks. ([report](https://arxiv.org/pdf/2305.10403.pdf))
435
- creator_organization: Google
436
- access: limited
437
- release_date: 2023-06-07 # Source: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/text#model_versions
438
- - name: google/text-bison-32k
439
- display_name: PaLM-2 (Bison)
440
- description: The best value PaLM model with a 32K context. PaLM 2 (Pathways Language Model) is a Transformer-based model trained using a mixture of objectives that was evaluated on English and multilingual language, and reasoning tasks. ([report](https://arxiv.org/pdf/2305.10403.pdf))
441
- creator_organization: Google
442
- access: limited
443
- release_date: 2023-06-07 # Source: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/text#model_versions
444
- - name: google/text-unicorn@001
445
- display_name: PaLM-2 (Unicorn)
446
- description: The largest model in PaLM family. PaLM 2 (Pathways Language Model) is a Transformer-based model trained using a mixture of objectives that was evaluated on English and multilingual language, and reasoning tasks. ([report](https://arxiv.org/pdf/2305.10403.pdf))
447
- creator_organization: Google
448
- access: limited
449
- release_date: 2023-11-30 # Source: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/text#model_versions
450
- - name: google/code-bison@001
451
- display_name: Codey PaLM-2 (Bison)
452
- description: A model fine-tuned to generate code based on a natural language description of the desired code. PaLM 2 (Pathways Language Model) is a Transformer-based model trained using a mixture of objectives that was evaluated on English and multilingual language, and reasoning tasks. ([report](https://arxiv.org/pdf/2305.10403.pdf))
453
- creator_organization: Google
454
- access: limited
455
- release_date: 2023-06-29 # Source: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/code-generation#model_versions
456
- - name: google/code-bison-32k
457
- display_name: Codey PaLM-2 (Bison)
458
- description: Codey with a 32K context. PaLM 2 (Pathways Language Model) is a Transformer-based model trained using a mixture of objectives that was evaluated on English and multilingual language, and reasoning tasks. ([report](https://arxiv.org/pdf/2305.10403.pdf))
459
- creator_organization: Google
460
- access: limited
461
- release_date: 2023-06-29 # Source: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/code-generation#model_versions
462
-
463
- # HazyResearch
464
- - name: together/h3-2.7b
465
- display_name: H3 (2.7B)
466
- description: H3 (2.7B parameters) is a decoder-only language model based on state space models ([paper](https://arxiv.org/abs/2212.14052)).
467
- creator_organization: HazyResearch
468
- access: open
469
- num_parameters: 2700000000
470
- release_date: 2023-01-23
471
- todo: true
472
-
473
- # Lightning AI's Lit-GPT
474
- - name: lightningai/lit-gpt
475
- display_name: Lit-GPT
476
- description: Lit-GPT is an optimized collection of open-source LLMs for finetuning and inference. It supports – Falcon, Llama 2, Vicuna, LongChat, and other top-performing open-source large language models.
477
- creator_organization: Lightning AI
478
- access: open
479
- num_parameters: 1
480
- release_date: 2023-04-04
481
-
482
-
483
- # Meta
484
- - name: together/opt-iml-175b
485
- display_name: OPT-IML (175B)
486
- description: OPT-IML (175B parameters) is a suite of decoder-only transformer LMs that are multi-task fine-tuned on 2000 datasets ([paper](https://arxiv.org/pdf/2212.12017.pdf)).
487
- creator_organization: Meta
488
- access: open
489
- num_parameters: 175000000000
490
- release_date: 2022-12-22
491
- todo: true
492
-
493
- - name: together/opt-iml-30b
494
- display_name: OPT-IML (30B)
495
- description: OPT-IML (30B parameters) is a suite of decoder-only transformer LMs that are multi-task fine-tuned on 2000 datasets ([paper](https://arxiv.org/pdf/2212.12017.pdf)).
496
- creator_organization: Meta
497
- access: open
498
- num_parameters: 30000000000
499
- release_date: 2022-12-22
500
- todo: true
501
-
502
- - name: together/opt-175b
503
- display_name: OPT (175B)
504
- description: Open Pre-trained Transformers (175B parameters) is a suite of decoder-only pre-trained transformers that are fully and responsibly shared with interested researchers ([paper](https://arxiv.org/pdf/2205.01068.pdf)).
505
- creator_organization: Meta
506
- access: open
507
- num_parameters: 175000000000
508
- release_date: 2022-05-02
509
-
510
- - name: together/opt-66b
511
- display_name: OPT (66B)
512
- description: Open Pre-trained Transformers (66B parameters) is a suite of decoder-only pre-trained transformers that are fully and responsibly shared with interested researchers ([paper](https://arxiv.org/pdf/2205.01068.pdf)).
513
- creator_organization: Meta
514
- access: open
515
- num_parameters: 66000000000
516
- release_date: 2022-05-02
517
-
518
- - name: together/opt-6.7b
519
- display_name: OPT (6.7B)
520
- description: Open Pre-trained Transformers (6.7B parameters) is a suite of decoder-only pre-trained transformers that are fully and responsibly shared with interested researchers ([paper](https://arxiv.org/pdf/2205.01068.pdf)).
521
- creator_organization: Meta
522
- access: open
523
- num_parameters: 6700000000
524
- release_date: 2022-05-02
525
-
526
- - name: together/opt-1.3b
527
- display_name: OPT (1.3B)
528
- description: Open Pre-trained Transformers (1.3B parameters) is a suite of decoder-only pre-trained transformers that are fully and responsibly shared with interested researchers ([paper](https://arxiv.org/pdf/2205.01068.pdf)).
529
- creator_organization: Meta
530
- access: open
531
- num_parameters: 1300000000
532
- release_date: 2022-05-02
533
-
534
- - name: together/galactica-120b
535
- display_name: Galactica (120B)
536
- description: Galactica (120B parameters) is trained on 48 million papers, textbooks, lectures notes, compounds and proteins, scientific websites, etc. ([paper](https://galactica.org/static/paper.pdf)).
537
- creator_organization: Meta
538
- access: open
539
- num_parameters: 120000000000
540
- release_date: 2022-11-15
541
- todo: true
542
-
543
- - name: together/galactica-30b
544
- display_name: Galactica (30B)
545
- description: Galactica (30B parameters) is trained on 48 million papers, textbooks, lectures notes, compounds and proteins, scientific websites, etc. ([paper](https://galactica.org/static/paper.pdf)).
546
- creator_organization: Meta
547
- access: open
548
- num_parameters: 30000000000
549
- release_date: 2022-11-15
550
- todo: true
551
- - name: meta/llama-7b
552
- display_name: LLaMA (7B)
553
- description: LLaMA is a collection of foundation language models ranging from 7B to 65B parameters.
554
- creator_organization: Meta
555
- access: open
556
- num_parameters: 7000000000
557
- release_date: 2023-02-24
558
- - name: meta/llama-13b
559
- display_name: LLaMA (13B)
560
- description: LLaMA is a collection of foundation language models ranging from 7B to 65B parameters.
561
- creator_organization: Meta
562
- access: open
563
- num_parameters: 13000000000
564
- release_date: 2023-02-24
565
- - name: meta/llama-30b
566
- display_name: LLaMA (30B)
567
- description: LLaMA is a collection of foundation language models ranging from 7B to 65B parameters.
568
- creator_organization: Meta
569
- access: open
570
- num_parameters: 30000000000
571
- release_date: 2023-02-24
572
- - name: meta/llama-65b
573
- display_name: LLaMA (65B)
574
- description: LLaMA is a collection of foundation language models ranging from 7B to 65B parameters.
575
- creator_organization: Meta
576
- access: open
577
- num_parameters: 65000000000
578
- release_date: 2023-02-24
579
- - name: meta/llama-2-7b
580
- display_name: Llama 2 (7B)
581
- description: Llama 2 pretrained models are trained on 2 trillion tokens, and have double the context length than Llama 1.
582
- creator_organization: Meta
583
- access: open
584
- num_parameters: 7000000000
585
- release_date: 2023-07-18
586
- - name: meta/llama-2-13b
587
- display_name: Llama 2 (13B)
588
- description: Llama 2 pretrained models are trained on 2 trillion tokens, and have double the context length than Llama 1.
589
- creator_organization: Meta
590
- access: open
591
- num_parameters: 13000000000
592
- release_date: 2023-07-18
593
- - name: meta/llama-2-70b
594
- display_name: Llama 2 (70B)
595
- description: Llama 2 pretrained models are trained on 2 trillion tokens, and have double the context length than Llama 1.
596
- creator_organization: Meta
597
- access: open
598
- num_parameters: 70000000000
599
- release_date: 2023-07-18
600
-
601
- # Stability AI
602
- - name: stabilityai/stablelm-base-alpha-3b
603
- display_name: StableLM-Base-Alpha (3B)
604
- description: StableLM-Base-Alpha is a suite of 3B and 7B parameter decoder-only language models pre-trained on a diverse collection of English datasets with a sequence length of 4096 to push beyond the context window limitations of existing open-source language models.
605
- creator_organization: Stability AI
606
- access: open
607
- num_parameters: 3000000000
608
- release_date: 2023-04-20
609
- todo: true
610
-
611
- - name: stabilityai/stablelm-base-alpha-7b
612
- display_name: StableLM-Base-Alpha (7B)
613
- description: StableLM-Base-Alpha is a suite of 3B and 7B parameter decoder-only language models pre-trained on a diverse collection of English datasets with a sequence length of 4096 to push beyond the context window limitations of existing open-source language models.
614
- creator_organization: Stability AI
615
- access: open
616
- num_parameters: 7000000000
617
- release_date: 2023-04-20
618
- todo: true
619
-
620
- # Stanford
621
- - name: stanford/alpaca-7b
622
- display_name: Alpaca (7B)
623
- description: Alpaca 7B is a model fine-tuned from the LLaMA 7B model on 52K instruction-following demonstrations
624
- creator_organization: Stanford
625
- access: open
626
- num_parameters: 7000000000
627
- release_date: 2023-03-13
628
-
629
- # LMSYS
630
- - name: lmsys/vicuna-7b-v1.3
631
- display_name: Vicuna v1.3 (7B)
632
- description: Vicuna v1.3 (7B) is an open-source chatbot trained by fine-tuning LLaMA on user-shared conversations collected from ShareGPT.
633
- creator_organization: LMSYS
634
- access: open
635
- num_parameters: 7000000000
636
- release_date: 2023-06-22
637
- - name: lmsys/vicuna-13b-v1.3
638
- display_name: Vicuna v1.3 (13B)
639
- description: Vicuna v1.3 (13B) is an open-source chatbot trained by fine-tuning LLaMA on user-shared conversations collected from ShareGPT.
640
- creator_organization: LMSYS
641
- access: open
642
- num_parameters: 13000000000
643
- release_date: 2023-06-22
644
-
645
- # 01.AI
646
- - name: 01-ai/yi-6b
647
- display_name: Yi (6B)
648
- description: The Yi models are large language models trained from scratch by developers at 01.AI.
649
- creator_organization: 01.AI
650
- access: open
651
- num_parameters: 6000000000
652
- release_date: 2023-11-02
653
- - name: 01-ai/yi-34b
654
- display_name: Yi (34B)
655
- description: The Yi models are large language models trained from scratch by developers at 01.AI.
656
- creator_organization: 01.AI
657
- access: open
658
- num_parameters: 34000000000
659
- release_date: 2023-11-02
660
-
661
- # Mistral AI
662
- - name: mistralai/mistral-7b-v0.1
663
- display_name: Mistral v0.1 (7B)
664
- description: Mistral 7B is a 7.3B parameter transformer model that uses Grouped-Query Attention (GQA) and Sliding-Window Attention (SWA).
665
- creator_organization: Mistral AI
666
- access: open
667
- num_parameters: 7300000000
668
- release_date: 2023-09-27
669
-
670
- # Microsoft/NVIDIA
671
- - name: microsoft/TNLGv2_530B
672
- display_name: TNLG v2 (530B)
673
- description: TNLG v2 (530B parameters) autoregressive language model trained on a filtered subset of the Pile and CommonCrawl ([paper](https://arxiv.org/pdf/2201.11990.pdf)).
674
- creator_organization: Microsoft/NVIDIA
675
- access: closed
676
- num_parameters: 530000000000
677
- release_date: 2022-01-28
678
- - name: microsoft/TNLGv2_7B
679
- display_name: TNLG v2 (6.7B)
680
- description: TNLG v2 (6.7B parameters) autoregressive language model trained on a filtered subset of the Pile and CommonCrawl ([paper](https://arxiv.org/pdf/2201.11990.pdf)).
681
- creator_organization: Microsoft/NVIDIA
682
- access: closed
683
- num_parameters: 6700000000
684
- release_date: 2022-01-28
685
-
686
- # OpenAI: https://beta.openai.com/docs/engines/gpt-3
687
- - name: openai/davinci
688
- display_name: davinci (175B)
689
- description: Original GPT-3 (175B parameters) autoregressive language model ([paper](https://arxiv.org/pdf/2005.14165.pdf), [docs](https://beta.openai.com/docs/model-index-for-researchers)).
690
- creator_organization: OpenAI
691
- access: limited
692
- num_parameters: 175000000000
693
- release_date: 2020-05-28
694
- - name: openai/curie
695
- display_name: curie (6.7B)
696
- description: Original GPT-3 (6.7B parameters) autoregressive language model ([paper](https://arxiv.org/pdf/2005.14165.pdf), [docs](https://beta.openai.com/docs/model-index-for-researchers)).
697
- creator_organization: OpenAI
698
- access: limited
699
- num_parameters: 6700000000
700
- release_date: 2020-05-28
701
- - name: openai/babbage
702
- display_name: babbage (1.3B)
703
- description: Original GPT-3 (1.3B parameters) autoregressive language model ([paper](https://arxiv.org/pdf/2005.14165.pdf), [docs](https://beta.openai.com/docs/model-index-for-researchers)).
704
- creator_organization: OpenAI
705
- access: limited
706
- num_parameters: 1300000000
707
- release_date: 2020-05-28
708
- - name: openai/ada
709
- display_name: ada (350M)
710
- description: Original GPT-3 (350M parameters) autoregressive language model ([paper](https://arxiv.org/pdf/2005.14165.pdf), [docs](https://beta.openai.com/docs/model-index-for-researchers)).
711
- creator_organization: OpenAI
712
- access: limited
713
- num_parameters: 350000000
714
- release_date: 2020-05-28
715
- - name: openai/text-davinci-003
716
- display_name: text-davinci-003
717
- description: text-davinci-003 model that involves reinforcement learning (PPO) with reward models. Derived from text-davinci-002 ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
718
- creator_organization: OpenAI
719
- access: limited
720
- num_parameters: 175000000000
721
- release_date: 2022-11-28
722
- - name: openai/text-davinci-002
723
- display_name: text-davinci-002
724
- description: text-davinci-002 model that involves supervised fine-tuning on human-written demonstrations. Derived from code-davinci-002 ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
725
- creator_organization: OpenAI
726
- access: limited
727
- num_parameters: 175000000000
728
- release_date: 2022-01-27
729
- - name: openai/text-davinci-001
730
- display_name: text-davinci-001
731
- description: text-davinci-001 model that involves supervised fine-tuning on human-written demonstrations ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
732
- creator_organization: OpenAI
733
- access: limited
734
- num_parameters: 175000000000
735
- release_date: 2022-01-27
736
- todo: true
737
- - name: openai/text-curie-001
738
- display_name: text-curie-001
739
- description: text-curie-001 model that involves supervised fine-tuning on human-written demonstrations ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
740
- creator_organization: OpenAI
741
- access: limited
742
- num_parameters: 6700000000
743
- release_date: 2022-01-27
744
- - name: openai/text-babbage-001
745
- display_name: text-babbage-001
746
- description: text-babbage-001 model that involves supervised fine-tuning on human-written demonstrations ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
747
- creator_organization: OpenAI
748
- access: limited
749
- num_parameters: 1300000000
750
- release_date: 2022-01-27
751
- - name: openai/text-ada-001
752
- display_name: text-ada-001
753
- description: text-ada-001 model that involves supervised fine-tuning on human-written demonstrations ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
754
- creator_organization: OpenAI
755
- access: limited
756
- num_parameters: 350000000
757
- release_date: 2022-01-27
758
- - name: openai/gpt-4-0314
759
- display_name: gpt-4-0314
760
- description: GPT-4 is a large multimodal model (currently only accepting text inputs and emitting text outputs) that is optimized for chat but works well for traditional completions tasks. Snapshot of gpt-4 from March 14th 2023.
761
- creator_organization: OpenAI
762
- access: limited
763
- release_date: 2023-03-14
764
- - name: openai/gpt-4-32k-0314
765
- display_name: gpt-4-32k-0314
766
- description: GPT-4 is a large multimodal model (currently only accepting text inputs and emitting text outputs) that is optimized for chat but works well for traditional completions tasks. Snapshot of gpt-4 with a longer context length of 32,768 tokens from March 14th 2023.
767
- creator_organization: OpenAI
768
- access: limited
769
- release_date: 2023-03-14
770
- - name: openai/gpt-4-0613
771
- display_name: gpt-4-0613
772
- description: GPT-4 is a large multimodal model (currently only accepting text inputs and emitting text outputs) that is optimized for chat but works well for traditional completions tasks. Snapshot of gpt-4 from 2023-06-13.
773
- creator_organization: OpenAI
774
- access: limited
775
- release_date: 2023-06-13
776
- - name: openai/gpt-4-32k-0613
777
- display_name: gpt-4-32k-0613
778
- description: GPT-4 is a large multimodal model (currently only accepting text inputs and emitting text outputs) that is optimized for chat but works well for traditional completions tasks. Snapshot of gpt-4 with a longer context length of 32,768 tokens from 2023-06-13.
779
- creator_organization: OpenAI
780
- access: limited
781
- release_date: 2023-06-13
782
- - name: openai/code-davinci-002
783
- display_name: code-davinci-002
784
- description: Codex-style model that is designed for pure code-completion tasks ([docs](https://beta.openai.com/docs/models/codex)).
785
- creator_organization: OpenAI
786
- access: limited
787
- - name: openai/code-davinci-001
788
- display_name: code-davinci-001
789
- description: code-davinci-001 model
790
- creator_organization: OpenAI
791
- access: limited
792
- todo: true
793
- - name: openai/code-cushman-001
794
- display_name: code-cushman-001 (12B)
795
- description: Codex-style model that is a stronger, multilingual version of the Codex (12B) model in the [Codex paper](https://arxiv.org/pdf/2107.03374.pdf).
796
- creator_organization: OpenAI
797
- access: limited
798
- - name: openai/gpt-3.5-turbo-0301
799
- display_name: gpt-3.5-turbo-0301
800
- description: Sibling model of text-davinci-003 is optimized for chat but works well for traditional completions tasks as well. Snapshot from 2023-03-01.
801
- creator_organization: OpenAI
802
- access: limited
803
- release_date: 2023-03-01
804
- - name: openai/gpt-3.5-turbo-0613
805
- display_name: gpt-3.5-turbo-0613
806
- description: Sibling model of text-davinci-003 is optimized for chat but works well for traditional completions tasks as well. Snapshot from 2023-06-13.
807
- creator_organization: OpenAI
808
- access: limited
809
- release_date: 2023-06-13
810
- - name: openai/gpt-3.5-turbo-16k-0613
811
- display_name: gpt-3.5-turbo-16k-0613
812
- description: Sibling model of text-davinci-003 is optimized for chat but works well for traditional completions tasks as well. Snapshot from 2023-06-13 with a longer context length of 16,384 tokens.
813
- creator_organization: OpenAI
814
- access: limited
815
- release_date: 2023-06-13
816
- - name: openai/gpt-4-1106-preview
817
- display_name: gpt-4-1106-preview
818
- description: GPT-4 Turbo (preview) is a large multimodal model that is optimized for chat but works well for traditional completions tasks. The model is cheaper and faster than the original GPT-4 model. Preview snapshot from November 6, 2023.
819
- creator_organization: OpenAI
820
- access: limited
821
- release_date: 2023-11-06
822
-
823
- # Together
824
- - name: together/Together-gpt-JT-6B-v1
825
- display_name: GPT-JT (6B)
826
- description: GPT-JT (6B parameters) is a fork of GPT-J ([blog post](https://www.together.xyz/blog/releasing-v1-of-gpt-jt-powered-by-open-source-ai)).
827
- creator_organization: Together
828
- access: open
829
- num_parameters: 6700000000
830
- release_date: 2022-11-29
831
- todo: true
832
- - name: together/gpt-neoxt-chat-base-20b
833
- display_name: GPT-NeoXT-Chat-Base (20B)
834
- description: GPT-NeoXT-Chat-Base (20B) is fine-tuned from GPT-NeoX, serving as a base model for developing open-source chatbots.
835
- creator_organization: Together
836
- access: open
837
- num_parameters: 20000000000
838
- release_date: 2023-03-08
839
- todo: true
840
- - name: together/redpajama-incite-base-3b-v1
841
- display_name: RedPajama-INCITE-Base-v1 (3B)
842
- description: RedPajama-INCITE-Base-v1 (3B parameters) is a 3 billion base model that aims to replicate the LLaMA recipe as closely as possible.
843
- creator_organization: Together
844
- access: open
845
- num_parameters: 3000000000
846
- release_date: 2023-05-05
847
- - name: together/redpajama-incite-instruct-3b-v1
848
- display_name: RedPajama-INCITE-Instruct-v1 (3B)
849
- description: RedPajama-INCITE-Instruct-v1 (3B parameters) is a model fine-tuned for few-shot applications on the data of GPT-JT. It is built from RedPajama-INCITE-Base-v1 (3B), a 3 billion base model that aims to replicate the LLaMA recipe as closely as possible.
850
- creator_organization: Together
851
- access: open
852
- num_parameters: 3000000000
853
- release_date: 2023-05-05
854
- todo: true
855
- - name: together/redpajama-incite-chat-3b-v1
856
- display_name: RedPajama-INCITE-Chat-v1 (3B)
857
- description: RedPajama-INCITE-Chat-v1 (3B parameters) is a model fine-tuned on OASST1 and Dolly2 to enhance chatting ability. It is built from RedPajama-INCITE-Base-v1 (3B), a 3 billion base model that aims to replicate the LLaMA recipe as closely as possible.
858
- creator_organization: Together
859
- access: open
860
- num_parameters: 3000000000
861
- release_date: 2023-05-05
862
- todo: true
863
- - name: together/redpajama-incite-base-7b
864
- display_name: RedPajama-INCITE-Base (7B)
865
- description: RedPajama-INCITE-Base (7B parameters) is a 7 billion base model that aims to replicate the LLaMA recipe as closely as possible.
866
- creator_organization: Together
867
- access: open
868
- num_parameters: 7000000000
869
- release_date: 2023-05-05
870
- todo: true
871
- - name: together/redpajama-incite-instruct-7b
872
- display_name: RedPajama-INCITE-Instruct (7B)
873
- description: RedPajama-INCITE-Instruct (7B parameters) is a model fine-tuned for few-shot applications on the data of GPT-JT. It is built from RedPajama-INCITE-Base (7B), a 7 billion base model that aims to replicate the LLaMA recipe as closely as possible.
874
- creator_organization: Together
875
- access: open
876
- num_parameters: 7000000000
877
- release_date: 2023-05-05
878
- todo: true
879
-
880
- # MosaicML
881
- - name: mosaicml/mpt-7b
882
- display_name: MPT (7B)
883
- description: MPT (7B) is a Transformer trained from scratch on 1T tokens of text and code.
884
- creator_organization: MosaicML
885
- access: open
886
- num_parameters: 6700000000
887
- release_date: 2023-05-05
888
- - name: mosaicml/mpt-7b-chat
889
- display_name: MPT-Chat (7B)
890
- description: MPT-Chat (7B) is a chatbot-like model for dialogue generation. It is built by finetuning MPT (30B) , a Transformer trained from scratch on 1T tokens of text and code.
891
- creator_organization: MosaicML
892
- access: open
893
- num_parameters: 6700000000
894
- release_date: 2023-05-05
895
- todo: true
896
- - name: mosaicml/mpt-instruct-7b
897
- display_name: MPT-Instruct (7B)
898
- description: MPT-Instruct (7B) is a model for short-form instruction following. It is built by finetuning MPT (30B), a Transformer trained from scratch on 1T tokens of text and code.
899
- creator_organization: MosaicML
900
- access: open
901
- num_parameters: 6700000000
902
- release_date: 2023-05-05
903
- - name: mosaicml/mpt-30b
904
- display_name: MPT (30B)
905
- description: MPT (30B) is a Transformer trained from scratch on 1T tokens of text and code.
906
- creator_organization: MosaicML
907
- access: open
908
- num_parameters: 30000000000
909
- release_date: 2023-06-22
910
- - name: mosaicml/mpt-30b-chat
911
- display_name: MPT-Chat (30B)
912
- description: MPT-Chat (30B) is a chatbot-like model for dialogue generation. It is built by finetuning MPT (30B), a Transformer trained from scratch on 1T tokens of text and code.
913
- creator_organization: MosaicML
914
- access: open
915
- num_parameters: 30000000000
916
- release_date: 2023-06-22
917
- todo: true
918
- - name: mosaicml/mpt-instruct-30b
919
- display_name: MPT-Instruct (30B)
920
- description: MPT-Instruct (30B) is a model for short-form instruction following. It is built by finetuning MPT (30B), a Transformer trained from scratch on 1T tokens of text and code.
921
- creator_organization: MosaicML
922
- access: open
923
- num_parameters: 30000000000
924
- release_date: 2023-06-22
925
-
926
- # TII UAE
927
- - name: tiiuae/falcon-7b
928
- display_name: Falcon (7B)
929
- description: Falcon-7B is a 7B parameters causal decoder-only model built by TII and trained on 1,500B tokens of RefinedWeb enhanced with curated corpora.
930
- creator_organization: TII UAE
931
- access: open
932
- num_parameters: 7000000000
933
- release_date: 2023-03-15
934
- - name: tiiuae/falcon-7b-instruct
935
- display_name: Falcon-Instruct (7B)
936
- description: Falcon-7B-Instruct is a 7B parameters causal decoder-only model built by TII based on Falcon-7B and finetuned on a mixture of chat/instruct datasets.
937
- creator_organization: TII UAE
938
- access: open
939
- num_parameters: 7000000000
940
- release_date: 2023-03-15
941
- - name: tiiuae/falcon-40b
942
- display_name: Falcon (40B)
943
- description: Falcon-40B is a 40B parameters causal decoder-only model built by TII and trained on 1,500B tokens of RefinedWeb enhanced with curated corpora.
944
- creator_organization: TII UAE
945
- access: open
946
- num_parameters: 40000000000
947
- release_date: 2023-05-25
948
- - name: tiiuae/falcon-40b-instruct
949
- display_name: Falcon-Instruct (40B)
950
- description: Falcon-40B-Instruct is a 40B parameters causal decoder-only model built by TII based on Falcon-7B and finetuned on a mixture of chat/instruct datasets.
951
- creator_organization: TII UAE
952
- access: open
953
- num_parameters: 40000000000
954
- release_date: 2023-05-25
955
-
956
- # Salesforce
957
- - name: together/codegen
958
- display_name: CodeGen (16B)
959
- description: CodeGen (16B parameters) is an open dense code model trained for multi-turn program synthesis ([blog](https://arxiv.org/pdf/2203.13474.pdf)).
960
- creator_organization: Tsinghua
961
- access: open
962
- num_parameters: 16000000000
963
- release_date: 2022-03-25
964
- todo: true
965
-
966
- # Tsinghua
967
- - name: together/glm
968
- display_name: GLM (130B)
969
- description: GLM (130B parameters) is an open bilingual (English & Chinese) bidirectional dense model that was trained using General Language Model (GLM) procedure ([paper](https://arxiv.org/pdf/2210.02414.pdf)).
970
- creator_organization: Tsinghua
971
- access: open
972
- num_parameters: 130000000000
973
- release_date: 2022-08-04
974
-
975
- - name: together/codegeex
976
- display_name: CodeGeeX (13B)
977
- description: CodeGeeX (13B parameters) is an open dense code model trained on more than 20 programming languages on a corpus of more than 850B tokens ([blog](http://keg.cs.tsinghua.edu.cn/codegeex/)).
978
- creator_organization: Tsinghua
979
- access: open
980
- num_parameters: 13000000000
981
- release_date: 2022-09-19
982
- todo: true
983
-
984
- # Writer
985
- - name: writer/palmyra-base
986
- display_name: Palmyra Base (5B)
987
- description: Palmyra Base (5B)
988
- creator_organization: Writer
989
- access: limited
990
- num_parameters: 5000000000
991
- release_date: 2022-10-13
992
- - name: writer/palmyra-large
993
- display_name: Palmyra Large (20B)
994
- description: Palmyra Large (20B)
995
- creator_organization: Writer
996
- access: limited
997
- num_parameters: 20000000000
998
- release_date: 2022-12-23
999
- - name: writer/palmyra-instruct-30
1000
- display_name: InstructPalmyra (30B)
1001
- description: InstructPalmyra (30B parameters) is trained using reinforcement learning techniques based on feedback from humans.
1002
- creator_organization: Writer
1003
- access: limited
1004
- num_parameters: 30000000000
1005
- release_date: 2023-02-16
1006
- - name: writer/palmyra-e
1007
- display_name: Palmyra E (30B)
1008
- description: Palmyra E (30B)
1009
- creator_organization: Writer
1010
- access: limited
1011
- num_parameters: 30000000000
1012
- release_date: 2023-03-03
1013
- - name: writer/silk-road
1014
- display_name: Silk Road (35B)
1015
- description: Silk Road (35B)
1016
- creator_organization: Writer
1017
- access: limited
1018
- num_parameters: 35000000000
1019
- release_date: 2023-04-13
1020
- - name: writer/palmyra-x
1021
- display_name: Palmyra X (43B)
1022
- description: Palmyra-X (43B parameters) is trained to adhere to instructions using human feedback and utilizes a technique called multiquery attention. Furthermore, a new feature called 'self-instruct' has been introduced, which includes the implementation of an early stopping criteria specifically designed for minimal instruction tuning ([paper](https://dev.writer.com/docs/becoming-self-instruct-introducing-early-stopping-criteria-for-minimal-instruct-tuning)).
1023
- creator_organization: Writer
1024
- access: limited
1025
- num_parameters: 43000000000
1026
- release_date: 2023-06-11
1027
- - name: writer/palmyra-x-v2
1028
- display_name: Palmyra X V2 (33B)
1029
- description: Palmyra-X V2 (33B parameters) is a Transformer-based model, which is trained on extremely large-scale pre-training data. The pre-training data more than 2 trillion tokens types are diverse and cover a wide range of areas, used FlashAttention-2.
1030
- creator_organization: Writer
1031
- access: limited
1032
- num_parameters: 33000000000
1033
- release_date: 2023-12-01
1034
- - name: writer/palmyra-x-v3
1035
- display_name: Palmyra X V3 (72B)
1036
- description: Palmyra-X V3 (72B parameters) is a Transformer-based model, which is trained on extremely large-scale pre-training data. It is trained via unsupervised learning and DPO and use multiquery attention.
1037
- creator_organization: Writer
1038
- access: limited
1039
- num_parameters: 72000000000
1040
- release_date: 2023-12-01
1041
- - name: writer/palmyra-x-32k
1042
- display_name: Palmyra X-32K (33B)
1043
- description: Palmyra-X-32K (33B parameters) is a Transformer-based model, which is trained on large-scale pre-training data. The pre-training data types are diverse and cover a wide range of areas. These data types are used in conjunction and the alignment mechanism to extend context window.
1044
- creator_organization: Writer
1045
- access: limited
1046
- num_parameters: 33000000000
1047
- release_date: 2023-12-01
1048
-
1049
- # Yandex
1050
- - name: together/yalm
1051
- display_name: YaLM (100B)
1052
- description: YaLM (100B parameters) is an autoregressive language model trained on English and Russian text ([GitHub](https://github.com/yandex/YaLM-100B)).
1053
- creator_organization: Yandex
1054
- access: open
1055
- num_parameters: 100000000000
1056
- release_date: 2022-06-23
1057
-
1058
- # NVIDIA
1059
- - name: nvidia/megatron-gpt2
1060
- display_name: Megatron GPT2
1061
- description: GPT-2 implemented in Megatron-LM ([paper](https://arxiv.org/abs/1909.08053)).
1062
- creator_organization: NVIDIA
1063
- access: open
1064
- todo: true
1065
-
1066
- ############################################################
1067
- adapter:
1068
- - name: method
1069
- description: The high-level strategy for converting instances into a prompt for the language model.
1070
- values:
1071
- - name: generation
1072
- description: Given the input, the model generates the output free-form.
1073
- - name: multiple_choice_joint
1074
- description: Given the input, the model selects from multiple-choice options (A., B., C., D., E.).
1075
- - name: multiple_choice_separate_original
1076
- description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability.
1077
- - name: multiple_choice_separate_calibrated
1078
- description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability when calibrated by answer choice probability.
1079
- - name: language_modeling
1080
- description: Given the input, the model assigns the sequence a probability.
1081
- - name: instructions
1082
- description: The description of the task that is included at the very beginning of the prompt.
1083
- - name: global_prefix
1084
- description: The string that is prepended to the prompt.
1085
- - name: instance_prefix
1086
- description: The string that is included before each instance (e.g., '\n\n').
1087
- - name: input_prefix
1088
- description: The string that is included before each input (e.g., 'Question:').
1089
- - name: input_suffix
1090
- description: The string that is included after each input (e.g., '\n').
1091
- - name: reference_prefix
1092
- description: The string that is included before each reference (for multiple-choice questions).
1093
- - name: reference_suffix
1094
- description: The string that is included after each reference (for multiple-choice questions).
1095
- - name: output_prefix
1096
- description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
1097
- - name: output_suffix
1098
- description: The string that is included after the correct answer/predicted output (e.g., '\n').
1099
- - name: substitutions
1100
- description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
1101
- - name: max_train_instances
1102
- description: Maximum number of training instances to include in the prompt (currently by randomly sampling).
1103
- - name: max_eval_instances
1104
- description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
1105
- - name: num_outputs
1106
- description: Maximum number of possible outputs to generate by sampling multiple outputs.
1107
- - name: num_train_trials
1108
- description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
1109
- - name: sample_train
1110
- description: If true, randomly sample N training examples; if false, select N consecutive training examples
1111
- - name: model
1112
- description: DEPRECATED. Name of the language model (<creator_organization>/<model name>) to send requests to.
1113
- - name: model_deployment
1114
- description: Name of the language model (<host_organization>/<model name>) to send requests to.
1115
- - name: temperature
1116
- description: Temperature parameter used in generation.
1117
- - name: max_tokens
1118
- description: Maximum number of tokens to generate.
1119
- - name: stop_sequences
1120
- description: List of sequences, where we stop generation if we encounter any of them.
1121
- - name: random
1122
- description: Random seed (string), which guarantees reproducibility.
1123
- - name: multi_label
1124
- description: If true, for instances with multiple correct reference, the gold answer should be considered to be all of the correct references rather than any of the correct references.
1125
-
3
+ # For backwards compatibility with older versions of HELM.
4
+ # TODO: Remove this after 2024-09-01.
5
+ adapter: []
1126
6
  ############################################################
1127
7
  metrics:
1128
8
  # Infrastructure metrics:
@@ -1632,6 +512,50 @@ metrics:
1632
512
  lower_is_better: false
1633
513
  # CLEVA Chinese bias, copyright and toxicity metrics share the same name as the original HELM metrics
1634
514
 
515
+
516
+ # DecodingTrust Fairness Metrics
517
+ - name: equalized_odds_difference
518
+ display_name: Equalized Odds Difference
519
+ short_display_name: Equalized Odds
520
+ description: The rate at which the model refuses to answer an OoD question.
521
+ lower_is_better: true
522
+
523
+ - name: demographic_parity_difference
524
+ display_name: Demographic Parity Difference
525
+ short_display_name: Demographic Parity
526
+ description: The demographic parity difference measures the difference between the probability of positive predictions conditioned on sensitive attribute A = 1 and that conditioned on A = 0.
527
+ lower_is_better: true
528
+
529
+
530
+ # DecodingTrust OoD Metrics
531
+ - name: ood_knowledge_rr
532
+ display_name: OOD Knowledge Refusal Rate
533
+ short_display_name: OoD-RR
534
+ description: The rate at which the model refuses to answer an OoD question.
535
+ lower_is_better: false
536
+
537
+ - name: ood_knowledge_macc
538
+ display_name: OOD Meaningful Answer Accuracy (Exclude Refusal)
539
+ short_display_name: OoD-MACC
540
+ description: MACC indicates the percentage of correct answers when the model successfully generates meaningful responses by excluding outputs that are refused to answer
541
+ lower_is_better: false
542
+
543
+
544
+ # DecodingTrust Stereotype Bias Metrics
545
+
546
+ - name: decodingtrust_stereotype_bias
547
+ display_name: DecodingTrust Bias Score
548
+ short_display_name: Bias Score
549
+ description: 1 - agreementIndex (Section 4.2 of DecodingTrust)
550
+ lower_is_better: false
551
+
552
+ - name: decodingtrust_stereotype_bias_rejection_rate
553
+ display_name: Stereotype Bias Rejection Rate
554
+ short_display_name: Bias Rejection Rate
555
+ description: Rejection rate of stereotype prompts
556
+ lower_is_better: false
557
+
558
+
1635
559
  ############################################################
1636
560
  perturbations:
1637
561
  - name: robustness
@@ -1915,6 +839,31 @@ metric_groups:
1915
839
  - name: chinese_bleu_1
1916
840
  split: ${main_split}
1917
841
 
842
+ - name: decodingtrust_fairness_metrics
843
+ display_name: DecodingTrust Fairness
844
+ metrics:
845
+ - name: equalized_odds_difference
846
+ split: ${main_split}
847
+ - name: demographic_parity_difference
848
+ split: ${main_split}
849
+
850
+ - name: decodingtrust_ood_metrics
851
+ display_name: DecodingTrust OOD Accuracy
852
+ metrics:
853
+ - name: ood_knowledge_rr
854
+ split: ${main_split}
855
+ - name: ood_knowledge_macc
856
+ split: ${main_split}
857
+
858
+ - name: decodingtrust_stereotype_bias_metrics
859
+ display_name: DecodingTrust Stereotype Bias
860
+ metrics:
861
+ - name: decodingtrust_stereotype_bias
862
+ split: ${main_split}
863
+ - name: decodingtrust_stereotype_bias_rejection_rate
864
+ split: ${main_split}
865
+
866
+
1918
867
  ############################################################
1919
868
  run_groups:
1920
869
  ## Top-level
@@ -2097,6 +1046,20 @@ run_groups:
2097
1046
  main_name: none
2098
1047
  main_split: none
2099
1048
 
1049
+ - name: decodingtrust
1050
+ display_name: DecodingTrust
1051
+ description: A comprehensive benchmark of the trustworthiness of large language models [(Wang et al. 2023)](https://decodingtrust.github.io/)
1052
+ category: Core scenarios
1053
+ subgroups:
1054
+ - decodingtrust_adv_robustness
1055
+ - decodingtrust_adv_demonstration
1056
+ - decodingtrust_ood_robustness
1057
+ - decodingtrust_fairness
1058
+ - decodingtrust_privacy
1059
+ - decodingtrust_machine_ethics
1060
+ - decodingtrust_toxicity_prompts
1061
+ - decodingtrust_stereotype_bias
1062
+
2100
1063
  ### Ablations
2101
1064
  - name: ablation_in_context
2102
1065
  display_name: Vary number of in-context examples
@@ -2720,23 +1683,6 @@ run_groups:
2720
1683
  when: n/a
2721
1684
  language: synthetic
2722
1685
 
2723
- - name: numeracy
2724
- display_name: Numerical reasoning
2725
- description: Scenario introduced in this work to test numerical reasoning via symbolic regression.
2726
- metric_groups:
2727
- - accuracy
2728
- - efficiency
2729
- - general_information
2730
- environment:
2731
- main_name: absolute_value_difference
2732
- main_split: test
2733
- taxonomy:
2734
- task: next-word prediction
2735
- what: Dyck formal language
2736
- who: n/a
2737
- when: n/a
2738
- language: synthetic
2739
-
2740
1686
  - name: synthetic_reasoning
2741
1687
  display_name: Synthetic reasoning (abstract symbols)
2742
1688
  description: Synthetic reasoning tasks defined using abstract symbols based on LIME [(Wu et al., 2021)](https://proceedings.mlr.press/v139/wu21c.html).
@@ -3873,3 +2819,176 @@ run_groups:
3873
2819
  task: user-facing tasks
3874
2820
  language: English dialects
3875
2821
  todo: true
2822
+
2823
+
2824
+ # DecodingTrust scenarios
2825
+ - name: decodingtrust_adv_robustness
2826
+ display_name: DecodingTrust - AdvGLUE++
2827
+ short_display_name: AdvGLUE++
2828
+ description: Adversarial perturbations of the GLUE dataset generated against open-source LLMs including Alpaca, Vicuna, and Stable-Vicuna
2829
+ metric_groups:
2830
+ - accuracy
2831
+ - calibration
2832
+ - efficiency
2833
+ - general_information
2834
+ environment:
2835
+ main_name: quasi_exact_match
2836
+ main_split: test
2837
+ taxonomy:
2838
+ task: text classification
2839
+ what: "?"
2840
+ who: "?"
2841
+ when: "?"
2842
+ language: English
2843
+ todo: true
2844
+
2845
+ - name: decodingtrust_adv_demonstration
2846
+ display_name: DecodingTrust - Adversarial Demonstrations
2847
+ short_display_name: AdvDemo
2848
+ description: Robustness analysis of LM generations when facing adversarial demonstrations
2849
+ metric_groups:
2850
+ - accuracy
2851
+ - calibration
2852
+ - efficiency
2853
+ - general_information
2854
+ environment:
2855
+ main_name: quasi_exact_match
2856
+ main_split: test
2857
+ taxonomy:
2858
+ task: text classification
2859
+ what: "?"
2860
+ who: "?"
2861
+ when: "?"
2862
+ language: English
2863
+
2864
+ - name: decodingtrust_ood_robustness
2865
+ display_name: DecodingTrust - OoD Robustness
2866
+ short_display_name: OoD
2867
+ description: Style perturbations of GLUE datasets (OoD styles) and out-of-scope OoD knowledge evaluations
2868
+ metric_groups:
2869
+ - accuracy
2870
+ - calibration
2871
+ - efficiency
2872
+ - general_information
2873
+ - decodingtrust_ood_metrics
2874
+ environment:
2875
+ main_name: quasi_exact_match
2876
+ main_split: test
2877
+ taxonomy:
2878
+ task: text classification
2879
+ what: "?"
2880
+ who: "?"
2881
+ when: "?"
2882
+ language: English
2883
+
2884
+ - name: decodingtrust_fairness
2885
+ display_name: DecodingTrust - Fairness
2886
+ short_display_name: Fairness
2887
+ description: Fairness analysis of LLMs
2888
+ metric_groups:
2889
+ - accuracy
2890
+ - calibration
2891
+ - efficiency
2892
+ - general_information
2893
+ - decodingtrust_fairness_metrics
2894
+ environment:
2895
+ main_name: quasi_exact_match
2896
+ main_split: test
2897
+ taxonomy:
2898
+ task: text classification
2899
+ what: "?"
2900
+ who: "?"
2901
+ when: "?"
2902
+ language: English
2903
+
2904
+ - name: decodingtrust_privacy
2905
+ display_name: DecodingTrust - Privacy
2906
+ short_display_name: Privacy
2907
+ description: Evaluation of the privacy understanding and privacy preserving properties of LLMs
2908
+ metric_groups:
2909
+ - accuracy
2910
+ - calibration
2911
+ - efficiency
2912
+ - general_information
2913
+ environment:
2914
+ main_name: quasi_exact_match
2915
+ main_split: test
2916
+ taxonomy:
2917
+ task: text classification
2918
+ what: "?"
2919
+ who: "?"
2920
+ when: "?"
2921
+ language: English
2922
+
2923
+ - name: decodingtrust_machine_ethics
2924
+ display_name: DecodingTrust - Ethics
2925
+ short_display_name: Ethics
2926
+ description: Evaluation of the understanding of ethical behaviors of LLMs
2927
+ metric_groups:
2928
+ - accuracy
2929
+ - calibration
2930
+ - efficiency
2931
+ - general_information
2932
+ environment:
2933
+ main_name: quasi_exact_match
2934
+ main_split: test
2935
+ taxonomy:
2936
+ task: text classification
2937
+ what: "?"
2938
+ who: "?"
2939
+ when: "?"
2940
+ language: English
2941
+
2942
+ - name: decodingtrust_toxicity_prompts
2943
+ display_name: DecodingTrust - Toxicity
2944
+ short_display_name: Toxicity
2945
+ description: Evaluation of the tendency of LLMs to generate toxic content when given toxic and non-toxic prompts
2946
+ metric_groups:
2947
+ - toxicity
2948
+ - bias
2949
+ - efficiency
2950
+ - general_information
2951
+ environment:
2952
+ main_split: test
2953
+ taxonomy:
2954
+ task: "?"
2955
+ what: n/a
2956
+ who: n/a
2957
+ when: n/a
2958
+ language: synthetic
2959
+
2960
+ - name: decodingtrust_stereotype_bias
2961
+ display_name: DecodingTrust - Stereotype Bias
2962
+ short_display_name: Stereotype
2963
+ description: Manually crafted stereotype user prompts from DecodingTrust
2964
+ metric_groups:
2965
+ - toxicity
2966
+ - bias
2967
+ - efficiency
2968
+ - general_information
2969
+ - decodingtrust_stereotype_bias_metrics
2970
+ environment:
2971
+ main_split: test
2972
+ taxonomy:
2973
+ task: "?"
2974
+ what: n/a
2975
+ who: n/a
2976
+ when: n/a
2977
+ language: synthetic
2978
+
2979
+ - name: thai_exam
2980
+ display_name: Thai Exam
2981
+ short_display_name: ThaiExam
2982
+ description: A benchmark comprising Thai multiple-choice examinations.
2983
+ metric_groups:
2984
+ - accuracy
2985
+ - general_information
2986
+ environment:
2987
+ main_name: exact_match
2988
+ main_split: test
2989
+ taxonomy:
2990
+ task: question answering
2991
+ what: "?"
2992
+ who: "?"
2993
+ when: "?"
2994
+ language: Thai