crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546) hide show
  1. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +37 -2
  5. helm/benchmark/adaptation/adapters/adapter.py +4 -42
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
  9. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
  10. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
  11. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
  12. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  13. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  14. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
  15. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
  16. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
  17. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  18. helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
  19. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
  20. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
  21. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  22. helm/benchmark/adaptation/prompt.py +7 -1
  23. helm/benchmark/adaptation/request_state.py +6 -1
  24. helm/benchmark/adaptation/scenario_state.py +6 -2
  25. helm/benchmark/annotation/annotator.py +43 -0
  26. helm/benchmark/annotation/annotator_factory.py +61 -0
  27. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  28. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  29. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  30. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  31. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  32. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  33. helm/benchmark/annotation_executor.py +124 -0
  34. helm/benchmark/augmentations/cleva_perturbation.py +7 -14
  35. helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
  36. helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
  37. helm/benchmark/augmentations/data_augmenter.py +0 -2
  38. helm/benchmark/augmentations/dialect_perturbation.py +2 -2
  39. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  40. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  41. helm/benchmark/augmentations/gender_perturbation.py +3 -3
  42. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  43. helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
  44. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  45. helm/benchmark/augmentations/person_name_perturbation.py +0 -7
  46. helm/benchmark/augmentations/perturbation.py +20 -7
  47. helm/benchmark/augmentations/perturbation_description.py +1 -1
  48. helm/benchmark/augmentations/space_perturbation.py +2 -2
  49. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  50. helm/benchmark/augmentations/synonym_perturbation.py +2 -2
  51. helm/benchmark/augmentations/test_perturbation.py +11 -7
  52. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  53. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  54. helm/benchmark/config_registry.py +38 -0
  55. helm/benchmark/executor.py +46 -16
  56. helm/benchmark/huggingface_registration.py +37 -7
  57. helm/benchmark/metrics/basic_metrics.py +172 -641
  58. helm/benchmark/metrics/bbq_metrics.py +3 -4
  59. helm/benchmark/metrics/bias_metrics.py +6 -6
  60. helm/benchmark/metrics/classification_metrics.py +11 -8
  61. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  62. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  63. helm/benchmark/metrics/code_metrics.py +4 -3
  64. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  65. helm/benchmark/metrics/common_metric_specs.py +167 -0
  66. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  67. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  68. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  69. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  70. helm/benchmark/metrics/disinformation_metrics.py +6 -112
  71. helm/benchmark/metrics/dry_run_metrics.py +5 -3
  72. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  73. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  74. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  75. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  76. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  77. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  78. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  79. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  80. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  81. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  82. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  83. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  84. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  85. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  86. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  87. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  88. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  89. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  90. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  91. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  92. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  93. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  94. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  95. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  96. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  97. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  98. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  99. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  100. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  101. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  102. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  103. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  104. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  105. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  106. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  107. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  108. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  109. helm/benchmark/metrics/machine_translation_metrics.py +5 -5
  110. helm/benchmark/metrics/metric.py +93 -172
  111. helm/benchmark/metrics/metric_name.py +0 -1
  112. helm/benchmark/metrics/metric_service.py +16 -0
  113. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  114. helm/benchmark/metrics/ranking_metrics.py +6 -7
  115. helm/benchmark/metrics/reference_metric.py +148 -0
  116. helm/benchmark/metrics/summac/model_summac.py +0 -2
  117. helm/benchmark/metrics/summarization_metrics.py +8 -8
  118. helm/benchmark/metrics/test_classification_metrics.py +9 -6
  119. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  120. helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
  121. helm/benchmark/metrics/test_metric.py +2 -2
  122. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
  123. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
  124. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  125. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
  126. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
  127. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  128. helm/benchmark/metrics/toxicity_utils.py +23 -0
  129. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  130. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  131. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  132. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  133. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  134. helm/benchmark/model_deployment_registry.py +164 -41
  135. helm/benchmark/model_metadata_registry.py +181 -35
  136. helm/benchmark/multi_gpu_runner.py +133 -0
  137. helm/benchmark/presentation/contamination.py +3 -3
  138. helm/benchmark/presentation/create_plots.py +8 -7
  139. helm/benchmark/presentation/run_display.py +50 -17
  140. helm/benchmark/presentation/schema.py +28 -46
  141. helm/benchmark/presentation/summarize.py +213 -96
  142. helm/benchmark/presentation/table.py +8 -8
  143. helm/benchmark/presentation/test_contamination.py +2 -2
  144. helm/benchmark/presentation/test_run_entry.py +14 -9
  145. helm/benchmark/presentation/test_summarize.py +5 -0
  146. helm/benchmark/run.py +66 -54
  147. helm/benchmark/run_expander.py +342 -31
  148. helm/benchmark/run_spec.py +93 -0
  149. helm/benchmark/run_spec_factory.py +162 -0
  150. helm/benchmark/run_specs/__init__.py +0 -0
  151. helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
  152. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  153. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  154. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  155. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  156. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  157. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  158. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  159. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  160. helm/benchmark/runner.py +116 -69
  161. helm/benchmark/runner_config_registry.py +21 -0
  162. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  163. helm/benchmark/scenarios/bold_scenario.py +2 -2
  164. helm/benchmark/scenarios/cleva_scenario.py +43 -46
  165. helm/benchmark/scenarios/code_scenario.py +3 -2
  166. helm/benchmark/scenarios/commonsense_scenario.py +171 -191
  167. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  168. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  169. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  170. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  171. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  172. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  173. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  174. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  175. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  176. helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
  177. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  178. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  179. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  180. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  181. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  182. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  183. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  184. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  185. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  186. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  187. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  188. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  189. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  190. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  191. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  192. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  193. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  194. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  195. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  196. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  197. helm/benchmark/scenarios/legalbench_scenario.py +123 -0
  198. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  199. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  200. helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
  201. helm/benchmark/scenarios/math_scenario.py +19 -2
  202. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  203. helm/benchmark/scenarios/numeracy_scenario.py +3 -3
  204. helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
  205. helm/benchmark/scenarios/raft_scenario.py +2 -6
  206. helm/benchmark/scenarios/scenario.py +14 -2
  207. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  208. helm/benchmark/scenarios/test_math_scenario.py +22 -0
  209. helm/benchmark/scenarios/test_scenario.py +6 -3
  210. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  211. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  212. helm/benchmark/scenarios/the_pile_scenario.py +6 -7
  213. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  214. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  215. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  216. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  217. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  218. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  219. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  220. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  221. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  222. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  223. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  224. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  225. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  226. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  227. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  228. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  229. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  230. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  231. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  232. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  233. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  234. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  235. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  236. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  237. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
  238. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  239. helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
  240. helm/benchmark/server.py +59 -2
  241. helm/benchmark/slurm_jobs.py +12 -0
  242. helm/benchmark/slurm_runner.py +79 -51
  243. helm/benchmark/static/benchmarking.js +3 -4
  244. helm/benchmark/static/contamination.yaml +1 -1
  245. helm/benchmark/static/images/organizations/together.png +0 -0
  246. helm/benchmark/static/json-urls.js +4 -0
  247. helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
  248. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  249. helm/benchmark/static/schema_lite.yaml +824 -0
  250. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  251. helm/benchmark/static/schema_unitxt.yaml +428 -0
  252. helm/benchmark/static/schema_vlm.yaml +576 -0
  253. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  254. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  255. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  256. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  257. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  258. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  259. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  260. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  261. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  262. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  263. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  264. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  265. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  266. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  267. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  268. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  269. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  270. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  271. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  272. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  273. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  274. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  275. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  276. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  277. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  278. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  279. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  280. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  281. helm/benchmark/static_build/config.js +4 -0
  282. helm/benchmark/static_build/index.html +20 -0
  283. helm/benchmark/test_data_preprocessor.py +3 -3
  284. helm/benchmark/test_model_deployment_definition.py +90 -0
  285. helm/benchmark/test_run_expander.py +1 -1
  286. helm/benchmark/tokenizer_config_registry.py +10 -14
  287. helm/benchmark/window_services/ai21_window_service.py +22 -33
  288. helm/benchmark/window_services/cohere_window_service.py +1 -63
  289. helm/benchmark/window_services/default_window_service.py +2 -35
  290. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  291. helm/benchmark/window_services/ice_window_service.py +0 -34
  292. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  293. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  294. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  295. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  296. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  297. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  298. helm/benchmark/window_services/local_window_service.py +21 -4
  299. helm/benchmark/window_services/no_decoding_window_service.py +32 -0
  300. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  301. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  302. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  303. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  304. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  305. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  306. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  307. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  308. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  309. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  310. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  311. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  312. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  313. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  314. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  315. helm/benchmark/window_services/test_utils.py +3 -2
  316. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  317. helm/benchmark/window_services/window_service.py +42 -0
  318. helm/benchmark/window_services/window_service_factory.py +24 -269
  319. helm/benchmark/window_services/yalm_window_service.py +0 -27
  320. helm/clients/__init__.py +0 -0
  321. helm/{proxy/clients → clients}/ai21_client.py +5 -12
  322. helm/clients/aleph_alpha_client.py +112 -0
  323. helm/{proxy/clients → clients}/anthropic_client.py +213 -24
  324. helm/clients/auto_client.py +215 -0
  325. helm/clients/bedrock_client.py +128 -0
  326. helm/clients/bedrock_utils.py +72 -0
  327. helm/{proxy/clients → clients}/client.py +67 -55
  328. helm/clients/clip_score_client.py +49 -0
  329. helm/clients/clip_scorers/__init__.py +0 -0
  330. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  331. helm/clients/clip_scorers/clip_scorer.py +50 -0
  332. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  333. helm/{proxy/clients → clients}/cohere_client.py +6 -17
  334. helm/clients/gcs_client.py +82 -0
  335. helm/{proxy/clients → clients}/google_client.py +7 -8
  336. helm/clients/google_translate_client.py +35 -0
  337. helm/{proxy/clients → clients}/http_model_client.py +6 -10
  338. helm/{proxy/clients → clients}/huggingface_client.py +134 -92
  339. helm/clients/image_generation/__init__.py +0 -0
  340. helm/clients/image_generation/adobe_vision_client.py +78 -0
  341. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  342. helm/clients/image_generation/cogview2/__init__.py +0 -0
  343. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  344. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  345. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  346. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  347. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  348. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  349. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  350. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  351. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  352. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  353. helm/clients/image_generation/cogview2_client.py +191 -0
  354. helm/clients/image_generation/dalle2_client.py +192 -0
  355. helm/clients/image_generation/dalle3_client.py +108 -0
  356. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  357. helm/clients/image_generation/dalle_mini/data.py +442 -0
  358. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  359. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  360. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  361. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  362. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  363. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  364. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  365. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  366. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  367. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  368. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  369. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  370. helm/clients/image_generation/dalle_mini_client.py +190 -0
  371. helm/clients/image_generation/deep_floyd_client.py +78 -0
  372. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  373. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  374. helm/clients/image_generation/lexica_client.py +86 -0
  375. helm/clients/image_generation/mindalle/__init__.py +0 -0
  376. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  377. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  378. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  379. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  380. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  381. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  382. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  383. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  384. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  385. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  386. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  387. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  388. helm/clients/image_generation/mindalle_client.py +115 -0
  389. helm/clients/image_generation/nudity_check_client.py +64 -0
  390. helm/clients/image_generation/together_image_generation_client.py +111 -0
  391. helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
  392. helm/{proxy/clients → clients}/megatron_client.py +13 -7
  393. helm/clients/mistral_client.py +134 -0
  394. helm/clients/moderation_api_client.py +109 -0
  395. helm/clients/open_lm_client.py +43 -0
  396. helm/clients/openai_client.py +302 -0
  397. helm/{proxy/clients → clients}/palmyra_client.py +15 -12
  398. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  399. helm/clients/simple_client.py +64 -0
  400. helm/{proxy/clients → clients}/test_auto_client.py +15 -15
  401. helm/clients/test_client.py +100 -0
  402. helm/clients/test_huggingface_client.py +70 -0
  403. helm/clients/test_simple_client.py +19 -0
  404. helm/{proxy/clients → clients}/test_together_client.py +23 -12
  405. helm/{proxy/clients → clients}/together_client.py +18 -71
  406. helm/clients/vertexai_client.py +391 -0
  407. helm/clients/vision_language/__init__.py +0 -0
  408. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  409. helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
  410. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  411. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  412. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  413. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  414. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  415. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  416. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  417. helm/clients/vision_language/open_flamingo_client.py +155 -0
  418. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  419. helm/clients/vllm_client.py +46 -0
  420. helm/common/cache.py +24 -179
  421. helm/common/cache_backend_config.py +47 -0
  422. helm/common/clip_score_request.py +41 -0
  423. helm/common/concurrency.py +32 -0
  424. helm/common/credentials_utils.py +28 -0
  425. helm/common/file_caches/__init__.py +0 -0
  426. helm/common/file_caches/file_cache.py +16 -0
  427. helm/common/file_caches/local_file_cache.py +61 -0
  428. helm/common/file_caches/test_local_file_cache.py +25 -0
  429. helm/common/file_upload_request.py +27 -0
  430. helm/common/general.py +29 -10
  431. helm/common/image_generation_parameters.py +25 -0
  432. helm/common/images_utils.py +24 -1
  433. helm/common/key_value_store.py +113 -0
  434. helm/common/media_object.py +13 -0
  435. helm/common/moderations_api_request.py +71 -0
  436. helm/common/mongo_key_value_store.py +88 -0
  437. helm/common/multimodal_request_utils.py +31 -0
  438. helm/common/nudity_check_request.py +29 -0
  439. helm/common/object_spec.py +2 -2
  440. helm/common/request.py +36 -27
  441. helm/common/test_general.py +6 -0
  442. helm/common/tokenization_request.py +6 -3
  443. helm/config/__init__.py +0 -0
  444. helm/config/model_deployments.yaml +1942 -0
  445. helm/config/model_metadata.yaml +2201 -0
  446. helm/config/tokenizer_configs.yaml +362 -0
  447. helm/proxy/accounts.py +31 -4
  448. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  449. helm/proxy/critique/model_critique_client.py +13 -5
  450. helm/proxy/example_queries.py +29 -17
  451. helm/proxy/retry.py +8 -2
  452. helm/proxy/server.py +77 -5
  453. helm/proxy/services/remote_service.py +31 -0
  454. helm/proxy/services/server_service.py +103 -20
  455. helm/proxy/services/service.py +34 -2
  456. helm/proxy/services/test_remote_service.py +7 -6
  457. helm/proxy/services/test_service.py +27 -18
  458. helm/proxy/test_accounts.py +32 -0
  459. helm/proxy/token_counters/auto_token_counter.py +37 -37
  460. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  461. helm/proxy/token_counters/token_counter.py +3 -5
  462. helm/py.typed +0 -0
  463. helm/tokenizers/__init__.py +0 -0
  464. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  465. helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
  466. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
  467. helm/tokenizers/auto_tokenizer.py +93 -0
  468. helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
  469. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  470. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  471. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
  472. helm/tokenizers/simple_tokenizer.py +33 -0
  473. helm/tokenizers/test_anthropic_tokenizer.py +82 -0
  474. helm/tokenizers/test_huggingface_tokenizer.py +136 -0
  475. helm/tokenizers/test_simple_tokenizer.py +33 -0
  476. helm/tokenizers/vertexai_tokenizer.py +97 -0
  477. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  478. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  479. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  480. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  481. crfm_helm-0.3.0.dist-info/RECORD +0 -396
  482. helm/benchmark/vlm_run_specs.py +0 -71
  483. helm/benchmark/window_services/anthropic_window_service.py +0 -68
  484. helm/benchmark/window_services/bloom_window_service.py +0 -35
  485. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  486. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  487. helm/benchmark/window_services/gptj_window_service.py +0 -38
  488. helm/benchmark/window_services/gptneox_window_service.py +0 -41
  489. helm/benchmark/window_services/http_model_window_service.py +0 -28
  490. helm/benchmark/window_services/huggingface_window_service.py +0 -59
  491. helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
  492. helm/benchmark/window_services/llama_window_service.py +0 -28
  493. helm/benchmark/window_services/luminous_window_service.py +0 -67
  494. helm/benchmark/window_services/megatron_window_service.py +0 -10
  495. helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
  496. helm/benchmark/window_services/openai_window_service.py +0 -13
  497. helm/benchmark/window_services/opt_window_service.py +0 -35
  498. helm/benchmark/window_services/palmyra_window_service.py +0 -45
  499. helm/benchmark/window_services/remote_window_service.py +0 -48
  500. helm/benchmark/window_services/santacoder_window_service.py +0 -27
  501. helm/benchmark/window_services/starcoder_window_service.py +0 -27
  502. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  503. helm/benchmark/window_services/t511b_window_service.py +0 -30
  504. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  505. helm/benchmark/window_services/ul2_window_service.py +0 -30
  506. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  507. helm/benchmark/window_services/wider_openai_window_service.py +0 -52
  508. helm/proxy/clients/aleph_alpha_client.py +0 -99
  509. helm/proxy/clients/auto_client.py +0 -461
  510. helm/proxy/clients/goose_ai_client.py +0 -100
  511. helm/proxy/clients/microsoft_client.py +0 -182
  512. helm/proxy/clients/openai_client.py +0 -206
  513. helm/proxy/clients/remote_model_registry.py +0 -28
  514. helm/proxy/clients/simple_client.py +0 -61
  515. helm/proxy/clients/test_anthropic_client.py +0 -63
  516. helm/proxy/clients/test_client.py +0 -31
  517. helm/proxy/clients/test_huggingface_client.py +0 -87
  518. helm/proxy/models.py +0 -963
  519. helm/proxy/test_models.py +0 -27
  520. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  521. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  522. helm/proxy/token_counters/free_token_counter.py +0 -12
  523. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  524. helm/proxy/token_counters/openai_token_counter.py +0 -22
  525. helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
  526. helm/proxy/token_counters/test_openai_token_counter.py +0 -79
  527. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  528. helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
  529. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  530. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  531. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  532. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  533. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  534. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  535. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  536. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  537. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  538. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  539. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  540. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  541. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  542. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  543. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  544. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  545. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  546. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -1,923 +1,4 @@
1
1
  ---
2
- ############################################################
3
- models:
4
- # AI21 Labs
5
- - name: ai21/j1-jumbo
6
- display_name: J1-Jumbo v1 (178B)
7
- description: Jurassic-1 Jumbo (178B parameters) ([docs](https://studio.ai21.com/docs/jurassic1-language-models/), [tech report](https://uploads-ssl.webflow.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf)).
8
- creator_organization: AI21 Labs
9
- access: limited
10
- num_parameters: 178000000000
11
- release_date: 2021-08-11
12
- - name: ai21/j1-large
13
- display_name: J1-Large v1 (7.5B)
14
- description: Jurassic-1 Large (7.5B parameters) ([docs](https://studio.ai21.com/docs/jurassic1-language-models/), [tech report](https://uploads-ssl.webflow.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf)).
15
- creator_organization: AI21 Labs
16
- access: limited
17
- num_parameters: 7500000000
18
- release_date: 2021-08-11
19
- - name: ai21/j1-grande
20
- display_name: J1-Grande v1 (17B)
21
- description: Jurassic-1 Grande (17B parameters) with a "few tweaks" to the training process ([docs](https://studio.ai21.com/docs/jurassic1-language-models/), [tech report](https://uploads-ssl.webflow.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf)).
22
- creator_organization: AI21 Labs
23
- access: limited
24
- num_parameters: 17000000000
25
- release_date: 2022-05-03
26
- - name: ai21/j1-grande-v2-beta
27
- display_name: J1-Grande v2 beta (17B)
28
- description: Jurassic-1 Grande v2 beta (17B parameters)
29
- creator_organization: AI21 Labs
30
- access: limited
31
- num_parameters: 17000000000
32
- release_date: 2022-10-28
33
- - name: ai21/j2-jumbo
34
- display_name: Jurassic-2 Jumbo (178B)
35
- description: Jurassic-2 Jumbo (178B parameters) ([docs](https://www.ai21.com/blog/introducing-j2))
36
- creator_organization: AI21 Labs
37
- access: limited
38
- num_parameters: 178000000000
39
- release_date: 2023-03-09
40
- - name: ai21/j2-grande
41
- display_name: Jurassic-2 Grande (17B)
42
- description: Jurassic-2 Grande (17B parameters) ([docs](https://www.ai21.com/blog/introducing-j2))
43
- creator_organization: AI21 Labs
44
- access: limited
45
- num_parameters: 17000000000
46
- release_date: 2023-03-09
47
- - name: ai21/j2-large
48
- display_name: Jurassic-2 Large (7.5B)
49
- description: Jurassic-2 Large (7.5B parameters) ([docs](https://www.ai21.com/blog/introducing-j2))
50
- creator_organization: AI21 Labs
51
- access: limited
52
- num_parameters: 7500000000
53
- release_date: 2023-03-09
54
-
55
- # Aleph Alpha
56
- # TODO: add Luminous World when it's released
57
- - name: AlephAlpha/luminous-base
58
- display_name: Luminous Base (13B)
59
- description: Luminous Base (13B parameters) ([docs](https://docs.aleph-alpha.com/docs/introduction/luminous/))
60
- creator_organization: Aleph Alpha
61
- access: limited
62
- num_parameters: 13000000000
63
- # TODO: get exact release date
64
- release_date: 2022-01-01
65
- - name: AlephAlpha/luminous-extended
66
- display_name: Luminous Extended (30B)
67
- description: Luminous Extended (30B parameters) ([docs](https://docs.aleph-alpha.com/docs/introduction/luminous/))
68
- creator_organization: Aleph Alpha
69
- access: limited
70
- num_parameters: 30000000000
71
- release_date: 2022-01-01
72
- - name: AlephAlpha/luminous-supreme
73
- display_name: Luminous Supreme (70B)
74
- description: Luminous Supreme (70B parameters) ([docs](https://docs.aleph-alpha.com/docs/introduction/luminous/))
75
- creator_organization: Aleph Alpha
76
- access: limited
77
- num_parameters: 70000000000
78
- release_date: 2022-01-01
79
-
80
- # TODO: Remove Once we have configurable model names
81
- - name: neurips/local
82
- display_name: Local service
83
- description: Local competition service
84
- creator_organization: neurips
85
- access: open
86
- num_parameters: 1
87
- release_date: 2021-12-01
88
-
89
-
90
- # Anthropic
91
- - name: anthropic/stanford-online-all-v4-s3
92
- display_name: Anthropic-LM v4-s3 (52B)
93
- description: A 52B parameter language model, trained using reinforcement learning from human feedback [paper](https://arxiv.org/pdf/2204.05862.pdf).
94
- creator_organization: Anthropic
95
- access: closed
96
- num_parameters: 52000000000
97
- release_date: 2021-12-01
98
- - name: anthropic/claude-2.0
99
- display_name: Anthropic Claude 2.0
100
- description: Claude 2.0 is a general purpose large language model developed by Anthropic. It uses a transformer architecture and is trained via unsupervised learning, RLHF, and Constitutional AI (including both a supervised and Reinforcement Learning (RL) phase). ([model card](https://efficient-manatee.files.svdcdn.com/production/images/Model-Card-Claude-2.pdf))
101
- creator_organization: Anthropic
102
- access: limited
103
- release_date: 2023-07-11
104
- - name: anthropic/claude-v1.3
105
- display_name: Anthropic Claude v1.3
106
- description: A model trained using reinforcement learning from human feedback ([docs](https://www.anthropic.com/index/introducing-claude)).
107
- creator_organization: Anthropic
108
- access: limited
109
- release_date: 2023-03-17
110
- - name: anthropic/claude-instant-v1
111
- display_name: Anthropic Claude Instant V1
112
- description: A lightweight version of Claude, a model trained using reinforcement learning from human feedback ([docs](https://www.anthropic.com/index/introducing-claude)).
113
- creator_organization: Anthropic
114
- access: limited
115
- release_date: 2023-03-17
116
-
117
- # Berkeley
118
- - name: together/koala-13b
119
- display_name: Koala (13B)
120
- description: Koala (13B) is a chatbot fine-tuned from Llama (13B) on dialogue data gathered from the web. ([blog post](https://bair.berkeley.edu/blog/2023/04/03/koala/))
121
- creator_organization: UC Berkeley
122
- access: open
123
- num_parameters: 13000000000
124
- release_date: 2022-04-03
125
- todo: true
126
-
127
- # BigScience
128
- - name: together/bloom
129
- display_name: BLOOM (176B)
130
- description: BLOOM (176B parameters) is an autoregressive model trained on 46 natural languages and 13 programming languages ([paper](https://arxiv.org/pdf/2211.05100.pdf)).
131
- creator_organization: BigScience
132
- access: open
133
- num_parameters: 176000000000
134
- release_date: 2022-06-28
135
- - name: together/bloomz
136
- display_name: BLOOMZ (176B)
137
- description: BLOOMZ (176B parameters) is BLOOM that has been fine-tuned on natural language instructions ([details](https://huggingface.co/bigscience/bloomz)).
138
- creator_organization: BigScience
139
- access: open
140
- num_parameters: 176000000000
141
- release_date: 2022-11-03
142
- todo: true
143
- - name: together/t0pp
144
- display_name: T0pp (11B)
145
- description: T0pp (11B parameters) is an encoder-decoder model trained on a large set of different tasks specified in natural language prompts ([paper](https://arxiv.org/pdf/2110.08207.pdf)).
146
- creator_organization: BigScience
147
- access: open
148
- num_parameters: 11000000000
149
- release_date: 2021-10-15
150
-
151
- # BigCode
152
- - name: huggingface/santacoder
153
- display_name: SantaCoder (1.1B)
154
- description: SantaCoder (1.1B parameters) model trained on the Python, Java, and JavaScript subset of The Stack (v1.1) ([model card](https://huggingface.co/bigcode/santacoder)).
155
- creator_organization: BigCode
156
- access: open
157
- - name: huggingface/starcoder
158
- display_name: StarCoder (15.5B)
159
- description: The StarCoder (15.5B parameter) model trained on 80+ programming languages from The Stack (v1.2) ([model card](https://huggingface.co/bigcode/starcoder)).
160
- creator_organization: BigCode
161
- access: open
162
-
163
- # Cerebras Systems
164
- - name: together/cerebras-gpt-6.7b
165
- display_name: Cerebras GPT (6.7B)
166
- description: Cerebras GPT is a family of open compute-optimal language models scaled from 111M to 13B parameters trained on the Eleuther Pile. ([paper](https://arxiv.org/pdf/2304.03208.pdf))
167
- creator_organization: Cerebras
168
- access: limited
169
- num_parameters: 6700000000
170
- release_date: 2023-04-06
171
- todo: true
172
- - name: together/cerebras-gpt-13b
173
- display_name: Cerebras GPT (13B)
174
- description: Cerebras GPT is a family of open compute-optimal language models scaled from 111M to 13B parameters trained on the Eleuther Pile. ([paper](https://arxiv.org/pdf/2304.03208.pdf))
175
- creator_organization: Cerebras
176
- access: limited
177
- num_parameters: 13000000000
178
- release_date: 2023-04-06
179
- todo: true
180
-
181
- # Cohere
182
- - name: cohere/xlarge-20220609
183
- display_name: Cohere xlarge v20220609 (52.4B)
184
- description: Cohere xlarge v20220609 (52.4B parameters)
185
- creator_organization: Cohere
186
- access: limited
187
- num_parameters: 52400000000
188
- release_date: 2022-06-09
189
- - name: cohere/large-20220720
190
- display_name: Cohere large v20220720 (13.1B)
191
- description: Cohere large v20220720 (13.1B parameters), which is deprecated by Cohere as of December 2, 2022.
192
- creator_organization: Cohere
193
- access: limited
194
- num_parameters: 13100000000
195
- release_date: 2022-07-20
196
- - name: cohere/medium-20220720
197
- display_name: Cohere medium v20220720 (6.1B)
198
- description: Cohere medium v20220720 (6.1B parameters)
199
- creator_organization: Cohere
200
- access: limited
201
- num_parameters: 6100000000
202
- release_date: 2022-07-20
203
- - name: cohere/small-20220720
204
- display_name: Cohere small v20220720 (410M)
205
- description: Cohere small v20220720 (410M parameters), which is deprecated by Cohere as of December 2, 2022.
206
- creator_organization: Cohere
207
- access: limited
208
- num_parameters: 410000000
209
- release_date: 2022-07-20
210
- - name: cohere/xlarge-20221108
211
- display_name: Cohere xlarge v20221108 (52.4B)
212
- description: Cohere xlarge v20221108 (52.4B parameters)
213
- creator_organization: Cohere
214
- access: limited
215
- num_parameters: 52400000000
216
- release_date: 2022-11-08
217
- - name: cohere/medium-20221108
218
- display_name: Cohere medium v20221108 (6.1B)
219
- description: Cohere medium v20221108 (6.1B parameters)
220
- creator_organization: Cohere
221
- access: limited
222
- num_parameters: 6100000000
223
- release_date: 2022-11-08
224
- - name: cohere/command-medium-beta
225
- display_name: Cohere Command beta (6.1B)
226
- description: Cohere Command beta (6.1B parameters) is fine-tuned from the medium model to respond well with instruction-like prompts ([details](https://docs.cohere.ai/docs/command-beta)).
227
- creator_organization: Cohere
228
- access: limited
229
- num_parameters: 6100000000
230
- release_date: 2022-11-08
231
- - name: cohere/command-xlarge-beta
232
- display_name: Cohere Command beta (52.4B)
233
- description: Cohere Command beta (52.4B parameters) is fine-tuned from the XL model to respond well with instruction-like prompts ([details](https://docs.cohere.ai/docs/command-beta)).
234
- creator_organization: Cohere
235
- access: limited
236
- num_parameters: 52400000000
237
- release_date: 2022-11-08
238
-
239
- # Databricks
240
- - name: databricks/dolly-v2-3b
241
- display_name: Dolly V2 (3B)
242
- description: Dolly V2 (3B) is an instruction-following large language model trained on the Databricks machine learning platform. It is based on pythia-12b.
243
- creator_organization: Databricks
244
- access: open
245
- num_parameters: 2517652480
246
- release_date: 2023-04-12
247
- todo: true
248
- - name: databricks/dolly-v2-7b
249
- display_name: Dolly V2 (7B)
250
- description: Dolly V2 (7B) is an instruction-following large language model trained on the Databricks machine learning platform. It is based on pythia-12b.
251
- creator_organization: Databricks
252
- access: open
253
- num_parameters: 6444163072
254
- release_date: 2023-04-12
255
- todo: true
256
- - name: databricks/dolly-v2-12b
257
- display_name: Dolly V2 (12B)
258
- description: Dolly V2 (12B) is an instruction-following large language model trained on the Databricks machine learning platform. It is based on pythia-12b.
259
- creator_organization: Databricks
260
- access: open
261
- num_parameters: 11327027200
262
- release_date: 2023-04-12
263
- todo: true
264
-
265
- # DeepMind
266
- - name: deepmind/gopher
267
- display_name: Gopher (280B)
268
- description: Gopher (540B parameters) ([paper](https://arxiv.org/pdf/2112.11446.pdf)).
269
- creator_organization: DeepMind
270
- access: closed
271
- todo: true
272
- - name: deepmind/chinchilla
273
- display_name: Chinchilla (70B)
274
- description: Chinchilla (70B parameters) ([paper](https://arxiv.org/pdf/2203.15556.pdf)).
275
- creator_organization: DeepMind
276
- access: closed
277
- todo: true
278
-
279
- # EleutherAI
280
- - name: together/gpt-j-6b
281
- display_name: GPT-J (6B)
282
- description: GPT-J (6B parameters) autoregressive language model trained on The Pile ([details](https://arankomatsuzaki.wordpress.com/2021/06/04/gpt-j/)).
283
- creator_organization: EleutherAI
284
- access: open
285
- num_parameters: 6000000000
286
- release_date: 2021-06-04
287
- - name: together/gpt-neox-20b
288
- display_name: GPT-NeoX (20B)
289
- description: GPT-NeoX (20B parameters) autoregressive language model trained on The Pile ([paper](https://arxiv.org/pdf/2204.06745.pdf)).
290
- creator_organization: EleutherAI
291
- access: open
292
- num_parameters: 20000000000
293
- release_date: 2022-02-02
294
- - name: eleutherai/pythia-1b-v0
295
- display_name: Pythia (1B)
296
- description: Pythia (1B parameters). The Pythia project combines interpretability analysis and scaling laws to understand how knowledge develops and evolves during training in autoregressive transformers.
297
- creator_organization: EleutherAI
298
- access: open
299
- num_parameters: 805736448
300
- release_date: 2023-02-13
301
- todo: true
302
- - name: eleutherai/pythia-2.8b-v0
303
- display_name: Pythia (2.8B)
304
- description: Pythia (2.8B parameters). The Pythia project combines interpretability analysis and scaling laws to understand how knowledge develops and evolves during training in autoregressive transformers.
305
- creator_organization: EleutherAI
306
- access: open
307
- num_parameters: 2517652480
308
- release_date: 2023-02-13
309
- todo: true
310
- - name: eleutherai/pythia-6.9b
311
- display_name: Pythia (6.9B)
312
- description: Pythia (6.9B parameters). The Pythia project combines interpretability analysis and scaling laws to understand how knowledge develops and evolves during training in autoregressive transformers.
313
- creator_organization: EleutherAI
314
- access: open
315
- num_parameters: 6444163072
316
- release_date: 2023-02-13
317
- - name: eleutherai/pythia-12b-v0
318
- display_name: Pythia (12B)
319
- description: Pythia (12B parameters). The Pythia project combines interpretability analysis and scaling laws to understand how knowledge develops and evolves during training in autoregressive transformers.
320
- creator_organization: EleutherAI
321
- access: open
322
- num_parameters: 11327027200
323
- release_date: 2023-02-13
324
-
325
- # Google
326
- - name: together/t5-11b
327
- display_name: T5 (11B)
328
- description: T5 (11B parameters) is an encoder-decoder model trained on a multi-task mixture, where each task is converted into a text-to-text format ([paper](https://arxiv.org/pdf/1910.10683.pdf)).
329
- creator_organization: Google
330
- access: open
331
- num_parameters: 11000000000
332
- release_date: 2019-10-23
333
-
334
- - name: together/ul2
335
- display_name: UL2 (20B)
336
- description: UL2 (20B parameters) is an encoder-decoder model trained on the C4 corpus. It's similar to T5 but trained with a different objective and slightly different scaling knobs ([paper](https://arxiv.org/pdf/2205.05131.pdf)).
337
- creator_organization: Google
338
- access: open
339
- num_parameters: 20000000000
340
- release_date: 2022-05-10
341
-
342
- - name: together/flan-t5-xxl
343
- display_name: Flan-T5 (11B)
344
- description: Flan-T5 (11B parameters) is T5 fine-tuned on 1.8K tasks ([paper](https://arxiv.org/pdf/2210.11416.pdf)).
345
- creator_organization: Google
346
- access: open
347
-
348
- - name: google/palm
349
- display_name: PaLM (540B)
350
- description: Pathways Language Model (540B parameters) is trained using 6144 TPU v4 chips ([paper](https://arxiv.org/pdf/2204.02311.pdf)).
351
- creator_organization: Google
352
- access: closed
353
- todo: true
354
-
355
- # HazyResearch
356
- - name: together/h3-2.7b
357
- display_name: H3 (2.7B)
358
- description: H3 (2.7B parameters) is a decoder-only language model based on state space models ([paper](https://arxiv.org/abs/2212.14052)).
359
- creator_organization: HazyResearch
360
- access: open
361
- num_parameters: 2700000000
362
- release_date: 2023-01-23
363
- todo: true
364
-
365
- # Lightning AI's Lit-GPT
366
- - name: lightningai/lit-gpt
367
- display_name: Lit-GPT
368
- description: Lit-GPT is an optimized collection of open-source LLMs for finetuning and inference. It supports – Falcon, Llama 2, Vicuna, LongChat, and other top-performing open-source large language models.
369
- creator_organization: Lightning AI
370
- access: open
371
- num_parameters: 1
372
- release_date: 2023-04-04
373
-
374
-
375
- # Meta
376
- - name: together/opt-iml-175b
377
- display_name: OPT-IML (175B)
378
- description: OPT-IML (175B parameters) is a suite of decoder-only transformer LMs that are multi-task fine-tuned on 2000 datasets ([paper](https://arxiv.org/pdf/2212.12017.pdf)).
379
- creator_organization: Meta
380
- access: open
381
- num_parameters: 175000000000
382
- release_date: 2022-12-22
383
- todo: true
384
-
385
- - name: together/opt-iml-30b
386
- display_name: OPT-IML (30B)
387
- description: OPT-IML (30B parameters) is a suite of decoder-only transformer LMs that are multi-task fine-tuned on 2000 datasets ([paper](https://arxiv.org/pdf/2212.12017.pdf)).
388
- creator_organization: Meta
389
- access: open
390
- num_parameters: 30000000000
391
- release_date: 2022-12-22
392
- todo: true
393
-
394
- - name: together/opt-175b
395
- display_name: OPT (175B)
396
- description: Open Pre-trained Transformers (175B parameters) is a suite of decoder-only pre-trained transformers that are fully and responsibly shared with interested researchers ([paper](https://arxiv.org/pdf/2205.01068.pdf)).
397
- creator_organization: Meta
398
- access: open
399
- num_parameters: 175000000000
400
- release_date: 2022-05-02
401
-
402
- - name: together/opt-66b
403
- display_name: OPT (66B)
404
- description: Open Pre-trained Transformers (66B parameters) is a suite of decoder-only pre-trained transformers that are fully and responsibly shared with interested researchers ([paper](https://arxiv.org/pdf/2205.01068.pdf)).
405
- creator_organization: Meta
406
- access: open
407
- num_parameters: 66000000000
408
- release_date: 2022-05-02
409
-
410
- - name: together/opt-6.7b
411
- display_name: OPT (6.7B)
412
- description: Open Pre-trained Transformers (6.7B parameters) is a suite of decoder-only pre-trained transformers that are fully and responsibly shared with interested researchers ([paper](https://arxiv.org/pdf/2205.01068.pdf)).
413
- creator_organization: Meta
414
- access: open
415
- num_parameters: 6700000000
416
- release_date: 2022-05-02
417
-
418
- - name: together/opt-1.3b
419
- display_name: OPT (1.3B)
420
- description: Open Pre-trained Transformers (1.3B parameters) is a suite of decoder-only pre-trained transformers that are fully and responsibly shared with interested researchers ([paper](https://arxiv.org/pdf/2205.01068.pdf)).
421
- creator_organization: Meta
422
- access: open
423
- num_parameters: 1300000000
424
- release_date: 2022-05-02
425
-
426
- - name: together/galactica-120b
427
- display_name: Galactica (120B)
428
- description: Galactica (120B parameters) is trained on 48 million papers, textbooks, lectures notes, compounds and proteins, scientific websites, etc. ([paper](https://galactica.org/static/paper.pdf)).
429
- creator_organization: Meta
430
- access: open
431
- num_parameters: 120000000000
432
- release_date: 2022-11-15
433
- todo: true
434
-
435
- - name: together/galactica-30b
436
- display_name: Galactica (30B)
437
- description: Galactica (30B parameters) is trained on 48 million papers, textbooks, lectures notes, compounds and proteins, scientific websites, etc. ([paper](https://galactica.org/static/paper.pdf)).
438
- creator_organization: Meta
439
- access: open
440
- num_parameters: 30000000000
441
- release_date: 2022-11-15
442
- todo: true
443
- - name: meta/llama-7b
444
- display_name: LLaMA (7B)
445
- description: LLaMA is a collection of foundation language models ranging from 7B to 65B parameters.
446
- creator_organization: Meta
447
- access: open
448
- num_parameters: 7000000000
449
- release_date: 2023-02-24
450
- - name: meta/llama-13b
451
- display_name: LLaMA (13B)
452
- description: LLaMA is a collection of foundation language models ranging from 7B to 65B parameters.
453
- creator_organization: Meta
454
- access: open
455
- num_parameters: 13000000000
456
- release_date: 2023-02-24
457
- - name: meta/llama-30b
458
- display_name: LLaMA (30B)
459
- description: LLaMA is a collection of foundation language models ranging from 7B to 65B parameters.
460
- creator_organization: Meta
461
- access: open
462
- num_parameters: 30000000000
463
- release_date: 2023-02-24
464
- - name: meta/llama-65b
465
- display_name: LLaMA (65B)
466
- description: LLaMA is a collection of foundation language models ranging from 7B to 65B parameters.
467
- creator_organization: Meta
468
- access: open
469
- num_parameters: 65000000000
470
- release_date: 2023-02-24
471
- - name: meta/llama-2-7b
472
- display_name: Llama 2 (7B)
473
- description: Llama 2 pretrained models are trained on 2 trillion tokens, and have double the context length than Llama 1.
474
- creator_organization: Meta
475
- access: open
476
- num_parameters: 7000000000
477
- release_date: 2023-07-18
478
- - name: meta/llama-2-13b
479
- display_name: Llama 2 (13B)
480
- description: Llama 2 pretrained models are trained on 2 trillion tokens, and have double the context length than Llama 1.
481
- creator_organization: Meta
482
- access: open
483
- num_parameters: 13000000000
484
- release_date: 2023-07-18
485
- - name: meta/llama-2-70b
486
- display_name: Llama 2 (70B)
487
- description: Llama 2 pretrained models are trained on 2 trillion tokens, and have double the context length than Llama 1.
488
- creator_organization: Meta
489
- access: open
490
- num_parameters: 70000000000
491
- release_date: 2023-07-18
492
-
493
- # Stability AI
494
- - name: stabilityai/stablelm-base-alpha-3b
495
- display_name: StableLM-Base-Alpha (3B)
496
- description: StableLM-Base-Alpha is a suite of 3B and 7B parameter decoder-only language models pre-trained on a diverse collection of English datasets with a sequence length of 4096 to push beyond the context window limitations of existing open-source language models.
497
- creator_organization: Stability AI
498
- access: open
499
- num_parameters: 3000000000
500
- release_date: 2023-04-20
501
- todo: true
502
-
503
- - name: stabilityai/stablelm-base-alpha-7b
504
- display_name: StableLM-Base-Alpha (7B)
505
- description: StableLM-Base-Alpha is a suite of 3B and 7B parameter decoder-only language models pre-trained on a diverse collection of English datasets with a sequence length of 4096 to push beyond the context window limitations of existing open-source language models.
506
- creator_organization: Stability AI
507
- access: open
508
- num_parameters: 7000000000
509
- release_date: 2023-04-20
510
- todo: true
511
-
512
- # Stanford
513
- - name: stanford/alpaca-7b
514
- display_name: Alpaca (7B)
515
- description: Alpaca 7B is a model fine-tuned from the LLaMA 7B model on 52K instruction-following demonstrations
516
- creator_organization: Stanford
517
- access: open
518
- num_parameters: 7000000000
519
- release_date: 2023-03-13
520
-
521
- # LMSYS
522
- - name: lmsys/vicuna-7b-v1.3
523
- display_name: Vicuna v1.3 (7B)
524
- description: Vicuna v1.3 (7B) is an open-source chatbot trained by fine-tuning LLaMA on user-shared conversations collected from ShareGPT.
525
- creator_organization: LMSYS
526
- access: open
527
- num_parameters: 7000000000
528
- release_date: 2023-06-22
529
- - name: lmsys/vicuna-13b-v1.3
530
- display_name: Vicuna v1.3 (13B)
531
- description: Vicuna v1.3 (13B) is an open-source chatbot trained by fine-tuning LLaMA on user-shared conversations collected from ShareGPT.
532
- creator_organization: LMSYS
533
- access: open
534
- num_parameters: 13000000000
535
- release_date: 2023-06-22
536
-
537
- # Mistral AI
538
- - name: mistralai/mistral-7b-v0.1
539
- display_name: Mistral v0.1 (7B)
540
- description: Mistral 7B is a 7.3B parameter transformer model that uses Grouped-Query Attention (GQA) and Sliding-Window Attention (SWA).
541
- creator_organization: Mistral AI
542
- access: open
543
- num_parameters: 7300000000
544
- release_date: 2023-09-27
545
-
546
- # Microsoft/NVIDIA
547
- - name: microsoft/TNLGv2_530B
548
- display_name: TNLG v2 (530B)
549
- description: TNLG v2 (530B parameters) autoregressive language model trained on a filtered subset of the Pile and CommonCrawl ([paper](https://arxiv.org/pdf/2201.11990.pdf)).
550
- creator_organization: Microsoft/NVIDIA
551
- access: closed
552
- num_parameters: 530000000000
553
- release_date: 2022-01-28
554
- - name: microsoft/TNLGv2_7B
555
- display_name: TNLG v2 (6.7B)
556
- description: TNLG v2 (6.7B parameters) autoregressive language model trained on a filtered subset of the Pile and CommonCrawl ([paper](https://arxiv.org/pdf/2201.11990.pdf)).
557
- creator_organization: Microsoft/NVIDIA
558
- access: closed
559
- num_parameters: 6700000000
560
- release_date: 2022-01-28
561
-
562
- # OpenAI: https://beta.openai.com/docs/engines/gpt-3
563
- - name: openai/davinci
564
- display_name: davinci (175B)
565
- description: Original GPT-3 (175B parameters) autoregressive language model ([paper](https://arxiv.org/pdf/2005.14165.pdf), [docs](https://beta.openai.com/docs/model-index-for-researchers)).
566
- creator_organization: OpenAI
567
- access: limited
568
- num_parameters: 175000000000
569
- release_date: 2020-05-28
570
- - name: openai/curie
571
- display_name: curie (6.7B)
572
- description: Original GPT-3 (6.7B parameters) autoregressive language model ([paper](https://arxiv.org/pdf/2005.14165.pdf), [docs](https://beta.openai.com/docs/model-index-for-researchers)).
573
- creator_organization: OpenAI
574
- access: limited
575
- num_parameters: 6700000000
576
- release_date: 2020-05-28
577
- - name: openai/babbage
578
- display_name: babbage (1.3B)
579
- description: Original GPT-3 (1.3B parameters) autoregressive language model ([paper](https://arxiv.org/pdf/2005.14165.pdf), [docs](https://beta.openai.com/docs/model-index-for-researchers)).
580
- creator_organization: OpenAI
581
- access: limited
582
- num_parameters: 1300000000
583
- release_date: 2020-05-28
584
- - name: openai/ada
585
- display_name: ada (350M)
586
- description: Original GPT-3 (350M parameters) autoregressive language model ([paper](https://arxiv.org/pdf/2005.14165.pdf), [docs](https://beta.openai.com/docs/model-index-for-researchers)).
587
- creator_organization: OpenAI
588
- access: limited
589
- num_parameters: 350000000
590
- release_date: 2020-05-28
591
- - name: openai/text-davinci-003
592
- display_name: text-davinci-003
593
- description: text-davinci-003 model that involves reinforcement learning (PPO) with reward models. Derived from text-davinci-002 ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
594
- creator_organization: OpenAI
595
- access: limited
596
- num_parameters: 175000000000
597
- release_date: 2022-11-28
598
- - name: openai/text-davinci-002
599
- display_name: text-davinci-002
600
- description: text-davinci-002 model that involves supervised fine-tuning on human-written demonstrations. Derived from code-davinci-002 ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
601
- creator_organization: OpenAI
602
- access: limited
603
- num_parameters: 175000000000
604
- release_date: 2022-01-27
605
- - name: openai/text-davinci-001
606
- display_name: text-davinci-001
607
- description: text-davinci-001 model that involves supervised fine-tuning on human-written demonstrations ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
608
- creator_organization: OpenAI
609
- access: limited
610
- num_parameters: 175000000000
611
- release_date: 2022-01-27
612
- todo: true
613
- - name: openai/text-curie-001
614
- display_name: text-curie-001
615
- description: text-curie-001 model that involves supervised fine-tuning on human-written demonstrations ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
616
- creator_organization: OpenAI
617
- access: limited
618
- num_parameters: 6700000000
619
- release_date: 2022-01-27
620
- - name: openai/text-babbage-001
621
- display_name: text-babbage-001
622
- description: text-babbage-001 model that involves supervised fine-tuning on human-written demonstrations ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
623
- creator_organization: OpenAI
624
- access: limited
625
- num_parameters: 1300000000
626
- release_date: 2022-01-27
627
- - name: openai/text-ada-001
628
- display_name: text-ada-001
629
- description: text-ada-001 model that involves supervised fine-tuning on human-written demonstrations ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
630
- creator_organization: OpenAI
631
- access: limited
632
- num_parameters: 350000000
633
- release_date: 2022-01-27
634
- - name: openai/gpt-4-0314
635
- display_name: gpt-4-0314
636
- description: GPT-4 is a large multimodal model (currently only accepting text inputs and emitting text outputs) that is optimized for chat but works well for traditional completions tasks. Snapshot of gpt-4 from March 14th 2023.
637
- creator_organization: OpenAI
638
- access: limited
639
- release_date: 2023-03-14
640
- - name: openai/gpt-4-32k-0314
641
- display_name: gpt-4-32k-0314
642
- description: GPT-4 is a large multimodal model (currently only accepting text inputs and emitting text outputs) that is optimized for chat but works well for traditional completions tasks. Snapshot of gpt-4 with a longer context length of 32,768 tokens from March 14th 2023.
643
- creator_organization: OpenAI
644
- access: limited
645
- release_date: 2023-03-14
646
- - name: openai/gpt-4-0613
647
- display_name: gpt-4-0613
648
- description: GPT-4 is a large multimodal model (currently only accepting text inputs and emitting text outputs) that is optimized for chat but works well for traditional completions tasks. Snapshot of gpt-4 from 2023-06-13.
649
- creator_organization: OpenAI
650
- access: limited
651
- release_date: 2023-06-13
652
- - name: openai/gpt-4-32k-0613
653
- display_name: gpt-4-32k-0613
654
- description: GPT-4 is a large multimodal model (currently only accepting text inputs and emitting text outputs) that is optimized for chat but works well for traditional completions tasks. Snapshot of gpt-4 with a longer context length of 32,768 tokens from 2023-06-13.
655
- creator_organization: OpenAI
656
- access: limited
657
- release_date: 2023-06-13
658
- - name: openai/code-davinci-002
659
- display_name: code-davinci-002
660
- description: Codex-style model that is designed for pure code-completion tasks ([docs](https://beta.openai.com/docs/models/codex)).
661
- creator_organization: OpenAI
662
- access: limited
663
- - name: openai/code-davinci-001
664
- display_name: code-davinci-001
665
- description: code-davinci-001 model
666
- creator_organization: OpenAI
667
- access: limited
668
- todo: true
669
- - name: openai/code-cushman-001
670
- display_name: code-cushman-001 (12B)
671
- description: Codex-style model that is a stronger, multilingual version of the Codex (12B) model in the [Codex paper](https://arxiv.org/pdf/2107.03374.pdf).
672
- creator_organization: OpenAI
673
- access: limited
674
- - name: openai/gpt-3.5-turbo-0301
675
- display_name: gpt-3.5-turbo-0301
676
- description: Sibling model of text-davinci-003 is optimized for chat but works well for traditional completions tasks as well. Snapshot from 2023-03-01.
677
- creator_organization: OpenAI
678
- access: limited
679
- release_date: 2023-03-01
680
- - name: openai/gpt-3.5-turbo-0613
681
- display_name: gpt-3.5-turbo-0613
682
- description: Sibling model of text-davinci-003 is optimized for chat but works well for traditional completions tasks as well. Snapshot from 2023-06-13.
683
- creator_organization: OpenAI
684
- access: limited
685
- release_date: 2023-06-13
686
- - name: openai/gpt-3.5-turbo-16k-0613
687
- display_name: gpt-3.5-turbo-16k-0613
688
- description: Sibling model of text-davinci-003 is optimized for chat but works well for traditional completions tasks as well. Snapshot from 2023-06-13 with a longer context length of 16,384 tokens.
689
- creator_organization: OpenAI
690
- access: limited
691
- release_date: 2023-06-13
692
-
693
- # Together
694
- - name: together/Together-gpt-JT-6B-v1
695
- display_name: GPT-JT (6B)
696
- description: GPT-JT (6B parameters) is a fork of GPT-J ([blog post](https://www.together.xyz/blog/releasing-v1-of-gpt-jt-powered-by-open-source-ai)).
697
- creator_organization: Together
698
- access: open
699
- num_parameters: 6700000000
700
- release_date: 2022-11-29
701
- todo: true
702
- - name: together/gpt-neoxt-chat-base-20b
703
- display_name: GPT-NeoXT-Chat-Base (20B)
704
- description: GPT-NeoXT-Chat-Base (20B) is fine-tuned from GPT-NeoX, serving as a base model for developing open-source chatbots.
705
- creator_organization: Together
706
- access: open
707
- num_parameters: 20000000000
708
- release_date: 2023-03-08
709
- todo: true
710
- - name: together/redpajama-incite-base-3b-v1
711
- display_name: RedPajama-INCITE-Base-v1 (3B)
712
- description: RedPajama-INCITE-Base-v1 (3B parameters) is a 3 billion base model that aims to replicate the LLaMA recipe as closely as possible.
713
- creator_organization: Together
714
- access: open
715
- num_parameters: 3000000000
716
- release_date: 2023-05-05
717
- - name: together/redpajama-incite-instruct-3b-v1
718
- display_name: RedPajama-INCITE-Instruct-v1 (3B)
719
- description: RedPajama-INCITE-Instruct-v1 (3B parameters) is a model fine-tuned for few-shot applications on the data of GPT-JT. It is built from RedPajama-INCITE-Base-v1 (3B), a 3 billion base model that aims to replicate the LLaMA recipe as closely as possible.
720
- creator_organization: Together
721
- access: open
722
- num_parameters: 3000000000
723
- release_date: 2023-05-05
724
- todo: true
725
- - name: together/redpajama-incite-chat-3b-v1
726
- display_name: RedPajama-INCITE-Chat-v1 (3B)
727
- description: RedPajama-INCITE-Chat-v1 (3B parameters) is a model fine-tuned on OASST1 and Dolly2 to enhance chatting ability. It is built from RedPajama-INCITE-Base-v1 (3B), a 3 billion base model that aims to replicate the LLaMA recipe as closely as possible.
728
- creator_organization: Together
729
- access: open
730
- num_parameters: 3000000000
731
- release_date: 2023-05-05
732
- todo: true
733
- - name: together/redpajama-incite-base-7b
734
- display_name: RedPajama-INCITE-Base (7B)
735
- description: RedPajama-INCITE-Base (7B parameters) is a 7 billion base model that aims to replicate the LLaMA recipe as closely as possible.
736
- creator_organization: Together
737
- access: open
738
- num_parameters: 7000000000
739
- release_date: 2023-05-05
740
- todo: true
741
- - name: together/redpajama-incite-instruct-7b
742
- display_name: RedPajama-INCITE-Instruct (7B)
743
- description: RedPajama-INCITE-Instruct (7B parameters) is a model fine-tuned for few-shot applications on the data of GPT-JT. It is built from RedPajama-INCITE-Base (7B), a 7 billion base model that aims to replicate the LLaMA recipe as closely as possible.
744
- creator_organization: Together
745
- access: open
746
- num_parameters: 7000000000
747
- release_date: 2023-05-05
748
- todo: true
749
-
750
- # MosaicML
751
- - name: mosaicml/mpt-7b
752
- display_name: MPT (7B)
753
- description: MPT (7B) is a Transformer trained from scratch on 1T tokens of text and code.
754
- creator_organization: MosaicML
755
- access: open
756
- num_parameters: 6700000000
757
- release_date: 2023-05-05
758
- - name: mosaicml/mpt-7b-chat
759
- display_name: MPT-Chat (7B)
760
- description: MPT-Chat (7B) is a chatbot-like model for dialogue generation. It is built by finetuning MPT (30B) , a Transformer trained from scratch on 1T tokens of text and code.
761
- creator_organization: MosaicML
762
- access: open
763
- num_parameters: 6700000000
764
- release_date: 2023-05-05
765
- todo: true
766
- - name: mosaicml/mpt-instruct-7b
767
- display_name: MPT-Instruct (7B)
768
- description: MPT-Instruct (7B) is a model for short-form instruction following. It is built by finetuning MPT (30B), a Transformer trained from scratch on 1T tokens of text and code.
769
- creator_organization: MosaicML
770
- access: open
771
- num_parameters: 6700000000
772
- release_date: 2023-05-05
773
- - name: mosaicml/mpt-30b
774
- display_name: MPT (30B)
775
- description: MPT (30B) is a Transformer trained from scratch on 1T tokens of text and code.
776
- creator_organization: MosaicML
777
- access: open
778
- num_parameters: 30000000000
779
- release_date: 2023-06-22
780
- - name: mosaicml/mpt-30b-chat
781
- display_name: MPT-Chat (30B)
782
- description: MPT-Chat (30B) is a chatbot-like model for dialogue generation. It is built by finetuning MPT (30B), a Transformer trained from scratch on 1T tokens of text and code.
783
- creator_organization: MosaicML
784
- access: open
785
- num_parameters: 30000000000
786
- release_date: 2023-06-22
787
- todo: true
788
- - name: mosaicml/mpt-instruct-30b
789
- display_name: MPT-Instruct (30B)
790
- description: MPT-Instruct (30B) is a model for short-form instruction following. It is built by finetuning MPT (30B), a Transformer trained from scratch on 1T tokens of text and code.
791
- creator_organization: MosaicML
792
- access: open
793
- num_parameters: 30000000000
794
- release_date: 2023-06-22
795
-
796
- # TII UAE
797
- - name: tiiuae/falcon-7b
798
- display_name: Falcon (7B)
799
- description: Falcon-7B is a 7B parameters causal decoder-only model built by TII and trained on 1,500B tokens of RefinedWeb enhanced with curated corpora.
800
- creator_organization: TII UAE
801
- access: open
802
- num_parameters: 7000000000
803
- release_date: 2023-03-15
804
- - name: tiiuae/falcon-7b-instruct
805
- display_name: Falcon-Instruct (7B)
806
- description: Falcon-7B-Instruct is a 7B parameters causal decoder-only model built by TII based on Falcon-7B and finetuned on a mixture of chat/instruct datasets.
807
- creator_organization: TII UAE
808
- access: open
809
- num_parameters: 7000000000
810
- release_date: 2023-03-15
811
- - name: tiiuae/falcon-40b
812
- display_name: Falcon (40B)
813
- description: Falcon-40B is a 40B parameters causal decoder-only model built by TII and trained on 1,500B tokens of RefinedWeb enhanced with curated corpora.
814
- creator_organization: TII UAE
815
- access: open
816
- num_parameters: 40000000000
817
- release_date: 2023-05-25
818
- - name: tiiuae/falcon-40b-instruct
819
- display_name: Falcon-Instruct (40B)
820
- description: Falcon-40B-Instruct is a 40B parameters causal decoder-only model built by TII based on Falcon-7B and finetuned on a mixture of chat/instruct datasets.
821
- creator_organization: TII UAE
822
- access: open
823
- num_parameters: 40000000000
824
- release_date: 2023-05-25
825
-
826
- # Salesforce
827
- - name: together/codegen
828
- display_name: CodeGen (16B)
829
- description: CodeGen (16B parameters) is an open dense code model trained for multi-turn program synthesis ([blog](https://arxiv.org/pdf/2203.13474.pdf)).
830
- creator_organization: Tsinghua
831
- access: open
832
- num_parameters: 16000000000
833
- release_date: 2022-03-25
834
- todo: true
835
-
836
- # Tsinghua
837
- - name: together/glm
838
- display_name: GLM (130B)
839
- description: GLM (130B parameters) is an open bilingual (English & Chinese) bidirectional dense model that was trained using General Language Model (GLM) procedure ([paper](https://arxiv.org/pdf/2210.02414.pdf)).
840
- creator_organization: Tsinghua
841
- access: open
842
- num_parameters: 130000000000
843
- release_date: 2022-08-04
844
-
845
- - name: together/codegeex
846
- display_name: CodeGeeX (13B)
847
- description: CodeGeeX (13B parameters) is an open dense code model trained on more than 20 programming languages on a corpus of more than 850B tokens ([blog](http://keg.cs.tsinghua.edu.cn/codegeex/)).
848
- creator_organization: Tsinghua
849
- access: open
850
- num_parameters: 13000000000
851
- release_date: 2022-09-19
852
- todo: true
853
-
854
- # Writer
855
- - name: writer/palmyra-base
856
- display_name: Palmyra Base (5B)
857
- description: Palmyra Base (5B)
858
- creator_organization: Writer
859
- access: limited
860
- num_parameters: 5000000000
861
- release_date: 2022-10-13
862
- todo: true
863
- - name: writer/palmyra-large
864
- display_name: Palmyra Large (20B)
865
- description: Palmyra Large (20B)
866
- creator_organization: Writer
867
- access: limited
868
- num_parameters: 20000000000
869
- release_date: 2022-12-23
870
- todo: true
871
- - name: writer/palmyra-instruct-30
872
- display_name: InstructPalmyra (30B)
873
- description: InstructPalmyra (30B parameters) is trained using reinforcement learning techniques based on feedback from humans.
874
- creator_organization: Writer
875
- access: limited
876
- num_parameters: 30000000000
877
- release_date: 2023-02-16
878
- todo: true
879
- - name: writer/palmyra-e
880
- display_name: Palmyra E (30B)
881
- description: Palmyra E (30B)
882
- creator_organization: Writer
883
- access: limited
884
- num_parameters: 30000000000
885
- release_date: 2023-03-03
886
- todo: true
887
- - name: writer/silk-road
888
- display_name: Silk Road (35B)
889
- description: Silk Road (35B)
890
- creator_organization: Writer
891
- access: limited
892
- num_parameters: 35000000000
893
- release_date: 2023-04-13
894
- todo: true
895
- - name: writer/palmyra-x
896
- display_name: Palmyra X (43B)
897
- description: Palmyra-X (43B parameters) is trained to adhere to instructions using human feedback and utilizes a technique called multiquery attention. Furthermore, a new feature called 'self-instruct' has been introduced, which includes the implementation of an early stopping criteria specifically designed for minimal instruction tuning ([paper](https://dev.writer.com/docs/becoming-self-instruct-introducing-early-stopping-criteria-for-minimal-instruct-tuning)).
898
- creator_organization: Writer
899
- access: limited
900
- num_parameters: 43000000000
901
- release_date: 2023-06-11
902
- todo: true
903
-
904
- # Yandex
905
- - name: together/yalm
906
- display_name: YaLM (100B)
907
- description: YaLM (100B parameters) is an autoregressive language model trained on English and Russian text ([GitHub](https://github.com/yandex/YaLM-100B)).
908
- creator_organization: Yandex
909
- access: open
910
- num_parameters: 100000000000
911
- release_date: 2022-06-23
912
-
913
- # NVIDIA
914
- - name: nvidia/megatron-gpt2
915
- display_name: Megatron GPT2
916
- description: GPT-2 implemented in Megatron-LM ([paper](https://arxiv.org/abs/1909.08053)).
917
- creator_organization: NVIDIA
918
- access: open
919
- todo: true
920
-
921
2
  ############################################################
922
3
  adapter:
923
4
  - name: method
@@ -961,8 +42,12 @@ adapter:
961
42
  description: Maximum number of possible outputs to generate by sampling multiple outputs.
962
43
  - name: num_train_trials
963
44
  description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
45
+ - name: sample_train
46
+ description: If true, randomly sample N training examples; if false, select N consecutive training examples
964
47
  - name: model
965
- description: Name of the language model (<organization>/<model name>) to send requests to.
48
+ description: Name of the language model (<creator_organization>/<model name>) to send requests to.
49
+ - name: model_deployment
50
+ description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
966
51
  - name: temperature
967
52
  description: Temperature parameter used in generation.
968
53
  - name: max_tokens
@@ -971,6 +56,8 @@ adapter:
971
56
  description: List of sequences, where we stop generation if we encounter any of them.
972
57
  - name: random
973
58
  description: Random seed (string), which guarantees reproducibility.
59
+ - name: multi_label
60
+ description: If true, for instances with multiple correct reference, the gold answer should be considered to be all of the correct references rather than any of the correct references.
974
61
 
975
62
  ############################################################
976
63
  metrics:
@@ -1059,6 +146,7 @@ metrics:
1059
146
  short_display_name: PEM
1060
147
  description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
1061
148
  lower_is_better: false
149
+
1062
150
  - name: exact_match@5
1063
151
  display_name: Exact match @5
1064
152
  short_display_name: EM@5
@@ -1069,6 +157,17 @@ metrics:
1069
157
  short_display_name: EM@5
1070
158
  description: Fraction of instances where at least one predicted output among the top 5 matches a correct reference up to light processing.
1071
159
  lower_is_better: false
160
+ - name: prefix_exact_match@5
161
+ display_name: Prefix exact match @5
162
+ short_display_name: PEM@5
163
+ description: Fraction of instances that the predicted output among the top 5 matches the prefix of a correct reference exactly.
164
+ lower_is_better: false
165
+ - name: quasi_prefix_exact_match@5
166
+ display_name: Prefix quasi-exact match @5
167
+ short_display_name: PEM@5
168
+ description: Fraction of instances that the predicted output among the top 5 matches the prefix of a correct reference up to light processing.
169
+ lower_is_better: false
170
+
1072
171
  - name: logprob
1073
172
  display_name: Log probability
1074
173
  short_display_name: Logprob
@@ -1163,10 +262,15 @@ metrics:
1163
262
  lower_is_better: false
1164
263
  - name: math_equiv_chain_of_thought
1165
264
  display_name: Equivalent (chain of thought)
1166
- description: Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thoughts prompting.
265
+ description: Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.
1167
266
  lower_is_better: false
1168
267
  - name: exact_match_indicator
1169
- display_name: Exact match (up to specified indicator)
268
+ display_name: Exact match (final)
269
+ short_display_name: EM
270
+ description: Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator (e.g., space).
271
+ lower_is_better: false
272
+ - name: final_number_exact_match
273
+ display_name: Exact match (final number)
1170
274
  short_display_name: EM
1171
275
  description: Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.
1172
276
  lower_is_better: false
@@ -1464,6 +568,50 @@ metrics:
1464
568
  lower_is_better: false
1465
569
  # CLEVA Chinese bias, copyright and toxicity metrics share the same name as the original HELM metrics
1466
570
 
571
+
572
+ # DecodingTrust Fairness Metrics
573
+ - name: equalized_odds_difference
574
+ display_name: Equalized Odds Difference
575
+ short_display_name: Equalized Odds
576
+ description: The rate at which the model refuses to answer an OoD question.
577
+ lower_is_better: true
578
+
579
+ - name: demographic_parity_difference
580
+ display_name: Demographic Parity Difference
581
+ short_display_name: demographic parity
582
+ description: The demographic parity difference measures the difference between the probability of positive predictions conditioned on sensitive attribute A = 1 and that conditioned on A = 0
583
+ lower_is_better: true
584
+
585
+
586
+ # DecodingTrust OoD Metrics
587
+ - name: ood_knowledge_rr
588
+ display_name: OOD Knowledge Refusal Rate
589
+ short_display_name: OoD-RR
590
+ description: The rate at which the model refuses to answer an OoD question.
591
+ lower_is_better: false
592
+
593
+ - name: ood_knowledge_macc
594
+ display_name: OOD Meaningful Answer Accuracy (Exclude Refusal)
595
+ short_display_name: OoD-MACC
596
+ description: MACC indicates the percentage of correct answers when the model successfully generates meaningful responses by excluding outputs that are refused to answer
597
+ lower_is_better: false
598
+
599
+
600
+ # DecodingTrust Stereotype Bias Metrics
601
+
602
+ - name: decodingtrust_stereotype_bias
603
+ display_name: DecodingTrust Bias Score
604
+ short_display_name: Bias Score
605
+ description: 1 - agreementIndex (Section 4.2 of DecodingTrust)
606
+ lower_is_better: false
607
+
608
+ - name: decodingtrust_stereotype_bias_rejection_rate
609
+ display_name: Accuracy at 10% coverage
610
+ short_display_name: Bias Rejection Rate
611
+ description: Rejection rate of stereotype prompts
612
+ lower_is_better: false
613
+
614
+
1467
615
  ############################################################
1468
616
  perturbations:
1469
617
  - name: robustness
@@ -1514,7 +662,7 @@ metric_groups:
1514
662
  split: ${main_split}
1515
663
 
1516
664
  - name: calibration_detailed
1517
- display_name: Calibration
665
+ display_name: Calibration (Detailed)
1518
666
  description: Measures how calibrated the model is (how meaningful its uncertainty estimates are).
1519
667
  metrics:
1520
668
  - name: max_prob
@@ -1545,7 +693,7 @@ metric_groups:
1545
693
 
1546
694
  # TODO: Add other robustness perturbations
1547
695
  - name: robustness_detailed
1548
- display_name: Robustness
696
+ display_name: Robustness (Detailed)
1549
697
  description: Measures how robust the model is to invariances.
1550
698
  metrics:
1551
699
  - name: ${main_name}
@@ -1564,7 +712,7 @@ metric_groups:
1564
712
 
1565
713
  # TODO: Add other fairness perturbations
1566
714
  - name: fairness_detailed
1567
- display_name: Fairness
715
+ display_name: Fairness (Detailed)
1568
716
  description: Measures how fair the model is.
1569
717
  metrics:
1570
718
  - name: ${main_name}
@@ -1602,7 +750,7 @@ metric_groups:
1602
750
  split: ${main_split}
1603
751
 
1604
752
  - name: efficiency_detailed
1605
- display_name: Efficiency
753
+ display_name: Efficiency (Detailed)
1606
754
  description: The efficiency of the model across both training and inference.
1607
755
  metrics:
1608
756
  - name: inference_runtime
@@ -1747,6 +895,31 @@ metric_groups:
1747
895
  - name: chinese_bleu_1
1748
896
  split: ${main_split}
1749
897
 
898
+ - name: decodingtrust_fairness_metrics
899
+ display_name: DecodingTrust Fairness
900
+ metrics:
901
+ - name: equalized_odds_difference
902
+ split: ${main_split}
903
+ - name: demographic_parity_difference
904
+ split: ${main_split}
905
+
906
+ - name: decodingtrust_ood_metrics
907
+ display_name: DecodingTrust OOD Accuracy
908
+ metrics:
909
+ - name: ood_knowledge_rr
910
+ split: ${main_split}
911
+ - name: ood_knowledge_macc
912
+ split: ${main_split}
913
+
914
+ - name: decodingtrust_stereotype_bias_metrics
915
+ display_name: DecodingTrust Stereotype Bias
916
+ metrics:
917
+ - name: decodingtrust_stereotype_bias
918
+ split: ${main_split}
919
+ - name: decodingtrust_stereotype_bias_rejection_rate
920
+ split: ${main_split}
921
+
922
+
1750
923
  ############################################################
1751
924
  run_groups:
1752
925
  ## Top-level
@@ -1910,6 +1083,7 @@ run_groups:
1910
1083
  - synthetic_efficiency
1911
1084
  adapter_keys_shown:
1912
1085
  - model
1086
+ - model_deployment
1913
1087
  - max_tokens
1914
1088
 
1915
1089
  - name: calibration
@@ -1928,6 +1102,20 @@ run_groups:
1928
1102
  main_name: none
1929
1103
  main_split: none
1930
1104
 
1105
+ - name: decodingtrust
1106
+ display_name: DecodingTrust
1107
+ description: A comprehensive benchmark of the trustworthiness of large language models [(Wang et. al. 2023)](https://decodingtrust.github.io/)
1108
+ category: Core scenarios
1109
+ subgroups:
1110
+ - decodingtrust_adv_robustness
1111
+ - decodingtrust_adv_demonstration
1112
+ - decodingtrust_ood_robustness
1113
+ - decodingtrust_fairness
1114
+ - decodingtrust_privacy
1115
+ - decodingtrust_machine_ethics
1116
+ - decodingtrust_toxicity_prompts
1117
+ - decodingtrust_stereotype_bias
1118
+
1931
1119
  ### Ablations
1932
1120
  - name: ablation_in_context
1933
1121
  display_name: Vary number of in-context examples
@@ -1941,6 +1129,7 @@ run_groups:
1941
1129
  - civil_comments
1942
1130
  adapter_keys_shown:
1943
1131
  - model
1132
+ - model_deployment
1944
1133
  - max_train_instances
1945
1134
  subgroup_metric_groups_hidden:
1946
1135
  - robustness
@@ -1962,6 +1151,7 @@ run_groups:
1962
1151
  - bbq
1963
1152
  adapter_keys_shown:
1964
1153
  - model
1154
+ - model_deployment
1965
1155
  - method
1966
1156
 
1967
1157
  - name: ablation_prompts
@@ -1976,6 +1166,7 @@ run_groups:
1976
1166
  - civil_comments
1977
1167
  adapter_keys_shown:
1978
1168
  - model
1169
+ - model_deployment
1979
1170
  - instructions
1980
1171
  - input_prefix
1981
1172
  - input_suffix
@@ -2636,8 +1827,8 @@ run_groups:
2636
1827
  language: synthetic
2637
1828
 
2638
1829
  - name: math_chain_of_thought
2639
- display_name: MATH (chain-of-thoughts)
2640
- description: The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thoughts style reasoning [(Hendrycks et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/be83ab3ecd0db773eb2dc1b0a17836a1-Abstract-round2.html).
1830
+ display_name: MATH (chain-of-thought)
1831
+ description: The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/be83ab3ecd0db773eb2dc1b0a17836a1-Abstract-round2.html).
2641
1832
  metric_groups:
2642
1833
  - accuracy
2643
1834
  - efficiency
@@ -2687,6 +1878,23 @@ run_groups:
2687
1878
  when: n/a
2688
1879
  language: synthetic
2689
1880
 
1881
+ - name: legalbench
1882
+ display_name: LegalBench
1883
+ description: LegalBench is a large collaboratively constructed benchmark of legal reasoning. Five representative tasks are included here. See [(Guha et al, 2023)[https://arxiv.org/abs/2308.11462] for more details.
1884
+ metric_groups:
1885
+ - accuracy
1886
+ - efficiency
1887
+ - general_information
1888
+ environment:
1889
+ main_name: quasi_exact_match
1890
+ main_split: test
1891
+ taxonomy:
1892
+ task: "text classification"
1893
+ what: "fact patterns, questions, and legal documents"
1894
+ who: "lawyers"
1895
+ when: n/a
1896
+ language: English
1897
+
2690
1898
  - name: legal_support
2691
1899
  display_name: LegalSupport
2692
1900
  description: Scenario introduced in this work to measure fine-grained legal reasoning through reverse entailment.
@@ -2721,6 +1929,40 @@ run_groups:
2721
1929
  when: n/a
2722
1930
  language: synthetic
2723
1931
 
1932
+ - name: med_qa
1933
+ display_name: MedQA
1934
+ description: MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).
1935
+ metric_groups:
1936
+ - accuracy
1937
+ - efficiency
1938
+ - general_information
1939
+ environment:
1940
+ main_name: quasi_exact_match
1941
+ main_split: test
1942
+ taxonomy:
1943
+ task: question answering
1944
+ what: n/a
1945
+ who: n/a
1946
+ when: n/a
1947
+ language: English
1948
+
1949
+ - name: wmt_14
1950
+ display_name: WMT 2014
1951
+ description: WMT 2014 is a collection of machine translation datasets.
1952
+ metric_groups:
1953
+ - accuracy
1954
+ - efficiency
1955
+ - general_information
1956
+ environment:
1957
+ main_name: bleu_4
1958
+ main_split: test
1959
+ taxonomy:
1960
+ task: machine translation
1961
+ what: n/a
1962
+ who: n/a
1963
+ when: n/a
1964
+ language: English
1965
+
2724
1966
  - name: lextreme
2725
1967
  display_name: LEXTREME
2726
1968
  description: A Multilingual Legal Benchmark for Natural Language Understanding
@@ -2981,6 +2223,7 @@ run_groups:
2981
2223
  main_split: test
2982
2224
  adapter_keys_shown:
2983
2225
  - model
2226
+ - model_deployment
2984
2227
  - max_tokens
2985
2228
  taxonomy:
2986
2229
  task: "?"
@@ -3402,7 +2645,7 @@ run_groups:
3402
2645
 
3403
2646
  - name: cleva_mathematical_reasoning
3404
2647
  display_name: CLEVA (Chinese) mathematical reasoning
3405
- description: "Scenario that tests models' mathematical reasoning ability with chain-of-thoughts style reasoning. It contains a math word problem solving subtask."
2648
+ description: "Scenario that tests models' mathematical reasoning ability with chain-of-thought style reasoning. It contains a math word problem solving subtask."
3406
2649
  metric_groups:
3407
2650
  - cleva_mathematical_reasoning_metrics
3408
2651
  - general_information
@@ -3449,7 +2692,7 @@ run_groups:
3449
2692
  main_split: test
3450
2693
  taxonomy:
3451
2694
  task: toxicity classification
3452
- what: text from Chinese social media
2695
+ what: text from Chinese social media
3453
2696
  who: web users
3454
2697
  when: 2022 or before
3455
2698
  language: Chinese
@@ -3649,3 +2892,176 @@ run_groups:
3649
2892
  task: user-facing tasks
3650
2893
  language: English dialects
3651
2894
  todo: true
2895
+
2896
+
2897
+ # DecodingTrust scenarios
2898
+ - name: decodingtrust_adv_robustness
2899
+ display_name: DecodingTrust - AdvGLUE++
2900
+ short_display_name: AdvGLUE++
2901
+ description: Adversarial perturbations of the GLUE dataset generated against open-source LLMs including Alpaca, Vicuna, and Stable-Vicuna
2902
+ metric_groups:
2903
+ - accuracy
2904
+ - calibration
2905
+ - efficiency
2906
+ - general_information
2907
+ environment:
2908
+ main_name: quasi_exact_match
2909
+ main_split: test
2910
+ taxonomy:
2911
+ task: text classification
2912
+ what: "?"
2913
+ who: "?"
2914
+ when: "?"
2915
+ language: English
2916
+ todo: true
2917
+
2918
+ - name: decodingtrust_adv_demonstration
2919
+ display_name: DecodingTrust - Adversarial Demonstrations
2920
+ short_display_name: AdvDemo
2921
+ description: Robustness analysis of LM generations when facing adversarial demonstrations
2922
+ metric_groups:
2923
+ - accuracy
2924
+ - calibration
2925
+ - efficiency
2926
+ - general_information
2927
+ environment:
2928
+ main_name: quasi_exact_match
2929
+ main_split: test
2930
+ taxonomy:
2931
+ task: text classification
2932
+ what: "?"
2933
+ who: "?"
2934
+ when: "?"
2935
+ language: English
2936
+
2937
+ - name: decodingtrust_ood_robustness
2938
+ display_name: DecodingTrust - OoD Robustness
2939
+ short_display_name: OoD
2940
+ description: Style perturbations of GLUE datasets (OoD styles) and out-of-scope OoD knowledge evaluations
2941
+ metric_groups:
2942
+ - accuracy
2943
+ - calibration
2944
+ - efficiency
2945
+ - general_information
2946
+ - decodingtrust_ood_metrics
2947
+ environment:
2948
+ main_name: quasi_exact_match
2949
+ main_split: test
2950
+ taxonomy:
2951
+ task: text classification
2952
+ what: "?"
2953
+ who: "?"
2954
+ when: "?"
2955
+ language: English
2956
+
2957
+ - name: decodingtrust_fairness
2958
+ display_name: DecodingTrust - Fairness
2959
+ short_display_name: Fairness
2960
+ description: Fairness analysis of LLMs
2961
+ metric_groups:
2962
+ - accuracy
2963
+ - calibration
2964
+ - efficiency
2965
+ - general_information
2966
+ - decodingtrust_fairness_metrics
2967
+ environment:
2968
+ main_name: quasi_exact_match
2969
+ main_split: test
2970
+ taxonomy:
2971
+ task: text classification
2972
+ what: "?"
2973
+ who: "?"
2974
+ when: "?"
2975
+ language: English
2976
+
2977
+ - name: decodingtrust_privacy
2978
+ display_name: DecodingTrust - Privacy
2979
+ short_display_name: Privacy
2980
+ description: Evaluation of the privacy understanding and privacy preserving properties of LLMs
2981
+ metric_groups:
2982
+ - accuracy
2983
+ - calibration
2984
+ - efficiency
2985
+ - general_information
2986
+ environment:
2987
+ main_name: quasi_exact_match
2988
+ main_split: test
2989
+ taxonomy:
2990
+ task: text classification
2991
+ what: "?"
2992
+ who: "?"
2993
+ when: "?"
2994
+ language: English
2995
+
2996
+ - name: decodingtrust_machine_ethics
2997
+ display_name: DecodingTrust - Ethics
2998
+ short_display_name: Ethics
2999
+ description: Evaluation of the understanding of ethical behaviors of LLMs
3000
+ metric_groups:
3001
+ - accuracy
3002
+ - calibration
3003
+ - efficiency
3004
+ - general_information
3005
+ environment:
3006
+ main_name: quasi_exact_match
3007
+ main_split: test
3008
+ taxonomy:
3009
+ task: text classification
3010
+ what: "?"
3011
+ who: "?"
3012
+ when: "?"
3013
+ language: English
3014
+
3015
+ - name: decodingtrust_toxicity_prompts
3016
+ display_name: DecodingTrust - Toxicity
3017
+ short_display_name: Toxicity
3018
+ description: Evaluation of the propensity of LLMs to generate toxic content when prompted with toxicity-eliciting prompts
3019
+ metric_groups:
3020
+ - toxicity
3021
+ - bias
3022
+ - efficiency
3023
+ - general_information
3024
+ environment:
3025
+ main_split: test
3026
+ taxonomy:
3027
+ task: "?"
3028
+ what: n/a
3029
+ who: n/a
3030
+ when: n/a
3031
+ language: synthetic
3032
+
3033
+ - name: decodingtrust_stereotype_bias
3034
+ display_name: DecodingTrust - Stereotype Bias
3035
+ short_display_name: Stereotype
3036
+ description: Manually crafted stereotype user prompts from DecodingTrust
3037
+ metric_groups:
3038
+ - toxicity
3039
+ - bias
3040
+ - efficiency
3041
+ - general_information
3042
+ - decodingtrust_stereotype_bias_metrics
3043
+ environment:
3044
+ main_split: test
3045
+ taxonomy:
3046
+ task: "?"
3047
+ what: n/a
3048
+ who: n/a
3049
+ when: n/a
3050
+ language: synthetic
3051
+
3052
+ - name: thai_exam
3053
+ display_name: Thai Exam
3054
+ short_display_name: ThaiExam
3055
+ description: A benchmark comprising Thai multiple-choice examinations.
3056
+ metric_groups:
3057
+ - accuracy
3058
+ - general_information
3059
+ environment:
3060
+ main_name: exact_match
3061
+ main_split: test
3062
+ taxonomy:
3063
+ task: question answering
3064
+ what: "?"
3065
+ who: "?"
3066
+ when: "?"
3067
+ language: Thai