crfm-helm 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482) hide show
  1. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +134 -31
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +31 -3
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
  9. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
  10. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  11. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
  12. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  13. helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
  14. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
  15. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
  16. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  17. helm/benchmark/adaptation/request_state.py +6 -1
  18. helm/benchmark/adaptation/scenario_state.py +6 -2
  19. helm/benchmark/annotation/annotator.py +43 -0
  20. helm/benchmark/annotation/annotator_factory.py +61 -0
  21. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  22. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  23. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  24. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  25. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  26. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  27. helm/benchmark/annotation_executor.py +124 -0
  28. helm/benchmark/augmentations/data_augmenter.py +0 -2
  29. helm/benchmark/augmentations/gender_perturbation.py +1 -1
  30. helm/benchmark/augmentations/perturbation.py +8 -2
  31. helm/benchmark/augmentations/perturbation_description.py +1 -1
  32. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  33. helm/benchmark/augmentations/test_perturbation.py +11 -7
  34. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  35. helm/benchmark/config_registry.py +7 -1
  36. helm/benchmark/executor.py +46 -16
  37. helm/benchmark/huggingface_registration.py +20 -7
  38. helm/benchmark/metrics/basic_metrics.py +169 -664
  39. helm/benchmark/metrics/bbq_metrics.py +3 -4
  40. helm/benchmark/metrics/bias_metrics.py +6 -6
  41. helm/benchmark/metrics/classification_metrics.py +11 -8
  42. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  43. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  44. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  45. helm/benchmark/metrics/common_metric_specs.py +167 -0
  46. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  47. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  48. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  49. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  50. helm/benchmark/metrics/disinformation_metrics.py +4 -110
  51. helm/benchmark/metrics/dry_run_metrics.py +2 -2
  52. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  53. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  54. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  55. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  56. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  57. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  58. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  59. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  60. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  61. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  62. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  63. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  64. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  65. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  66. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  67. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  68. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  69. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  70. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  71. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  72. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  73. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  74. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  75. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  76. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  77. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  78. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  79. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  80. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  81. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  82. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  83. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  84. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  85. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  86. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  87. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  88. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  89. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  90. helm/benchmark/metrics/metric.py +93 -172
  91. helm/benchmark/metrics/metric_name.py +0 -1
  92. helm/benchmark/metrics/metric_service.py +16 -0
  93. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  94. helm/benchmark/metrics/ranking_metrics.py +2 -2
  95. helm/benchmark/metrics/reference_metric.py +148 -0
  96. helm/benchmark/metrics/summac/model_summac.py +0 -2
  97. helm/benchmark/metrics/summarization_metrics.py +2 -2
  98. helm/benchmark/metrics/test_classification_metrics.py +8 -5
  99. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  100. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
  101. helm/benchmark/metrics/test_metric.py +2 -2
  102. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
  103. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  104. helm/benchmark/metrics/toxicity_utils.py +23 -0
  105. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  106. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  107. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  108. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  109. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  110. helm/benchmark/model_deployment_registry.py +74 -0
  111. helm/benchmark/model_metadata_registry.py +36 -0
  112. helm/benchmark/multi_gpu_runner.py +133 -0
  113. helm/benchmark/presentation/create_plots.py +8 -7
  114. helm/benchmark/presentation/run_display.py +26 -10
  115. helm/benchmark/presentation/schema.py +15 -40
  116. helm/benchmark/presentation/summarize.py +119 -79
  117. helm/benchmark/presentation/table.py +8 -8
  118. helm/benchmark/presentation/test_contamination.py +2 -2
  119. helm/benchmark/presentation/test_run_entry.py +1 -2
  120. helm/benchmark/presentation/test_summarize.py +3 -3
  121. helm/benchmark/run.py +54 -26
  122. helm/benchmark/run_expander.py +214 -16
  123. helm/benchmark/run_spec.py +93 -0
  124. helm/benchmark/run_spec_factory.py +162 -0
  125. helm/benchmark/run_specs/__init__.py +0 -0
  126. helm/benchmark/run_specs/classic_run_specs.py +1510 -0
  127. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  128. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  129. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  130. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  131. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  132. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  133. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  134. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  135. helm/benchmark/runner.py +51 -57
  136. helm/benchmark/runner_config_registry.py +21 -0
  137. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  138. helm/benchmark/scenarios/bold_scenario.py +2 -2
  139. helm/benchmark/scenarios/code_scenario.py +1 -0
  140. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  141. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  142. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  143. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  144. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  145. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  146. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  147. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  148. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  149. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  150. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  151. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  152. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  153. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  154. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  155. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  156. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  157. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  158. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  159. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  160. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  161. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  162. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  163. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  164. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  165. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  166. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  167. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  168. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  169. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  170. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  171. helm/benchmark/scenarios/math_scenario.py +19 -2
  172. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  173. helm/benchmark/scenarios/numeracy_scenario.py +1 -1
  174. helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
  175. helm/benchmark/scenarios/scenario.py +4 -0
  176. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  177. helm/benchmark/scenarios/test_math_scenario.py +6 -0
  178. helm/benchmark/scenarios/test_scenario.py +6 -3
  179. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  180. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  181. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  182. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  183. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  184. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  185. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  186. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  187. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  188. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  189. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  190. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  191. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  192. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  193. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  194. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  195. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  196. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  197. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  198. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  199. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  200. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  201. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  202. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  203. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  204. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  205. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -2
  206. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  207. helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
  208. helm/benchmark/server.py +24 -1
  209. helm/benchmark/slurm_runner.py +70 -49
  210. helm/benchmark/static/benchmarking.js +1 -1
  211. helm/benchmark/static/schema_classic.yaml +258 -1066
  212. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  213. helm/benchmark/static/schema_lite.yaml +2 -227
  214. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  215. helm/benchmark/static/schema_unitxt.yaml +428 -0
  216. helm/benchmark/static/schema_vlm.yaml +576 -0
  217. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  218. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  219. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  220. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  221. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  222. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  223. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  224. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  225. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  226. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  227. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  228. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  229. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  230. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  231. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  232. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  233. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  234. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  235. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  236. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  237. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  238. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  239. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  240. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  241. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  242. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  243. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  244. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  245. helm/benchmark/static_build/config.js +4 -0
  246. helm/benchmark/static_build/index.html +20 -0
  247. helm/benchmark/test_data_preprocessor.py +3 -3
  248. helm/benchmark/test_model_deployment_definition.py +14 -16
  249. helm/benchmark/test_run_expander.py +1 -1
  250. helm/benchmark/window_services/ai21_window_service.py +22 -33
  251. helm/benchmark/window_services/cohere_window_service.py +1 -63
  252. helm/benchmark/window_services/default_window_service.py +2 -44
  253. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  254. helm/benchmark/window_services/ice_window_service.py +0 -34
  255. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  256. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  257. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  258. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  259. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  260. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  261. helm/benchmark/window_services/local_window_service.py +21 -4
  262. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  263. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  264. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  265. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  266. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  267. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  268. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  269. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  270. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  271. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  272. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  273. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  274. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  275. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  276. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  277. helm/benchmark/window_services/test_utils.py +3 -2
  278. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  279. helm/benchmark/window_services/window_service.py +42 -0
  280. helm/benchmark/window_services/window_service_factory.py +4 -1
  281. helm/benchmark/window_services/yalm_window_service.py +0 -27
  282. helm/clients/__init__.py +0 -0
  283. helm/{proxy/clients → clients}/ai21_client.py +3 -9
  284. helm/clients/aleph_alpha_client.py +112 -0
  285. helm/{proxy/clients → clients}/anthropic_client.py +203 -18
  286. helm/{proxy/clients → clients}/auto_client.py +59 -31
  287. helm/clients/bedrock_client.py +128 -0
  288. helm/clients/bedrock_utils.py +72 -0
  289. helm/{proxy/clients → clients}/client.py +65 -7
  290. helm/clients/clip_score_client.py +49 -0
  291. helm/clients/clip_scorers/__init__.py +0 -0
  292. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  293. helm/clients/clip_scorers/clip_scorer.py +50 -0
  294. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  295. helm/{proxy/clients → clients}/cohere_client.py +4 -11
  296. helm/clients/gcs_client.py +82 -0
  297. helm/{proxy/clients → clients}/google_client.py +5 -5
  298. helm/clients/google_translate_client.py +35 -0
  299. helm/{proxy/clients → clients}/http_model_client.py +5 -7
  300. helm/{proxy/clients → clients}/huggingface_client.py +43 -64
  301. helm/clients/image_generation/__init__.py +0 -0
  302. helm/clients/image_generation/adobe_vision_client.py +78 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  304. helm/clients/image_generation/cogview2/__init__.py +0 -0
  305. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  306. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  307. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  308. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  309. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  310. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  311. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  312. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  313. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  314. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  315. helm/clients/image_generation/cogview2_client.py +191 -0
  316. helm/clients/image_generation/dalle2_client.py +192 -0
  317. helm/clients/image_generation/dalle3_client.py +108 -0
  318. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  319. helm/clients/image_generation/dalle_mini/data.py +442 -0
  320. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  321. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  322. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  323. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  324. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  325. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  326. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  327. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  328. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  329. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  330. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  331. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  332. helm/clients/image_generation/dalle_mini_client.py +190 -0
  333. helm/clients/image_generation/deep_floyd_client.py +78 -0
  334. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  335. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  336. helm/clients/image_generation/lexica_client.py +86 -0
  337. helm/clients/image_generation/mindalle/__init__.py +0 -0
  338. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  339. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  340. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  341. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  342. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  343. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  344. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  345. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  346. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  347. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  348. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  349. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  350. helm/clients/image_generation/mindalle_client.py +115 -0
  351. helm/clients/image_generation/nudity_check_client.py +64 -0
  352. helm/clients/image_generation/together_image_generation_client.py +111 -0
  353. helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
  354. helm/{proxy/clients → clients}/megatron_client.py +5 -5
  355. helm/clients/mistral_client.py +134 -0
  356. helm/clients/moderation_api_client.py +109 -0
  357. helm/clients/open_lm_client.py +43 -0
  358. helm/clients/openai_client.py +302 -0
  359. helm/{proxy/clients → clients}/palmyra_client.py +6 -8
  360. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  361. helm/clients/simple_client.py +64 -0
  362. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  363. helm/clients/test_client.py +100 -0
  364. helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
  365. helm/clients/test_simple_client.py +19 -0
  366. helm/{proxy/clients → clients}/test_together_client.py +20 -8
  367. helm/{proxy/clients → clients}/together_client.py +12 -72
  368. helm/clients/vertexai_client.py +391 -0
  369. helm/clients/vision_language/__init__.py +0 -0
  370. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  371. helm/{proxy/clients → clients}/vision_language/idefics_client.py +53 -48
  372. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  373. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  374. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  375. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  376. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  377. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  378. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  379. helm/clients/vision_language/open_flamingo_client.py +155 -0
  380. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  381. helm/clients/vllm_client.py +46 -0
  382. helm/common/cache.py +16 -4
  383. helm/common/cache_backend_config.py +47 -0
  384. helm/common/clip_score_request.py +41 -0
  385. helm/common/file_caches/__init__.py +0 -0
  386. helm/common/file_caches/file_cache.py +16 -0
  387. helm/common/file_caches/local_file_cache.py +61 -0
  388. helm/common/file_caches/test_local_file_cache.py +25 -0
  389. helm/common/file_upload_request.py +27 -0
  390. helm/common/general.py +1 -1
  391. helm/common/image_generation_parameters.py +25 -0
  392. helm/common/images_utils.py +24 -1
  393. helm/common/key_value_store.py +35 -4
  394. helm/common/media_object.py +13 -0
  395. helm/common/moderations_api_request.py +71 -0
  396. helm/common/mongo_key_value_store.py +3 -3
  397. helm/common/multimodal_request_utils.py +31 -0
  398. helm/common/nudity_check_request.py +29 -0
  399. helm/common/request.py +15 -17
  400. helm/common/test_general.py +6 -0
  401. helm/common/tokenization_request.py +1 -1
  402. helm/config/model_deployments.yaml +1069 -546
  403. helm/config/model_metadata.yaml +753 -31
  404. helm/config/tokenizer_configs.yaml +142 -43
  405. helm/proxy/accounts.py +31 -4
  406. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  407. helm/proxy/critique/model_critique_client.py +8 -6
  408. helm/proxy/example_queries.py +29 -17
  409. helm/proxy/server.py +70 -5
  410. helm/proxy/services/remote_service.py +31 -0
  411. helm/proxy/services/server_service.py +96 -16
  412. helm/proxy/services/service.py +30 -0
  413. helm/proxy/services/test_remote_service.py +4 -3
  414. helm/proxy/services/test_service.py +0 -12
  415. helm/proxy/test_accounts.py +32 -0
  416. helm/proxy/token_counters/auto_token_counter.py +37 -37
  417. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  418. helm/proxy/token_counters/token_counter.py +3 -5
  419. helm/tokenizers/__init__.py +0 -0
  420. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  421. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
  422. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
  423. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  424. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  425. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
  426. helm/tokenizers/simple_tokenizer.py +33 -0
  427. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
  428. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
  429. helm/tokenizers/test_simple_tokenizer.py +33 -0
  430. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  431. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  432. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  433. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  434. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  435. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  436. helm/benchmark/run_specs.py +0 -2762
  437. helm/benchmark/test_model_properties.py +0 -1570
  438. helm/benchmark/vlm_run_specs.py +0 -97
  439. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  440. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  441. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  442. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  443. helm/benchmark/window_services/t511b_window_service.py +0 -30
  444. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  445. helm/benchmark/window_services/ul2_window_service.py +0 -30
  446. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  447. helm/common/cache_utils.py +0 -14
  448. helm/proxy/clients/aleph_alpha_client.py +0 -95
  449. helm/proxy/clients/goose_ai_client.py +0 -99
  450. helm/proxy/clients/microsoft_client.py +0 -180
  451. helm/proxy/clients/openai_client.py +0 -206
  452. helm/proxy/clients/simple_client.py +0 -60
  453. helm/proxy/clients/test_client.py +0 -49
  454. helm/proxy/clients/vertexai_client.py +0 -115
  455. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  456. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  457. helm/proxy/token_counters/free_token_counter.py +0 -12
  458. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  459. helm/proxy/token_counters/openai_token_counter.py +0 -22
  460. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  461. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  462. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  463. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  464. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  465. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  466. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  467. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  468. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  469. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  470. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  471. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  472. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  473. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  474. /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
  475. /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
  476. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  477. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  478. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  479. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  480. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  481. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  482. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -1,1068 +1,4 @@
1
1
  ---
2
- ############################################################
3
- models:
4
- # AI21 Labs
5
- - name: ai21/j1-jumbo
6
- display_name: J1-Jumbo v1 (178B)
7
- description: Jurassic-1 Jumbo (178B parameters) ([docs](https://studio.ai21.com/docs/jurassic1-language-models/), [tech report](https://uploads-ssl.webflow.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf)).
8
- creator_organization: AI21 Labs
9
- access: limited
10
- num_parameters: 178000000000
11
- release_date: 2021-08-11
12
- - name: ai21/j1-large
13
- display_name: J1-Large v1 (7.5B)
14
- description: Jurassic-1 Large (7.5B parameters) ([docs](https://studio.ai21.com/docs/jurassic1-language-models/), [tech report](https://uploads-ssl.webflow.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf)).
15
- creator_organization: AI21 Labs
16
- access: limited
17
- num_parameters: 7500000000
18
- release_date: 2021-08-11
19
- - name: ai21/j1-grande
20
- display_name: J1-Grande v1 (17B)
21
- description: Jurassic-1 Grande (17B parameters) with a "few tweaks" to the training process ([docs](https://studio.ai21.com/docs/jurassic1-language-models/), [tech report](https://uploads-ssl.webflow.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf)).
22
- creator_organization: AI21 Labs
23
- access: limited
24
- num_parameters: 17000000000
25
- release_date: 2022-05-03
26
- - name: ai21/j1-grande-v2-beta
27
- display_name: J1-Grande v2 beta (17B)
28
- description: Jurassic-1 Grande v2 beta (17B parameters)
29
- creator_organization: AI21 Labs
30
- access: limited
31
- num_parameters: 17000000000
32
- release_date: 2022-10-28
33
- - name: ai21/j2-jumbo
34
- display_name: Jurassic-2 Jumbo (178B)
35
- description: Jurassic-2 Jumbo (178B parameters) ([docs](https://www.ai21.com/blog/introducing-j2))
36
- creator_organization: AI21 Labs
37
- access: limited
38
- num_parameters: 178000000000
39
- release_date: 2023-03-09
40
- - name: ai21/j2-grande
41
- display_name: Jurassic-2 Grande (17B)
42
- description: Jurassic-2 Grande (17B parameters) ([docs](https://www.ai21.com/blog/introducing-j2))
43
- creator_organization: AI21 Labs
44
- access: limited
45
- num_parameters: 17000000000
46
- release_date: 2023-03-09
47
- - name: ai21/j2-large
48
- display_name: Jurassic-2 Large (7.5B)
49
- description: Jurassic-2 Large (7.5B parameters) ([docs](https://www.ai21.com/blog/introducing-j2))
50
- creator_organization: AI21 Labs
51
- access: limited
52
- num_parameters: 7500000000
53
- release_date: 2023-03-09
54
-
55
- # Aleph Alpha
56
- # TODO: add Luminous World when it's released
57
- - name: AlephAlpha/luminous-base
58
- display_name: Luminous Base (13B)
59
- description: Luminous Base (13B parameters) ([docs](https://docs.aleph-alpha.com/docs/introduction/luminous/))
60
- creator_organization: Aleph Alpha
61
- access: limited
62
- num_parameters: 13000000000
63
- # TODO: get exact release date
64
- release_date: 2022-01-01
65
- - name: AlephAlpha/luminous-extended
66
- display_name: Luminous Extended (30B)
67
- description: Luminous Extended (30B parameters) ([docs](https://docs.aleph-alpha.com/docs/introduction/luminous/))
68
- creator_organization: Aleph Alpha
69
- access: limited
70
- num_parameters: 30000000000
71
- release_date: 2022-01-01
72
- - name: AlephAlpha/luminous-supreme
73
- display_name: Luminous Supreme (70B)
74
- description: Luminous Supreme (70B parameters) ([docs](https://docs.aleph-alpha.com/docs/introduction/luminous/))
75
- creator_organization: Aleph Alpha
76
- access: limited
77
- num_parameters: 70000000000
78
- release_date: 2022-01-01
79
-
80
- # TODO: Remove Once we have configurable model names
81
- - name: neurips/local
82
- display_name: Local service
83
- description: Local competition service
84
- creator_organization: neurips
85
- access: open
86
- num_parameters: 1
87
- release_date: 2021-12-01
88
-
89
-
90
- # Anthropic
91
- - name: anthropic/stanford-online-all-v4-s3
92
- display_name: Anthropic-LM v4-s3 (52B)
93
- description: A 52B parameter language model, trained using reinforcement learning from human feedback [paper](https://arxiv.org/pdf/2204.05862.pdf).
94
- creator_organization: Anthropic
95
- access: closed
96
- num_parameters: 52000000000
97
- release_date: 2021-12-01
98
- - name: anthropic/claude-2.0
99
- display_name: Anthropic Claude 2.0
100
- description: Claude 2.0 is a general purpose large language model developed by Anthropic. It uses a transformer architecture and is trained via unsupervised learning, RLHF, and Constitutional AI (including both a supervised and Reinforcement Learning (RL) phase). ([model card](https://efficient-manatee.files.svdcdn.com/production/images/Model-Card-Claude-2.pdf))
101
- creator_organization: Anthropic
102
- access: limited
103
- release_date: 2023-07-11
104
- - name: anthropic/claude-2.1
105
- display_name: Anthropic Claude 2.1
106
- description: Claude 2.1 is a general purpose large language model developed by Anthropic. It uses a transformer architecture and is trained via unsupervised learning, RLHF, and Constitutional AI (including both a supervised and Reinforcement Learning (RL) phase). ([model card](https://efficient-manatee.files.svdcdn.com/production/images/Model-Card-Claude-2.pdf))
107
- creator_organization: Anthropic
108
- access: limited
109
- release_date: 2023-11-21
110
- - name: anthropic/claude-v1.3
111
- display_name: Anthropic Claude v1.3
112
- description: A model trained using reinforcement learning from human feedback ([docs](https://www.anthropic.com/index/introducing-claude)).
113
- creator_organization: Anthropic
114
- access: limited
115
- release_date: 2023-03-17
116
- - name: anthropic/claude-instant-v1
117
- display_name: Anthropic Claude Instant V1
118
- description: A lightweight version of Claude, a model trained using reinforcement learning from human feedback ([docs](https://www.anthropic.com/index/introducing-claude)).
119
- creator_organization: Anthropic
120
- access: limited
121
- release_date: 2023-03-17
122
- - name: anthropic/claude-instant-1.2
123
- display_name: Anthropic Claude Instant 1.2
124
- description: A lightweight version of Claude, a model trained using reinforcement learning from human feedback ([docs](https://www.anthropic.com/index/introducing-claude)).
125
- creator_organization: Anthropic
126
- access: limited
127
- release_date: 2023-08-09
128
-
129
- # Berkeley
130
- - name: together/koala-13b
131
- display_name: Koala (13B)
132
- description: Koala (13B) is a chatbot fine-tuned from Llama (13B) on dialogue data gathered from the web. ([blog post](https://bair.berkeley.edu/blog/2023/04/03/koala/))
133
- creator_organization: UC Berkeley
134
- access: open
135
- num_parameters: 13000000000
136
- release_date: 2022-04-03
137
- todo: true
138
-
139
- # BigScience
140
- - name: together/bloom
141
- display_name: BLOOM (176B)
142
- description: BLOOM (176B parameters) is an autoregressive model trained on 46 natural languages and 13 programming languages ([paper](https://arxiv.org/pdf/2211.05100.pdf)).
143
- creator_organization: BigScience
144
- access: open
145
- num_parameters: 176000000000
146
- release_date: 2022-06-28
147
- - name: together/bloomz
148
- display_name: BLOOMZ (176B)
149
- description: BLOOMZ (176B parameters) is BLOOM that has been fine-tuned on natural language instructions ([details](https://huggingface.co/bigscience/bloomz)).
150
- creator_organization: BigScience
151
- access: open
152
- num_parameters: 176000000000
153
- release_date: 2022-11-03
154
- todo: true
155
- - name: together/t0pp
156
- display_name: T0pp (11B)
157
- description: T0pp (11B parameters) is an encoder-decoder model trained on a large set of different tasks specified in natural language prompts ([paper](https://arxiv.org/pdf/2110.08207.pdf)).
158
- creator_organization: BigScience
159
- access: open
160
- num_parameters: 11000000000
161
- release_date: 2021-10-15
162
-
163
- # BigCode
164
- - name: huggingface/santacoder
165
- display_name: SantaCoder (1.1B)
166
- description: SantaCoder (1.1B parameters) model trained on the Python, Java, and JavaScript subset of The Stack (v1.1) ([model card](https://huggingface.co/bigcode/santacoder)).
167
- creator_organization: BigCode
168
- access: open
169
- - name: huggingface/starcoder
170
- display_name: StarCoder (15.5B)
171
- description: The StarCoder (15.5B parameter) model trained on 80+ programming languages from The Stack (v1.2) ([model card](https://huggingface.co/bigcode/starcoder)).
172
- creator_organization: BigCode
173
- access: open
174
-
175
- # Hugging Face
176
- - name: huggingface/gpt2
177
- display_name: GPT-2 (124M)
178
- description: GPT-2 is a transformers model pretrained on a very large corpus of English data in a self-supervised fashion. This means it was pretrained on the raw texts only, with no humans labelling them in any way (which is why it can use lots of publicly available data) with an automatic process to generate inputs and labels from those texts.
179
- creator_organization: OpenAI
180
- access: open
181
- num_parameters: 124000000
182
- - name: huggingface/gpt2-medium
183
- display_name: GPT-2 Medium (355M)
184
- description: GPT-2 Medium is the 355M parameter version of GPT-2, a transformer-based language model created and released by OpenAI. The model is a pretrained model on English language using a causal language modeling (CLM) objective.
185
- creator_organization: OpenAI
186
- access: open
187
- num_parameters: 355000000
188
- - name: huggingface/gpt2-large
189
- display_name: GPT-2 Large (774M)
190
- description: GPT-2 Large is the 774M parameter version of GPT-2, a transformer-based language model created and released by OpenAI. The model is a pretrained model on English language using a causal language modeling (CLM) objective.
191
- creator_organization: OpenAI
192
- access: open
193
- num_parameters: 774000000
194
- - name: huggingface/gpt2-xl
195
- display_name: GPT-2 XL (1.5B)
196
- description: GPT-2 XL is the 1.5B parameter version of GPT-2, a transformer-based language model created and released by OpenAI. The model is a pretrained model on English language using a causal language modeling (CLM) objective.
197
- creator_organization: OpenAI
198
- access: open
199
- num_parameters: 1500000000
200
-
201
- # HuggignfaceM4
202
- - name: HuggingFaceM4/idefics-9b
203
- display_name: IDEFICS (9B)
204
- description: IDEFICS (9B parameters) is an open-source model based on DeepMind's Flamingo. ([blog](https://huggingface.co/blog/idefics))
205
- creator_organization: HuggingFace
206
- access: open
207
- num_parameters: 9000000000
208
- release_date: 2023-08-22
209
- - name: HuggingFaceM4/idefics-9b-instruct
210
- display_name: IDEFICS instruct (9B)
211
- description: IDEFICS instruct (9B parameters) is an open-source model based on DeepMind's Flamingo. ([blog](https://huggingface.co/blog/idefics))
212
- creator_organization: HuggingFace
213
- access: open
214
- num_parameters: 9000000000
215
- release_date: 2023-08-22
216
- - name: HuggingFaceM4/idefics-80b
217
- display_name: IDEFICS (80B)
218
- description: IDEFICS (80B parameters) is an open-source model based on DeepMind's Flamingo. ([blog](https://huggingface.co/blog/idefics))
219
- creator_organization: HuggingFace
220
- access: open
221
- num_parameters: 80000000000
222
- release_date: 2023-08-22
223
- - name: HuggingFaceM4/idefics-80b-instruct
224
- display_name: IDEFICS instruct (80B)
225
- description: IDEFICS instruct (80B parameters) is an open-source model based on DeepMind's Flamingo. ([blog](https://huggingface.co/blog/idefics))
226
- creator_organization: HuggingFace
227
- access: open
228
- num_parameters: 80000000000
229
- release_date: 2023-08-22
230
-
231
- # Cerebras Systems
232
- - name: together/cerebras-gpt-6.7b
233
- display_name: Cerebras GPT (6.7B)
234
- description: Cerebras GPT is a family of open compute-optimal language models scaled from 111M to 13B parameters trained on the Eleuther Pile. ([paper](https://arxiv.org/pdf/2304.03208.pdf))
235
- creator_organization: Cerebras
236
- access: limited
237
- num_parameters: 6700000000
238
- release_date: 2023-04-06
239
- todo: true
240
- - name: together/cerebras-gpt-13b
241
- display_name: Cerebras GPT (13B)
242
- description: Cerebras GPT is a family of open compute-optimal language models scaled from 111M to 13B parameters trained on the Eleuther Pile. ([paper](https://arxiv.org/pdf/2304.03208.pdf))
243
- creator_organization: Cerebras
244
- access: limited
245
- num_parameters: 13000000000
246
- release_date: 2023-04-06
247
- todo: true
248
-
249
- # Cohere
250
- - name: cohere/xlarge-20220609
251
- display_name: Cohere xlarge v20220609 (52.4B)
252
- description: Cohere xlarge v20220609 (52.4B parameters)
253
- creator_organization: Cohere
254
- access: limited
255
- num_parameters: 52400000000
256
- release_date: 2022-06-09
257
- - name: cohere/large-20220720
258
- display_name: Cohere large v20220720 (13.1B)
259
- description: Cohere large v20220720 (13.1B parameters), which is deprecated by Cohere as of December 2, 2022.
260
- creator_organization: Cohere
261
- access: limited
262
- num_parameters: 13100000000
263
- release_date: 2022-07-20
264
- - name: cohere/medium-20220720
265
- display_name: Cohere medium v20220720 (6.1B)
266
- description: Cohere medium v20220720 (6.1B parameters)
267
- creator_organization: Cohere
268
- access: limited
269
- num_parameters: 6100000000
270
- release_date: 2022-07-20
271
- - name: cohere/small-20220720
272
- display_name: Cohere small v20220720 (410M)
273
- description: Cohere small v20220720 (410M parameters), which is deprecated by Cohere as of December 2, 2022.
274
- creator_organization: Cohere
275
- access: limited
276
- num_parameters: 410000000
277
- release_date: 2022-07-20
278
- - name: cohere/xlarge-20221108
279
- display_name: Cohere xlarge v20221108 (52.4B)
280
- description: Cohere xlarge v20221108 (52.4B parameters)
281
- creator_organization: Cohere
282
- access: limited
283
- num_parameters: 52400000000
284
- release_date: 2022-11-08
285
- - name: cohere/medium-20221108
286
- display_name: Cohere medium v20221108 (6.1B)
287
- description: Cohere medium v20221108 (6.1B parameters)
288
- creator_organization: Cohere
289
- access: limited
290
- num_parameters: 6100000000
291
- release_date: 2022-11-08
292
- - name: cohere/command-medium-beta
293
- display_name: Cohere Command beta (6.1B)
294
- description: Cohere Command beta (6.1B parameters) is fine-tuned from the medium model to respond well with instruction-like prompts ([details](https://docs.cohere.ai/docs/command-beta)).
295
- creator_organization: Cohere
296
- access: limited
297
- num_parameters: 6100000000
298
- release_date: 2022-11-08
299
- - name: cohere/command-xlarge-beta
300
- display_name: Cohere Command beta (52.4B)
301
- description: Cohere Command beta (52.4B parameters) is fine-tuned from the XL model to respond well with instruction-like prompts ([details](https://docs.cohere.ai/docs/command-beta)).
302
- creator_organization: Cohere
303
- access: limited
304
- num_parameters: 52400000000
305
- release_date: 2022-11-08
306
- - name: cohere/command
307
- display_name: Cohere Command
308
- description: Command is Cohere’s flagship text generation model. It is trained to follow user commands and to be instantly useful in practical business applications. [docs](https://docs.cohere.com/reference/generate) and [changelog](https://docs.cohere.com/changelog)
309
- creator_organization: Cohere
310
- access: limited
311
- release_date: 2023-09-29
312
- - name: cohere/command-light
313
- display_name: Cohere Command Light
314
- description: Command is Cohere’s flagship text generation model. It is trained to follow user commands and to be instantly useful in practical business applications. [docs](https://docs.cohere.com/reference/generate) and [changelog](https://docs.cohere.com/changelog)
315
- creator_organization: Cohere
316
- access: limited
317
- release_date: 2023-09-29
318
-
319
- # Databricks
320
- - name: databricks/dolly-v2-3b
321
- display_name: Dolly V2 (3B)
322
- description: Dolly V2 (3B) is an instruction-following large language model trained on the Databricks machine learning platform. It is based on pythia-12b.
323
- creator_organization: Databricks
324
- access: open
325
- num_parameters: 2517652480
326
- release_date: 2023-04-12
327
- todo: true
328
- - name: databricks/dolly-v2-7b
329
- display_name: Dolly V2 (7B)
330
- description: Dolly V2 (7B) is an instruction-following large language model trained on the Databricks machine learning platform. It is based on pythia-12b.
331
- creator_organization: Databricks
332
- access: open
333
- num_parameters: 6444163072
334
- release_date: 2023-04-12
335
- todo: true
336
- - name: databricks/dolly-v2-12b
337
- display_name: Dolly V2 (12B)
338
- description: Dolly V2 (12B) is an instruction-following large language model trained on the Databricks machine learning platform. It is based on pythia-12b.
339
- creator_organization: Databricks
340
- access: open
341
- num_parameters: 11327027200
342
- release_date: 2023-04-12
343
- todo: true
344
-
345
- # DeepMind
346
- - name: deepmind/gopher
347
- display_name: Gopher (280B)
348
- description: Gopher (540B parameters) ([paper](https://arxiv.org/pdf/2112.11446.pdf)).
349
- creator_organization: DeepMind
350
- access: closed
351
- todo: true
352
- - name: deepmind/chinchilla
353
- display_name: Chinchilla (70B)
354
- description: Chinchilla (70B parameters) ([paper](https://arxiv.org/pdf/2203.15556.pdf)).
355
- creator_organization: DeepMind
356
- access: closed
357
- todo: true
358
-
359
- # EleutherAI
360
- - name: together/gpt-j-6b
361
- display_name: GPT-J (6B)
362
- description: GPT-J (6B parameters) autoregressive language model trained on The Pile ([details](https://arankomatsuzaki.wordpress.com/2021/06/04/gpt-j/)).
363
- creator_organization: EleutherAI
364
- access: open
365
- num_parameters: 6000000000
366
- release_date: 2021-06-04
367
- - name: together/gpt-neox-20b
368
- display_name: GPT-NeoX (20B)
369
- description: GPT-NeoX (20B parameters) autoregressive language model trained on The Pile ([paper](https://arxiv.org/pdf/2204.06745.pdf)).
370
- creator_organization: EleutherAI
371
- access: open
372
- num_parameters: 20000000000
373
- release_date: 2022-02-02
374
- - name: eleutherai/pythia-1b-v0
375
- display_name: Pythia (1B)
376
- description: Pythia (1B parameters). The Pythia project combines interpretability analysis and scaling laws to understand how knowledge develops and evolves during training in autoregressive transformers.
377
- creator_organization: EleutherAI
378
- access: open
379
- num_parameters: 805736448
380
- release_date: 2023-02-13
381
- todo: true
382
- - name: eleutherai/pythia-2.8b-v0
383
- display_name: Pythia (2.8B)
384
- description: Pythia (2.8B parameters). The Pythia project combines interpretability analysis and scaling laws to understand how knowledge develops and evolves during training in autoregressive transformers.
385
- creator_organization: EleutherAI
386
- access: open
387
- num_parameters: 2517652480
388
- release_date: 2023-02-13
389
- todo: true
390
- - name: eleutherai/pythia-6.9b
391
- display_name: Pythia (6.9B)
392
- description: Pythia (6.9B parameters). The Pythia project combines interpretability analysis and scaling laws to understand how knowledge develops and evolves during training in autoregressive transformers.
393
- creator_organization: EleutherAI
394
- access: open
395
- num_parameters: 6444163072
396
- release_date: 2023-02-13
397
- - name: eleutherai/pythia-12b-v0
398
- display_name: Pythia (12B)
399
- description: Pythia (12B parameters). The Pythia project combines interpretability analysis and scaling laws to understand how knowledge develops and evolves during training in autoregressive transformers.
400
- creator_organization: EleutherAI
401
- access: open
402
- num_parameters: 11327027200
403
- release_date: 2023-02-13
404
-
405
- # Google
406
- - name: together/t5-11b
407
- display_name: T5 (11B)
408
- description: T5 (11B parameters) is an encoder-decoder model trained on a multi-task mixture, where each task is converted into a text-to-text format ([paper](https://arxiv.org/pdf/1910.10683.pdf)).
409
- creator_organization: Google
410
- access: open
411
- num_parameters: 11000000000
412
- release_date: 2019-10-23
413
- - name: together/ul2
414
- display_name: UL2 (20B)
415
- description: UL2 (20B parameters) is an encoder-decoder model trained on the C4 corpus. It's similar to T5 but trained with a different objective and slightly different scaling knobs ([paper](https://arxiv.org/pdf/2205.05131.pdf)).
416
- creator_organization: Google
417
- access: open
418
- num_parameters: 20000000000
419
- release_date: 2022-05-10
420
- - name: together/flan-t5-xxl
421
- display_name: Flan-T5 (11B)
422
- description: Flan-T5 (11B parameters) is T5 fine-tuned on 1.8K tasks ([paper](https://arxiv.org/pdf/2210.11416.pdf)).
423
- creator_organization: Google
424
- access: open
425
- - name: google/palm
426
- display_name: PaLM (540B)
427
- description: Pathways Language Model (540B parameters) is trained using 6144 TPU v4 chips ([paper](https://arxiv.org/pdf/2204.02311.pdf)).
428
- creator_organization: Google
429
- access: closed
430
- todo: true
431
- ## PaLM 2
432
- - name: google/text-bison@001
433
- display_name: PaLM-2 (Bison)
434
- description: The best value PaLM model. PaLM 2 (Pathways Language Model) is a Transformer-based model trained using a mixture of objectives that was evaluated on English and multilingual language, and reasoning tasks. ([report](https://arxiv.org/pdf/2305.10403.pdf))
435
- creator_organization: Google
436
- access: limited
437
- release_date: 2023-06-07 # Source: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/text#model_versions
438
- - name: google/text-bison-32k
439
- display_name: PaLM-2 (Bison)
440
- description: The best value PaLM model with a 32K context. PaLM 2 (Pathways Language Model) is a Transformer-based model trained using a mixture of objectives that was evaluated on English and multilingual language, and reasoning tasks. ([report](https://arxiv.org/pdf/2305.10403.pdf))
441
- creator_organization: Google
442
- access: limited
443
- release_date: 2023-06-07 # Source: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/text#model_versions
444
- - name: google/text-unicorn@001
445
- display_name: PaLM-2 (Unicorn)
446
- description: The largest model in PaLM family. PaLM 2 (Pathways Language Model) is a Transformer-based model trained using a mixture of objectives that was evaluated on English and multilingual language, and reasoning tasks. ([report](https://arxiv.org/pdf/2305.10403.pdf))
447
- creator_organization: Google
448
- access: limited
449
- release_date: 2023-11-30 # Source: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/text#model_versions
450
- - name: google/code-bison@001
451
- display_name: Codey PaLM-2 (Bison)
452
- description: A model fine-tuned to generate code based on a natural language description of the desired code. PaLM 2 (Pathways Language Model) is a Transformer-based model trained using a mixture of objectives that was evaluated on English and multilingual language, and reasoning tasks. ([report](https://arxiv.org/pdf/2305.10403.pdf))
453
- creator_organization: Google
454
- access: limited
455
- release_date: 2023-06-29 # Source: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/code-generation#model_versions
456
- - name: google/code-bison-32k
457
- display_name: Codey PaLM-2 (Bison)
458
- description: Codey with a 32K context. PaLM 2 (Pathways Language Model) is a Transformer-based model trained using a mixture of objectives that was evaluated on English and multilingual language, and reasoning tasks. ([report](https://arxiv.org/pdf/2305.10403.pdf))
459
- creator_organization: Google
460
- access: limited
461
- release_date: 2023-06-29 # Source: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/code-generation#model_versions
462
-
463
- # HazyResearch
464
- - name: together/h3-2.7b
465
- display_name: H3 (2.7B)
466
- description: H3 (2.7B parameters) is a decoder-only language model based on state space models ([paper](https://arxiv.org/abs/2212.14052)).
467
- creator_organization: HazyResearch
468
- access: open
469
- num_parameters: 2700000000
470
- release_date: 2023-01-23
471
- todo: true
472
-
473
- # Lightning AI's Lit-GPT
474
- - name: lightningai/lit-gpt
475
- display_name: Lit-GPT
476
- description: Lit-GPT is an optimized collection of open-source LLMs for finetuning and inference. It supports – Falcon, Llama 2, Vicuna, LongChat, and other top-performing open-source large language models.
477
- creator_organization: Lightning AI
478
- access: open
479
- num_parameters: 1
480
- release_date: 2023-04-04
481
-
482
-
483
- # Meta
484
- - name: together/opt-iml-175b
485
- display_name: OPT-IML (175B)
486
- description: OPT-IML (175B parameters) is a suite of decoder-only transformer LMs that are multi-task fine-tuned on 2000 datasets ([paper](https://arxiv.org/pdf/2212.12017.pdf)).
487
- creator_organization: Meta
488
- access: open
489
- num_parameters: 175000000000
490
- release_date: 2022-12-22
491
- todo: true
492
-
493
- - name: together/opt-iml-30b
494
- display_name: OPT-IML (30B)
495
- description: OPT-IML (30B parameters) is a suite of decoder-only transformer LMs that are multi-task fine-tuned on 2000 datasets ([paper](https://arxiv.org/pdf/2212.12017.pdf)).
496
- creator_organization: Meta
497
- access: open
498
- num_parameters: 30000000000
499
- release_date: 2022-12-22
500
- todo: true
501
-
502
- - name: together/opt-175b
503
- display_name: OPT (175B)
504
- description: Open Pre-trained Transformers (175B parameters) is a suite of decoder-only pre-trained transformers that are fully and responsibly shared with interested researchers ([paper](https://arxiv.org/pdf/2205.01068.pdf)).
505
- creator_organization: Meta
506
- access: open
507
- num_parameters: 175000000000
508
- release_date: 2022-05-02
509
-
510
- - name: together/opt-66b
511
- display_name: OPT (66B)
512
- description: Open Pre-trained Transformers (66B parameters) is a suite of decoder-only pre-trained transformers that are fully and responsibly shared with interested researchers ([paper](https://arxiv.org/pdf/2205.01068.pdf)).
513
- creator_organization: Meta
514
- access: open
515
- num_parameters: 66000000000
516
- release_date: 2022-05-02
517
-
518
- - name: together/opt-6.7b
519
- display_name: OPT (6.7B)
520
- description: Open Pre-trained Transformers (6.7B parameters) is a suite of decoder-only pre-trained transformers that are fully and responsibly shared with interested researchers ([paper](https://arxiv.org/pdf/2205.01068.pdf)).
521
- creator_organization: Meta
522
- access: open
523
- num_parameters: 6700000000
524
- release_date: 2022-05-02
525
-
526
- - name: together/opt-1.3b
527
- display_name: OPT (1.3B)
528
- description: Open Pre-trained Transformers (1.3B parameters) is a suite of decoder-only pre-trained transformers that are fully and responsibly shared with interested researchers ([paper](https://arxiv.org/pdf/2205.01068.pdf)).
529
- creator_organization: Meta
530
- access: open
531
- num_parameters: 1300000000
532
- release_date: 2022-05-02
533
-
534
- - name: together/galactica-120b
535
- display_name: Galactica (120B)
536
- description: Galactica (120B parameters) is trained on 48 million papers, textbooks, lectures notes, compounds and proteins, scientific websites, etc. ([paper](https://galactica.org/static/paper.pdf)).
537
- creator_organization: Meta
538
- access: open
539
- num_parameters: 120000000000
540
- release_date: 2022-11-15
541
- todo: true
542
-
543
- - name: together/galactica-30b
544
- display_name: Galactica (30B)
545
- description: Galactica (30B parameters) is trained on 48 million papers, textbooks, lectures notes, compounds and proteins, scientific websites, etc. ([paper](https://galactica.org/static/paper.pdf)).
546
- creator_organization: Meta
547
- access: open
548
- num_parameters: 30000000000
549
- release_date: 2022-11-15
550
- todo: true
551
- - name: meta/llama-7b
552
- display_name: LLaMA (7B)
553
- description: LLaMA is a collection of foundation language models ranging from 7B to 65B parameters.
554
- creator_organization: Meta
555
- access: open
556
- num_parameters: 7000000000
557
- release_date: 2023-02-24
558
- - name: meta/llama-13b
559
- display_name: LLaMA (13B)
560
- description: LLaMA is a collection of foundation language models ranging from 7B to 65B parameters.
561
- creator_organization: Meta
562
- access: open
563
- num_parameters: 13000000000
564
- release_date: 2023-02-24
565
- - name: meta/llama-30b
566
- display_name: LLaMA (30B)
567
- description: LLaMA is a collection of foundation language models ranging from 7B to 65B parameters.
568
- creator_organization: Meta
569
- access: open
570
- num_parameters: 30000000000
571
- release_date: 2023-02-24
572
- - name: meta/llama-65b
573
- display_name: LLaMA (65B)
574
- description: LLaMA is a collection of foundation language models ranging from 7B to 65B parameters.
575
- creator_organization: Meta
576
- access: open
577
- num_parameters: 65000000000
578
- release_date: 2023-02-24
579
- - name: meta/llama-2-7b
580
- display_name: Llama 2 (7B)
581
- description: Llama 2 pretrained models are trained on 2 trillion tokens, and have double the context length than Llama 1.
582
- creator_organization: Meta
583
- access: open
584
- num_parameters: 7000000000
585
- release_date: 2023-07-18
586
- - name: meta/llama-2-13b
587
- display_name: Llama 2 (13B)
588
- description: Llama 2 pretrained models are trained on 2 trillion tokens, and have double the context length than Llama 1.
589
- creator_organization: Meta
590
- access: open
591
- num_parameters: 13000000000
592
- release_date: 2023-07-18
593
- - name: meta/llama-2-70b
594
- display_name: Llama 2 (70B)
595
- description: Llama 2 pretrained models are trained on 2 trillion tokens, and have double the context length than Llama 1.
596
- creator_organization: Meta
597
- access: open
598
- num_parameters: 70000000000
599
- release_date: 2023-07-18
600
-
601
- # Stability AI
602
- - name: stabilityai/stablelm-base-alpha-3b
603
- display_name: StableLM-Base-Alpha (3B)
604
- description: StableLM-Base-Alpha is a suite of 3B and 7B parameter decoder-only language models pre-trained on a diverse collection of English datasets with a sequence length of 4096 to push beyond the context window limitations of existing open-source language models.
605
- creator_organization: Stability AI
606
- access: open
607
- num_parameters: 3000000000
608
- release_date: 2023-04-20
609
- todo: true
610
-
611
- - name: stabilityai/stablelm-base-alpha-7b
612
- display_name: StableLM-Base-Alpha (7B)
613
- description: StableLM-Base-Alpha is a suite of 3B and 7B parameter decoder-only language models pre-trained on a diverse collection of English datasets with a sequence length of 4096 to push beyond the context window limitations of existing open-source language models.
614
- creator_organization: Stability AI
615
- access: open
616
- num_parameters: 7000000000
617
- release_date: 2023-04-20
618
- todo: true
619
-
620
- # Stanford
621
- - name: stanford/alpaca-7b
622
- display_name: Alpaca (7B)
623
- description: Alpaca 7B is a model fine-tuned from the LLaMA 7B model on 52K instruction-following demonstrations
624
- creator_organization: Stanford
625
- access: open
626
- num_parameters: 7000000000
627
- release_date: 2023-03-13
628
-
629
- # LMSYS
630
- - name: lmsys/vicuna-7b-v1.3
631
- display_name: Vicuna v1.3 (7B)
632
- description: Vicuna v1.3 (7B) is an open-source chatbot trained by fine-tuning LLaMA on user-shared conversations collected from ShareGPT.
633
- creator_organization: LMSYS
634
- access: open
635
- num_parameters: 7000000000
636
- release_date: 2023-06-22
637
- - name: lmsys/vicuna-13b-v1.3
638
- display_name: Vicuna v1.3 (13B)
639
- description: Vicuna v1.3 (13B) is an open-source chatbot trained by fine-tuning LLaMA on user-shared conversations collected from ShareGPT.
640
- creator_organization: LMSYS
641
- access: open
642
- num_parameters: 13000000000
643
- release_date: 2023-06-22
644
-
645
- # 01.AI
646
- - name: 01-ai/yi-6b
647
- display_name: Yi (6B)
648
- description: The Yi models are large language models trained from scratch by developers at 01.AI.
649
- creator_organization: 01.AI
650
- access: open
651
- num_parameters: 6000000000
652
- release_date: 2023-11-02
653
- - name: 01-ai/yi-34b
654
- display_name: Yi (34B)
655
- description: The Yi models are large language models trained from scratch by developers at 01.AI.
656
- creator_organization: 01.AI
657
- access: open
658
- num_parameters: 34000000000
659
- release_date: 2023-11-02
660
-
661
- # Mistral AI
662
- - name: mistralai/mistral-7b-v0.1
663
- display_name: Mistral v0.1 (7B)
664
- description: Mistral 7B is a 7.3B parameter transformer model that uses Grouped-Query Attention (GQA) and Sliding-Window Attention (SWA).
665
- creator_organization: Mistral AI
666
- access: open
667
- num_parameters: 7300000000
668
- release_date: 2023-09-27
669
-
670
- # Microsoft/NVIDIA
671
- - name: microsoft/TNLGv2_530B
672
- display_name: TNLG v2 (530B)
673
- description: TNLG v2 (530B parameters) autoregressive language model trained on a filtered subset of the Pile and CommonCrawl ([paper](https://arxiv.org/pdf/2201.11990.pdf)).
674
- creator_organization: Microsoft/NVIDIA
675
- access: closed
676
- num_parameters: 530000000000
677
- release_date: 2022-01-28
678
- - name: microsoft/TNLGv2_7B
679
- display_name: TNLG v2 (6.7B)
680
- description: TNLG v2 (6.7B parameters) autoregressive language model trained on a filtered subset of the Pile and CommonCrawl ([paper](https://arxiv.org/pdf/2201.11990.pdf)).
681
- creator_organization: Microsoft/NVIDIA
682
- access: closed
683
- num_parameters: 6700000000
684
- release_date: 2022-01-28
685
-
686
- # OpenAI: https://beta.openai.com/docs/engines/gpt-3
687
- - name: openai/davinci
688
- display_name: davinci (175B)
689
- description: Original GPT-3 (175B parameters) autoregressive language model ([paper](https://arxiv.org/pdf/2005.14165.pdf), [docs](https://beta.openai.com/docs/model-index-for-researchers)).
690
- creator_organization: OpenAI
691
- access: limited
692
- num_parameters: 175000000000
693
- release_date: 2020-05-28
694
- - name: openai/curie
695
- display_name: curie (6.7B)
696
- description: Original GPT-3 (6.7B parameters) autoregressive language model ([paper](https://arxiv.org/pdf/2005.14165.pdf), [docs](https://beta.openai.com/docs/model-index-for-researchers)).
697
- creator_organization: OpenAI
698
- access: limited
699
- num_parameters: 6700000000
700
- release_date: 2020-05-28
701
- - name: openai/babbage
702
- display_name: babbage (1.3B)
703
- description: Original GPT-3 (1.3B parameters) autoregressive language model ([paper](https://arxiv.org/pdf/2005.14165.pdf), [docs](https://beta.openai.com/docs/model-index-for-researchers)).
704
- creator_organization: OpenAI
705
- access: limited
706
- num_parameters: 1300000000
707
- release_date: 2020-05-28
708
- - name: openai/ada
709
- display_name: ada (350M)
710
- description: Original GPT-3 (350M parameters) autoregressive language model ([paper](https://arxiv.org/pdf/2005.14165.pdf), [docs](https://beta.openai.com/docs/model-index-for-researchers)).
711
- creator_organization: OpenAI
712
- access: limited
713
- num_parameters: 350000000
714
- release_date: 2020-05-28
715
- - name: openai/text-davinci-003
716
- display_name: text-davinci-003
717
- description: text-davinci-003 model that involves reinforcement learning (PPO) with reward models. Derived from text-davinci-002 ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
718
- creator_organization: OpenAI
719
- access: limited
720
- num_parameters: 175000000000
721
- release_date: 2022-11-28
722
- - name: openai/text-davinci-002
723
- display_name: text-davinci-002
724
- description: text-davinci-002 model that involves supervised fine-tuning on human-written demonstrations. Derived from code-davinci-002 ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
725
- creator_organization: OpenAI
726
- access: limited
727
- num_parameters: 175000000000
728
- release_date: 2022-01-27
729
- - name: openai/text-davinci-001
730
- display_name: text-davinci-001
731
- description: text-davinci-001 model that involves supervised fine-tuning on human-written demonstrations ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
732
- creator_organization: OpenAI
733
- access: limited
734
- num_parameters: 175000000000
735
- release_date: 2022-01-27
736
- todo: true
737
- - name: openai/text-curie-001
738
- display_name: text-curie-001
739
- description: text-curie-001 model that involves supervised fine-tuning on human-written demonstrations ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
740
- creator_organization: OpenAI
741
- access: limited
742
- num_parameters: 6700000000
743
- release_date: 2022-01-27
744
- - name: openai/text-babbage-001
745
- display_name: text-babbage-001
746
- description: text-babbage-001 model that involves supervised fine-tuning on human-written demonstrations ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
747
- creator_organization: OpenAI
748
- access: limited
749
- num_parameters: 1300000000
750
- release_date: 2022-01-27
751
- - name: openai/text-ada-001
752
- display_name: text-ada-001
753
- description: text-ada-001 model that involves supervised fine-tuning on human-written demonstrations ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
754
- creator_organization: OpenAI
755
- access: limited
756
- num_parameters: 350000000
757
- release_date: 2022-01-27
758
- - name: openai/gpt-4-0314
759
- display_name: gpt-4-0314
760
- description: GPT-4 is a large multimodal model (currently only accepting text inputs and emitting text outputs) that is optimized for chat but works well for traditional completions tasks. Snapshot of gpt-4 from March 14th 2023.
761
- creator_organization: OpenAI
762
- access: limited
763
- release_date: 2023-03-14
764
- - name: openai/gpt-4-32k-0314
765
- display_name: gpt-4-32k-0314
766
- description: GPT-4 is a large multimodal model (currently only accepting text inputs and emitting text outputs) that is optimized for chat but works well for traditional completions tasks. Snapshot of gpt-4 with a longer context length of 32,768 tokens from March 14th 2023.
767
- creator_organization: OpenAI
768
- access: limited
769
- release_date: 2023-03-14
770
- - name: openai/gpt-4-0613
771
- display_name: gpt-4-0613
772
- description: GPT-4 is a large multimodal model (currently only accepting text inputs and emitting text outputs) that is optimized for chat but works well for traditional completions tasks. Snapshot of gpt-4 from 2023-06-13.
773
- creator_organization: OpenAI
774
- access: limited
775
- release_date: 2023-06-13
776
- - name: openai/gpt-4-32k-0613
777
- display_name: gpt-4-32k-0613
778
- description: GPT-4 is a large multimodal model (currently only accepting text inputs and emitting text outputs) that is optimized for chat but works well for traditional completions tasks. Snapshot of gpt-4 with a longer context length of 32,768 tokens from 2023-06-13.
779
- creator_organization: OpenAI
780
- access: limited
781
- release_date: 2023-06-13
782
- - name: openai/code-davinci-002
783
- display_name: code-davinci-002
784
- description: Codex-style model that is designed for pure code-completion tasks ([docs](https://beta.openai.com/docs/models/codex)).
785
- creator_organization: OpenAI
786
- access: limited
787
- - name: openai/code-davinci-001
788
- display_name: code-davinci-001
789
- description: code-davinci-001 model
790
- creator_organization: OpenAI
791
- access: limited
792
- todo: true
793
- - name: openai/code-cushman-001
794
- display_name: code-cushman-001 (12B)
795
- description: Codex-style model that is a stronger, multilingual version of the Codex (12B) model in the [Codex paper](https://arxiv.org/pdf/2107.03374.pdf).
796
- creator_organization: OpenAI
797
- access: limited
798
- - name: openai/gpt-3.5-turbo-0301
799
- display_name: gpt-3.5-turbo-0301
800
- description: Sibling model of text-davinci-003 is optimized for chat but works well for traditional completions tasks as well. Snapshot from 2023-03-01.
801
- creator_organization: OpenAI
802
- access: limited
803
- release_date: 2023-03-01
804
- - name: openai/gpt-3.5-turbo-0613
805
- display_name: gpt-3.5-turbo-0613
806
- description: Sibling model of text-davinci-003 is optimized for chat but works well for traditional completions tasks as well. Snapshot from 2023-06-13.
807
- creator_organization: OpenAI
808
- access: limited
809
- release_date: 2023-06-13
810
- - name: openai/gpt-3.5-turbo-16k-0613
811
- display_name: gpt-3.5-turbo-16k-0613
812
- description: Sibling model of text-davinci-003 is optimized for chat but works well for traditional completions tasks as well. Snapshot from 2023-06-13 with a longer context length of 16,384 tokens.
813
- creator_organization: OpenAI
814
- access: limited
815
- release_date: 2023-06-13
816
- - name: openai/gpt-4-1106-preview
817
- display_name: gpt-4-1106-preview
818
- description: GPT-4 Turbo (preview) is a large multimodal model that is optimized for chat but works well for traditional completions tasks. The model is cheaper and faster than the original GPT-4 model. Preview snapshot from November 6, 2023.
819
- creator_organization: OpenAI
820
- access: limited
821
- release_date: 2023-11-06
822
-
823
- # Together
824
- - name: together/Together-gpt-JT-6B-v1
825
- display_name: GPT-JT (6B)
826
- description: GPT-JT (6B parameters) is a fork of GPT-J ([blog post](https://www.together.xyz/blog/releasing-v1-of-gpt-jt-powered-by-open-source-ai)).
827
- creator_organization: Together
828
- access: open
829
- num_parameters: 6700000000
830
- release_date: 2022-11-29
831
- todo: true
832
- - name: together/gpt-neoxt-chat-base-20b
833
- display_name: GPT-NeoXT-Chat-Base (20B)
834
- description: GPT-NeoXT-Chat-Base (20B) is fine-tuned from GPT-NeoX, serving as a base model for developing open-source chatbots.
835
- creator_organization: Together
836
- access: open
837
- num_parameters: 20000000000
838
- release_date: 2023-03-08
839
- todo: true
840
- - name: together/redpajama-incite-base-3b-v1
841
- display_name: RedPajama-INCITE-Base-v1 (3B)
842
- description: RedPajama-INCITE-Base-v1 (3B parameters) is a 3 billion base model that aims to replicate the LLaMA recipe as closely as possible.
843
- creator_organization: Together
844
- access: open
845
- num_parameters: 3000000000
846
- release_date: 2023-05-05
847
- - name: together/redpajama-incite-instruct-3b-v1
848
- display_name: RedPajama-INCITE-Instruct-v1 (3B)
849
- description: RedPajama-INCITE-Instruct-v1 (3B parameters) is a model fine-tuned for few-shot applications on the data of GPT-JT. It is built from RedPajama-INCITE-Base-v1 (3B), a 3 billion base model that aims to replicate the LLaMA recipe as closely as possible.
850
- creator_organization: Together
851
- access: open
852
- num_parameters: 3000000000
853
- release_date: 2023-05-05
854
- todo: true
855
- - name: together/redpajama-incite-chat-3b-v1
856
- display_name: RedPajama-INCITE-Chat-v1 (3B)
857
- description: RedPajama-INCITE-Chat-v1 (3B parameters) is a model fine-tuned on OASST1 and Dolly2 to enhance chatting ability. It is built from RedPajama-INCITE-Base-v1 (3B), a 3 billion base model that aims to replicate the LLaMA recipe as closely as possible.
858
- creator_organization: Together
859
- access: open
860
- num_parameters: 3000000000
861
- release_date: 2023-05-05
862
- todo: true
863
- - name: together/redpajama-incite-base-7b
864
- display_name: RedPajama-INCITE-Base (7B)
865
- description: RedPajama-INCITE-Base (7B parameters) is a 7 billion base model that aims to replicate the LLaMA recipe as closely as possible.
866
- creator_organization: Together
867
- access: open
868
- num_parameters: 7000000000
869
- release_date: 2023-05-05
870
- todo: true
871
- - name: together/redpajama-incite-instruct-7b
872
- display_name: RedPajama-INCITE-Instruct (7B)
873
- description: RedPajama-INCITE-Instruct (7B parameters) is a model fine-tuned for few-shot applications on the data of GPT-JT. It is built from RedPajama-INCITE-Base (7B), a 7 billion base model that aims to replicate the LLaMA recipe as closely as possible.
874
- creator_organization: Together
875
- access: open
876
- num_parameters: 7000000000
877
- release_date: 2023-05-05
878
- todo: true
879
-
880
- # MosaicML
881
- - name: mosaicml/mpt-7b
882
- display_name: MPT (7B)
883
- description: MPT (7B) is a Transformer trained from scratch on 1T tokens of text and code.
884
- creator_organization: MosaicML
885
- access: open
886
- num_parameters: 6700000000
887
- release_date: 2023-05-05
888
- - name: mosaicml/mpt-7b-chat
889
- display_name: MPT-Chat (7B)
890
- description: MPT-Chat (7B) is a chatbot-like model for dialogue generation. It is built by finetuning MPT (30B) , a Transformer trained from scratch on 1T tokens of text and code.
891
- creator_organization: MosaicML
892
- access: open
893
- num_parameters: 6700000000
894
- release_date: 2023-05-05
895
- todo: true
896
- - name: mosaicml/mpt-instruct-7b
897
- display_name: MPT-Instruct (7B)
898
- description: MPT-Instruct (7B) is a model for short-form instruction following. It is built by finetuning MPT (30B), a Transformer trained from scratch on 1T tokens of text and code.
899
- creator_organization: MosaicML
900
- access: open
901
- num_parameters: 6700000000
902
- release_date: 2023-05-05
903
- - name: mosaicml/mpt-30b
904
- display_name: MPT (30B)
905
- description: MPT (30B) is a Transformer trained from scratch on 1T tokens of text and code.
906
- creator_organization: MosaicML
907
- access: open
908
- num_parameters: 30000000000
909
- release_date: 2023-06-22
910
- - name: mosaicml/mpt-30b-chat
911
- display_name: MPT-Chat (30B)
912
- description: MPT-Chat (30B) is a chatbot-like model for dialogue generation. It is built by finetuning MPT (30B), a Transformer trained from scratch on 1T tokens of text and code.
913
- creator_organization: MosaicML
914
- access: open
915
- num_parameters: 30000000000
916
- release_date: 2023-06-22
917
- todo: true
918
- - name: mosaicml/mpt-instruct-30b
919
- display_name: MPT-Instruct (30B)
920
- description: MPT-Instruct (30B) is a model for short-form instruction following. It is built by finetuning MPT (30B), a Transformer trained from scratch on 1T tokens of text and code.
921
- creator_organization: MosaicML
922
- access: open
923
- num_parameters: 30000000000
924
- release_date: 2023-06-22
925
-
926
- # TII UAE
927
- - name: tiiuae/falcon-7b
928
- display_name: Falcon (7B)
929
- description: Falcon-7B is a 7B parameters causal decoder-only model built by TII and trained on 1,500B tokens of RefinedWeb enhanced with curated corpora.
930
- creator_organization: TII UAE
931
- access: open
932
- num_parameters: 7000000000
933
- release_date: 2023-03-15
934
- - name: tiiuae/falcon-7b-instruct
935
- display_name: Falcon-Instruct (7B)
936
- description: Falcon-7B-Instruct is a 7B parameters causal decoder-only model built by TII based on Falcon-7B and finetuned on a mixture of chat/instruct datasets.
937
- creator_organization: TII UAE
938
- access: open
939
- num_parameters: 7000000000
940
- release_date: 2023-03-15
941
- - name: tiiuae/falcon-40b
942
- display_name: Falcon (40B)
943
- description: Falcon-40B is a 40B parameters causal decoder-only model built by TII and trained on 1,500B tokens of RefinedWeb enhanced with curated corpora.
944
- creator_organization: TII UAE
945
- access: open
946
- num_parameters: 40000000000
947
- release_date: 2023-05-25
948
- - name: tiiuae/falcon-40b-instruct
949
- display_name: Falcon-Instruct (40B)
950
- description: Falcon-40B-Instruct is a 40B parameters causal decoder-only model built by TII based on Falcon-7B and finetuned on a mixture of chat/instruct datasets.
951
- creator_organization: TII UAE
952
- access: open
953
- num_parameters: 40000000000
954
- release_date: 2023-05-25
955
-
956
- # Salesforce
957
- - name: together/codegen
958
- display_name: CodeGen (16B)
959
- description: CodeGen (16B parameters) is an open dense code model trained for multi-turn program synthesis ([blog](https://arxiv.org/pdf/2203.13474.pdf)).
960
- creator_organization: Tsinghua
961
- access: open
962
- num_parameters: 16000000000
963
- release_date: 2022-03-25
964
- todo: true
965
-
966
- # Tsinghua
967
- - name: together/glm
968
- display_name: GLM (130B)
969
- description: GLM (130B parameters) is an open bilingual (English & Chinese) bidirectional dense model that was trained using General Language Model (GLM) procedure ([paper](https://arxiv.org/pdf/2210.02414.pdf)).
970
- creator_organization: Tsinghua
971
- access: open
972
- num_parameters: 130000000000
973
- release_date: 2022-08-04
974
-
975
- - name: together/codegeex
976
- display_name: CodeGeeX (13B)
977
- description: CodeGeeX (13B parameters) is an open dense code model trained on more than 20 programming languages on a corpus of more than 850B tokens ([blog](http://keg.cs.tsinghua.edu.cn/codegeex/)).
978
- creator_organization: Tsinghua
979
- access: open
980
- num_parameters: 13000000000
981
- release_date: 2022-09-19
982
- todo: true
983
-
984
- # Writer
985
- - name: writer/palmyra-base
986
- display_name: Palmyra Base (5B)
987
- description: Palmyra Base (5B)
988
- creator_organization: Writer
989
- access: limited
990
- num_parameters: 5000000000
991
- release_date: 2022-10-13
992
- - name: writer/palmyra-large
993
- display_name: Palmyra Large (20B)
994
- description: Palmyra Large (20B)
995
- creator_organization: Writer
996
- access: limited
997
- num_parameters: 20000000000
998
- release_date: 2022-12-23
999
- - name: writer/palmyra-instruct-30
1000
- display_name: InstructPalmyra (30B)
1001
- description: InstructPalmyra (30B parameters) is trained using reinforcement learning techniques based on feedback from humans.
1002
- creator_organization: Writer
1003
- access: limited
1004
- num_parameters: 30000000000
1005
- release_date: 2023-02-16
1006
- - name: writer/palmyra-e
1007
- display_name: Palmyra E (30B)
1008
- description: Palmyra E (30B)
1009
- creator_organization: Writer
1010
- access: limited
1011
- num_parameters: 30000000000
1012
- release_date: 2023-03-03
1013
- - name: writer/silk-road
1014
- display_name: Silk Road (35B)
1015
- description: Silk Road (35B)
1016
- creator_organization: Writer
1017
- access: limited
1018
- num_parameters: 35000000000
1019
- release_date: 2023-04-13
1020
- - name: writer/palmyra-x
1021
- display_name: Palmyra X (43B)
1022
- description: Palmyra-X (43B parameters) is trained to adhere to instructions using human feedback and utilizes a technique called multiquery attention. Furthermore, a new feature called 'self-instruct' has been introduced, which includes the implementation of an early stopping criteria specifically designed for minimal instruction tuning ([paper](https://dev.writer.com/docs/becoming-self-instruct-introducing-early-stopping-criteria-for-minimal-instruct-tuning)).
1023
- creator_organization: Writer
1024
- access: limited
1025
- num_parameters: 43000000000
1026
- release_date: 2023-06-11
1027
- - name: writer/palmyra-x-v2
1028
- display_name: Palmyra X V2 (33B)
1029
- description: Palmyra-X V2 (33B parameters) is a Transformer-based model, which is trained on extremely large-scale pre-training data. The pre-training data more than 2 trillion tokens types are diverse and cover a wide range of areas, used FlashAttention-2.
1030
- creator_organization: Writer
1031
- access: limited
1032
- num_parameters: 33000000000
1033
- release_date: 2023-12-01
1034
- - name: writer/palmyra-x-v3
1035
- display_name: Palmyra X V3 (72B)
1036
- description: Palmyra-X V3 (72B parameters) is a Transformer-based model, which is trained on extremely large-scale pre-training data. It is trained via unsupervised learning and DPO and use multiquery attention.
1037
- creator_organization: Writer
1038
- access: limited
1039
- num_parameters: 72000000000
1040
- release_date: 2023-12-01
1041
- - name: writer/palmyra-x-32k
1042
- display_name: Palmyra X-32K (33B)
1043
- description: Palmyra-X-32K (33B parameters) is a Transformer-based model, which is trained on large-scale pre-training data. The pre-training data types are diverse and cover a wide range of areas. These data types are used in conjunction and the alignment mechanism to extend context window.
1044
- creator_organization: Writer
1045
- access: limited
1046
- num_parameters: 33000000000
1047
- release_date: 2023-12-01
1048
-
1049
- # Yandex
1050
- - name: together/yalm
1051
- display_name: YaLM (100B)
1052
- description: YaLM (100B parameters) is an autoregressive language model trained on English and Russian text ([GitHub](https://github.com/yandex/YaLM-100B)).
1053
- creator_organization: Yandex
1054
- access: open
1055
- num_parameters: 100000000000
1056
- release_date: 2022-06-23
1057
-
1058
- # NVIDIA
1059
- - name: nvidia/megatron-gpt2
1060
- display_name: Megatron GPT2
1061
- description: GPT-2 implemented in Megatron-LM ([paper](https://arxiv.org/abs/1909.08053)).
1062
- creator_organization: NVIDIA
1063
- access: open
1064
- todo: true
1065
-
1066
2
  ############################################################
1067
3
  adapter:
1068
4
  - name: method
@@ -1109,9 +45,9 @@ adapter:
1109
45
  - name: sample_train
1110
46
  description: If true, randomly sample N training examples; if false, select N consecutive training examples
1111
47
  - name: model
1112
- description: DEPRECATED. Name of the language model (<creator_organization>/<model name>) to send requests to.
48
+ description: Name of the language model (<creator_organization>/<model name>) to send requests to.
1113
49
  - name: model_deployment
1114
- description: Name of the language model (<host_organization>/<model name>) to send requests to.
50
+ description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
1115
51
  - name: temperature
1116
52
  description: Temperature parameter used in generation.
1117
53
  - name: max_tokens
@@ -1632,6 +568,50 @@ metrics:
1632
568
  lower_is_better: false
1633
569
  # CLEVA Chinese bias, copyright and toxicity metrics share the same name as the original HELM metrics
1634
570
 
571
+
572
+ # DecodingTrust Fairness Metrics
573
+ - name: equalized_odds_difference
574
+ display_name: Equalized Odds Difference
575
+ short_display_name: Equalized Odds
576
+ description: The equalized odds difference measures the maximum difference in true positive rates and false positive rates between groups defined by the sensitive attribute A
577
+ lower_is_better: true
578
+
579
+ - name: demographic_parity_difference
580
+ display_name: Demographic Parity Difference
581
+ short_display_name: demographic parity
582
+ description: The demographic parity difference measures the difference between the probability of positive predictions conditioned on sensitive attribute A = 1 and that conditioned on A = 0
583
+ lower_is_better: true
584
+
585
+
586
+ # DecodingTrust OoD Metrics
587
+ - name: ood_knowledge_rr
588
+ display_name: OOD Knowledge Refusal Rate
589
+ short_display_name: OoD-RR
590
+ description: The rate at which the model refuses to answer an OoD question.
591
+ lower_is_better: false
592
+
593
+ - name: ood_knowledge_macc
594
+ display_name: OOD Meaningful Answer Accuracy (Exclude Refusal)
595
+ short_display_name: OoD-MACC
596
+ description: MACC indicates the percentage of correct answers among the cases where the model generates meaningful responses, excluding refusals to answer
597
+ lower_is_better: false
598
+
599
+
600
+ # DecodingTrust Stereotype Bias Metrics
601
+
602
+ - name: decodingtrust_stereotype_bias
603
+ display_name: DecodingTrust Bias Score
604
+ short_display_name: Bias Score
605
+ description: 1 - agreementIndex (Section 4.2 of DecodingTrust)
606
+ lower_is_better: false
607
+
608
+ - name: decodingtrust_stereotype_bias_rejection_rate
609
+ display_name: Stereotype Bias Rejection Rate
610
+ short_display_name: Bias Rejection Rate
611
+ description: Rejection rate of stereotype prompts
612
+ lower_is_better: false
613
+
614
+
1635
615
  ############################################################
1636
616
  perturbations:
1637
617
  - name: robustness
@@ -1915,6 +895,31 @@ metric_groups:
1915
895
  - name: chinese_bleu_1
1916
896
  split: ${main_split}
1917
897
 
898
+ - name: decodingtrust_fairness_metrics
899
+ display_name: DecodingTrust Fairness
900
+ metrics:
901
+ - name: equalized_odds_difference
902
+ split: ${main_split}
903
+ - name: demographic_parity_difference
904
+ split: ${main_split}
905
+
906
+ - name: decodingtrust_ood_metrics
907
+ display_name: DecodingTrust OOD Accuracy
908
+ metrics:
909
+ - name: ood_knowledge_rr
910
+ split: ${main_split}
911
+ - name: ood_knowledge_macc
912
+ split: ${main_split}
913
+
914
+ - name: decodingtrust_stereotype_bias_metrics
915
+ display_name: DecodingTrust Stereotype Bias
916
+ metrics:
917
+ - name: decodingtrust_stereotype_bias
918
+ split: ${main_split}
919
+ - name: decodingtrust_stereotype_bias_rejection_rate
920
+ split: ${main_split}
921
+
922
+
1918
923
  ############################################################
1919
924
  run_groups:
1920
925
  ## Top-level
@@ -2097,6 +1102,20 @@ run_groups:
2097
1102
  main_name: none
2098
1103
  main_split: none
2099
1104
 
1105
+ - name: decodingtrust
1106
+ display_name: DecodingTrust
1107
+ description: A comprehensive benchmark of the trustworthiness of large language models [(Wang et. al. 2023)](https://decodingtrust.github.io/)
1108
+ category: Core scenarios
1109
+ subgroups:
1110
+ - decodingtrust_adv_robustness
1111
+ - decodingtrust_adv_demonstration
1112
+ - decodingtrust_ood_robustness
1113
+ - decodingtrust_fairness
1114
+ - decodingtrust_privacy
1115
+ - decodingtrust_machine_ethics
1116
+ - decodingtrust_toxicity_prompts
1117
+ - decodingtrust_stereotype_bias
1118
+
2100
1119
  ### Ablations
2101
1120
  - name: ablation_in_context
2102
1121
  display_name: Vary number of in-context examples
@@ -3873,3 +2892,176 @@ run_groups:
3873
2892
  task: user-facing tasks
3874
2893
  language: English dialects
3875
2894
  todo: true
2895
+
2896
+
2897
+ # DecodingTrust scenarios
2898
+ - name: decodingtrust_adv_robustness
2899
+ display_name: DecodingTrust - AdvGLUE++
2900
+ short_display_name: AdvGLUE++
2901
+ description: Adversarial perturbations of the GLUE dataset generated against open-source LLMs including Alpaca, Vicuna, and Stable-Vicuna
2902
+ metric_groups:
2903
+ - accuracy
2904
+ - calibration
2905
+ - efficiency
2906
+ - general_information
2907
+ environment:
2908
+ main_name: quasi_exact_match
2909
+ main_split: test
2910
+ taxonomy:
2911
+ task: text classification
2912
+ what: "?"
2913
+ who: "?"
2914
+ when: "?"
2915
+ language: English
2916
+ todo: true
2917
+
2918
+ - name: decodingtrust_adv_demonstration
2919
+ display_name: DecodingTrust - Adversarial Demonstrations
2920
+ short_display_name: AdvDemo
2921
+ description: Robustness analysis of LM generations when facing adversarial demonstrations
2922
+ metric_groups:
2923
+ - accuracy
2924
+ - calibration
2925
+ - efficiency
2926
+ - general_information
2927
+ environment:
2928
+ main_name: quasi_exact_match
2929
+ main_split: test
2930
+ taxonomy:
2931
+ task: text classification
2932
+ what: "?"
2933
+ who: "?"
2934
+ when: "?"
2935
+ language: English
2936
+
2937
+ - name: decodingtrust_ood_robustness
2938
+ display_name: DecodingTrust - OoD Robustness
2939
+ short_display_name: OoD
2940
+ description: Style perturbations of GLUE datasets (OoD styles) and out-of-scope OoD knowledge evaluations
2941
+ metric_groups:
2942
+ - accuracy
2943
+ - calibration
2944
+ - efficiency
2945
+ - general_information
2946
+ - decodingtrust_ood_metrics
2947
+ environment:
2948
+ main_name: quasi_exact_match
2949
+ main_split: test
2950
+ taxonomy:
2951
+ task: text classification
2952
+ what: "?"
2953
+ who: "?"
2954
+ when: "?"
2955
+ language: English
2956
+
2957
+ - name: decodingtrust_fairness
2958
+ display_name: DecodingTrust - Fairness
2959
+ short_display_name: Fairness
2960
+ description: Fairness analysis of LLMs
2961
+ metric_groups:
2962
+ - accuracy
2963
+ - calibration
2964
+ - efficiency
2965
+ - general_information
2966
+ - decodingtrust_fairness_metrics
2967
+ environment:
2968
+ main_name: quasi_exact_match
2969
+ main_split: test
2970
+ taxonomy:
2971
+ task: text classification
2972
+ what: "?"
2973
+ who: "?"
2974
+ when: "?"
2975
+ language: English
2976
+
2977
+ - name: decodingtrust_privacy
2978
+ display_name: DecodingTrust - Privacy
2979
+ short_display_name: Privacy
2980
+ description: Evaluation of the privacy understanding and privacy preserving properties of LLMs
2981
+ metric_groups:
2982
+ - accuracy
2983
+ - calibration
2984
+ - efficiency
2985
+ - general_information
2986
+ environment:
2987
+ main_name: quasi_exact_match
2988
+ main_split: test
2989
+ taxonomy:
2990
+ task: text classification
2991
+ what: "?"
2992
+ who: "?"
2993
+ when: "?"
2994
+ language: English
2995
+
2996
+ - name: decodingtrust_machine_ethics
2997
+ display_name: DecodingTrust - Ethics
2998
+ short_display_name: Ethics
2999
+ description: Evaluation of the understanding of ethical behaviors of LLMs
3000
+ metric_groups:
3001
+ - accuracy
3002
+ - calibration
3003
+ - efficiency
3004
+ - general_information
3005
+ environment:
3006
+ main_name: quasi_exact_match
3007
+ main_split: test
3008
+ taxonomy:
3009
+ task: text classification
3010
+ what: "?"
3011
+ who: "?"
3012
+ when: "?"
3013
+ language: English
3014
+
3015
+ - name: decodingtrust_toxicity_prompts
3016
+ display_name: DecodingTrust - Toxicity
3017
+ short_display_name: Toxicity
3018
+ description: Evaluation of the propensity of LLMs to generate toxic content when prompted
3019
+ metric_groups:
3020
+ - toxicity
3021
+ - bias
3022
+ - efficiency
3023
+ - general_information
3024
+ environment:
3025
+ main_split: test
3026
+ taxonomy:
3027
+ task: "?"
3028
+ what: n/a
3029
+ who: n/a
3030
+ when: n/a
3031
+ language: synthetic
3032
+
3033
+ - name: decodingtrust_stereotype_bias
3034
+ display_name: DecodingTrust - Stereotype Bias
3035
+ short_display_name: Stereotype
3036
+ description: Manually crafted stereotype user prompts from DecodingTrust
3037
+ metric_groups:
3038
+ - toxicity
3039
+ - bias
3040
+ - efficiency
3041
+ - general_information
3042
+ - decodingtrust_stereotype_bias_metrics
3043
+ environment:
3044
+ main_split: test
3045
+ taxonomy:
3046
+ task: "?"
3047
+ what: n/a
3048
+ who: n/a
3049
+ when: n/a
3050
+ language: synthetic
3051
+
3052
+ - name: thai_exam
3053
+ display_name: Thai Exam
3054
+ short_display_name: ThaiExam
3055
+ description: A benchmark comprising Thai multiple-choice examinations.
3056
+ metric_groups:
3057
+ - accuracy
3058
+ - general_information
3059
+ environment:
3060
+ main_name: exact_match
3061
+ main_split: test
3062
+ taxonomy:
3063
+ task: question answering
3064
+ what: "?"
3065
+ who: "?"
3066
+ when: "?"
3067
+ language: Thai