crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff shows the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those versions.
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
- crfm_helm-0.5.0.dist-info/RECORD +642 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +37 -2
- helm/benchmark/adaptation/adapters/adapter.py +4 -42
- helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
- helm/benchmark/adaptation/common_adapter_specs.py +376 -0
- helm/benchmark/adaptation/prompt.py +7 -1
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/annotator.py +43 -0
- helm/benchmark/annotation/annotator_factory.py +61 -0
- helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
- helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
- helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation_executor.py +124 -0
- helm/benchmark/augmentations/cleva_perturbation.py +7 -14
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
- helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/dialect_perturbation.py +2 -2
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +3 -3
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +0 -7
- helm/benchmark/augmentations/perturbation.py +20 -7
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/synonym_perturbation.py +2 -2
- helm/benchmark/augmentations/test_perturbation.py +11 -7
- helm/benchmark/augmentations/translate_perturbation.py +30 -0
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/config_registry.py +38 -0
- helm/benchmark/executor.py +46 -16
- helm/benchmark/huggingface_registration.py +37 -7
- helm/benchmark/metrics/basic_metrics.py +172 -641
- helm/benchmark/metrics/bbq_metrics.py +3 -4
- helm/benchmark/metrics/bias_metrics.py +6 -6
- helm/benchmark/metrics/classification_metrics.py +11 -8
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/code_metrics.py +4 -3
- helm/benchmark/metrics/code_metrics_helper.py +0 -2
- helm/benchmark/metrics/common_metric_specs.py +167 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +6 -112
- helm/benchmark/metrics/dry_run_metrics.py +5 -3
- helm/benchmark/metrics/efficiency_metrics.py +206 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
- helm/benchmark/metrics/language_modeling_metrics.py +99 -0
- helm/benchmark/metrics/machine_translation_metrics.py +5 -5
- helm/benchmark/metrics/metric.py +93 -172
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +16 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
- helm/benchmark/metrics/ranking_metrics.py +6 -7
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/summac/model_summac.py +0 -2
- helm/benchmark/metrics/summarization_metrics.py +8 -8
- helm/benchmark/metrics/test_classification_metrics.py +9 -6
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
- helm/benchmark/metrics/test_metric.py +2 -2
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
- helm/benchmark/metrics/toxicity_metrics.py +1 -1
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/unitxt_metrics.py +81 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/model_deployment_registry.py +164 -41
- helm/benchmark/model_metadata_registry.py +181 -35
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +8 -7
- helm/benchmark/presentation/run_display.py +50 -17
- helm/benchmark/presentation/schema.py +28 -46
- helm/benchmark/presentation/summarize.py +213 -96
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_run_entry.py +14 -9
- helm/benchmark/presentation/test_summarize.py +5 -0
- helm/benchmark/run.py +66 -54
- helm/benchmark/run_expander.py +342 -31
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +162 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
- helm/benchmark/run_specs/heim_run_specs.py +623 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
- helm/benchmark/run_specs/vlm_run_specs.py +501 -0
- helm/benchmark/runner.py +116 -69
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +2 -2
- helm/benchmark/scenarios/cleva_scenario.py +43 -46
- helm/benchmark/scenarios/code_scenario.py +3 -2
- helm/benchmark/scenarios/commonsense_scenario.py +171 -191
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_scenario.py +0 -1
- helm/benchmark/scenarios/legalbench_scenario.py +123 -0
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
- helm/benchmark/scenarios/math_scenario.py +19 -2
- helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
- helm/benchmark/scenarios/numeracy_scenario.py +3 -3
- helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
- helm/benchmark/scenarios/raft_scenario.py +2 -6
- helm/benchmark/scenarios/scenario.py +14 -2
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/test_math_scenario.py +22 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
- helm/benchmark/scenarios/the_pile_scenario.py +6 -7
- helm/benchmark/scenarios/unitxt_scenario.py +56 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
- helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
- helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
- helm/benchmark/server.py +59 -2
- helm/benchmark/slurm_jobs.py +12 -0
- helm/benchmark/slurm_runner.py +79 -51
- helm/benchmark/static/benchmarking.js +3 -4
- helm/benchmark/static/contamination.yaml +1 -1
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/json-urls.js +4 -0
- helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
- helm/benchmark/static/schema_instruction_following.yaml +210 -0
- helm/benchmark/static/schema_lite.yaml +824 -0
- helm/benchmark/static/schema_mmlu.yaml +1507 -0
- helm/benchmark/static/schema_unitxt.yaml +428 -0
- helm/benchmark/static/schema_vlm.yaml +576 -0
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
- helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
- helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
- helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
- helm/benchmark/static_build/assets/index-d839df55.js +9 -0
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
- helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +20 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_model_deployment_definition.py +90 -0
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/tokenizer_config_registry.py +10 -14
- helm/benchmark/window_services/ai21_window_service.py +22 -33
- helm/benchmark/window_services/cohere_window_service.py +1 -63
- helm/benchmark/window_services/default_window_service.py +2 -35
- helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
- helm/benchmark/window_services/ice_window_service.py +0 -34
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +21 -4
- helm/benchmark/window_services/no_decoding_window_service.py +32 -0
- helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
- helm/benchmark/window_services/test_bloom_window_service.py +2 -1
- helm/benchmark/window_services/test_cohere_window_service.py +2 -1
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
- helm/benchmark/window_services/test_gptj_window_service.py +3 -2
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
- helm/benchmark/window_services/test_ice_window_service.py +2 -1
- helm/benchmark/window_services/test_openai_window_service.py +2 -1
- helm/benchmark/window_services/test_opt_window_service.py +3 -2
- helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
- helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
- helm/benchmark/window_services/test_t511b_window_service.py +2 -1
- helm/benchmark/window_services/test_ul2_window_service.py +2 -1
- helm/benchmark/window_services/test_utils.py +3 -2
- helm/benchmark/window_services/test_yalm_window_service.py +2 -1
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +24 -269
- helm/benchmark/window_services/yalm_window_service.py +0 -27
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +5 -12
- helm/clients/aleph_alpha_client.py +112 -0
- helm/{proxy/clients → clients}/anthropic_client.py +213 -24
- helm/clients/auto_client.py +215 -0
- helm/clients/bedrock_client.py +128 -0
- helm/clients/bedrock_utils.py +72 -0
- helm/{proxy/clients → clients}/client.py +67 -55
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +6 -17
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +7 -8
- helm/clients/google_translate_client.py +35 -0
- helm/{proxy/clients → clients}/http_model_client.py +6 -10
- helm/{proxy/clients → clients}/huggingface_client.py +134 -92
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +78 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +191 -0
- helm/clients/image_generation/dalle2_client.py +192 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +190 -0
- helm/clients/image_generation/deep_floyd_client.py +78 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +86 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +115 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +111 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
- helm/{proxy/clients → clients}/megatron_client.py +13 -7
- helm/clients/mistral_client.py +134 -0
- helm/clients/moderation_api_client.py +109 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +302 -0
- helm/{proxy/clients → clients}/palmyra_client.py +15 -12
- helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
- helm/clients/simple_client.py +64 -0
- helm/{proxy/clients → clients}/test_auto_client.py +15 -15
- helm/clients/test_client.py +100 -0
- helm/clients/test_huggingface_client.py +70 -0
- helm/clients/test_simple_client.py +19 -0
- helm/{proxy/clients → clients}/test_together_client.py +23 -12
- helm/{proxy/clients → clients}/together_client.py +18 -71
- helm/clients/vertexai_client.py +391 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vlm_client.py +104 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/qwen_vlm_client.py +171 -0
- helm/clients/vllm_client.py +46 -0
- helm/common/cache.py +24 -179
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/concurrency.py +32 -0
- helm/common/credentials_utils.py +28 -0
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +29 -10
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +24 -1
- helm/common/key_value_store.py +113 -0
- helm/common/media_object.py +13 -0
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +88 -0
- helm/common/multimodal_request_utils.py +31 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/object_spec.py +2 -2
- helm/common/request.py +36 -27
- helm/common/test_general.py +6 -0
- helm/common/tokenization_request.py +6 -3
- helm/config/__init__.py +0 -0
- helm/config/model_deployments.yaml +1942 -0
- helm/config/model_metadata.yaml +2201 -0
- helm/config/tokenizer_configs.yaml +362 -0
- helm/proxy/accounts.py +31 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/model_critique_client.py +13 -5
- helm/proxy/example_queries.py +29 -17
- helm/proxy/retry.py +8 -2
- helm/proxy/server.py +77 -5
- helm/proxy/services/remote_service.py +31 -0
- helm/proxy/services/server_service.py +103 -20
- helm/proxy/services/service.py +34 -2
- helm/proxy/services/test_remote_service.py +7 -6
- helm/proxy/services/test_service.py +27 -18
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/py.typed +0 -0
- helm/tokenizers/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
- helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
- helm/tokenizers/auto_tokenizer.py +93 -0
- helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
- helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/tokenizers/test_anthropic_tokenizer.py +82 -0
- helm/tokenizers/test_huggingface_tokenizer.py +136 -0
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/tokenizers/vertexai_tokenizer.py +97 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.3.0.dist-info/RECORD +0 -396
- helm/benchmark/vlm_run_specs.py +0 -71
- helm/benchmark/window_services/anthropic_window_service.py +0 -68
- helm/benchmark/window_services/bloom_window_service.py +0 -35
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/gptj_window_service.py +0 -38
- helm/benchmark/window_services/gptneox_window_service.py +0 -41
- helm/benchmark/window_services/http_model_window_service.py +0 -28
- helm/benchmark/window_services/huggingface_window_service.py +0 -59
- helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
- helm/benchmark/window_services/llama_window_service.py +0 -28
- helm/benchmark/window_services/luminous_window_service.py +0 -67
- helm/benchmark/window_services/megatron_window_service.py +0 -10
- helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
- helm/benchmark/window_services/openai_window_service.py +0 -13
- helm/benchmark/window_services/opt_window_service.py +0 -35
- helm/benchmark/window_services/palmyra_window_service.py +0 -45
- helm/benchmark/window_services/remote_window_service.py +0 -48
- helm/benchmark/window_services/santacoder_window_service.py +0 -27
- helm/benchmark/window_services/starcoder_window_service.py +0 -27
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/benchmark/window_services/wider_openai_window_service.py +0 -52
- helm/proxy/clients/aleph_alpha_client.py +0 -99
- helm/proxy/clients/auto_client.py +0 -461
- helm/proxy/clients/goose_ai_client.py +0 -100
- helm/proxy/clients/microsoft_client.py +0 -182
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/remote_model_registry.py +0 -28
- helm/proxy/clients/simple_client.py +0 -61
- helm/proxy/clients/test_anthropic_client.py +0 -63
- helm/proxy/clients/test_client.py +0 -31
- helm/proxy/clients/test_huggingface_client.py +0 -87
- helm/proxy/models.py +0 -963
- helm/proxy/test_models.py +0 -27
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
- helm/proxy/token_counters/test_openai_token_counter.py +0 -79
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
- /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
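The listing reflects two broad refactors. First, the client and tokenizer implementations moved out of the proxy package: `helm/proxy/clients/*` became `helm/clients/*` and `helm/proxy/tokenizers/*` became `helm/tokenizers/*`. A minimal sketch of what that move means for downstream imports, assuming (not verified against the 0.5.0 wheel) that the class names survived the move unchanged:

```python
# Sketch of the import-path migration implied by the renames above.
# The specific classes are illustrative assumptions.

# crfm-helm 0.3.0:
#   from helm.proxy.clients.huggingface_client import HuggingFaceClient
#   from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer

# crfm-helm 0.5.0:
from helm.clients.huggingface_client import HuggingFaceClient
from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
```

Second, model definitions become declarative: the hard-coded registry (`helm/proxy/models.py`, -963 lines) is deleted in favor of the new configs under `helm/config/`. The hunk that follows is the largest single deletion rendered in this diff: the old inline `models:` registry, removed wholesale.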
|
@@ -1,923 +1,4 @@
|
|
|
1
1
|
---
|
|
2
|
-
############################################################
|
|
3
|
-
models:
|
|
4
|
-
# AI21 Labs
|
|
5
|
-
- name: ai21/j1-jumbo
|
|
6
|
-
display_name: J1-Jumbo v1 (178B)
|
|
7
|
-
description: Jurassic-1 Jumbo (178B parameters) ([docs](https://studio.ai21.com/docs/jurassic1-language-models/), [tech report](https://uploads-ssl.webflow.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf)).
|
|
8
|
-
creator_organization: AI21 Labs
|
|
9
|
-
access: limited
|
|
10
|
-
num_parameters: 178000000000
|
|
11
|
-
release_date: 2021-08-11
|
|
12
|
-
- name: ai21/j1-large
|
|
13
|
-
display_name: J1-Large v1 (7.5B)
|
|
14
|
-
description: Jurassic-1 Large (7.5B parameters) ([docs](https://studio.ai21.com/docs/jurassic1-language-models/), [tech report](https://uploads-ssl.webflow.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf)).
|
|
15
|
-
creator_organization: AI21 Labs
|
|
16
|
-
access: limited
|
|
17
|
-
num_parameters: 7500000000
|
|
18
|
-
release_date: 2021-08-11
|
|
19
|
-
- name: ai21/j1-grande
|
|
20
|
-
display_name: J1-Grande v1 (17B)
|
|
21
|
-
description: Jurassic-1 Grande (17B parameters) with a "few tweaks" to the training process ([docs](https://studio.ai21.com/docs/jurassic1-language-models/), [tech report](https://uploads-ssl.webflow.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf)).
|
|
22
|
-
creator_organization: AI21 Labs
|
|
23
|
-
access: limited
|
|
24
|
-
num_parameters: 17000000000
|
|
25
|
-
release_date: 2022-05-03
|
|
26
|
-
- name: ai21/j1-grande-v2-beta
|
|
27
|
-
display_name: J1-Grande v2 beta (17B)
|
|
28
|
-
description: Jurassic-1 Grande v2 beta (17B parameters)
|
|
29
|
-
creator_organization: AI21 Labs
|
|
30
|
-
access: limited
|
|
31
|
-
num_parameters: 17000000000
|
|
32
|
-
release_date: 2022-10-28
|
|
33
|
-
- name: ai21/j2-jumbo
|
|
34
|
-
display_name: Jurassic-2 Jumbo (178B)
|
|
35
|
-
description: Jurassic-2 Jumbo (178B parameters) ([docs](https://www.ai21.com/blog/introducing-j2))
|
|
36
|
-
creator_organization: AI21 Labs
|
|
37
|
-
access: limited
|
|
38
|
-
num_parameters: 178000000000
|
|
39
|
-
release_date: 2023-03-09
|
|
40
|
-
- name: ai21/j2-grande
|
|
41
|
-
display_name: Jurassic-2 Grande (17B)
|
|
42
|
-
description: Jurassic-2 Grande (17B parameters) ([docs](https://www.ai21.com/blog/introducing-j2))
|
|
43
|
-
creator_organization: AI21 Labs
|
|
44
|
-
access: limited
|
|
45
|
-
num_parameters: 17000000000
|
|
46
|
-
release_date: 2023-03-09
|
|
47
|
-
- name: ai21/j2-large
|
|
48
|
-
display_name: Jurassic-2 Large (7.5B)
|
|
49
|
-
description: Jurassic-2 Large (7.5B parameters) ([docs](https://www.ai21.com/blog/introducing-j2))
|
|
50
|
-
creator_organization: AI21 Labs
|
|
51
|
-
access: limited
|
|
52
|
-
num_parameters: 7500000000
|
|
53
|
-
release_date: 2023-03-09
|
|
54
|
-
|
|
55
|
-
# Aleph Alpha
|
|
56
|
-
# TODO: add Luminous World when it's released
|
|
57
|
-
- name: AlephAlpha/luminous-base
|
|
58
|
-
display_name: Luminous Base (13B)
|
|
59
|
-
description: Luminous Base (13B parameters) ([docs](https://docs.aleph-alpha.com/docs/introduction/luminous/))
|
|
60
|
-
creator_organization: Aleph Alpha
|
|
61
|
-
access: limited
|
|
62
|
-
num_parameters: 13000000000
|
|
63
|
-
# TODO: get exact release date
|
|
64
|
-
release_date: 2022-01-01
|
|
65
|
-
- name: AlephAlpha/luminous-extended
|
|
66
|
-
display_name: Luminous Extended (30B)
|
|
67
|
-
description: Luminous Extended (30B parameters) ([docs](https://docs.aleph-alpha.com/docs/introduction/luminous/))
|
|
68
|
-
creator_organization: Aleph Alpha
|
|
69
|
-
access: limited
|
|
70
|
-
num_parameters: 30000000000
|
|
71
|
-
release_date: 2022-01-01
|
|
72
|
-
- name: AlephAlpha/luminous-supreme
|
|
73
|
-
display_name: Luminous Supreme (70B)
|
|
74
|
-
description: Luminous Supreme (70B parameters) ([docs](https://docs.aleph-alpha.com/docs/introduction/luminous/))
|
|
75
|
-
creator_organization: Aleph Alpha
|
|
76
|
-
access: limited
|
|
77
|
-
num_parameters: 70000000000
|
|
78
|
-
release_date: 2022-01-01
|
|
79
|
-
|
|
80
|
-
# TODO: Remove Once we have configurable model names
|
|
81
|
-
- name: neurips/local
|
|
82
|
-
display_name: Local service
|
|
83
|
-
description: Local competition service
|
|
84
|
-
creator_organization: neurips
|
|
85
|
-
access: open
|
|
86
|
-
num_parameters: 1
|
|
87
|
-
release_date: 2021-12-01
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
# Anthropic
|
|
91
|
-
- name: anthropic/stanford-online-all-v4-s3
|
|
92
|
-
display_name: Anthropic-LM v4-s3 (52B)
|
|
93
|
-
description: A 52B parameter language model, trained using reinforcement learning from human feedback [paper](https://arxiv.org/pdf/2204.05862.pdf).
|
|
94
|
-
creator_organization: Anthropic
|
|
95
|
-
access: closed
|
|
96
|
-
num_parameters: 52000000000
|
|
97
|
-
release_date: 2021-12-01
|
|
98
|
-
- name: anthropic/claude-2.0
|
|
99
|
-
display_name: Anthropic Claude 2.0
|
|
100
|
-
description: Claude 2.0 is a general purpose large language model developed by Anthropic. It uses a transformer architecture and is trained via unsupervised learning, RLHF, and Constitutional AI (including both a supervised and Reinforcement Learning (RL) phase). ([model card](https://efficient-manatee.files.svdcdn.com/production/images/Model-Card-Claude-2.pdf))
|
|
101
|
-
creator_organization: Anthropic
|
|
102
|
-
access: limited
|
|
103
|
-
release_date: 2023-07-11
|
|
104
|
-
- name: anthropic/claude-v1.3
|
|
105
|
-
display_name: Anthropic Claude v1.3
|
|
106
|
-
description: A model trained using reinforcement learning from human feedback ([docs](https://www.anthropic.com/index/introducing-claude)).
|
|
107
|
-
creator_organization: Anthropic
|
|
108
|
-
access: limited
|
|
109
|
-
release_date: 2023-03-17
|
|
110
|
-
- name: anthropic/claude-instant-v1
|
|
111
|
-
display_name: Anthropic Claude Instant V1
|
|
112
|
-
description: A lightweight version of Claude, a model trained using reinforcement learning from human feedback ([docs](https://www.anthropic.com/index/introducing-claude)).
|
|
113
|
-
creator_organization: Anthropic
|
|
114
|
-
access: limited
|
|
115
|
-
release_date: 2023-03-17
|
|
116
|
-
|
|
117
|
-
# Berkeley
|
|
118
|
-
- name: together/koala-13b
|
|
119
|
-
display_name: Koala (13B)
|
|
120
|
-
description: Koala (13B) is a chatbot fine-tuned from Llama (13B) on dialogue data gathered from the web. ([blog post](https://bair.berkeley.edu/blog/2023/04/03/koala/))
|
|
121
|
-
creator_organization: UC Berkeley
|
|
122
|
-
access: open
|
|
123
|
-
num_parameters: 13000000000
|
|
124
|
-
release_date: 2022-04-03
|
|
125
|
-
todo: true
|
|
126
|
-
|
|
127
|
-
# BigScience
|
|
128
|
-
- name: together/bloom
|
|
129
|
-
display_name: BLOOM (176B)
|
|
130
|
-
description: BLOOM (176B parameters) is an autoregressive model trained on 46 natural languages and 13 programming languages ([paper](https://arxiv.org/pdf/2211.05100.pdf)).
|
|
131
|
-
creator_organization: BigScience
|
|
132
|
-
access: open
|
|
133
|
-
num_parameters: 176000000000
|
|
134
|
-
release_date: 2022-06-28
|
|
135
|
-
- name: together/bloomz
|
|
136
|
-
display_name: BLOOMZ (176B)
|
|
137
|
-
description: BLOOMZ (176B parameters) is BLOOM that has been fine-tuned on natural language instructions ([details](https://huggingface.co/bigscience/bloomz)).
|
|
138
|
-
creator_organization: BigScience
|
|
139
|
-
access: open
|
|
140
|
-
num_parameters: 176000000000
|
|
141
|
-
release_date: 2022-11-03
|
|
142
|
-
todo: true
|
|
143
|
-
- name: together/t0pp
|
|
144
|
-
display_name: T0pp (11B)
|
|
145
|
-
description: T0pp (11B parameters) is an encoder-decoder model trained on a large set of different tasks specified in natural language prompts ([paper](https://arxiv.org/pdf/2110.08207.pdf)).
|
|
146
|
-
creator_organization: BigScience
|
|
147
|
-
access: open
|
|
148
|
-
num_parameters: 11000000000
|
|
149
|
-
release_date: 2021-10-15
|
|
150
|
-
|
|
151
|
-
# BigCode
|
|
152
|
-
- name: huggingface/santacoder
|
|
153
|
-
display_name: SantaCoder (1.1B)
|
|
154
|
-
description: SantaCoder (1.1B parameters) model trained on the Python, Java, and JavaScript subset of The Stack (v1.1) ([model card](https://huggingface.co/bigcode/santacoder)).
|
|
155
|
-
creator_organization: BigCode
|
|
156
|
-
access: open
|
|
157
|
-
- name: huggingface/starcoder
|
|
158
|
-
display_name: StarCoder (15.5B)
|
|
159
|
-
description: The StarCoder (15.5B parameter) model trained on 80+ programming languages from The Stack (v1.2) ([model card](https://huggingface.co/bigcode/starcoder)).
|
|
160
|
-
creator_organization: BigCode
|
|
161
|
-
access: open
|
|
162
|
-
|
|
163
|
-
# Cerebras Systems
|
|
164
|
-
- name: together/cerebras-gpt-6.7b
|
|
165
|
-
display_name: Cerebras GPT (6.7B)
|
|
166
|
-
description: Cerebras GPT is a family of open compute-optimal language models scaled from 111M to 13B parameters trained on the Eleuther Pile. ([paper](https://arxiv.org/pdf/2304.03208.pdf))
|
|
167
|
-
creator_organization: Cerebras
|
|
168
|
-
access: limited
|
|
169
|
-
num_parameters: 6700000000
|
|
170
|
-
release_date: 2023-04-06
|
|
171
|
-
todo: true
|
|
172
|
-
- name: together/cerebras-gpt-13b
|
|
173
|
-
display_name: Cerebras GPT (13B)
|
|
174
|
-
description: Cerebras GPT is a family of open compute-optimal language models scaled from 111M to 13B parameters trained on the Eleuther Pile. ([paper](https://arxiv.org/pdf/2304.03208.pdf))
|
|
175
|
-
creator_organization: Cerebras
|
|
176
|
-
access: limited
|
|
177
|
-
num_parameters: 13000000000
|
|
178
|
-
release_date: 2023-04-06
|
|
179
|
-
todo: true
|
|
180
|
-
|
|
181
|
-
# Cohere
|
|
182
|
-
- name: cohere/xlarge-20220609
|
|
183
|
-
display_name: Cohere xlarge v20220609 (52.4B)
|
|
184
|
-
description: Cohere xlarge v20220609 (52.4B parameters)
|
|
185
|
-
creator_organization: Cohere
|
|
186
|
-
access: limited
|
|
187
|
-
num_parameters: 52400000000
|
|
188
|
-
release_date: 2022-06-09
|
|
189
|
-
- name: cohere/large-20220720
|
|
190
|
-
display_name: Cohere large v20220720 (13.1B)
|
|
191
|
-
description: Cohere large v20220720 (13.1B parameters), which is deprecated by Cohere as of December 2, 2022.
|
|
192
|
-
creator_organization: Cohere
|
|
193
|
-
access: limited
|
|
194
|
-
num_parameters: 13100000000
|
|
195
|
-
release_date: 2022-07-20
|
|
196
|
-
- name: cohere/medium-20220720
|
|
197
|
-
display_name: Cohere medium v20220720 (6.1B)
|
|
198
|
-
description: Cohere medium v20220720 (6.1B parameters)
|
|
199
|
-
creator_organization: Cohere
|
|
200
|
-
access: limited
|
|
201
|
-
num_parameters: 6100000000
|
|
202
|
-
release_date: 2022-07-20
|
|
203
|
-
- name: cohere/small-20220720
|
|
204
|
-
display_name: Cohere small v20220720 (410M)
|
|
205
|
-
description: Cohere small v20220720 (410M parameters), which is deprecated by Cohere as of December 2, 2022.
|
|
206
|
-
creator_organization: Cohere
|
|
207
|
-
access: limited
|
|
208
|
-
num_parameters: 410000000
|
|
209
|
-
release_date: 2022-07-20
|
|
210
|
-
- name: cohere/xlarge-20221108
|
|
211
|
-
display_name: Cohere xlarge v20221108 (52.4B)
|
|
212
|
-
description: Cohere xlarge v20221108 (52.4B parameters)
|
|
213
|
-
creator_organization: Cohere
|
|
214
|
-
access: limited
|
|
215
|
-
num_parameters: 52400000000
|
|
216
|
-
release_date: 2022-11-08
|
|
217
|
-
- name: cohere/medium-20221108
|
|
218
|
-
display_name: Cohere medium v20221108 (6.1B)
|
|
219
|
-
description: Cohere medium v20221108 (6.1B parameters)
|
|
220
|
-
creator_organization: Cohere
|
|
221
|
-
access: limited
|
|
222
|
-
num_parameters: 6100000000
|
|
223
|
-
release_date: 2022-11-08
|
|
224
|
-
- name: cohere/command-medium-beta
|
|
225
|
-
display_name: Cohere Command beta (6.1B)
|
|
226
|
-
description: Cohere Command beta (6.1B parameters) is fine-tuned from the medium model to respond well with instruction-like prompts ([details](https://docs.cohere.ai/docs/command-beta)).
|
|
227
|
-
creator_organization: Cohere
|
|
228
|
-
access: limited
|
|
229
|
-
num_parameters: 6100000000
|
|
230
|
-
release_date: 2022-11-08
|
|
231
|
-
- name: cohere/command-xlarge-beta
|
|
232
|
-
display_name: Cohere Command beta (52.4B)
|
|
233
|
-
description: Cohere Command beta (52.4B parameters) is fine-tuned from the XL model to respond well with instruction-like prompts ([details](https://docs.cohere.ai/docs/command-beta)).
|
|
234
|
-
creator_organization: Cohere
|
|
235
|
-
access: limited
|
|
236
|
-
num_parameters: 52400000000
|
|
237
|
-
release_date: 2022-11-08
|
|
238
|
-
|
|
239
|
-
# Databricks
|
|
240
|
-
- name: databricks/dolly-v2-3b
|
|
241
|
-
display_name: Dolly V2 (3B)
|
|
242
|
-
description: Dolly V2 (3B) is an instruction-following large language model trained on the Databricks machine learning platform. It is based on pythia-12b.
|
|
243
|
-
creator_organization: Databricks
|
|
244
|
-
access: open
|
|
245
|
-
num_parameters: 2517652480
|
|
246
|
-
release_date: 2023-04-12
|
|
247
|
-
todo: true
|
|
248
|
-
- name: databricks/dolly-v2-7b
|
|
249
|
-
display_name: Dolly V2 (7B)
|
|
250
|
-
description: Dolly V2 (7B) is an instruction-following large language model trained on the Databricks machine learning platform. It is based on pythia-12b.
|
|
251
|
-
creator_organization: Databricks
|
|
252
|
-
access: open
|
|
253
|
-
num_parameters: 6444163072
|
|
254
|
-
release_date: 2023-04-12
|
|
255
|
-
todo: true
|
|
256
|
-
- name: databricks/dolly-v2-12b
|
|
257
|
-
display_name: Dolly V2 (12B)
|
|
258
|
-
description: Dolly V2 (12B) is an instruction-following large language model trained on the Databricks machine learning platform. It is based on pythia-12b.
|
|
259
|
-
creator_organization: Databricks
|
|
260
|
-
access: open
|
|
261
|
-
num_parameters: 11327027200
|
|
262
|
-
release_date: 2023-04-12
|
|
263
|
-
todo: true
|
|
264
|
-
|
|
265
|
-
# DeepMind
|
|
266
|
-
- name: deepmind/gopher
|
|
267
|
-
display_name: Gopher (280B)
|
|
268
|
-
description: Gopher (540B parameters) ([paper](https://arxiv.org/pdf/2112.11446.pdf)).
|
|
269
|
-
creator_organization: DeepMind
|
|
270
|
-
access: closed
|
|
271
|
-
todo: true
|
|
272
|
-
- name: deepmind/chinchilla
|
|
273
|
-
display_name: Chinchilla (70B)
|
|
274
|
-
description: Chinchilla (70B parameters) ([paper](https://arxiv.org/pdf/2203.15556.pdf)).
|
|
275
|
-
creator_organization: DeepMind
|
|
276
|
-
access: closed
|
|
277
|
-
todo: true
|
|
278
|
-
|
|
279
|
-
# EleutherAI
|
|
280
|
-
- name: together/gpt-j-6b
|
|
281
|
-
display_name: GPT-J (6B)
|
|
282
|
-
description: GPT-J (6B parameters) autoregressive language model trained on The Pile ([details](https://arankomatsuzaki.wordpress.com/2021/06/04/gpt-j/)).
|
|
283
|
-
creator_organization: EleutherAI
|
|
284
|
-
access: open
|
|
285
|
-
num_parameters: 6000000000
|
|
286
|
-
release_date: 2021-06-04
|
|
287
|
-
- name: together/gpt-neox-20b
|
|
288
|
-
display_name: GPT-NeoX (20B)
|
|
289
|
-
description: GPT-NeoX (20B parameters) autoregressive language model trained on The Pile ([paper](https://arxiv.org/pdf/2204.06745.pdf)).
|
|
290
|
-
creator_organization: EleutherAI
|
|
291
|
-
access: open
|
|
292
|
-
num_parameters: 20000000000
|
|
293
|
-
release_date: 2022-02-02
|
|
294
|
-
- name: eleutherai/pythia-1b-v0
|
|
295
|
-
display_name: Pythia (1B)
|
|
296
|
-
description: Pythia (1B parameters). The Pythia project combines interpretability analysis and scaling laws to understand how knowledge develops and evolves during training in autoregressive transformers.
|
|
297
|
-
creator_organization: EleutherAI
|
|
298
|
-
access: open
|
|
299
|
-
num_parameters: 805736448
|
|
300
|
-
release_date: 2023-02-13
|
|
301
|
-
todo: true
|
|
302
|
-
- name: eleutherai/pythia-2.8b-v0
|
|
303
|
-
display_name: Pythia (2.8B)
|
|
304
|
-
description: Pythia (2.8B parameters). The Pythia project combines interpretability analysis and scaling laws to understand how knowledge develops and evolves during training in autoregressive transformers.
|
|
305
|
-
creator_organization: EleutherAI
|
|
306
|
-
access: open
|
|
307
|
-
num_parameters: 2517652480
|
|
308
|
-
release_date: 2023-02-13
|
|
309
|
-
todo: true
|
|
310
|
-
- name: eleutherai/pythia-6.9b
|
|
311
|
-
display_name: Pythia (6.9B)
|
|
312
|
-
description: Pythia (6.9B parameters). The Pythia project combines interpretability analysis and scaling laws to understand how knowledge develops and evolves during training in autoregressive transformers.
|
|
313
|
-
creator_organization: EleutherAI
|
|
314
|
-
access: open
|
|
315
|
-
num_parameters: 6444163072
|
|
316
|
-
release_date: 2023-02-13
|
|
317
|
-
- name: eleutherai/pythia-12b-v0
|
|
318
|
-
display_name: Pythia (12B)
|
|
319
|
-
description: Pythia (12B parameters). The Pythia project combines interpretability analysis and scaling laws to understand how knowledge develops and evolves during training in autoregressive transformers.
|
|
320
|
-
creator_organization: EleutherAI
|
|
321
|
-
access: open
|
|
322
|
-
num_parameters: 11327027200
|
|
323
|
-
release_date: 2023-02-13
|
|
324
|
-
|
|
325
|
-
# Google
|
|
326
|
-
- name: together/t5-11b
|
|
327
|
-
display_name: T5 (11B)
|
|
328
|
-
description: T5 (11B parameters) is an encoder-decoder model trained on a multi-task mixture, where each task is converted into a text-to-text format ([paper](https://arxiv.org/pdf/1910.10683.pdf)).
|
|
329
|
-
creator_organization: Google
|
|
330
|
-
access: open
|
|
331
|
-
num_parameters: 11000000000
|
|
332
|
-
release_date: 2019-10-23
|
|
333
|
-
|
|
334
|
-
- name: together/ul2
|
|
335
|
-
display_name: UL2 (20B)
|
|
336
|
-
description: UL2 (20B parameters) is an encoder-decoder model trained on the C4 corpus. It's similar to T5 but trained with a different objective and slightly different scaling knobs ([paper](https://arxiv.org/pdf/2205.05131.pdf)).
|
|
337
|
-
creator_organization: Google
|
|
338
|
-
access: open
|
|
339
|
-
num_parameters: 20000000000
|
|
340
|
-
release_date: 2022-05-10
|
|
341
|
-
|
|
342
|
-
- name: together/flan-t5-xxl
|
|
343
|
-
display_name: Flan-T5 (11B)
|
|
344
|
-
description: Flan-T5 (11B parameters) is T5 fine-tuned on 1.8K tasks ([paper](https://arxiv.org/pdf/2210.11416.pdf)).
|
|
345
|
-
creator_organization: Google
|
|
346
|
-
access: open
|
|
347
|
-
|
|
348
|
-
- name: google/palm
|
|
349
|
-
display_name: PaLM (540B)
|
|
350
|
-
description: Pathways Language Model (540B parameters) is trained using 6144 TPU v4 chips ([paper](https://arxiv.org/pdf/2204.02311.pdf)).
|
|
351
|
-
creator_organization: Google
|
|
352
|
-
access: closed
|
|
353
|
-
todo: true
|
|
354
|
-
|
|
355
|
-
# HazyResearch
|
|
356
|
-
- name: together/h3-2.7b
|
|
357
|
-
display_name: H3 (2.7B)
|
|
358
|
-
description: H3 (2.7B parameters) is a decoder-only language model based on state space models ([paper](https://arxiv.org/abs/2212.14052)).
|
|
359
|
-
creator_organization: HazyResearch
|
|
360
|
-
access: open
|
|
361
|
-
num_parameters: 2700000000
|
|
362
|
-
release_date: 2023-01-23
|
|
363
|
-
todo: true
|
|
364
|
-
|
|
365
|
-
# Lightning AI's Lit-GPT
|
|
366
|
-
- name: lightningai/lit-gpt
|
|
367
|
-
display_name: Lit-GPT
|
|
368
|
-
-    description: Lit-GPT is an optimized collection of open-source LLMs for finetuning and inference. It supports Falcon, Llama 2, Vicuna, LongChat, and other top-performing open-source large language models.
-    creator_organization: Lightning AI
-    access: open
-    num_parameters: 1
-    release_date: 2023-04-04
-
-
-  # Meta
-  - name: together/opt-iml-175b
-    display_name: OPT-IML (175B)
-    description: OPT-IML (175B parameters) is a suite of decoder-only transformer LMs that are multi-task fine-tuned on 2000 datasets ([paper](https://arxiv.org/pdf/2212.12017.pdf)).
-    creator_organization: Meta
-    access: open
-    num_parameters: 175000000000
-    release_date: 2022-12-22
-    todo: true
-
-  - name: together/opt-iml-30b
-    display_name: OPT-IML (30B)
-    description: OPT-IML (30B parameters) is a suite of decoder-only transformer LMs that are multi-task fine-tuned on 2000 datasets ([paper](https://arxiv.org/pdf/2212.12017.pdf)).
-    creator_organization: Meta
-    access: open
-    num_parameters: 30000000000
-    release_date: 2022-12-22
-    todo: true
-
-  - name: together/opt-175b
-    display_name: OPT (175B)
-    description: Open Pre-trained Transformers (175B parameters) is a suite of decoder-only pre-trained transformers that are fully and responsibly shared with interested researchers ([paper](https://arxiv.org/pdf/2205.01068.pdf)).
-    creator_organization: Meta
-    access: open
-    num_parameters: 175000000000
-    release_date: 2022-05-02
-
-  - name: together/opt-66b
-    display_name: OPT (66B)
-    description: Open Pre-trained Transformers (66B parameters) is a suite of decoder-only pre-trained transformers that are fully and responsibly shared with interested researchers ([paper](https://arxiv.org/pdf/2205.01068.pdf)).
-    creator_organization: Meta
-    access: open
-    num_parameters: 66000000000
-    release_date: 2022-05-02
-
-  - name: together/opt-6.7b
-    display_name: OPT (6.7B)
-    description: Open Pre-trained Transformers (6.7B parameters) is a suite of decoder-only pre-trained transformers that are fully and responsibly shared with interested researchers ([paper](https://arxiv.org/pdf/2205.01068.pdf)).
-    creator_organization: Meta
-    access: open
-    num_parameters: 6700000000
-    release_date: 2022-05-02
-
-  - name: together/opt-1.3b
-    display_name: OPT (1.3B)
-    description: Open Pre-trained Transformers (1.3B parameters) is a suite of decoder-only pre-trained transformers that are fully and responsibly shared with interested researchers ([paper](https://arxiv.org/pdf/2205.01068.pdf)).
-    creator_organization: Meta
-    access: open
-    num_parameters: 1300000000
-    release_date: 2022-05-02
-
-  - name: together/galactica-120b
-    display_name: Galactica (120B)
-    description: Galactica (120B parameters) is trained on 48 million papers, textbooks, lecture notes, compounds and proteins, scientific websites, etc. ([paper](https://galactica.org/static/paper.pdf)).
-    creator_organization: Meta
-    access: open
-    num_parameters: 120000000000
-    release_date: 2022-11-15
-    todo: true
-
-  - name: together/galactica-30b
-    display_name: Galactica (30B)
-    description: Galactica (30B parameters) is trained on 48 million papers, textbooks, lecture notes, compounds and proteins, scientific websites, etc. ([paper](https://galactica.org/static/paper.pdf)).
-    creator_organization: Meta
-    access: open
-    num_parameters: 30000000000
-    release_date: 2022-11-15
-    todo: true
-  - name: meta/llama-7b
-    display_name: LLaMA (7B)
-    description: LLaMA is a collection of foundation language models ranging from 7B to 65B parameters.
-    creator_organization: Meta
-    access: open
-    num_parameters: 7000000000
-    release_date: 2023-02-24
-  - name: meta/llama-13b
-    display_name: LLaMA (13B)
-    description: LLaMA is a collection of foundation language models ranging from 7B to 65B parameters.
-    creator_organization: Meta
-    access: open
-    num_parameters: 13000000000
-    release_date: 2023-02-24
-  - name: meta/llama-30b
-    display_name: LLaMA (30B)
-    description: LLaMA is a collection of foundation language models ranging from 7B to 65B parameters.
-    creator_organization: Meta
-    access: open
-    num_parameters: 30000000000
-    release_date: 2023-02-24
-  - name: meta/llama-65b
-    display_name: LLaMA (65B)
-    description: LLaMA is a collection of foundation language models ranging from 7B to 65B parameters.
-    creator_organization: Meta
-    access: open
-    num_parameters: 65000000000
-    release_date: 2023-02-24
-  - name: meta/llama-2-7b
-    display_name: Llama 2 (7B)
-    description: Llama 2 pretrained models are trained on 2 trillion tokens, and have double the context length of Llama 1.
-    creator_organization: Meta
-    access: open
-    num_parameters: 7000000000
-    release_date: 2023-07-18
-  - name: meta/llama-2-13b
-    display_name: Llama 2 (13B)
-    description: Llama 2 pretrained models are trained on 2 trillion tokens, and have double the context length of Llama 1.
-    creator_organization: Meta
-    access: open
-    num_parameters: 13000000000
-    release_date: 2023-07-18
-  - name: meta/llama-2-70b
-    display_name: Llama 2 (70B)
-    description: Llama 2 pretrained models are trained on 2 trillion tokens, and have double the context length of Llama 1.
-    creator_organization: Meta
-    access: open
-    num_parameters: 70000000000
-    release_date: 2023-07-18
-
-  # Stability AI
-  - name: stabilityai/stablelm-base-alpha-3b
-    display_name: StableLM-Base-Alpha (3B)
-    description: StableLM-Base-Alpha is a suite of 3B and 7B parameter decoder-only language models pre-trained on a diverse collection of English datasets with a sequence length of 4096 to push beyond the context window limitations of existing open-source language models.
-    creator_organization: Stability AI
-    access: open
-    num_parameters: 3000000000
-    release_date: 2023-04-20
-    todo: true
-
-  - name: stabilityai/stablelm-base-alpha-7b
-    display_name: StableLM-Base-Alpha (7B)
-    description: StableLM-Base-Alpha is a suite of 3B and 7B parameter decoder-only language models pre-trained on a diverse collection of English datasets with a sequence length of 4096 to push beyond the context window limitations of existing open-source language models.
-    creator_organization: Stability AI
-    access: open
-    num_parameters: 7000000000
-    release_date: 2023-04-20
-    todo: true
-
-  # Stanford
-  - name: stanford/alpaca-7b
-    display_name: Alpaca (7B)
-    description: Alpaca 7B is a model fine-tuned from the LLaMA 7B model on 52K instruction-following demonstrations.
-    creator_organization: Stanford
-    access: open
-    num_parameters: 7000000000
-    release_date: 2023-03-13
-
-  # LMSYS
-  - name: lmsys/vicuna-7b-v1.3
-    display_name: Vicuna v1.3 (7B)
-    description: Vicuna v1.3 (7B) is an open-source chatbot trained by fine-tuning LLaMA on user-shared conversations collected from ShareGPT.
-    creator_organization: LMSYS
-    access: open
-    num_parameters: 7000000000
-    release_date: 2023-06-22
-  - name: lmsys/vicuna-13b-v1.3
-    display_name: Vicuna v1.3 (13B)
-    description: Vicuna v1.3 (13B) is an open-source chatbot trained by fine-tuning LLaMA on user-shared conversations collected from ShareGPT.
-    creator_organization: LMSYS
-    access: open
-    num_parameters: 13000000000
-    release_date: 2023-06-22
-
-  # Mistral AI
-  - name: mistralai/mistral-7b-v0.1
-    display_name: Mistral v0.1 (7B)
-    description: Mistral 7B is a 7.3B parameter transformer model that uses Grouped-Query Attention (GQA) and Sliding-Window Attention (SWA).
-    creator_organization: Mistral AI
-    access: open
-    num_parameters: 7300000000
-    release_date: 2023-09-27
-
-  # Microsoft/NVIDIA
-  - name: microsoft/TNLGv2_530B
-    display_name: TNLG v2 (530B)
-    description: TNLG v2 (530B parameters) autoregressive language model trained on a filtered subset of the Pile and CommonCrawl ([paper](https://arxiv.org/pdf/2201.11990.pdf)).
-    creator_organization: Microsoft/NVIDIA
-    access: closed
-    num_parameters: 530000000000
-    release_date: 2022-01-28
-  - name: microsoft/TNLGv2_7B
-    display_name: TNLG v2 (6.7B)
-    description: TNLG v2 (6.7B parameters) autoregressive language model trained on a filtered subset of the Pile and CommonCrawl ([paper](https://arxiv.org/pdf/2201.11990.pdf)).
-    creator_organization: Microsoft/NVIDIA
-    access: closed
-    num_parameters: 6700000000
-    release_date: 2022-01-28
-
-  # OpenAI: https://beta.openai.com/docs/engines/gpt-3
-  - name: openai/davinci
-    display_name: davinci (175B)
-    description: Original GPT-3 (175B parameters) autoregressive language model ([paper](https://arxiv.org/pdf/2005.14165.pdf), [docs](https://beta.openai.com/docs/model-index-for-researchers)).
-    creator_organization: OpenAI
-    access: limited
-    num_parameters: 175000000000
-    release_date: 2020-05-28
-  - name: openai/curie
-    display_name: curie (6.7B)
-    description: Original GPT-3 (6.7B parameters) autoregressive language model ([paper](https://arxiv.org/pdf/2005.14165.pdf), [docs](https://beta.openai.com/docs/model-index-for-researchers)).
-    creator_organization: OpenAI
-    access: limited
-    num_parameters: 6700000000
-    release_date: 2020-05-28
-  - name: openai/babbage
-    display_name: babbage (1.3B)
-    description: Original GPT-3 (1.3B parameters) autoregressive language model ([paper](https://arxiv.org/pdf/2005.14165.pdf), [docs](https://beta.openai.com/docs/model-index-for-researchers)).
-    creator_organization: OpenAI
-    access: limited
-    num_parameters: 1300000000
-    release_date: 2020-05-28
-  - name: openai/ada
-    display_name: ada (350M)
-    description: Original GPT-3 (350M parameters) autoregressive language model ([paper](https://arxiv.org/pdf/2005.14165.pdf), [docs](https://beta.openai.com/docs/model-index-for-researchers)).
-    creator_organization: OpenAI
-    access: limited
-    num_parameters: 350000000
-    release_date: 2020-05-28
-  - name: openai/text-davinci-003
-    display_name: text-davinci-003
-    description: text-davinci-003 model that involves reinforcement learning (PPO) with reward models. Derived from text-davinci-002 ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
-    creator_organization: OpenAI
-    access: limited
-    num_parameters: 175000000000
-    release_date: 2022-11-28
-  - name: openai/text-davinci-002
-    display_name: text-davinci-002
-    description: text-davinci-002 model that involves supervised fine-tuning on human-written demonstrations. Derived from code-davinci-002 ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
-    creator_organization: OpenAI
-    access: limited
-    num_parameters: 175000000000
-    release_date: 2022-01-27
-  - name: openai/text-davinci-001
-    display_name: text-davinci-001
-    description: text-davinci-001 model that involves supervised fine-tuning on human-written demonstrations ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
-    creator_organization: OpenAI
-    access: limited
-    num_parameters: 175000000000
-    release_date: 2022-01-27
-    todo: true
-  - name: openai/text-curie-001
-    display_name: text-curie-001
-    description: text-curie-001 model that involves supervised fine-tuning on human-written demonstrations ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
-    creator_organization: OpenAI
-    access: limited
-    num_parameters: 6700000000
-    release_date: 2022-01-27
-  - name: openai/text-babbage-001
-    display_name: text-babbage-001
-    description: text-babbage-001 model that involves supervised fine-tuning on human-written demonstrations ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
-    creator_organization: OpenAI
-    access: limited
-    num_parameters: 1300000000
-    release_date: 2022-01-27
-  - name: openai/text-ada-001
-    display_name: text-ada-001
-    description: text-ada-001 model that involves supervised fine-tuning on human-written demonstrations ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
-    creator_organization: OpenAI
-    access: limited
-    num_parameters: 350000000
-    release_date: 2022-01-27
-  - name: openai/gpt-4-0314
-    display_name: gpt-4-0314
-    description: GPT-4 is a large multimodal model (currently only accepting text inputs and emitting text outputs) that is optimized for chat but works well for traditional completions tasks. Snapshot of gpt-4 from March 14th 2023.
-    creator_organization: OpenAI
-    access: limited
-    release_date: 2023-03-14
-  - name: openai/gpt-4-32k-0314
-    display_name: gpt-4-32k-0314
-    description: GPT-4 is a large multimodal model (currently only accepting text inputs and emitting text outputs) that is optimized for chat but works well for traditional completions tasks. Snapshot of gpt-4 with a longer context length of 32,768 tokens from March 14th 2023.
-    creator_organization: OpenAI
-    access: limited
-    release_date: 2023-03-14
-  - name: openai/gpt-4-0613
-    display_name: gpt-4-0613
-    description: GPT-4 is a large multimodal model (currently only accepting text inputs and emitting text outputs) that is optimized for chat but works well for traditional completions tasks. Snapshot of gpt-4 from 2023-06-13.
-    creator_organization: OpenAI
-    access: limited
-    release_date: 2023-06-13
-  - name: openai/gpt-4-32k-0613
-    display_name: gpt-4-32k-0613
-    description: GPT-4 is a large multimodal model (currently only accepting text inputs and emitting text outputs) that is optimized for chat but works well for traditional completions tasks. Snapshot of gpt-4 with a longer context length of 32,768 tokens from 2023-06-13.
-    creator_organization: OpenAI
-    access: limited
-    release_date: 2023-06-13
-  - name: openai/code-davinci-002
-    display_name: code-davinci-002
-    description: Codex-style model that is designed for pure code-completion tasks ([docs](https://beta.openai.com/docs/models/codex)).
-    creator_organization: OpenAI
-    access: limited
-  - name: openai/code-davinci-001
-    display_name: code-davinci-001
-    description: code-davinci-001 model
-    creator_organization: OpenAI
-    access: limited
-    todo: true
-  - name: openai/code-cushman-001
-    display_name: code-cushman-001 (12B)
-    description: Codex-style model that is a stronger, multilingual version of the Codex (12B) model in the [Codex paper](https://arxiv.org/pdf/2107.03374.pdf).
-    creator_organization: OpenAI
-    access: limited
-  - name: openai/gpt-3.5-turbo-0301
-    display_name: gpt-3.5-turbo-0301
-    description: Sibling model of text-davinci-003 is optimized for chat but works well for traditional completions tasks as well. Snapshot from 2023-03-01.
-    creator_organization: OpenAI
-    access: limited
-    release_date: 2023-03-01
-  - name: openai/gpt-3.5-turbo-0613
-    display_name: gpt-3.5-turbo-0613
-    description: Sibling model of text-davinci-003 is optimized for chat but works well for traditional completions tasks as well. Snapshot from 2023-06-13.
-    creator_organization: OpenAI
-    access: limited
-    release_date: 2023-06-13
-  - name: openai/gpt-3.5-turbo-16k-0613
-    display_name: gpt-3.5-turbo-16k-0613
-    description: Sibling model of text-davinci-003 is optimized for chat but works well for traditional completions tasks as well. Snapshot from 2023-06-13 with a longer context length of 16,384 tokens.
-    creator_organization: OpenAI
-    access: limited
-    release_date: 2023-06-13
-
-  # Together
-  - name: together/Together-gpt-JT-6B-v1
-    display_name: GPT-JT (6B)
-    description: GPT-JT (6B parameters) is a fork of GPT-J ([blog post](https://www.together.xyz/blog/releasing-v1-of-gpt-jt-powered-by-open-source-ai)).
-    creator_organization: Together
-    access: open
-    num_parameters: 6700000000
-    release_date: 2022-11-29
-    todo: true
-  - name: together/gpt-neoxt-chat-base-20b
-    display_name: GPT-NeoXT-Chat-Base (20B)
-    description: GPT-NeoXT-Chat-Base (20B) is fine-tuned from GPT-NeoX, serving as a base model for developing open-source chatbots.
-    creator_organization: Together
-    access: open
-    num_parameters: 20000000000
-    release_date: 2023-03-08
-    todo: true
-  - name: together/redpajama-incite-base-3b-v1
-    display_name: RedPajama-INCITE-Base-v1 (3B)
-    description: RedPajama-INCITE-Base-v1 (3B parameters) is a 3 billion base model that aims to replicate the LLaMA recipe as closely as possible.
-    creator_organization: Together
-    access: open
-    num_parameters: 3000000000
-    release_date: 2023-05-05
-  - name: together/redpajama-incite-instruct-3b-v1
-    display_name: RedPajama-INCITE-Instruct-v1 (3B)
-    description: RedPajama-INCITE-Instruct-v1 (3B parameters) is a model fine-tuned for few-shot applications on the data of GPT-JT. It is built from RedPajama-INCITE-Base-v1 (3B), a 3 billion base model that aims to replicate the LLaMA recipe as closely as possible.
-    creator_organization: Together
-    access: open
-    num_parameters: 3000000000
-    release_date: 2023-05-05
-    todo: true
-  - name: together/redpajama-incite-chat-3b-v1
-    display_name: RedPajama-INCITE-Chat-v1 (3B)
-    description: RedPajama-INCITE-Chat-v1 (3B parameters) is a model fine-tuned on OASST1 and Dolly2 to enhance chatting ability. It is built from RedPajama-INCITE-Base-v1 (3B), a 3 billion base model that aims to replicate the LLaMA recipe as closely as possible.
-    creator_organization: Together
-    access: open
-    num_parameters: 3000000000
-    release_date: 2023-05-05
-    todo: true
-  - name: together/redpajama-incite-base-7b
-    display_name: RedPajama-INCITE-Base (7B)
-    description: RedPajama-INCITE-Base (7B parameters) is a 7 billion base model that aims to replicate the LLaMA recipe as closely as possible.
-    creator_organization: Together
-    access: open
-    num_parameters: 7000000000
-    release_date: 2023-05-05
-    todo: true
-  - name: together/redpajama-incite-instruct-7b
-    display_name: RedPajama-INCITE-Instruct (7B)
-    description: RedPajama-INCITE-Instruct (7B parameters) is a model fine-tuned for few-shot applications on the data of GPT-JT. It is built from RedPajama-INCITE-Base (7B), a 7 billion base model that aims to replicate the LLaMA recipe as closely as possible.
-    creator_organization: Together
-    access: open
-    num_parameters: 7000000000
-    release_date: 2023-05-05
-    todo: true
-
-  # MosaicML
-  - name: mosaicml/mpt-7b
-    display_name: MPT (7B)
-    description: MPT (7B) is a Transformer trained from scratch on 1T tokens of text and code.
-    creator_organization: MosaicML
-    access: open
-    num_parameters: 6700000000
-    release_date: 2023-05-05
-  - name: mosaicml/mpt-7b-chat
-    display_name: MPT-Chat (7B)
-    description: MPT-Chat (7B) is a chatbot-like model for dialogue generation. It is built by finetuning MPT (7B), a Transformer trained from scratch on 1T tokens of text and code.
-    creator_organization: MosaicML
-    access: open
-    num_parameters: 6700000000
-    release_date: 2023-05-05
-    todo: true
-  - name: mosaicml/mpt-instruct-7b
-    display_name: MPT-Instruct (7B)
-    description: MPT-Instruct (7B) is a model for short-form instruction following. It is built by finetuning MPT (7B), a Transformer trained from scratch on 1T tokens of text and code.
-    creator_organization: MosaicML
-    access: open
-    num_parameters: 6700000000
-    release_date: 2023-05-05
-  - name: mosaicml/mpt-30b
-    display_name: MPT (30B)
-    description: MPT (30B) is a Transformer trained from scratch on 1T tokens of text and code.
-    creator_organization: MosaicML
-    access: open
-    num_parameters: 30000000000
-    release_date: 2023-06-22
-  - name: mosaicml/mpt-30b-chat
-    display_name: MPT-Chat (30B)
-    description: MPT-Chat (30B) is a chatbot-like model for dialogue generation. It is built by finetuning MPT (30B), a Transformer trained from scratch on 1T tokens of text and code.
-    creator_organization: MosaicML
-    access: open
-    num_parameters: 30000000000
-    release_date: 2023-06-22
-    todo: true
-  - name: mosaicml/mpt-instruct-30b
-    display_name: MPT-Instruct (30B)
-    description: MPT-Instruct (30B) is a model for short-form instruction following. It is built by finetuning MPT (30B), a Transformer trained from scratch on 1T tokens of text and code.
-    creator_organization: MosaicML
-    access: open
-    num_parameters: 30000000000
-    release_date: 2023-06-22
-
-  # TII UAE
-  - name: tiiuae/falcon-7b
-    display_name: Falcon (7B)
-    description: Falcon-7B is a 7B parameters causal decoder-only model built by TII and trained on 1,500B tokens of RefinedWeb enhanced with curated corpora.
-    creator_organization: TII UAE
-    access: open
-    num_parameters: 7000000000
-    release_date: 2023-03-15
-  - name: tiiuae/falcon-7b-instruct
-    display_name: Falcon-Instruct (7B)
-    description: Falcon-7B-Instruct is a 7B parameters causal decoder-only model built by TII based on Falcon-7B and finetuned on a mixture of chat/instruct datasets.
-    creator_organization: TII UAE
-    access: open
-    num_parameters: 7000000000
-    release_date: 2023-03-15
-  - name: tiiuae/falcon-40b
-    display_name: Falcon (40B)
-    description: Falcon-40B is a 40B parameters causal decoder-only model built by TII and trained on 1,500B tokens of RefinedWeb enhanced with curated corpora.
-    creator_organization: TII UAE
-    access: open
-    num_parameters: 40000000000
-    release_date: 2023-05-25
-  - name: tiiuae/falcon-40b-instruct
-    display_name: Falcon-Instruct (40B)
-    description: Falcon-40B-Instruct is a 40B parameters causal decoder-only model built by TII based on Falcon-40B and finetuned on a mixture of chat/instruct datasets.
-    creator_organization: TII UAE
-    access: open
-    num_parameters: 40000000000
-    release_date: 2023-05-25
-
-  # Salesforce
-  - name: together/codegen
-    display_name: CodeGen (16B)
-    description: CodeGen (16B parameters) is an open dense code model trained for multi-turn program synthesis ([paper](https://arxiv.org/pdf/2203.13474.pdf)).
-    creator_organization: Salesforce
-    access: open
-    num_parameters: 16000000000
-    release_date: 2022-03-25
-    todo: true
-
-  # Tsinghua
-  - name: together/glm
-    display_name: GLM (130B)
-    description: GLM (130B parameters) is an open bilingual (English & Chinese) bidirectional dense model that was trained using the General Language Model (GLM) procedure ([paper](https://arxiv.org/pdf/2210.02414.pdf)).
-    creator_organization: Tsinghua
-    access: open
-    num_parameters: 130000000000
-    release_date: 2022-08-04
-
-  - name: together/codegeex
-    display_name: CodeGeeX (13B)
-    description: CodeGeeX (13B parameters) is an open dense code model trained on more than 20 programming languages on a corpus of more than 850B tokens ([blog](http://keg.cs.tsinghua.edu.cn/codegeex/)).
-    creator_organization: Tsinghua
-    access: open
-    num_parameters: 13000000000
-    release_date: 2022-09-19
-    todo: true
-
-  # Writer
-  - name: writer/palmyra-base
-    display_name: Palmyra Base (5B)
-    description: Palmyra Base (5B)
-    creator_organization: Writer
-    access: limited
-    num_parameters: 5000000000
-    release_date: 2022-10-13
-    todo: true
-  - name: writer/palmyra-large
-    display_name: Palmyra Large (20B)
-    description: Palmyra Large (20B)
-    creator_organization: Writer
-    access: limited
-    num_parameters: 20000000000
-    release_date: 2022-12-23
-    todo: true
-  - name: writer/palmyra-instruct-30
-    display_name: InstructPalmyra (30B)
-    description: InstructPalmyra (30B parameters) is trained using reinforcement learning techniques based on feedback from humans.
-    creator_organization: Writer
-    access: limited
-    num_parameters: 30000000000
-    release_date: 2023-02-16
-    todo: true
-  - name: writer/palmyra-e
-    display_name: Palmyra E (30B)
-    description: Palmyra E (30B)
-    creator_organization: Writer
-    access: limited
-    num_parameters: 30000000000
-    release_date: 2023-03-03
-    todo: true
-  - name: writer/silk-road
-    display_name: Silk Road (35B)
-    description: Silk Road (35B)
-    creator_organization: Writer
-    access: limited
-    num_parameters: 35000000000
-    release_date: 2023-04-13
-    todo: true
-  - name: writer/palmyra-x
-    display_name: Palmyra X (43B)
-    description: Palmyra-X (43B parameters) is trained to adhere to instructions using human feedback and utilizes a technique called multiquery attention. Furthermore, a new feature called 'self-instruct' has been introduced, which includes the implementation of an early stopping criteria specifically designed for minimal instruction tuning ([paper](https://dev.writer.com/docs/becoming-self-instruct-introducing-early-stopping-criteria-for-minimal-instruct-tuning)).
-    creator_organization: Writer
-    access: limited
-    num_parameters: 43000000000
-    release_date: 2023-06-11
-    todo: true
-
-  # Yandex
-  - name: together/yalm
-    display_name: YaLM (100B)
-    description: YaLM (100B parameters) is an autoregressive language model trained on English and Russian text ([GitHub](https://github.com/yandex/YaLM-100B)).
-    creator_organization: Yandex
-    access: open
-    num_parameters: 100000000000
-    release_date: 2022-06-23
-
-  # NVIDIA
-  - name: nvidia/megatron-gpt2
-    display_name: Megatron GPT2
-    description: GPT-2 implemented in Megatron-LM ([paper](https://arxiv.org/abs/1909.08053)).
-    creator_organization: NVIDIA
-    access: open
-    todo: true
-
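Every entry in the removed block above follows the same schema shape. For orientation, here is one representative entry reassembled from the deleted lines (the top-level `models:` key is an assumption based on the field layout; the `todo: true` flag is optional and appears only on some entries):

```yaml
models:
  - name: meta/llama-2-70b      # <creator_organization>/<model name>
    display_name: Llama 2 (70B)
    description: Llama 2 pretrained models are trained on 2 trillion tokens, and have double the context length of Llama 1.
    creator_organization: Meta
    access: open                # one of open, limited, or closed
    num_parameters: 70000000000
    release_date: 2023-07-18
```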
 ############################################################
 adapter:
   - name: method
@@ -961,8 +42,12 @@ adapter:
     description: Maximum number of possible outputs to generate by sampling multiple outputs.
   - name: num_train_trials
     description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
+  - name: sample_train
+    description: If true, randomly sample N training examples; if false, select N consecutive training examples
   - name: model
-    description: Name of the language model (<
+    description: Name of the language model (<creator_organization>/<model name>) to send requests to.
+  - name: model_deployment
+    description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
   - name: temperature
     description: Temperature parameter used in generation.
   - name: max_tokens
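The `model`/`model_deployment` split above separates the organization that created a model from the organization that hosts it. A minimal hypothetical illustration (the deployment value is invented for the example):

```yaml
# Hypothetical adapter values: the creator prefix and the host prefix
# may differ once deployments are specified separately from models.
model: meta/llama-2-70b                  # <creator_organization>/<model name>
model_deployment: together/llama-2-70b   # <host_organization>/<model name>
```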
@@ -971,6 +56,8 @@ adapter:
     description: List of sequences, where we stop generation if we encounter any of them.
   - name: random
     description: Random seed (string), which guarantees reproducibility.
+  - name: multi_label
+    description: If true, for instances with multiple correct references, the gold answer should be considered to be all of the correct references rather than any of the correct references.
 
 ############################################################
 metrics:
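A sketch of what the new `multi_label` flag changes, using a hypothetical instance in the reference/tag layout HELM uses elsewhere:

```yaml
# Hypothetical instance with two correct references.
input: Which prime numbers are less than 5?
references:
  - output: "2"
    tags: [correct]
  - output: "3"
    tags: [correct]
# multi_label: false -> predicting any one correct reference (e.g. "3") scores the instance.
# multi_label: true  -> the gold answer is the full set of correct references, {"2", "3"}.
```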
@@ -1059,6 +146,7 @@ metrics:
     short_display_name: PEM
     description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
     lower_is_better: false
+
   - name: exact_match@5
     display_name: Exact match @5
     short_display_name: EM@5
@@ -1069,6 +157,17 @@ metrics:
     short_display_name: EM@5
     description: Fraction of instances where at least one predicted output among the top 5 matches a correct reference up to light processing.
     lower_is_better: false
+  - name: prefix_exact_match@5
+    display_name: Prefix exact match @5
+    short_display_name: PEM@5
+    description: Fraction of instances that the predicted output among the top 5 matches the prefix of a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_prefix_exact_match@5
+    display_name: Prefix quasi-exact match @5
+    short_display_name: PEM@5
+    description: Fraction of instances that the predicted output among the top 5 matches the prefix of a correct reference up to light processing.
+    lower_is_better: false
+
   - name: logprob
     display_name: Log probability
     short_display_name: Logprob
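A small worked example of the @5 metric family (outputs are hypothetical): each metric scores 1 when at least one of the top 5 sampled outputs satisfies its match rule:

```yaml
# Hypothetical: the correct reference is "Paris"; five sampled outputs.
predictions: ["Lyon", "paris", "Paris", "France", "The capital"]
# exact_match@5       = 1 ("Paris" matches the reference exactly)
# quasi_exact_match@5 = 1 ("paris" also counts after light processing such as lowercasing)
```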
@@ -1163,10 +262,15 @@ metrics:
     lower_is_better: false
   - name: math_equiv_chain_of_thought
     display_name: Equivalent (chain of thought)
-    description: Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-
+    description: Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.
     lower_is_better: false
   - name: exact_match_indicator
-    display_name: Exact match (
+    display_name: Exact match (final)
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator (e.g., space).
+    lower_is_better: false
+  - name: final_number_exact_match
+    display_name: Exact match (final number)
     short_display_name: EM
     description: Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.
     lower_is_better: false
@@ -1464,6 +568,50 @@ metrics:
     lower_is_better: false
   # CLEVA Chinese bias, copyright and toxicity metrics share the same name as the original HELM metrics
 
+
+  # DecodingTrust Fairness Metrics
+  - name: equalized_odds_difference
+    display_name: Equalized Odds Difference
+    short_display_name: Equalized Odds
+    description: The maximum of the difference in true positive rates and the difference in false positive rates between groups defined by the sensitive attribute.
+    lower_is_better: true
+
+  - name: demographic_parity_difference
+    display_name: Demographic Parity Difference
+    short_display_name: demographic parity
+    description: The demographic parity difference measures the difference between the probability of positive predictions conditioned on sensitive attribute A = 1 and that conditioned on A = 0.
+    lower_is_better: true
+
+
+  # DecodingTrust OoD Metrics
+  - name: ood_knowledge_rr
+    display_name: OOD Knowledge Refusal Rate
+    short_display_name: OoD-RR
+    description: The rate at which the model refuses to answer an OoD question.
+    lower_is_better: false
+
+  - name: ood_knowledge_macc
+    display_name: OOD Meaningful Answer Accuracy (Exclude Refusal)
+    short_display_name: OoD-MACC
+    description: MACC indicates the percentage of correct answers among the cases where the model generates a meaningful response, i.e., excluding refusals.
+    lower_is_better: false
+
+
+  # DecodingTrust Stereotype Bias Metrics
+
+  - name: decodingtrust_stereotype_bias
+    display_name: DecodingTrust Bias Score
+    short_display_name: Bias Score
+    description: 1 - agreementIndex (Section 4.2 of DecodingTrust)
+    lower_is_better: false
+
+  - name: decodingtrust_stereotype_bias_rejection_rate
+    display_name: Stereotype Bias Rejection Rate
+    short_display_name: Bias Rejection Rate
+    description: Rejection rate of stereotype prompts
+    lower_is_better: false
+
+
 ############################################################
 perturbations:
   - name: robustness
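For reference, the two fairness metrics above are conventionally defined as follows (a sketch in standard notation, with sensitive attribute A, label Y, and prediction Ŷ):

```latex
% Demographic parity difference, per the description above:
\mathrm{DPD} = \bigl|\Pr(\hat{Y}=1 \mid A=1) - \Pr(\hat{Y}=1 \mid A=0)\bigr|
% Equalized odds difference: the larger of the gaps in true positive
% and false positive rates between the two groups:
\mathrm{EOD} = \max_{y \in \{0,1\}} \bigl|\Pr(\hat{Y}=1 \mid A=1, Y=y) - \Pr(\hat{Y}=1 \mid A=0, Y=y)\bigr|
```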
@@ -1514,7 +662,7 @@ metric_groups:
         split: ${main_split}
 
   - name: calibration_detailed
-    display_name: Calibration
+    display_name: Calibration (Detailed)
     description: Measures how calibrated the model is (how meaningful its uncertainty estimates are).
     metrics:
       - name: max_prob
@@ -1545,7 +693,7 @@ metric_groups:
 
   # TODO: Add other robustness perturbations
   - name: robustness_detailed
-    display_name: Robustness
+    display_name: Robustness (Detailed)
     description: Measures how robust the model is to invariances.
     metrics:
       - name: ${main_name}
@@ -1564,7 +712,7 @@ metric_groups:
 
   # TODO: Add other fairness perturbations
   - name: fairness_detailed
-    display_name: Fairness
+    display_name: Fairness (Detailed)
     description: Measures how fair the model is.
     metrics:
       - name: ${main_name}
@@ -1602,7 +750,7 @@ metric_groups:
         split: ${main_split}
 
   - name: efficiency_detailed
-    display_name: Efficiency
+    display_name: Efficiency (Detailed)
     description: The efficiency of the model across both training and inference.
     metrics:
       - name: inference_runtime
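The `${main_name}`/`${main_split}` placeholders in the groups above are resolved per run group through its `environment` block; a condensed sketch of the pattern as it appears in this schema:

```yaml
metric_groups:
  - name: robustness_detailed
    display_name: Robustness (Detailed)
    metrics:
      - name: ${main_name}     # filled from the run group's environment, e.g. quasi_exact_match
        split: ${main_split}   # e.g. test
```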
@@ -1747,6 +895,31 @@ metric_groups:
       - name: chinese_bleu_1
         split: ${main_split}
 
+  - name: decodingtrust_fairness_metrics
+    display_name: DecodingTrust Fairness
+    metrics:
+      - name: equalized_odds_difference
+        split: ${main_split}
+      - name: demographic_parity_difference
+        split: ${main_split}
+
+  - name: decodingtrust_ood_metrics
+    display_name: DecodingTrust OOD Accuracy
+    metrics:
+      - name: ood_knowledge_rr
+        split: ${main_split}
+      - name: ood_knowledge_macc
+        split: ${main_split}
+
+  - name: decodingtrust_stereotype_bias_metrics
+    display_name: DecodingTrust Stereotype Bias
+    metrics:
+      - name: decodingtrust_stereotype_bias
+        split: ${main_split}
+      - name: decodingtrust_stereotype_bias_rejection_rate
+        split: ${main_split}
+
+
 ############################################################
 run_groups:
   ## Top-level
@@ -1910,6 +1083,7 @@ run_groups:
       - synthetic_efficiency
     adapter_keys_shown:
       - model
+      - model_deployment
       - max_tokens
 
   - name: calibration
@@ -1928,6 +1102,20 @@ run_groups:
       main_name: none
       main_split: none
 
+  - name: decodingtrust
+    display_name: DecodingTrust
+    description: A comprehensive benchmark of the trustworthiness of large language models [(Wang et al., 2023)](https://decodingtrust.github.io/)
+    category: Core scenarios
+    subgroups:
+      - decodingtrust_adv_robustness
+      - decodingtrust_adv_demonstration
+      - decodingtrust_ood_robustness
+      - decodingtrust_fairness
+      - decodingtrust_privacy
+      - decodingtrust_machine_ethics
+      - decodingtrust_toxicity_prompts
+      - decodingtrust_stereotype_bias
+
   ### Ablations
   - name: ablation_in_context
     display_name: Vary number of in-context examples
@@ -1941,6 +1129,7 @@ run_groups:
       - civil_comments
     adapter_keys_shown:
       - model
+      - model_deployment
      - max_train_instances
     subgroup_metric_groups_hidden:
       - robustness
@@ -1962,6 +1151,7 @@ run_groups:
       - bbq
     adapter_keys_shown:
       - model
+      - model_deployment
       - method
 
   - name: ablation_prompts
@@ -1976,6 +1166,7 @@ run_groups:
       - civil_comments
     adapter_keys_shown:
       - model
+      - model_deployment
       - instructions
       - input_prefix
       - input_suffix
@@ -2636,8 +1827,8 @@ run_groups:
       language: synthetic
 
   - name: math_chain_of_thought
-    display_name: MATH (chain-of-
-    description: The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-
+    display_name: MATH (chain-of-thought)
+    description: The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/be83ab3ecd0db773eb2dc1b0a17836a1-Abstract-round2.html).
     metric_groups:
       - accuracy
       - efficiency
@@ -2687,6 +1878,23 @@ run_groups:
       when: n/a
       language: synthetic
 
+  - name: legalbench
+    display_name: LegalBench
+    description: LegalBench is a large collaboratively constructed benchmark of legal reasoning. Five representative tasks are included here. See [(Guha et al., 2023)](https://arxiv.org/abs/2308.11462) for more details.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: quasi_exact_match
+      main_split: test
+    taxonomy:
+      task: "text classification"
+      what: "fact patterns, questions, and legal documents"
+      who: "lawyers"
+      when: n/a
+      language: English
+
   - name: legal_support
     display_name: LegalSupport
     description: Scenario introduced in this work to measure fine-grained legal reasoning through reverse entailment.
@@ -2721,6 +1929,40 @@ run_groups:
       when: n/a
       language: synthetic
 
+  - name: med_qa
+    display_name: MedQA
+    description: MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: quasi_exact_match
+      main_split: test
+    taxonomy:
+      task: question answering
+      what: n/a
+      who: n/a
+      when: n/a
+      language: English
+
+  - name: wmt_14
+    display_name: WMT 2014
+    description: WMT 2014 is a collection of machine translation datasets.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: bleu_4
+      main_split: test
+    taxonomy:
+      task: machine translation
+      what: n/a
+      who: n/a
+      when: n/a
+      language: English
+
   - name: lextreme
     display_name: LEXTREME
     description: A Multilingual Legal Benchmark for Natural Language Understanding
@@ -2981,6 +2223,7 @@ run_groups:
       main_split: test
     adapter_keys_shown:
       - model
+      - model_deployment
       - max_tokens
     taxonomy:
       task: "?"
@@ -3402,7 +2645,7 @@ run_groups:
 
   - name: cleva_mathematical_reasoning
     display_name: CLEVA (Chinese) mathematical reasoning
-    description: "Scenario that tests models' mathematical reasoning ability with chain-of-
+    description: "Scenario that tests models' mathematical reasoning ability with chain-of-thought style reasoning. It contains a math word problem solving subtask."
     metric_groups:
       - cleva_mathematical_reasoning_metrics
       - general_information
@@ -3449,7 +2692,7 @@ run_groups:
       main_split: test
     taxonomy:
       task: toxicity classification
-      what: text from Chinese social media 
+      what: text from Chinese social media
       who: web users
       when: 2022 or before
       language: Chinese
@@ -3649,3 +2892,176 @@ run_groups:
       task: user-facing tasks
       language: English dialects
     todo: true
+
+
+  # DecodingTrust scenarios
+  - name: decodingtrust_adv_robustness
+    display_name: DecodingTrust - AdvGLUE++
+    short_display_name: AdvGLUE++
+    description: Adversarial perturbations of the GLUE dataset generated against open-source LLMs including Alpaca, Vicuna, and Stable-Vicuna
+    metric_groups:
+      - accuracy
+      - calibration
+      - efficiency
+      - general_information
+    environment:
+      main_name: quasi_exact_match
+      main_split: test
+    taxonomy:
+      task: text classification
+      what: "?"
+      who: "?"
+      when: "?"
+      language: English
+    todo: true
+
+  - name: decodingtrust_adv_demonstration
+    display_name: DecodingTrust - Adversarial Demonstrations
+    short_display_name: AdvDemo
+    description: Robustness analysis of LM generations when facing adversarial demonstrations
+    metric_groups:
+      - accuracy
+      - calibration
+      - efficiency
+      - general_information
+    environment:
+      main_name: quasi_exact_match
+      main_split: test
+    taxonomy:
+      task: text classification
+      what: "?"
+      who: "?"
+      when: "?"
+      language: English
+
+  - name: decodingtrust_ood_robustness
+    display_name: DecodingTrust - OoD Robustness
+    short_display_name: OoD
+    description: Style perturbations of GLUE datasets (OoD styles) and out-of-scope OoD knowledge evaluations
+    metric_groups:
+      - accuracy
+      - calibration
+      - efficiency
+      - general_information
+      - decodingtrust_ood_metrics
+    environment:
+      main_name: quasi_exact_match
+      main_split: test
+    taxonomy:
+      task: text classification
+      what: "?"
+      who: "?"
+      when: "?"
+      language: English
+
+  - name: decodingtrust_fairness
+    display_name: DecodingTrust - Fairness
+    short_display_name: Fairness
+    description: Fairness analysis of LLMs
+    metric_groups:
+      - accuracy
+      - calibration
+      - efficiency
+      - general_information
+      - decodingtrust_fairness_metrics
+    environment:
+      main_name: quasi_exact_match
+      main_split: test
+    taxonomy:
+      task: text classification
+      what: "?"
+      who: "?"
+      when: "?"
+      language: English
+
+  - name: decodingtrust_privacy
+    display_name: DecodingTrust - Privacy
+    short_display_name: Privacy
+    description: Evaluation of the privacy understanding and privacy preserving properties of LLMs
+    metric_groups:
+      - accuracy
+      - calibration
+      - efficiency
+      - general_information
+    environment:
+      main_name: quasi_exact_match
+      main_split: test
+    taxonomy:
+      task: text classification
+      what: "?"
+      who: "?"
+      when: "?"
+      language: English
+
+  - name: decodingtrust_machine_ethics
+    display_name: DecodingTrust - Ethics
+    short_display_name: Ethics
+    description: Evaluation of the understanding of ethical behaviors of LLMs
+    metric_groups:
+      - accuracy
+      - calibration
+      - efficiency
+      - general_information
+    environment:
+      main_name: quasi_exact_match
+      main_split: test
+    taxonomy:
+      task: text classification
+      what: "?"
+      who: "?"
+      when: "?"
+      language: English
+
+  - name: decodingtrust_toxicity_prompts
+    display_name: DecodingTrust - Toxicity
+    short_display_name: Toxicity
+    description: Evaluation of the toxicity of LLM generations on the DecodingTrust toxicity prompts
+    metric_groups:
+      - toxicity
+      - bias
+      - efficiency
+      - general_information
+    environment:
+      main_split: test
+    taxonomy:
+      task: "?"
+      what: n/a
+      who: n/a
+      when: n/a
+      language: synthetic
+
+  - name: decodingtrust_stereotype_bias
+    display_name: DecodingTrust - Stereotype Bias
+    short_display_name: Stereotype
+    description: Manually crafted stereotype user prompts from DecodingTrust
+    metric_groups:
+      - toxicity
+      - bias
+      - efficiency
+      - general_information
+      - decodingtrust_stereotype_bias_metrics
+    environment:
+      main_split: test
+    taxonomy:
+      task: "?"
+      what: n/a
+      who: n/a
+      when: n/a
+      language: synthetic
+
+  - name: thai_exam
+    display_name: Thai Exam
+    short_display_name: ThaiExam
+    description: A benchmark comprising Thai multiple-choice examinations.
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: question answering
+      what: "?"
+      who: "?"
+      when: "?"
+      language: Thai
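All run groups added in this release share one shape. A condensed template of the recurring keys, with placeholder values, drawn from the entries above:

```yaml
run_groups:
  - name: <group_name>
    display_name: <human-readable name>
    short_display_name: <optional abbreviation>
    description: <one-line summary>
    metric_groups:                  # which metric groups to display for the group
      - accuracy
      - general_information
    environment:                    # resolves ${main_name} / ${main_split} in metric groups
      main_name: quasi_exact_match
      main_split: test
    taxonomy:
      task: question answering
      what: n/a
      who: n/a
      when: n/a
      language: English
```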