crfm-helm 0.4.0__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic; consult the package registry's advisory page for details before installing.

Files changed (1033):
  1. crfm_helm-0.5.10.dist-info/METADATA +369 -0
  2. crfm_helm-0.5.10.dist-info/RECORD +1008 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +80 -29
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +39 -28
  7. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  8. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  9. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  10. helm/benchmark/adaptation/adapters/generation_adapter.py +2 -1
  11. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +24 -8
  12. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +3 -4
  13. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  14. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +2 -1
  15. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  16. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +112 -0
  17. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +6 -3
  18. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +3 -1
  19. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  20. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +18 -8
  21. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  22. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  23. helm/benchmark/adaptation/adapters/test_adapter.py +5 -4
  24. helm/benchmark/adaptation/adapters/test_generation_adapter.py +46 -22
  25. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +17 -29
  26. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +138 -16
  27. helm/benchmark/adaptation/common_adapter_specs.py +443 -0
  28. helm/benchmark/adaptation/prompt.py +1 -1
  29. helm/benchmark/adaptation/request_state.py +6 -1
  30. helm/benchmark/adaptation/scenario_state.py +6 -2
  31. helm/benchmark/annotation/aci_bench_annotator.py +84 -0
  32. helm/benchmark/annotation/air_bench_annotator.py +79 -0
  33. helm/benchmark/annotation/alrage_annotator.py +90 -0
  34. helm/benchmark/annotation/annotator.py +48 -0
  35. helm/benchmark/annotation/annotator_factory.py +50 -0
  36. helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
  37. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  38. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  39. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  40. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  41. helm/benchmark/annotation/call_center_annotator.py +258 -0
  42. helm/benchmark/annotation/chw_care_plan_annotator.py +82 -0
  43. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  44. helm/benchmark/annotation/dischargeme_annotator.py +96 -0
  45. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  46. helm/benchmark/annotation/financebench_annotator.py +79 -0
  47. helm/benchmark/annotation/harm_bench_annotator.py +55 -0
  48. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  49. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +93 -0
  50. helm/benchmark/annotation/image2struct/latex_compiler_annotator.py +59 -0
  51. helm/benchmark/annotation/image2struct/lilypond_compiler_annotator.py +86 -0
  52. helm/benchmark/annotation/image2struct/webpage_compiler_annotator.py +132 -0
  53. helm/benchmark/annotation/live_qa_annotator.py +76 -0
  54. helm/benchmark/annotation/med_dialog_annotator.py +88 -0
  55. helm/benchmark/annotation/medalign_annotator.py +89 -0
  56. helm/benchmark/annotation/medi_qa_annotator.py +87 -0
  57. helm/benchmark/annotation/medication_qa_annotator.py +86 -0
  58. helm/benchmark/annotation/mental_health_annotator.py +87 -0
  59. helm/benchmark/annotation/mimic_bhc_annotator.py +89 -0
  60. helm/benchmark/annotation/mimic_rrs_annotator.py +89 -0
  61. helm/benchmark/annotation/model_as_judge.py +309 -0
  62. helm/benchmark/annotation/mtsamples_procedures_annotator.py +87 -0
  63. helm/benchmark/annotation/mtsamples_replicate_annotator.py +90 -0
  64. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  65. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  66. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  67. helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
  68. helm/benchmark/annotation/spider_annotator.py +18 -0
  69. helm/benchmark/annotation/starr_patient_instructions_annotator.py +87 -0
  70. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  71. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  72. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  73. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  74. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  75. helm/benchmark/annotation/xstest_annotator.py +100 -0
  76. helm/benchmark/annotation_executor.py +144 -0
  77. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  78. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  79. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  80. helm/benchmark/augmentations/data_augmenter.py +0 -2
  81. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  82. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  83. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  84. helm/benchmark/augmentations/gender_perturbation.py +3 -3
  85. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  86. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  87. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  88. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  89. helm/benchmark/augmentations/perturbation.py +26 -4
  90. helm/benchmark/augmentations/perturbation_description.py +1 -1
  91. helm/benchmark/augmentations/space_perturbation.py +2 -2
  92. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  93. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  94. helm/benchmark/augmentations/test_perturbation.py +56 -19
  95. helm/benchmark/augmentations/translate_perturbation.py +31 -0
  96. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  97. helm/benchmark/config_registry.py +7 -1
  98. helm/benchmark/data_preprocessor.py +2 -2
  99. helm/benchmark/executor.py +54 -25
  100. helm/benchmark/huggingface_registration.py +28 -10
  101. helm/benchmark/metrics/air_bench_metrics.py +3212 -0
  102. helm/benchmark/metrics/alrage_metric.py +35 -0
  103. helm/benchmark/metrics/annotation_metrics.py +108 -0
  104. helm/benchmark/metrics/basic_metrics.py +437 -667
  105. helm/benchmark/metrics/bbq_metrics.py +17 -6
  106. helm/benchmark/metrics/bias_metrics.py +18 -9
  107. helm/benchmark/metrics/bias_word_lists.py +1 -1
  108. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  109. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  110. helm/benchmark/metrics/classification_metrics.py +107 -22
  111. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  112. helm/benchmark/metrics/cleva_harms_metrics.py +12 -11
  113. helm/benchmark/metrics/code_metrics.py +5 -5
  114. helm/benchmark/metrics/code_metrics_helper.py +11 -3
  115. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  116. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  117. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  118. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  119. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  120. helm/benchmark/metrics/comet_metric.py +125 -0
  121. helm/benchmark/metrics/common_metric_specs.py +174 -0
  122. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +83 -0
  123. helm/benchmark/metrics/copyright_metrics.py +5 -5
  124. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  125. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  126. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  127. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  128. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  129. helm/benchmark/metrics/disinformation_metrics.py +8 -114
  130. helm/benchmark/metrics/dry_run_metrics.py +35 -6
  131. helm/benchmark/metrics/efficiency_metrics.py +287 -0
  132. helm/benchmark/metrics/ehr_sql_metrics.py +159 -0
  133. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  134. helm/benchmark/metrics/evaluate_reference_metrics.py +831 -0
  135. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  136. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  137. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +115 -0
  138. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  139. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  140. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  141. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +48 -0
  142. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  143. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  144. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  145. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  146. helm/benchmark/metrics/ifeval_metrics.py +67 -0
  147. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  148. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  149. helm/benchmark/metrics/image_generation/clip_score_metrics.py +84 -0
  150. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  151. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  152. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  153. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  154. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  155. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  156. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  157. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  158. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  159. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  160. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  161. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  162. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  163. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  164. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  165. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  166. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  167. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  168. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  169. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  170. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  171. helm/benchmark/metrics/image_generation/q16/test_q16.py +20 -0
  172. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  173. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  174. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  175. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  176. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  177. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  178. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  179. helm/benchmark/metrics/instruction_following_critique_metrics.py +48 -5
  180. helm/benchmark/metrics/kpi_edgar_metrics.py +142 -0
  181. helm/benchmark/metrics/language_modeling_metrics.py +111 -0
  182. helm/benchmark/metrics/live_qa_metrics.py +35 -0
  183. helm/benchmark/metrics/llm_jury_metrics.py +58 -0
  184. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  185. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  186. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  187. helm/benchmark/metrics/medcalc_bench_metrics.py +137 -0
  188. helm/benchmark/metrics/medec_metrics.py +124 -0
  189. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  190. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  191. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  192. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  193. helm/benchmark/metrics/metric.py +121 -175
  194. helm/benchmark/metrics/metric_name.py +0 -1
  195. helm/benchmark/metrics/metric_service.py +23 -7
  196. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +127 -0
  197. helm/benchmark/metrics/nltk_helper.py +32 -0
  198. helm/benchmark/metrics/omni_math_metrics.py +44 -0
  199. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  200. helm/benchmark/metrics/output_processing_metric.py +60 -0
  201. helm/benchmark/metrics/output_processors.py +15 -0
  202. helm/benchmark/metrics/paraphrase_generation_metrics.py +5 -6
  203. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  204. helm/benchmark/metrics/ranking_metrics.py +5 -5
  205. helm/benchmark/metrics/reference_metric.py +148 -0
  206. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  207. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  208. helm/benchmark/metrics/safety_metrics.py +91 -0
  209. helm/benchmark/metrics/seahelm_metrics.py +201 -0
  210. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  211. helm/benchmark/metrics/spider_metrics.py +7 -0
  212. helm/benchmark/metrics/statistic.py +1 -1
  213. helm/benchmark/metrics/summac/model_summac.py +8 -11
  214. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  215. helm/benchmark/metrics/summarization_metrics.py +150 -11
  216. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  217. helm/benchmark/metrics/test_classification_metrics.py +145 -70
  218. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  219. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +20 -1
  220. helm/benchmark/metrics/test_metric.py +3 -3
  221. helm/benchmark/metrics/test_statistic.py +2 -2
  222. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  223. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  224. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  225. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  226. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +11 -3
  227. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  228. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +3 -3
  229. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +7 -7
  230. helm/benchmark/metrics/toxicity_metrics.py +37 -7
  231. helm/benchmark/metrics/toxicity_utils.py +23 -0
  232. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  233. helm/benchmark/metrics/unitxt_metrics.py +107 -0
  234. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  235. helm/benchmark/metrics/vision_language/emd_utils.py +347 -0
  236. helm/benchmark/metrics/vision_language/image_metrics.py +537 -0
  237. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  238. helm/benchmark/metrics/wildbench_metrics.py +54 -0
  239. helm/benchmark/model_deployment_registry.py +69 -5
  240. helm/benchmark/model_metadata_registry.py +58 -2
  241. helm/benchmark/multi_gpu_runner.py +133 -0
  242. helm/benchmark/presentation/contamination.py +3 -3
  243. helm/benchmark/presentation/create_plots.py +51 -20
  244. helm/benchmark/presentation/run_display.py +51 -12
  245. helm/benchmark/presentation/run_entry.py +2 -2
  246. helm/benchmark/presentation/schema.py +83 -66
  247. helm/benchmark/presentation/summarize.py +483 -388
  248. helm/benchmark/presentation/table.py +8 -8
  249. helm/benchmark/presentation/taxonomy_info.py +20 -0
  250. helm/benchmark/presentation/test_contamination.py +2 -2
  251. helm/benchmark/presentation/test_create_plots.py +4 -1
  252. helm/benchmark/presentation/test_run_entry.py +2 -2
  253. helm/benchmark/presentation/test_schema.py +11 -0
  254. helm/benchmark/presentation/test_summarize.py +148 -6
  255. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  256. helm/benchmark/reeval_run.py +202 -0
  257. helm/benchmark/reeval_runner.py +355 -0
  258. helm/benchmark/run.py +151 -87
  259. helm/benchmark/run_expander.py +418 -33
  260. helm/benchmark/run_spec.py +93 -0
  261. helm/benchmark/run_spec_factory.py +180 -0
  262. helm/benchmark/run_specs/__init__.py +0 -0
  263. helm/benchmark/run_specs/air_bench_run_specs.py +58 -0
  264. helm/benchmark/run_specs/arabic_run_specs.py +197 -0
  265. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  266. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  267. helm/benchmark/run_specs/call_center_run_specs.py +201 -0
  268. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  269. helm/benchmark/run_specs/classic_run_specs.py +1393 -0
  270. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  271. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  272. helm/benchmark/run_specs/decodingtrust_run_specs.py +316 -0
  273. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  274. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  275. helm/benchmark/run_specs/experimental_run_specs.py +224 -0
  276. helm/benchmark/run_specs/finance_run_specs.py +114 -0
  277. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  278. helm/benchmark/run_specs/heim_run_specs.py +625 -0
  279. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  280. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  281. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  282. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  283. helm/benchmark/run_specs/long_context_run_specs.py +188 -0
  284. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  285. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  286. helm/benchmark/run_specs/medhelm_run_specs.py +1570 -0
  287. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  288. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  289. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  290. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  291. helm/benchmark/run_specs/safety_run_specs.py +191 -0
  292. helm/benchmark/run_specs/seahelm_run_specs.py +652 -0
  293. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  294. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +167 -0
  295. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  296. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  297. helm/benchmark/run_specs/unitxt_run_specs.py +51 -0
  298. helm/benchmark/run_specs/vlm_run_specs.py +1057 -0
  299. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  300. helm/benchmark/runner.py +63 -62
  301. helm/benchmark/runner_config_registry.py +21 -0
  302. helm/benchmark/scenarios/aci_bench_scenario.py +149 -0
  303. helm/benchmark/scenarios/air_bench_scenario.py +76 -0
  304. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  305. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  306. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +27 -3
  307. helm/benchmark/scenarios/anthropic_red_team_scenario.py +82 -0
  308. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  309. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  310. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  311. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  312. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  313. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  314. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  315. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  316. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  317. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  318. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  319. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  320. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  321. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  322. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  323. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  324. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  325. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  326. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  327. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  328. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  329. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  330. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  331. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  332. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  333. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
  334. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
  335. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +79 -0
  336. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  337. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +78 -0
  338. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  339. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  340. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  341. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  342. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  343. helm/benchmark/scenarios/babi_qa_scenario.py +16 -1
  344. helm/benchmark/scenarios/banking77_scenario.py +77 -0
  345. helm/benchmark/scenarios/bbq_scenario.py +17 -2
  346. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  347. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  348. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  349. helm/benchmark/scenarios/bird_sql_scenario.py +112 -0
  350. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  351. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  352. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  353. helm/benchmark/scenarios/bold_scenario.py +18 -3
  354. helm/benchmark/scenarios/boolq_scenario.py +21 -1
  355. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  356. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  357. helm/benchmark/scenarios/chw_care_plan_scenario.py +129 -0
  358. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  359. helm/benchmark/scenarios/civil_comments_scenario.py +14 -1
  360. helm/benchmark/scenarios/clear_scenario.py +180 -0
  361. helm/benchmark/scenarios/cleva_scenario.py +482 -3
  362. helm/benchmark/scenarios/code_scenario.py +46 -4
  363. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  364. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  365. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  366. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  367. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  368. helm/benchmark/scenarios/commonsense_scenario.py +33 -1
  369. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  370. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +118 -0
  371. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  372. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  373. helm/benchmark/scenarios/cti_to_mitre_scenario.py +261 -0
  374. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  375. helm/benchmark/scenarios/czech_bank_qa_scenario.py +148 -0
  376. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +190 -0
  377. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +143 -0
  378. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +98 -0
  379. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +344 -0
  380. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +217 -0
  381. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +571 -0
  382. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +80 -0
  383. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +90 -0
  384. helm/benchmark/scenarios/dialogue_scenarios.py +13 -3
  385. helm/benchmark/scenarios/dischargeme_scenario.py +196 -0
  386. helm/benchmark/scenarios/disinformation_scenario.py +32 -1
  387. helm/benchmark/scenarios/dyck_language_scenario.py +25 -1
  388. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  389. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  390. helm/benchmark/scenarios/ehrshot_scenario.py +1541 -0
  391. helm/benchmark/scenarios/enem_challenge_scenario.py +77 -0
  392. helm/benchmark/scenarios/entity_data_imputation_scenario.py +33 -3
  393. helm/benchmark/scenarios/entity_matching_scenario.py +26 -2
  394. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  395. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  396. helm/benchmark/scenarios/fin_qa_scenario.py +139 -0
  397. helm/benchmark/scenarios/financebench_scenario.py +74 -0
  398. helm/benchmark/scenarios/financial_phrasebank_scenario.py +115 -0
  399. helm/benchmark/scenarios/gold_commodity_news_scenario.py +145 -0
  400. helm/benchmark/scenarios/gpqa_scenario.py +98 -0
  401. helm/benchmark/scenarios/grammar.py +2 -2
  402. helm/benchmark/scenarios/grammar_scenario.py +21 -2
  403. helm/benchmark/scenarios/gsm_scenario.py +31 -1
  404. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +61 -0
  405. helm/benchmark/scenarios/harm_bench_scenario.py +70 -0
  406. helm/benchmark/scenarios/headqa_scenario.py +158 -0
  407. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  408. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +50 -0
  409. helm/benchmark/scenarios/ice_scenario.py +28 -4
  410. helm/benchmark/scenarios/ifeval_scenario.py +71 -0
  411. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  412. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  413. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  414. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  415. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  416. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  417. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  418. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  419. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  420. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  421. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  422. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  423. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  424. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  425. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  426. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  427. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  428. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  429. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  430. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  431. helm/benchmark/scenarios/imdb_scenario.py +26 -3
  432. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
  433. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  434. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +98 -0
  435. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  436. helm/benchmark/scenarios/koala_scenario.py +21 -1
  437. helm/benchmark/scenarios/kpi_edgar_scenario.py +172 -0
  438. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +149 -0
  439. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  440. helm/benchmark/scenarios/legal_summarization_scenario.py +61 -1
  441. helm/benchmark/scenarios/legal_support_scenario.py +24 -1
  442. helm/benchmark/scenarios/legalbench_scenario.py +45 -3
  443. helm/benchmark/scenarios/lex_glue_scenario.py +23 -2
  444. helm/benchmark/scenarios/lextreme_scenario.py +22 -1
  445. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  446. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  447. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  448. helm/benchmark/scenarios/lsat_qa_scenario.py +15 -1
  449. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  450. helm/benchmark/scenarios/math_scenario.py +81 -22
  451. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  452. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  453. helm/benchmark/scenarios/med_dialog_scenario.py +56 -22
  454. helm/benchmark/scenarios/med_mcqa_scenario.py +24 -1
  455. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  456. helm/benchmark/scenarios/med_qa_scenario.py +30 -1
  457. helm/benchmark/scenarios/medalign_scenario.py +117 -0
  458. helm/benchmark/scenarios/medalign_scenario_helper.py +326 -0
  459. helm/benchmark/scenarios/medbullets_scenario.py +167 -0
  460. helm/benchmark/scenarios/medcalc_bench_scenario.py +149 -0
  461. helm/benchmark/scenarios/medec_scenario.py +148 -0
  462. helm/benchmark/scenarios/medhallu_scenario.py +95 -0
  463. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  464. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  465. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  466. helm/benchmark/scenarios/medi_qa_scenario.py +134 -0
  467. helm/benchmark/scenarios/medication_qa_scenario.py +96 -0
  468. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  469. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  470. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  471. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  472. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  473. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  474. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  475. helm/benchmark/scenarios/mental_health_scenario.py +146 -0
  476. helm/benchmark/scenarios/mimic_bhc_scenario.py +127 -0
  477. helm/benchmark/scenarios/mimic_rrs_scenario.py +121 -0
  478. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +99 -0
  479. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  480. helm/benchmark/scenarios/mmlu_pro_scenario.py +113 -0
  481. helm/benchmark/scenarios/mmlu_scenario.py +32 -1
  482. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  483. helm/benchmark/scenarios/msmarco_scenario.py +31 -1
  484. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +166 -0
  485. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +164 -0
  486. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +297 -0
  487. helm/benchmark/scenarios/narrativeqa_scenario.py +20 -1
  488. helm/benchmark/scenarios/natural_qa_scenario.py +33 -1
  489. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  490. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  491. helm/benchmark/scenarios/omni_math_scenario.py +71 -0
  492. helm/benchmark/scenarios/open_assistant_scenario.py +33 -2
  493. helm/benchmark/scenarios/openai_mrcr_scenario.py +94 -0
  494. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -5
  495. helm/benchmark/scenarios/pubmed_qa_scenario.py +81 -43
  496. helm/benchmark/scenarios/quac_scenario.py +24 -1
  497. helm/benchmark/scenarios/race_based_med_scenario.py +175 -0
  498. helm/benchmark/scenarios/raft_scenario.py +33 -3
  499. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  500. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  501. helm/benchmark/scenarios/ruler_qa_scenarios.py +128 -0
  502. helm/benchmark/scenarios/scenario.py +44 -1
  503. helm/benchmark/scenarios/seahelm_scenario.py +2295 -0
  504. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  505. helm/benchmark/scenarios/shc_bmt_scenario.py +97 -0
  506. helm/benchmark/scenarios/shc_cdi_scenario.py +95 -0
  507. helm/benchmark/scenarios/shc_conf_scenario.py +99 -0
  508. helm/benchmark/scenarios/shc_ent_scenario.py +98 -0
  509. helm/benchmark/scenarios/shc_gip_scenario.py +94 -0
  510. helm/benchmark/scenarios/shc_privacy_scenario.py +100 -0
  511. helm/benchmark/scenarios/shc_proxy_scenario.py +98 -0
  512. helm/benchmark/scenarios/shc_ptbm_scenario.py +104 -0
  513. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  514. helm/benchmark/scenarios/shc_sequoia_scenario.py +98 -0
  515. helm/benchmark/scenarios/simple_safety_tests_scenario.py +44 -0
  516. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  517. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  518. helm/benchmark/scenarios/spider_scenario.py +109 -0
  519. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +119 -0
  520. helm/benchmark/scenarios/summarization_scenario.py +48 -1
  521. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  522. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  523. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +24 -1
  524. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  525. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  526. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  527. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  528. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  529. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  530. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  531. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  532. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  533. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  534. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  535. helm/benchmark/scenarios/test_ewok_scenario.py +29 -0
  536. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  537. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  538. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  539. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  540. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  541. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  542. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  543. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  544. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  545. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  546. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  547. helm/benchmark/scenarios/test_math_scenario.py +4 -3
  548. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  549. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  550. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  551. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  552. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  553. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  554. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  555. helm/benchmark/scenarios/test_scenario.py +6 -3
  556. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  557. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  558. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  559. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  560. helm/benchmark/scenarios/thai_exam_scenario.py +239 -0
  561. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  562. helm/benchmark/scenarios/truthful_qa_scenario.py +26 -2
  563. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  564. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  565. helm/benchmark/scenarios/unitxt_scenario.py +62 -0
  566. helm/benchmark/scenarios/verifiability_judgment_scenario.py +4 -2
  567. helm/benchmark/scenarios/vicuna_scenario.py +22 -2
  568. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  569. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  570. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  571. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +135 -0
  572. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  573. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  574. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  575. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  576. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
  577. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  578. helm/benchmark/scenarios/vision_language/image2struct/__init__.py +0 -0
  579. helm/benchmark/scenarios/vision_language/image2struct/chart2csv_scenario.py +55 -0
  580. helm/benchmark/scenarios/vision_language/image2struct/image2struct_scenario.py +225 -0
  581. helm/benchmark/scenarios/vision_language/image2struct/latex_scenario.py +21 -0
  582. helm/benchmark/scenarios/vision_language/image2struct/musicsheet_scenario.py +16 -0
  583. helm/benchmark/scenarios/vision_language/image2struct/utils_latex.py +339 -0
  584. helm/benchmark/scenarios/vision_language/image2struct/webpage/__init__.py +0 -0
  585. helm/benchmark/scenarios/vision_language/image2struct/webpage/driver.py +84 -0
  586. helm/benchmark/scenarios/vision_language/image2struct/webpage/jekyll_server.py +182 -0
  587. helm/benchmark/scenarios/vision_language/image2struct/webpage/utils.py +31 -0
  588. helm/benchmark/scenarios/vision_language/image2struct/webpage_scenario.py +256 -0
  589. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  590. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  591. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  592. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  593. helm/benchmark/scenarios/vision_language/mme_scenario.py +148 -0
  594. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  595. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  596. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  597. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  598. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  599. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  600. helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
  601. helm/benchmark/scenarios/vision_language/pope_scenario.py +105 -0
  602. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  603. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +131 -0
  604. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  605. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
  606. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +4 -5
  607. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  608. helm/benchmark/scenarios/vision_language/vqa_scenario.py +8 -4
  609. helm/benchmark/scenarios/wikifact_scenario.py +31 -1
  610. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  611. helm/benchmark/scenarios/wildbench_scenario.py +101 -0
  612. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  613. helm/benchmark/scenarios/wmt_14_scenario.py +33 -2
  614. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  615. helm/benchmark/server.py +32 -2
  616. helm/benchmark/slurm_jobs.py +1 -2
  617. helm/benchmark/slurm_runner.py +78 -50
  618. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  619. helm/benchmark/static/schema_arabic.yaml +271 -0
  620. helm/benchmark/static/schema_audio.yaml +763 -0
  621. helm/benchmark/static/schema_autobencher.yaml +150 -0
  622. helm/benchmark/static/schema_call_center.yaml +269 -0
  623. helm/benchmark/static/schema_capabilities.yaml +254 -0
  624. helm/benchmark/static/schema_classic.yaml +259 -1140
  625. helm/benchmark/static/schema_cleva.yaml +768 -0
  626. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  627. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  628. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  629. helm/benchmark/static/schema_enterprise.yaml +319 -0
  630. helm/benchmark/static/schema_ewok.yaml +367 -0
  631. helm/benchmark/static/schema_finance.yaml +191 -0
  632. helm/benchmark/static/schema_heim.yaml +1389 -0
  633. helm/benchmark/static/schema_image2struct.yaml +588 -0
  634. helm/benchmark/static/schema_instruction_following.yaml +161 -0
  635. helm/benchmark/static/schema_legal.yaml +566 -0
  636. helm/benchmark/static/schema_lite.yaml +3 -286
  637. helm/benchmark/static/schema_long_context.yaml +282 -0
  638. helm/benchmark/static/schema_medhelm.yaml +1176 -0
  639. helm/benchmark/static/schema_melt.yaml +1257 -0
  640. helm/benchmark/static/schema_mmlu.yaml +1449 -0
  641. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  642. helm/benchmark/static/schema_safety.yaml +283 -0
  643. helm/benchmark/static/schema_seahelm.yaml +723 -0
  644. helm/benchmark/static/schema_slp.yaml +219 -0
  645. helm/benchmark/static/schema_slphelm.yaml +162 -0
  646. helm/benchmark/static/schema_social_audio.yaml +224 -0
  647. helm/benchmark/static/schema_sql.yaml +171 -0
  648. helm/benchmark/static/schema_thai.yaml +244 -0
  649. helm/benchmark/static/schema_torr.yaml +474 -0
  650. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  651. helm/benchmark/static/schema_unitxt.yaml +370 -0
  652. helm/benchmark/static/schema_vhelm.yaml +933 -0
  653. helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
  654. helm/benchmark/static/schema_video.yaml +219 -0
  655. helm/benchmark/static_build/assets/air-overview-DpBbyagA.png +0 -0
  656. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  657. helm/benchmark/static_build/assets/heim-logo-BJtQlEbV.png +0 -0
  658. helm/benchmark/static_build/assets/helm-safety-COfndXuS.png +0 -0
  659. helm/benchmark/static_build/assets/helmhero-D9TvmJsp.png +0 -0
  660. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  661. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  662. helm/benchmark/static_build/assets/medhelm-overview-CND0EIsy.png +0 -0
  663. helm/benchmark/static_build/assets/medhelm-v1-overview-Cu2tphBB.png +0 -0
  664. helm/benchmark/static_build/assets/overview-BwypNWnk.png +0 -0
  665. helm/benchmark/static_build/assets/process-flow-DWDJC733.png +0 -0
  666. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  667. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  668. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  669. helm/benchmark/static_build/assets/vhelm-aspects-NiDQofvP.png +0 -0
  670. helm/benchmark/static_build/assets/vhelm-framework-NxJE4fdA.png +0 -0
  671. helm/benchmark/static_build/assets/vhelm-model-ypCL5Yvq.png +0 -0
  672. helm/benchmark/static_build/config.js +4 -0
  673. helm/benchmark/static_build/index.html +19 -0
  674. helm/benchmark/test_data_preprocessor.py +3 -3
  675. helm/benchmark/test_run_expander.py +1 -1
  676. helm/benchmark/window_services/default_window_service.py +3 -45
  677. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -15
  678. helm/benchmark/window_services/ice_window_service.py +1 -35
  679. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  680. helm/benchmark/window_services/image_generation/clip_window_service.py +13 -0
  681. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  682. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  683. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  684. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  685. helm/benchmark/window_services/local_window_service.py +22 -5
  686. helm/benchmark/window_services/test_anthropic_window_service.py +5 -4
  687. helm/benchmark/window_services/test_bloom_window_service.py +5 -4
  688. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  689. helm/benchmark/window_services/test_gpt2_window_service.py +9 -4
  690. helm/benchmark/window_services/test_gpt4_window_service.py +10 -4
  691. helm/benchmark/window_services/test_gptj_window_service.py +11 -5
  692. helm/benchmark/window_services/test_gptneox_window_service.py +6 -5
  693. helm/benchmark/window_services/test_openai_window_service.py +18 -12
  694. helm/benchmark/window_services/test_opt_window_service.py +6 -5
  695. helm/benchmark/window_services/test_palmyra_window_service.py +5 -4
  696. helm/benchmark/window_services/test_t0pp_window_service.py +5 -4
  697. helm/benchmark/window_services/test_t511b_window_service.py +5 -4
  698. helm/benchmark/window_services/test_ul2_window_service.py +5 -4
  699. helm/benchmark/window_services/test_utils.py +6 -6
  700. helm/benchmark/window_services/test_yalm_window_service.py +5 -4
  701. helm/benchmark/window_services/tokenizer_service.py +7 -13
  702. helm/benchmark/window_services/window_service.py +42 -0
  703. helm/benchmark/window_services/window_service_factory.py +4 -1
  704. helm/benchmark/window_services/yalm_window_service.py +1 -28
  705. helm/clients/__init__.py +0 -0
  706. helm/{proxy/clients → clients}/ai21_client.py +78 -12
  707. helm/clients/aleph_alpha_client.py +114 -0
  708. helm/{proxy/clients → clients}/anthropic_client.py +304 -21
  709. helm/clients/audio_language/__init__.py +0 -0
  710. helm/clients/audio_language/diva_llama_client.py +122 -0
  711. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  712. helm/clients/audio_language/llama_omni/constants.py +9 -0
  713. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  714. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  715. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  716. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  717. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  718. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  719. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  720. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  721. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  722. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  723. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  724. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  725. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  726. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  727. helm/clients/audio_language/llama_omni/utils.py +202 -0
  728. helm/clients/audio_language/llama_omni_client.py +199 -0
  729. helm/clients/audio_language/qwen2_5_omni_client.py +210 -0
  730. helm/clients/audio_language/qwen2_audiolm_client.py +191 -0
  731. helm/clients/audio_language/qwen_audiolm_client.py +153 -0
  732. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  733. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  734. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  735. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  736. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  737. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  738. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  739. helm/clients/audio_language/test.py +62 -0
  740. helm/{proxy/clients → clients}/auto_client.py +72 -31
  741. helm/clients/azure_openai_client.py +55 -0
  742. helm/clients/bedrock_client.py +381 -0
  743. helm/clients/bedrock_utils.py +105 -0
  744. helm/{proxy/clients → clients}/client.py +92 -17
  745. helm/clients/clip_score_client.py +49 -0
  746. helm/clients/clip_scorers/__init__.py +0 -0
  747. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  748. helm/clients/clip_scorers/clip_scorer.py +50 -0
  749. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  750. helm/{proxy/clients → clients}/cohere_client.py +105 -14
  751. helm/clients/dspy_client.py +135 -0
  752. helm/clients/gcs_client.py +82 -0
  753. helm/{proxy/clients → clients}/google_client.py +8 -6
  754. helm/clients/google_translate_client.py +35 -0
  755. helm/clients/grok_client.py +36 -0
  756. helm/{proxy/clients → clients}/http_model_client.py +8 -8
  757. helm/{proxy/clients → clients}/huggingface_client.py +157 -86
  758. helm/clients/huggingface_pipeline_client.py +138 -0
  759. helm/clients/ibm_client.py +269 -0
  760. helm/clients/image_generation/__init__.py +0 -0
  761. helm/clients/image_generation/adobe_vision_client.py +80 -0
  762. helm/clients/image_generation/aleph_alpha_image_generation_client.py +100 -0
  763. helm/clients/image_generation/cogview2/__init__.py +0 -0
  764. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  765. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  766. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  767. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +99 -0
  768. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  769. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  770. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +144 -0
  771. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  772. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  773. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  774. helm/clients/image_generation/cogview2_client.py +192 -0
  775. helm/clients/image_generation/dalle2_client.py +194 -0
  776. helm/clients/image_generation/dalle3_client.py +108 -0
  777. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  778. helm/clients/image_generation/dalle_mini/data.py +442 -0
  779. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  780. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  781. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  782. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  783. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  784. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  785. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  786. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  787. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  788. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  789. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  790. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  791. helm/clients/image_generation/dalle_mini_client.py +191 -0
  792. helm/clients/image_generation/deep_floyd_client.py +80 -0
  793. helm/clients/image_generation/huggingface_diffusers_client.py +250 -0
  794. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  795. helm/clients/image_generation/lexica_client.py +88 -0
  796. helm/clients/image_generation/mindalle/__init__.py +0 -0
  797. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  798. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  799. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  800. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  801. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  802. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  803. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  804. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  805. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  806. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  807. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  808. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  809. helm/clients/image_generation/mindalle_client.py +116 -0
  810. helm/clients/image_generation/nudity_check_client.py +64 -0
  811. helm/clients/image_generation/together_image_generation_client.py +113 -0
  812. helm/{proxy/clients → clients}/lit_gpt_client.py +6 -6
  813. helm/{proxy/clients → clients}/megatron_client.py +7 -5
  814. helm/clients/mistral_client.py +180 -0
  815. helm/clients/moderation_api_client.py +111 -0
  816. helm/clients/nvidia_nim_client.py +32 -0
  817. helm/clients/open_lm_client.py +43 -0
  818. helm/clients/openai_client.py +604 -0
  819. helm/clients/openai_responses_client.py +200 -0
  820. helm/clients/openrouter_client.py +31 -0
  821. helm/{proxy/clients → clients}/palmyra_client.py +31 -14
  822. helm/{proxy/clients → clients}/perspective_api_client.py +18 -14
  823. helm/clients/reka_client.py +190 -0
  824. helm/clients/simple_client.py +64 -0
  825. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  826. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  827. helm/clients/stanfordhealthcare_google_client.py +43 -0
  828. helm/clients/stanfordhealthcare_http_model_client.py +95 -0
  829. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  830. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  831. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  832. helm/clients/test_client.py +98 -0
  833. helm/{proxy/clients → clients}/test_huggingface_client.py +31 -16
  834. helm/clients/test_openrouter_client.py +69 -0
  835. helm/clients/test_simple_client.py +19 -0
  836. helm/clients/test_together_client.py +184 -0
  837. helm/clients/together_client.py +599 -0
  838. helm/clients/upstage_client.py +23 -0
  839. helm/clients/vertexai_client.py +488 -0
  840. helm/clients/vision_language/__init__.py +0 -0
  841. helm/clients/vision_language/huggingface_vision2seq_client.py +148 -0
  842. helm/clients/vision_language/huggingface_vlm_client.py +114 -0
  843. helm/{proxy/clients → clients}/vision_language/idefics_client.py +61 -51
  844. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  845. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  846. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  847. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  848. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  849. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  850. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  851. helm/clients/vision_language/open_flamingo_client.py +155 -0
  852. helm/clients/vision_language/paligemma_client.py +147 -0
  853. helm/clients/vision_language/palmyra_vision_client.py +101 -0
  854. helm/clients/vision_language/qwen2_vlm_client.py +189 -0
  855. helm/clients/vision_language/qwen_vlm_client.py +174 -0
  856. helm/clients/vllm_client.py +80 -0
  857. helm/clients/vllm_granite_thinking_client.py +56 -0
  858. helm/clients/writer_client.py +105 -0
  859. helm/clients/yi_client.py +28 -0
  860. helm/common/audio_utils.py +111 -0
  861. helm/common/cache.py +23 -33
  862. helm/common/cache_backend_config.py +47 -0
  863. helm/common/clip_score_request.py +41 -0
  864. helm/common/context.py +80 -0
  865. helm/common/credentials_utils.py +5 -5
  866. helm/common/critique_request.py +10 -2
  867. helm/common/file_caches/__init__.py +0 -0
  868. helm/common/file_caches/file_cache.py +16 -0
  869. helm/common/file_caches/local_file_cache.py +61 -0
  870. helm/common/file_caches/test_local_file_cache.py +25 -0
  871. helm/common/file_upload_request.py +27 -0
  872. helm/common/general.py +10 -3
  873. helm/common/hierarchical_logger.py +124 -12
  874. helm/common/image_generation_parameters.py +25 -0
  875. helm/common/images_utils.py +60 -5
  876. helm/common/key_value_store.py +41 -10
  877. helm/common/local_context.py +140 -0
  878. helm/common/media_object.py +14 -1
  879. helm/common/moderations_api_request.py +71 -0
  880. helm/common/mongo_key_value_store.py +8 -7
  881. helm/common/multimodal_request_utils.py +57 -0
  882. helm/common/nudity_check_request.py +29 -0
  883. helm/common/object_spec.py +23 -8
  884. helm/common/optional_dependencies.py +1 -1
  885. helm/common/reeval_parameters.py +12 -0
  886. helm/common/remote_context.py +61 -0
  887. helm/common/request.py +45 -19
  888. helm/common/response_format.py +18 -0
  889. helm/common/test_cache.py +1 -48
  890. helm/common/test_general.py +10 -0
  891. helm/common/test_logging.py +94 -0
  892. helm/common/test_media_object.py +1 -1
  893. helm/common/tokenization_request.py +1 -10
  894. helm/config/model_deployments.yaml +4713 -1005
  895. helm/config/model_metadata.yaml +4045 -255
  896. helm/config/tokenizer_configs.yaml +1091 -50
  897. helm/proxy/accounts.py +31 -4
  898. helm/proxy/cli.py +6 -4
  899. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  900. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  901. helm/proxy/critique/model_critique_client.py +40 -10
  902. helm/proxy/example_queries.py +33 -28
  903. helm/proxy/retry.py +5 -0
  904. helm/proxy/server.py +82 -18
  905. helm/proxy/services/remote_service.py +32 -7
  906. helm/proxy/services/server_service.py +71 -69
  907. helm/proxy/services/service.py +30 -6
  908. helm/proxy/services/test_remote_service.py +6 -5
  909. helm/proxy/services/test_service.py +1 -13
  910. helm/proxy/static/help.html +99 -0
  911. helm/proxy/static/index.css +61 -0
  912. helm/proxy/static/index.html +40 -0
  913. helm/proxy/static/index.js +462 -0
  914. helm/proxy/test_accounts.py +32 -0
  915. helm/proxy/test_retry.py +1 -1
  916. helm/proxy/token_counters/auto_token_counter.py +37 -37
  917. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  918. helm/proxy/token_counters/token_counter.py +3 -5
  919. helm/tokenizers/__init__.py +0 -0
  920. helm/tokenizers/ai21_tokenizer.py +52 -0
  921. helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +1 -1
  922. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +9 -12
  923. helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +2 -30
  924. helm/tokenizers/cohere_tokenizer.py +50 -0
  925. helm/tokenizers/grok_tokenizer.py +55 -0
  926. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +4 -4
  927. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +44 -41
  928. helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +1 -1
  929. helm/tokenizers/simple_tokenizer.py +33 -0
  930. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  931. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +6 -2
  932. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  933. helm/tokenizers/test_grok_tokenizer.py +33 -0
  934. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +9 -2
  935. helm/tokenizers/test_simple_tokenizer.py +33 -0
  936. helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +1 -1
  937. helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +1 -1
  938. helm/{proxy/tokenizers → tokenizers}/tokenizer.py +3 -1
  939. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  940. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +8 -6
  941. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  942. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  943. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  944. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  945. crfm_helm-0.4.0.dist-info/METADATA +0 -264
  946. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  947. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  948. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  949. helm/benchmark/data_overlap/light_scenario.py +0 -60
  950. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  951. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  952. helm/benchmark/run_specs.py +0 -2762
  953. helm/benchmark/scenarios/numeracy_scenario.py +0 -784
  954. helm/benchmark/static/benchmarking.css +0 -156
  955. helm/benchmark/static/benchmarking.js +0 -1705
  956. helm/benchmark/static/config.js +0 -3
  957. helm/benchmark/static/images/helm-logo.png +0 -0
  958. helm/benchmark/static/images/language-model-helm.png +0 -0
  959. helm/benchmark/static/images/organizations/ai21.png +0 -0
  960. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  961. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  962. helm/benchmark/static/images/organizations/cohere.png +0 -0
  963. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  964. helm/benchmark/static/images/organizations/google.png +0 -0
  965. helm/benchmark/static/images/organizations/meta.png +0 -0
  966. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  967. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  968. helm/benchmark/static/images/organizations/openai.png +0 -0
  969. helm/benchmark/static/images/organizations/together.png +0 -0
  970. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  971. helm/benchmark/static/images/organizations/yandex.png +0 -0
  972. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  973. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  974. helm/benchmark/static/index.html +0 -68
  975. helm/benchmark/static/json-urls.js +0 -69
  976. helm/benchmark/static/plot-captions.js +0 -27
  977. helm/benchmark/static/utils.js +0 -285
  978. helm/benchmark/test_model_deployment_definition.py +0 -92
  979. helm/benchmark/test_model_properties.py +0 -1570
  980. helm/benchmark/vlm_run_specs.py +0 -97
  981. helm/benchmark/window_services/ai21_window_service.py +0 -258
  982. helm/benchmark/window_services/cohere_window_service.py +0 -163
  983. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  984. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  985. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  986. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  987. helm/benchmark/window_services/t511b_window_service.py +0 -30
  988. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  989. helm/benchmark/window_services/test_cohere_window_service.py +0 -74
  990. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  991. helm/benchmark/window_services/test_ice_window_service.py +0 -326
  992. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  993. helm/benchmark/window_services/ul2_window_service.py +0 -30
  994. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  995. helm/common/cache_utils.py +0 -14
  996. helm/proxy/clients/aleph_alpha_client.py +0 -95
  997. helm/proxy/clients/goose_ai_client.py +0 -99
  998. helm/proxy/clients/microsoft_client.py +0 -180
  999. helm/proxy/clients/openai_client.py +0 -206
  1000. helm/proxy/clients/simple_client.py +0 -60
  1001. helm/proxy/clients/test_client.py +0 -49
  1002. helm/proxy/clients/test_together_client.py +0 -97
  1003. helm/proxy/clients/together_client.py +0 -334
  1004. helm/proxy/clients/vertexai_client.py +0 -115
  1005. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  1006. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  1007. helm/proxy/token_counters/free_token_counter.py +0 -12
  1008. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  1009. helm/proxy/token_counters/openai_token_counter.py +0 -22
  1010. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  1011. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  1012. helm/proxy/tokenizers/ai21_tokenizer.py +0 -60
  1013. helm/proxy/tokenizers/anthropic_tokenizer.py +0 -52
  1014. helm/proxy/tokenizers/cohere_tokenizer.py +0 -83
  1015. helm/proxy/tokenizers/ice_tokenizer.py +0 -30
  1016. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  1017. helm/proxy/tokenizers/test_ice_tokenizer.py +0 -57
  1018. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
  1019. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info/licenses}/LICENSE +0 -0
  1020. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
  1021. /helm/benchmark/{data_overlap → annotation}/__init__.py +0 -0
  1022. /helm/{proxy/clients → benchmark/annotation/image2struct}/__init__.py +0 -0
  1023. /helm/{proxy/clients/vision_language → benchmark/metrics/ifeval}/__init__.py +0 -0
  1024. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  1025. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  1026. /helm/benchmark/{static/images/crfm-logo.png → static_build/assets/crfm-logo-Du4T1uWZ.png} +0 -0
  1027. /helm/benchmark/{static/images/helm-logo-simple.png → static_build/assets/helm-logo-simple-DzOhNN41.png} +0 -0
  1028. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  1029. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  1030. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  1031. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  1032. /helm/{benchmark → proxy}/static/general.js +0 -0
  1033. /helm/{benchmark → proxy}/static/info-icon.png +0 -0
@@ -1,1128 +1,8 @@
1
1
  ---
2
2
  ############################################################
3
- models:
4
- # AI21 Labs
5
- - name: ai21/j1-jumbo
6
- display_name: J1-Jumbo v1 (178B)
7
- description: Jurassic-1 Jumbo (178B parameters) ([docs](https://studio.ai21.com/docs/jurassic1-language-models/), [tech report](https://uploads-ssl.webflow.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf)).
8
- creator_organization: AI21 Labs
9
- access: limited
10
- num_parameters: 178000000000
11
- release_date: 2021-08-11
12
- - name: ai21/j1-large
13
- display_name: J1-Large v1 (7.5B)
14
- description: Jurassic-1 Large (7.5B parameters) ([docs](https://studio.ai21.com/docs/jurassic1-language-models/), [tech report](https://uploads-ssl.webflow.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf)).
15
- creator_organization: AI21 Labs
16
- access: limited
17
- num_parameters: 7500000000
18
- release_date: 2021-08-11
19
- - name: ai21/j1-grande
20
- display_name: J1-Grande v1 (17B)
21
- description: Jurassic-1 Grande (17B parameters) with a "few tweaks" to the training process ([docs](https://studio.ai21.com/docs/jurassic1-language-models/), [tech report](https://uploads-ssl.webflow.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf)).
22
- creator_organization: AI21 Labs
23
- access: limited
24
- num_parameters: 17000000000
25
- release_date: 2022-05-03
26
- - name: ai21/j1-grande-v2-beta
27
- display_name: J1-Grande v2 beta (17B)
28
- description: Jurassic-1 Grande v2 beta (17B parameters)
29
- creator_organization: AI21 Labs
30
- access: limited
31
- num_parameters: 17000000000
32
- release_date: 2022-10-28
33
- - name: ai21/j2-jumbo
34
- display_name: Jurassic-2 Jumbo (178B)
35
- description: Jurassic-2 Jumbo (178B parameters) ([docs](https://www.ai21.com/blog/introducing-j2))
36
- creator_organization: AI21 Labs
37
- access: limited
38
- num_parameters: 178000000000
39
- release_date: 2023-03-09
40
- - name: ai21/j2-grande
41
- display_name: Jurassic-2 Grande (17B)
42
- description: Jurassic-2 Grande (17B parameters) ([docs](https://www.ai21.com/blog/introducing-j2))
43
- creator_organization: AI21 Labs
44
- access: limited
45
- num_parameters: 17000000000
46
- release_date: 2023-03-09
47
- - name: ai21/j2-large
48
- display_name: Jurassic-2 Large (7.5B)
49
- description: Jurassic-2 Large (7.5B parameters) ([docs](https://www.ai21.com/blog/introducing-j2))
50
- creator_organization: AI21 Labs
51
- access: limited
52
- num_parameters: 7500000000
53
- release_date: 2023-03-09
54
-
55
- # Aleph Alpha
56
- # TODO: add Luminous World when it's released
57
- - name: AlephAlpha/luminous-base
58
- display_name: Luminous Base (13B)
59
- description: Luminous Base (13B parameters) ([docs](https://docs.aleph-alpha.com/docs/introduction/luminous/))
60
- creator_organization: Aleph Alpha
61
- access: limited
62
- num_parameters: 13000000000
63
- # TODO: get exact release date
64
- release_date: 2022-01-01
65
- - name: AlephAlpha/luminous-extended
66
- display_name: Luminous Extended (30B)
67
- description: Luminous Extended (30B parameters) ([docs](https://docs.aleph-alpha.com/docs/introduction/luminous/))
68
- creator_organization: Aleph Alpha
69
- access: limited
70
- num_parameters: 30000000000
71
- release_date: 2022-01-01
72
- - name: AlephAlpha/luminous-supreme
73
- display_name: Luminous Supreme (70B)
74
- description: Luminous Supreme (70B parameters) ([docs](https://docs.aleph-alpha.com/docs/introduction/luminous/))
75
- creator_organization: Aleph Alpha
76
- access: limited
77
- num_parameters: 70000000000
78
- release_date: 2022-01-01
79
-
80
- # TODO: Remove Once we have configurable model names
81
- - name: neurips/local
82
- display_name: Local service
83
- description: Local competition service
84
- creator_organization: neurips
85
- access: open
86
- num_parameters: 1
87
- release_date: 2021-12-01
88
-
89
-
90
- # Anthropic
91
- - name: anthropic/stanford-online-all-v4-s3
92
- display_name: Anthropic-LM v4-s3 (52B)
93
- description: A 52B parameter language model, trained using reinforcement learning from human feedback [paper](https://arxiv.org/pdf/2204.05862.pdf).
94
- creator_organization: Anthropic
95
- access: closed
96
- num_parameters: 52000000000
97
- release_date: 2021-12-01
98
- - name: anthropic/claude-2.0
99
- display_name: Anthropic Claude 2.0
100
- description: Claude 2.0 is a general purpose large language model developed by Anthropic. It uses a transformer architecture and is trained via unsupervised learning, RLHF, and Constitutional AI (including both a supervised and Reinforcement Learning (RL) phase). ([model card](https://efficient-manatee.files.svdcdn.com/production/images/Model-Card-Claude-2.pdf))
101
- creator_organization: Anthropic
102
- access: limited
103
- release_date: 2023-07-11
104
- - name: anthropic/claude-2.1
105
- display_name: Anthropic Claude 2.1
106
- description: Claude 2.1 is a general purpose large language model developed by Anthropic. It uses a transformer architecture and is trained via unsupervised learning, RLHF, and Constitutional AI (including both a supervised and Reinforcement Learning (RL) phase). ([model card](https://efficient-manatee.files.svdcdn.com/production/images/Model-Card-Claude-2.pdf))
107
- creator_organization: Anthropic
108
- access: limited
109
- release_date: 2023-11-21
110
- - name: anthropic/claude-v1.3
111
- display_name: Anthropic Claude v1.3
112
- description: A model trained using reinforcement learning from human feedback ([docs](https://www.anthropic.com/index/introducing-claude)).
113
- creator_organization: Anthropic
114
- access: limited
115
- release_date: 2023-03-17
116
- - name: anthropic/claude-instant-v1
117
- display_name: Anthropic Claude Instant V1
118
- description: A lightweight version of Claude, a model trained using reinforcement learning from human feedback ([docs](https://www.anthropic.com/index/introducing-claude)).
119
- creator_organization: Anthropic
120
- access: limited
121
- release_date: 2023-03-17
122
- - name: anthropic/claude-instant-1.2
123
- display_name: Anthropic Claude Instant 1.2
124
- description: A lightweight version of Claude, a model trained using reinforcement learning from human feedback ([docs](https://www.anthropic.com/index/introducing-claude)).
125
- creator_organization: Anthropic
126
- access: limited
127
- release_date: 2023-08-09
128
-
129
- # Berkeley
130
- - name: together/koala-13b
131
- display_name: Koala (13B)
132
- description: Koala (13B) is a chatbot fine-tuned from Llama (13B) on dialogue data gathered from the web. ([blog post](https://bair.berkeley.edu/blog/2023/04/03/koala/))
133
- creator_organization: UC Berkeley
134
- access: open
135
- num_parameters: 13000000000
136
- release_date: 2022-04-03
137
- todo: true
138
-
139
- # BigScience
140
- - name: together/bloom
141
- display_name: BLOOM (176B)
142
- description: BLOOM (176B parameters) is an autoregressive model trained on 46 natural languages and 13 programming languages ([paper](https://arxiv.org/pdf/2211.05100.pdf)).
143
- creator_organization: BigScience
144
- access: open
145
- num_parameters: 176000000000
146
- release_date: 2022-06-28
147
- - name: together/bloomz
148
- display_name: BLOOMZ (176B)
149
- description: BLOOMZ (176B parameters) is BLOOM that has been fine-tuned on natural language instructions ([details](https://huggingface.co/bigscience/bloomz)).
150
- creator_organization: BigScience
151
- access: open
152
- num_parameters: 176000000000
153
- release_date: 2022-11-03
154
- todo: true
155
- - name: together/t0pp
156
- display_name: T0pp (11B)
157
- description: T0pp (11B parameters) is an encoder-decoder model trained on a large set of different tasks specified in natural language prompts ([paper](https://arxiv.org/pdf/2110.08207.pdf)).
158
- creator_organization: BigScience
159
- access: open
160
- num_parameters: 11000000000
161
- release_date: 2021-10-15
162
-
163
- # BigCode
164
- - name: huggingface/santacoder
165
- display_name: SantaCoder (1.1B)
166
- description: SantaCoder (1.1B parameters) model trained on the Python, Java, and JavaScript subset of The Stack (v1.1) ([model card](https://huggingface.co/bigcode/santacoder)).
167
- creator_organization: BigCode
168
- access: open
169
- - name: huggingface/starcoder
170
- display_name: StarCoder (15.5B)
171
- description: The StarCoder (15.5B parameter) model trained on 80+ programming languages from The Stack (v1.2) ([model card](https://huggingface.co/bigcode/starcoder)).
172
- creator_organization: BigCode
173
- access: open
174
-
175
- # Hugging Face
176
- - name: huggingface/gpt2
177
- display_name: GPT-2 (124M)
178
- description: GPT-2 is a transformers model pretrained on a very large corpus of English data in a self-supervised fashion. This means it was pretrained on the raw texts only, with no humans labelling them in any way (which is why it can use lots of publicly available data) with an automatic process to generate inputs and labels from those texts.
179
- creator_organization: OpenAI
180
- access: open
181
- num_parameters: 124000000
182
- - name: huggingface/gpt2-medium
183
- display_name: GPT-2 Medium (355M)
184
- description: GPT-2 Medium is the 355M parameter version of GPT-2, a transformer-based language model created and released by OpenAI. The model is a pretrained model on English language using a causal language modeling (CLM) objective.
185
- creator_organization: OpenAI
186
- access: open
187
- num_parameters: 355000000
188
- - name: huggingface/gpt2-large
189
- display_name: GPT-2 Large (774M)
190
- description: GPT-2 Large is the 774M parameter version of GPT-2, a transformer-based language model created and released by OpenAI. The model is a pretrained model on English language using a causal language modeling (CLM) objective.
191
- creator_organization: OpenAI
192
- access: open
193
- num_parameters: 774000000
194
- - name: huggingface/gpt2-xl
195
- display_name: GPT-2 XL (1.5B)
196
- description: GPT-2 XL is the 1.5B parameter version of GPT-2, a transformer-based language model created and released by OpenAI. The model is a pretrained model on English language using a causal language modeling (CLM) objective.
197
- creator_organization: OpenAI
198
- access: open
199
- num_parameters: 1500000000
200
-
201
- # HuggignfaceM4
202
- - name: HuggingFaceM4/idefics-9b
203
- display_name: IDEFICS (9B)
204
- description: IDEFICS (9B parameters) is an open-source model based on DeepMind's Flamingo. ([blog](https://huggingface.co/blog/idefics))
205
- creator_organization: HuggingFace
206
- access: open
207
- num_parameters: 9000000000
208
- release_date: 2023-08-22
209
- - name: HuggingFaceM4/idefics-9b-instruct
210
- display_name: IDEFICS instruct (9B)
211
- description: IDEFICS instruct (9B parameters) is an open-source model based on DeepMind's Flamingo. ([blog](https://huggingface.co/blog/idefics))
212
- creator_organization: HuggingFace
213
- access: open
214
- num_parameters: 9000000000
215
- release_date: 2023-08-22
216
- - name: HuggingFaceM4/idefics-80b
217
- display_name: IDEFICS (80B)
218
- description: IDEFICS (80B parameters) is an open-source model based on DeepMind's Flamingo. ([blog](https://huggingface.co/blog/idefics))
219
- creator_organization: HuggingFace
220
- access: open
221
- num_parameters: 80000000000
222
- release_date: 2023-08-22
223
- - name: HuggingFaceM4/idefics-80b-instruct
224
- display_name: IDEFICS instruct (80B)
225
- description: IDEFICS instruct (80B parameters) is an open-source model based on DeepMind's Flamingo. ([blog](https://huggingface.co/blog/idefics))
226
- creator_organization: HuggingFace
227
- access: open
228
- num_parameters: 80000000000
229
- release_date: 2023-08-22
230
-
231
- # Cerebras Systems
232
- - name: together/cerebras-gpt-6.7b
233
- display_name: Cerebras GPT (6.7B)
234
- description: Cerebras GPT is a family of open compute-optimal language models scaled from 111M to 13B parameters trained on the Eleuther Pile. ([paper](https://arxiv.org/pdf/2304.03208.pdf))
235
- creator_organization: Cerebras
236
- access: limited
237
- num_parameters: 6700000000
238
- release_date: 2023-04-06
239
- todo: true
240
- - name: together/cerebras-gpt-13b
241
- display_name: Cerebras GPT (13B)
242
- description: Cerebras GPT is a family of open compute-optimal language models scaled from 111M to 13B parameters trained on the Eleuther Pile. ([paper](https://arxiv.org/pdf/2304.03208.pdf))
243
- creator_organization: Cerebras
244
- access: limited
245
- num_parameters: 13000000000
246
- release_date: 2023-04-06
247
- todo: true
248
-
249
- # Cohere
250
- - name: cohere/xlarge-20220609
251
- display_name: Cohere xlarge v20220609 (52.4B)
252
- description: Cohere xlarge v20220609 (52.4B parameters)
253
- creator_organization: Cohere
254
- access: limited
255
- num_parameters: 52400000000
256
- release_date: 2022-06-09
257
- - name: cohere/large-20220720
258
- display_name: Cohere large v20220720 (13.1B)
259
- description: Cohere large v20220720 (13.1B parameters), which is deprecated by Cohere as of December 2, 2022.
260
- creator_organization: Cohere
261
- access: limited
262
- num_parameters: 13100000000
263
- release_date: 2022-07-20
264
- - name: cohere/medium-20220720
265
- display_name: Cohere medium v20220720 (6.1B)
266
- description: Cohere medium v20220720 (6.1B parameters)
267
- creator_organization: Cohere
268
- access: limited
269
- num_parameters: 6100000000
270
- release_date: 2022-07-20
271
- - name: cohere/small-20220720
272
- display_name: Cohere small v20220720 (410M)
273
- description: Cohere small v20220720 (410M parameters), which is deprecated by Cohere as of December 2, 2022.
274
- creator_organization: Cohere
275
- access: limited
276
- num_parameters: 410000000
277
- release_date: 2022-07-20
278
- - name: cohere/xlarge-20221108
279
- display_name: Cohere xlarge v20221108 (52.4B)
280
- description: Cohere xlarge v20221108 (52.4B parameters)
281
- creator_organization: Cohere
282
- access: limited
283
- num_parameters: 52400000000
284
- release_date: 2022-11-08
285
- - name: cohere/medium-20221108
286
- display_name: Cohere medium v20221108 (6.1B)
287
- description: Cohere medium v20221108 (6.1B parameters)
288
- creator_organization: Cohere
289
- access: limited
290
- num_parameters: 6100000000
291
- release_date: 2022-11-08
292
- - name: cohere/command-medium-beta
293
- display_name: Cohere Command beta (6.1B)
294
- description: Cohere Command beta (6.1B parameters) is fine-tuned from the medium model to respond well with instruction-like prompts ([details](https://docs.cohere.ai/docs/command-beta)).
295
- creator_organization: Cohere
296
- access: limited
297
- num_parameters: 6100000000
298
- release_date: 2022-11-08
299
- - name: cohere/command-xlarge-beta
300
- display_name: Cohere Command beta (52.4B)
301
- description: Cohere Command beta (52.4B parameters) is fine-tuned from the XL model to respond well with instruction-like prompts ([details](https://docs.cohere.ai/docs/command-beta)).
302
- creator_organization: Cohere
303
- access: limited
304
- num_parameters: 52400000000
305
- release_date: 2022-11-08
306
- - name: cohere/command
307
- display_name: Cohere Command
308
- description: Command is Cohere’s flagship text generation model. It is trained to follow user commands and to be instantly useful in practical business applications. [docs](https://docs.cohere.com/reference/generate) and [changelog](https://docs.cohere.com/changelog)
309
- creator_organization: Cohere
310
- access: limited
311
- release_date: 2023-09-29
312
- - name: cohere/command-light
313
- display_name: Cohere Command Light
314
- description: Command is Cohere’s flagship text generation model. It is trained to follow user commands and to be instantly useful in practical business applications. [docs](https://docs.cohere.com/reference/generate) and [changelog](https://docs.cohere.com/changelog)
315
- creator_organization: Cohere
316
- access: limited
317
- release_date: 2023-09-29
318
-
319
- # Databricks
320
- - name: databricks/dolly-v2-3b
321
- display_name: Dolly V2 (3B)
322
- description: Dolly V2 (3B) is an instruction-following large language model trained on the Databricks machine learning platform. It is based on pythia-12b.
323
- creator_organization: Databricks
324
- access: open
325
- num_parameters: 2517652480
326
- release_date: 2023-04-12
327
- todo: true
328
- - name: databricks/dolly-v2-7b
329
- display_name: Dolly V2 (7B)
330
- description: Dolly V2 (7B) is an instruction-following large language model trained on the Databricks machine learning platform. It is based on pythia-12b.
331
- creator_organization: Databricks
332
- access: open
333
- num_parameters: 6444163072
334
- release_date: 2023-04-12
335
- todo: true
336
- - name: databricks/dolly-v2-12b
337
- display_name: Dolly V2 (12B)
338
- description: Dolly V2 (12B) is an instruction-following large language model trained on the Databricks machine learning platform. It is based on pythia-12b.
339
- creator_organization: Databricks
340
- access: open
341
- num_parameters: 11327027200
342
- release_date: 2023-04-12
343
- todo: true
344
-
345
- # DeepMind
346
- - name: deepmind/gopher
347
- display_name: Gopher (280B)
348
- description: Gopher (540B parameters) ([paper](https://arxiv.org/pdf/2112.11446.pdf)).
349
- creator_organization: DeepMind
350
- access: closed
351
- todo: true
352
- - name: deepmind/chinchilla
353
- display_name: Chinchilla (70B)
354
- description: Chinchilla (70B parameters) ([paper](https://arxiv.org/pdf/2203.15556.pdf)).
355
- creator_organization: DeepMind
356
- access: closed
357
- todo: true
358
-
359
- # EleutherAI
360
- - name: together/gpt-j-6b
361
- display_name: GPT-J (6B)
362
- description: GPT-J (6B parameters) autoregressive language model trained on The Pile ([details](https://arankomatsuzaki.wordpress.com/2021/06/04/gpt-j/)).
363
- creator_organization: EleutherAI
364
- access: open
365
- num_parameters: 6000000000
366
- release_date: 2021-06-04
367
- - name: together/gpt-neox-20b
368
- display_name: GPT-NeoX (20B)
369
- description: GPT-NeoX (20B parameters) autoregressive language model trained on The Pile ([paper](https://arxiv.org/pdf/2204.06745.pdf)).
370
- creator_organization: EleutherAI
371
- access: open
372
- num_parameters: 20000000000
373
- release_date: 2022-02-02
374
- - name: eleutherai/pythia-1b-v0
375
- display_name: Pythia (1B)
376
- description: Pythia (1B parameters). The Pythia project combines interpretability analysis and scaling laws to understand how knowledge develops and evolves during training in autoregressive transformers.
377
- creator_organization: EleutherAI
378
- access: open
379
- num_parameters: 805736448
380
- release_date: 2023-02-13
381
- todo: true
382
- - name: eleutherai/pythia-2.8b-v0
383
- display_name: Pythia (2.8B)
384
- description: Pythia (2.8B parameters). The Pythia project combines interpretability analysis and scaling laws to understand how knowledge develops and evolves during training in autoregressive transformers.
385
- creator_organization: EleutherAI
386
- access: open
387
- num_parameters: 2517652480
388
- release_date: 2023-02-13
389
- todo: true
390
- - name: eleutherai/pythia-6.9b
391
- display_name: Pythia (6.9B)
392
- description: Pythia (6.9B parameters). The Pythia project combines interpretability analysis and scaling laws to understand how knowledge develops and evolves during training in autoregressive transformers.
393
- creator_organization: EleutherAI
394
- access: open
395
- num_parameters: 6444163072
396
- release_date: 2023-02-13
397
- - name: eleutherai/pythia-12b-v0
398
- display_name: Pythia (12B)
399
- description: Pythia (12B parameters). The Pythia project combines interpretability analysis and scaling laws to understand how knowledge develops and evolves during training in autoregressive transformers.
400
- creator_organization: EleutherAI
401
- access: open
402
- num_parameters: 11327027200
403
- release_date: 2023-02-13
404
-
405
- # Google
406
- - name: together/t5-11b
407
- display_name: T5 (11B)
408
- description: T5 (11B parameters) is an encoder-decoder model trained on a multi-task mixture, where each task is converted into a text-to-text format ([paper](https://arxiv.org/pdf/1910.10683.pdf)).
409
- creator_organization: Google
410
- access: open
411
- num_parameters: 11000000000
412
- release_date: 2019-10-23
413
- - name: together/ul2
414
- display_name: UL2 (20B)
415
- description: UL2 (20B parameters) is an encoder-decoder model trained on the C4 corpus. It's similar to T5 but trained with a different objective and slightly different scaling knobs ([paper](https://arxiv.org/pdf/2205.05131.pdf)).
416
- creator_organization: Google
417
- access: open
418
- num_parameters: 20000000000
419
- release_date: 2022-05-10
420
- - name: together/flan-t5-xxl
421
- display_name: Flan-T5 (11B)
422
- description: Flan-T5 (11B parameters) is T5 fine-tuned on 1.8K tasks ([paper](https://arxiv.org/pdf/2210.11416.pdf)).
423
- creator_organization: Google
424
- access: open
425
- - name: google/palm
426
- display_name: PaLM (540B)
427
- description: Pathways Language Model (540B parameters) is trained using 6144 TPU v4 chips ([paper](https://arxiv.org/pdf/2204.02311.pdf)).
428
- creator_organization: Google
429
- access: closed
430
- todo: true
431
- ## PaLM 2
432
- - name: google/text-bison@001
433
- display_name: PaLM-2 (Bison)
434
- description: The best value PaLM model. PaLM 2 (Pathways Language Model) is a Transformer-based model trained using a mixture of objectives that was evaluated on English and multilingual language, and reasoning tasks. ([report](https://arxiv.org/pdf/2305.10403.pdf))
435
- creator_organization: Google
436
- access: limited
437
- release_date: 2023-06-07 # Source: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/text#model_versions
438
- - name: google/text-bison-32k
439
- display_name: PaLM-2 (Bison)
440
- description: The best value PaLM model with a 32K context. PaLM 2 (Pathways Language Model) is a Transformer-based model trained using a mixture of objectives that was evaluated on English and multilingual language, and reasoning tasks. ([report](https://arxiv.org/pdf/2305.10403.pdf))
441
- creator_organization: Google
442
- access: limited
443
- release_date: 2023-06-07 # Source: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/text#model_versions
444
- - name: google/text-unicorn@001
445
- display_name: PaLM-2 (Unicorn)
446
- description: The largest model in PaLM family. PaLM 2 (Pathways Language Model) is a Transformer-based model trained using a mixture of objectives that was evaluated on English and multilingual language, and reasoning tasks. ([report](https://arxiv.org/pdf/2305.10403.pdf))
447
- creator_organization: Google
448
- access: limited
449
- release_date: 2023-11-30 # Source: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/text#model_versions
450
- - name: google/code-bison@001
451
- display_name: Codey PaLM-2 (Bison)
452
- description: A model fine-tuned to generate code based on a natural language description of the desired code. PaLM 2 (Pathways Language Model) is a Transformer-based model trained using a mixture of objectives that was evaluated on English and multilingual language, and reasoning tasks. ([report](https://arxiv.org/pdf/2305.10403.pdf))
453
- creator_organization: Google
454
- access: limited
455
- release_date: 2023-06-29 # Source: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/code-generation#model_versions
456
- - name: google/code-bison-32k
457
- display_name: Codey PaLM-2 (Bison)
458
- description: Codey with a 32K context. PaLM 2 (Pathways Language Model) is a Transformer-based model trained using a mixture of objectives that was evaluated on English and multilingual language, and reasoning tasks. ([report](https://arxiv.org/pdf/2305.10403.pdf))
459
- creator_organization: Google
460
- access: limited
461
- release_date: 2023-06-29 # Source: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/code-generation#model_versions
462
-
463
- # HazyResearch
464
- - name: together/h3-2.7b
465
- display_name: H3 (2.7B)
466
- description: H3 (2.7B parameters) is a decoder-only language model based on state space models ([paper](https://arxiv.org/abs/2212.14052)).
467
- creator_organization: HazyResearch
468
- access: open
469
- num_parameters: 2700000000
470
- release_date: 2023-01-23
471
- todo: true
472
-
473
- # Lightning AI's Lit-GPT
474
- - name: lightningai/lit-gpt
475
- display_name: Lit-GPT
476
- description: Lit-GPT is an optimized collection of open-source LLMs for finetuning and inference. It supports – Falcon, Llama 2, Vicuna, LongChat, and other top-performing open-source large language models.
477
- creator_organization: Lightning AI
478
- access: open
479
- num_parameters: 1
480
- release_date: 2023-04-04
481
-
482
-
483
- # Meta
484
- - name: together/opt-iml-175b
485
- display_name: OPT-IML (175B)
486
- description: OPT-IML (175B parameters) is a suite of decoder-only transformer LMs that are multi-task fine-tuned on 2000 datasets ([paper](https://arxiv.org/pdf/2212.12017.pdf)).
487
- creator_organization: Meta
488
- access: open
489
- num_parameters: 175000000000
490
- release_date: 2022-12-22
491
- todo: true
492
-
493
- - name: together/opt-iml-30b
494
- display_name: OPT-IML (30B)
495
- description: OPT-IML (30B parameters) is a suite of decoder-only transformer LMs that are multi-task fine-tuned on 2000 datasets ([paper](https://arxiv.org/pdf/2212.12017.pdf)).
496
- creator_organization: Meta
497
- access: open
498
- num_parameters: 30000000000
499
- release_date: 2022-12-22
500
- todo: true
501
-
502
- - name: together/opt-175b
503
- display_name: OPT (175B)
504
- description: Open Pre-trained Transformers (175B parameters) is a suite of decoder-only pre-trained transformers that are fully and responsibly shared with interested researchers ([paper](https://arxiv.org/pdf/2205.01068.pdf)).
505
- creator_organization: Meta
506
- access: open
507
- num_parameters: 175000000000
508
- release_date: 2022-05-02
509
-
510
- - name: together/opt-66b
511
- display_name: OPT (66B)
512
- description: Open Pre-trained Transformers (66B parameters) is a suite of decoder-only pre-trained transformers that are fully and responsibly shared with interested researchers ([paper](https://arxiv.org/pdf/2205.01068.pdf)).
513
- creator_organization: Meta
514
- access: open
515
- num_parameters: 66000000000
516
- release_date: 2022-05-02
517
-
518
- - name: together/opt-6.7b
519
- display_name: OPT (6.7B)
520
- description: Open Pre-trained Transformers (6.7B parameters) is a suite of decoder-only pre-trained transformers that are fully and responsibly shared with interested researchers ([paper](https://arxiv.org/pdf/2205.01068.pdf)).
521
- creator_organization: Meta
522
- access: open
523
- num_parameters: 6700000000
524
- release_date: 2022-05-02
525
-
526
- - name: together/opt-1.3b
527
- display_name: OPT (1.3B)
528
- description: Open Pre-trained Transformers (1.3B parameters) is a suite of decoder-only pre-trained transformers that are fully and responsibly shared with interested researchers ([paper](https://arxiv.org/pdf/2205.01068.pdf)).
529
- creator_organization: Meta
530
- access: open
531
- num_parameters: 1300000000
532
- release_date: 2022-05-02
533
-
534
- - name: together/galactica-120b
535
- display_name: Galactica (120B)
536
- description: Galactica (120B parameters) is trained on 48 million papers, textbooks, lectures notes, compounds and proteins, scientific websites, etc. ([paper](https://galactica.org/static/paper.pdf)).
537
- creator_organization: Meta
538
- access: open
539
- num_parameters: 120000000000
540
- release_date: 2022-11-15
541
- todo: true
542
-
543
- - name: together/galactica-30b
544
- display_name: Galactica (30B)
545
- description: Galactica (30B parameters) is trained on 48 million papers, textbooks, lectures notes, compounds and proteins, scientific websites, etc. ([paper](https://galactica.org/static/paper.pdf)).
546
- creator_organization: Meta
547
- access: open
548
- num_parameters: 30000000000
549
- release_date: 2022-11-15
550
- todo: true
551
- - name: meta/llama-7b
552
- display_name: LLaMA (7B)
553
- description: LLaMA is a collection of foundation language models ranging from 7B to 65B parameters.
554
- creator_organization: Meta
555
- access: open
556
- num_parameters: 7000000000
557
- release_date: 2023-02-24
558
- - name: meta/llama-13b
559
- display_name: LLaMA (13B)
560
- description: LLaMA is a collection of foundation language models ranging from 7B to 65B parameters.
561
- creator_organization: Meta
562
- access: open
563
- num_parameters: 13000000000
564
- release_date: 2023-02-24
565
- - name: meta/llama-30b
566
- display_name: LLaMA (30B)
567
- description: LLaMA is a collection of foundation language models ranging from 7B to 65B parameters.
568
- creator_organization: Meta
569
- access: open
570
- num_parameters: 30000000000
571
- release_date: 2023-02-24
572
- - name: meta/llama-65b
573
- display_name: LLaMA (65B)
574
- description: LLaMA is a collection of foundation language models ranging from 7B to 65B parameters.
575
- creator_organization: Meta
576
- access: open
577
- num_parameters: 65000000000
578
- release_date: 2023-02-24
579
- - name: meta/llama-2-7b
580
- display_name: Llama 2 (7B)
581
- description: Llama 2 pretrained models are trained on 2 trillion tokens, and have double the context length than Llama 1.
582
- creator_organization: Meta
583
- access: open
584
- num_parameters: 7000000000
585
- release_date: 2023-07-18
586
- - name: meta/llama-2-13b
587
- display_name: Llama 2 (13B)
588
- description: Llama 2 pretrained models are trained on 2 trillion tokens, and have double the context length than Llama 1.
589
- creator_organization: Meta
590
- access: open
591
- num_parameters: 13000000000
592
- release_date: 2023-07-18
593
- - name: meta/llama-2-70b
594
- display_name: Llama 2 (70B)
595
- description: Llama 2 pretrained models are trained on 2 trillion tokens, and have double the context length than Llama 1.
596
- creator_organization: Meta
597
- access: open
598
- num_parameters: 70000000000
599
- release_date: 2023-07-18
600
-
601
- # Stability AI
602
- - name: stabilityai/stablelm-base-alpha-3b
603
- display_name: StableLM-Base-Alpha (3B)
604
- description: StableLM-Base-Alpha is a suite of 3B and 7B parameter decoder-only language models pre-trained on a diverse collection of English datasets with a sequence length of 4096 to push beyond the context window limitations of existing open-source language models.
605
- creator_organization: Stability AI
606
- access: open
607
- num_parameters: 3000000000
608
- release_date: 2023-04-20
609
- todo: true
610
-
611
- - name: stabilityai/stablelm-base-alpha-7b
612
- display_name: StableLM-Base-Alpha (7B)
613
- description: StableLM-Base-Alpha is a suite of 3B and 7B parameter decoder-only language models pre-trained on a diverse collection of English datasets with a sequence length of 4096 to push beyond the context window limitations of existing open-source language models.
614
- creator_organization: Stability AI
615
- access: open
616
- num_parameters: 7000000000
617
- release_date: 2023-04-20
618
- todo: true
619
-
620
- # Stanford
621
- - name: stanford/alpaca-7b
622
- display_name: Alpaca (7B)
623
- description: Alpaca 7B is a model fine-tuned from the LLaMA 7B model on 52K instruction-following demonstrations
624
- creator_organization: Stanford
625
- access: open
626
- num_parameters: 7000000000
627
- release_date: 2023-03-13
628
-
629
- # LMSYS
630
- - name: lmsys/vicuna-7b-v1.3
631
- display_name: Vicuna v1.3 (7B)
632
- description: Vicuna v1.3 (7B) is an open-source chatbot trained by fine-tuning LLaMA on user-shared conversations collected from ShareGPT.
633
- creator_organization: LMSYS
634
- access: open
635
- num_parameters: 7000000000
636
- release_date: 2023-06-22
637
- - name: lmsys/vicuna-13b-v1.3
638
- display_name: Vicuna v1.3 (13B)
639
- description: Vicuna v1.3 (13B) is an open-source chatbot trained by fine-tuning LLaMA on user-shared conversations collected from ShareGPT.
640
- creator_organization: LMSYS
641
- access: open
642
- num_parameters: 13000000000
643
- release_date: 2023-06-22
644
-
645
- # 01.AI
646
- - name: 01-ai/yi-6b
647
- display_name: Yi (6B)
648
- description: The Yi models are large language models trained from scratch by developers at 01.AI.
649
- creator_organization: 01.AI
650
- access: open
651
- num_parameters: 6000000000
652
- release_date: 2023-11-02
653
- - name: 01-ai/yi-34b
654
- display_name: Yi (34B)
655
- description: The Yi models are large language models trained from scratch by developers at 01.AI.
656
- creator_organization: 01.AI
657
- access: open
658
- num_parameters: 34000000000
659
- release_date: 2023-11-02
660
-
661
- # Mistral AI
662
- - name: mistralai/mistral-7b-v0.1
663
- display_name: Mistral v0.1 (7B)
664
- description: Mistral 7B is a 7.3B parameter transformer model that uses Grouped-Query Attention (GQA) and Sliding-Window Attention (SWA).
665
- creator_organization: Mistral AI
666
- access: open
667
- num_parameters: 7300000000
668
- release_date: 2023-09-27
669
-
670
- # Microsoft/NVIDIA
671
- - name: microsoft/TNLGv2_530B
672
- display_name: TNLG v2 (530B)
673
- description: TNLG v2 (530B parameters) autoregressive language model trained on a filtered subset of the Pile and CommonCrawl ([paper](https://arxiv.org/pdf/2201.11990.pdf)).
674
- creator_organization: Microsoft/NVIDIA
675
- access: closed
676
- num_parameters: 530000000000
677
- release_date: 2022-01-28
678
- - name: microsoft/TNLGv2_7B
679
- display_name: TNLG v2 (6.7B)
680
- description: TNLG v2 (6.7B parameters) autoregressive language model trained on a filtered subset of the Pile and CommonCrawl ([paper](https://arxiv.org/pdf/2201.11990.pdf)).
681
- creator_organization: Microsoft/NVIDIA
682
- access: closed
683
- num_parameters: 6700000000
684
- release_date: 2022-01-28
685
-
686
- # OpenAI: https://beta.openai.com/docs/engines/gpt-3
687
- - name: openai/davinci
688
- display_name: davinci (175B)
689
- description: Original GPT-3 (175B parameters) autoregressive language model ([paper](https://arxiv.org/pdf/2005.14165.pdf), [docs](https://beta.openai.com/docs/model-index-for-researchers)).
690
- creator_organization: OpenAI
691
- access: limited
692
- num_parameters: 175000000000
693
- release_date: 2020-05-28
694
- - name: openai/curie
695
- display_name: curie (6.7B)
696
- description: Original GPT-3 (6.7B parameters) autoregressive language model ([paper](https://arxiv.org/pdf/2005.14165.pdf), [docs](https://beta.openai.com/docs/model-index-for-researchers)).
697
- creator_organization: OpenAI
698
- access: limited
699
- num_parameters: 6700000000
700
- release_date: 2020-05-28
701
- - name: openai/babbage
702
- display_name: babbage (1.3B)
703
- description: Original GPT-3 (1.3B parameters) autoregressive language model ([paper](https://arxiv.org/pdf/2005.14165.pdf), [docs](https://beta.openai.com/docs/model-index-for-researchers)).
704
- creator_organization: OpenAI
705
- access: limited
706
- num_parameters: 1300000000
707
- release_date: 2020-05-28
708
- - name: openai/ada
709
- display_name: ada (350M)
710
- description: Original GPT-3 (350M parameters) autoregressive language model ([paper](https://arxiv.org/pdf/2005.14165.pdf), [docs](https://beta.openai.com/docs/model-index-for-researchers)).
711
- creator_organization: OpenAI
712
- access: limited
713
- num_parameters: 350000000
714
- release_date: 2020-05-28
715
- - name: openai/text-davinci-003
716
- display_name: text-davinci-003
717
- description: text-davinci-003 model that involves reinforcement learning (PPO) with reward models. Derived from text-davinci-002 ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
718
- creator_organization: OpenAI
719
- access: limited
720
- num_parameters: 175000000000
721
- release_date: 2022-11-28
722
- - name: openai/text-davinci-002
723
- display_name: text-davinci-002
724
- description: text-davinci-002 model that involves supervised fine-tuning on human-written demonstrations. Derived from code-davinci-002 ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
725
- creator_organization: OpenAI
726
- access: limited
727
- num_parameters: 175000000000
728
- release_date: 2022-01-27
729
- - name: openai/text-davinci-001
730
- display_name: text-davinci-001
731
- description: text-davinci-001 model that involves supervised fine-tuning on human-written demonstrations ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
732
- creator_organization: OpenAI
733
- access: limited
734
- num_parameters: 175000000000
735
- release_date: 2022-01-27
736
- todo: true
737
- - name: openai/text-curie-001
738
- display_name: text-curie-001
739
- description: text-curie-001 model that involves supervised fine-tuning on human-written demonstrations ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
740
- creator_organization: OpenAI
741
- access: limited
742
- num_parameters: 6700000000
743
- release_date: 2022-01-27
744
- - name: openai/text-babbage-001
745
- display_name: text-babbage-001
746
- description: text-babbage-001 model that involves supervised fine-tuning on human-written demonstrations ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
747
- creator_organization: OpenAI
748
- access: limited
749
- num_parameters: 1300000000
750
- release_date: 2022-01-27
751
- - name: openai/text-ada-001
752
- display_name: text-ada-001
753
- description: text-ada-001 model that involves supervised fine-tuning on human-written demonstrations ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
754
- creator_organization: OpenAI
755
- access: limited
756
- num_parameters: 350000000
757
- release_date: 2022-01-27
758
- - name: openai/gpt-4-0314
759
- display_name: gpt-4-0314
760
- description: GPT-4 is a large multimodal model (currently only accepting text inputs and emitting text outputs) that is optimized for chat but works well for traditional completions tasks. Snapshot of gpt-4 from March 14th 2023.
761
- creator_organization: OpenAI
762
- access: limited
763
- release_date: 2023-03-14
764
- - name: openai/gpt-4-32k-0314
765
- display_name: gpt-4-32k-0314
766
- description: GPT-4 is a large multimodal model (currently only accepting text inputs and emitting text outputs) that is optimized for chat but works well for traditional completions tasks. Snapshot of gpt-4 with a longer context length of 32,768 tokens from March 14th 2023.
767
- creator_organization: OpenAI
768
- access: limited
769
- release_date: 2023-03-14
770
- - name: openai/gpt-4-0613
771
- display_name: gpt-4-0613
772
- description: GPT-4 is a large multimodal model (currently only accepting text inputs and emitting text outputs) that is optimized for chat but works well for traditional completions tasks. Snapshot of gpt-4 from 2023-06-13.
773
- creator_organization: OpenAI
774
- access: limited
775
- release_date: 2023-06-13
776
- - name: openai/gpt-4-32k-0613
777
- display_name: gpt-4-32k-0613
778
- description: GPT-4 is a large multimodal model (currently only accepting text inputs and emitting text outputs) that is optimized for chat but works well for traditional completions tasks. Snapshot of gpt-4 with a longer context length of 32,768 tokens from 2023-06-13.
779
- creator_organization: OpenAI
780
- access: limited
781
- release_date: 2023-06-13
782
- - name: openai/code-davinci-002
783
- display_name: code-davinci-002
784
- description: Codex-style model that is designed for pure code-completion tasks ([docs](https://beta.openai.com/docs/models/codex)).
785
- creator_organization: OpenAI
786
- access: limited
787
- - name: openai/code-davinci-001
788
- display_name: code-davinci-001
789
- description: code-davinci-001 model
790
- creator_organization: OpenAI
791
- access: limited
792
- todo: true
793
- - name: openai/code-cushman-001
794
- display_name: code-cushman-001 (12B)
795
- description: Codex-style model that is a stronger, multilingual version of the Codex (12B) model in the [Codex paper](https://arxiv.org/pdf/2107.03374.pdf).
796
- creator_organization: OpenAI
797
- access: limited
798
- - name: openai/gpt-3.5-turbo-0301
799
- display_name: gpt-3.5-turbo-0301
800
- description: Sibling model of text-davinci-003 is optimized for chat but works well for traditional completions tasks as well. Snapshot from 2023-03-01.
801
- creator_organization: OpenAI
802
- access: limited
803
- release_date: 2023-03-01
804
- - name: openai/gpt-3.5-turbo-0613
805
- display_name: gpt-3.5-turbo-0613
806
- description: Sibling model of text-davinci-003 is optimized for chat but works well for traditional completions tasks as well. Snapshot from 2023-06-13.
807
- creator_organization: OpenAI
808
- access: limited
809
- release_date: 2023-06-13
810
- - name: openai/gpt-3.5-turbo-16k-0613
811
- display_name: gpt-3.5-turbo-16k-0613
812
- description: Sibling model of text-davinci-003 is optimized for chat but works well for traditional completions tasks as well. Snapshot from 2023-06-13 with a longer context length of 16,384 tokens.
813
- creator_organization: OpenAI
814
- access: limited
815
- release_date: 2023-06-13
816
- - name: openai/gpt-4-1106-preview
817
- display_name: gpt-4-1106-preview
818
- description: GPT-4 Turbo (preview) is a large multimodal model that is optimized for chat but works well for traditional completions tasks. The model is cheaper and faster than the original GPT-4 model. Preview snapshot from November 6, 2023.
819
- creator_organization: OpenAI
820
- access: limited
821
- release_date: 2023-11-06
822
-
823
- # Together
824
- - name: together/Together-gpt-JT-6B-v1
825
- display_name: GPT-JT (6B)
826
- description: GPT-JT (6B parameters) is a fork of GPT-J ([blog post](https://www.together.xyz/blog/releasing-v1-of-gpt-jt-powered-by-open-source-ai)).
827
- creator_organization: Together
828
- access: open
829
- num_parameters: 6700000000
830
- release_date: 2022-11-29
831
- todo: true
832
- - name: together/gpt-neoxt-chat-base-20b
833
- display_name: GPT-NeoXT-Chat-Base (20B)
834
- description: GPT-NeoXT-Chat-Base (20B) is fine-tuned from GPT-NeoX, serving as a base model for developing open-source chatbots.
835
- creator_organization: Together
836
- access: open
837
- num_parameters: 20000000000
838
- release_date: 2023-03-08
839
- todo: true
840
- - name: together/redpajama-incite-base-3b-v1
841
- display_name: RedPajama-INCITE-Base-v1 (3B)
842
- description: RedPajama-INCITE-Base-v1 (3B parameters) is a 3 billion base model that aims to replicate the LLaMA recipe as closely as possible.
843
- creator_organization: Together
844
- access: open
845
- num_parameters: 3000000000
846
- release_date: 2023-05-05
847
- - name: together/redpajama-incite-instruct-3b-v1
848
- display_name: RedPajama-INCITE-Instruct-v1 (3B)
849
- description: RedPajama-INCITE-Instruct-v1 (3B parameters) is a model fine-tuned for few-shot applications on the data of GPT-JT. It is built from RedPajama-INCITE-Base-v1 (3B), a 3 billion base model that aims to replicate the LLaMA recipe as closely as possible.
850
- creator_organization: Together
851
- access: open
852
- num_parameters: 3000000000
853
- release_date: 2023-05-05
854
- todo: true
855
- - name: together/redpajama-incite-chat-3b-v1
856
- display_name: RedPajama-INCITE-Chat-v1 (3B)
857
- description: RedPajama-INCITE-Chat-v1 (3B parameters) is a model fine-tuned on OASST1 and Dolly2 to enhance chatting ability. It is built from RedPajama-INCITE-Base-v1 (3B), a 3 billion base model that aims to replicate the LLaMA recipe as closely as possible.
858
- creator_organization: Together
859
- access: open
860
- num_parameters: 3000000000
861
- release_date: 2023-05-05
862
- todo: true
863
- - name: together/redpajama-incite-base-7b
864
- display_name: RedPajama-INCITE-Base (7B)
865
- description: RedPajama-INCITE-Base (7B parameters) is a 7 billion base model that aims to replicate the LLaMA recipe as closely as possible.
866
- creator_organization: Together
867
- access: open
868
- num_parameters: 7000000000
869
- release_date: 2023-05-05
870
- todo: true
871
- - name: together/redpajama-incite-instruct-7b
872
- display_name: RedPajama-INCITE-Instruct (7B)
873
- description: RedPajama-INCITE-Instruct (7B parameters) is a model fine-tuned for few-shot applications on the data of GPT-JT. It is built from RedPajama-INCITE-Base (7B), a 7 billion base model that aims to replicate the LLaMA recipe as closely as possible.
874
- creator_organization: Together
875
- access: open
876
- num_parameters: 7000000000
877
- release_date: 2023-05-05
878
- todo: true
879
-
880
- # MosaicML
881
- - name: mosaicml/mpt-7b
882
- display_name: MPT (7B)
883
- description: MPT (7B) is a Transformer trained from scratch on 1T tokens of text and code.
884
- creator_organization: MosaicML
885
- access: open
886
- num_parameters: 6700000000
887
- release_date: 2023-05-05
888
- - name: mosaicml/mpt-7b-chat
889
- display_name: MPT-Chat (7B)
890
- description: MPT-Chat (7B) is a chatbot-like model for dialogue generation. It is built by finetuning MPT (30B) , a Transformer trained from scratch on 1T tokens of text and code.
891
- creator_organization: MosaicML
892
- access: open
893
- num_parameters: 6700000000
894
- release_date: 2023-05-05
895
- todo: true
896
- - name: mosaicml/mpt-instruct-7b
897
- display_name: MPT-Instruct (7B)
898
- description: MPT-Instruct (7B) is a model for short-form instruction following. It is built by finetuning MPT (30B), a Transformer trained from scratch on 1T tokens of text and code.
899
- creator_organization: MosaicML
900
- access: open
901
- num_parameters: 6700000000
902
- release_date: 2023-05-05
903
- - name: mosaicml/mpt-30b
904
- display_name: MPT (30B)
905
- description: MPT (30B) is a Transformer trained from scratch on 1T tokens of text and code.
906
- creator_organization: MosaicML
907
- access: open
908
- num_parameters: 30000000000
909
- release_date: 2023-06-22
910
- - name: mosaicml/mpt-30b-chat
911
- display_name: MPT-Chat (30B)
912
- description: MPT-Chat (30B) is a chatbot-like model for dialogue generation. It is built by finetuning MPT (30B), a Transformer trained from scratch on 1T tokens of text and code.
913
- creator_organization: MosaicML
914
- access: open
915
- num_parameters: 30000000000
916
- release_date: 2023-06-22
917
- todo: true
918
- - name: mosaicml/mpt-instruct-30b
919
- display_name: MPT-Instruct (30B)
920
- description: MPT-Instruct (30B) is a model for short-form instruction following. It is built by finetuning MPT (30B), a Transformer trained from scratch on 1T tokens of text and code.
921
- creator_organization: MosaicML
922
- access: open
923
- num_parameters: 30000000000
924
- release_date: 2023-06-22
925
-
926
- # TII UAE
927
- - name: tiiuae/falcon-7b
928
- display_name: Falcon (7B)
929
- description: Falcon-7B is a 7B parameters causal decoder-only model built by TII and trained on 1,500B tokens of RefinedWeb enhanced with curated corpora.
930
- creator_organization: TII UAE
931
- access: open
932
- num_parameters: 7000000000
933
- release_date: 2023-03-15
934
- - name: tiiuae/falcon-7b-instruct
935
- display_name: Falcon-Instruct (7B)
936
- description: Falcon-7B-Instruct is a 7B parameters causal decoder-only model built by TII based on Falcon-7B and finetuned on a mixture of chat/instruct datasets.
937
- creator_organization: TII UAE
938
- access: open
939
- num_parameters: 7000000000
940
- release_date: 2023-03-15
941
- - name: tiiuae/falcon-40b
942
- display_name: Falcon (40B)
943
- description: Falcon-40B is a 40B parameters causal decoder-only model built by TII and trained on 1,500B tokens of RefinedWeb enhanced with curated corpora.
944
- creator_organization: TII UAE
945
- access: open
946
- num_parameters: 40000000000
947
- release_date: 2023-05-25
948
- - name: tiiuae/falcon-40b-instruct
949
- display_name: Falcon-Instruct (40B)
950
- description: Falcon-40B-Instruct is a 40B parameters causal decoder-only model built by TII based on Falcon-7B and finetuned on a mixture of chat/instruct datasets.
951
- creator_organization: TII UAE
952
- access: open
953
- num_parameters: 40000000000
954
- release_date: 2023-05-25
955
-
956
- # Salesforce
957
- - name: together/codegen
958
- display_name: CodeGen (16B)
959
- description: CodeGen (16B parameters) is an open dense code model trained for multi-turn program synthesis ([blog](https://arxiv.org/pdf/2203.13474.pdf)).
960
- creator_organization: Tsinghua
961
- access: open
962
- num_parameters: 16000000000
963
- release_date: 2022-03-25
964
- todo: true
965
-
966
- # Tsinghua
967
- - name: together/glm
968
- display_name: GLM (130B)
969
- description: GLM (130B parameters) is an open bilingual (English & Chinese) bidirectional dense model that was trained using General Language Model (GLM) procedure ([paper](https://arxiv.org/pdf/2210.02414.pdf)).
970
- creator_organization: Tsinghua
971
- access: open
972
- num_parameters: 130000000000
973
- release_date: 2022-08-04
974
-
975
- - name: together/codegeex
976
- display_name: CodeGeeX (13B)
977
- description: CodeGeeX (13B parameters) is an open dense code model trained on more than 20 programming languages on a corpus of more than 850B tokens ([blog](http://keg.cs.tsinghua.edu.cn/codegeex/)).
978
- creator_organization: Tsinghua
979
- access: open
980
- num_parameters: 13000000000
981
- release_date: 2022-09-19
982
- todo: true
983
-
984
- # Writer
985
- - name: writer/palmyra-base
986
- display_name: Palmyra Base (5B)
987
- description: Palmyra Base (5B)
988
- creator_organization: Writer
989
- access: limited
990
- num_parameters: 5000000000
991
- release_date: 2022-10-13
992
- - name: writer/palmyra-large
993
- display_name: Palmyra Large (20B)
994
- description: Palmyra Large (20B)
995
- creator_organization: Writer
996
- access: limited
997
- num_parameters: 20000000000
998
- release_date: 2022-12-23
999
- - name: writer/palmyra-instruct-30
1000
- display_name: InstructPalmyra (30B)
1001
- description: InstructPalmyra (30B parameters) is trained using reinforcement learning techniques based on feedback from humans.
1002
- creator_organization: Writer
1003
- access: limited
1004
- num_parameters: 30000000000
1005
- release_date: 2023-02-16
1006
- - name: writer/palmyra-e
1007
- display_name: Palmyra E (30B)
1008
- description: Palmyra E (30B)
1009
- creator_organization: Writer
1010
- access: limited
1011
- num_parameters: 30000000000
1012
- release_date: 2023-03-03
1013
- - name: writer/silk-road
1014
- display_name: Silk Road (35B)
1015
- description: Silk Road (35B)
1016
- creator_organization: Writer
1017
- access: limited
1018
- num_parameters: 35000000000
1019
- release_date: 2023-04-13
1020
- - name: writer/palmyra-x
1021
- display_name: Palmyra X (43B)
1022
- description: Palmyra-X (43B parameters) is trained to adhere to instructions using human feedback and utilizes a technique called multiquery attention. Furthermore, a new feature called 'self-instruct' has been introduced, which includes the implementation of an early stopping criteria specifically designed for minimal instruction tuning ([paper](https://dev.writer.com/docs/becoming-self-instruct-introducing-early-stopping-criteria-for-minimal-instruct-tuning)).
1023
- creator_organization: Writer
1024
- access: limited
1025
- num_parameters: 43000000000
1026
- release_date: 2023-06-11
1027
- - name: writer/palmyra-x-v2
1028
- display_name: Palmyra X V2 (33B)
1029
- description: Palmyra-X V2 (33B parameters) is a Transformer-based model, which is trained on extremely large-scale pre-training data. The pre-training data more than 2 trillion tokens types are diverse and cover a wide range of areas, used FlashAttention-2.
1030
- creator_organization: Writer
1031
- access: limited
1032
- num_parameters: 33000000000
1033
- release_date: 2023-12-01
1034
- - name: writer/palmyra-x-v3
1035
- display_name: Palmyra X V3 (72B)
1036
- description: Palmyra-X V3 (72B parameters) is a Transformer-based model, which is trained on extremely large-scale pre-training data. It is trained via unsupervised learning and DPO and use multiquery attention.
1037
- creator_organization: Writer
1038
- access: limited
1039
- num_parameters: 72000000000
1040
- release_date: 2023-12-01
1041
- - name: writer/palmyra-x-32k
1042
- display_name: Palmyra X-32K (33B)
1043
- description: Palmyra-X-32K (33B parameters) is a Transformer-based model, which is trained on large-scale pre-training data. The pre-training data types are diverse and cover a wide range of areas. These data types are used in conjunction and the alignment mechanism to extend context window.
1044
- creator_organization: Writer
1045
- access: limited
1046
- num_parameters: 33000000000
1047
- release_date: 2023-12-01
1048
-
1049
- # Yandex
1050
- - name: together/yalm
1051
- display_name: YaLM (100B)
1052
- description: YaLM (100B parameters) is an autoregressive language model trained on English and Russian text ([GitHub](https://github.com/yandex/YaLM-100B)).
1053
- creator_organization: Yandex
1054
- access: open
1055
- num_parameters: 100000000000
1056
- release_date: 2022-06-23
1057
-
1058
- # NVIDIA
1059
- - name: nvidia/megatron-gpt2
1060
- display_name: Megatron GPT2
1061
- description: GPT-2 implemented in Megatron-LM ([paper](https://arxiv.org/abs/1909.08053)).
1062
- creator_organization: NVIDIA
1063
- access: open
1064
- todo: true
1065
-
1066
- ############################################################
1067
- adapter:
1068
- - name: method
1069
- description: The high-level strategy for converting instances into a prompt for the language model.
1070
- values:
1071
- - name: generation
1072
- description: Given the input, the model generates the output free-form.
1073
- - name: multiple_choice_joint
1074
- description: Given the input, the model selects from multiple-choice options (A., B., C., D., E.).
1075
- - name: multiple_choice_separate_original
1076
- description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability.
1077
- - name: multiple_choice_separate_calibrated
1078
- description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability when calibrated by answer choice probability.
1079
- - name: language_modeling
1080
- description: Given the input, the model assigns the sequence a probability.
1081
- - name: instructions
1082
- description: The description of the task that is included at the very beginning of the prompt.
1083
- - name: global_prefix
1084
- description: The string that is prepended to the prompt.
1085
- - name: instance_prefix
1086
- description: The string that is included before each instance (e.g., '\n\n').
1087
- - name: input_prefix
1088
- description: The string that is included before each input (e.g., 'Question:').
1089
- - name: input_suffix
1090
- description: The string that is included after each input (e.g., '\n').
1091
- - name: reference_prefix
1092
- description: The string that is included before each reference (for multiple-choice questions).
1093
- - name: reference_suffix
1094
- description: The string that is included after each reference (for multiple-choice questions).
1095
- - name: output_prefix
1096
- description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
1097
- - name: output_suffix
1098
- description: The string that is included after the correct answer/predicted output (e.g., '\n').
1099
- - name: substitutions
1100
- description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
1101
- - name: max_train_instances
1102
- description: Maximum number of training instances to include in the prompt (currently by randomly sampling).
1103
- - name: max_eval_instances
1104
- description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
1105
- - name: num_outputs
1106
- description: Maximum number of possible outputs to generate by sampling multiple outputs.
1107
- - name: num_train_trials
1108
- description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
1109
- - name: sample_train
1110
- description: If true, randomly sample N training examples; if false, select N consecutive training examples
1111
- - name: model
1112
- description: DEPRECATED. Name of the language model (<creator_organization>/<model name>) to send requests to.
1113
- - name: model_deployment
1114
- description: Name of the language model (<host_organization>/<model name>) to send requests to.
1115
- - name: temperature
1116
- description: Temperature parameter used in generation.
1117
- - name: max_tokens
1118
- description: Maximum number of tokens to generate.
1119
- - name: stop_sequences
1120
- description: List of sequences, where we stop generation if we encounter any of them.
1121
- - name: random
1122
- description: Random seed (string), which guarantees reproducibility.
1123
- - name: multi_label
1124
- description: If true, for instances with multiple correct reference, the gold answer should be considered to be all of the correct references rather than any of the correct references.
1125
-
3
+ # For backwards compatibility with older versions of HELM.
4
+ # TODO: Remove this after 2024-09-01.
5
+ adapter: []
1126
6
  ############################################################
1127
7
  metrics:
1128
8
  # Infrastructure metrics:
@@ -1632,6 +512,50 @@ metrics:
1632
512
  lower_is_better: false
1633
513
  # CLEVA Chinese bias, copyright and toxicity metrics share the same name as the original HELM metrics
1634
514
 
515
+
516
+ # DecodingTrust Fairness Metrics
517
+ - name: equalized_odds_difference
518
+ display_name: Equalized Odds Difference
519
+ short_display_name: Equalized Odds
520
+ description: The rate at which the model refuses to answer an OoD question.
521
+ lower_is_better: true
522
+
523
+ - name: demographic_parity_difference
524
+ display_name: Demographic Parity Difference
525
+ short_display_name: Demographic Parity
526
+ description: The demographic parity difference measures the difference between the probability of positive predictions conditioned on sensitive attribute A = 1 and that conditioned on A = 0.
527
+ lower_is_better: true
528
+
529
+
530
+ # DecodingTrust OoD Metrics
531
+ - name: ood_knowledge_rr
532
+ display_name: OOD Knowledge Refusal Rate
533
+ short_display_name: OoD-RR
534
+ description: The rate at which the model refuses to answer an OoD question.
535
+ lower_is_better: false
536
+
537
+ - name: ood_knowledge_macc
538
+ display_name: OOD Meaningful Answer Accuracy (Exclude Refusal)
539
+ short_display_name: OoD-MACC
540
+ description: MACC indicates the percentage of correct answers when the model successfully generates meaningful responses by excluding outputs that are refused to answer
541
+ lower_is_better: false
542
+
543
+
544
+ # DecodingTrust Stereotype Bias Metrics
545
+
546
+ - name: decodingtrust_stereotype_bias
547
+ display_name: DecodingTrust Bias Score
548
+ short_display_name: Bias Score
549
+ description: 1 - agreementIndex (Section 4.2 of DecodingTrust)
550
+ lower_is_better: false
551
+
552
+ - name: decodingtrust_stereotype_bias_rejection_rate
553
+ display_name: Stereotype Bias Rejection Rate
554
+ short_display_name: Bias Rejection Rate
555
+ description: Rejection rate of stereotype prompts
556
+ lower_is_better: false
557
+
558
+
1635
559
  ############################################################
1636
560
  perturbations:
1637
561
  - name: robustness
@@ -1915,6 +839,31 @@ metric_groups:
1915
839
  - name: chinese_bleu_1
1916
840
  split: ${main_split}
1917
841
 
842
+ - name: decodingtrust_fairness_metrics
843
+ display_name: DecodingTrust Fairness
844
+ metrics:
845
+ - name: equalized_odds_difference
846
+ split: ${main_split}
847
+ - name: demographic_parity_difference
848
+ split: ${main_split}
849
+
850
+ - name: decodingtrust_ood_metrics
851
+ display_name: DecodingTrust OOD Accuracy
852
+ metrics:
853
+ - name: ood_knowledge_rr
854
+ split: ${main_split}
855
+ - name: ood_knowledge_macc
856
+ split: ${main_split}
857
+
858
+ - name: decodingtrust_stereotype_bias_metrics
859
+ display_name: DecodingTrust Stereotype Bias
860
+ metrics:
861
+ - name: decodingtrust_stereotype_bias
862
+ split: ${main_split}
863
+ - name: decodingtrust_stereotype_bias_rejection_rate
864
+ split: ${main_split}
865
+
866
+
1918
867
  ############################################################
1919
868
  run_groups:
1920
869
  ## Top-level
@@ -2097,6 +1046,20 @@ run_groups:
2097
1046
  main_name: none
2098
1047
  main_split: none
2099
1048
 
1049
+ - name: decodingtrust
1050
+ display_name: DecodingTrust
1051
+ description: A comprehensive benchmark of the trustworthiness of large language models [(Wang et al. 2023)](https://decodingtrust.github.io/)
1052
+ category: Core scenarios
1053
+ subgroups:
1054
+ - decodingtrust_adv_robustness
1055
+ - decodingtrust_adv_demonstration
1056
+ - decodingtrust_ood_robustness
1057
+ - decodingtrust_fairness
1058
+ - decodingtrust_privacy
1059
+ - decodingtrust_machine_ethics
1060
+ - decodingtrust_toxicity_prompts
1061
+ - decodingtrust_stereotype_bias
1062
+
2100
1063
  ### Ablations
2101
1064
  - name: ablation_in_context
2102
1065
  display_name: Vary number of in-context examples
@@ -2720,23 +1683,6 @@ run_groups:
2720
1683
  when: n/a
2721
1684
  language: synthetic
2722
1685
 
2723
- - name: numeracy
2724
- display_name: Numerical reasoning
2725
- description: Scenario introduced in this work to test numerical reasoning via symbolic regression.
2726
- metric_groups:
2727
- - accuracy
2728
- - efficiency
2729
- - general_information
2730
- environment:
2731
- main_name: absolute_value_difference
2732
- main_split: test
2733
- taxonomy:
2734
- task: next-word prediction
2735
- what: Dyck formal language
2736
- who: n/a
2737
- when: n/a
2738
- language: synthetic
2739
-
2740
1686
  - name: synthetic_reasoning
2741
1687
  display_name: Synthetic reasoning (abstract symbols)
2742
1688
  description: Synthetic reasoning tasks defined using abstract symbols based on LIME [(Wu et al., 2021)](https://proceedings.mlr.press/v139/wu21c.html).
@@ -3873,3 +2819,176 @@ run_groups:
3873
2819
  task: user-facing tasks
3874
2820
  language: English dialects
3875
2821
  todo: true
2822
+
2823
+
2824
+ # DecodingTrust scenarios
2825
+ - name: decodingtrust_adv_robustness
2826
+ display_name: DecodingTrust - AdvGLUE++
2827
+ short_display_name: AdvGLUE++
2828
+ description: Adversarial perturbations of the GLUE dataset generated against open-source LLMs including Alpaca, Vicuna, and Stable-Vicuna
2829
+ metric_groups:
2830
+ - accuracy
2831
+ - calibration
2832
+ - efficiency
2833
+ - general_information
2834
+ environment:
2835
+ main_name: quasi_exact_match
2836
+ main_split: test
2837
+ taxonomy:
2838
+ task: text classification
2839
+ what: "?"
2840
+ who: "?"
2841
+ when: "?"
2842
+ language: English
2843
+ todo: true
2844
+
2845
+ - name: decodingtrust_adv_demonstration
2846
+ display_name: DecodingTrust - Adversarial Demonstrations
2847
+ short_display_name: AdvDemo
2848
+ description: Robustness analysis of LM generations when facing adversarial demonstrations
2849
+ metric_groups:
2850
+ - accuracy
2851
+ - calibration
2852
+ - efficiency
2853
+ - general_information
2854
+ environment:
2855
+ main_name: quasi_exact_match
2856
+ main_split: test
2857
+ taxonomy:
2858
+ task: text classification
2859
+ what: "?"
2860
+ who: "?"
2861
+ when: "?"
2862
+ language: English
2863
+
2864
+ - name: decodingtrust_ood_robustness
2865
+ display_name: DecodingTrust - OoD Robustness
2866
+ short_display_name: OoD
2867
+ description: Style perturbations of GLUE datasets (OoD styles) and out-of-scope OoD knowledge evaluations
2868
+ metric_groups:
2869
+ - accuracy
2870
+ - calibration
2871
+ - efficiency
2872
+ - general_information
2873
+ - decodingtrust_ood_metrics
2874
+ environment:
2875
+ main_name: quasi_exact_match
2876
+ main_split: test
2877
+ taxonomy:
2878
+ task: text classification
2879
+ what: "?"
2880
+ who: "?"
2881
+ when: "?"
2882
+ language: English
2883
+
2884
+ - name: decodingtrust_fairness
2885
+ display_name: DecodingTrust - Fairness
2886
+ short_display_name: Fairness
2887
+ description: Fairness analysis of LLMs
2888
+ metric_groups:
2889
+ - accuracy
2890
+ - calibration
2891
+ - efficiency
2892
+ - general_information
2893
+ - decodingtrust_fairness_metrics
2894
+ environment:
2895
+ main_name: quasi_exact_match
2896
+ main_split: test
2897
+ taxonomy:
2898
+ task: text classification
2899
+ what: "?"
2900
+ who: "?"
2901
+ when: "?"
2902
+ language: English
2903
+
2904
+ - name: decodingtrust_privacy
2905
+ display_name: DecodingTrust - Privacy
2906
+ short_display_name: Privacy
2907
+ description: Evaluation of the privacy understanding and privacy preserving properties of LLMs
2908
+ metric_groups:
2909
+ - accuracy
2910
+ - calibration
2911
+ - efficiency
2912
+ - general_information
2913
+ environment:
2914
+ main_name: quasi_exact_match
2915
+ main_split: test
2916
+ taxonomy:
2917
+ task: text classification
2918
+ what: "?"
2919
+ who: "?"
2920
+ when: "?"
2921
+ language: English
2922
+
2923
+ - name: decodingtrust_machine_ethics
2924
+ display_name: DecodingTrust - Ethics
2925
+ short_display_name: Ethics
2926
+ description: Evaluation of the understanding of ethical behaviors of LLMs
2927
+ metric_groups:
2928
+ - accuracy
2929
+ - calibration
2930
+ - efficiency
2931
+ - general_information
2932
+ environment:
2933
+ main_name: quasi_exact_match
2934
+ main_split: test
2935
+ taxonomy:
2936
+ task: text classification
2937
+ what: "?"
2938
+ who: "?"
2939
+ when: "?"
2940
+ language: English
2941
+
2942
+ - name: decodingtrust_toxicity_prompts
2943
+ display_name: DecodingTrust - Toxicity
2944
+ short_display_name: Toxicity
2945
+ description: Evaluation of the tendency of LLMs to generate toxic content when given toxic and non-toxic prompts
2946
+ metric_groups:
2947
+ - toxicity
2948
+ - bias
2949
+ - efficiency
2950
+ - general_information
2951
+ environment:
2952
+ main_split: test
2953
+ taxonomy:
2954
+ task: "?"
2955
+ what: n/a
2956
+ who: n/a
2957
+ when: n/a
2958
+ language: synthetic
2959
+
2960
+ - name: decodingtrust_stereotype_bias
2961
+ display_name: DecodingTrust - Stereotype Bias
2962
+ short_display_name: Stereotype
2963
+ description: Manually crafted stereotype user prompts from DecodingTrust
2964
+ metric_groups:
2965
+ - toxicity
2966
+ - bias
2967
+ - efficiency
2968
+ - general_information
2969
+ - decodingtrust_stereotype_bias_metrics
2970
+ environment:
2971
+ main_split: test
2972
+ taxonomy:
2973
+ task: "?"
2974
+ what: n/a
2975
+ who: n/a
2976
+ when: n/a
2977
+ language: synthetic
2978
+
2979
+ - name: thai_exam
2980
+ display_name: Thai Exam
2981
+ short_display_name: ThaiExam
2982
+ description: A benchmark comprising Thai multiple-choice examinations.
2983
+ metric_groups:
2984
+ - accuracy
2985
+ - general_information
2986
+ environment:
2987
+ main_name: exact_match
2988
+ main_split: test
2989
+ taxonomy:
2990
+ task: question answering
2991
+ what: "?"
2992
+ who: "?"
2993
+ when: "?"
2994
+ language: Thai