crfm-helm: crfm_helm-0.4.0-py3-none-any.whl → crfm_helm-0.5.10-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic. Click here for more details.
- crfm_helm-0.5.10.dist-info/METADATA +369 -0
- crfm_helm-0.5.10.dist-info/RECORD +1008 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +80 -29
- helm/benchmark/adaptation/adapters/adapter.py +2 -2
- helm/benchmark/adaptation/adapters/adapter_factory.py +39 -28
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +24 -8
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +3 -4
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +112 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +6 -3
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +3 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +18 -8
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +5 -4
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +46 -22
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +17 -29
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +138 -16
- helm/benchmark/adaptation/common_adapter_specs.py +443 -0
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/aci_bench_annotator.py +84 -0
- helm/benchmark/annotation/air_bench_annotator.py +79 -0
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/annotator.py +48 -0
- helm/benchmark/annotation/annotator_factory.py +50 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +258 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +82 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +96 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +55 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +93 -0
- helm/benchmark/annotation/image2struct/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2struct/lilypond_compiler_annotator.py +86 -0
- helm/benchmark/annotation/image2struct/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/live_qa_annotator.py +76 -0
- helm/benchmark/annotation/med_dialog_annotator.py +88 -0
- helm/benchmark/annotation/medalign_annotator.py +89 -0
- helm/benchmark/annotation/medi_qa_annotator.py +87 -0
- helm/benchmark/annotation/medication_qa_annotator.py +86 -0
- helm/benchmark/annotation/mental_health_annotator.py +87 -0
- helm/benchmark/annotation/mimic_bhc_annotator.py +89 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +89 -0
- helm/benchmark/annotation/model_as_judge.py +309 -0
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +87 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +90 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +131 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +87 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +100 -0
- helm/benchmark/annotation_executor.py +144 -0
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +3 -3
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +26 -4
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +56 -19
- helm/benchmark/augmentations/translate_perturbation.py +31 -0
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/config_registry.py +7 -1
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/executor.py +54 -25
- helm/benchmark/huggingface_registration.py +28 -10
- helm/benchmark/metrics/air_bench_metrics.py +3212 -0
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/basic_metrics.py +437 -667
- helm/benchmark/metrics/bbq_metrics.py +17 -6
- helm/benchmark/metrics/bias_metrics.py +18 -9
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/classification_metrics.py +107 -22
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +12 -11
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/code_metrics_helper.py +11 -3
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +174 -0
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +83 -0
- helm/benchmark/metrics/copyright_metrics.py +5 -5
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +8 -114
- helm/benchmark/metrics/dry_run_metrics.py +35 -6
- helm/benchmark/metrics/efficiency_metrics.py +287 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +159 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +831 -0
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +115 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +48 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +67 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +84 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +20 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +48 -5
- helm/benchmark/metrics/kpi_edgar_metrics.py +142 -0
- helm/benchmark/metrics/language_modeling_metrics.py +111 -0
- helm/benchmark/metrics/live_qa_metrics.py +35 -0
- helm/benchmark/metrics/llm_jury_metrics.py +58 -0
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/machine_translation_metrics.py +89 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +137 -0
- helm/benchmark/metrics/medec_metrics.py +124 -0
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/metric.py +121 -175
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +23 -7
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +127 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/omni_math_metrics.py +44 -0
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +5 -6
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/ranking_metrics.py +5 -5
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/safety_metrics.py +91 -0
- helm/benchmark/metrics/seahelm_metrics.py +201 -0
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +8 -11
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +150 -11
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +145 -70
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +20 -1
- helm/benchmark/metrics/test_metric.py +3 -3
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +11 -3
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +3 -3
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +7 -7
- helm/benchmark/metrics/toxicity_metrics.py +37 -7
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/unitxt_metrics.py +107 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +347 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +537 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/metrics/wildbench_metrics.py +54 -0
- helm/benchmark/model_deployment_registry.py +69 -5
- helm/benchmark/model_metadata_registry.py +58 -2
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +51 -20
- helm/benchmark/presentation/run_display.py +51 -12
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +83 -66
- helm/benchmark/presentation/summarize.py +483 -388
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/presentation/test_run_entry.py +2 -2
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/presentation/test_summarize.py +148 -6
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +202 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +151 -87
- helm/benchmark/run_expander.py +418 -33
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +180 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +58 -0
- helm/benchmark/run_specs/arabic_run_specs.py +197 -0
- helm/benchmark/run_specs/audio_run_specs.py +657 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/call_center_run_specs.py +201 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1393 -0
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +316 -0
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
- helm/benchmark/run_specs/experimental_run_specs.py +224 -0
- helm/benchmark/run_specs/finance_run_specs.py +114 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +625 -0
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +188 -0
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1570 -0
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +191 -0
- helm/benchmark/run_specs/seahelm_run_specs.py +652 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +167 -0
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +51 -0
- helm/benchmark/run_specs/vlm_run_specs.py +1057 -0
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/runner.py +63 -62
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +149 -0
- helm/benchmark/scenarios/air_bench_scenario.py +76 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +27 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +82 -0
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +79 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +16 -1
- helm/benchmark/scenarios/banking77_scenario.py +77 -0
- helm/benchmark/scenarios/bbq_scenario.py +17 -2
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +112 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +18 -3
- helm/benchmark/scenarios/boolq_scenario.py +21 -1
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +129 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +14 -1
- helm/benchmark/scenarios/clear_scenario.py +180 -0
- helm/benchmark/scenarios/cleva_scenario.py +482 -3
- helm/benchmark/scenarios/code_scenario.py +46 -4
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +33 -1
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +118 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +261 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +148 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +190 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +143 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +98 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +344 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +217 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +571 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +80 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +90 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -3
- helm/benchmark/scenarios/dischargeme_scenario.py +196 -0
- helm/benchmark/scenarios/disinformation_scenario.py +32 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +25 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1541 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +77 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +33 -3
- helm/benchmark/scenarios/entity_matching_scenario.py +26 -2
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +139 -0
- helm/benchmark/scenarios/financebench_scenario.py +74 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +115 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +145 -0
- helm/benchmark/scenarios/gpqa_scenario.py +98 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/grammar_scenario.py +21 -2
- helm/benchmark/scenarios/gsm_scenario.py +31 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +61 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +70 -0
- helm/benchmark/scenarios/headqa_scenario.py +158 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/ice_scenario.py +28 -4
- helm/benchmark/scenarios/ifeval_scenario.py +71 -0
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +26 -3
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +98 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +172 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +149 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +61 -1
- helm/benchmark/scenarios/legal_support_scenario.py +24 -1
- helm/benchmark/scenarios/legalbench_scenario.py +45 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +23 -2
- helm/benchmark/scenarios/lextreme_scenario.py +22 -1
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +15 -1
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +81 -22
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +56 -22
- helm/benchmark/scenarios/med_mcqa_scenario.py +24 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +30 -1
- helm/benchmark/scenarios/medalign_scenario.py +117 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +326 -0
- helm/benchmark/scenarios/medbullets_scenario.py +167 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +149 -0
- helm/benchmark/scenarios/medec_scenario.py +148 -0
- helm/benchmark/scenarios/medhallu_scenario.py +95 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +134 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +96 -0
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +146 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +127 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +121 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +99 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +113 -0
- helm/benchmark/scenarios/mmlu_scenario.py +32 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +31 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +166 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +164 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +297 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +33 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +71 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +33 -2
- helm/benchmark/scenarios/openai_mrcr_scenario.py +94 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -5
- helm/benchmark/scenarios/pubmed_qa_scenario.py +81 -43
- helm/benchmark/scenarios/quac_scenario.py +24 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +175 -0
- helm/benchmark/scenarios/raft_scenario.py +33 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +128 -0
- helm/benchmark/scenarios/scenario.py +44 -1
- helm/benchmark/scenarios/seahelm_scenario.py +2295 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +97 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +95 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +99 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +98 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +94 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +100 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +98 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +104 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +98 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +44 -0
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +109 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +119 -0
- helm/benchmark/scenarios/summarization_scenario.py +48 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +24 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +29 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +4 -3
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +239 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +26 -2
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/unitxt_scenario.py +62 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +4 -2
- helm/benchmark/scenarios/vicuna_scenario.py +22 -2
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +135 -0
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2struct/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2struct/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2struct/image2struct_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/image2struct/latex_scenario.py +21 -0
- helm/benchmark/scenarios/vision_language/image2struct/musicsheet_scenario.py +16 -0
- helm/benchmark/scenarios/vision_language/image2struct/utils_latex.py +339 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage_scenario.py +256 -0
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +148 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +105 -0
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +131 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +4 -5
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +8 -4
- helm/benchmark/scenarios/wikifact_scenario.py +31 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +101 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +33 -2
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +32 -2
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +78 -50
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_audio.yaml +763 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +269 -0
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_classic.yaml +259 -1140
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +319 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +191 -0
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_image2struct.yaml +588 -0
- helm/benchmark/static/schema_instruction_following.yaml +161 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/schema_lite.yaml +3 -286
- helm/benchmark/static/schema_long_context.yaml +282 -0
- helm/benchmark/static/schema_medhelm.yaml +1176 -0
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_mmlu.yaml +1449 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +283 -0
- helm/benchmark/static/schema_seahelm.yaml +723 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/schema_thai.yaml +244 -0
- helm/benchmark/static/schema_torr.yaml +474 -0
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_unitxt.yaml +370 -0
- helm/benchmark/static/schema_vhelm.yaml +933 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/air-overview-DpBbyagA.png +0 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-BJtQlEbV.png +0 -0
- helm/benchmark/static_build/assets/helm-safety-COfndXuS.png +0 -0
- helm/benchmark/static_build/assets/helmhero-D9TvmJsp.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/medhelm-overview-CND0EIsy.png +0 -0
- helm/benchmark/static_build/assets/medhelm-v1-overview-Cu2tphBB.png +0 -0
- helm/benchmark/static_build/assets/overview-BwypNWnk.png +0 -0
- helm/benchmark/static_build/assets/process-flow-DWDJC733.png +0 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/assets/vhelm-aspects-NiDQofvP.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-NxJE4fdA.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-ypCL5Yvq.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +19 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/window_services/default_window_service.py +3 -45
- helm/benchmark/window_services/encoder_decoder_window_service.py +4 -15
- helm/benchmark/window_services/ice_window_service.py +1 -35
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +13 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +22 -5
- helm/benchmark/window_services/test_anthropic_window_service.py +5 -4
- helm/benchmark/window_services/test_bloom_window_service.py +5 -4
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +9 -4
- helm/benchmark/window_services/test_gpt4_window_service.py +10 -4
- helm/benchmark/window_services/test_gptj_window_service.py +11 -5
- helm/benchmark/window_services/test_gptneox_window_service.py +6 -5
- helm/benchmark/window_services/test_openai_window_service.py +18 -12
- helm/benchmark/window_services/test_opt_window_service.py +6 -5
- helm/benchmark/window_services/test_palmyra_window_service.py +5 -4
- helm/benchmark/window_services/test_t0pp_window_service.py +5 -4
- helm/benchmark/window_services/test_t511b_window_service.py +5 -4
- helm/benchmark/window_services/test_ul2_window_service.py +5 -4
- helm/benchmark/window_services/test_utils.py +6 -6
- helm/benchmark/window_services/test_yalm_window_service.py +5 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -13
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +4 -1
- helm/benchmark/window_services/yalm_window_service.py +1 -28
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +78 -12
- helm/clients/aleph_alpha_client.py +114 -0
- helm/{proxy/clients → clients}/anthropic_client.py +304 -21
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +122 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +199 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +210 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +191 -0
- helm/clients/audio_language/qwen_audiolm_client.py +153 -0
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/audio_language/test.py +62 -0
- helm/{proxy/clients → clients}/auto_client.py +72 -31
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +381 -0
- helm/clients/bedrock_utils.py +105 -0
- helm/{proxy/clients → clients}/client.py +92 -17
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +105 -14
- helm/clients/dspy_client.py +135 -0
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +8 -6
- helm/clients/google_translate_client.py +35 -0
- helm/clients/grok_client.py +36 -0
- helm/{proxy/clients → clients}/http_model_client.py +8 -8
- helm/{proxy/clients → clients}/huggingface_client.py +157 -86
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/ibm_client.py +269 -0
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +80 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +100 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +99 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +144 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +192 -0
- helm/clients/image_generation/dalle2_client.py +194 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +191 -0
- helm/clients/image_generation/deep_floyd_client.py +80 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +250 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +88 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +116 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +113 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +6 -6
- helm/{proxy/clients → clients}/megatron_client.py +7 -5
- helm/clients/mistral_client.py +180 -0
- helm/clients/moderation_api_client.py +111 -0
- helm/clients/nvidia_nim_client.py +32 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +604 -0
- helm/clients/openai_responses_client.py +200 -0
- helm/clients/openrouter_client.py +31 -0
- helm/{proxy/clients → clients}/palmyra_client.py +31 -14
- helm/{proxy/clients → clients}/perspective_api_client.py +18 -14
- helm/clients/reka_client.py +190 -0
- helm/clients/simple_client.py +64 -0
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +95 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/{proxy/clients → clients}/test_auto_client.py +13 -15
- helm/clients/test_client.py +98 -0
- helm/{proxy/clients → clients}/test_huggingface_client.py +31 -16
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/test_simple_client.py +19 -0
- helm/clients/test_together_client.py +184 -0
- helm/clients/together_client.py +599 -0
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +488 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +148 -0
- helm/clients/vision_language/huggingface_vlm_client.py +114 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +61 -51
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/paligemma_client.py +147 -0
- helm/clients/vision_language/palmyra_vision_client.py +101 -0
- helm/clients/vision_language/qwen2_vlm_client.py +189 -0
- helm/clients/vision_language/qwen_vlm_client.py +174 -0
- helm/clients/vllm_client.py +80 -0
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +105 -0
- helm/clients/yi_client.py +28 -0
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +23 -33
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/critique_request.py +10 -2
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +10 -3
- helm/common/hierarchical_logger.py +124 -12
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +60 -5
- helm/common/key_value_store.py +41 -10
- helm/common/local_context.py +140 -0
- helm/common/media_object.py +14 -1
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +8 -7
- helm/common/multimodal_request_utils.py +57 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/object_spec.py +23 -8
- helm/common/optional_dependencies.py +1 -1
- helm/common/reeval_parameters.py +12 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +45 -19
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_general.py +10 -0
- helm/common/test_logging.py +94 -0
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +1 -10
- helm/config/model_deployments.yaml +4713 -1005
- helm/config/model_metadata.yaml +4045 -255
- helm/config/tokenizer_configs.yaml +1091 -50
- helm/proxy/accounts.py +31 -4
- helm/proxy/cli.py +6 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/critique/model_critique_client.py +40 -10
- helm/proxy/example_queries.py +33 -28
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +82 -18
- helm/proxy/services/remote_service.py +32 -7
- helm/proxy/services/server_service.py +71 -69
- helm/proxy/services/service.py +30 -6
- helm/proxy/services/test_remote_service.py +6 -5
- helm/proxy/services/test_service.py +1 -13
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +61 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +462 -0
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/tokenizers/__init__.py +0 -0
- helm/tokenizers/ai21_tokenizer.py +52 -0
- helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +9 -12
- helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +2 -30
- helm/tokenizers/cohere_tokenizer.py +50 -0
- helm/tokenizers/grok_tokenizer.py +55 -0
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +4 -4
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +44 -41
- helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +9 -2
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/tokenizer.py +3 -1
- helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +8 -6
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.4.0.dist-info/METADATA +0 -264
- crfm_helm-0.4.0.dist-info/RECORD +0 -397
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/run_specs.py +0 -2762
- helm/benchmark/scenarios/numeracy_scenario.py +0 -784
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/test_model_deployment_definition.py +0 -92
- helm/benchmark/test_model_properties.py +0 -1570
- helm/benchmark/vlm_run_specs.py +0 -97
- helm/benchmark/window_services/ai21_window_service.py +0 -258
- helm/benchmark/window_services/cohere_window_service.py +0 -163
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/huggingface_window_service.py +0 -60
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -74
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -326
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/common/cache_utils.py +0 -14
- helm/proxy/clients/aleph_alpha_client.py +0 -95
- helm/proxy/clients/goose_ai_client.py +0 -99
- helm/proxy/clients/microsoft_client.py +0 -180
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/simple_client.py +0 -60
- helm/proxy/clients/test_client.py +0 -49
- helm/proxy/clients/test_together_client.py +0 -97
- helm/proxy/clients/together_client.py +0 -334
- helm/proxy/clients/vertexai_client.py +0 -115
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
- helm/proxy/token_counters/test_openai_token_counter.py +0 -81
- helm/proxy/tokenizers/ai21_tokenizer.py +0 -60
- helm/proxy/tokenizers/anthropic_tokenizer.py +0 -52
- helm/proxy/tokenizers/cohere_tokenizer.py +0 -83
- helm/proxy/tokenizers/ice_tokenizer.py +0 -30
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- helm/proxy/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → annotation}/__init__.py +0 -0
- /helm/{proxy/clients → benchmark/annotation/image2struct}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/metrics/ifeval}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/benchmark/{static/images/crfm-logo.png → static_build/assets/crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/{static/images/helm-logo-simple.png → static_build/assets/helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{benchmark → proxy}/static/general.js +0 -0
- /helm/{benchmark → proxy}/static/info-icon.png +0 -0
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
<!doctype html>
|
|
2
|
+
<html lang="en">
|
|
3
|
+
<head>
|
|
4
|
+
<meta charset="UTF-8" />
|
|
5
|
+
<link rel="icon" type="image/svg+xml" href="https://crfm.stanford.edu/helm/helm.svg" />
|
|
6
|
+
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
|
7
|
+
<title>Holistic Evaluation of Language Models (HELM)</title>
|
|
8
|
+
<meta name="description" content="The Holistic Evaluation of Language Models (HELM) serves as a living benchmark for transparency in language models. Providing broad coverage and recognizing incompleteness, multi-metric measurements, and standardization. All data and analysis are freely accessible on the website for exploration and study." />
|
|
9
|
+
<script type="text/javascript" src="./config.js"></script>
|
|
10
|
+
<script type="module" crossorigin src="./assets/index-qOFpOyHb.js"></script>
|
|
11
|
+
<link rel="modulepreload" crossorigin href="./assets/react-BteFIppM.js">
|
|
12
|
+
<link rel="modulepreload" crossorigin href="./assets/recharts-DxuQtTOs.js">
|
|
13
|
+
<link rel="modulepreload" crossorigin href="./assets/tremor-DR4fE7ko.js">
|
|
14
|
+
<link rel="stylesheet" crossorigin href="./assets/index-oIeiQW2g.css">
|
|
15
|
+
</head>
|
|
16
|
+
<body class="block">
|
|
17
|
+
<div id="root"></div>
|
|
18
|
+
</body>
|
|
19
|
+
</html>
|
|
@@ -4,14 +4,14 @@ from typing import List
|
|
|
4
4
|
from helm.benchmark.augmentations.data_augmenter import DataAugmenterSpec
|
|
5
5
|
from helm.benchmark.augmentations.perturbation import PerturbationSpec
|
|
6
6
|
from helm.benchmark.data_preprocessor import DataPreprocessor
|
|
7
|
-
from helm.benchmark.run_specs import
|
|
7
|
+
from helm.benchmark.run_specs.simple_run_specs import get_simple1_spec
|
|
8
8
|
from helm.benchmark.scenarios.scenario import create_scenario, Instance, Scenario, with_instance_ids
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
def test_data_preprocessor():
|
|
12
12
|
# Test that each Instance is given a unique ID and is preserved through data augmentation
|
|
13
13
|
data_preprocessor = DataPreprocessor(DataAugmenterSpec())
|
|
14
|
-
scenario: Scenario = create_scenario(
|
|
14
|
+
scenario: Scenario = create_scenario(get_simple1_spec().scenario_spec)
|
|
15
15
|
instances = with_instance_ids(scenario.get_instances(output_path=""))
|
|
16
16
|
instances: List[Instance] = data_preprocessor.preprocess(instances)
|
|
17
17
|
for i, instance in enumerate(instances):
|
|
@@ -32,7 +32,7 @@ def test_data_preprocessor_with_data_augmentation():
|
|
|
32
32
|
should_include_original_eval=True,
|
|
33
33
|
)
|
|
34
34
|
data_preprocessor = DataPreprocessor(data_augmenter_spec)
|
|
35
|
-
scenario: Scenario = create_scenario(
|
|
35
|
+
scenario: Scenario = create_scenario(get_simple1_spec().scenario_spec)
|
|
36
36
|
instances = with_instance_ids(scenario.get_instances(output_path=""))
|
|
37
37
|
instances: List[Instance] = data_preprocessor.preprocess(instances)
|
|
38
38
|
assert len(instances) == 10 + 10 + 10 # original train + original eval + perturbed eval
|
|
@@ -2,7 +2,7 @@ import unittest
|
|
|
2
2
|
|
|
3
3
|
from helm.benchmark.adaptation.adapter_spec import AdapterSpec
|
|
4
4
|
from helm.benchmark.run_expander import IncreaseMaxTokensRunExpander
|
|
5
|
-
from helm.benchmark.
|
|
5
|
+
from helm.benchmark.run_spec import RunSpec
|
|
6
6
|
from helm.benchmark.scenarios.scenario import ScenarioSpec
|
|
7
7
|
|
|
8
8
|
|
|
@@ -1,48 +1,6 @@
|
|
|
1
|
-
from
|
|
2
|
-
from .window_service import INT_MAX
|
|
3
|
-
from .local_window_service import LocalWindowService
|
|
4
|
-
from .tokenizer_service import TokenizerService
|
|
1
|
+
from helm.benchmark.window_services.local_window_service import LocalWindowService
|
|
5
2
|
|
|
6
3
|
|
|
7
4
|
class DefaultWindowService(LocalWindowService):
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
service: TokenizerService,
|
|
11
|
-
tokenizer_name: str,
|
|
12
|
-
max_sequence_length: int,
|
|
13
|
-
max_request_length: Optional[int] = None,
|
|
14
|
-
max_sequence_and_generated_tokens_length: Optional[int] = None,
|
|
15
|
-
end_of_text_token: Optional[str] = None,
|
|
16
|
-
prefix_token: Optional[str] = None,
|
|
17
|
-
):
|
|
18
|
-
super().__init__(service)
|
|
19
|
-
self._tokenizer_name = tokenizer_name
|
|
20
|
-
self._max_sequence_length = max_sequence_length
|
|
21
|
-
self._max_request_length = max_request_length or max_sequence_length
|
|
22
|
-
self._max_sequence_and_generated_tokens_length = max_sequence_and_generated_tokens_length or INT_MAX
|
|
23
|
-
self._end_of_text_token = end_of_text_token or ""
|
|
24
|
-
self._prefix_token = prefix_token or ""
|
|
25
|
-
|
|
26
|
-
@property
|
|
27
|
-
def tokenizer_name(self) -> str:
|
|
28
|
-
return self._tokenizer_name
|
|
29
|
-
|
|
30
|
-
@property
|
|
31
|
-
def max_sequence_length(self) -> int:
|
|
32
|
-
return self._max_sequence_length
|
|
33
|
-
|
|
34
|
-
@property
|
|
35
|
-
def max_request_length(self) -> int:
|
|
36
|
-
return self._max_request_length
|
|
37
|
-
|
|
38
|
-
@property
|
|
39
|
-
def max_sequence_and_generated_tokens_length(self) -> int:
|
|
40
|
-
return self._max_sequence_and_generated_tokens_length
|
|
41
|
-
|
|
42
|
-
@property
|
|
43
|
-
def end_of_text_token(self) -> str:
|
|
44
|
-
return self._end_of_text_token
|
|
45
|
-
|
|
46
|
-
@property
|
|
47
|
-
def prefix_token(self) -> str:
|
|
48
|
-
return self._prefix_token
|
|
5
|
+
# TODO: Delete this WindowService.
|
|
6
|
+
pass
|
|
@@ -1,21 +1,10 @@
|
|
|
1
1
|
from abc import ABC
|
|
2
2
|
|
|
3
|
-
from helm.common.hierarchical_logger import
|
|
4
|
-
from .local_window_service import LocalWindowService
|
|
5
|
-
from .tokenizer_service import TokenizerService
|
|
3
|
+
from helm.common.hierarchical_logger import hwarn
|
|
4
|
+
from helm.benchmark.window_services.local_window_service import LocalWindowService
|
|
6
5
|
|
|
7
6
|
|
|
8
7
|
class EncoderDecoderWindowService(LocalWindowService, ABC):
|
|
9
|
-
def __init__(self, service: TokenizerService):
|
|
10
|
-
super().__init__(service)
|
|
11
|
-
|
|
12
|
-
@property
|
|
13
|
-
def max_request_length(self) -> int:
|
|
14
|
-
"""
|
|
15
|
-
Return the max request length. We set the max requests length to be `max_sequence_length`.
|
|
16
|
-
"""
|
|
17
|
-
return self.max_sequence_length
|
|
18
|
-
|
|
19
8
|
@property
|
|
20
9
|
def max_output_length(self) -> int:
|
|
21
10
|
"""
|
|
@@ -32,8 +21,8 @@ class EncoderDecoderWindowService(LocalWindowService, ABC):
|
|
|
32
21
|
vs. the completions, we check the two values separately.
|
|
33
22
|
"""
|
|
34
23
|
if expected_completion_token_length > self.max_output_length:
|
|
35
|
-
|
|
36
|
-
f"
|
|
24
|
+
hwarn(
|
|
25
|
+
f"The expected completion token length ({expected_completion_token_length}) "
|
|
37
26
|
f"exceeds the max output length ({self.max_output_length})."
|
|
38
27
|
)
|
|
39
28
|
return self.get_num_tokens(text) <= self.max_request_length
|
|
@@ -1,41 +1,7 @@
|
|
|
1
|
-
from .local_window_service import LocalWindowService
|
|
2
|
-
from .tokenizer_service import TokenizerService
|
|
1
|
+
from helm.benchmark.window_services.local_window_service import LocalWindowService
|
|
3
2
|
|
|
4
3
|
|
|
5
4
|
class ICEWindowService(LocalWindowService):
|
|
6
|
-
def __init__(self, service: TokenizerService):
|
|
7
|
-
super().__init__(service)
|
|
8
|
-
|
|
9
|
-
@property
|
|
10
|
-
def tokenizer_name(self) -> str:
|
|
11
|
-
return "TsinghuaKEG/ice"
|
|
12
|
-
|
|
13
|
-
@property
|
|
14
|
-
def max_sequence_length(self) -> int:
|
|
15
|
-
"""
|
|
16
|
-
The max length of the model input.
|
|
17
|
-
According to https://github.com/THUDM/GLM-130B, the max sequence length is 2048.
|
|
18
|
-
"""
|
|
19
|
-
return 2048
|
|
20
|
-
|
|
21
|
-
@property
|
|
22
|
-
def max_request_length(self) -> int:
|
|
23
|
-
return self.max_sequence_length + 1
|
|
24
|
-
|
|
25
|
-
@property
|
|
26
|
-
def end_of_text_token(self) -> str:
|
|
27
|
-
"""The end of text token."""
|
|
28
|
-
# Followed up in https://github.com/THUDM/icetk/issues/1
|
|
29
|
-
return "</s>"
|
|
30
|
-
|
|
31
|
-
@property
|
|
32
|
-
def prefix_token(self) -> str:
|
|
33
|
-
"""
|
|
34
|
-
The prefix token.
|
|
35
|
-
Inference with echo=True is not feasible, so just set it to the empty string.
|
|
36
|
-
"""
|
|
37
|
-
return ""
|
|
38
|
-
|
|
39
5
|
def truncate_from_right(self, text: str, expected_completion_token_length: int = 0) -> str:
|
|
40
6
|
"""
|
|
41
7
|
Truncates text from the right to fit within the context window given by `max_request_length`
|
|
File without changes
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from helm.benchmark.window_services.local_window_service import LocalWindowService
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class CLIPWindowService(LocalWindowService):
|
|
5
|
+
def truncate_from_right(self, text: str, expected_completion_token_length: int = 0) -> str:
|
|
6
|
+
result: str = self.decode(self.encode(text, truncation=True, max_length=self.max_request_length).tokens)
|
|
7
|
+
|
|
8
|
+
# HACK: For the vast majority of cases, the above logic works, but there are a few where the
|
|
9
|
+
# token count exceeds `max_length` by 1.
|
|
10
|
+
while not self.fits_within_context_window(result):
|
|
11
|
+
result = result[:-1]
|
|
12
|
+
|
|
13
|
+
return result
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
from helm.benchmark.window_services.image_generation.clip_window_service import CLIPWindowService
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class LexicaSearchWindowService(CLIPWindowService):
|
|
5
|
+
def fits_within_context_window(self, text: str, expected_completion_token_length: int = 0) -> bool:
|
|
6
|
+
return len(text) <= self.max_sequence_length
|
|
7
|
+
|
|
8
|
+
def truncate_from_right(self, text: str, expected_completion_token_length: int = 0) -> str:
|
|
9
|
+
return text[: self.max_sequence_length]
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
from helm.benchmark.window_services.image_generation.clip_window_service import CLIPWindowService
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class OpenAIDALLEWindowService(CLIPWindowService):
|
|
5
|
+
def fits_within_context_window(self, text: str, expected_completion_token_length: int = 0) -> bool:
|
|
6
|
+
return len(text) <= self.max_sequence_length
|
|
7
|
+
|
|
8
|
+
def truncate_from_right(self, text: str, expected_completion_token_length: int = 0) -> str:
|
|
9
|
+
return text[: self.max_sequence_length]
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import shutil
|
|
2
|
+
import tempfile
|
|
3
|
+
|
|
4
|
+
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
5
|
+
from helm.benchmark.window_services.test_utils import get_tokenizer_service
|
|
6
|
+
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
|
|
7
|
+
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class TestCLIPWindowService:
|
|
11
|
+
def setup_method(self):
|
|
12
|
+
self.path: str = tempfile.mkdtemp()
|
|
13
|
+
service: TokenizerService = get_tokenizer_service(self.path, BlackHoleCacheBackendConfig())
|
|
14
|
+
self.window_service = WindowServiceFactory.get_window_service("huggingface/dreamlike-photoreal-v2-0", service)
|
|
15
|
+
|
|
16
|
+
def teardown_method(self, method):
|
|
17
|
+
shutil.rmtree(self.path)
|
|
18
|
+
|
|
19
|
+
def test_truncate_from_right(self):
|
|
20
|
+
example_text: str = (
|
|
21
|
+
"an instqrumemnt used for cutting cloth , paper , axdz othr thdin mteroial , "
|
|
22
|
+
"consamistng of two blades lad one on tvopb of the other and fhastned in tle mixdqdjle "
|
|
23
|
+
"so as to bllow them txo be pened and closed by thumb and fitngesr inserted tgrough rings on"
|
|
24
|
+
)
|
|
25
|
+
assert not self.window_service.fits_within_context_window(example_text)
|
|
26
|
+
|
|
27
|
+
# Truncate and ensure it fits within the context window
|
|
28
|
+
truncated_prompt: str = self.window_service.truncate_from_right(example_text)
|
|
29
|
+
assert self.window_service.fits_within_context_window(truncated_prompt)
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import shutil
|
|
2
|
+
import tempfile
|
|
3
|
+
|
|
4
|
+
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
5
|
+
from helm.clients.image_generation.dalle2_client import DALLE2Client
|
|
6
|
+
from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
|
|
7
|
+
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
|
|
8
|
+
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class TestOpenAIDALLEWindowService:
|
|
12
|
+
def setup_method(self):
|
|
13
|
+
self.path: str = tempfile.mkdtemp()
|
|
14
|
+
service: TokenizerService = get_tokenizer_service(self.path, BlackHoleCacheBackendConfig())
|
|
15
|
+
self.window_service = WindowServiceFactory.get_window_service("openai/dall-e-2", service)
|
|
16
|
+
|
|
17
|
+
def teardown_method(self, method):
|
|
18
|
+
shutil.rmtree(self.path)
|
|
19
|
+
|
|
20
|
+
def test_fits_within_context_window(self):
|
|
21
|
+
assert self.window_service.fits_within_context_window(TEST_PROMPT)
|
|
22
|
+
|
|
23
|
+
def test_truncate_from_right(self):
|
|
24
|
+
long_prompt: str = TEST_PROMPT * 10
|
|
25
|
+
assert not self.window_service.fits_within_context_window(long_prompt)
|
|
26
|
+
|
|
27
|
+
# Truncate and ensure it fits within the context window
|
|
28
|
+
truncated_long_prompt: str = self.window_service.truncate_from_right(long_prompt)
|
|
29
|
+
assert len(truncated_long_prompt) == DALLE2Client.MAX_PROMPT_LENGTH
|
|
30
|
+
assert self.window_service.fits_within_context_window(truncated_long_prompt)
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
from abc import ABC
|
|
2
2
|
from typing import List, Optional, cast
|
|
3
3
|
|
|
4
|
-
from .window_service import
|
|
5
|
-
from .tokenizer_service import TokenizerService
|
|
4
|
+
from helm.benchmark.window_services.window_service import ConfigurableWindowService, EncodeResult
|
|
5
|
+
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
6
6
|
from helm.common.tokenization_request import (
|
|
7
7
|
DecodeRequest,
|
|
8
8
|
DecodeRequestResult,
|
|
@@ -10,11 +10,28 @@ from helm.common.tokenization_request import (
|
|
|
10
10
|
TokenizationRequestResult,
|
|
11
11
|
TokenizationToken,
|
|
12
12
|
)
|
|
13
|
-
from helm.
|
|
13
|
+
from helm.clients.client import cleanup_tokens
|
|
14
14
|
|
|
15
15
|
|
|
16
|
-
class LocalWindowService(
|
|
17
|
-
def __init__(
|
|
16
|
+
class LocalWindowService(ConfigurableWindowService, ABC):
|
|
17
|
+
def __init__(
|
|
18
|
+
self,
|
|
19
|
+
service: TokenizerService,
|
|
20
|
+
tokenizer_name: str,
|
|
21
|
+
max_sequence_length: int,
|
|
22
|
+
max_request_length: Optional[int] = None,
|
|
23
|
+
max_sequence_and_generated_tokens_length: Optional[int] = None,
|
|
24
|
+
end_of_text_token: Optional[str] = None,
|
|
25
|
+
prefix_token: Optional[str] = None,
|
|
26
|
+
):
|
|
27
|
+
super().__init__(
|
|
28
|
+
tokenizer_name=tokenizer_name,
|
|
29
|
+
max_sequence_length=max_sequence_length,
|
|
30
|
+
max_request_length=max_request_length,
|
|
31
|
+
max_sequence_and_generated_tokens_length=max_sequence_and_generated_tokens_length,
|
|
32
|
+
end_of_text_token=end_of_text_token,
|
|
33
|
+
prefix_token=prefix_token,
|
|
34
|
+
)
|
|
18
35
|
self.service: TokenizerService = service
|
|
19
36
|
|
|
20
37
|
def encode(self, text: str, truncation: bool = False, max_length: Optional[int] = None) -> EncodeResult:
|
|
@@ -2,9 +2,10 @@ import shutil
|
|
|
2
2
|
import tempfile
|
|
3
3
|
from typing import List
|
|
4
4
|
|
|
5
|
-
from .
|
|
6
|
-
from .
|
|
7
|
-
from .
|
|
5
|
+
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
|
|
6
|
+
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
7
|
+
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
|
|
8
|
+
from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
|
|
8
9
|
|
|
9
10
|
|
|
10
11
|
class TestAnthropicWindowService:
|
|
@@ -120,7 +121,7 @@ class TestAnthropicWindowService:
|
|
|
120
121
|
|
|
121
122
|
def setup_method(self):
|
|
122
123
|
self.path: str = tempfile.mkdtemp()
|
|
123
|
-
service: TokenizerService = get_tokenizer_service(self.path)
|
|
124
|
+
service: TokenizerService = get_tokenizer_service(self.path, BlackHoleCacheBackendConfig())
|
|
124
125
|
self.window_service = WindowServiceFactory.get_window_service("anthropic/claude-v1.3", service)
|
|
125
126
|
|
|
126
127
|
def teardown_method(self, method):
|
|
@@ -2,9 +2,10 @@ import shutil
|
|
|
2
2
|
import tempfile
|
|
3
3
|
from typing import List
|
|
4
4
|
|
|
5
|
-
from .
|
|
6
|
-
from .
|
|
7
|
-
from .
|
|
5
|
+
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
|
|
6
|
+
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
7
|
+
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
|
|
8
|
+
from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
|
|
8
9
|
|
|
9
10
|
|
|
10
11
|
class TestBloomWindowService:
|
|
@@ -64,7 +65,7 @@ class TestBloomWindowService:
|
|
|
64
65
|
|
|
65
66
|
def setup_method(self):
|
|
66
67
|
self.path: str = tempfile.mkdtemp()
|
|
67
|
-
service: TokenizerService = get_tokenizer_service(self.path)
|
|
68
|
+
service: TokenizerService = get_tokenizer_service(self.path, BlackHoleCacheBackendConfig())
|
|
68
69
|
self.window_service = WindowServiceFactory.get_window_service("together/bloom", service)
|
|
69
70
|
|
|
70
71
|
def teardown_method(self, method):
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import tempfile
|
|
2
2
|
|
|
3
|
+
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
|
|
3
4
|
from helm.benchmark.window_services.test_t511b_window_service import TestT511bWindowService
|
|
4
5
|
from helm.benchmark.window_services.window_service_factory import TokenizerService, WindowServiceFactory
|
|
5
6
|
from helm.benchmark.window_services.test_utils import get_tokenizer_service
|
|
@@ -8,5 +9,5 @@ from helm.benchmark.window_services.test_utils import get_tokenizer_service
|
|
|
8
9
|
class TestFlanT5WindowService(TestT511bWindowService):
|
|
9
10
|
def setup_method(self):
|
|
10
11
|
self.path: str = tempfile.mkdtemp()
|
|
11
|
-
service: TokenizerService = get_tokenizer_service(self.path)
|
|
12
|
+
service: TokenizerService = get_tokenizer_service(self.path, BlackHoleCacheBackendConfig())
|
|
12
13
|
self.window_service = WindowServiceFactory.get_window_service("together/flan-t5-xxl", service)
|
|
@@ -2,15 +2,20 @@ import shutil
|
|
|
2
2
|
import tempfile
|
|
3
3
|
|
|
4
4
|
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
5
|
-
|
|
6
|
-
from .test_utils import
|
|
7
|
-
|
|
5
|
+
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
|
|
6
|
+
from helm.benchmark.window_services.test_utils import (
|
|
7
|
+
get_tokenizer_service,
|
|
8
|
+
TEST_PROMPT,
|
|
9
|
+
GPT2_TEST_TOKENS,
|
|
10
|
+
GPT2_TEST_TOKEN_IDS,
|
|
11
|
+
)
|
|
12
|
+
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
|
|
8
13
|
|
|
9
14
|
|
|
10
15
|
class TestGPT2WindowService:
|
|
11
16
|
def setup_method(self):
|
|
12
17
|
self.path: str = tempfile.mkdtemp()
|
|
13
|
-
service: TokenizerService = get_tokenizer_service(self.path)
|
|
18
|
+
service: TokenizerService = get_tokenizer_service(self.path, BlackHoleCacheBackendConfig())
|
|
14
19
|
self.window_service = WindowServiceFactory.get_window_service("huggingface/gpt2", service)
|
|
15
20
|
|
|
16
21
|
def teardown_method(self, method):
|
|
@@ -1,15 +1,21 @@
|
|
|
1
1
|
import shutil
|
|
2
2
|
import tempfile
|
|
3
3
|
|
|
4
|
-
from .
|
|
5
|
-
from .
|
|
6
|
-
|
|
4
|
+
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
|
|
5
|
+
from helm.benchmark.window_services.test_utils import (
|
|
6
|
+
get_tokenizer_service,
|
|
7
|
+
TEST_PROMPT,
|
|
8
|
+
GPT4_TEST_TOKEN_IDS,
|
|
9
|
+
GPT4_TEST_TOKENS,
|
|
10
|
+
)
|
|
11
|
+
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
12
|
+
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
|
|
7
13
|
|
|
8
14
|
|
|
9
15
|
class TestOpenAIWindowService:
|
|
10
16
|
def setup_method(self):
|
|
11
17
|
self.path: str = tempfile.mkdtemp()
|
|
12
|
-
service: TokenizerService = get_tokenizer_service(self.path)
|
|
18
|
+
service: TokenizerService = get_tokenizer_service(self.path, BlackHoleCacheBackendConfig())
|
|
13
19
|
self.window_service = WindowServiceFactory.get_window_service("openai/gpt-3.5-turbo-0301", service)
|
|
14
20
|
|
|
15
21
|
def teardown_method(self, method):
|
|
@@ -1,16 +1,22 @@
|
|
|
1
1
|
import shutil
|
|
2
2
|
import tempfile
|
|
3
3
|
|
|
4
|
-
from .
|
|
5
|
-
from .
|
|
6
|
-
from .
|
|
4
|
+
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
|
|
5
|
+
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
6
|
+
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
|
|
7
|
+
from helm.benchmark.window_services.test_utils import (
|
|
8
|
+
get_tokenizer_service,
|
|
9
|
+
GPT2_TEST_TOKENS,
|
|
10
|
+
GPT2_TEST_TOKEN_IDS,
|
|
11
|
+
TEST_PROMPT,
|
|
12
|
+
)
|
|
7
13
|
|
|
8
14
|
|
|
9
15
|
class TestGPTJWindowService:
|
|
10
16
|
def setup_method(self):
|
|
11
17
|
self.path: str = tempfile.mkdtemp()
|
|
12
|
-
service: TokenizerService = get_tokenizer_service(self.path)
|
|
13
|
-
self.window_service = WindowServiceFactory.get_window_service("
|
|
18
|
+
service: TokenizerService = get_tokenizer_service(self.path, BlackHoleCacheBackendConfig())
|
|
19
|
+
self.window_service = WindowServiceFactory.get_window_service("huggingface/gpt-j-6b", service)
|
|
14
20
|
|
|
15
21
|
def teardown_method(self, method):
|
|
16
22
|
shutil.rmtree(self.path)
|
|
@@ -2,9 +2,10 @@ import shutil
|
|
|
2
2
|
import tempfile
|
|
3
3
|
from typing import List
|
|
4
4
|
|
|
5
|
-
from .
|
|
6
|
-
from .
|
|
7
|
-
from .
|
|
5
|
+
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
|
|
6
|
+
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
7
|
+
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
|
|
8
|
+
from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
|
|
8
9
|
|
|
9
10
|
|
|
10
11
|
class TestGPTNeoXWindowService:
|
|
@@ -65,8 +66,8 @@ class TestGPTNeoXWindowService:
|
|
|
65
66
|
|
|
66
67
|
def setup_method(self):
|
|
67
68
|
self.path: str = tempfile.mkdtemp()
|
|
68
|
-
service: TokenizerService = get_tokenizer_service(self.path)
|
|
69
|
-
self.window_service = WindowServiceFactory.get_window_service("
|
|
69
|
+
service: TokenizerService = get_tokenizer_service(self.path, BlackHoleCacheBackendConfig())
|
|
70
|
+
self.window_service = WindowServiceFactory.get_window_service("huggingface/gpt-neox-20b", service)
|
|
70
71
|
|
|
71
72
|
def teardown_method(self, method):
|
|
72
73
|
shutil.rmtree(self.path)
|
|
@@ -1,22 +1,28 @@
|
|
|
1
1
|
import shutil
|
|
2
2
|
import tempfile
|
|
3
3
|
|
|
4
|
-
from .
|
|
5
|
-
from .
|
|
6
|
-
|
|
4
|
+
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
|
|
5
|
+
from helm.benchmark.window_services.test_utils import (
|
|
6
|
+
get_tokenizer_service,
|
|
7
|
+
TEST_PROMPT,
|
|
8
|
+
GPT2_TEST_TOKENS,
|
|
9
|
+
GPT2_TEST_TOKEN_IDS,
|
|
10
|
+
)
|
|
11
|
+
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
12
|
+
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
|
|
7
13
|
|
|
8
14
|
|
|
9
15
|
class TestOpenAIWindowService:
|
|
10
16
|
def setup_method(self):
|
|
11
17
|
self.path: str = tempfile.mkdtemp()
|
|
12
|
-
service: TokenizerService = get_tokenizer_service(self.path)
|
|
13
|
-
self.window_service = WindowServiceFactory.get_window_service("
|
|
18
|
+
service: TokenizerService = get_tokenizer_service(self.path, BlackHoleCacheBackendConfig())
|
|
19
|
+
self.window_service = WindowServiceFactory.get_window_service("huggingface/gpt2", service)
|
|
14
20
|
|
|
15
21
|
def teardown_method(self, method):
|
|
16
22
|
shutil.rmtree(self.path)
|
|
17
23
|
|
|
18
24
|
def test_max_request_length(self):
|
|
19
|
-
assert self.window_service.max_request_length ==
|
|
25
|
+
assert self.window_service.max_request_length == 1025
|
|
20
26
|
|
|
21
27
|
def test_encode(self):
|
|
22
28
|
assert self.window_service.encode(TEST_PROMPT).token_values == GPT2_TEST_TOKEN_IDS
|
|
@@ -29,19 +35,19 @@ class TestOpenAIWindowService:
|
|
|
29
35
|
|
|
30
36
|
def test_fits_within_context_window(self):
|
|
31
37
|
# Should fit in the context window since we subtracted the number of tokens of the test prompt
|
|
32
|
-
# from the max request length of
|
|
33
|
-
assert self.window_service.fits_within_context_window(TEST_PROMPT,
|
|
38
|
+
# from the max request length of 1025
|
|
39
|
+
assert self.window_service.fits_within_context_window(TEST_PROMPT, 1025 - 51)
|
|
34
40
|
# Should not fit within the max request length because we're expecting one more extra token in the completion
|
|
35
|
-
assert not self.window_service.fits_within_context_window(TEST_PROMPT,
|
|
41
|
+
assert not self.window_service.fits_within_context_window(TEST_PROMPT, 1025 - 51 + 1)
|
|
36
42
|
|
|
37
43
|
def test_truncate_from_right(self):
|
|
38
|
-
# Create a prompt that exceed max context length: 51 *
|
|
39
|
-
long_prompt: str = TEST_PROMPT *
|
|
44
|
+
# Create a prompt that exceed max context length: 51 * 21 = 1071 tokens
|
|
45
|
+
long_prompt: str = TEST_PROMPT * 21
|
|
40
46
|
assert not self.window_service.fits_within_context_window(long_prompt)
|
|
41
47
|
|
|
42
48
|
# Truncate and ensure it fits within the context window
|
|
43
49
|
truncated_long_prompt: str = self.window_service.truncate_from_right(long_prompt)
|
|
44
|
-
assert self.window_service.get_num_tokens(truncated_long_prompt) ==
|
|
50
|
+
assert self.window_service.get_num_tokens(truncated_long_prompt) == 1025
|
|
45
51
|
assert self.window_service.fits_within_context_window(truncated_long_prompt)
|
|
46
52
|
|
|
47
53
|
def test_tokenize_and_count(self):
|
|
@@ -1,16 +1,17 @@
|
|
|
1
1
|
import shutil
|
|
2
2
|
import tempfile
|
|
3
3
|
|
|
4
|
-
from .
|
|
5
|
-
from .
|
|
6
|
-
from .
|
|
4
|
+
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
|
|
5
|
+
from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
|
|
6
|
+
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
7
|
+
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
|
|
7
8
|
|
|
8
9
|
|
|
9
10
|
class TestOPTWindowService:
|
|
10
11
|
def setup_method(self):
|
|
11
12
|
self.path: str = tempfile.mkdtemp()
|
|
12
|
-
service: TokenizerService = get_tokenizer_service(self.path)
|
|
13
|
-
self.window_service = WindowServiceFactory.get_window_service("
|
|
13
|
+
service: TokenizerService = get_tokenizer_service(self.path, BlackHoleCacheBackendConfig())
|
|
14
|
+
self.window_service = WindowServiceFactory.get_window_service("huggingface/opt-175b", service)
|
|
14
15
|
|
|
15
16
|
def teardown_method(self, method):
|
|
16
17
|
shutil.rmtree(self.path)
|
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
from tempfile import TemporaryDirectory
|
|
2
2
|
from typing import List
|
|
3
3
|
|
|
4
|
-
from .
|
|
5
|
-
from .
|
|
6
|
-
from .
|
|
4
|
+
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
|
|
5
|
+
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
6
|
+
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
|
|
7
|
+
from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
|
|
7
8
|
|
|
8
9
|
|
|
9
10
|
class TestPalmyraWindowService:
|
|
@@ -117,7 +118,7 @@ class TestPalmyraWindowService:
|
|
|
117
118
|
|
|
118
119
|
def setup_method(self):
|
|
119
120
|
self.temporary_directory = TemporaryDirectory()
|
|
120
|
-
service: TokenizerService = get_tokenizer_service(self.temporary_directory.name)
|
|
121
|
+
service: TokenizerService = get_tokenizer_service(self.temporary_directory.name, BlackHoleCacheBackendConfig())
|
|
121
122
|
self.window_service = WindowServiceFactory.get_window_service("writer/palmyra-large", service)
|
|
122
123
|
|
|
123
124
|
def teardown_method(self, method):
|
|
@@ -2,9 +2,10 @@ import shutil
|
|
|
2
2
|
import tempfile
|
|
3
3
|
from typing import List
|
|
4
4
|
|
|
5
|
-
from .
|
|
6
|
-
from .
|
|
7
|
-
from .
|
|
5
|
+
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
|
|
6
|
+
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
7
|
+
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
|
|
8
|
+
from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
|
|
8
9
|
|
|
9
10
|
|
|
10
11
|
class TestT0ppWindowService:
|
|
@@ -70,7 +71,7 @@ class TestT0ppWindowService:
|
|
|
70
71
|
|
|
71
72
|
def setup_method(self):
|
|
72
73
|
self.path: str = tempfile.mkdtemp()
|
|
73
|
-
service: TokenizerService = get_tokenizer_service(self.path)
|
|
74
|
+
service: TokenizerService = get_tokenizer_service(self.path, BlackHoleCacheBackendConfig())
|
|
74
75
|
self.window_service = WindowServiceFactory.get_window_service("together/t0pp", service)
|
|
75
76
|
|
|
76
77
|
def teardown_method(self, method):
|