crfm-helm 0.4.0__py3-none-any.whl → 0.5.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic. Click here for more details.
- crfm_helm-0.5.10.dist-info/METADATA +369 -0
- crfm_helm-0.5.10.dist-info/RECORD +1008 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +80 -29
- helm/benchmark/adaptation/adapters/adapter.py +2 -2
- helm/benchmark/adaptation/adapters/adapter_factory.py +39 -28
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +24 -8
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +3 -4
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +112 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +6 -3
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +3 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +18 -8
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +5 -4
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +46 -22
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +17 -29
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +138 -16
- helm/benchmark/adaptation/common_adapter_specs.py +443 -0
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/aci_bench_annotator.py +84 -0
- helm/benchmark/annotation/air_bench_annotator.py +79 -0
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/annotator.py +48 -0
- helm/benchmark/annotation/annotator_factory.py +50 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +258 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +82 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +96 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +55 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +93 -0
- helm/benchmark/annotation/image2struct/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2struct/lilypond_compiler_annotator.py +86 -0
- helm/benchmark/annotation/image2struct/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/live_qa_annotator.py +76 -0
- helm/benchmark/annotation/med_dialog_annotator.py +88 -0
- helm/benchmark/annotation/medalign_annotator.py +89 -0
- helm/benchmark/annotation/medi_qa_annotator.py +87 -0
- helm/benchmark/annotation/medication_qa_annotator.py +86 -0
- helm/benchmark/annotation/mental_health_annotator.py +87 -0
- helm/benchmark/annotation/mimic_bhc_annotator.py +89 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +89 -0
- helm/benchmark/annotation/model_as_judge.py +309 -0
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +87 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +90 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +131 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +87 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +100 -0
- helm/benchmark/annotation_executor.py +144 -0
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +3 -3
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +26 -4
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +56 -19
- helm/benchmark/augmentations/translate_perturbation.py +31 -0
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/config_registry.py +7 -1
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/executor.py +54 -25
- helm/benchmark/huggingface_registration.py +28 -10
- helm/benchmark/metrics/air_bench_metrics.py +3212 -0
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/basic_metrics.py +437 -667
- helm/benchmark/metrics/bbq_metrics.py +17 -6
- helm/benchmark/metrics/bias_metrics.py +18 -9
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/classification_metrics.py +107 -22
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +12 -11
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/code_metrics_helper.py +11 -3
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +174 -0
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +83 -0
- helm/benchmark/metrics/copyright_metrics.py +5 -5
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +8 -114
- helm/benchmark/metrics/dry_run_metrics.py +35 -6
- helm/benchmark/metrics/efficiency_metrics.py +287 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +159 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +831 -0
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +115 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +48 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +67 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +84 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +20 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +48 -5
- helm/benchmark/metrics/kpi_edgar_metrics.py +142 -0
- helm/benchmark/metrics/language_modeling_metrics.py +111 -0
- helm/benchmark/metrics/live_qa_metrics.py +35 -0
- helm/benchmark/metrics/llm_jury_metrics.py +58 -0
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/machine_translation_metrics.py +89 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +137 -0
- helm/benchmark/metrics/medec_metrics.py +124 -0
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/metric.py +121 -175
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +23 -7
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +127 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/omni_math_metrics.py +44 -0
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +5 -6
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/ranking_metrics.py +5 -5
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/safety_metrics.py +91 -0
- helm/benchmark/metrics/seahelm_metrics.py +201 -0
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +8 -11
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +150 -11
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +145 -70
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +20 -1
- helm/benchmark/metrics/test_metric.py +3 -3
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +11 -3
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +3 -3
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +7 -7
- helm/benchmark/metrics/toxicity_metrics.py +37 -7
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/unitxt_metrics.py +107 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +347 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +537 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/metrics/wildbench_metrics.py +54 -0
- helm/benchmark/model_deployment_registry.py +69 -5
- helm/benchmark/model_metadata_registry.py +58 -2
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +51 -20
- helm/benchmark/presentation/run_display.py +51 -12
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +83 -66
- helm/benchmark/presentation/summarize.py +483 -388
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/presentation/test_run_entry.py +2 -2
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/presentation/test_summarize.py +148 -6
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +202 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +151 -87
- helm/benchmark/run_expander.py +418 -33
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +180 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +58 -0
- helm/benchmark/run_specs/arabic_run_specs.py +197 -0
- helm/benchmark/run_specs/audio_run_specs.py +657 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/call_center_run_specs.py +201 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1393 -0
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +316 -0
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
- helm/benchmark/run_specs/experimental_run_specs.py +224 -0
- helm/benchmark/run_specs/finance_run_specs.py +114 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +625 -0
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +188 -0
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1570 -0
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +191 -0
- helm/benchmark/run_specs/seahelm_run_specs.py +652 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +167 -0
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +51 -0
- helm/benchmark/run_specs/vlm_run_specs.py +1057 -0
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/runner.py +63 -62
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +149 -0
- helm/benchmark/scenarios/air_bench_scenario.py +76 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +27 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +82 -0
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +79 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +16 -1
- helm/benchmark/scenarios/banking77_scenario.py +77 -0
- helm/benchmark/scenarios/bbq_scenario.py +17 -2
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +112 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +18 -3
- helm/benchmark/scenarios/boolq_scenario.py +21 -1
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +129 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +14 -1
- helm/benchmark/scenarios/clear_scenario.py +180 -0
- helm/benchmark/scenarios/cleva_scenario.py +482 -3
- helm/benchmark/scenarios/code_scenario.py +46 -4
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +33 -1
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +118 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +261 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +148 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +190 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +143 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +98 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +344 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +217 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +571 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +80 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +90 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -3
- helm/benchmark/scenarios/dischargeme_scenario.py +196 -0
- helm/benchmark/scenarios/disinformation_scenario.py +32 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +25 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1541 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +77 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +33 -3
- helm/benchmark/scenarios/entity_matching_scenario.py +26 -2
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +139 -0
- helm/benchmark/scenarios/financebench_scenario.py +74 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +115 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +145 -0
- helm/benchmark/scenarios/gpqa_scenario.py +98 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/grammar_scenario.py +21 -2
- helm/benchmark/scenarios/gsm_scenario.py +31 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +61 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +70 -0
- helm/benchmark/scenarios/headqa_scenario.py +158 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/ice_scenario.py +28 -4
- helm/benchmark/scenarios/ifeval_scenario.py +71 -0
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +26 -3
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +98 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +172 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +149 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +61 -1
- helm/benchmark/scenarios/legal_support_scenario.py +24 -1
- helm/benchmark/scenarios/legalbench_scenario.py +45 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +23 -2
- helm/benchmark/scenarios/lextreme_scenario.py +22 -1
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +15 -1
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +81 -22
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +56 -22
- helm/benchmark/scenarios/med_mcqa_scenario.py +24 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +30 -1
- helm/benchmark/scenarios/medalign_scenario.py +117 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +326 -0
- helm/benchmark/scenarios/medbullets_scenario.py +167 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +149 -0
- helm/benchmark/scenarios/medec_scenario.py +148 -0
- helm/benchmark/scenarios/medhallu_scenario.py +95 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +134 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +96 -0
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +146 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +127 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +121 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +99 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +113 -0
- helm/benchmark/scenarios/mmlu_scenario.py +32 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +31 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +166 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +164 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +297 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +33 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +71 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +33 -2
- helm/benchmark/scenarios/openai_mrcr_scenario.py +94 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -5
- helm/benchmark/scenarios/pubmed_qa_scenario.py +81 -43
- helm/benchmark/scenarios/quac_scenario.py +24 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +175 -0
- helm/benchmark/scenarios/raft_scenario.py +33 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +128 -0
- helm/benchmark/scenarios/scenario.py +44 -1
- helm/benchmark/scenarios/seahelm_scenario.py +2295 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +97 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +95 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +99 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +98 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +94 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +100 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +98 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +104 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +98 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +44 -0
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +109 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +119 -0
- helm/benchmark/scenarios/summarization_scenario.py +48 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +24 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +29 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +4 -3
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +239 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +26 -2
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/unitxt_scenario.py +62 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +4 -2
- helm/benchmark/scenarios/vicuna_scenario.py +22 -2
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +135 -0
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2struct/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2struct/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2struct/image2struct_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/image2struct/latex_scenario.py +21 -0
- helm/benchmark/scenarios/vision_language/image2struct/musicsheet_scenario.py +16 -0
- helm/benchmark/scenarios/vision_language/image2struct/utils_latex.py +339 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage_scenario.py +256 -0
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +148 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +105 -0
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +131 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +4 -5
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +8 -4
- helm/benchmark/scenarios/wikifact_scenario.py +31 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +101 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +33 -2
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +32 -2
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +78 -50
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_audio.yaml +763 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +269 -0
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_classic.yaml +259 -1140
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +319 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +191 -0
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_image2struct.yaml +588 -0
- helm/benchmark/static/schema_instruction_following.yaml +161 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/schema_lite.yaml +3 -286
- helm/benchmark/static/schema_long_context.yaml +282 -0
- helm/benchmark/static/schema_medhelm.yaml +1176 -0
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_mmlu.yaml +1449 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +283 -0
- helm/benchmark/static/schema_seahelm.yaml +723 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/schema_thai.yaml +244 -0
- helm/benchmark/static/schema_torr.yaml +474 -0
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_unitxt.yaml +370 -0
- helm/benchmark/static/schema_vhelm.yaml +933 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/air-overview-DpBbyagA.png +0 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-BJtQlEbV.png +0 -0
- helm/benchmark/static_build/assets/helm-safety-COfndXuS.png +0 -0
- helm/benchmark/static_build/assets/helmhero-D9TvmJsp.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/medhelm-overview-CND0EIsy.png +0 -0
- helm/benchmark/static_build/assets/medhelm-v1-overview-Cu2tphBB.png +0 -0
- helm/benchmark/static_build/assets/overview-BwypNWnk.png +0 -0
- helm/benchmark/static_build/assets/process-flow-DWDJC733.png +0 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/assets/vhelm-aspects-NiDQofvP.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-NxJE4fdA.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-ypCL5Yvq.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +19 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/window_services/default_window_service.py +3 -45
- helm/benchmark/window_services/encoder_decoder_window_service.py +4 -15
- helm/benchmark/window_services/ice_window_service.py +1 -35
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +13 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +22 -5
- helm/benchmark/window_services/test_anthropic_window_service.py +5 -4
- helm/benchmark/window_services/test_bloom_window_service.py +5 -4
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +9 -4
- helm/benchmark/window_services/test_gpt4_window_service.py +10 -4
- helm/benchmark/window_services/test_gptj_window_service.py +11 -5
- helm/benchmark/window_services/test_gptneox_window_service.py +6 -5
- helm/benchmark/window_services/test_openai_window_service.py +18 -12
- helm/benchmark/window_services/test_opt_window_service.py +6 -5
- helm/benchmark/window_services/test_palmyra_window_service.py +5 -4
- helm/benchmark/window_services/test_t0pp_window_service.py +5 -4
- helm/benchmark/window_services/test_t511b_window_service.py +5 -4
- helm/benchmark/window_services/test_ul2_window_service.py +5 -4
- helm/benchmark/window_services/test_utils.py +6 -6
- helm/benchmark/window_services/test_yalm_window_service.py +5 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -13
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +4 -1
- helm/benchmark/window_services/yalm_window_service.py +1 -28
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +78 -12
- helm/clients/aleph_alpha_client.py +114 -0
- helm/{proxy/clients → clients}/anthropic_client.py +304 -21
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +122 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +199 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +210 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +191 -0
- helm/clients/audio_language/qwen_audiolm_client.py +153 -0
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/audio_language/test.py +62 -0
- helm/{proxy/clients → clients}/auto_client.py +72 -31
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +381 -0
- helm/clients/bedrock_utils.py +105 -0
- helm/{proxy/clients → clients}/client.py +92 -17
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +105 -14
- helm/clients/dspy_client.py +135 -0
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +8 -6
- helm/clients/google_translate_client.py +35 -0
- helm/clients/grok_client.py +36 -0
- helm/{proxy/clients → clients}/http_model_client.py +8 -8
- helm/{proxy/clients → clients}/huggingface_client.py +157 -86
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/ibm_client.py +269 -0
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +80 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +100 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +99 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +144 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +192 -0
- helm/clients/image_generation/dalle2_client.py +194 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +191 -0
- helm/clients/image_generation/deep_floyd_client.py +80 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +250 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +88 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +116 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +113 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +6 -6
- helm/{proxy/clients → clients}/megatron_client.py +7 -5
- helm/clients/mistral_client.py +180 -0
- helm/clients/moderation_api_client.py +111 -0
- helm/clients/nvidia_nim_client.py +32 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +604 -0
- helm/clients/openai_responses_client.py +200 -0
- helm/clients/openrouter_client.py +31 -0
- helm/{proxy/clients → clients}/palmyra_client.py +31 -14
- helm/{proxy/clients → clients}/perspective_api_client.py +18 -14
- helm/clients/reka_client.py +190 -0
- helm/clients/simple_client.py +64 -0
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +95 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/{proxy/clients → clients}/test_auto_client.py +13 -15
- helm/clients/test_client.py +98 -0
- helm/{proxy/clients → clients}/test_huggingface_client.py +31 -16
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/test_simple_client.py +19 -0
- helm/clients/test_together_client.py +184 -0
- helm/clients/together_client.py +599 -0
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +488 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +148 -0
- helm/clients/vision_language/huggingface_vlm_client.py +114 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +61 -51
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/paligemma_client.py +147 -0
- helm/clients/vision_language/palmyra_vision_client.py +101 -0
- helm/clients/vision_language/qwen2_vlm_client.py +189 -0
- helm/clients/vision_language/qwen_vlm_client.py +174 -0
- helm/clients/vllm_client.py +80 -0
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +105 -0
- helm/clients/yi_client.py +28 -0
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +23 -33
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/critique_request.py +10 -2
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +10 -3
- helm/common/hierarchical_logger.py +124 -12
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +60 -5
- helm/common/key_value_store.py +41 -10
- helm/common/local_context.py +140 -0
- helm/common/media_object.py +14 -1
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +8 -7
- helm/common/multimodal_request_utils.py +57 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/object_spec.py +23 -8
- helm/common/optional_dependencies.py +1 -1
- helm/common/reeval_parameters.py +12 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +45 -19
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_general.py +10 -0
- helm/common/test_logging.py +94 -0
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +1 -10
- helm/config/model_deployments.yaml +4713 -1005
- helm/config/model_metadata.yaml +4045 -255
- helm/config/tokenizer_configs.yaml +1091 -50
- helm/proxy/accounts.py +31 -4
- helm/proxy/cli.py +6 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/critique/model_critique_client.py +40 -10
- helm/proxy/example_queries.py +33 -28
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +82 -18
- helm/proxy/services/remote_service.py +32 -7
- helm/proxy/services/server_service.py +71 -69
- helm/proxy/services/service.py +30 -6
- helm/proxy/services/test_remote_service.py +6 -5
- helm/proxy/services/test_service.py +1 -13
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +61 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +462 -0
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/tokenizers/__init__.py +0 -0
- helm/tokenizers/ai21_tokenizer.py +52 -0
- helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +9 -12
- helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +2 -30
- helm/tokenizers/cohere_tokenizer.py +50 -0
- helm/tokenizers/grok_tokenizer.py +55 -0
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +4 -4
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +44 -41
- helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +9 -2
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/tokenizer.py +3 -1
- helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +8 -6
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.4.0.dist-info/METADATA +0 -264
- crfm_helm-0.4.0.dist-info/RECORD +0 -397
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/run_specs.py +0 -2762
- helm/benchmark/scenarios/numeracy_scenario.py +0 -784
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/test_model_deployment_definition.py +0 -92
- helm/benchmark/test_model_properties.py +0 -1570
- helm/benchmark/vlm_run_specs.py +0 -97
- helm/benchmark/window_services/ai21_window_service.py +0 -258
- helm/benchmark/window_services/cohere_window_service.py +0 -163
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/huggingface_window_service.py +0 -60
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -74
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -326
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/common/cache_utils.py +0 -14
- helm/proxy/clients/aleph_alpha_client.py +0 -95
- helm/proxy/clients/goose_ai_client.py +0 -99
- helm/proxy/clients/microsoft_client.py +0 -180
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/simple_client.py +0 -60
- helm/proxy/clients/test_client.py +0 -49
- helm/proxy/clients/test_together_client.py +0 -97
- helm/proxy/clients/together_client.py +0 -334
- helm/proxy/clients/vertexai_client.py +0 -115
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
- helm/proxy/token_counters/test_openai_token_counter.py +0 -81
- helm/proxy/tokenizers/ai21_tokenizer.py +0 -60
- helm/proxy/tokenizers/anthropic_tokenizer.py +0 -52
- helm/proxy/tokenizers/cohere_tokenizer.py +0 -83
- helm/proxy/tokenizers/ice_tokenizer.py +0 -30
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- helm/proxy/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → annotation}/__init__.py +0 -0
- /helm/{proxy/clients → benchmark/annotation/image2struct}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/metrics/ifeval}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/benchmark/{static/images/crfm-logo.png → static_build/assets/crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/{static/images/helm-logo-simple.png → static_build/assets/helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{benchmark → proxy}/static/general.js +0 -0
- /helm/{benchmark → proxy}/static/info-icon.png +0 -0
|
@@ -9,8 +9,9 @@ from typing import Dict, List, Optional, Set
|
|
|
9
9
|
|
|
10
10
|
from helm.benchmark.scenarios.scenario import Input, Instance, Reference, Output
|
|
11
11
|
from helm.common.general import ensure_file_downloaded, ensure_directory_exists, match_case
|
|
12
|
-
from .perturbation_description import PerturbationDescription
|
|
13
|
-
from .perturbation import Perturbation
|
|
12
|
+
from helm.benchmark.augmentations.perturbation_description import PerturbationDescription
|
|
13
|
+
from helm.benchmark.augmentations.perturbation import Perturbation
|
|
14
|
+
from helm.benchmark.runner import get_benchmark_output_path
|
|
14
15
|
|
|
15
16
|
|
|
16
17
|
# Pull this out so serialization works for multiprocessing
|
|
@@ -35,7 +36,7 @@ class PersonNamePerturbation(Perturbation):
|
|
|
35
36
|
"https://storage.googleapis.com/crfm-helm-public/source_datasets/"
|
|
36
37
|
"augmentations/person_name_perturbation/person_names.txt"
|
|
37
38
|
)
|
|
38
|
-
OUTPUT_PATH = os.path.join(
|
|
39
|
+
OUTPUT_PATH = os.path.join(get_benchmark_output_path(), "perturbations", name)
|
|
39
40
|
|
|
40
41
|
""" Name types """
|
|
41
42
|
FIRST_NAME = "first_name"
|
|
@@ -153,8 +154,6 @@ class PersonNamePerturbation(Perturbation):
|
|
|
153
154
|
find the gender association for a source_word, we randomly
|
|
154
155
|
pick from one of the target names.
|
|
155
156
|
"""
|
|
156
|
-
# TODO: Update path so it is not hard-coded to benchmark_output
|
|
157
|
-
# https://github.com/stanford-crfm/benchmarking/issues/493
|
|
158
157
|
self.output_path: str = self.OUTPUT_PATH
|
|
159
158
|
Path(self.output_path).mkdir(parents=True, exist_ok=True)
|
|
160
159
|
|
|
@@ -4,13 +4,12 @@ from random import Random
|
|
|
4
4
|
from typing import List, Optional
|
|
5
5
|
|
|
6
6
|
|
|
7
|
-
from .perturbation_description import PerturbationDescription
|
|
7
|
+
from helm.benchmark.augmentations.perturbation_description import PerturbationDescription
|
|
8
8
|
from helm.benchmark.scenarios.scenario import Input, Instance, Reference, Output
|
|
9
9
|
from helm.common.object_spec import ObjectSpec, create_object
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
class Perturbation(ABC):
|
|
13
|
-
|
|
14
13
|
# Unique name to describe perturbation
|
|
15
14
|
name: str
|
|
16
15
|
|
|
@@ -49,18 +48,41 @@ class TextPerturbation(Perturbation, ABC):
|
|
|
49
48
|
|
|
50
49
|
description = replace(self.description, seed=seed)
|
|
51
50
|
|
|
51
|
+
perturbed_input: Input
|
|
52
|
+
if instance.input.multimedia_content:
|
|
53
|
+
perturbed_media_objects = []
|
|
54
|
+
for media_object in instance.input.multimedia_content.media_objects:
|
|
55
|
+
# Apply perturbations to the text data of the multimedia content
|
|
56
|
+
if media_object.is_type("text") and media_object.text is not None:
|
|
57
|
+
perturbed_media_objects.append(replace(media_object, text=self.perturb(media_object.text, rng)))
|
|
58
|
+
else:
|
|
59
|
+
perturbed_media_objects.append(media_object)
|
|
60
|
+
|
|
61
|
+
perturbed_input = Input(
|
|
62
|
+
multimedia_content=replace(instance.input.multimedia_content, media_objects=perturbed_media_objects)
|
|
63
|
+
)
|
|
64
|
+
else:
|
|
65
|
+
perturbed_input = Input(text=self.perturb(instance.input.text, rng))
|
|
66
|
+
|
|
52
67
|
# Don't modify `id` of `Instance` here.
|
|
53
68
|
# All the perturbed Instances generated from a single Instance should have the same ID.
|
|
54
69
|
return replace(
|
|
55
70
|
instance,
|
|
56
|
-
input=
|
|
71
|
+
input=perturbed_input,
|
|
57
72
|
references=references,
|
|
58
73
|
perturbation=description,
|
|
74
|
+
contrast_inputs=[instance.input],
|
|
59
75
|
)
|
|
60
76
|
|
|
61
77
|
def _perturb_reference(self, reference: Reference, rng: Random) -> Reference:
|
|
62
78
|
"""Generates a new Reference by perturbing the output and tagging the Reference."""
|
|
63
|
-
return replace(
|
|
79
|
+
return replace(
|
|
80
|
+
reference,
|
|
81
|
+
output=Output(
|
|
82
|
+
text=self.perturb(reference.output.text, rng), multimedia_content=reference.output.multimedia_content
|
|
83
|
+
),
|
|
84
|
+
tags=reference.tags,
|
|
85
|
+
)
|
|
64
86
|
|
|
65
87
|
@abstractmethod
|
|
66
88
|
def perturb(self, text: str, rng: Random) -> str:
|
|
@@ -23,7 +23,7 @@ class PerturbationDescription:
|
|
|
23
23
|
computed_on: str = PERTURBATION_PERTURBED
|
|
24
24
|
"""Which types of Instances we are evaluating, to be populated during metric evaluation. PERTURBATION_PERTURBED
|
|
25
25
|
(default) means we are evaluating on perturbed instances, PERTURBATION_ORIGINAL means we are evaluating the
|
|
26
|
-
unperturbed version of instances where this perturbation
|
|
26
|
+
unperturbed version of instances where this perturbation applies, and, PERTURBATION_WORST means the the minimum
|
|
27
27
|
metric between the two."""
|
|
28
28
|
|
|
29
29
|
seed: Optional[int] = None
|
|
@@ -2,8 +2,8 @@ from dataclasses import dataclass
|
|
|
2
2
|
from random import Random
|
|
3
3
|
import re
|
|
4
4
|
|
|
5
|
-
from .perturbation import TextPerturbation
|
|
6
|
-
from .perturbation_description import PerturbationDescription
|
|
5
|
+
from helm.benchmark.augmentations.perturbation import TextPerturbation
|
|
6
|
+
from helm.benchmark.augmentations.perturbation_description import PerturbationDescription
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
class SpacePerturbation(TextPerturbation):
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from random import Random
|
|
3
|
+
|
|
4
|
+
from helm.benchmark.augmentations.perturbation import TextPerturbation
|
|
5
|
+
from helm.benchmark.augmentations.perturbation_description import PerturbationDescription
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class SuffixPerturbation(TextPerturbation):
|
|
9
|
+
"""
|
|
10
|
+
Appends a suffix to the end of the text. Example:
|
|
11
|
+
|
|
12
|
+
A picture of a dog -> A picture of a dog, picasso
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
@dataclass(frozen=True)
|
|
16
|
+
class Description(PerturbationDescription):
|
|
17
|
+
suffix: str = ""
|
|
18
|
+
|
|
19
|
+
name: str = "style"
|
|
20
|
+
|
|
21
|
+
def __init__(self, suffix: str):
|
|
22
|
+
self._suffix: str = suffix
|
|
23
|
+
|
|
24
|
+
@property
|
|
25
|
+
def description(self) -> PerturbationDescription:
|
|
26
|
+
return SuffixPerturbation.Description(name=self.name, suffix=self._suffix)
|
|
27
|
+
|
|
28
|
+
def perturb(self, text: str, rng: Random) -> str:
|
|
29
|
+
return f"{text}, {self._suffix}"
|
|
@@ -10,8 +10,9 @@ from nltk.corpus import wordnet
|
|
|
10
10
|
import spacy
|
|
11
11
|
|
|
12
12
|
from helm.common.general import match_case, ensure_file_downloaded
|
|
13
|
-
from .perturbation_description import PerturbationDescription
|
|
14
|
-
from .perturbation import TextPerturbation
|
|
13
|
+
from helm.benchmark.augmentations.perturbation_description import PerturbationDescription
|
|
14
|
+
from helm.benchmark.augmentations.perturbation import TextPerturbation
|
|
15
|
+
from helm.benchmark.runner import get_benchmark_output_path
|
|
15
16
|
|
|
16
17
|
|
|
17
18
|
class SynonymPerturbation(TextPerturbation):
|
|
@@ -57,7 +58,7 @@ class SynonymPerturbation(TextPerturbation):
|
|
|
57
58
|
spacy.cli.download("en_core_web_sm") # type: ignore
|
|
58
59
|
self.spacy_model = spacy.load("en_core_web_sm")
|
|
59
60
|
|
|
60
|
-
output_dir = os.path.join(
|
|
61
|
+
output_dir = os.path.join(get_benchmark_output_path(), "perturbations", self.name)
|
|
61
62
|
Path(output_dir).mkdir(parents=True, exist_ok=True)
|
|
62
63
|
nltk.data.path.append(output_dir)
|
|
63
64
|
try:
|
|
@@ -2,19 +2,24 @@
|
|
|
2
2
|
from typing import List
|
|
3
3
|
import unittest
|
|
4
4
|
|
|
5
|
+
from helm.common.media_object import MediaObject, MultimediaObject
|
|
5
6
|
from helm.benchmark.scenarios.scenario import Input, Instance, Output, Reference
|
|
6
|
-
from .data_augmenter import DataAugmenter
|
|
7
|
-
from .extra_space_perturbation import ExtraSpacePerturbation
|
|
8
|
-
from .misspelling_perturbation import MisspellingPerturbation
|
|
9
|
-
from .contraction_expansion_perturbation import
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
from .
|
|
14
|
-
from .
|
|
15
|
-
from .
|
|
16
|
-
from .
|
|
17
|
-
from .
|
|
7
|
+
from helm.benchmark.augmentations.data_augmenter import DataAugmenter
|
|
8
|
+
from helm.benchmark.augmentations.extra_space_perturbation import ExtraSpacePerturbation
|
|
9
|
+
from helm.benchmark.augmentations.misspelling_perturbation import MisspellingPerturbation
|
|
10
|
+
from helm.benchmark.augmentations.contraction_expansion_perturbation import (
|
|
11
|
+
ContractionPerturbation,
|
|
12
|
+
ExpansionPerturbation,
|
|
13
|
+
)
|
|
14
|
+
from helm.benchmark.augmentations.typos_perturbation import TyposPerturbation
|
|
15
|
+
from helm.benchmark.augmentations.filler_words_perturbation import FillerWordsPerturbation
|
|
16
|
+
from helm.benchmark.augmentations.synonym_perturbation import SynonymPerturbation
|
|
17
|
+
from helm.benchmark.augmentations.lowercase_perturbation import LowerCasePerturbation
|
|
18
|
+
from helm.benchmark.augmentations.space_perturbation import SpacePerturbation
|
|
19
|
+
from helm.benchmark.augmentations.dialect_perturbation import DialectPerturbation
|
|
20
|
+
from helm.benchmark.augmentations.person_name_perturbation import PersonNamePerturbation
|
|
21
|
+
from helm.benchmark.augmentations.gender_perturbation import GenderPerturbation
|
|
22
|
+
from helm.benchmark.augmentations.suffix_perturbation import SuffixPerturbation
|
|
18
23
|
|
|
19
24
|
|
|
20
25
|
def test_extra_space_perturbation():
|
|
@@ -32,6 +37,35 @@ def test_extra_space_perturbation():
|
|
|
32
37
|
assert instances[1].references[0].output.text == "some name"
|
|
33
38
|
|
|
34
39
|
|
|
40
|
+
def test_multimodal_text_perturbation():
|
|
41
|
+
data_augmenter = DataAugmenter(perturbations=[ExtraSpacePerturbation(num_spaces=3)])
|
|
42
|
+
input: Input = Input(
|
|
43
|
+
multimedia_content=MultimediaObject(
|
|
44
|
+
[
|
|
45
|
+
MediaObject(text="Hello what is", content_type="text/plain"),
|
|
46
|
+
MediaObject(text="your name", content_type="text/plain"),
|
|
47
|
+
]
|
|
48
|
+
)
|
|
49
|
+
)
|
|
50
|
+
instance: Instance = Instance(id="id0", input=input, references=[Reference(Output(text="some name"), tags=[])])
|
|
51
|
+
instances: List[Instance] = data_augmenter.generate([instance], include_original=True)
|
|
52
|
+
|
|
53
|
+
assert len(instances) == 2
|
|
54
|
+
|
|
55
|
+
# Test that the first instance is unperturbed
|
|
56
|
+
assert instances[0].id == "id0"
|
|
57
|
+
assert instances[0].perturbation is None
|
|
58
|
+
media_objects = instances[0].input.multimedia_content.media_objects
|
|
59
|
+
assert media_objects[0].text == "Hello what is"
|
|
60
|
+
assert media_objects[1].text == "your name"
|
|
61
|
+
|
|
62
|
+
assert instances[1].id == "id0"
|
|
63
|
+
assert instances[1].perturbation.name == "extra_space"
|
|
64
|
+
media_objects = instances[1].input.multimedia_content.media_objects
|
|
65
|
+
assert media_objects[0].text == "Hello what is"
|
|
66
|
+
assert media_objects[1].text == "your name"
|
|
67
|
+
|
|
68
|
+
|
|
35
69
|
def test_misspelling_perturbation():
|
|
36
70
|
data_augmenter = DataAugmenter(perturbations=[MisspellingPerturbation(prob=1.0)])
|
|
37
71
|
instance: Instance = Instance(
|
|
@@ -145,7 +179,6 @@ def test_space_perturbation():
|
|
|
145
179
|
instance: Instance = Instance(id="id0", input=Input(text="Hello World!\nQuite a day, huh?"), references=[])
|
|
146
180
|
instances: List[Instance] = data_augmenter.generate([instance], include_original=True)
|
|
147
181
|
|
|
148
|
-
print(instances)
|
|
149
182
|
assert len(instances) == 2
|
|
150
183
|
assert instances[1].perturbation.name == "space"
|
|
151
184
|
assert instances[1].input.text == "Hello World!\nQuite a day, huh?"
|
|
@@ -162,7 +195,6 @@ def test_dialect_perturbation():
|
|
|
162
195
|
)
|
|
163
196
|
instances: List[Instance] = data_augmenter.generate([instance], include_original=True)
|
|
164
197
|
|
|
165
|
-
print(instances)
|
|
166
198
|
assert len(instances) == 2
|
|
167
199
|
assert instances[1].perturbation.name == "dialect"
|
|
168
200
|
assert instances[1].input.text == "I gon remember dis day to b the best day of mah life."
|
|
@@ -188,7 +220,6 @@ def test_person_name_perturbation():
|
|
|
188
220
|
)
|
|
189
221
|
instances: List[Instance] = data_augmenter.generate([instance], include_original=True)
|
|
190
222
|
|
|
191
|
-
print(instances)
|
|
192
223
|
assert len(instances) == 2
|
|
193
224
|
assert instances[1].perturbation.name == "person_name"
|
|
194
225
|
assert (
|
|
@@ -209,7 +240,6 @@ def test_gender_pronoun_perturbation():
|
|
|
209
240
|
)
|
|
210
241
|
instances: List[Instance] = data_augmenter.generate([instance], include_original=True)
|
|
211
242
|
|
|
212
|
-
print(instances)
|
|
213
243
|
assert len(instances) == 2
|
|
214
244
|
assert instances[1].perturbation.mode == "pronouns"
|
|
215
245
|
assert instances[1].input.text == "Did she mention that she was coming with her parents and their friends?"
|
|
@@ -227,13 +257,22 @@ def test_gender_term_perturbation():
|
|
|
227
257
|
)
|
|
228
258
|
instances: List[Instance] = data_augmenter.generate([instance], include_original=True)
|
|
229
259
|
|
|
230
|
-
print(instances)
|
|
231
260
|
assert len(instances) == 2
|
|
232
261
|
assert instances[1].perturbation.mode == "terms"
|
|
233
262
|
assert instances[1].input.text == "His granddaughters looked a lot like their mom."
|
|
234
263
|
assert instances[1].references[0].output.text == "How did their mother look like?"
|
|
235
264
|
|
|
236
265
|
|
|
266
|
+
def test_suffix_perturbation():
|
|
267
|
+
data_augmenter = DataAugmenter(perturbations=[SuffixPerturbation(suffix="pixel art")])
|
|
268
|
+
instance: Instance = Instance(id="id0", input=Input(text="A blue dog"), references=[])
|
|
269
|
+
instances: List[Instance] = data_augmenter.generate([instance], include_original=True)
|
|
270
|
+
|
|
271
|
+
assert len(instances) == 2
|
|
272
|
+
assert instances[1].perturbation.suffix == "pixel art"
|
|
273
|
+
assert instances[1].input.text == "A blue dog, pixel art"
|
|
274
|
+
|
|
275
|
+
|
|
237
276
|
# TODO(#1958) Fix the logic to renable this test
|
|
238
277
|
@unittest.skip("Currently cannot replace words at either text boundary.")
|
|
239
278
|
def test_gender_term_perturbation_edge_word():
|
|
@@ -247,7 +286,6 @@ def test_gender_term_perturbation_edge_word():
|
|
|
247
286
|
)
|
|
248
287
|
instances: List[Instance] = data_augmenter.generate([instance], include_original=False)
|
|
249
288
|
|
|
250
|
-
print(instances)
|
|
251
289
|
assert len(instances) == 1
|
|
252
290
|
assert instances[0].input.text == "mom said it is okay"
|
|
253
291
|
assert instances[0].references[0].output.text == "Sure he did daughter"
|
|
@@ -266,6 +304,5 @@ def test_gender_term_perturbation_consequtive_words():
|
|
|
266
304
|
)
|
|
267
305
|
instances: List[Instance] = data_augmenter.generate([instance], include_original=False)
|
|
268
306
|
|
|
269
|
-
print(instances)
|
|
270
307
|
assert len(instances) == 1
|
|
271
308
|
assert instances[0].input.text == "I'm a mom mom: my daughter has a daughter."
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from random import Random
|
|
3
|
+
|
|
4
|
+
from helm.clients.google_translate_client import GoogleTranslateClient
|
|
5
|
+
from helm.benchmark.augmentations.perturbation import TextPerturbation
|
|
6
|
+
from helm.benchmark.augmentations.perturbation_description import PerturbationDescription
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class TranslatePerturbation(TextPerturbation):
|
|
10
|
+
"""
|
|
11
|
+
Translates to different languages.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
@dataclass(frozen=True)
|
|
15
|
+
class Description(PerturbationDescription):
|
|
16
|
+
# Language code to translate to. Needs a default value since we inherit from `PerturbationDescription`
|
|
17
|
+
language_code: str = "zh-CN"
|
|
18
|
+
|
|
19
|
+
name: str = "translate"
|
|
20
|
+
should_perturb_references: bool = True
|
|
21
|
+
|
|
22
|
+
def __init__(self, language_code: str):
|
|
23
|
+
self.language_code: str = language_code
|
|
24
|
+
self.google_translate_client = GoogleTranslateClient()
|
|
25
|
+
|
|
26
|
+
@property
|
|
27
|
+
def description(self) -> PerturbationDescription:
|
|
28
|
+
return TranslatePerturbation.Description(name=self.name, language_code=self.language_code)
|
|
29
|
+
|
|
30
|
+
def perturb(self, text: str, rng: Random) -> str:
|
|
31
|
+
return self.google_translate_client.translate(text, self.language_code)
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
from dataclasses import dataclass
|
|
2
2
|
from random import Random
|
|
3
3
|
|
|
4
|
-
from .perturbation_description import PerturbationDescription
|
|
5
|
-
from .perturbation import TextPerturbation
|
|
4
|
+
from helm.benchmark.augmentations.perturbation_description import PerturbationDescription
|
|
5
|
+
from helm.benchmark.augmentations.perturbation import TextPerturbation
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
class TyposPerturbation(TextPerturbation):
|
|
@@ -4,16 +4,18 @@ import importlib_resources as resources
|
|
|
4
4
|
from helm.benchmark.model_deployment_registry import register_model_deployments_from_path
|
|
5
5
|
from helm.benchmark.model_metadata_registry import register_model_metadata_from_path
|
|
6
6
|
from helm.benchmark.tokenizer_config_registry import register_tokenizer_configs_from_path
|
|
7
|
+
from helm.benchmark.runner_config_registry import register_runner_config_from_path
|
|
7
8
|
|
|
8
9
|
|
|
9
10
|
MODEL_METADATA_FILE: str = "model_metadata.yaml"
|
|
10
11
|
TOKENIZER_CONFIGS_FILE: str = "tokenizer_configs.yaml"
|
|
11
12
|
MODEL_DEPLOYMENTS_FILE: str = "model_deployments.yaml"
|
|
13
|
+
RUNNER_CONFIG_FILE: str = "runner_config.yaml"
|
|
12
14
|
|
|
13
15
|
CONFIG_PACKAGE = "helm.config"
|
|
14
16
|
|
|
15
17
|
|
|
16
|
-
def register_configs_from_directory(dir_path) -> None:
|
|
18
|
+
def register_configs_from_directory(dir_path: str) -> None:
|
|
17
19
|
model_metadata_path = os.path.join(dir_path, MODEL_METADATA_FILE)
|
|
18
20
|
if os.path.isfile(model_metadata_path):
|
|
19
21
|
register_model_metadata_from_path(model_metadata_path)
|
|
@@ -26,6 +28,10 @@ def register_configs_from_directory(dir_path) -> None:
|
|
|
26
28
|
if os.path.isfile(model_deployments_path):
|
|
27
29
|
register_model_deployments_from_path(model_deployments_path)
|
|
28
30
|
|
|
31
|
+
runner_config_path = os.path.join(dir_path, RUNNER_CONFIG_FILE)
|
|
32
|
+
if os.path.isfile(runner_config_path):
|
|
33
|
+
register_runner_config_from_path(runner_config_path)
|
|
34
|
+
|
|
29
35
|
|
|
30
36
|
def register_builtin_configs_from_helm_package() -> None:
|
|
31
37
|
package_path = str(resources.files(CONFIG_PACKAGE))
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from typing import List
|
|
2
2
|
from helm.common.hierarchical_logger import htrack
|
|
3
|
-
from .augmentations.data_augmenter import create_data_augmenter, DataAugmenterSpec, DataAugmenter
|
|
4
|
-
from .scenarios.scenario import Instance, TRAIN_SPLIT, EVAL_SPLITS
|
|
3
|
+
from helm.benchmark.augmentations.data_augmenter import create_data_augmenter, DataAugmenterSpec, DataAugmenter
|
|
4
|
+
from helm.benchmark.scenarios.scenario import Instance, TRAIN_SPLIT, EVAL_SPLITS
|
|
5
5
|
|
|
6
6
|
|
|
7
7
|
class DataPreprocessor:
|
helm/benchmark/executor.py
CHANGED
|
@@ -1,13 +1,19 @@
|
|
|
1
1
|
from typing import Optional
|
|
2
2
|
from dataclasses import dataclass, replace
|
|
3
3
|
|
|
4
|
+
from helm.common.context import Context
|
|
5
|
+
from helm.common.local_context import LocalContext
|
|
6
|
+
from helm.common.remote_context import RemoteContext
|
|
7
|
+
from helm.common.cache_backend_config import (
|
|
8
|
+
CacheBackendConfig,
|
|
9
|
+
BlackHoleCacheBackendConfig,
|
|
10
|
+
MongoCacheBackendConfig,
|
|
11
|
+
SqliteCacheBackendConfig,
|
|
12
|
+
)
|
|
4
13
|
from helm.common.general import parallel_map
|
|
5
|
-
from helm.common.hierarchical_logger import htrack, hlog
|
|
6
|
-
from helm.common.request import RequestResult,
|
|
14
|
+
from helm.common.hierarchical_logger import htrack, hlog, hwarn
|
|
15
|
+
from helm.common.request import RequestResult, GeneratedOutput
|
|
7
16
|
from helm.common.authentication import Authentication
|
|
8
|
-
from helm.proxy.services.remote_service import RemoteService
|
|
9
|
-
from helm.proxy.services.server_service import ServerService
|
|
10
|
-
from helm.proxy.services.service import Service
|
|
11
17
|
from helm.benchmark.adaptation.scenario_state import ScenarioState
|
|
12
18
|
from helm.benchmark.adaptation.request_state import RequestState
|
|
13
19
|
|
|
@@ -18,28 +24,36 @@ class ExecutorError(Exception):
|
|
|
18
24
|
|
|
19
25
|
@dataclass(frozen=True)
|
|
20
26
|
class ExecutionSpec:
|
|
21
|
-
|
|
27
|
+
|
|
22
28
|
url: Optional[str]
|
|
29
|
+
"""If non-empty, URL of the proxy server we send requests to (e.g., http://localhost:1959)."""
|
|
23
30
|
|
|
24
|
-
# Pass into the service
|
|
25
31
|
auth: Authentication
|
|
32
|
+
"""Authentication that will be passed into the remote service, if using the remote context."""
|
|
26
33
|
|
|
27
|
-
# Path where API credentials and cache is stored.
|
|
28
|
-
# This path is the same as `--base-path` when launching the proxy server (see server.py).
|
|
29
|
-
# Required when url is not set.
|
|
30
34
|
local_path: Optional[str]
|
|
35
|
+
"""Path where API credentials and cache is stored.
|
|
36
|
+
|
|
37
|
+
This path is the same as `--base-path` when launching the proxy server (see server.py).
|
|
38
|
+
Required when url is not set."""
|
|
31
39
|
|
|
32
|
-
# How many threads to have at once
|
|
33
40
|
parallelism: int
|
|
41
|
+
"""How many threads to have at once"""
|
|
34
42
|
|
|
35
|
-
# Whether to skip execution
|
|
36
43
|
dry_run: bool = False
|
|
44
|
+
"""Whether to skip execution"""
|
|
45
|
+
|
|
46
|
+
sqlite_cache_backend_config: Optional[SqliteCacheBackendConfig] = None
|
|
47
|
+
"""If set, SQLite will be used for the cache.
|
|
48
|
+
|
|
49
|
+
This specifies the directory in which the SQLite cache will store files.
|
|
50
|
+
At most one of sqlite_cache_backend_config and mongo_cache_backend_config can be set."""
|
|
37
51
|
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
52
|
+
mongo_cache_backend_config: Optional[MongoCacheBackendConfig] = None
|
|
53
|
+
"""If set, MongoDB will be used for the cache.
|
|
54
|
+
|
|
55
|
+
This specifies the MongoDB database to be used by the MongoDB cache.
|
|
56
|
+
At most one of sqlite_cache_backend_config and mongo_cache_backend_config can be set."""
|
|
43
57
|
|
|
44
58
|
|
|
45
59
|
class Executor:
|
|
@@ -51,14 +65,25 @@ class Executor:
|
|
|
51
65
|
def __init__(self, execution_spec: ExecutionSpec):
|
|
52
66
|
self.execution_spec = execution_spec
|
|
53
67
|
|
|
54
|
-
|
|
68
|
+
cache_backend_config: CacheBackendConfig
|
|
69
|
+
if execution_spec.sqlite_cache_backend_config and execution_spec.mongo_cache_backend_config:
|
|
70
|
+
raise ExecutorError("At most one of sqlite_cache_backend_config and mongo_cache_backend_config can be set.")
|
|
71
|
+
elif execution_spec.sqlite_cache_backend_config:
|
|
72
|
+
cache_backend_config = execution_spec.sqlite_cache_backend_config
|
|
73
|
+
elif execution_spec.mongo_cache_backend_config:
|
|
74
|
+
cache_backend_config = execution_spec.mongo_cache_backend_config
|
|
75
|
+
else:
|
|
76
|
+
cache_backend_config = BlackHoleCacheBackendConfig()
|
|
77
|
+
|
|
78
|
+
self.context: Context
|
|
55
79
|
if execution_spec.url:
|
|
56
80
|
hlog(f"Running using remote API proxy server: {execution_spec.url}")
|
|
57
|
-
self.
|
|
81
|
+
self.context = RemoteContext(execution_spec.url, execution_spec.auth)
|
|
58
82
|
elif execution_spec.local_path:
|
|
59
83
|
hlog(f"Running in local mode with base path: {execution_spec.local_path}")
|
|
60
|
-
self.
|
|
61
|
-
base_path=execution_spec.local_path,
|
|
84
|
+
self.context = LocalContext(
|
|
85
|
+
base_path=execution_spec.local_path,
|
|
86
|
+
cache_backend_config=cache_backend_config,
|
|
62
87
|
)
|
|
63
88
|
else:
|
|
64
89
|
raise ValueError("Either the proxy server URL or the local path must be set")
|
|
@@ -77,17 +102,21 @@ class Executor:
|
|
|
77
102
|
)
|
|
78
103
|
|
|
79
104
|
hlog(f"Processed {len(request_states)} requests")
|
|
80
|
-
return ScenarioState(
|
|
105
|
+
return ScenarioState(
|
|
106
|
+
adapter_spec=scenario_state.adapter_spec,
|
|
107
|
+
request_states=request_states,
|
|
108
|
+
annotator_specs=scenario_state.annotator_specs,
|
|
109
|
+
)
|
|
81
110
|
|
|
82
111
|
def process(self, state: RequestState) -> RequestState:
|
|
83
112
|
try:
|
|
84
|
-
result: RequestResult = self.
|
|
113
|
+
result: RequestResult = self.context.make_request(state.request)
|
|
85
114
|
except Exception as e:
|
|
86
115
|
raise ExecutorError(f"{str(e)} Request: {state.request}") from e
|
|
87
116
|
if not result.success:
|
|
88
117
|
if result.error_flags and not result.error_flags.is_fatal:
|
|
89
|
-
|
|
90
|
-
result.completions = [
|
|
118
|
+
hwarn(f"Non-fatal error treated as empty completion: {result.error}")
|
|
119
|
+
result.completions = [GeneratedOutput(text="", logprob=0, tokens=[])]
|
|
91
120
|
else:
|
|
92
121
|
raise ExecutorError(f"{str(result.error)} Request: {state.request}")
|
|
93
122
|
return replace(state, result=result)
|
|
@@ -1,10 +1,9 @@
|
|
|
1
1
|
import os
|
|
2
|
-
from typing import Optional
|
|
2
|
+
from typing import Optional, Dict, Union
|
|
3
3
|
|
|
4
4
|
from helm.benchmark.model_deployment_registry import (
|
|
5
5
|
ClientSpec,
|
|
6
6
|
ModelDeployment,
|
|
7
|
-
WindowServiceSpec,
|
|
8
7
|
register_model_deployment,
|
|
9
8
|
)
|
|
10
9
|
from helm.benchmark.model_metadata_registry import (
|
|
@@ -14,27 +13,44 @@ from helm.benchmark.model_metadata_registry import (
|
|
|
14
13
|
)
|
|
15
14
|
from helm.benchmark.tokenizer_config_registry import TokenizerConfig, TokenizerSpec, register_tokenizer_config
|
|
16
15
|
from helm.common.hierarchical_logger import hlog
|
|
16
|
+
from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
|
|
17
17
|
|
|
18
18
|
|
|
19
19
|
def register_huggingface_model(
|
|
20
|
-
helm_model_name: str,
|
|
20
|
+
helm_model_name: str,
|
|
21
|
+
pretrained_model_name_or_path: str,
|
|
22
|
+
revision: Optional[str] = None,
|
|
21
23
|
) -> None:
|
|
22
|
-
object_spec_args = {"pretrained_model_name_or_path": pretrained_model_name_or_path}
|
|
24
|
+
object_spec_args: Dict[str, Union[str, bool]] = {"pretrained_model_name_or_path": pretrained_model_name_or_path}
|
|
23
25
|
if revision:
|
|
24
26
|
object_spec_args["revision"] = revision
|
|
25
27
|
|
|
28
|
+
# Auto-infer model properties from the tokenizer.
|
|
29
|
+
create_tokenizer_args: Dict[str, str] = {"pretrained_model_name_or_path": pretrained_model_name_or_path}
|
|
30
|
+
if revision:
|
|
31
|
+
create_tokenizer_args["revision"] = revision
|
|
32
|
+
with HuggingFaceTokenizer.create_tokenizer(**create_tokenizer_args) as tokenizer:
|
|
33
|
+
max_sequence_length = tokenizer.model_max_length
|
|
34
|
+
end_of_text_token = tokenizer.eos_token or ""
|
|
35
|
+
prefix_token = tokenizer.bos_token or ""
|
|
36
|
+
# If the tokenizer config has a model_max_length of 1000000000000000019884624838656
|
|
37
|
+
# it means that model creator did not specify model_max_length.
|
|
38
|
+
if max_sequence_length > 1_000_000:
|
|
39
|
+
raise ValueError(
|
|
40
|
+
f"Could not infer the model_max_length of Hugging Face model {pretrained_model_name_or_path}, so "
|
|
41
|
+
f"--enable-huggingface-models and --enable-local-huggingface-models cannot be used for this model. "
|
|
42
|
+
f"Please configure the model using prod_env/model_deployments.yaml instead."
|
|
43
|
+
)
|
|
44
|
+
|
|
26
45
|
model_deployment = ModelDeployment(
|
|
27
46
|
name=helm_model_name,
|
|
28
47
|
client_spec=ClientSpec(
|
|
29
|
-
class_name="helm.
|
|
48
|
+
class_name="helm.clients.huggingface_client.HuggingFaceClient",
|
|
30
49
|
args=object_spec_args,
|
|
31
50
|
),
|
|
32
51
|
model_name=helm_model_name,
|
|
33
52
|
tokenizer_name=helm_model_name,
|
|
34
|
-
|
|
35
|
-
class_name="helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService",
|
|
36
|
-
args=object_spec_args,
|
|
37
|
-
),
|
|
53
|
+
max_sequence_length=max_sequence_length,
|
|
38
54
|
)
|
|
39
55
|
|
|
40
56
|
# We check if the model is already registered because we don't want to
|
|
@@ -51,9 +67,11 @@ def register_huggingface_model(
|
|
|
51
67
|
tokenizer_config = TokenizerConfig(
|
|
52
68
|
name=helm_model_name,
|
|
53
69
|
tokenizer_spec=TokenizerSpec(
|
|
54
|
-
class_name="helm.
|
|
70
|
+
class_name="helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer",
|
|
55
71
|
args=object_spec_args,
|
|
56
72
|
),
|
|
73
|
+
end_of_text_token=end_of_text_token,
|
|
74
|
+
prefix_token=prefix_token,
|
|
57
75
|
)
|
|
58
76
|
register_tokenizer_config(tokenizer_config)
|
|
59
77
|
|