crfm-helm 0.4.0-py3-none-any.whl → 0.5.10-py3-none-any.whl
This diff shows the changes between publicly released versions of this package as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release.
This version of crfm-helm might be problematic.
- crfm_helm-0.5.10.dist-info/METADATA +369 -0
- crfm_helm-0.5.10.dist-info/RECORD +1008 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +80 -29
- helm/benchmark/adaptation/adapters/adapter.py +2 -2
- helm/benchmark/adaptation/adapters/adapter_factory.py +39 -28
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +24 -8
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +3 -4
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +112 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +6 -3
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +3 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +18 -8
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +5 -4
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +46 -22
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +17 -29
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +138 -16
- helm/benchmark/adaptation/common_adapter_specs.py +443 -0
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/aci_bench_annotator.py +84 -0
- helm/benchmark/annotation/air_bench_annotator.py +79 -0
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/annotator.py +48 -0
- helm/benchmark/annotation/annotator_factory.py +50 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +258 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +82 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +96 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +55 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +93 -0
- helm/benchmark/annotation/image2struct/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2struct/lilypond_compiler_annotator.py +86 -0
- helm/benchmark/annotation/image2struct/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/live_qa_annotator.py +76 -0
- helm/benchmark/annotation/med_dialog_annotator.py +88 -0
- helm/benchmark/annotation/medalign_annotator.py +89 -0
- helm/benchmark/annotation/medi_qa_annotator.py +87 -0
- helm/benchmark/annotation/medication_qa_annotator.py +86 -0
- helm/benchmark/annotation/mental_health_annotator.py +87 -0
- helm/benchmark/annotation/mimic_bhc_annotator.py +89 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +89 -0
- helm/benchmark/annotation/model_as_judge.py +309 -0
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +87 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +90 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +131 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +87 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +100 -0
- helm/benchmark/annotation_executor.py +144 -0
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +3 -3
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +26 -4
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +56 -19
- helm/benchmark/augmentations/translate_perturbation.py +31 -0
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/config_registry.py +7 -1
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/executor.py +54 -25
- helm/benchmark/huggingface_registration.py +28 -10
- helm/benchmark/metrics/air_bench_metrics.py +3212 -0
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/basic_metrics.py +437 -667
- helm/benchmark/metrics/bbq_metrics.py +17 -6
- helm/benchmark/metrics/bias_metrics.py +18 -9
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/classification_metrics.py +107 -22
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +12 -11
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/code_metrics_helper.py +11 -3
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +174 -0
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +83 -0
- helm/benchmark/metrics/copyright_metrics.py +5 -5
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +8 -114
- helm/benchmark/metrics/dry_run_metrics.py +35 -6
- helm/benchmark/metrics/efficiency_metrics.py +287 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +159 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +831 -0
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +115 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +48 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +67 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +84 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +20 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +48 -5
- helm/benchmark/metrics/kpi_edgar_metrics.py +142 -0
- helm/benchmark/metrics/language_modeling_metrics.py +111 -0
- helm/benchmark/metrics/live_qa_metrics.py +35 -0
- helm/benchmark/metrics/llm_jury_metrics.py +58 -0
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/machine_translation_metrics.py +89 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +137 -0
- helm/benchmark/metrics/medec_metrics.py +124 -0
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/metric.py +121 -175
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +23 -7
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +127 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/omni_math_metrics.py +44 -0
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +5 -6
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/ranking_metrics.py +5 -5
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/safety_metrics.py +91 -0
- helm/benchmark/metrics/seahelm_metrics.py +201 -0
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +8 -11
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +150 -11
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +145 -70
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +20 -1
- helm/benchmark/metrics/test_metric.py +3 -3
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +11 -3
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +3 -3
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +7 -7
- helm/benchmark/metrics/toxicity_metrics.py +37 -7
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/unitxt_metrics.py +107 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +347 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +537 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/metrics/wildbench_metrics.py +54 -0
- helm/benchmark/model_deployment_registry.py +69 -5
- helm/benchmark/model_metadata_registry.py +58 -2
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +51 -20
- helm/benchmark/presentation/run_display.py +51 -12
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +83 -66
- helm/benchmark/presentation/summarize.py +483 -388
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/presentation/test_run_entry.py +2 -2
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/presentation/test_summarize.py +148 -6
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +202 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +151 -87
- helm/benchmark/run_expander.py +418 -33
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +180 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +58 -0
- helm/benchmark/run_specs/arabic_run_specs.py +197 -0
- helm/benchmark/run_specs/audio_run_specs.py +657 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/call_center_run_specs.py +201 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1393 -0
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +316 -0
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
- helm/benchmark/run_specs/experimental_run_specs.py +224 -0
- helm/benchmark/run_specs/finance_run_specs.py +114 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +625 -0
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +188 -0
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1570 -0
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +191 -0
- helm/benchmark/run_specs/seahelm_run_specs.py +652 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +167 -0
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +51 -0
- helm/benchmark/run_specs/vlm_run_specs.py +1057 -0
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/runner.py +63 -62
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +149 -0
- helm/benchmark/scenarios/air_bench_scenario.py +76 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +27 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +82 -0
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +79 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +16 -1
- helm/benchmark/scenarios/banking77_scenario.py +77 -0
- helm/benchmark/scenarios/bbq_scenario.py +17 -2
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +112 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +18 -3
- helm/benchmark/scenarios/boolq_scenario.py +21 -1
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +129 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +14 -1
- helm/benchmark/scenarios/clear_scenario.py +180 -0
- helm/benchmark/scenarios/cleva_scenario.py +482 -3
- helm/benchmark/scenarios/code_scenario.py +46 -4
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +33 -1
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +118 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +261 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +148 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +190 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +143 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +98 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +344 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +217 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +571 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +80 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +90 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -3
- helm/benchmark/scenarios/dischargeme_scenario.py +196 -0
- helm/benchmark/scenarios/disinformation_scenario.py +32 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +25 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1541 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +77 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +33 -3
- helm/benchmark/scenarios/entity_matching_scenario.py +26 -2
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +139 -0
- helm/benchmark/scenarios/financebench_scenario.py +74 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +115 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +145 -0
- helm/benchmark/scenarios/gpqa_scenario.py +98 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/grammar_scenario.py +21 -2
- helm/benchmark/scenarios/gsm_scenario.py +31 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +61 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +70 -0
- helm/benchmark/scenarios/headqa_scenario.py +158 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/ice_scenario.py +28 -4
- helm/benchmark/scenarios/ifeval_scenario.py +71 -0
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +26 -3
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +98 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +172 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +149 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +61 -1
- helm/benchmark/scenarios/legal_support_scenario.py +24 -1
- helm/benchmark/scenarios/legalbench_scenario.py +45 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +23 -2
- helm/benchmark/scenarios/lextreme_scenario.py +22 -1
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +15 -1
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +81 -22
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +56 -22
- helm/benchmark/scenarios/med_mcqa_scenario.py +24 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +30 -1
- helm/benchmark/scenarios/medalign_scenario.py +117 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +326 -0
- helm/benchmark/scenarios/medbullets_scenario.py +167 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +149 -0
- helm/benchmark/scenarios/medec_scenario.py +148 -0
- helm/benchmark/scenarios/medhallu_scenario.py +95 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +134 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +96 -0
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +146 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +127 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +121 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +99 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +113 -0
- helm/benchmark/scenarios/mmlu_scenario.py +32 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +31 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +166 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +164 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +297 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +33 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +71 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +33 -2
- helm/benchmark/scenarios/openai_mrcr_scenario.py +94 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -5
- helm/benchmark/scenarios/pubmed_qa_scenario.py +81 -43
- helm/benchmark/scenarios/quac_scenario.py +24 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +175 -0
- helm/benchmark/scenarios/raft_scenario.py +33 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +128 -0
- helm/benchmark/scenarios/scenario.py +44 -1
- helm/benchmark/scenarios/seahelm_scenario.py +2295 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +97 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +95 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +99 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +98 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +94 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +100 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +98 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +104 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +98 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +44 -0
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +109 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +119 -0
- helm/benchmark/scenarios/summarization_scenario.py +48 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +24 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +29 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +4 -3
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +239 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +26 -2
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/unitxt_scenario.py +62 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +4 -2
- helm/benchmark/scenarios/vicuna_scenario.py +22 -2
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +135 -0
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2struct/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2struct/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2struct/image2struct_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/image2struct/latex_scenario.py +21 -0
- helm/benchmark/scenarios/vision_language/image2struct/musicsheet_scenario.py +16 -0
- helm/benchmark/scenarios/vision_language/image2struct/utils_latex.py +339 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage_scenario.py +256 -0
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +148 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +105 -0
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +131 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +4 -5
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +8 -4
- helm/benchmark/scenarios/wikifact_scenario.py +31 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +101 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +33 -2
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +32 -2
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +78 -50
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_audio.yaml +763 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +269 -0
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_classic.yaml +259 -1140
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +319 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +191 -0
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_image2struct.yaml +588 -0
- helm/benchmark/static/schema_instruction_following.yaml +161 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/schema_lite.yaml +3 -286
- helm/benchmark/static/schema_long_context.yaml +282 -0
- helm/benchmark/static/schema_medhelm.yaml +1176 -0
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_mmlu.yaml +1449 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +283 -0
- helm/benchmark/static/schema_seahelm.yaml +723 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/schema_thai.yaml +244 -0
- helm/benchmark/static/schema_torr.yaml +474 -0
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_unitxt.yaml +370 -0
- helm/benchmark/static/schema_vhelm.yaml +933 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/air-overview-DpBbyagA.png +0 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-BJtQlEbV.png +0 -0
- helm/benchmark/static_build/assets/helm-safety-COfndXuS.png +0 -0
- helm/benchmark/static_build/assets/helmhero-D9TvmJsp.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/medhelm-overview-CND0EIsy.png +0 -0
- helm/benchmark/static_build/assets/medhelm-v1-overview-Cu2tphBB.png +0 -0
- helm/benchmark/static_build/assets/overview-BwypNWnk.png +0 -0
- helm/benchmark/static_build/assets/process-flow-DWDJC733.png +0 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/assets/vhelm-aspects-NiDQofvP.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-NxJE4fdA.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-ypCL5Yvq.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +19 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/window_services/default_window_service.py +3 -45
- helm/benchmark/window_services/encoder_decoder_window_service.py +4 -15
- helm/benchmark/window_services/ice_window_service.py +1 -35
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +13 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +22 -5
- helm/benchmark/window_services/test_anthropic_window_service.py +5 -4
- helm/benchmark/window_services/test_bloom_window_service.py +5 -4
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +9 -4
- helm/benchmark/window_services/test_gpt4_window_service.py +10 -4
- helm/benchmark/window_services/test_gptj_window_service.py +11 -5
- helm/benchmark/window_services/test_gptneox_window_service.py +6 -5
- helm/benchmark/window_services/test_openai_window_service.py +18 -12
- helm/benchmark/window_services/test_opt_window_service.py +6 -5
- helm/benchmark/window_services/test_palmyra_window_service.py +5 -4
- helm/benchmark/window_services/test_t0pp_window_service.py +5 -4
- helm/benchmark/window_services/test_t511b_window_service.py +5 -4
- helm/benchmark/window_services/test_ul2_window_service.py +5 -4
- helm/benchmark/window_services/test_utils.py +6 -6
- helm/benchmark/window_services/test_yalm_window_service.py +5 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -13
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +4 -1
- helm/benchmark/window_services/yalm_window_service.py +1 -28
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +78 -12
- helm/clients/aleph_alpha_client.py +114 -0
- helm/{proxy/clients → clients}/anthropic_client.py +304 -21
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +122 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +199 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +210 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +191 -0
- helm/clients/audio_language/qwen_audiolm_client.py +153 -0
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/audio_language/test.py +62 -0
- helm/{proxy/clients → clients}/auto_client.py +72 -31
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +381 -0
- helm/clients/bedrock_utils.py +105 -0
- helm/{proxy/clients → clients}/client.py +92 -17
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +105 -14
- helm/clients/dspy_client.py +135 -0
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +8 -6
- helm/clients/google_translate_client.py +35 -0
- helm/clients/grok_client.py +36 -0
- helm/{proxy/clients → clients}/http_model_client.py +8 -8
- helm/{proxy/clients → clients}/huggingface_client.py +157 -86
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/ibm_client.py +269 -0
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +80 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +100 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +99 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +144 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +192 -0
- helm/clients/image_generation/dalle2_client.py +194 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +191 -0
- helm/clients/image_generation/deep_floyd_client.py +80 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +250 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +88 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +116 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +113 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +6 -6
- helm/{proxy/clients → clients}/megatron_client.py +7 -5
- helm/clients/mistral_client.py +180 -0
- helm/clients/moderation_api_client.py +111 -0
- helm/clients/nvidia_nim_client.py +32 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +604 -0
- helm/clients/openai_responses_client.py +200 -0
- helm/clients/openrouter_client.py +31 -0
- helm/{proxy/clients → clients}/palmyra_client.py +31 -14
- helm/{proxy/clients → clients}/perspective_api_client.py +18 -14
- helm/clients/reka_client.py +190 -0
- helm/clients/simple_client.py +64 -0
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +95 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/{proxy/clients → clients}/test_auto_client.py +13 -15
- helm/clients/test_client.py +98 -0
- helm/{proxy/clients → clients}/test_huggingface_client.py +31 -16
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/test_simple_client.py +19 -0
- helm/clients/test_together_client.py +184 -0
- helm/clients/together_client.py +599 -0
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +488 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +148 -0
- helm/clients/vision_language/huggingface_vlm_client.py +114 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +61 -51
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/paligemma_client.py +147 -0
- helm/clients/vision_language/palmyra_vision_client.py +101 -0
- helm/clients/vision_language/qwen2_vlm_client.py +189 -0
- helm/clients/vision_language/qwen_vlm_client.py +174 -0
- helm/clients/vllm_client.py +80 -0
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +105 -0
- helm/clients/yi_client.py +28 -0
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +23 -33
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/critique_request.py +10 -2
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +10 -3
- helm/common/hierarchical_logger.py +124 -12
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +60 -5
- helm/common/key_value_store.py +41 -10
- helm/common/local_context.py +140 -0
- helm/common/media_object.py +14 -1
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +8 -7
- helm/common/multimodal_request_utils.py +57 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/object_spec.py +23 -8
- helm/common/optional_dependencies.py +1 -1
- helm/common/reeval_parameters.py +12 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +45 -19
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_general.py +10 -0
- helm/common/test_logging.py +94 -0
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +1 -10
- helm/config/model_deployments.yaml +4713 -1005
- helm/config/model_metadata.yaml +4045 -255
- helm/config/tokenizer_configs.yaml +1091 -50
- helm/proxy/accounts.py +31 -4
- helm/proxy/cli.py +6 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/critique/model_critique_client.py +40 -10
- helm/proxy/example_queries.py +33 -28
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +82 -18
- helm/proxy/services/remote_service.py +32 -7
- helm/proxy/services/server_service.py +71 -69
- helm/proxy/services/service.py +30 -6
- helm/proxy/services/test_remote_service.py +6 -5
- helm/proxy/services/test_service.py +1 -13
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +61 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +462 -0
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/tokenizers/__init__.py +0 -0
- helm/tokenizers/ai21_tokenizer.py +52 -0
- helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +9 -12
- helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +2 -30
- helm/tokenizers/cohere_tokenizer.py +50 -0
- helm/tokenizers/grok_tokenizer.py +55 -0
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +4 -4
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +44 -41
- helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +9 -2
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/tokenizer.py +3 -1
- helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +8 -6
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.4.0.dist-info/METADATA +0 -264
- crfm_helm-0.4.0.dist-info/RECORD +0 -397
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/run_specs.py +0 -2762
- helm/benchmark/scenarios/numeracy_scenario.py +0 -784
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/test_model_deployment_definition.py +0 -92
- helm/benchmark/test_model_properties.py +0 -1570
- helm/benchmark/vlm_run_specs.py +0 -97
- helm/benchmark/window_services/ai21_window_service.py +0 -258
- helm/benchmark/window_services/cohere_window_service.py +0 -163
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/huggingface_window_service.py +0 -60
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -74
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -326
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/common/cache_utils.py +0 -14
- helm/proxy/clients/aleph_alpha_client.py +0 -95
- helm/proxy/clients/goose_ai_client.py +0 -99
- helm/proxy/clients/microsoft_client.py +0 -180
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/simple_client.py +0 -60
- helm/proxy/clients/test_client.py +0 -49
- helm/proxy/clients/test_together_client.py +0 -97
- helm/proxy/clients/together_client.py +0 -334
- helm/proxy/clients/vertexai_client.py +0 -115
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
- helm/proxy/token_counters/test_openai_token_counter.py +0 -81
- helm/proxy/tokenizers/ai21_tokenizer.py +0 -60
- helm/proxy/tokenizers/anthropic_tokenizer.py +0 -52
- helm/proxy/tokenizers/cohere_tokenizer.py +0 -83
- helm/proxy/tokenizers/ice_tokenizer.py +0 -30
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- helm/proxy/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → annotation}/__init__.py +0 -0
- /helm/{proxy/clients → benchmark/annotation/image2struct}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/metrics/ifeval}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/benchmark/{static/images/crfm-logo.png → static_build/assets/crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/{static/images/helm-logo-simple.png → static_build/assets/helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{benchmark → proxy}/static/general.js +0 -0
- /helm/{benchmark → proxy}/static/info-icon.png +0 -0
helm/benchmark/vlm_run_specs.py DELETED
@@ -1,97 +0,0 @@
-from typing import List, Optional
-
-from .adaptation.adapter_spec import AdapterSpec
-from .adaptation.adapters.adapter_factory import ADAPT_GENERATION_MULTIMODAL
-from .metrics.metric import MetricSpec
-from .run_specs import run_spec_function, get_exact_match_metric_specs
-from .runner import RunSpec
-from .scenarios.scenario import ScenarioSpec
-
-
-############################################################
-# Prototypical adapter specs for VLM evaluation
-
-
-def get_vlm_generation_adapter_spec(
-    instructions: str = "",
-    input_prefix: str = "",
-    input_suffix: str = "",
-    output_prefix: str = "",
-    output_suffix: str = "",
-    max_tokens: int = 100,
-    stop_sequences: Optional[List[str]] = None,
-) -> AdapterSpec:
-    return AdapterSpec(
-        method=ADAPT_GENERATION_MULTIMODAL,
-        global_prefix="",
-        instructions=instructions,
-        input_prefix=input_prefix,
-        input_suffix=input_suffix,
-        output_prefix=output_prefix,
-        output_suffix=output_suffix,
-        instance_prefix="\n",
-        # We focus on zero-shot evaluation for now as most open VLMs only support a single image input
-        max_train_instances=0,
-        num_outputs=1,
-        max_tokens=max_tokens,
-        stop_sequences=stop_sequences if stop_sequences is not None else [],
-        random=None,
-    )
-
-
-############################################################
-# VHELM run specs
-
-
-@run_spec_function("viz_wiz")
-def get_viz_wiz_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.vision_language.viz_wiz_scenario.VizWizScenario", args={}
-    )
-
-    # TODO: finalize the adapter spec parameters once we add more models
-    adapter_spec: AdapterSpec = get_vlm_generation_adapter_spec(
-        input_prefix="User: ",
-        input_suffix="<end_of_utterance>",
-        output_prefix="\nAssistant: ",
-        output_suffix="<end_of_utterance>",
-        stop_sequences=["<end_of_utterance>"],
-    )
-
-    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
-
-    run_spec_name: str = "viz_wiz"
-    return RunSpec(
-        name=run_spec_name,
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        metric_specs=metric_specs,
-        groups=[run_spec_name],
-    )
-
-
-@run_spec_function("vqa")
-def get_vqa_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.vision_language.vqa_scenario.VQAScenario", args={}
-    )
-
-    # TODO: finalize the adapter spec parameters once we add more models
-    adapter_spec: AdapterSpec = get_vlm_generation_adapter_spec(
-        input_prefix="User: ",
-        input_suffix="<end_of_utterance>",
-        output_prefix="\nAssistant: ",
-        output_suffix="<end_of_utterance>",
-        stop_sequences=["<end_of_utterance>"],
-    )
-
-    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
-
-    run_spec_name: str = "vqa"
-    return RunSpec(
-        name=run_spec_name,
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        metric_specs=metric_specs,
-        groups=[run_spec_name],
-    )
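For reference, a minimal sketch of the zero-shot prompt shape these prefixes and suffixes imply (the question string is a hypothetical VQA input, and this is an illustration of the format only, not the adapter's actual rendering code):

input_prefix = "User: "
input_suffix = "<end_of_utterance>"
output_prefix = "\nAssistant: "

question = "What is shown in the image?"  # hypothetical example input
prompt = f"{input_prefix}{question}{input_suffix}{output_prefix}"
print(repr(prompt))
# 'User: What is shown in the image?<end_of_utterance>\nAssistant: '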
helm/benchmark/window_services/ai21_window_service.py DELETED
@@ -1,258 +0,0 @@
-import re
-
-from typing import List, Optional, Tuple
-from urllib.parse import unquote
-
-from helm.common.tokenization_request import (
-    TokenizationRequest,
-    TokenizationRequestResult,
-    TokenizationToken,
-    TextRange,
-)
-from .window_service import WindowService, EncodeResult
-from .tokenizer_service import TokenizerService
-from .gpt2_window_service import GPT2WindowService
-
-
-class AI21WindowService(WindowService):
-    """Tokenizes by making a request to the proxy server with REST endpoint: `/api/tokenize`."""
-
-    # AI21's tokenizer API rejects a tokenization request if the input sequence is too long, so
-    # we need to set an upper limit for the length of the request. Empirically, if the GPT2 tokenizer tokenizes a
-    # sequence to <= 11000 tokens, then it is most likely safe to assume that AI21's tokenization API will
-    # process this request.
-    MAX_TOKENIZATION_REQUEST_LENGTH: int = 11000
-
-    # The AI21 tokenizer throws the following error when sending a request with text that has too many characters:
-    # "Text must be under 100,000 characters (type=value_error)"
-    # Sending a request with 100,000 characters seem to work though.
-    MAX_CHARACTER_LENGTH: int = 100_000
-
-    NOT_IMPLEMENTED_ERROR_MESSAGE: str = (
-        "AI21 only gave API access to their tokenizer, so this method is not supported."
-    )
-
-    def __init__(self, service: TokenizerService, gpt2_window_service: GPT2WindowService):
-        # We need the `TokenizerService` to make requests to the server.
-        self.service: TokenizerService = service
-        # As explained above, we need a `GPT2WindowService` to help tokenize long text sequences.
-        self.gpt2_window_service: GPT2WindowService = gpt2_window_service
-
-    @property
-    def tokenizer_name(self) -> str:
-        """Name of the tokenizer to use when sending a request."""
-        return "ai21/j1"
-
-    @property
-    def max_sequence_length(self) -> int:
-        """
-        The max token length of the model in. The AI21 server automatically prepends a token to every prompt,
-        so the actual max sequence length is 2048-1 = 2047.
-        """
-        return 2047
-
-    @property
-    def max_request_length(self) -> int:
-        """The max sequence length is the same as the max request length for AI21."""
-        return self.max_sequence_length
-
-    @property
-    def end_of_text_token(self) -> str:
-        # TODO: I'm not sure what their end of text token is. I don't think it's documented.
-        return " "
-
-    @property
-    def prefix_token(self) -> str:
-        """AI21 tokenizers do no have a prefix token"""
-        return ""
-
-    def encode(self, text: str, truncation: bool = False, max_length: Optional[int] = None) -> EncodeResult:
-        """
-        Encodes the input text to tokens.
-        """
-        tokens: List[TokenizationToken]
-        normalized_text: str
-        tokens, normalized_text = self._make_long_tokenization_request(text)
-        # The end position of the last token should be the end of the text.
-        if len(tokens) > 0:
-            assert tokens[-1].text_range is not None
-            assert tokens[-1].text_range.end == len(normalized_text)
-
-        return EncodeResult(text=normalized_text, tokens=tokens)
-
-    def decode(self, tokens: List[TokenizationToken], normalized_text: Optional[str] = None) -> str:
-        """
-        Given the model and a list of tokens, outputs the corresponding text.
-
-        For models using the GPT-2 tokenizer, the tokens are integers; for AI21
-        models, the tokens are `TokenizationToken`s.
-
-        Some tokenizers (e.g. AI21) normalize the text before encoding it and
-        thus require the `normalized_text` for decoding.
-        """
-        if not tokens:
-            return ""
-
-        # `normalized_text` is necessary for decoding AI21 tokens.
-        assert normalized_text, "The AI21 tokenizer needs `normalized_text` for decoding"
-        for j in range(len(tokens) - 1):
-            first_text_range = tokens[j].text_range
-            second_text_range = tokens[j + 1].text_range
-            assert first_text_range is not None
-            assert second_text_range is not None
-            assert (
-                first_text_range.end == second_text_range.start
-            ), "The tokens to be decoded must form a substring of `normalized_text`."
-
-        token_texts: List[str] = []
-        # The format of AI21 byte token representations. e.g. <0xE8>
-        byte_pattern = "<0x[0-9A-F]{2}>"
-        i: int = 0
-        while i < len(tokens):
-            # If there are byte tokens, aggregates them to a string
-            token_value = tokens[i].value
-            assert isinstance(token_value, str)
-            if re.match(byte_pattern, token_value):
-                bytestring = ""
-                while i < len(tokens) and re.match(byte_pattern, token_value):
-                    # e.g. <0xE8> -> \xE8
-                    bytestring += "\\" + token_value[2:-1]
-                    i += 1
-                # Convert to encoded URI (e.g., %e2%80%99) and decode
-                token_text = unquote(bytestring.replace("\\x", "%"))
-            # Not a byte token: retrieves the token text based on text_range.
-            else:
-                token: TokenizationToken = tokens[i]
-                assert token.text_range is not None
-                token_text = normalized_text[token.text_range.start : token.text_range.end]
-                i += 1
-            token_texts.append(token_text)
-        return "".join(token_texts)
-
-    def tokenize(self, text: str) -> List[str]:
-        """
-        Tokenizes the text via the /api/tokenize REST endpoint.
-        """
-        response: TokenizationRequestResult = self._make_tokenization_request(text)
-        result = []
-        for token in response.tokens:
-            assert isinstance(token.value, str)
-            result.append(token.value)
-        return result
-
-    def get_num_tokens(self, text: str) -> int:
-        """Tokenizes the text using the GPT-2 tokenizer and returns the number of tokens."""
-        return len(self.tokenize(text))
-
-    def fits_within_context_window(self, text: str, expected_completion_token_length: int = 0) -> bool:
-        return (
-            len(text) <= AI21WindowService.MAX_CHARACTER_LENGTH
-            and self.get_num_tokens(text) + expected_completion_token_length <= self.max_request_length
-        )
-
-    def truncate_from_right(self, text: str, expected_completion_token_length: int = 0) -> str:
-        """
-        Truncates the text using the AI21 Jurassic tokenizer.
-        First, ensures the text is shorter than `AI21Tokenizer.MAX_CHARACTER_LENGTH` long.
-        Tokenizes, then truncates the list of tokens to fit within the context window minus the
-        expected completion length (defaults to 0), then uses the start of the text range of the first
-        token and the end of the text range of the last token of the truncated list of tokens to
-        build the truncated text.
-        """
-        text = text[: AI21WindowService.MAX_CHARACTER_LENGTH]
-        response: TokenizationRequestResult = self._make_tokenization_request(text)
-
-        # Only look at the first `self.max_request_length` - `expected_completion_token_length`
-        # number of tokens to the fit the text within the context window.
-        # Each token is represented like this: {'text': '▁Hello', 'textRange': {'start': 0, 'end': 5}}
-        max_length: int = self.max_request_length - expected_completion_token_length
-        tokens: List[TokenizationToken] = response.tokens[:max_length]
-
-        # If there is no tokens, just return the original text
-        if len(tokens) == 0:
-            return text
-
-        # AI21 uses "_" to represent a single space in their tokens, so we have to build the new text from the
-        # original text after truncation using the text ranges of tokens generated from the original text.
-        assert tokens[0].text_range is not None
-        first_text_range: TextRange = tokens[0].text_range
-        assert tokens[-1].text_range is not None
-        last_text_range: TextRange = tokens[-1].text_range
-        start: int = first_text_range.start
-        end: int = last_text_range.end
-        truncated_text: str = text[start:end]
-
-        # HACK: For the vast majority of cases, the above logic works, but there are a few where the
-        # token count exceeds `max_length` by 1. This might be a bug with the AI21 tokenizer API.
-        # We handle those by removing characters one by one until it fits within the context window.
-        while not self.fits_within_context_window(truncated_text, expected_completion_token_length):
-            end -= 1
-            truncated_text = text[start:end]
-        return truncated_text
-
-    def _make_tokenization_request(self, text: str) -> TokenizationRequestResult:
-        """Sends a request to the server to tokenize the text via the `TokenizerService`."""
-        return self.service.tokenize(TokenizationRequest(text=text, tokenizer=self.tokenizer_name))
-
-    def _make_long_tokenization_request(self, text: str) -> Tuple[List[TokenizationToken], str]:
-        """If the text is too long (longer than 11,000 tokens when tokenized by the GPT-2 tokenizer),
-        the AI21 server will close the connection. Therefore, we need to split the text into smaller
-        chunks, tokenize each chunk, and re-assemble the tokenization results."""
-        # Uses the number of gpt2-style tokens as a measure of text length.
-        gpt2_tokens: List[TokenizationToken] = self.gpt2_window_service.encode(text).tokens
-
-        # If the text is short, just makes one request and returns the result.
-        if len(gpt2_tokens) < AI21WindowService.MAX_TOKENIZATION_REQUEST_LENGTH:
-            result: TokenizationRequestResult = self._make_tokenization_request(text)
-            return result.tokens, result.text
-        # Otherwise, splits the text to chunks, tokenizes each chunk, and re-assembles them.
-        else:
-            all_tokens: List[TokenizationToken] = []
-            normalized_text_chunks: List[str] = []
-            # The number of gpt2-style tokens we have tokenized with the AI21 tokenizer.
-            num_processed_tokens: int = 0
-            # The length of the (normalized) text string we have tokenized with the AI21 tokenizer.
-            num_processed_positions: int = 0
-            while num_processed_tokens < len(gpt2_tokens):
-                token_chunk_size: int = min(
-                    len(gpt2_tokens) - num_processed_tokens, AI21WindowService.MAX_TOKENIZATION_REQUEST_LENGTH
-                )
-                token_chunk: List[TokenizationToken] = gpt2_tokens[
-                    num_processed_tokens : num_processed_tokens + token_chunk_size
-                ]
-                text_chunk: str = self.gpt2_window_service.decode(token_chunk)
-                # We need to avoid generating byte tokens when splitting the text
-                while text_chunk.endswith("\ufffd"):
-                    token_chunk_size -= 1
-                    token_chunk = gpt2_tokens[num_processed_tokens : num_processed_tokens + token_chunk_size]
-                    text_chunk = self.gpt2_window_service.decode(token_chunk)
-                chunk_result: TokenizationRequestResult = self._make_tokenization_request(text_chunk)
-                chunk_tokens: List[TokenizationToken]
-                normalized_text_chunk: str
-                chunk_tokens, normalized_text_chunk = chunk_result.tokens, chunk_result.text
-                # Removes the empty tokens introduced by the split.
-                assert chunk_tokens[0].text_range is not None
-                if num_processed_tokens != 0 and chunk_tokens[0].text_range.start == chunk_tokens[0].text_range.end:
-                    chunk_tokens = chunk_tokens[1:]
-                else:
-                    chunk_tokens = chunk_tokens[:]
-
-                # Shifts the start and end index of each token
-                shifted_tokens: List[TokenizationToken] = []
-                for token in chunk_tokens:
-                    assert token.text_range is not None
-                    shifted_tokens.append(
-                        TokenizationToken(
-                            value=token.value,
-                            text_range=TextRange(
-                                start=token.text_range.start + num_processed_positions,
-                                end=token.text_range.end + num_processed_positions,
-                            ),
-                        )
-                    )
-                all_tokens.extend(shifted_tokens)
-                normalized_text_chunks.append(normalized_text_chunk)
-                num_processed_tokens += token_chunk_size
-                num_processed_positions += len(normalized_text_chunk)
-
-            return all_tokens, "".join(normalized_text_chunks)
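The decode path above folds runs of AI21 byte tokens into a percent-encoded string and relies on urllib.parse.unquote to reassemble multi-byte UTF-8 characters. A self-contained sketch of that conversion, using hypothetical token values rather than a real API response:

from urllib.parse import unquote

# Hypothetical AI21-style byte tokens for "\u2019" (right single quotation mark),
# whose UTF-8 encoding is the byte sequence E2 80 99.
byte_tokens = ["<0xE2>", "<0x80>", "<0x99>"]

# "<0xE2>" -> "%E2", etc., mirroring the bytestring.replace("\\x", "%") step above.
percent_encoded = "".join("%" + token[3:-1] for token in byte_tokens)

decoded = unquote(percent_encoded)  # unquote() decodes the percent-encoded UTF-8 bytes
assert decoded == "\u2019"
print(decoded)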
helm/benchmark/window_services/cohere_window_service.py DELETED
@@ -1,163 +0,0 @@
-from typing import List, Optional
-
-from helm.proxy.tokenizers.cohere_tokenizer import CohereTokenizer
-from .local_window_service import LocalWindowService
-from .tokenizer_service import TokenizerService
-from .window_service import EncodeResult
-from helm.common.tokenization_request import (
-    TokenizationRequest,
-    TokenizationRequestResult,
-    TokenizationToken,
-)
-
-
-class CohereWindowService(LocalWindowService):
-    def __init__(self, service: TokenizerService):
-        super().__init__(service)
-
-    @property
-    def tokenizer_name(self) -> str:
-        return "cohere/cohere"
-
-    @property
-    def max_sequence_length(self) -> int:
-        """
-        The max length of the model input. Similar to MT-NLG, Cohere does not predict the logprob of
-        the first input token so `max_sequence_length` is one token shorter than `max_request_length`.
-        """
-        return self.max_request_length - 1
-
-    @property
-    def max_request_length(self) -> int:
-        """
-        The max request length of the model. For Cohere, this is the same as the `max_sequence_length`.
-        If we exceed the `max_sequence_length`, we get the following error:
-
-        Request failed with too many tokens: total number of tokens (prompt and prediction) cannot
-        exceed 2048 - received 2049. Try using a shorter prompt or a smaller max_tokens value.
-        """
-        return 2048
-
-    @property
-    def end_of_text_token(self) -> str:
-        """
-        The end of text token. Cohere does not have one.
-        """
-        return ""
-
-    @property
-    def prefix_token(self) -> str:
-        """
-        The prefix token. Cohere does not return the log prob for the first token when `echo_prompt` is True.
-        """
-        # Cohere recommended ":", but we can try out different values
-        return ":"
-
-    def encode(self, text: str, truncation: bool = False, max_length: Optional[int] = None) -> EncodeResult:
-        """
-        Encodes the input text to tokens.
-        """
-        if max_length is None:
-            max_length = self.max_request_length
-
-        response: TokenizationRequestResult
-        tokens: List[TokenizationToken] = []
-        if truncation or len(text) <= CohereTokenizer.TOKENIZE_API_MAX_TEXT_LENGTH:
-            response = self.service.tokenize(
-                TokenizationRequest(
-                    text,
-                    tokenizer=self.tokenizer_name,
-                    # The Cohere API does not support decoding, so set `encode` to False to get the value of tokens
-                    # as strings so we can simply concatenate them when we need to decode.
-                    encode=False,
-                    truncation=truncation,
-                    max_length=max_length,
-                )
-            )
-            tokens = response.tokens
-        else:
-            # Perform chunk encoding: Cohere doesn't support long sequences, so break it up into chunks
-            # and make a request for each chunk.
-            # This can potentially break up valid tokens at the end of the chunk, but the chunk size
-            # is large enough that this happens infrequently.
-            chunk_size: int = CohereTokenizer.TOKENIZE_API_MAX_TEXT_LENGTH
-            for i in range(0, len(text), chunk_size):
-                chunk: str = text[i : chunk_size + i]
-                response = self.service.tokenize(
-                    TokenizationRequest(chunk, tokenizer=self.tokenizer_name, encode=False, truncation=False)
-                )
-                tokens.extend(response.tokens)
-
-        return EncodeResult(text=text, tokens=tokens)
-
-    def get_num_tokens(self, text: str) -> int:
-        """Tokenizes the text and returns the number of tokens."""
-        # We need this check since we can't pass in empty string via the `tokenize` endpoint
-        if len(text) == 0:
-            return 0
-        return len(self.encode(text).tokens)
-
-    def decode(self, tokens: List[TokenizationToken], normalized_text: Optional[str] = None) -> str:
-        """
-        The Cohere API does not support decoding, but we're able to recover the original text from the
-        values of the tokens by concatenating them.
-
-        Note this logic currently only works with English text.
-        """
-        token_strings = []
-        for token in tokens:
-            assert isinstance(token.value, str)
-            token_strings.append(token.value)
-        return "".join(token_strings)
-
-    def fits_within_context_window(self, text: str, expected_completion_token_length: int = 0) -> bool:
-        """
-        Checks if the given text fits within the context window given by `max_request_length`
-        taking to account the expected completion length (defaults to 0).
-
-        According to https://docs.cohere.ai/tokenize-reference#request, for tokenize, text: "the string to
-        be tokenized, the minimum text length is 1 character, and the maximum text length is 65,536 characters.",
-        so first check if the text has fewer than 65,536 characters.
-        """
-        return (
-            len(text) <= CohereTokenizer.TOKENIZE_API_MAX_TEXT_LENGTH
-            and self.get_num_tokens(text) + expected_completion_token_length <= self.max_request_length
-        )
-
-    def truncate_from_right(self, text: str, expected_completion_token_length: int = 0) -> str:
-        """
-        Truncates text from the right to fit within the context window given by `max_request_length`
-        minus the expected completion length (defaults to 0).
-        """
-        # First truncate the text so it's within `CohereClient.TOKENIZE_MAX_TEXT_LENGTH` length.
-        text = text[: CohereTokenizer.TOKENIZE_API_MAX_TEXT_LENGTH]
-
-        max_length: int = self.max_request_length - expected_completion_token_length
-        result: str = self.decode(self.encode(text, truncation=True, max_length=max_length).tokens)
-
-        # HACK: For the vast majority of cases, the above logic works, but it sometimes doesn't work
-        # for non-English text, since Cohere technically only supports English at the moment.
-        while not self.fits_within_context_window(result, expected_completion_token_length):
-            result = result[:-1]
-
-        return result
-
-
-class CohereCommandWindowService(CohereWindowService):
-    def __init__(self, service: TokenizerService):
-        super().__init__(service)
-
-    @property
-    def max_request_length(self) -> int:
-        """
-        The max request length of the model. For Cohere, this is the same as the `max_sequence_length`.
-        If we exceed the `max_sequence_length`, we get the following error:
-
-        Request failed with too many tokens: total number of tokens (prompt and prediction) cannot
-        exceed 2048 - received 2049. Try using a shorter prompt or a smaller max_tokens value.
-
-        For the Command model, in rare situations, the co.tokenize returns a shorter list of tokens
-        than the co.generate. This causes sequence length errors for rare inputs. Cohere's advice is
-        to reduce the sequence length to 2020 to avoid these issues.
-        """
-        return 2020
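The chunked-encoding branch above follows a simple pattern: split the text into fixed-size character chunks, tokenize each chunk separately, and concatenate the per-chunk token lists. A stand-alone sketch of that loop, with a whitespace tokenizer standing in for the remote tokenize call (a real client would issue one API request per chunk):

from typing import List

CHUNK_SIZE = 65_536  # mirrors CohereTokenizer.TOKENIZE_API_MAX_TEXT_LENGTH


def tokenize_chunk(chunk: str) -> List[str]:
    # Stand-in for the remote tokenize endpoint; whitespace splitting is only illustrative.
    return chunk.split()


def encode_in_chunks(text: str) -> List[str]:
    tokens: List[str] = []
    for i in range(0, len(text), CHUNK_SIZE):
        tokens.extend(tokenize_chunk(text[i : i + CHUNK_SIZE]))
    return tokens


# Chunk boundaries may split a word here, just as they may split a token in the real client.
print(len(encode_in_chunks("lorem ipsum " * 20_000)))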
helm/benchmark/window_services/flan_t5_window_service.py DELETED
@@ -1,29 +0,0 @@
-from .encoder_decoder_window_service import EncoderDecoderWindowService
-from .tokenizer_service import TokenizerService
-
-
-class FlanT5WindowService(EncoderDecoderWindowService):
-    def __init__(self, service: TokenizerService):
-        super().__init__(service)
-
-    @property
-    def max_sequence_length(self) -> int:
-        """Return the max sequence length."""
-        # We subtract 1 to account for <extra_id_0> that gets appended to prompts.
-        return 512 - 1
-
-    @property
-    def end_of_text_token(self) -> str:
-        """The end of text token."""
-        return "</s>"
-
-    @property
-    def tokenizer_name(self) -> str:
-        """Name of the tokenizer to use when sending a request."""
-        return "google/flan-t5-xxl"
-
-    @property
-    def prefix_token(self) -> str:
-        """The prefix token is the same as the end of text token."""
-        # echo=True is not supported
-        return ""
helm/benchmark/window_services/gpt2_window_service.py DELETED
@@ -1,32 +0,0 @@
-from .local_window_service import LocalWindowService
-from .tokenizer_service import TokenizerService
-
-
-class GPT2WindowService(LocalWindowService):
-    def __init__(self, service: TokenizerService):
-        super().__init__(service)
-
-    @property
-    def max_sequence_length(self) -> int:
-        """Return the max sequence length of this tokenizer."""
-        return 1024
-
-    @property
-    def max_request_length(self) -> int:
-        """Return the max request length of GPT-2."""
-        return self.max_sequence_length + 1
-
-    @property
-    def end_of_text_token(self) -> str:
-        """The end of text token."""
-        return "<|endoftext|>"
-
-    @property
-    def tokenizer_name(self) -> str:
-        """Name of the tokenizer to use when sending a request."""
-        return "huggingface/gpt2"
-
-    @property
-    def prefix_token(self) -> str:
-        """The prefix token for models that uses the GPT-2 tokenizer is the end of text token."""
-        return self.end_of_text_token
helm/benchmark/window_services/huggingface_window_service.py DELETED
@@ -1,60 +0,0 @@
-from typing import Optional
-from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
-from .local_window_service import LocalWindowService
-from .tokenizer_service import TokenizerService
-
-
-class HuggingFaceWindowService(LocalWindowService):
-    def __init__(
-        self,
-        service: TokenizerService,
-        tokenizer_name: str,
-        pretrained_model_name_or_path: Optional[str] = None,
-        max_sequence_length: Optional[int] = None,
-        max_request_length: Optional[int] = None,
-        end_of_text_token: Optional[str] = None,
-        prefix_token: Optional[str] = None,
-        **kwargs
-    ):
-        super().__init__(service)
-        self._tokenizer_name = tokenizer_name
-        # Override max_sequence_length, max_request_length, end_of_text_token
-        # and prefix_token if provided as an argument.
-        # Otherwise, auto-infer them from the Hugging Face tokenizer.
-        #
-        # Note that many Hugging Face tokenizers have incorrect sequence lengths,
-        # so it is recommended to set this manually.
-        with HuggingFaceTokenizer.get_tokenizer(
-            helm_tokenizer_name=tokenizer_name,
-            pretrained_model_name_or_path=pretrained_model_name_or_path or tokenizer_name,
-            **kwargs,
-        ) as tokenizer:
-            self._max_sequence_length = max_sequence_length or tokenizer.model_max_length
-            self._max_request_length = max_request_length or self._max_sequence_length
-            self._end_of_text_token = end_of_text_token or tokenizer.eos_token or ""
-            self._prefix_token = prefix_token or tokenizer.bos_token or ""
-
-    @property
-    def tokenizer_name(self) -> str:
-        """Name of the tokenizer to use when sending a request."""
-        return self._tokenizer_name
-
-    @property
-    def max_sequence_length(self) -> int:
-        """Return the max sequence length of this tokenizer."""
-        return self._max_sequence_length
-
-    @property
-    def max_request_length(self) -> int:
-        """Return the max request length of this tokenizer."""
-        return self._max_request_length
-
-    @property
-    def end_of_text_token(self) -> str:
-        """The end of text token."""
-        return self._end_of_text_token
-
-    @property
-    def prefix_token(self) -> str:
-        """The prefix token."""
-        return self._prefix_token
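The constructor above either takes explicit window parameters or infers them from the underlying Hugging Face tokenizer. A rough stand-alone sketch of that fallback logic, assuming the transformers package is installed and using gpt2 purely as an example checkpoint (this is not the HELM class itself):

from typing import Optional

from transformers import AutoTokenizer


def resolve_window_params(
    pretrained_model_name_or_path: str,
    max_sequence_length: Optional[int] = None,
    end_of_text_token: Optional[str] = None,
    prefix_token: Optional[str] = None,
) -> dict:
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
    return {
        # Explicit values win; otherwise fall back to what the tokenizer reports.
        # Many tokenizers report an incorrect model_max_length, hence the advice
        # above to set it manually.
        "max_sequence_length": max_sequence_length or tokenizer.model_max_length,
        "end_of_text_token": end_of_text_token or tokenizer.eos_token or "",
        "prefix_token": prefix_token or tokenizer.bos_token or "",
    }


print(resolve_window_params("gpt2"))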
helm/benchmark/window_services/t0pp_window_service.py DELETED
@@ -1,35 +0,0 @@
-from .encoder_decoder_window_service import EncoderDecoderWindowService
-from .tokenizer_service import TokenizerService
-
-
-class T0ppWindowService(EncoderDecoderWindowService):
-    def __init__(self, service: TokenizerService):
-        super().__init__(service)
-
-    @property
-    def max_sequence_length(self) -> int:
-        """Return the max sequence length."""
-        # From https://arxiv.org/pdf/2110.08207.pdf, "we truncate input and target sequences to 1024 and 256 tokens,
-        # respectively. Following Raffel et al. (2020), we use packing to combine multiple training examples into
-        # a single sequence to reach the maximum sequence length."
-        return 1024
-
-    @property
-    def max_output_length(self) -> int:
-        return 256
-
-    @property
-    def end_of_text_token(self) -> str:
-        """The end of text token."""
-        return "</s>"
-
-    @property
-    def tokenizer_name(self) -> str:
-        """Name of the tokenizer to use when sending a request."""
-        return "bigscience/T0pp"
-
-    @property
-    def prefix_token(self) -> str:
-        """The prefix token is the same as the end of text token."""
-        # echo=True is not supported
-        return ""