PyPI - crfm-helm - Versions diffs - 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl - Mend

crfm-helm 0.5.4py3-none-any.whl → 0.5.6py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crfm-helm might be problematic. Click here for more details.

Files changed (652) hide show

crfm_helm-0.5.6.dist-info/METADATA +427 -0
crfm_helm-0.5.6.dist-info/RECORD +941 -0
{crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
helm/benchmark/adaptation/adapter_spec.py +13 -1
helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
helm/benchmark/adaptation/common_adapter_specs.py +69 -4
helm/benchmark/adaptation/prompt.py +1 -1
helm/benchmark/annotation/aci_bench_annotator.py +95 -0
helm/benchmark/annotation/air_bench_annotator.py +21 -6
helm/benchmark/annotation/annotator.py +5 -0
helm/benchmark/annotation/annotator_factory.py +3 -20
helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
helm/benchmark/annotation/bird_sql_annotator.py +58 -0
helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
helm/benchmark/annotation/dischargeme_annotator.py +107 -0
helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
helm/benchmark/annotation/live_qa_annotator.py +1 -1
helm/benchmark/annotation/med_dialog_annotator.py +99 -0
helm/benchmark/annotation/medalign_annotator.py +100 -0
helm/benchmark/annotation/medi_qa_annotator.py +98 -0
helm/benchmark/annotation/medication_qa_annotator.py +87 -63
helm/benchmark/annotation/mental_health_annotator.py +98 -0
helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
helm/benchmark/annotation/model_as_judge.py +214 -6
helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
helm/benchmark/annotation/omni_math_annotator.py +131 -0
helm/benchmark/annotation/spider_annotator.py +18 -0
helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
helm/benchmark/annotation/wildbench_annotator.py +119 -0
helm/benchmark/annotation_executor.py +35 -15
helm/benchmark/augmentations/cleva_perturbation.py +9 -8
helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
helm/benchmark/augmentations/dialect_perturbation.py +4 -5
helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
helm/benchmark/augmentations/gender_perturbation.py +2 -2
helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
helm/benchmark/augmentations/person_name_perturbation.py +4 -5
helm/benchmark/augmentations/perturbation.py +1 -1
helm/benchmark/augmentations/space_perturbation.py +2 -2
helm/benchmark/augmentations/suffix_perturbation.py +2 -2
helm/benchmark/augmentations/synonym_perturbation.py +4 -3
helm/benchmark/augmentations/test_perturbation.py +16 -13
helm/benchmark/augmentations/translate_perturbation.py +2 -2
helm/benchmark/augmentations/typos_perturbation.py +2 -2
helm/benchmark/data_preprocessor.py +2 -2
helm/benchmark/executor.py +11 -12
helm/benchmark/huggingface_registration.py +2 -7
helm/benchmark/metrics/aci_bench_metrics.py +14 -0
helm/benchmark/metrics/basic_metrics.py +6 -6
helm/benchmark/metrics/bbq_metrics.py +2 -2
helm/benchmark/metrics/bias_metrics.py +12 -3
helm/benchmark/metrics/bias_word_lists.py +1 -1
helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
helm/benchmark/metrics/bird_sql_metrics.py +28 -0
helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
helm/benchmark/metrics/classification_metrics.py +76 -12
helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
helm/benchmark/metrics/code_metrics.py +5 -5
helm/benchmark/metrics/comet_metric.py +125 -0
helm/benchmark/metrics/common_metric_specs.py +9 -2
helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
helm/benchmark/metrics/copyright_metrics.py +4 -4
helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
helm/benchmark/metrics/dischargeme_metrics.py +14 -0
helm/benchmark/metrics/disinformation_metrics.py +4 -4
helm/benchmark/metrics/dry_run_metrics.py +5 -5
helm/benchmark/metrics/efficiency_metrics.py +6 -6
helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
helm/benchmark/metrics/ifeval/__init__.py +0 -0
helm/benchmark/metrics/ifeval/instructions.py +1574 -0
helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
helm/benchmark/metrics/ifeval_metrics.py +55 -0
helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
helm/benchmark/metrics/language_modeling_metrics.py +4 -4
helm/benchmark/metrics/llm_jury_metrics.py +46 -0
helm/benchmark/metrics/machine_translation_metrics.py +2 -2
helm/benchmark/metrics/med_dialog_metrics.py +14 -0
helm/benchmark/metrics/medalign_metrics.py +14 -0
helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
helm/benchmark/metrics/medec_metrics.py +101 -0
helm/benchmark/metrics/medi_qa_metrics.py +14 -0
helm/benchmark/metrics/medication_qa_metrics.py +10 -19
helm/benchmark/metrics/melt_bias_metric.py +234 -0
helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
helm/benchmark/metrics/melt_metric_specs.py +43 -0
helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
helm/benchmark/metrics/mental_health_metrics.py +14 -0
helm/benchmark/metrics/metric.py +3 -3
helm/benchmark/metrics/metric_service.py +11 -11
helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
helm/benchmark/metrics/nltk_helper.py +32 -0
helm/benchmark/metrics/numeracy_metrics.py +4 -4
helm/benchmark/metrics/omni_math_metrics.py +32 -0
helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
helm/benchmark/metrics/output_processing_metric.py +60 -0
helm/benchmark/metrics/output_processors.py +15 -0
helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
helm/benchmark/metrics/ranking_metrics.py +3 -3
helm/benchmark/metrics/reference_metric.py +3 -3
helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
helm/benchmark/metrics/spider_metrics.py +7 -0
helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
helm/benchmark/metrics/statistic.py +1 -1
helm/benchmark/metrics/summac/model_summac.py +2 -3
helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
helm/benchmark/metrics/summarization_metrics.py +20 -9
helm/benchmark/metrics/test_bias_metrics.py +5 -1
helm/benchmark/metrics/test_classification_metrics.py +140 -68
helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
helm/benchmark/metrics/test_metric.py +1 -1
helm/benchmark/metrics/test_statistic.py +2 -2
helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
helm/benchmark/metrics/toxicity_metrics.py +6 -6
helm/benchmark/metrics/unitxt_metrics.py +7 -5
helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
helm/benchmark/metrics/vision_language/image_utils.py +2 -2
helm/benchmark/metrics/wildbench_metrics.py +34 -0
helm/benchmark/model_deployment_registry.py +6 -8
helm/benchmark/model_metadata_registry.py +16 -0
helm/benchmark/presentation/contamination.py +3 -3
helm/benchmark/presentation/create_plots.py +33 -12
helm/benchmark/presentation/run_display.py +13 -0
helm/benchmark/presentation/schema.py +2 -1
helm/benchmark/presentation/summarize.py +97 -67
helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
helm/benchmark/reeval_run.py +202 -0
helm/benchmark/reeval_runner.py +355 -0
helm/benchmark/run.py +86 -90
helm/benchmark/run_expander.py +90 -9
helm/benchmark/run_spec_factory.py +13 -0
helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
helm/benchmark/run_specs/audio_run_specs.py +657 -0
helm/benchmark/run_specs/call_center_run_specs.py +49 -0
helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
helm/benchmark/run_specs/classic_run_specs.py +1 -69
helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
helm/benchmark/run_specs/experimental_run_specs.py +142 -3
helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
helm/benchmark/run_specs/lite_run_specs.py +2 -2
helm/benchmark/run_specs/long_context_run_specs.py +141 -0
helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
helm/benchmark/run_specs/melt_run_specs.py +783 -0
helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
helm/benchmark/run_specs/oab_exams_specs.py +32 -0
helm/benchmark/run_specs/safety_run_specs.py +37 -0
helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
helm/benchmark/run_specs/sql_run_specs.py +54 -0
helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
helm/benchmark/run_specs/vlm_run_specs.py +103 -2
helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
helm/benchmark/runner.py +5 -5
helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
helm/benchmark/scenarios/air_bench_scenario.py +6 -1
helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
helm/benchmark/scenarios/audio_language/__init__.py +0 -0
helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
helm/benchmark/scenarios/banking77_scenario.py +6 -1
helm/benchmark/scenarios/bbq_scenario.py +1 -1
helm/benchmark/scenarios/big_bench_scenario.py +11 -1
helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
helm/benchmark/scenarios/blimp_scenario.py +1 -1
helm/benchmark/scenarios/bold_scenario.py +1 -1
helm/benchmark/scenarios/boolq_scenario.py +1 -1
helm/benchmark/scenarios/casehold_scenario.py +79 -0
helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
helm/benchmark/scenarios/clear_scenario.py +157 -0
helm/benchmark/scenarios/cleva_scenario.py +2 -2
helm/benchmark/scenarios/code_scenario.py +17 -4
helm/benchmark/scenarios/commonsense_scenario.py +1 -1
helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
helm/benchmark/scenarios/copyright_scenario.py +1 -1
helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
helm/benchmark/scenarios/disinformation_scenario.py +10 -1
helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
helm/benchmark/scenarios/gpqa_scenario.py +80 -0
helm/benchmark/scenarios/grammar.py +2 -2
helm/benchmark/scenarios/grammar_scenario.py +2 -2
helm/benchmark/scenarios/gsm_scenario.py +10 -1
helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
helm/benchmark/scenarios/headqa_scenario.py +136 -0
helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
helm/benchmark/scenarios/ice_scenario.py +8 -4
helm/benchmark/scenarios/ifeval_scenario.py +53 -0
helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
helm/benchmark/scenarios/imdb_scenario.py +11 -2
helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
helm/benchmark/scenarios/koala_scenario.py +1 -1
helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
helm/benchmark/scenarios/legal_support_scenario.py +11 -1
helm/benchmark/scenarios/legalbench_scenario.py +22 -3
helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
helm/benchmark/scenarios/lextreme_scenario.py +11 -1
helm/benchmark/scenarios/live_qa_scenario.py +1 -1
helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
helm/benchmark/scenarios/math_scenario.py +9 -1
helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
helm/benchmark/scenarios/med_qa_scenario.py +10 -1
helm/benchmark/scenarios/medalign_scenario.py +94 -0
helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
helm/benchmark/scenarios/medbullets_scenario.py +145 -0
helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
helm/benchmark/scenarios/medec_scenario.py +125 -0
helm/benchmark/scenarios/medhallu_scenario.py +72 -0
helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
helm/benchmark/scenarios/melt_scenarios.py +793 -0
helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
helm/benchmark/scenarios/mental_health_scenario.py +123 -0
helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
helm/benchmark/scenarios/mmlu_scenario.py +11 -1
helm/benchmark/scenarios/msmarco_scenario.py +1 -1
helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
helm/benchmark/scenarios/newsqa_scenario.py +1 -1
helm/benchmark/scenarios/numeracy_scenario.py +12 -2
helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
helm/benchmark/scenarios/omni_math_scenario.py +53 -0
helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
helm/benchmark/scenarios/quac_scenario.py +10 -1
helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
helm/benchmark/scenarios/raft_scenario.py +17 -2
helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
helm/benchmark/scenarios/scenario.py +9 -1
helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
helm/benchmark/scenarios/spider_scenario.py +91 -0
helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
helm/benchmark/scenarios/summarization_scenario.py +11 -1
helm/benchmark/scenarios/sumosum_scenario.py +157 -0
helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
helm/benchmark/scenarios/test_math_scenario.py +1 -0
helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
helm/benchmark/scenarios/the_pile_scenario.py +1 -1
helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
helm/benchmark/scenarios/unitxt_scenario.py +8 -2
helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
helm/benchmark/scenarios/vicuna_scenario.py +1 -1
helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
helm/benchmark/scenarios/wikifact_scenario.py +11 -1
helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
helm/benchmark/scenarios/wildbench_scenario.py +83 -0
helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
helm/benchmark/scenarios/xstest_scenario.py +1 -1
helm/benchmark/server.py +13 -1
helm/benchmark/slurm_runner.py +1 -1
helm/benchmark/static/schema_audio.yaml +763 -0
helm/benchmark/static/schema_autobencher.yaml +150 -0
helm/benchmark/static/schema_call_center.yaml +97 -60
helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
helm/benchmark/static/schema_czech_bank.yaml +148 -0
helm/benchmark/static/schema_enem_challenge.yaml +146 -0
helm/benchmark/static/schema_enterprise.yaml +319 -0
helm/benchmark/static/schema_finance.yaml +14 -12
helm/benchmark/static/schema_heim.yaml +1389 -0
helm/benchmark/static/schema_long_context.yaml +283 -0
helm/benchmark/static/schema_medhelm.yaml +1140 -0
helm/benchmark/static/schema_melt.yaml +1257 -0
helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
helm/benchmark/static/schema_safety.yaml +18 -1
helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
helm/benchmark/static/schema_slphelm.yaml +162 -0
helm/benchmark/static/schema_social_audio.yaml +224 -0
helm/benchmark/static/schema_sql.yaml +171 -0
helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
helm/benchmark/static/schema_vhelm.yaml +129 -56
helm/benchmark/static/schema_video.yaml +219 -0
helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
helm/benchmark/static_build/assets/index-94295e78.js +10 -0
helm/benchmark/static_build/assets/index-b9779128.css +1 -0
helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
helm/benchmark/static_build/config.js +1 -1
helm/benchmark/static_build/index.html +6 -6
helm/benchmark/window_services/default_window_service.py +1 -1
helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
helm/benchmark/window_services/ice_window_service.py +1 -1
helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
helm/benchmark/window_services/local_window_service.py +2 -2
helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
helm/benchmark/window_services/test_bloom_window_service.py +3 -3
helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
helm/benchmark/window_services/test_gptj_window_service.py +8 -3
helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
helm/benchmark/window_services/test_openai_window_service.py +8 -3
helm/benchmark/window_services/test_opt_window_service.py +3 -3
helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
helm/benchmark/window_services/test_t511b_window_service.py +3 -3
helm/benchmark/window_services/test_ul2_window_service.py +3 -3
helm/benchmark/window_services/test_utils.py +4 -5
helm/benchmark/window_services/test_yalm_window_service.py +3 -3
helm/benchmark/window_services/tokenizer_service.py +7 -8
helm/benchmark/window_services/yalm_window_service.py +1 -1
helm/clients/ai21_client.py +3 -3
helm/clients/aleph_alpha_client.py +1 -1
helm/clients/anthropic_client.py +69 -29
helm/clients/audio_language/__init__.py +0 -0
helm/clients/audio_language/diva_llama_client.py +120 -0
helm/clients/audio_language/llama_omni_client.py +198 -0
helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
helm/clients/audio_language/qwen_audiolm_client.py +152 -0
helm/clients/audio_language/test.py +62 -0
helm/clients/auto_client.py +4 -2
helm/clients/azure_openai_client.py +55 -0
helm/clients/bedrock_client.py +203 -7
helm/clients/bedrock_utils.py +33 -0
helm/clients/client.py +7 -7
helm/clients/clip_scorers/clip_scorer.py +1 -1
helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
helm/clients/cohere_client.py +3 -3
helm/clients/google_client.py +1 -1
helm/clients/grok_client.py +36 -0
helm/clients/http_model_client.py +1 -1
helm/clients/huggingface_client.py +52 -21
helm/clients/huggingface_pipeline_client.py +138 -0
helm/clients/ibm_client.py +267 -0
helm/clients/image_generation/adobe_vision_client.py +1 -1
helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
helm/clients/image_generation/cogview2_client.py +1 -1
helm/clients/image_generation/dalle2_client.py +1 -1
helm/clients/image_generation/dalle3_client.py +2 -2
helm/clients/image_generation/dalle_mini/__init__.py +1 -1
helm/clients/image_generation/dalle_mini/data.py +1 -1
helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
helm/clients/image_generation/dalle_mini_client.py +1 -1
helm/clients/image_generation/deep_floyd_client.py +1 -1
helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
helm/clients/image_generation/lexica_client.py +1 -1
helm/clients/image_generation/mindalle/models/__init__.py +6 -6
helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
helm/clients/image_generation/mindalle_client.py +1 -1
helm/clients/image_generation/together_image_generation_client.py +1 -1
helm/clients/lit_gpt_client.py +2 -2
helm/clients/mistral_client.py +62 -18
helm/clients/nvidia_nim_client.py +0 -3
helm/clients/openai_client.py +308 -43
helm/clients/openai_responses_client.py +174 -0
helm/clients/palmyra_client.py +3 -9
helm/clients/reka_client.py +3 -3
helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
helm/clients/stanfordhealthcare_claude_client.py +31 -0
helm/clients/stanfordhealthcare_google_client.py +43 -0
helm/clients/stanfordhealthcare_http_model_client.py +93 -0
helm/clients/stanfordhealthcare_openai_client.py +62 -0
helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
helm/clients/test_client.py +1 -1
helm/clients/test_together_client.py +6 -1
helm/clients/together_client.py +76 -9
helm/clients/upstage_client.py +23 -0
helm/clients/vertexai_client.py +45 -13
helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
helm/clients/vision_language/huggingface_vlm_client.py +2 -2
helm/clients/vision_language/idefics_client.py +6 -2
helm/clients/vision_language/open_flamingo/__init__.py +2 -2
helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
helm/clients/vision_language/paligemma_client.py +2 -2
helm/clients/vision_language/qwen2_vlm_client.py +188 -0
helm/clients/vision_language/qwen_vlm_client.py +7 -5
helm/clients/vllm_client.py +4 -6
helm/clients/writer_client.py +102 -0
helm/clients/yi_client.py +0 -3
helm/common/audio_utils.py +111 -0
helm/common/context.py +80 -0
helm/common/credentials_utils.py +5 -5
helm/common/file_caches/local_file_cache.py +1 -1
helm/common/file_caches/test_local_file_cache.py +1 -1
helm/common/general.py +9 -2
helm/common/hierarchical_logger.py +46 -3
helm/common/images_utils.py +2 -2
helm/common/local_context.py +140 -0
helm/common/media_object.py +2 -2
helm/common/multimodal_request_utils.py +26 -0
helm/common/reeval_parameters.py +12 -0
helm/common/remote_context.py +61 -0
helm/common/request.py +14 -2
helm/common/response_format.py +18 -0
helm/common/test_media_object.py +1 -1
helm/config/model_deployments.yaml +1792 -28
helm/config/model_metadata.yaml +1606 -51
helm/config/tokenizer_configs.yaml +521 -4
helm/proxy/cli.py +5 -3
helm/proxy/critique/mechanical_turk_utils.py +1 -1
helm/proxy/example_queries.py +1 -1
helm/proxy/server.py +11 -4
helm/proxy/services/remote_service.py +1 -1
helm/proxy/services/server_service.py +22 -86
helm/proxy/services/test_remote_service.py +2 -2
helm/proxy/services/test_service.py +1 -1
helm/proxy/static/general.js +122 -0
helm/proxy/static/help.html +99 -0
helm/proxy/static/index.css +57 -0
helm/proxy/static/index.html +40 -0
helm/proxy/static/index.js +456 -0
helm/proxy/static/info-icon.png +0 -0
helm/proxy/test_retry.py +1 -1
helm/proxy/token_counters/auto_token_counter.py +1 -1
helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
helm/tokenizers/caching_tokenizer.py +2 -30
helm/tokenizers/grok_tokenizer.py +53 -0
helm/tokenizers/http_model_tokenizer.py +1 -1
helm/tokenizers/huggingface_tokenizer.py +3 -3
helm/tokenizers/lit_gpt_tokenizer.py +1 -1
helm/tokenizers/test_anthropic_tokenizer.py +6 -2
helm/tokenizers/test_grok_tokenizer.py +33 -0
helm/tokenizers/test_huggingface_tokenizer.py +1 -1
helm/tokenizers/test_yalm_tokenizer.py +1 -1
helm/tokenizers/tiktoken_tokenizer.py +1 -1
helm/tokenizers/tokenizer.py +3 -1
helm/tokenizers/yalm_tokenizer.py +3 -3
helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
crfm_helm-0.5.4.dist-info/METADATA +0 -350
crfm_helm-0.5.4.dist-info/RECORD +0 -697
helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
helm/tokenizers/anthropic_tokenizer.py +0 -52
{crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
{crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
{crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0

helm/common/audio_utils.py ADDED Viewed

@@ -0,0 +1,111 @@
+from io import BytesIO
+from typing import Optional
+from filelock import FileLock
+import base64
+import os
+import ffmpeg
+import numpy as np
+import soundfile as sf
+import subprocess
+from helm.common.hierarchical_logger import hlog
+from helm.common.multimodal_request_utils import get_contents_as_bytes
+from helm.common.optional_dependencies import handle_module_not_found_error
+try:
+    import librosa
+except ModuleNotFoundError as e:
+    handle_module_not_found_error(e, ["audiolm"])
+def ensure_audio_file_exists_from_array(path: str, array: np.ndarray, sample_rate: int) -> None:
+    """Write the array to the wav or mp3 file if it does not already exist.
+    Uses file locking and an atomic rename to avoid file corruption due to incomplete writes and
+    concurrent writes."""
+    file_extension = os.path.splitext(path)[1]
+    if file_extension != ".wav" and file_extension != ".mp3":
+        raise ValueError(f"Path must end with .wav or .mp3: {path}")
+    with FileLock(f"{path}.lock"):
+        if os.path.exists(path):
+            # Skip because file already exists
+            return
+        path_prefix = path.removesuffix(file_extension)
+        tmp_path = f"{path_prefix}.tmp{file_extension}"
+        sf.write(tmp_path, array, samplerate=sample_rate)
+        os.rename(tmp_path, path)
+def get_array_from_audio_file(path: str, sample_rate: Optional[int]) -> np.ndarray:
+    """Get an array from an audio file"""
+    audio_file = (
+        BytesIO(get_contents_as_bytes(path)) if path.startswith("http://") or path.startswith("https://") else path
+    )
+    # librosa accepts a local file path or a file-like object
+    audio_array, _ = librosa.load(audio_file, sr=sample_rate)
+    return audio_array
+def use_ffmpeg_to_convert_audio_file(input_path: str, output_path: str) -> None:
+    if os.path.exists(output_path):
+        return
+    """Use ffmpeg to convert an audio file type"""
+    try:
+        subprocess.run(["ffmpeg", "-i", input_path, output_path], check=True)
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        raise ValueError("Please install ffmpeg using `bash install-shelm-extras.sh` first to convert audio files.")
+def is_invalid_audio_file(audio_path: str) -> bool:
+    """
+    Two conditions for an audio file to be considered invalid:
+    1. The file does not exist.
+    2. The file is empty.
+    """
+    if not os.path.exists(audio_path):
+        return True
+    try:
+        with sf.SoundFile(audio_path) as audio_file:
+            return len(audio_file) == 0
+    except RuntimeError:
+        return True
+def extract_audio(video_path: str, output_audio_path: str) -> None:
+    """
+    Extracts audio from an MP4 video file and saves it as an MP3 file.
+    Args:
+        video_path (str): Path to the input MP4 video file.
+        output_audio_path (str): Path to save the extracted MP3 audio file.
+    Returns:
+        None
+    """
+    try:
+        (
+            ffmpeg.input(video_path)
+            .output(output_audio_path, format="mp3", acodec="libmp3lame", audio_bitrate="192k")
+            .run(overwrite_output=True)
+        )
+    except ffmpeg.Error as e:
+        hlog(f"Error extracting audio from video: {video_path}: {e.stderr.decode()}")
+        raise e
+def encode_audio_to_base64(file_path: str) -> str:
+    """
+    Encodes an audio file to a Base64 string.
+    Args:
+        file_path (str): Path to the audio file.
+    Returns:
+        str: Base64-encoded string of the audio file.
+    """
+    assert os.path.exists(file_path), f"Audio file does not exist at path: {file_path}"
+    with open(file_path, "rb") as audio_file:
+        return base64.b64encode(audio_file.read()).decode("utf-8")

helm/common/context.py ADDED Viewed

@@ -0,0 +1,80 @@
+from abc import ABC, abstractmethod
+from helm.common.critique_request import CritiqueRequest, CritiqueRequestResult
+from helm.common.clip_score_request import CLIPScoreRequest, CLIPScoreResult
+from helm.common.file_upload_request import FileUploadResult, FileUploadRequest
+from helm.common.nudity_check_request import NudityCheckRequest, NudityCheckResult
+from helm.common.perspective_api_request import PerspectiveAPIRequestResult, PerspectiveAPIRequest
+from helm.common.moderations_api_request import ModerationAPIRequest, ModerationAPIRequestResult
+from helm.common.tokenization_request import (
+    TokenizationRequest,
+    TokenizationRequestResult,
+    DecodeRequest,
+    DecodeRequestResult,
+)
+from helm.common.request import Request, RequestResult
+from helm.proxy.query import Query, QueryResult
+from helm.common.cache import CacheConfig
+from helm.proxy.services.service import GeneralInfo
+class Context(ABC):
+    @abstractmethod
+    def get_general_info(self) -> GeneralInfo:
+        """Get general info."""
+        pass
+    @abstractmethod
+    def expand_query(self, query: Query) -> QueryResult:
+        """Turn the `query` into requests."""
+        pass
+    @abstractmethod
+    def make_request(self, request: Request) -> RequestResult:
+        """Actually make a request to an API."""
+        pass
+    @abstractmethod
+    def tokenize(self, request: TokenizationRequest) -> TokenizationRequestResult:
+        """Tokenize via an API."""
+        pass
+    @abstractmethod
+    def decode(self, request: DecodeRequest) -> DecodeRequestResult:
+        """Decodes to text."""
+        pass
+    @abstractmethod
+    def upload(self, request: FileUploadRequest) -> FileUploadResult:
+        """Uploads a file to external storage."""
+        pass
+    @abstractmethod
+    def check_nudity(self, request: NudityCheckRequest) -> NudityCheckResult:
+        """Check for nudity for a batch of images."""
+        pass
+    @abstractmethod
+    def compute_clip_score(self, request: CLIPScoreRequest) -> CLIPScoreResult:
+        """Computes CLIPScore for a given caption and image."""
+        pass
+    @abstractmethod
+    def get_toxicity_scores(self, request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
+        """Get toxicity scores for a batch of text."""
+        pass
+    @abstractmethod
+    def get_moderation_results(self, request: ModerationAPIRequest) -> ModerationAPIRequestResult:
+        """Get OpenAI's moderation results for some text."""
+        pass
+    @abstractmethod
+    def make_critique_request(self, request: CritiqueRequest) -> CritiqueRequestResult:
+        """Get responses to a critique request."""
+        pass
+    @abstractmethod
+    def get_cache_config(self, shard_name: str) -> CacheConfig:
+        """Returns a CacheConfig"""
+        pass

helm/common/credentials_utils.py CHANGED Viewed

@@ -2,7 +2,7 @@
 from typing import Any, Mapping, Optional
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hlog, hwarn
 def provide_api_key(
@@ -13,16 +13,16 @@ def provide_api_key(
         hlog(f"Using host_organization api key defined in credentials.conf: {api_key_name}")
         return credentials[api_key_name]
     if "deployments" not in credentials:
-        hlog(
-            "WARNING: Could not find key 'deployments' in credentials.conf, "
+        hwarn(
+            "Could not find key 'deployments' in credentials.conf, "
             f"therefore the API key {api_key_name} should be specified."
         )
         return None
     deployment_api_keys = credentials["deployments"]
     if model is None:
-        hlog(f"WARNING: Could not find key '{host_organization}' in credentials.conf and no model provided")
+        hwarn(f"Could not find key '{host_organization}' in credentials.conf and no model provided")
         return None
     if model not in deployment_api_keys:
-        hlog(f"WARNING: Could not find key '{model}' under key 'deployments' in credentials.conf")
+        hwarn(f"Could not find key '{model}' under key 'deployments' in credentials.conf")
         return None
     return deployment_api_keys[model]

helm/common/file_caches/local_file_cache.py CHANGED Viewed

@@ -2,7 +2,7 @@ import os
 from typing import Callable
 from helm.common.general import ensure_directory_exists, generate_unique_id
-from .file_cache import FileCache
+from helm.common.file_caches.file_cache import FileCache
 from helm.common.optional_dependencies import handle_module_not_found_error

helm/common/file_caches/test_local_file_cache.py CHANGED Viewed

@@ -3,7 +3,7 @@ import shutil
 import tempfile
 import unittest
-from .local_file_cache import LocalFileCache
+from helm.common.file_caches.local_file_cache import LocalFileCache
 class TestLocalFileCache(unittest.TestCase):

helm/common/general.py CHANGED Viewed

@@ -42,6 +42,13 @@ def ensure_directory_exists(path: str):
     os.makedirs(path, exist_ok=True)
+def check_file_exists(path: str, msg: Optional[str] = None):
+    """Checks that `path` exists, raises FileNotFoundError if it doesn't."""
+    if not os.path.exists(path):
+        error_msg = msg if msg else f"Required file not found: {path}"
+        raise FileNotFoundError(error_msg)
 def parse_hocon(text: str):
     """Parse `text` (in HOCON format) into a dict-like object."""
     return pyhocon.ConfigFactory.parse_string(text)
@@ -156,7 +163,7 @@ def format_split(split: str) -> str:
 def asdict_without_nones(obj: Any) -> Dict[str, Any]:
-    if not is_dataclass(obj):
+    if not is_dataclass(obj) or isinstance(obj, type):
         raise ValueError(f"Expected dataclass, got '{obj}'")
     return asdict(obj, dict_factory=lambda x: {k: v for (k, v) in x if v is not None})
@@ -178,7 +185,7 @@ def binarize_dict(d: Dict[str, int]) -> Dict[str, int]:
 def serialize(obj: Any) -> List[str]:
     """Takes in a dataclass and outputs all of its fields and values in a list."""
-    if not is_dataclass(obj):
+    if not is_dataclass(obj) or isinstance(obj, type):
         raise ValueError(f"Expected dataclass, got '{obj}'")
     return [f"{key}: {json.dumps(value)}" for key, value in asdict(obj).items()]

helm/common/hierarchical_logger.py CHANGED Viewed

@@ -1,6 +1,8 @@
+import logging
 import sys
 import time
 from typing import Any, Callable, List, Optional
+from colorlog import ColoredFormatter
 class HierarchicalLogger(object):
@@ -20,6 +22,12 @@ class HierarchicalLogger(object):
         } [0s]
     """
+    # Far too much effort to unwind every call to hlog to go via logging,
+    # And is a terrible idea to inspect the stack every time hlog is called
+    # to figure out the caller,
+    # So just log everything under "helm".
+    logger = logging.getLogger("helm")
     def __init__(self) -> None:
         self.start_times: List[float] = []
@@ -27,17 +35,21 @@ class HierarchicalLogger(object):
         return "  " * len(self.start_times)
     def track_begin(self, x: Any) -> None:
-        print(self.indent() + str(x) + " {")
+        self.logger.info(self.indent() + str(x) + " {")
         sys.stdout.flush()
         self.start_times.append(time.time())
     def track_end(self) -> None:
         t = time.time() - self.start_times.pop()
-        print(self.indent() + "} [%s]" % (format_time(t)))
+        self.logger.info(self.indent() + "} [%s]" % (format_time(t)))
         sys.stdout.flush()
     def log(self, x: Any) -> None:
-        print(self.indent() + str(x))
+        self.logger.info(self.indent() + str(x))
+        sys.stdout.flush()
+    def warn(self, x: Any) -> None:
+        self.logger.warning(self.indent() + str(x))
         sys.stdout.flush()
@@ -61,6 +73,10 @@ def hlog(x: Any) -> None:
     singleton.log(x)
+def hwarn(x: Any) -> None:
+    singleton.warn(x)
 class htrack_block:
     def __init__(self, x: Any) -> None:
         self.x = x
@@ -104,3 +120,30 @@ class htrack:
                 return fn(*args, **kwargs)
         return wrapper
+def setup_default_logging():
+    """
+    Setup a default logger to STDOUT for HELM via Python logging
+    """
+    formatter = ColoredFormatter(
+        "%(bold_black)s%(asctime)s%(reset)s %(log_color)s%(levelname)-8s%(reset)s %(message)s",
+        datefmt="%Y-%m-%dT%H:%M:%S",
+        reset=True,
+        log_colors={
+            "DEBUG": "cyan",
+            "INFO": "green",
+            "WARNING": "yellow",
+            "ERROR": "red",
+            "CRITICAL": "red,bg_white",
+        },
+        secondary_log_colors={},
+        style="%",
+    )
+    logger = logging.getLogger("helm")
+    logger.setLevel(logging.INFO)
+    logger.propagate = False
+    handler = logging.StreamHandler(sys.stdout)
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)

helm/common/images_utils.py CHANGED Viewed

@@ -10,7 +10,7 @@ from urllib.request import urlopen
 import numpy as np
-from .general import is_url
+from helm.common.general import is_url
 from helm.common.optional_dependencies import handle_module_not_found_error
 try:
@@ -25,7 +25,7 @@ def open_image(image_location: str) -> Image.Image:
     """
     image: Image.Image
     if is_url(image_location):
-        image = Image.open(requests.get(image_location, stream=True).raw)
+        image = Image.open(requests.get(image_location, stream=True).raw)  # type: ignore
     else:
         image = Image.open(image_location)
     return image.convert("RGB")

helm/common/local_context.py ADDED Viewed

@@ -0,0 +1,140 @@
+import dataclasses
+import os
+from typing import Optional
+from helm.common.context import Context
+from helm.common.cache import CacheConfig
+from helm.common.cache_backend_config import CacheBackendConfig, BlackHoleCacheBackendConfig
+from helm.common.critique_request import CritiqueRequest, CritiqueRequestResult
+from helm.common.moderations_api_request import ModerationAPIRequest, ModerationAPIRequestResult
+from helm.common.clip_score_request import CLIPScoreRequest, CLIPScoreResult
+from helm.common.nudity_check_request import NudityCheckRequest, NudityCheckResult
+from helm.common.file_upload_request import FileUploadRequest, FileUploadResult
+from helm.common.general import ensure_directory_exists, parse_hocon, get_credentials
+from helm.common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
+from helm.common.tokenization_request import (
+    TokenizationRequest,
+    TokenizationRequestResult,
+    DecodeRequest,
+    DecodeRequestResult,
+)
+from helm.common.request import Request, RequestResult
+from helm.clients.auto_client import AutoClient
+from helm.clients.moderation_api_client import ModerationAPIClient
+from helm.clients.image_generation.nudity_check_client import NudityCheckClient
+from helm.clients.gcs_client import GCSClient
+from helm.clients.clip_score_client import CLIPScoreClient
+from helm.clients.toxicity_classifier_client import ToxicityClassifierClient
+from helm.proxy.example_queries import example_queries
+from helm.benchmark.model_metadata_registry import ALL_MODELS_METADATA
+from helm.proxy.query import Query, QueryResult
+from helm.proxy.retry import retry_request
+from helm.tokenizers.auto_tokenizer import AutoTokenizer
+from helm.proxy.services.service import (
+    CACHE_DIR,
+    GeneralInfo,
+    VERSION,
+    expand_environments,
+    synthesize_request,
+)
+class LocalContext(Context):
+    """
+    Main class that supports various functionality for the server.
+    """
+    def __init__(
+        self,
+        base_path: str = "prod_env",
+        cache_backend_config: CacheBackendConfig = BlackHoleCacheBackendConfig(),
+    ):
+        ensure_directory_exists(base_path)
+        client_file_storage_path = os.path.join(base_path, CACHE_DIR)
+        ensure_directory_exists(client_file_storage_path)
+        credentials = get_credentials(base_path)
+        self.cache_backend_config = cache_backend_config
+        self.client = AutoClient(credentials, client_file_storage_path, cache_backend_config)
+        self.tokenizer = AutoTokenizer(credentials, cache_backend_config)
+        # Lazily instantiate the following clients
+        self.moderation_api_client: Optional[ModerationAPIClient] = None
+        self.toxicity_classifier_client: Optional[ToxicityClassifierClient] = None
+        self.perspective_api_client: Optional[ToxicityClassifierClient] = None
+        self.nudity_check_client: Optional[NudityCheckClient] = None
+        self.clip_score_client: Optional[CLIPScoreClient] = None
+        self.gcs_client: Optional[GCSClient] = None
+    def get_general_info(self) -> GeneralInfo:
+        # Can't send release_dates in ModelMetadata bacause dates cannot be round-tripped to and from JSON easily.
+        # TODO(#2158): Either fix this or delete get_general_info.
+        all_models = [dataclasses.replace(model_metadata, release_date=None) for model_metadata in ALL_MODELS_METADATA]
+        return GeneralInfo(version=VERSION, example_queries=example_queries, all_models=all_models)
+    def expand_query(self, query: Query) -> QueryResult:
+        """Turn the `query` into requests."""
+        prompt = query.prompt
+        settings = query.settings
+        environments = parse_hocon(query.environments)
+        requests = []
+        for environment in expand_environments(environments):
+            request = synthesize_request(prompt, settings, environment)
+            requests.append(request)
+        return QueryResult(requests=requests)
+    def make_request(self, request: Request) -> RequestResult:
+        """Actually make a request to an API."""
+        return self.client.make_request(request)
+    def tokenize(self, request: TokenizationRequest) -> TokenizationRequestResult:
+        return self.tokenizer.tokenize(request)
+    def decode(self, request: DecodeRequest) -> DecodeRequestResult:
+        return self.tokenizer.decode(request)
+    def upload(self, request: FileUploadRequest) -> FileUploadResult:
+        if not self.gcs_client:
+            self.gcs_client = self.client.get_gcs_client()
+        assert self.gcs_client
+        return self.gcs_client.upload(request)
+    def check_nudity(self, request: NudityCheckRequest) -> NudityCheckResult:
+        if not self.nudity_check_client:
+            self.nudity_check_client = self.client.get_nudity_check_client()
+        assert self.nudity_check_client
+        return self.nudity_check_client.check_nudity(request)
+    def compute_clip_score(self, request: CLIPScoreRequest) -> CLIPScoreResult:
+        if not self.clip_score_client:
+            self.clip_score_client = self.client.get_clip_score_client()
+        assert self.clip_score_client
+        return self.clip_score_client.compute_score(request)
+    def get_toxicity_scores(self, request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
+        @retry_request
+        def get_toxicity_scores_with_retry(request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
+            if not self.toxicity_classifier_client:
+                self.toxicity_classifier_client = self.client.get_toxicity_classifier_client()
+            return self.toxicity_classifier_client.get_toxicity_scores(request)
+        return get_toxicity_scores_with_retry(request)
+    def get_moderation_results(self, request: ModerationAPIRequest) -> ModerationAPIRequestResult:
+        @retry_request
+        def get_moderation_results_with_retry(request: ModerationAPIRequest) -> ModerationAPIRequestResult:
+            if not self.moderation_api_client:
+                self.moderation_api_client = self.client.get_moderation_api_client()
+            return self.moderation_api_client.get_moderation_results(request)
+        return get_moderation_results_with_retry(request)
+    def make_critique_request(self, request: CritiqueRequest) -> CritiqueRequestResult:
+        return self.client.get_critique_client().make_critique_request(request)
+    def get_cache_config(self, shard_name: str) -> CacheConfig:
+        return self.cache_backend_config.get_cache_config(shard_name)

helm/common/media_object.py CHANGED Viewed

@@ -2,7 +2,7 @@ import os
 import urllib
 from copy import deepcopy
 from dataclasses import dataclass, field, replace
-from typing import List, Optional
+from typing import Any, Dict, List, Optional
 IMAGE_TYPE = "image"
@@ -27,7 +27,7 @@ class MediaObject:
     location: Optional[str] = None
     """When the media object is a file, specify the location of the media object, which can be a local path or URL."""
-    def to_dict(self) -> dict:
+    def to_dict(self) -> Dict[str, Any]:
         """Converts the media object to a dictionary."""
         return {key: value for key, value in self.__dict__.items() if value is not None}

helm/common/multimodal_request_utils.py CHANGED Viewed

@@ -1,10 +1,36 @@
+import base64
 from typing import List, Optional
+import requests
+import urllib.parse
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.scenarios.scenario import Reference
 from helm.common.request import RequestResult
+def get_contents_as_bytes(path: str) -> bytes:
+    """Get the contents at the location as bytes.
+    The location can be a local path or a URL."""
+    # Fetch the audio file and convert it to a base64 encoded string
+    is_remote = urllib.parse.urlparse(path).scheme in ["http", "https"]
+    if is_remote:
+        response = requests.get(path)
+        response.raise_for_status()
+        return response.content
+    else:
+        with open(path, "rb") as f:
+            return f.read()
+def get_contents_as_base64(path: str) -> str:
+    """Get the contents at the location as a base64-encoded string.
+    The location can be a local path or a URL."""
+    return base64.b64encode(get_contents_as_bytes(path)).decode("utf-8")
 def gather_generated_image_locations(request_result: RequestResult) -> List[str]:
     """Gathers the locations (file paths or URLs) of the generated images."""
     image_locations: List[str] = []

helm/common/reeval_parameters.py ADDED Viewed

@@ -0,0 +1,12 @@
+from dataclasses import dataclass
+from typing import Optional
+@dataclass(frozen=True)
+class REEvalParameters:
+    """
+    Parameters for reeval evaluation.
+    """
+    model_ability: Optional[float] = None
+    """The inital ability of the model to perform the task. Used for reeval evaluation."""

helm/common/remote_context.py ADDED Viewed

@@ -0,0 +1,61 @@
+from helm.common.context import Context
+from helm.common.cache import CacheConfig
+from helm.common.authentication import Authentication
+from helm.common.moderations_api_request import ModerationAPIRequest, ModerationAPIRequestResult
+from helm.common.critique_request import CritiqueRequest, CritiqueRequestResult
+from helm.common.nudity_check_request import NudityCheckRequest, NudityCheckResult
+from helm.common.file_upload_request import FileUploadRequest, FileUploadResult
+from helm.common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
+from helm.common.clip_score_request import CLIPScoreRequest, CLIPScoreResult
+from helm.common.tokenization_request import (
+    TokenizationRequest,
+    TokenizationRequestResult,
+    DecodeRequestResult,
+    DecodeRequest,
+)
+from helm.common.request import Request, RequestResult
+from helm.proxy.query import Query, QueryResult
+from helm.proxy.services.remote_service import RemoteService
+from helm.proxy.services.service import GeneralInfo, Service
+class RemoteContext(Context):
+    def __init__(self, base_url: str, auth: Authentication):
+        self.service: Service = RemoteService(base_url)
+        self.auth = auth
+    def get_general_info(self) -> GeneralInfo:
+        return self.service.get_general_info()
+    def expand_query(self, query: Query) -> QueryResult:
+        return self.service.expand_query(query)
+    def make_request(self, request: Request) -> RequestResult:
+        return self.service.make_request(self.auth, request)
+    def tokenize(self, request: TokenizationRequest) -> TokenizationRequestResult:
+        return self.service.tokenize(self.auth, request)
+    def decode(self, request: DecodeRequest) -> DecodeRequestResult:
+        return self.service.decode(self.auth, request)
+    def upload(self, request: FileUploadRequest) -> FileUploadResult:
+        return self.service.upload(self.auth, request)
+    def check_nudity(self, request: NudityCheckRequest) -> NudityCheckResult:
+        return self.service.check_nudity(self.auth, request)
+    def compute_clip_score(self, request: CLIPScoreRequest) -> CLIPScoreResult:
+        return self.service.compute_clip_score(self.auth, request)
+    def get_toxicity_scores(self, request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
+        return self.service.get_toxicity_scores(self.auth, request)
+    def get_moderation_results(self, request: ModerationAPIRequest) -> ModerationAPIRequestResult:
+        return self.service.get_moderation_results(self.auth, request)
+    def make_critique_request(self, request: CritiqueRequest) -> CritiqueRequestResult:
+        return self.service.make_critique_request(self.auth, request)
+    def get_cache_config(self, shard_name: str) -> CacheConfig:
+        return self.service.get_cache_config(shard_name)

crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

Potentially problematic release.

crfm-helm 0.5.4py3-none-any.whl → 0.5.6py3-none-any.whl