PyPI - crfm-helm - Versions diffs - 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl - Mend

crfm-helm 0.5.4 (py3-none-any.whl) → 0.5.6 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crfm-helm might be problematic. Click here for more details.

Files changed (652)

crfm_helm-0.5.6.dist-info/METADATA +427 -0
crfm_helm-0.5.6.dist-info/RECORD +941 -0
{crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
helm/benchmark/adaptation/adapter_spec.py +13 -1
helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
helm/benchmark/adaptation/common_adapter_specs.py +69 -4
helm/benchmark/adaptation/prompt.py +1 -1
helm/benchmark/annotation/aci_bench_annotator.py +95 -0
helm/benchmark/annotation/air_bench_annotator.py +21 -6
helm/benchmark/annotation/annotator.py +5 -0
helm/benchmark/annotation/annotator_factory.py +3 -20
helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
helm/benchmark/annotation/bird_sql_annotator.py +58 -0
helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
helm/benchmark/annotation/dischargeme_annotator.py +107 -0
helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
helm/benchmark/annotation/live_qa_annotator.py +1 -1
helm/benchmark/annotation/med_dialog_annotator.py +99 -0
helm/benchmark/annotation/medalign_annotator.py +100 -0
helm/benchmark/annotation/medi_qa_annotator.py +98 -0
helm/benchmark/annotation/medication_qa_annotator.py +87 -63
helm/benchmark/annotation/mental_health_annotator.py +98 -0
helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
helm/benchmark/annotation/model_as_judge.py +214 -6
helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
helm/benchmark/annotation/omni_math_annotator.py +131 -0
helm/benchmark/annotation/spider_annotator.py +18 -0
helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
helm/benchmark/annotation/wildbench_annotator.py +119 -0
helm/benchmark/annotation_executor.py +35 -15
helm/benchmark/augmentations/cleva_perturbation.py +9 -8
helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
helm/benchmark/augmentations/dialect_perturbation.py +4 -5
helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
helm/benchmark/augmentations/gender_perturbation.py +2 -2
helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
helm/benchmark/augmentations/person_name_perturbation.py +4 -5
helm/benchmark/augmentations/perturbation.py +1 -1
helm/benchmark/augmentations/space_perturbation.py +2 -2
helm/benchmark/augmentations/suffix_perturbation.py +2 -2
helm/benchmark/augmentations/synonym_perturbation.py +4 -3
helm/benchmark/augmentations/test_perturbation.py +16 -13
helm/benchmark/augmentations/translate_perturbation.py +2 -2
helm/benchmark/augmentations/typos_perturbation.py +2 -2
helm/benchmark/data_preprocessor.py +2 -2
helm/benchmark/executor.py +11 -12
helm/benchmark/huggingface_registration.py +2 -7
helm/benchmark/metrics/aci_bench_metrics.py +14 -0
helm/benchmark/metrics/basic_metrics.py +6 -6
helm/benchmark/metrics/bbq_metrics.py +2 -2
helm/benchmark/metrics/bias_metrics.py +12 -3
helm/benchmark/metrics/bias_word_lists.py +1 -1
helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
helm/benchmark/metrics/bird_sql_metrics.py +28 -0
helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
helm/benchmark/metrics/classification_metrics.py +76 -12
helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
helm/benchmark/metrics/code_metrics.py +5 -5
helm/benchmark/metrics/comet_metric.py +125 -0
helm/benchmark/metrics/common_metric_specs.py +9 -2
helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
helm/benchmark/metrics/copyright_metrics.py +4 -4
helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
helm/benchmark/metrics/dischargeme_metrics.py +14 -0
helm/benchmark/metrics/disinformation_metrics.py +4 -4
helm/benchmark/metrics/dry_run_metrics.py +5 -5
helm/benchmark/metrics/efficiency_metrics.py +6 -6
helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
helm/benchmark/metrics/ifeval/__init__.py +0 -0
helm/benchmark/metrics/ifeval/instructions.py +1574 -0
helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
helm/benchmark/metrics/ifeval_metrics.py +55 -0
helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
helm/benchmark/metrics/language_modeling_metrics.py +4 -4
helm/benchmark/metrics/llm_jury_metrics.py +46 -0
helm/benchmark/metrics/machine_translation_metrics.py +2 -2
helm/benchmark/metrics/med_dialog_metrics.py +14 -0
helm/benchmark/metrics/medalign_metrics.py +14 -0
helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
helm/benchmark/metrics/medec_metrics.py +101 -0
helm/benchmark/metrics/medi_qa_metrics.py +14 -0
helm/benchmark/metrics/medication_qa_metrics.py +10 -19
helm/benchmark/metrics/melt_bias_metric.py +234 -0
helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
helm/benchmark/metrics/melt_metric_specs.py +43 -0
helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
helm/benchmark/metrics/mental_health_metrics.py +14 -0
helm/benchmark/metrics/metric.py +3 -3
helm/benchmark/metrics/metric_service.py +11 -11
helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
helm/benchmark/metrics/nltk_helper.py +32 -0
helm/benchmark/metrics/numeracy_metrics.py +4 -4
helm/benchmark/metrics/omni_math_metrics.py +32 -0
helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
helm/benchmark/metrics/output_processing_metric.py +60 -0
helm/benchmark/metrics/output_processors.py +15 -0
helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
helm/benchmark/metrics/ranking_metrics.py +3 -3
helm/benchmark/metrics/reference_metric.py +3 -3
helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
helm/benchmark/metrics/spider_metrics.py +7 -0
helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
helm/benchmark/metrics/statistic.py +1 -1
helm/benchmark/metrics/summac/model_summac.py +2 -3
helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
helm/benchmark/metrics/summarization_metrics.py +20 -9
helm/benchmark/metrics/test_bias_metrics.py +5 -1
helm/benchmark/metrics/test_classification_metrics.py +140 -68
helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
helm/benchmark/metrics/test_metric.py +1 -1
helm/benchmark/metrics/test_statistic.py +2 -2
helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
helm/benchmark/metrics/toxicity_metrics.py +6 -6
helm/benchmark/metrics/unitxt_metrics.py +7 -5
helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
helm/benchmark/metrics/vision_language/image_utils.py +2 -2
helm/benchmark/metrics/wildbench_metrics.py +34 -0
helm/benchmark/model_deployment_registry.py +6 -8
helm/benchmark/model_metadata_registry.py +16 -0
helm/benchmark/presentation/contamination.py +3 -3
helm/benchmark/presentation/create_plots.py +33 -12
helm/benchmark/presentation/run_display.py +13 -0
helm/benchmark/presentation/schema.py +2 -1
helm/benchmark/presentation/summarize.py +97 -67
helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
helm/benchmark/reeval_run.py +202 -0
helm/benchmark/reeval_runner.py +355 -0
helm/benchmark/run.py +86 -90
helm/benchmark/run_expander.py +90 -9
helm/benchmark/run_spec_factory.py +13 -0
helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
helm/benchmark/run_specs/audio_run_specs.py +657 -0
helm/benchmark/run_specs/call_center_run_specs.py +49 -0
helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
helm/benchmark/run_specs/classic_run_specs.py +1 -69
helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
helm/benchmark/run_specs/experimental_run_specs.py +142 -3
helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
helm/benchmark/run_specs/lite_run_specs.py +2 -2
helm/benchmark/run_specs/long_context_run_specs.py +141 -0
helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
helm/benchmark/run_specs/melt_run_specs.py +783 -0
helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
helm/benchmark/run_specs/oab_exams_specs.py +32 -0
helm/benchmark/run_specs/safety_run_specs.py +37 -0
helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
helm/benchmark/run_specs/sql_run_specs.py +54 -0
helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
helm/benchmark/run_specs/vlm_run_specs.py +103 -2
helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
helm/benchmark/runner.py +5 -5
helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
helm/benchmark/scenarios/air_bench_scenario.py +6 -1
helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
helm/benchmark/scenarios/audio_language/__init__.py +0 -0
helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
helm/benchmark/scenarios/banking77_scenario.py +6 -1
helm/benchmark/scenarios/bbq_scenario.py +1 -1
helm/benchmark/scenarios/big_bench_scenario.py +11 -1
helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
helm/benchmark/scenarios/blimp_scenario.py +1 -1
helm/benchmark/scenarios/bold_scenario.py +1 -1
helm/benchmark/scenarios/boolq_scenario.py +1 -1
helm/benchmark/scenarios/casehold_scenario.py +79 -0
helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
helm/benchmark/scenarios/clear_scenario.py +157 -0
helm/benchmark/scenarios/cleva_scenario.py +2 -2
helm/benchmark/scenarios/code_scenario.py +17 -4
helm/benchmark/scenarios/commonsense_scenario.py +1 -1
helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
helm/benchmark/scenarios/copyright_scenario.py +1 -1
helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
helm/benchmark/scenarios/disinformation_scenario.py +10 -1
helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
helm/benchmark/scenarios/gpqa_scenario.py +80 -0
helm/benchmark/scenarios/grammar.py +2 -2
helm/benchmark/scenarios/grammar_scenario.py +2 -2
helm/benchmark/scenarios/gsm_scenario.py +10 -1
helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
helm/benchmark/scenarios/headqa_scenario.py +136 -0
helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
helm/benchmark/scenarios/ice_scenario.py +8 -4
helm/benchmark/scenarios/ifeval_scenario.py +53 -0
helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
helm/benchmark/scenarios/imdb_scenario.py +11 -2
helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
helm/benchmark/scenarios/koala_scenario.py +1 -1
helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
helm/benchmark/scenarios/legal_support_scenario.py +11 -1
helm/benchmark/scenarios/legalbench_scenario.py +22 -3
helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
helm/benchmark/scenarios/lextreme_scenario.py +11 -1
helm/benchmark/scenarios/live_qa_scenario.py +1 -1
helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
helm/benchmark/scenarios/math_scenario.py +9 -1
helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
helm/benchmark/scenarios/med_qa_scenario.py +10 -1
helm/benchmark/scenarios/medalign_scenario.py +94 -0
helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
helm/benchmark/scenarios/medbullets_scenario.py +145 -0
helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
helm/benchmark/scenarios/medec_scenario.py +125 -0
helm/benchmark/scenarios/medhallu_scenario.py +72 -0
helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
helm/benchmark/scenarios/melt_scenarios.py +793 -0
helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
helm/benchmark/scenarios/mental_health_scenario.py +123 -0
helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
helm/benchmark/scenarios/mmlu_scenario.py +11 -1
helm/benchmark/scenarios/msmarco_scenario.py +1 -1
helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
helm/benchmark/scenarios/newsqa_scenario.py +1 -1
helm/benchmark/scenarios/numeracy_scenario.py +12 -2
helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
helm/benchmark/scenarios/omni_math_scenario.py +53 -0
helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
helm/benchmark/scenarios/quac_scenario.py +10 -1
helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
helm/benchmark/scenarios/raft_scenario.py +17 -2
helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
helm/benchmark/scenarios/scenario.py +9 -1
helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
helm/benchmark/scenarios/spider_scenario.py +91 -0
helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
helm/benchmark/scenarios/summarization_scenario.py +11 -1
helm/benchmark/scenarios/sumosum_scenario.py +157 -0
helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
helm/benchmark/scenarios/test_math_scenario.py +1 -0
helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
helm/benchmark/scenarios/the_pile_scenario.py +1 -1
helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
helm/benchmark/scenarios/unitxt_scenario.py +8 -2
helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
helm/benchmark/scenarios/vicuna_scenario.py +1 -1
helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
helm/benchmark/scenarios/wikifact_scenario.py +11 -1
helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
helm/benchmark/scenarios/wildbench_scenario.py +83 -0
helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
helm/benchmark/scenarios/xstest_scenario.py +1 -1
helm/benchmark/server.py +13 -1
helm/benchmark/slurm_runner.py +1 -1
helm/benchmark/static/schema_audio.yaml +763 -0
helm/benchmark/static/schema_autobencher.yaml +150 -0
helm/benchmark/static/schema_call_center.yaml +97 -60
helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
helm/benchmark/static/schema_czech_bank.yaml +148 -0
helm/benchmark/static/schema_enem_challenge.yaml +146 -0
helm/benchmark/static/schema_enterprise.yaml +319 -0
helm/benchmark/static/schema_finance.yaml +14 -12
helm/benchmark/static/schema_heim.yaml +1389 -0
helm/benchmark/static/schema_long_context.yaml +283 -0
helm/benchmark/static/schema_medhelm.yaml +1140 -0
helm/benchmark/static/schema_melt.yaml +1257 -0
helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
helm/benchmark/static/schema_safety.yaml +18 -1
helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
helm/benchmark/static/schema_slphelm.yaml +162 -0
helm/benchmark/static/schema_social_audio.yaml +224 -0
helm/benchmark/static/schema_sql.yaml +171 -0
helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
helm/benchmark/static/schema_vhelm.yaml +129 -56
helm/benchmark/static/schema_video.yaml +219 -0
helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
helm/benchmark/static_build/assets/index-94295e78.js +10 -0
helm/benchmark/static_build/assets/index-b9779128.css +1 -0
helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
helm/benchmark/static_build/config.js +1 -1
helm/benchmark/static_build/index.html +6 -6
helm/benchmark/window_services/default_window_service.py +1 -1
helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
helm/benchmark/window_services/ice_window_service.py +1 -1
helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
helm/benchmark/window_services/local_window_service.py +2 -2
helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
helm/benchmark/window_services/test_bloom_window_service.py +3 -3
helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
helm/benchmark/window_services/test_gptj_window_service.py +8 -3
helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
helm/benchmark/window_services/test_openai_window_service.py +8 -3
helm/benchmark/window_services/test_opt_window_service.py +3 -3
helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
helm/benchmark/window_services/test_t511b_window_service.py +3 -3
helm/benchmark/window_services/test_ul2_window_service.py +3 -3
helm/benchmark/window_services/test_utils.py +4 -5
helm/benchmark/window_services/test_yalm_window_service.py +3 -3
helm/benchmark/window_services/tokenizer_service.py +7 -8
helm/benchmark/window_services/yalm_window_service.py +1 -1
helm/clients/ai21_client.py +3 -3
helm/clients/aleph_alpha_client.py +1 -1
helm/clients/anthropic_client.py +69 -29
helm/clients/audio_language/__init__.py +0 -0
helm/clients/audio_language/diva_llama_client.py +120 -0
helm/clients/audio_language/llama_omni_client.py +198 -0
helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
helm/clients/audio_language/qwen_audiolm_client.py +152 -0
helm/clients/audio_language/test.py +62 -0
helm/clients/auto_client.py +4 -2
helm/clients/azure_openai_client.py +55 -0
helm/clients/bedrock_client.py +203 -7
helm/clients/bedrock_utils.py +33 -0
helm/clients/client.py +7 -7
helm/clients/clip_scorers/clip_scorer.py +1 -1
helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
helm/clients/cohere_client.py +3 -3
helm/clients/google_client.py +1 -1
helm/clients/grok_client.py +36 -0
helm/clients/http_model_client.py +1 -1
helm/clients/huggingface_client.py +52 -21
helm/clients/huggingface_pipeline_client.py +138 -0
helm/clients/ibm_client.py +267 -0
helm/clients/image_generation/adobe_vision_client.py +1 -1
helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
helm/clients/image_generation/cogview2_client.py +1 -1
helm/clients/image_generation/dalle2_client.py +1 -1
helm/clients/image_generation/dalle3_client.py +2 -2
helm/clients/image_generation/dalle_mini/__init__.py +1 -1
helm/clients/image_generation/dalle_mini/data.py +1 -1
helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
helm/clients/image_generation/dalle_mini_client.py +1 -1
helm/clients/image_generation/deep_floyd_client.py +1 -1
helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
helm/clients/image_generation/lexica_client.py +1 -1
helm/clients/image_generation/mindalle/models/__init__.py +6 -6
helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
helm/clients/image_generation/mindalle_client.py +1 -1
helm/clients/image_generation/together_image_generation_client.py +1 -1
helm/clients/lit_gpt_client.py +2 -2
helm/clients/mistral_client.py +62 -18
helm/clients/nvidia_nim_client.py +0 -3
helm/clients/openai_client.py +308 -43
helm/clients/openai_responses_client.py +174 -0
helm/clients/palmyra_client.py +3 -9
helm/clients/reka_client.py +3 -3
helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
helm/clients/stanfordhealthcare_claude_client.py +31 -0
helm/clients/stanfordhealthcare_google_client.py +43 -0
helm/clients/stanfordhealthcare_http_model_client.py +93 -0
helm/clients/stanfordhealthcare_openai_client.py +62 -0
helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
helm/clients/test_client.py +1 -1
helm/clients/test_together_client.py +6 -1
helm/clients/together_client.py +76 -9
helm/clients/upstage_client.py +23 -0
helm/clients/vertexai_client.py +45 -13
helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
helm/clients/vision_language/huggingface_vlm_client.py +2 -2
helm/clients/vision_language/idefics_client.py +6 -2
helm/clients/vision_language/open_flamingo/__init__.py +2 -2
helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
helm/clients/vision_language/paligemma_client.py +2 -2
helm/clients/vision_language/qwen2_vlm_client.py +188 -0
helm/clients/vision_language/qwen_vlm_client.py +7 -5
helm/clients/vllm_client.py +4 -6
helm/clients/writer_client.py +102 -0
helm/clients/yi_client.py +0 -3
helm/common/audio_utils.py +111 -0
helm/common/context.py +80 -0
helm/common/credentials_utils.py +5 -5
helm/common/file_caches/local_file_cache.py +1 -1
helm/common/file_caches/test_local_file_cache.py +1 -1
helm/common/general.py +9 -2
helm/common/hierarchical_logger.py +46 -3
helm/common/images_utils.py +2 -2
helm/common/local_context.py +140 -0
helm/common/media_object.py +2 -2
helm/common/multimodal_request_utils.py +26 -0
helm/common/reeval_parameters.py +12 -0
helm/common/remote_context.py +61 -0
helm/common/request.py +14 -2
helm/common/response_format.py +18 -0
helm/common/test_media_object.py +1 -1
helm/config/model_deployments.yaml +1792 -28
helm/config/model_metadata.yaml +1606 -51
helm/config/tokenizer_configs.yaml +521 -4
helm/proxy/cli.py +5 -3
helm/proxy/critique/mechanical_turk_utils.py +1 -1
helm/proxy/example_queries.py +1 -1
helm/proxy/server.py +11 -4
helm/proxy/services/remote_service.py +1 -1
helm/proxy/services/server_service.py +22 -86
helm/proxy/services/test_remote_service.py +2 -2
helm/proxy/services/test_service.py +1 -1
helm/proxy/static/general.js +122 -0
helm/proxy/static/help.html +99 -0
helm/proxy/static/index.css +57 -0
helm/proxy/static/index.html +40 -0
helm/proxy/static/index.js +456 -0
helm/proxy/static/info-icon.png +0 -0
helm/proxy/test_retry.py +1 -1
helm/proxy/token_counters/auto_token_counter.py +1 -1
helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
helm/tokenizers/caching_tokenizer.py +2 -30
helm/tokenizers/grok_tokenizer.py +53 -0
helm/tokenizers/http_model_tokenizer.py +1 -1
helm/tokenizers/huggingface_tokenizer.py +3 -3
helm/tokenizers/lit_gpt_tokenizer.py +1 -1
helm/tokenizers/test_anthropic_tokenizer.py +6 -2
helm/tokenizers/test_grok_tokenizer.py +33 -0
helm/tokenizers/test_huggingface_tokenizer.py +1 -1
helm/tokenizers/test_yalm_tokenizer.py +1 -1
helm/tokenizers/tiktoken_tokenizer.py +1 -1
helm/tokenizers/tokenizer.py +3 -1
helm/tokenizers/yalm_tokenizer.py +3 -3
helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
crfm_helm-0.5.4.dist-info/METADATA +0 -350
crfm_helm-0.5.4.dist-info/RECORD +0 -697
helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
helm/tokenizers/anthropic_tokenizer.py +0 -52
{crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
{crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
{crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0

helm/clients/huggingface_pipeline_client.py ADDED Viewed

@@ -0,0 +1,138 @@
+from threading import Lock
+from typing import Any, Dict, List, Optional, Union
+import transformers
+from helm.clients.client import CachingClient
+from helm.common.cache import CacheConfig
+from helm.common.hierarchical_logger import htrack_block, hwarn
+from helm.common.request import GeneratedOutput, Request, RequestResult, wrap_request_time
+from helm.proxy.retry import NonRetriableException
+# Module-level cache of loaded pipelines, keyed by HELM model name (see `_get_pipeline`).
+_pipelines: Dict[str, transformers.Pipeline] = {}
+# Held by `_get_pipeline` while it checks and populates `_pipelines`.
+_pipelines_lock: Lock = Lock()
+def _get_pipeline(
+    helm_model_name: str,
+    pipeline_kwargs: Dict[str, Any],
+) -> Any:
+    """
+    Checks if the desired HuggingFaceModel is cached. Creates the HuggingFaceModel if it's not cached.
+    Returns the HuggingFaceModel.
+    """
+    global _pipelines
+    global _pipelines_lock
+    with _pipelines_lock:
+        if helm_model_name not in _pipelines:
+            huggingface_model_name = pipeline_kwargs["model"]
+            with htrack_block(
+                f"Loading HuggingFace model {huggingface_model_name} (kwargs={pipeline_kwargs}) "
+                f"for HELM model {helm_model_name} with transformers.pipeline"
+            ):
+                _pipelines[helm_model_name] = transformers.pipeline(**pipeline_kwargs)
+    return _pipelines[helm_model_name]
+class HuggingFacePipelineClient(CachingClient):
+    def __init__(
+        self,
+        cache_config: CacheConfig,
+        model_name: str,
+        pretrained_model_name_or_path: Optional[str] = None,
+        apply_chat_template: Optional[bool] = None,
+        **kwargs,
+    ):
+        # Include `pretrained_model_name_or_path` parameter so that model deployments can use
+        # the `pretrained_model_name_or_path` arg to override `model_name`
+        super().__init__(cache_config=cache_config)
+        self._helm_model_name = model_name
+        self._pipeline_kwargs = {
+            "model": pretrained_model_name_or_path or self._helm_model_name,
+            "task": "text-generation",
+            **kwargs,
+        }
+        self._pipeline = _get_pipeline(self._helm_model_name, self._pipeline_kwargs)
+        if apply_chat_template is not None:
+            self._apply_chat_template = apply_chat_template
+        else:
+            # If the user did not explicitly configure whether the model is a chat model with `apply_chat_template` arg,
+            # auto-infer if the model is a chat model based on whether the tokenizer has a chat template.
+            # Note: Auto-inference is incorrect for some non-chat models that still have chat templates
+            # e.g. Qwen2, Qwen 2.5.
+            # For these models, the `apply_chat_template` arg should be explicitly set to false.
+            self._apply_chat_template = bool(self._pipeline.tokenizer.chat_template)
+            hwarn(
+                f"Automatically set `apply_chat_template` to {self._apply_chat_template} based on "
+                "whether the tokenizer has a chat template. "
+                "If this is incorrect, please explicitly set `apply_chat_template`."
+            )
+    def make_text_inputs(self, request: Request) -> Union[str, List[Dict[str, str]]]:
+        if request.prompt and request.messages:
+            raise NonRetriableException(f"More than one of `prompt` and `messages` was set in request: {request}")
+        # Chat model expects a list of messages as input
+        if self._apply_chat_template:
+            if request.messages:
+                return request.messages
+            else:
+                return [{"role": "user", "content": request.prompt}]
+        # Base non-chat model expects a string as input
+        else:
+            if request.messages:
+                raise NonRetriableException("Chat mesages not supported by non-chat model")
+            else:
+                return request.prompt
+    def make_request(self, request: Request) -> RequestResult:
+        """Make a request"""
+        if request.model != self._helm_model_name:
+            raise NonRetriableException(
+                f"This instance of HuggingFacePipelineClient has loaded model {self._helm_model_name} but the request was for model {request.model}"  # noqa: E501
+            )
+        completions: List[GeneratedOutput] = []
+        do_sample = request.temperature > 0.0
+        raw_request = {
+            "text_inputs": self.make_text_inputs(request),
+            "return_full_text": request.echo_prompt,
+            "temperature": request.temperature if do_sample else None,
+            "num_return_sequences": request.num_completions,
+            "max_new_tokens": request.max_tokens,
+            "top_p": request.top_p,
+            "top_k": request.top_k_per_token if do_sample else None,
+            "do_sample": do_sample,
+            "return_dict_in_generate": True,
+        }
+        if request.stop_sequences:
+            stop_sequence_ids = self._pipeline.tokenizer(
+                request.stop_sequences, return_token_type_ids=False, add_special_tokens=False
+            )
+            if len(stop_sequence_ids.input_ids) == 1 and len(stop_sequence_ids.input_ids[0]) == 1:
+                raw_request["eos_token_id"] = stop_sequence_ids.input_ids[0][0]
+            else:
+                raise NonRetriableException(
+                    "Multiple stop sequences and stop sequences of multiple tokens, are not yet supported by HuggingFacePipelineClient"  # noqa: E501
+                )
+        def do_it() -> Dict[str, Any]:
+            pipeline_outputs = self._pipeline(**raw_request)
+            return {"outputs": pipeline_outputs}
+        cache_key = CachingClient.make_cache_key(
+            {"pipeline_kwargs": self._pipeline_kwargs, **raw_request},
+            request,
+        )
+        response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
+        for raw_output in response["outputs"]:
+            completions.append(GeneratedOutput(text=raw_output["generated_text"], logprob=0, tokens=[]))
+        return RequestResult(
+            success=True,
+            cached=cached,
+            request_time=response["request_time"],
+            request_datetime=response["request_datetime"],
+            completions=completions,
+            embedding=[],
+        )

helm/clients/ibm_client.py ADDED Viewed

@@ -0,0 +1,267 @@
+from abc import ABC
+from abc import abstractmethod
+from helm.common.hierarchical_logger import hlog
+from helm.common.cache import CacheConfig
+from helm.common.request import (
+    Request,
+    RequestResult,
+    Token,
+    wrap_request_time,
+    EMBEDDING_UNAVAILABLE_REQUEST_RESULT,
+    GeneratedOutput,
+)
+from helm.clients.client import CachingClient
+from helm.common.optional_dependencies import handle_module_not_found_error
+from typing import TypeVar, Generic
+from typing import Any, Dict, List
+from threading import Semaphore, Lock
+import threading
+try:
+    from ibm_watsonx_ai import Credentials
+    from ibm_watsonx_ai.foundation_models import ModelInference
+    from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
+    from ibm_watsonx_ai.foundation_models.schema import (
+        TextChatParameters,
+        TextGenParameters,
+        ReturnOptionProperties,
+    )
+except ModuleNotFoundError as e:
+    handle_module_not_found_error(e, ["ibm"])
+# Maximum number of requests allowed to run in parallel per model; the limit is imposed by the IBM API.
+MAX_CONCURRENT_REQUESTS = 8
+# One semaphore per model name, created lazily by `_get_semaphore`.
+__semaphores: Dict[str, Semaphore] = dict()
+# Held by `_get_semaphore` while it checks and populates `__semaphores`.
+__semaphores_lock = Lock()
+def _get_semaphore(model: str) -> Semaphore:
+    with __semaphores_lock:
+        if model not in __semaphores:
+            __semaphores[model] = threading.Semaphore(MAX_CONCURRENT_REQUESTS)
+    return __semaphores[model]
+T = TypeVar("T", TextGenParameters, TextChatParameters)
+class ModelInferenceHandler(ABC, Generic[T]):
+    @abstractmethod
+    def __init__(self, inference_engine: ModelInference):
+        """
+        :type inference_engine: object
+        """
+        self.inference_engine = inference_engine
+    @abstractmethod
+    def serve_request(self, prompt: str, params: T) -> Dict:
+        pass
+    @abstractmethod
+    def parse_response(self, response: dict) -> List[GeneratedOutput]:
+        pass
+    @abstractmethod
+    def create_params(self, request: Request) -> T:
+        pass
+class GenerateInferenceHandler(ModelInferenceHandler[TextGenParameters]):
+    def __init__(self, inference_engine: ModelInference):
+        self.inference_engine = inference_engine
+    def create_params(self, request: Request) -> TextGenParameters:
+        def set_temperature_requirements():
+            # Default temperature 0.05 required by ibm/granite-13b-instruct-v2
+            if self.inference_engine.model_id == "ibm/granite-13b-instruct-v2":
+                return 0.05
+            return 1e-7 if request.temperature == 0 else request.temperature
+        return TextGenParameters(
+            temperature=set_temperature_requirements(),
+            top_p=request.top_p,
+            max_new_tokens=request.max_tokens,
+            return_options=ReturnOptionProperties(
+                input_text=True,
+                generated_tokens=True,
+                input_tokens=False,
+                token_logprobs=True,
+                token_ranks=False,
+            ),
+            include_stop_sequence=False,
+            prompt_variables=None,
+        )
+    def serve_request(self, prompt: str, params: TextGenParameters) -> Dict:
+        semaphore = _get_semaphore(self.inference_engine.model_id)
+        with semaphore:
+            response = self.inference_engine.generate(
+                prompt=prompt,
+                params=params,
+            )
+        return response
+    def parse_response(self, response: dict) -> List[GeneratedOutput]:
+        completions = []
+        try:
+            for r in response["results"]:
+                sequence_logprob: float = 0
+                tokens: List[Token] = []
+                generated_text = r["generated_text"]
+                for token_and_logprob in r["generated_tokens"]:
+                    logprob = token_and_logprob.get("logprob", 0)
+                    text = token_and_logprob["text"]
+                    tokens.append(Token(text=text, logprob=logprob))
+                    sequence_logprob += logprob
+                completion = GeneratedOutput(text=generated_text, logprob=sequence_logprob, tokens=tokens)
+                completions.append(completion)
+        except Exception as e:
+            hlog(f"GenerateInferenceHandler failed with exception {e} during parse_response {response}")
+        return completions
+class ChatModelInferenceHandler(ModelInferenceHandler[TextChatParameters]):
+    def __init__(self, inference_engine: ModelInference):
+        self.inference_engine = inference_engine
+    def create_params(self, request: Request) -> TextChatParameters:
+        return TextChatParameters(
+            logprobs=True,
+            presence_penalty=0,
+            frequency_penalty=0,
+            temperature=request.temperature,
+            max_tokens=request.max_tokens,
+            top_p=request.top_p,
+        )
+    def parse_response(self, response: dict) -> List[GeneratedOutput]:
+        completions = []
+        try:
+            for raw_completion in response["choices"]:
+                sequence_logprob: float = 0
+                tokens: List[Token] = []
+                generated_text = raw_completion["message"]["content"]
+                for token_and_logprob in raw_completion["logprobs"]["content"]:
+                    logprob = token_and_logprob["logprob"]
+                    text = token_and_logprob["token"]
+                    tokens.append(Token(text=text, logprob=logprob))
+                    sequence_logprob += logprob
+                completion = GeneratedOutput(text=generated_text, logprob=sequence_logprob, tokens=tokens)
+                completions.append(completion)
+        except Exception as e:
+            hlog(f"ChatModelInferenceHandler failed with exception {e} during parse_response {response}")
+        return completions
+    def serve_request(self, prompt: str, params: TextChatParameters) -> Dict:
+        semaphore = _get_semaphore(self.inference_engine.model_id)
+        with semaphore:
+            response = self.inference_engine.chat(
+                messages=[{"role": "user", "content": prompt}],
+                params=params,
+            )
+        return response
+class IbmClient(CachingClient, ABC):
+    def __init__(
+        self,
+        cache_config: CacheConfig,
+        api_key: str,
+        region: str,
+        location: dict,
+        watsonx_model_name: str,
+        **kwargs,
+    ):
+        super().__init__(cache_config=cache_config)
+        self.project_id = None
+        self.url = None
+        self.watsonx_model_name = watsonx_model_name
+        self.api_key = api_key
+        self.region = region
+        self.kwargs = kwargs
+        for entry in location:
+            if entry["region"].lower() == self.region.lower():
+                self.project_id = entry["project_id"]
+                self.url = entry["url"]
+        assert self.project_id is not None, (
+            "Missed project_id for specified region configuration in credentials.conf, should be in list "
+            "of JSON objects with 'region', 'url', 'project_id' per region"
+        )
+        assert self.url is not None, (
+            "Missed url for specified region configuration in credentials.conf, should be in list "
+            "of JSON objects with 'region', 'url', 'project_id' per region"
+        )
+        self.inference_engine = ModelInference(
+            model_id=self.watsonx_model_name,
+            params={GenParams.MAX_NEW_TOKENS: 2000},
+            credentials=Credentials(api_key=api_key, url=self.url),
+            project_id=self.project_id,
+        )
+        hlog("Started IBM Client")
+    @abstractmethod
+    def make_request(self, request: Request) -> RequestResult:
+        pass
+    def do_call(self, inference_handler: ModelInferenceHandler, request: Request) -> RequestResult:
+        params = inference_handler.create_params(request=request)
+        def do_it() -> Dict[str, Any]:
+            return inference_handler.serve_request(prompt=request.prompt, params=params)
+        raw_request = {"prompt": request.prompt, "params": params.to_dict(), "model": request.model}
+        cache_key = CachingClient.make_cache_key(raw_request, request)
+        response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
+        completions = inference_handler.parse_response(response)
+        return RequestResult(
+            success=True,
+            cached=cached,
+            request_time=response["request_time"],
+            request_datetime=response.get("request_datetime"),
+            completions=completions,
+            embedding=[],
+        )
+class IbmChatClient(IbmClient):
+    def make_request(self, request: Request) -> RequestResult:
+        # Embedding not supported for this model
+        if request.embedding:
+            return EMBEDDING_UNAVAILABLE_REQUEST_RESULT
+        try:
+            return self.do_call(
+                inference_handler=ChatModelInferenceHandler(inference_engine=self.inference_engine), request=request
+            )
+        except Exception as e:
+            error: str = f"IBM Chat client Model error: {e}"
+            return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
+class IbmTextClient(IbmClient):
+    def make_request(self, request: Request) -> RequestResult:
+        # Embedding not supported for this model
+        if request.embedding:
+            return EMBEDDING_UNAVAILABLE_REQUEST_RESULT
+        try:
+            return self.do_call(
+                inference_handler=GenerateInferenceHandler(inference_engine=self.inference_engine), request=request
+            )
+        except Exception as e:
+            error: str = f"IBM Text client Model error: {e}"
+            return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])

helm/clients/image_generation/adobe_vision_client.py CHANGED Viewed

@@ -9,7 +9,7 @@ from helm.common.tokenization_request import (
     DecodeRequestResult,
 )
 from helm.clients.client import Client, CachingClient
-from .image_generation_client_utils import get_single_image_multimedia_object
+from helm.clients.image_generation.image_generation_client_utils import get_single_image_multimedia_object
 class AdobeVisionClient(Client):

helm/clients/image_generation/aleph_alpha_image_generation_client.py CHANGED Viewed

@@ -9,7 +9,7 @@ from helm.common.tokenization_request import (
     DecodeRequestResult,
 )
 from helm.clients.client import Client, CachingClient
-from .image_generation_client_utils import get_single_image_multimedia_object
+from helm.clients.image_generation.image_generation_client_utils import get_single_image_multimedia_object
 class AlephAlphaImageGenerationClient(Client):

helm/clients/image_generation/cogview2/sr_pipeline/__init__.py CHANGED Viewed

@@ -6,9 +6,9 @@
 @Contact :   dm18@mails.tsinghua.edu.cn
 """
-from .direct_sr import DirectSuperResolution
-from .iterative_sr import IterativeSuperResolution
-from .sr_group import SRGroup
+from helm.clients.image_generation.cogview2.sr_pipeline.direct_sr import DirectSuperResolution
+from helm.clients.image_generation.cogview2.sr_pipeline.iterative_sr import IterativeSuperResolution
+from helm.clients.image_generation.cogview2.sr_pipeline.sr_group import SRGroup
 DirectSuperResolution
 IterativeSuperResolution

helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py CHANGED Viewed

@@ -10,8 +10,11 @@
 import torch
 from icetk import icetk as tokenizer
-from .dsr_sampling import filling_sequence_dsr, IterativeEntfilterStrategy
-from .dsr_model import DsrModel
+from helm.clients.image_generation.cogview2.sr_pipeline.dsr_sampling import (
+    filling_sequence_dsr,
+    IterativeEntfilterStrategy,
+)
+from helm.clients.image_generation.cogview2.sr_pipeline.dsr_model import DsrModel
 from helm.common.optional_dependencies import handle_module_not_found_error

helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py CHANGED Viewed

@@ -8,8 +8,11 @@
 import torch
 from icetk import icetk as tokenizer
-from .itersr_sampling import filling_sequence_itersr, IterativeEntfilterStrategy
-from .itersr_model import ItersrModel
+from helm.clients.image_generation.cogview2.sr_pipeline.itersr_sampling import (
+    filling_sequence_itersr,
+    IterativeEntfilterStrategy,
+)
+from helm.clients.image_generation.cogview2.sr_pipeline.itersr_model import ItersrModel
 from helm.common.optional_dependencies import handle_module_not_found_error

helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py CHANGED Viewed

@@ -7,8 +7,8 @@
 """
 # here put the import lib
-from .direct_sr import DirectSuperResolution
-from .iterative_sr import IterativeSuperResolution
+from helm.clients.image_generation.cogview2.sr_pipeline.direct_sr import DirectSuperResolution
+from helm.clients.image_generation.cogview2.sr_pipeline.iterative_sr import IterativeSuperResolution
 from helm.common.optional_dependencies import handle_module_not_found_error

helm/clients/image_generation/cogview2_client.py CHANGED Viewed

@@ -20,7 +20,7 @@ from helm.common.tokenization_request import (
 )
 from helm.clients.client import Client, CachingClient
 from helm.clients.image_generation.cogview2.coglm_strategy import CoglmStrategy
-from .image_generation_client_utils import get_single_image_multimedia_object
+from helm.clients.image_generation.image_generation_client_utils import get_single_image_multimedia_object
 class CogView2Client(Client):

helm/clients/image_generation/dalle2_client.py CHANGED Viewed

@@ -15,7 +15,7 @@ from helm.common.tokenization_request import (
 )
 from helm.clients.moderation_api_client import ModerationAPIClient
 from helm.clients.client import Client, CachingClient
-from .image_generation_client_utils import get_single_image_multimedia_object
+from helm.clients.image_generation.image_generation_client_utils import get_single_image_multimedia_object
 try:
     import openai

helm/clients/image_generation/dalle3_client.py CHANGED Viewed

@@ -7,8 +7,8 @@ from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import Request, RequestResult, GeneratedOutput, wrap_request_time
 from helm.clients.moderation_api_client import ModerationAPIClient
 from helm.clients.client import CachingClient
-from .dalle2_client import DALLE2Client
-from .image_generation_client_utils import get_single_image_multimedia_object
+from helm.clients.image_generation.dalle2_client import DALLE2Client
+from helm.clients.image_generation.image_generation_client_utils import get_single_image_multimedia_object
 try:
     import openai

helm/clients/image_generation/dalle_mini/__init__.py CHANGED Viewed

@@ -1,3 +1,3 @@
 __version__ = "0.1.4"
-from .model import DalleBart, DalleBartProcessor
+from helm.clients.image_generation.dalle_mini.model import DalleBart, DalleBartProcessor

helm/clients/image_generation/dalle_mini/data.py CHANGED Viewed

@@ -6,7 +6,7 @@ from pathlib import Path
 import numpy as np
 from datasets import Dataset, load_dataset
-from .model.text import TextNormalizer
+from helm.clients.image_generation.dalle_mini.model.text import TextNormalizer
 from helm.common.optional_dependencies import handle_module_not_found_error
 try:

helm/clients/image_generation/dalle_mini/model/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
-from .configuration import DalleBartConfig
-from .modeling import DalleBart
-from .partitions import set_partitions
-from .processor import DalleBartProcessor
-from .tokenizer import DalleBartTokenizer
+from helm.clients.image_generation.dalle_mini.model.configuration import DalleBartConfig
+from helm.clients.image_generation.dalle_mini.model.modeling import DalleBart
+from helm.clients.image_generation.dalle_mini.model.partitions import set_partitions
+from helm.clients.image_generation.dalle_mini.model.processor import DalleBartProcessor
+from helm.clients.image_generation.dalle_mini.model.tokenizer import DalleBartTokenizer

helm/clients/image_generation/dalle_mini/model/configuration.py CHANGED Viewed

@@ -12,13 +12,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" DalleBart model configuration """
+"""DalleBart model configuration"""
 import warnings
 from transformers.configuration_utils import PretrainedConfig
 from transformers.utils import logging
-from .utils import PretrainedFromWandbMixin
+from helm.clients.image_generation.dalle_mini.model.utils import PretrainedFromWandbMixin
 logger = logging.get_logger(__name__)

helm/clients/image_generation/dalle_mini/model/modeling.py CHANGED Viewed

@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" DalleBart model. """
+"""DalleBart model."""
 import math
 from functools import partial
@@ -35,8 +35,8 @@ from transformers.utils import ModelOutput, logging
 from transformers.generation.configuration_utils import GenerationConfig
 from helm.common.optional_dependencies import handle_module_not_found_error
-from .configuration import DalleBartConfig
-from .utils import PretrainedFromWandbMixin
+from helm.clients.image_generation.dalle_mini.model.configuration import DalleBartConfig
+from helm.clients.image_generation.dalle_mini.model.utils import PretrainedFromWandbMixin
 try:
     import flax

helm/clients/image_generation/dalle_mini/model/processor.py CHANGED Viewed

@@ -1,11 +1,11 @@
-""" DalleBart processor """
+"""DalleBart processor"""
 from typing import List
-from .configuration import DalleBartConfig
-from .text import TextNormalizer
-from .tokenizer import DalleBartTokenizer
-from .utils import PretrainedFromWandbMixin
+from helm.clients.image_generation.dalle_mini.model.configuration import DalleBartConfig
+from helm.clients.image_generation.dalle_mini.model.text import TextNormalizer
+from helm.clients.image_generation.dalle_mini.model.tokenizer import DalleBartTokenizer
+from helm.clients.image_generation.dalle_mini.model.utils import PretrainedFromWandbMixin
 from helm.common.optional_dependencies import handle_module_not_found_error

helm/clients/image_generation/dalle_mini/model/tokenizer.py CHANGED Viewed

@@ -1,8 +1,8 @@
-""" DalleBart tokenizer """
+"""DalleBart tokenizer"""
 from transformers import BartTokenizerFast
-from .utils import PretrainedFromWandbMixin
+from helm.clients.image_generation.dalle_mini.model.utils import PretrainedFromWandbMixin
 class DalleBartTokenizer(PretrainedFromWandbMixin, BartTokenizerFast):

helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py CHANGED Viewed

	@@ -1 +1 @@
1	- from . import *
1	+ from helm.clients.image_generation.dalle_mini.vqgan_jax import *

helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py CHANGED Viewed

@@ -2,8 +2,8 @@ import re
 import torch
-from .modeling_flax_vqgan import VQModel
-from .configuration_vqgan import VQGANConfig
+from helm.clients.image_generation.dalle_mini.vqgan_jax.modeling_flax_vqgan import VQModel
+from helm.clients.image_generation.dalle_mini.vqgan_jax.configuration_vqgan import VQGANConfig
 from helm.common.optional_dependencies import handle_module_not_found_error
 try:

helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py CHANGED Viewed

@@ -6,7 +6,7 @@ import math
 from transformers.modeling_flax_utils import FlaxPreTrainedModel
-from .configuration_vqgan import VQGANConfig
+from helm.clients.image_generation.dalle_mini.vqgan_jax.configuration_vqgan import VQGANConfig
 from helm.common.optional_dependencies import handle_module_not_found_error
 try:

helm/clients/image_generation/dalle_mini_client.py CHANGED Viewed

@@ -15,7 +15,7 @@ from helm.common.tokenization_request import (
     TokenizationRequestResult,
 )
 from helm.clients.client import Client, CachingClient
-from .image_generation_client_utils import get_single_image_multimedia_object
+from helm.clients.image_generation.image_generation_client_utils import get_single_image_multimedia_object
 class DALLEMiniClient(Client):

helm/clients/image_generation/deep_floyd_client.py CHANGED Viewed

@@ -9,7 +9,7 @@ from helm.common.tokenization_request import (
     DecodeRequestResult,
 )
 from helm.clients.client import Client, CachingClient
-from .image_generation_client_utils import get_single_image_multimedia_object
+from helm.clients.image_generation.image_generation_client_utils import get_single_image_multimedia_object
 class DeepFloydClient(Client):

helm/clients/image_generation/huggingface_diffusers_client.py CHANGED Viewed

@@ -17,7 +17,7 @@ from helm.common.tokenization_request import (
     TokenizationRequestResult,
 )
 from helm.clients.client import Client, CachingClient
-from .image_generation_client_utils import get_single_image_multimedia_object
+from helm.clients.image_generation.image_generation_client_utils import get_single_image_multimedia_object
 _models_lock: Lock = Lock()

crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

Potentially problematic release.

crfm-helm 0.5.4py3-none-any.whl → 0.5.6py3-none-any.whl