PyPI - crfm-helm - Versions diffs - 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl - Mend

crfm-helm 0.5.4py3-none-any.whl → 0.5.6py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crfm-helm might be problematic. Click here for more details.

Files changed (652) hide show

crfm_helm-0.5.6.dist-info/METADATA +427 -0
crfm_helm-0.5.6.dist-info/RECORD +941 -0
{crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
helm/benchmark/adaptation/adapter_spec.py +13 -1
helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
helm/benchmark/adaptation/common_adapter_specs.py +69 -4
helm/benchmark/adaptation/prompt.py +1 -1
helm/benchmark/annotation/aci_bench_annotator.py +95 -0
helm/benchmark/annotation/air_bench_annotator.py +21 -6
helm/benchmark/annotation/annotator.py +5 -0
helm/benchmark/annotation/annotator_factory.py +3 -20
helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
helm/benchmark/annotation/bird_sql_annotator.py +58 -0
helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
helm/benchmark/annotation/dischargeme_annotator.py +107 -0
helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
helm/benchmark/annotation/live_qa_annotator.py +1 -1
helm/benchmark/annotation/med_dialog_annotator.py +99 -0
helm/benchmark/annotation/medalign_annotator.py +100 -0
helm/benchmark/annotation/medi_qa_annotator.py +98 -0
helm/benchmark/annotation/medication_qa_annotator.py +87 -63
helm/benchmark/annotation/mental_health_annotator.py +98 -0
helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
helm/benchmark/annotation/model_as_judge.py +214 -6
helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
helm/benchmark/annotation/omni_math_annotator.py +131 -0
helm/benchmark/annotation/spider_annotator.py +18 -0
helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
helm/benchmark/annotation/wildbench_annotator.py +119 -0
helm/benchmark/annotation_executor.py +35 -15
helm/benchmark/augmentations/cleva_perturbation.py +9 -8
helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
helm/benchmark/augmentations/dialect_perturbation.py +4 -5
helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
helm/benchmark/augmentations/gender_perturbation.py +2 -2
helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
helm/benchmark/augmentations/person_name_perturbation.py +4 -5
helm/benchmark/augmentations/perturbation.py +1 -1
helm/benchmark/augmentations/space_perturbation.py +2 -2
helm/benchmark/augmentations/suffix_perturbation.py +2 -2
helm/benchmark/augmentations/synonym_perturbation.py +4 -3
helm/benchmark/augmentations/test_perturbation.py +16 -13
helm/benchmark/augmentations/translate_perturbation.py +2 -2
helm/benchmark/augmentations/typos_perturbation.py +2 -2
helm/benchmark/data_preprocessor.py +2 -2
helm/benchmark/executor.py +11 -12
helm/benchmark/huggingface_registration.py +2 -7
helm/benchmark/metrics/aci_bench_metrics.py +14 -0
helm/benchmark/metrics/basic_metrics.py +6 -6
helm/benchmark/metrics/bbq_metrics.py +2 -2
helm/benchmark/metrics/bias_metrics.py +12 -3
helm/benchmark/metrics/bias_word_lists.py +1 -1
helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
helm/benchmark/metrics/bird_sql_metrics.py +28 -0
helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
helm/benchmark/metrics/classification_metrics.py +76 -12
helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
helm/benchmark/metrics/code_metrics.py +5 -5
helm/benchmark/metrics/comet_metric.py +125 -0
helm/benchmark/metrics/common_metric_specs.py +9 -2
helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
helm/benchmark/metrics/copyright_metrics.py +4 -4
helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
helm/benchmark/metrics/dischargeme_metrics.py +14 -0
helm/benchmark/metrics/disinformation_metrics.py +4 -4
helm/benchmark/metrics/dry_run_metrics.py +5 -5
helm/benchmark/metrics/efficiency_metrics.py +6 -6
helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
helm/benchmark/metrics/ifeval/__init__.py +0 -0
helm/benchmark/metrics/ifeval/instructions.py +1574 -0
helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
helm/benchmark/metrics/ifeval_metrics.py +55 -0
helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
helm/benchmark/metrics/language_modeling_metrics.py +4 -4
helm/benchmark/metrics/llm_jury_metrics.py +46 -0
helm/benchmark/metrics/machine_translation_metrics.py +2 -2
helm/benchmark/metrics/med_dialog_metrics.py +14 -0
helm/benchmark/metrics/medalign_metrics.py +14 -0
helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
helm/benchmark/metrics/medec_metrics.py +101 -0
helm/benchmark/metrics/medi_qa_metrics.py +14 -0
helm/benchmark/metrics/medication_qa_metrics.py +10 -19
helm/benchmark/metrics/melt_bias_metric.py +234 -0
helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
helm/benchmark/metrics/melt_metric_specs.py +43 -0
helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
helm/benchmark/metrics/mental_health_metrics.py +14 -0
helm/benchmark/metrics/metric.py +3 -3
helm/benchmark/metrics/metric_service.py +11 -11
helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
helm/benchmark/metrics/nltk_helper.py +32 -0
helm/benchmark/metrics/numeracy_metrics.py +4 -4
helm/benchmark/metrics/omni_math_metrics.py +32 -0
helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
helm/benchmark/metrics/output_processing_metric.py +60 -0
helm/benchmark/metrics/output_processors.py +15 -0
helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
helm/benchmark/metrics/ranking_metrics.py +3 -3
helm/benchmark/metrics/reference_metric.py +3 -3
helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
helm/benchmark/metrics/spider_metrics.py +7 -0
helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
helm/benchmark/metrics/statistic.py +1 -1
helm/benchmark/metrics/summac/model_summac.py +2 -3
helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
helm/benchmark/metrics/summarization_metrics.py +20 -9
helm/benchmark/metrics/test_bias_metrics.py +5 -1
helm/benchmark/metrics/test_classification_metrics.py +140 -68
helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
helm/benchmark/metrics/test_metric.py +1 -1
helm/benchmark/metrics/test_statistic.py +2 -2
helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
helm/benchmark/metrics/toxicity_metrics.py +6 -6
helm/benchmark/metrics/unitxt_metrics.py +7 -5
helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
helm/benchmark/metrics/vision_language/image_utils.py +2 -2
helm/benchmark/metrics/wildbench_metrics.py +34 -0
helm/benchmark/model_deployment_registry.py +6 -8
helm/benchmark/model_metadata_registry.py +16 -0
helm/benchmark/presentation/contamination.py +3 -3
helm/benchmark/presentation/create_plots.py +33 -12
helm/benchmark/presentation/run_display.py +13 -0
helm/benchmark/presentation/schema.py +2 -1
helm/benchmark/presentation/summarize.py +97 -67
helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
helm/benchmark/reeval_run.py +202 -0
helm/benchmark/reeval_runner.py +355 -0
helm/benchmark/run.py +86 -90
helm/benchmark/run_expander.py +90 -9
helm/benchmark/run_spec_factory.py +13 -0
helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
helm/benchmark/run_specs/audio_run_specs.py +657 -0
helm/benchmark/run_specs/call_center_run_specs.py +49 -0
helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
helm/benchmark/run_specs/classic_run_specs.py +1 -69
helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
helm/benchmark/run_specs/experimental_run_specs.py +142 -3
helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
helm/benchmark/run_specs/lite_run_specs.py +2 -2
helm/benchmark/run_specs/long_context_run_specs.py +141 -0
helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
helm/benchmark/run_specs/melt_run_specs.py +783 -0
helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
helm/benchmark/run_specs/oab_exams_specs.py +32 -0
helm/benchmark/run_specs/safety_run_specs.py +37 -0
helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
helm/benchmark/run_specs/sql_run_specs.py +54 -0
helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
helm/benchmark/run_specs/vlm_run_specs.py +103 -2
helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
helm/benchmark/runner.py +5 -5
helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
helm/benchmark/scenarios/air_bench_scenario.py +6 -1
helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
helm/benchmark/scenarios/audio_language/__init__.py +0 -0
helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
helm/benchmark/scenarios/banking77_scenario.py +6 -1
helm/benchmark/scenarios/bbq_scenario.py +1 -1
helm/benchmark/scenarios/big_bench_scenario.py +11 -1
helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
helm/benchmark/scenarios/blimp_scenario.py +1 -1
helm/benchmark/scenarios/bold_scenario.py +1 -1
helm/benchmark/scenarios/boolq_scenario.py +1 -1
helm/benchmark/scenarios/casehold_scenario.py +79 -0
helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
helm/benchmark/scenarios/clear_scenario.py +157 -0
helm/benchmark/scenarios/cleva_scenario.py +2 -2
helm/benchmark/scenarios/code_scenario.py +17 -4
helm/benchmark/scenarios/commonsense_scenario.py +1 -1
helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
helm/benchmark/scenarios/copyright_scenario.py +1 -1
helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
helm/benchmark/scenarios/disinformation_scenario.py +10 -1
helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
helm/benchmark/scenarios/gpqa_scenario.py +80 -0
helm/benchmark/scenarios/grammar.py +2 -2
helm/benchmark/scenarios/grammar_scenario.py +2 -2
helm/benchmark/scenarios/gsm_scenario.py +10 -1
helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
helm/benchmark/scenarios/headqa_scenario.py +136 -0
helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
helm/benchmark/scenarios/ice_scenario.py +8 -4
helm/benchmark/scenarios/ifeval_scenario.py +53 -0
helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
helm/benchmark/scenarios/imdb_scenario.py +11 -2
helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
helm/benchmark/scenarios/koala_scenario.py +1 -1
helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
helm/benchmark/scenarios/legal_support_scenario.py +11 -1
helm/benchmark/scenarios/legalbench_scenario.py +22 -3
helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
helm/benchmark/scenarios/lextreme_scenario.py +11 -1
helm/benchmark/scenarios/live_qa_scenario.py +1 -1
helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
helm/benchmark/scenarios/math_scenario.py +9 -1
helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
helm/benchmark/scenarios/med_qa_scenario.py +10 -1
helm/benchmark/scenarios/medalign_scenario.py +94 -0
helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
helm/benchmark/scenarios/medbullets_scenario.py +145 -0
helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
helm/benchmark/scenarios/medec_scenario.py +125 -0
helm/benchmark/scenarios/medhallu_scenario.py +72 -0
helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
helm/benchmark/scenarios/melt_scenarios.py +793 -0
helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
helm/benchmark/scenarios/mental_health_scenario.py +123 -0
helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
helm/benchmark/scenarios/mmlu_scenario.py +11 -1
helm/benchmark/scenarios/msmarco_scenario.py +1 -1
helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
helm/benchmark/scenarios/newsqa_scenario.py +1 -1
helm/benchmark/scenarios/numeracy_scenario.py +12 -2
helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
helm/benchmark/scenarios/omni_math_scenario.py +53 -0
helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
helm/benchmark/scenarios/quac_scenario.py +10 -1
helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
helm/benchmark/scenarios/raft_scenario.py +17 -2
helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
helm/benchmark/scenarios/scenario.py +9 -1
helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
helm/benchmark/scenarios/spider_scenario.py +91 -0
helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
helm/benchmark/scenarios/summarization_scenario.py +11 -1
helm/benchmark/scenarios/sumosum_scenario.py +157 -0
helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
helm/benchmark/scenarios/test_math_scenario.py +1 -0
helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
helm/benchmark/scenarios/the_pile_scenario.py +1 -1
helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
helm/benchmark/scenarios/unitxt_scenario.py +8 -2
helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
helm/benchmark/scenarios/vicuna_scenario.py +1 -1
helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
helm/benchmark/scenarios/wikifact_scenario.py +11 -1
helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
helm/benchmark/scenarios/wildbench_scenario.py +83 -0
helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
helm/benchmark/scenarios/xstest_scenario.py +1 -1
helm/benchmark/server.py +13 -1
helm/benchmark/slurm_runner.py +1 -1
helm/benchmark/static/schema_audio.yaml +763 -0
helm/benchmark/static/schema_autobencher.yaml +150 -0
helm/benchmark/static/schema_call_center.yaml +97 -60
helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
helm/benchmark/static/schema_czech_bank.yaml +148 -0
helm/benchmark/static/schema_enem_challenge.yaml +146 -0
helm/benchmark/static/schema_enterprise.yaml +319 -0
helm/benchmark/static/schema_finance.yaml +14 -12
helm/benchmark/static/schema_heim.yaml +1389 -0
helm/benchmark/static/schema_long_context.yaml +283 -0
helm/benchmark/static/schema_medhelm.yaml +1140 -0
helm/benchmark/static/schema_melt.yaml +1257 -0
helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
helm/benchmark/static/schema_safety.yaml +18 -1
helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
helm/benchmark/static/schema_slphelm.yaml +162 -0
helm/benchmark/static/schema_social_audio.yaml +224 -0
helm/benchmark/static/schema_sql.yaml +171 -0
helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
helm/benchmark/static/schema_vhelm.yaml +129 -56
helm/benchmark/static/schema_video.yaml +219 -0
helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
helm/benchmark/static_build/assets/index-94295e78.js +10 -0
helm/benchmark/static_build/assets/index-b9779128.css +1 -0
helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
helm/benchmark/static_build/config.js +1 -1
helm/benchmark/static_build/index.html +6 -6
helm/benchmark/window_services/default_window_service.py +1 -1
helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
helm/benchmark/window_services/ice_window_service.py +1 -1
helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
helm/benchmark/window_services/local_window_service.py +2 -2
helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
helm/benchmark/window_services/test_bloom_window_service.py +3 -3
helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
helm/benchmark/window_services/test_gptj_window_service.py +8 -3
helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
helm/benchmark/window_services/test_openai_window_service.py +8 -3
helm/benchmark/window_services/test_opt_window_service.py +3 -3
helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
helm/benchmark/window_services/test_t511b_window_service.py +3 -3
helm/benchmark/window_services/test_ul2_window_service.py +3 -3
helm/benchmark/window_services/test_utils.py +4 -5
helm/benchmark/window_services/test_yalm_window_service.py +3 -3
helm/benchmark/window_services/tokenizer_service.py +7 -8
helm/benchmark/window_services/yalm_window_service.py +1 -1
helm/clients/ai21_client.py +3 -3
helm/clients/aleph_alpha_client.py +1 -1
helm/clients/anthropic_client.py +69 -29
helm/clients/audio_language/__init__.py +0 -0
helm/clients/audio_language/diva_llama_client.py +120 -0
helm/clients/audio_language/llama_omni_client.py +198 -0
helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
helm/clients/audio_language/qwen_audiolm_client.py +152 -0
helm/clients/audio_language/test.py +62 -0
helm/clients/auto_client.py +4 -2
helm/clients/azure_openai_client.py +55 -0
helm/clients/bedrock_client.py +203 -7
helm/clients/bedrock_utils.py +33 -0
helm/clients/client.py +7 -7
helm/clients/clip_scorers/clip_scorer.py +1 -1
helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
helm/clients/cohere_client.py +3 -3
helm/clients/google_client.py +1 -1
helm/clients/grok_client.py +36 -0
helm/clients/http_model_client.py +1 -1
helm/clients/huggingface_client.py +52 -21
helm/clients/huggingface_pipeline_client.py +138 -0
helm/clients/ibm_client.py +267 -0
helm/clients/image_generation/adobe_vision_client.py +1 -1
helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
helm/clients/image_generation/cogview2_client.py +1 -1
helm/clients/image_generation/dalle2_client.py +1 -1
helm/clients/image_generation/dalle3_client.py +2 -2
helm/clients/image_generation/dalle_mini/__init__.py +1 -1
helm/clients/image_generation/dalle_mini/data.py +1 -1
helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
helm/clients/image_generation/dalle_mini_client.py +1 -1
helm/clients/image_generation/deep_floyd_client.py +1 -1
helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
helm/clients/image_generation/lexica_client.py +1 -1
helm/clients/image_generation/mindalle/models/__init__.py +6 -6
helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
helm/clients/image_generation/mindalle_client.py +1 -1
helm/clients/image_generation/together_image_generation_client.py +1 -1
helm/clients/lit_gpt_client.py +2 -2
helm/clients/mistral_client.py +62 -18
helm/clients/nvidia_nim_client.py +0 -3
helm/clients/openai_client.py +308 -43
helm/clients/openai_responses_client.py +174 -0
helm/clients/palmyra_client.py +3 -9
helm/clients/reka_client.py +3 -3
helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
helm/clients/stanfordhealthcare_claude_client.py +31 -0
helm/clients/stanfordhealthcare_google_client.py +43 -0
helm/clients/stanfordhealthcare_http_model_client.py +93 -0
helm/clients/stanfordhealthcare_openai_client.py +62 -0
helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
helm/clients/test_client.py +1 -1
helm/clients/test_together_client.py +6 -1
helm/clients/together_client.py +76 -9
helm/clients/upstage_client.py +23 -0
helm/clients/vertexai_client.py +45 -13
helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
helm/clients/vision_language/huggingface_vlm_client.py +2 -2
helm/clients/vision_language/idefics_client.py +6 -2
helm/clients/vision_language/open_flamingo/__init__.py +2 -2
helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
helm/clients/vision_language/paligemma_client.py +2 -2
helm/clients/vision_language/qwen2_vlm_client.py +188 -0
helm/clients/vision_language/qwen_vlm_client.py +7 -5
helm/clients/vllm_client.py +4 -6
helm/clients/writer_client.py +102 -0
helm/clients/yi_client.py +0 -3
helm/common/audio_utils.py +111 -0
helm/common/context.py +80 -0
helm/common/credentials_utils.py +5 -5
helm/common/file_caches/local_file_cache.py +1 -1
helm/common/file_caches/test_local_file_cache.py +1 -1
helm/common/general.py +9 -2
helm/common/hierarchical_logger.py +46 -3
helm/common/images_utils.py +2 -2
helm/common/local_context.py +140 -0
helm/common/media_object.py +2 -2
helm/common/multimodal_request_utils.py +26 -0
helm/common/reeval_parameters.py +12 -0
helm/common/remote_context.py +61 -0
helm/common/request.py +14 -2
helm/common/response_format.py +18 -0
helm/common/test_media_object.py +1 -1
helm/config/model_deployments.yaml +1792 -28
helm/config/model_metadata.yaml +1606 -51
helm/config/tokenizer_configs.yaml +521 -4
helm/proxy/cli.py +5 -3
helm/proxy/critique/mechanical_turk_utils.py +1 -1
helm/proxy/example_queries.py +1 -1
helm/proxy/server.py +11 -4
helm/proxy/services/remote_service.py +1 -1
helm/proxy/services/server_service.py +22 -86
helm/proxy/services/test_remote_service.py +2 -2
helm/proxy/services/test_service.py +1 -1
helm/proxy/static/general.js +122 -0
helm/proxy/static/help.html +99 -0
helm/proxy/static/index.css +57 -0
helm/proxy/static/index.html +40 -0
helm/proxy/static/index.js +456 -0
helm/proxy/static/info-icon.png +0 -0
helm/proxy/test_retry.py +1 -1
helm/proxy/token_counters/auto_token_counter.py +1 -1
helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
helm/tokenizers/caching_tokenizer.py +2 -30
helm/tokenizers/grok_tokenizer.py +53 -0
helm/tokenizers/http_model_tokenizer.py +1 -1
helm/tokenizers/huggingface_tokenizer.py +3 -3
helm/tokenizers/lit_gpt_tokenizer.py +1 -1
helm/tokenizers/test_anthropic_tokenizer.py +6 -2
helm/tokenizers/test_grok_tokenizer.py +33 -0
helm/tokenizers/test_huggingface_tokenizer.py +1 -1
helm/tokenizers/test_yalm_tokenizer.py +1 -1
helm/tokenizers/tiktoken_tokenizer.py +1 -1
helm/tokenizers/tokenizer.py +3 -1
helm/tokenizers/yalm_tokenizer.py +3 -3
helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
crfm_helm-0.5.4.dist-info/METADATA +0 -350
crfm_helm-0.5.4.dist-info/RECORD +0 -697
helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
helm/tokenizers/anthropic_tokenizer.py +0 -52
{crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
{crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
{crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0

helm/clients/palmyra_client.py CHANGED Viewed

@@ -5,14 +5,14 @@ from typing import Any, Dict, List
 from helm.clients.openai_client import OpenAIClient
 from helm.common.cache import CacheConfig
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hwarn
 from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token, ErrorFlags
 from helm.common.tokenization_request import (
     TokenizationRequest,
     TokenizationRequestResult,
 )
 from helm.tokenizers.tokenizer import Tokenizer
-from .client import CachingClient, truncate_sequence
+from helm.clients.client import CachingClient, truncate_sequence
 _CONTENT_MODERATION_KEY = "fail.content.moderation.failed"
@@ -103,10 +103,7 @@ class PalmyraClient(CachingClient):
                 return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
             if _is_content_moderation_failure(response):
-                hlog(
-                    f"WARNING: Returning empty request for {request.model_deployment} "
-                    "due to content moderation filter"
-                )
+                hwarn(f"Returning empty request for {request.model_deployment} " "due to content moderation filter")
                 return RequestResult(
                     success=False,
                     cached=False,
@@ -163,6 +160,3 @@ class PalmyraChatClient(OpenAIClient):
             org_id=None,
             base_url="https://api.writer.com/v1/chat",
         )
-    def _is_chat_model_engine(self, model_engine: str) -> bool:
-        return True

helm/clients/reka_client.py CHANGED Viewed

@@ -6,10 +6,10 @@ from helm.proxy.retry import NonRetriableException
 from helm.common.cache import CacheConfig
 from helm.common.media_object import TEXT_TYPE
 from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hwarn
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.tokenizers.tokenizer import Tokenizer
-from .client import CachingClient, truncate_and_tokenize_response_text
+from helm.clients.client import CachingClient, truncate_and_tokenize_response_text
 try:
     import reka
@@ -121,7 +121,7 @@ class RekaClient(CachingClient):
             if messages[-1]["role"] != "user":
                 raise ValueError("Last message must have role 'user'")
             if request.prompt != "":
-                hlog("WARNING: Since message is set, prompt will be ignored")
+                hwarn("Since message is set, prompt will be ignored")
             reka_chat_history = self._convert_messages_to_reka_chat_history(messages)
         else:
             current_chat_history: Dict[str, Any] = {

helm/clients/stanfordhealthcare_azure_openai_client.py ADDED Viewed

@@ -0,0 +1,58 @@
+from typing import Optional
+from helm.clients.azure_openai_client import AzureOpenAIClient
+from helm.common.cache import CacheConfig
+from helm.proxy.retry import NonRetriableException
+from helm.tokenizers.tokenizer import Tokenizer
+class StanfordHealthCareAzureOpenAIClient(AzureOpenAIClient):
+    """
+    Client for accessing OpenAI models hosted on Stanford Health Care's model API.
+    Configure by setting the following in prod_env/credentials.conf:
+    ```
+    stanfordhealthcareEndpoint: https://your-domain-name/
+    stanfordhealthcareApiKey: your-private-key
+    ```
+    """
+    CREDENTIAL_HEADER_NAME = "Ocp-Apim-Subscription-Key"
+    def __init__(
+        self,
+        tokenizer: Tokenizer,
+        tokenizer_name: str,
+        cache_config: CacheConfig,
+        openai_model_name: str,
+        api_version: str,
+        api_key: Optional[str] = None,
+        endpoint: Optional[str] = None,
+        base_url: Optional[str] = None,
+    ):
+        if not api_key:
+            raise NonRetriableException("Must provide API key through credentials.conf")
+        if base_url:
+            base_url = base_url.format(endpoint=endpoint)
+            super().__init__(
+                tokenizer=tokenizer,
+                tokenizer_name=tokenizer_name,
+                cache_config=cache_config,
+                api_key="unused",
+                base_url=base_url,
+                azure_openai_deployment_name=openai_model_name,
+                api_version=api_version,
+                default_headers={StanfordHealthCareAzureOpenAIClient.CREDENTIAL_HEADER_NAME: api_key},
+            )
+        else:
+            super().__init__(
+                tokenizer=tokenizer,
+                tokenizer_name=tokenizer_name,
+                cache_config=cache_config,
+                api_key="unused",
+                endpoint=endpoint,
+                azure_openai_deployment_name=openai_model_name,
+                api_version=api_version,
+                default_headers={StanfordHealthCareAzureOpenAIClient.CREDENTIAL_HEADER_NAME: api_key},
+            )

helm/clients/stanfordhealthcare_claude_client.py ADDED Viewed

@@ -0,0 +1,31 @@
+from typing import Any, Dict, List
+from helm.common.request import (
+    Request,
+    GeneratedOutput,
+)
+from helm.clients.stanfordhealthcare_http_model_client import StanfordHealthCareHTTPModelClient
+class StanfordHealthCareClaudeClient(StanfordHealthCareHTTPModelClient):
+    """
+    Client for accessing Claude models hosted on Stanford Health Care's model API.
+    Configure by setting the following in prod_env/credentials.conf:
+    ```
+    stanfordhealthcareEndpoint: https://your-domain-name/
+    stanfordhealthcareApiKey: your-private-key
+    ```
+    """
+    def get_request(self, request: Request) -> Dict[str, Any]:
+        return {
+            "model_id": self.model,
+            "prompt_text": request.prompt,
+            "max_tokens": request.max_tokens,
+            "temperature": request.temperature,
+        }
+    def parse_response(self, response: Dict[str, Any]) -> List[GeneratedOutput]:
+        return [GeneratedOutput(text=item["text"], logprob=0, tokens=[]) for item in response["content"]]

helm/clients/stanfordhealthcare_google_client.py ADDED Viewed

@@ -0,0 +1,43 @@
+from typing import Any, Dict, List
+from helm.common.request import (
+    Request,
+    GeneratedOutput,
+)
+from helm.clients.stanfordhealthcare_http_model_client import StanfordHealthCareHTTPModelClient
+class StanfordHealthCareGoogleClient(StanfordHealthCareHTTPModelClient):
+    """
+    Client for accessing Google models hosted on Stanford Health Care's model API.
+    Configure by setting the following in prod_env/credentials.conf:
+    ```
+    stanfordhealthcareEndpoint: https://your-domain-name/
+    stanfordhealthcareApiKey: your-private-key
+    ```
+    """
+    def get_request(self, request: Request) -> Dict[str, Any]:
+        return {
+            "contents": {
+                "role": "user",
+                "parts": {"text": request.prompt},
+            },
+            "generation_config": {
+                "temperature": request.temperature,
+                "topP": request.top_p,
+                "topK": request.top_k_per_token,
+                "candidateCount": 1,
+                "maxOutputTokens": request.max_tokens,
+                "stopSequences": request.stop_sequences,
+            },
+        }
+    def parse_response(self, response: Dict[str, Any]) -> List[GeneratedOutput]:
+        completion = ""
+        for item in response["content"]:
+            if "content" in item["candidates"][0]:
+                completion += item["candidates"][0]["content"]["parts"][0]["text"]
+        return [GeneratedOutput(text=completion, logprob=0, tokens=[])]

helm/clients/stanfordhealthcare_http_model_client.py ADDED Viewed

@@ -0,0 +1,93 @@
+import requests
+from abc import ABC, abstractmethod
+from dataclasses import asdict
+from typing import Any, Dict, List, Optional
+from helm.common.cache import CacheConfig
+from helm.common.request import (
+    wrap_request_time,
+    Request,
+    RequestResult,
+    GeneratedOutput,
+    EMBEDDING_UNAVAILABLE_REQUEST_RESULT,
+)
+from helm.clients.client import CachingClient
+class StanfordHealthCareHTTPModelClient(CachingClient, ABC):
+    """
+    Client for accessing Stanford Health Care models via HTTP requests.
+    Configure by setting the following in prod_env/credentials.conf:
+    ```
+    stanfordhealthcareEndpoint: https://your-domain-name/
+    stanfordhealthcareApiKey: your-private-key
+    ```
+    """
+    CREDENTIAL_HEADER_NAME = "Ocp-Apim-Subscription-Key"
+    def __init__(
+        self,
+        cache_config: CacheConfig,
+        deployment: str,
+        endpoint: str = "http://localhost:8080",
+        do_cache: bool = False,
+        timeout: int = 3000,
+        api_key: Optional[str] = None,
+        model: Optional[str] = None,
+    ):
+        super().__init__(cache_config=cache_config)
+        assert api_key, "API key must be provided"
+        self.endpoint = endpoint
+        self.timeout = timeout
+        self.do_cache = do_cache
+        self.deployment = deployment
+        self.model = model
+        self.default_headers = {StanfordHealthCareHTTPModelClient.CREDENTIAL_HEADER_NAME: api_key}
+    def make_request(self, request: Request) -> RequestResult:
+        cache_key = asdict(request)
+        if request.embedding:
+            return EMBEDDING_UNAVAILABLE_REQUEST_RESULT
+        raw_request = self.get_request(request)
+        try:
+            def do_it() -> Dict[str, Any]:
+                url = f"{self.endpoint}/{self.deployment}"
+                response = requests.post(url, json=raw_request, headers=self.default_headers, timeout=self.timeout)
+                response.raise_for_status()
+                response_json = response.json()
+                if type(response_json) == list:
+                    response_json = {"content": response_json}
+                return response_json
+            if self.do_cache:
+                response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
+            else:
+                response, cached = wrap_request_time(do_it)(), False
+            completions = self.parse_response(response)
+            return RequestResult(
+                success=True,
+                cached=cached,
+                error=None,
+                completions=completions,
+                embedding=[],
+                request_time=response["request_time"],
+            )
+        except requests.exceptions.RequestException as e:
+            return RequestResult(success=False, cached=False, error=f"Request error: {e}", completions=[], embedding=[])
+    @abstractmethod
+    def get_request(self, request: Request) -> Dict[str, Any]:
+        pass
+    @abstractmethod
+    def parse_response(self, response: Dict[str, Any]) -> List[GeneratedOutput]:
+        pass

helm/clients/stanfordhealthcare_openai_client.py ADDED Viewed

@@ -0,0 +1,62 @@
+from typing import Optional
+from helm.clients.openai_client import OpenAIClient
+from helm.common.cache import CacheConfig
+from helm.common.optional_dependencies import handle_module_not_found_error
+from helm.proxy.retry import NonRetriableException
+from helm.tokenizers.tokenizer import Tokenizer
+try:
+    from openai import OpenAI
+except ModuleNotFoundError as e:
+    handle_module_not_found_error(e, ["openai"])
+class StanfordHealthCareOpenAIClient(OpenAIClient):
+    """
+    Client for accessing OpenAI models hosted on Stanford Health Care's model API.
+    Configure by setting the following in prod_env/credentials.conf:
+    ```
+    stanfordhealthcareEndpoint: https://your-domain-name/
+    stanfordhealthcareApiKey: your-private-key
+    ```
+    """
+    CREDENTIAL_HEADER_NAME = "Ocp-Apim-Subscription-Key"
+    def __init__(
+        self,
+        tokenizer: Tokenizer,
+        tokenizer_name: str,
+        cache_config: CacheConfig,
+        model_name: str,
+        api_key: Optional[str] = None,
+        endpoint: Optional[str] = None,
+        base_url: Optional[str] = None,
+        openai_model_name: Optional[str] = None,
+        output_processor: Optional[str] = None,
+    ):
+        super().__init__(
+            tokenizer=tokenizer,
+            tokenizer_name=tokenizer_name,
+            cache_config=cache_config,
+            api_key="unused",
+            openai_model_name=openai_model_name,
+            output_processor=output_processor,
+        )
+        if not endpoint:
+            raise NonRetriableException("Must provide endpoint through credentials.conf")
+        if not api_key:
+            raise NonRetriableException("Must provide API key through credentials.conf")
+        if not base_url:
+            raise NonRetriableException("Must provide base url through model_deployments")
+        # Guess the base URL part based on the model name
+        # Maybe make this configurable instead?
+        base_url = base_url.format(endpoint=endpoint)
+        self.client = OpenAI(
+            api_key="dummy",
+            base_url=base_url,
+            default_headers={StanfordHealthCareOpenAIClient.CREDENTIAL_HEADER_NAME: api_key},
+        )

helm/clients/stanfordhealthcare_shc_openai_client.py ADDED Viewed

@@ -0,0 +1,42 @@
+from typing import Dict, Optional
+from helm.clients.azure_openai_client import AzureOpenAIClient
+from helm.common.cache import CacheConfig
+from helm.proxy.retry import NonRetriableException
+from helm.tokenizers.tokenizer import Tokenizer
+class StanfordHealthCareSHCOpenAIClient(AzureOpenAIClient):
+    """
+    Client for accessing OpenAI models hosted on Stanford Health Care's model API.
+    Configure by setting the following in prod_env/credentials.conf:
+    ```
+    stanfordhealthcareEndpoint: https://your-domain-name/
+    stanfordhealthcareApiKey: your-private-key
+    ```
+    """
+    API_VERSION = "2024-08-01-preview"
+    def __init__(
+        self,
+        tokenizer: Tokenizer,
+        tokenizer_name: str,
+        cache_config: CacheConfig,
+        api_key: Optional[str] = None,
+        endpoint: Optional[str] = None,
+        default_headers: Optional[Dict[str, str]] = None,
+    ):
+        if not api_key:
+            raise NonRetriableException("Must provide API key through credentials.conf")
+        super().__init__(
+            tokenizer=tokenizer,
+            tokenizer_name=tokenizer_name,
+            cache_config=cache_config,
+            api_key=api_key,
+            endpoint=endpoint,
+            api_version=StanfordHealthCareSHCOpenAIClient.API_VERSION,
+            default_headers=default_headers,
+        )

helm/clients/test_client.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
 from helm.tokenizers.auto_tokenizer import AutoTokenizer
-from .client import truncate_sequence, truncate_and_tokenize_response_text
+from helm.clients.client import truncate_sequence, truncate_and_tokenize_response_text
 from typing import List
 from helm.common.request import Request, GeneratedOutput, Token

helm/clients/test_together_client.py CHANGED Viewed

@@ -5,7 +5,12 @@ import tempfile
 from helm.common.cache import BlackHoleCacheConfig, SqliteCacheConfig
 from helm.common.request import Request
-from .together_client import TogetherClient, TogetherChatClient, TogetherCompletionClient, TogetherClientError
+from helm.clients.together_client import (
+    TogetherClient,
+    TogetherChatClient,
+    TogetherCompletionClient,
+    TogetherClientError,
+)
 class TestTogetherClient:

helm/clients/together_client.py CHANGED Viewed

@@ -1,15 +1,18 @@
 from copy import deepcopy
 from itertools import zip_longest
+import re
 import threading
-from typing import List, Dict, Any, Mapping, Optional, TypedDict, Union
+from typing import Callable, List, Dict, Any, Mapping, Optional, Tuple, TypedDict, Union
+from typing_extensions import NotRequired
 import requests
 from retrying import retry
 from helm.common.cache import CacheConfig
 from helm.common.media_object import IMAGE_TYPE, TEXT_TYPE
+from helm.common.object_spec import get_class_by_name
 from helm.common.optional_dependencies import handle_module_not_found_error
-from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token
+from helm.common.request import Thinking, wrap_request_time, Request, RequestResult, GeneratedOutput, Token
 from helm.clients.client import CachingClient, truncate_sequence, cleanup_str
 try:
@@ -98,6 +101,19 @@ class JobNotFinishedError(TogetherClientError):
     pass
+def _parse_thinking(input: str) -> Tuple[str, str]:
+    """Return a tuple of thinking text and output text."""
+    match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), match.group(2))
+    match = re.match(r"<think>\n?(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), "")
+    return (input, "")
 class TogetherClient(CachingClient):
     """
     Client for the models where we evaluate offline. Since the queries are handled offline, the `TogetherClient` just
@@ -313,15 +329,31 @@ class TogetherRawChatRequest(TypedDict):
     logprobs: int
     echo: bool
     n: int
+    response_format: NotRequired[Dict[str, Any]]
 class TogetherChatClient(CachingClient):
     """Client that uses the Python Together library for chat models."""
-    def __init__(self, cache_config: CacheConfig, api_key: Optional[str], together_model: Optional[str] = None):
+    def __init__(
+        self,
+        cache_config: CacheConfig,
+        api_key: Optional[str],
+        together_model: Optional[str] = None,
+        disable_logprobs: Optional[bool] = None,
+        output_processor: Optional[str] = None,
+        parse_thinking: Optional[bool] = None,
+    ):
         super().__init__(cache_config=cache_config)
         self._client = Together(api_key=api_key)
         self._together_model = together_model
+        self._disable_logprobs = bool(disable_logprobs)
+        # self.output_processor is actually a function, not a class
+        self._parse_thinking = bool(parse_thinking)
+        self.output_processor: Optional[Callable[[str], str]] = (
+            get_class_by_name(output_processor) if output_processor else None
+        )
     def convert_to_raw_chat_request(self, request: Request) -> TogetherRawChatRequest:
         request.validate()
@@ -353,7 +385,11 @@ class TogetherChatClient(CachingClient):
             model = self._together_model
         else:
             model = request.model
-        return {
+        if self._disable_logprobs:
+            logprobs = 0
+        else:
+            logprobs = min(request.top_k_per_token, 1)
+        raw_chat_request: TogetherRawChatRequest = {
             "messages": messages,
             "model": model,
             "max_tokens": request.max_tokens,
@@ -361,10 +397,16 @@ class TogetherChatClient(CachingClient):
             "temperature": request.temperature,
             "top_p": request.top_p,
             "top_k": request.top_k_per_token,
-            "logprobs": min(request.top_k_per_token, 1),
+            "logprobs": logprobs,
             "echo": request.echo_prompt,
             "n": request.num_completions,
         }
+        if request.response_format and request.response_format.json_schema:
+            raw_chat_request["response_format"] = {
+                "type": "json_object",
+                "schema": request.response_format.json_schema,
+            }
+        return raw_chat_request
     def make_request(self, request: Request) -> RequestResult:
         raw_request = self.convert_to_raw_chat_request(request)
@@ -398,8 +440,21 @@ class TogetherChatClient(CachingClient):
                     if token_text is None:
                         break
                     tokens.append(Token(text=token_text, logprob=token_logprob or 0.0))
+            logprob = sum([token.logprob for token in tokens]) if tokens else 0.0
             assert choice.message.role == "assistant"
-            generated_outputs.append(GeneratedOutput(text=choice.message.content, logprob=0.0, tokens=tokens))
+            output_text = choice.message.content
+            if self.output_processor:
+                output_text = self.output_processor(output_text)
+            if self._parse_thinking:
+                thinking_text, output_text = _parse_thinking(output_text)
+                generated_outputs.append(
+                    GeneratedOutput(
+                        text=output_text, logprob=logprob, tokens=tokens, thinking=Thinking(text=thinking_text)
+                    )
+                )
+            else:
+                generated_outputs.append(GeneratedOutput(text=output_text, logprob=logprob, tokens=tokens))
         return RequestResult(
             success=True,
             cached=cached,
@@ -426,16 +481,27 @@ class TogetherRawCompletionRequest(TypedDict):
 class TogetherCompletionClient(CachingClient):
     """Client that uses the Python Together library for text completion models."""
-    def __init__(self, cache_config: CacheConfig, api_key: Optional[str], together_model: Optional[str] = None):
+    def __init__(
+        self,
+        cache_config: CacheConfig,
+        api_key: Optional[str],
+        together_model: Optional[str] = None,
+        disable_logprobs: Optional[bool] = None,
+    ):
         super().__init__(cache_config=cache_config)
         self._client = Together(api_key=api_key)
         self._together_model = together_model
+        self._disable_logprobs = bool(disable_logprobs)
     def convert_to_raw_completion_request(self, request: Request) -> TogetherRawCompletionRequest:
         if self._together_model is not None:
             model = self._together_model
         else:
             model = request.model
+        if self._disable_logprobs:
+            logprobs = 0
+        else:
+            logprobs = min(request.top_k_per_token, 1)
         return {
             "prompt": request.prompt,
             "model": model,
@@ -444,7 +510,7 @@ class TogetherCompletionClient(CachingClient):
             "temperature": request.temperature,
             "top_p": request.top_p,
             "top_k": request.top_k_per_token,
-            "logprobs": min(request.top_k_per_token, 1),
+            "logprobs": logprobs,
             "echo": request.echo_prompt,
             "n": request.num_completions,
         }
@@ -481,8 +547,9 @@ class TogetherCompletionClient(CachingClient):
                     if token_text is None:
                         break
                     tokens.append(Token(text=token_text, logprob=token_logprob or 0.0))
+            logprob = sum([token.logprob for token in tokens]) if tokens else 0.0
             assert choice.text
-            generated_outputs.append(GeneratedOutput(text=choice.text, logprob=0.0, tokens=tokens))
+            generated_outputs.append(GeneratedOutput(text=choice.text, logprob=logprob, tokens=tokens))
         return RequestResult(
             success=True,
             cached=cached,

helm/clients/upstage_client.py ADDED Viewed

@@ -0,0 +1,23 @@
+from helm.clients.openai_client import OpenAIClient
+from helm.common.cache import CacheConfig
+from helm.tokenizers.tokenizer import Tokenizer
+class UpstageChatClient(OpenAIClient):
+    """Sends request to a Upstage model using a OpenAI-compatible Chat API."""
+    def __init__(
+        self,
+        tokenizer: Tokenizer,
+        tokenizer_name: str,
+        cache_config: CacheConfig,
+        api_key: str,
+    ):
+        super().__init__(
+            tokenizer=tokenizer,
+            tokenizer_name=tokenizer_name,
+            cache_config=cache_config,
+            api_key=api_key,
+            org_id=None,
+            base_url="https://api.upstage.ai/v1/solar",
+        )

crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

Potentially problematic release.

crfm-helm 0.5.4py3-none-any.whl → 0.5.6py3-none-any.whl