crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm has been flagged as potentially problematic; see the registry's advisory page for more details.

Files changed (652)
  1. crfm_helm-0.5.6.dist-info/METADATA +427 -0
  2. crfm_helm-0.5.6.dist-info/RECORD +941 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  22. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  23. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  24. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  25. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  26. helm/benchmark/adaptation/prompt.py +1 -1
  27. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  28. helm/benchmark/annotation/air_bench_annotator.py +21 -6
  29. helm/benchmark/annotation/annotator.py +5 -0
  30. helm/benchmark/annotation/annotator_factory.py +3 -20
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
  36. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  37. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  38. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  39. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  40. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  41. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  42. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  43. helm/benchmark/annotation/medalign_annotator.py +100 -0
  44. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  45. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  46. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  47. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  48. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  49. helm/benchmark/annotation/model_as_judge.py +214 -6
  50. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  51. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  52. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  54. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  55. helm/benchmark/annotation/spider_annotator.py +18 -0
  56. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  57. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  58. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  59. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  60. helm/benchmark/annotation_executor.py +35 -15
  61. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  62. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  63. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  64. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  65. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  66. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  67. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  68. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  69. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  70. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  71. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  72. helm/benchmark/augmentations/perturbation.py +1 -1
  73. helm/benchmark/augmentations/space_perturbation.py +2 -2
  74. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  75. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  76. helm/benchmark/augmentations/test_perturbation.py +16 -13
  77. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  78. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  79. helm/benchmark/data_preprocessor.py +2 -2
  80. helm/benchmark/executor.py +11 -12
  81. helm/benchmark/huggingface_registration.py +2 -7
  82. helm/benchmark/metrics/aci_bench_metrics.py +14 -0
  83. helm/benchmark/metrics/basic_metrics.py +6 -6
  84. helm/benchmark/metrics/bbq_metrics.py +2 -2
  85. helm/benchmark/metrics/bias_metrics.py +12 -3
  86. helm/benchmark/metrics/bias_word_lists.py +1 -1
  87. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  88. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  89. helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
  90. helm/benchmark/metrics/classification_metrics.py +76 -12
  91. helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
  92. helm/benchmark/metrics/code_metrics.py +5 -5
  93. helm/benchmark/metrics/comet_metric.py +125 -0
  94. helm/benchmark/metrics/common_metric_specs.py +9 -2
  95. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  96. helm/benchmark/metrics/copyright_metrics.py +4 -4
  97. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  98. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  99. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  101. helm/benchmark/metrics/dischargeme_metrics.py +14 -0
  102. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  103. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  104. helm/benchmark/metrics/efficiency_metrics.py +6 -6
  105. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  106. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  107. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  108. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  109. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  110. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  113. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  115. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  116. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  117. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  118. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  120. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  122. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  123. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  124. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  125. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  126. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  127. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  128. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  129. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  130. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  131. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  132. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  133. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  134. helm/benchmark/metrics/med_dialog_metrics.py +14 -0
  135. helm/benchmark/metrics/medalign_metrics.py +14 -0
  136. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  137. helm/benchmark/metrics/medec_metrics.py +101 -0
  138. helm/benchmark/metrics/medi_qa_metrics.py +14 -0
  139. helm/benchmark/metrics/medication_qa_metrics.py +10 -19
  140. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  141. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  142. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  143. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  144. helm/benchmark/metrics/mental_health_metrics.py +14 -0
  145. helm/benchmark/metrics/metric.py +3 -3
  146. helm/benchmark/metrics/metric_service.py +11 -11
  147. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  148. helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
  149. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  150. helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
  151. helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
  152. helm/benchmark/metrics/nltk_helper.py +32 -0
  153. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  154. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  155. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  156. helm/benchmark/metrics/output_processing_metric.py +60 -0
  157. helm/benchmark/metrics/output_processors.py +15 -0
  158. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  159. helm/benchmark/metrics/ranking_metrics.py +3 -3
  160. helm/benchmark/metrics/reference_metric.py +3 -3
  161. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  162. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  163. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  164. helm/benchmark/metrics/spider_metrics.py +7 -0
  165. helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
  166. helm/benchmark/metrics/statistic.py +1 -1
  167. helm/benchmark/metrics/summac/model_summac.py +2 -3
  168. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  169. helm/benchmark/metrics/summarization_metrics.py +20 -9
  170. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  171. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  172. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  173. helm/benchmark/metrics/test_metric.py +1 -1
  174. helm/benchmark/metrics/test_statistic.py +2 -2
  175. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  176. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  177. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  178. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  179. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  180. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  181. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  182. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
  183. helm/benchmark/metrics/toxicity_metrics.py +6 -6
  184. helm/benchmark/metrics/unitxt_metrics.py +7 -5
  185. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  186. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  187. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  188. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  189. helm/benchmark/model_deployment_registry.py +6 -8
  190. helm/benchmark/model_metadata_registry.py +16 -0
  191. helm/benchmark/presentation/contamination.py +3 -3
  192. helm/benchmark/presentation/create_plots.py +33 -12
  193. helm/benchmark/presentation/run_display.py +13 -0
  194. helm/benchmark/presentation/schema.py +2 -1
  195. helm/benchmark/presentation/summarize.py +97 -67
  196. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  197. helm/benchmark/reeval_run.py +202 -0
  198. helm/benchmark/reeval_runner.py +355 -0
  199. helm/benchmark/run.py +86 -90
  200. helm/benchmark/run_expander.py +90 -9
  201. helm/benchmark/run_spec_factory.py +13 -0
  202. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  203. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  204. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  205. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  206. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  207. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  208. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  209. helm/benchmark/run_specs/experimental_run_specs.py +142 -3
  210. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  211. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  212. helm/benchmark/run_specs/long_context_run_specs.py +141 -0
  213. helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
  214. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  215. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  216. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  217. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  218. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  219. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  220. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  221. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  222. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  223. helm/benchmark/run_specs/vlm_run_specs.py +103 -2
  224. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  225. helm/benchmark/runner.py +5 -5
  226. helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
  227. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  228. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  229. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  230. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  231. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  232. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  233. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  234. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  235. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  236. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  237. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  238. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  239. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  240. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  241. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  242. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  243. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  244. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  245. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  246. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  247. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  248. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  249. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  250. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  251. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  252. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  253. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  254. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  255. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  256. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  257. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  258. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  259. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  260. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  261. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  262. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  263. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  264. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  265. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  266. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  267. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  268. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  269. helm/benchmark/scenarios/bold_scenario.py +1 -1
  270. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  271. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  272. helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
  273. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  274. helm/benchmark/scenarios/clear_scenario.py +157 -0
  275. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  276. helm/benchmark/scenarios/code_scenario.py +17 -4
  277. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  278. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  279. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  280. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  281. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  282. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  283. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  284. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  285. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  286. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  287. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  288. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  289. helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
  290. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  291. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  292. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  293. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  294. helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
  295. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  296. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  297. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  298. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  299. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  300. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  301. helm/benchmark/scenarios/grammar.py +2 -2
  302. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  303. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  304. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  305. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  306. helm/benchmark/scenarios/headqa_scenario.py +136 -0
  307. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  308. helm/benchmark/scenarios/ice_scenario.py +8 -4
  309. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  310. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  311. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  312. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  313. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
  314. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  315. helm/benchmark/scenarios/koala_scenario.py +1 -1
  316. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  317. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  318. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  319. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  320. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  321. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  322. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  323. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  324. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  325. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  326. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/math_scenario.py +9 -1
  328. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  329. helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
  330. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  331. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  332. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  333. helm/benchmark/scenarios/medalign_scenario.py +94 -0
  334. helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
  335. helm/benchmark/scenarios/medbullets_scenario.py +145 -0
  336. helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
  337. helm/benchmark/scenarios/medec_scenario.py +125 -0
  338. helm/benchmark/scenarios/medhallu_scenario.py +72 -0
  339. helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
  340. helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
  341. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  342. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  343. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  344. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  345. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  346. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  347. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  348. helm/benchmark/scenarios/mental_health_scenario.py +123 -0
  349. helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
  350. helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
  351. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
  352. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  353. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  354. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  355. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  356. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
  357. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
  358. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
  359. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  360. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  361. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  362. helm/benchmark/scenarios/numeracy_scenario.py +12 -2
  363. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  364. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  365. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  366. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  367. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  368. helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
  369. helm/benchmark/scenarios/quac_scenario.py +10 -1
  370. helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
  371. helm/benchmark/scenarios/raft_scenario.py +17 -2
  372. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  373. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  374. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  375. helm/benchmark/scenarios/scenario.py +9 -1
  376. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  377. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  378. helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
  379. helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
  380. helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
  381. helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
  382. helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
  383. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  384. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  385. helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
  386. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  387. helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
  388. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  389. helm/benchmark/scenarios/spider_scenario.py +91 -0
  390. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
  391. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  392. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  393. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  394. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  395. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  396. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  397. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  398. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  399. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  400. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  401. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  402. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  403. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  404. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  405. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  406. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  407. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  408. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  409. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  410. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  411. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  412. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  413. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  414. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  415. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  416. helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
  417. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  418. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  419. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  420. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  421. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  422. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  423. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  424. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  425. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  426. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  427. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  428. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  429. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  430. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  431. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  432. helm/benchmark/server.py +13 -1
  433. helm/benchmark/slurm_runner.py +1 -1
  434. helm/benchmark/static/schema_audio.yaml +763 -0
  435. helm/benchmark/static/schema_autobencher.yaml +150 -0
  436. helm/benchmark/static/schema_call_center.yaml +97 -60
  437. helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
  438. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  439. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  440. helm/benchmark/static/schema_enterprise.yaml +319 -0
  441. helm/benchmark/static/schema_finance.yaml +14 -12
  442. helm/benchmark/static/schema_heim.yaml +1389 -0
  443. helm/benchmark/static/schema_long_context.yaml +283 -0
  444. helm/benchmark/static/schema_medhelm.yaml +1140 -0
  445. helm/benchmark/static/schema_melt.yaml +1257 -0
  446. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  447. helm/benchmark/static/schema_safety.yaml +18 -1
  448. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  449. helm/benchmark/static/schema_slphelm.yaml +162 -0
  450. helm/benchmark/static/schema_social_audio.yaml +224 -0
  451. helm/benchmark/static/schema_sql.yaml +171 -0
  452. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  453. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  454. helm/benchmark/static/schema_vhelm.yaml +129 -56
  455. helm/benchmark/static/schema_video.yaml +219 -0
  456. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  457. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  458. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  459. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  460. helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
  461. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  462. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  463. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
  464. helm/benchmark/static_build/config.js +1 -1
  465. helm/benchmark/static_build/index.html +6 -6
  466. helm/benchmark/window_services/default_window_service.py +1 -1
  467. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
  468. helm/benchmark/window_services/ice_window_service.py +1 -1
  469. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  470. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  471. helm/benchmark/window_services/local_window_service.py +2 -2
  472. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  473. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  474. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  475. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  476. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  477. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  478. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  479. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  480. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  481. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  482. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  483. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  484. helm/benchmark/window_services/test_utils.py +4 -5
  485. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  486. helm/benchmark/window_services/tokenizer_service.py +7 -8
  487. helm/benchmark/window_services/yalm_window_service.py +1 -1
  488. helm/clients/ai21_client.py +3 -3
  489. helm/clients/aleph_alpha_client.py +1 -1
  490. helm/clients/anthropic_client.py +69 -29
  491. helm/clients/audio_language/__init__.py +0 -0
  492. helm/clients/audio_language/diva_llama_client.py +120 -0
  493. helm/clients/audio_language/llama_omni_client.py +198 -0
  494. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  495. helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
  496. helm/clients/audio_language/qwen_audiolm_client.py +152 -0
  497. helm/clients/audio_language/test.py +62 -0
  498. helm/clients/auto_client.py +4 -2
  499. helm/clients/azure_openai_client.py +55 -0
  500. helm/clients/bedrock_client.py +203 -7
  501. helm/clients/bedrock_utils.py +33 -0
  502. helm/clients/client.py +7 -7
  503. helm/clients/clip_scorers/clip_scorer.py +1 -1
  504. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  505. helm/clients/cohere_client.py +3 -3
  506. helm/clients/google_client.py +1 -1
  507. helm/clients/grok_client.py +36 -0
  508. helm/clients/http_model_client.py +1 -1
  509. helm/clients/huggingface_client.py +52 -21
  510. helm/clients/huggingface_pipeline_client.py +138 -0
  511. helm/clients/ibm_client.py +267 -0
  512. helm/clients/image_generation/adobe_vision_client.py +1 -1
  513. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  514. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  515. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  516. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  517. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  518. helm/clients/image_generation/cogview2_client.py +1 -1
  519. helm/clients/image_generation/dalle2_client.py +1 -1
  520. helm/clients/image_generation/dalle3_client.py +2 -2
  521. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  522. helm/clients/image_generation/dalle_mini/data.py +1 -1
  523. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  524. helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
  525. helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
  526. helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
  527. helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
  528. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  529. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  530. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  531. helm/clients/image_generation/dalle_mini_client.py +1 -1
  532. helm/clients/image_generation/deep_floyd_client.py +1 -1
  533. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  534. helm/clients/image_generation/lexica_client.py +1 -1
  535. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  536. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  537. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  538. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  539. helm/clients/image_generation/mindalle_client.py +1 -1
  540. helm/clients/image_generation/together_image_generation_client.py +1 -1
  541. helm/clients/lit_gpt_client.py +2 -2
  542. helm/clients/mistral_client.py +62 -18
  543. helm/clients/nvidia_nim_client.py +0 -3
  544. helm/clients/openai_client.py +308 -43
  545. helm/clients/openai_responses_client.py +174 -0
  546. helm/clients/palmyra_client.py +3 -9
  547. helm/clients/reka_client.py +3 -3
  548. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  549. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  550. helm/clients/stanfordhealthcare_google_client.py +43 -0
  551. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  552. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  553. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  554. helm/clients/test_client.py +1 -1
  555. helm/clients/test_together_client.py +6 -1
  556. helm/clients/together_client.py +76 -9
  557. helm/clients/upstage_client.py +23 -0
  558. helm/clients/vertexai_client.py +45 -13
  559. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  560. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  561. helm/clients/vision_language/idefics_client.py +6 -2
  562. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  563. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  564. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  565. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  566. helm/clients/vision_language/paligemma_client.py +2 -2
  567. helm/clients/vision_language/qwen2_vlm_client.py +188 -0
  568. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  569. helm/clients/vllm_client.py +4 -6
  570. helm/clients/writer_client.py +102 -0
  571. helm/clients/yi_client.py +0 -3
  572. helm/common/audio_utils.py +111 -0
  573. helm/common/context.py +80 -0
  574. helm/common/credentials_utils.py +5 -5
  575. helm/common/file_caches/local_file_cache.py +1 -1
  576. helm/common/file_caches/test_local_file_cache.py +1 -1
  577. helm/common/general.py +9 -2
  578. helm/common/hierarchical_logger.py +46 -3
  579. helm/common/images_utils.py +2 -2
  580. helm/common/local_context.py +140 -0
  581. helm/common/media_object.py +2 -2
  582. helm/common/multimodal_request_utils.py +26 -0
  583. helm/common/reeval_parameters.py +12 -0
  584. helm/common/remote_context.py +61 -0
  585. helm/common/request.py +14 -2
  586. helm/common/response_format.py +18 -0
  587. helm/common/test_media_object.py +1 -1
  588. helm/config/model_deployments.yaml +1792 -28
  589. helm/config/model_metadata.yaml +1606 -51
  590. helm/config/tokenizer_configs.yaml +521 -4
  591. helm/proxy/cli.py +5 -3
  592. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  593. helm/proxy/example_queries.py +1 -1
  594. helm/proxy/server.py +11 -4
  595. helm/proxy/services/remote_service.py +1 -1
  596. helm/proxy/services/server_service.py +22 -86
  597. helm/proxy/services/test_remote_service.py +2 -2
  598. helm/proxy/services/test_service.py +1 -1
  599. helm/proxy/static/general.js +122 -0
  600. helm/proxy/static/help.html +99 -0
  601. helm/proxy/static/index.css +57 -0
  602. helm/proxy/static/index.html +40 -0
  603. helm/proxy/static/index.js +456 -0
  604. helm/proxy/static/info-icon.png +0 -0
  605. helm/proxy/test_retry.py +1 -1
  606. helm/proxy/token_counters/auto_token_counter.py +1 -1
  607. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  608. helm/tokenizers/caching_tokenizer.py +2 -30
  609. helm/tokenizers/grok_tokenizer.py +53 -0
  610. helm/tokenizers/http_model_tokenizer.py +1 -1
  611. helm/tokenizers/huggingface_tokenizer.py +3 -3
  612. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  613. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  614. helm/tokenizers/test_grok_tokenizer.py +33 -0
  615. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  616. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  617. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  618. helm/tokenizers/tokenizer.py +3 -1
  619. helm/tokenizers/yalm_tokenizer.py +3 -3
  620. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  621. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  622. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  623. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  624. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  625. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  626. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  627. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  628. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  629. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  630. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  631. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  632. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  633. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  634. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  635. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  636. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  637. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  638. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  639. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  640. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  641. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  642. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  643. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  644. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  645. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  646. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  647. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  648. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  649. helm/tokenizers/anthropic_tokenizer.py +0 -52
  650. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  651. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
  652. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,34 @@
1
+ from typing import List
2
+
3
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
4
+ from helm.benchmark.adaptation.request_state import RequestState
5
+ from helm.benchmark.metrics.metric import Metric
6
+ from helm.benchmark.metrics.metric_name import MetricName
7
+ from helm.benchmark.metrics.metric_service import MetricService
8
+ from helm.benchmark.metrics.statistic import Stat
9
+ from helm.benchmark.scenarios.scenario import CORRECT_TAG
10
+
11
+
12
+ class RulerQAMetric(Metric):
13
+ """Accuracy metric for Ruler QA Scenarios.
14
+
15
+ Adapted from: https://github.com/NVIDIA/RULER/blob/1c45e5c60273e0ae9e3099137bf0eec6f0395f84/scripts/eval/synthetic/constants.py#L25
16
+ """ # noqa: E501
17
+
18
+ def evaluate_generation(
19
+ self,
20
+ adapter_spec: AdapterSpec,
21
+ request_state: RequestState,
22
+ metric_service: MetricService,
23
+ eval_cache_path: str,
24
+ ) -> List[Stat]:
25
+ assert request_state.result
26
+ assert len(request_state.result.completions) == 1
27
+ response_text = request_state.result.completions[0].text
28
+ assert all(
29
+ len(reference.tags) == 1 and reference.tags[0] == CORRECT_TAG
30
+ for reference in request_state.instance.references
31
+ )
32
+ reference_texts = [reference.output.text for reference in request_state.instance.references]
33
+ score = max([1.0 if r.lower() in response_text.lower() else 0.0 for r in reference_texts])
34
+ return [Stat(MetricName("ruler_string_match_part")).add(score)]
@@ -14,7 +14,7 @@ from helm.benchmark.metrics.metric_service import MetricService
14
14
  from helm.benchmark.metrics.statistic import Stat
15
15
 
16
16
 
17
- class BhasaMachineTranslationMetric(Metric):
17
+ class SEAHELMMachineTranslationMetric(Metric):
18
18
  """Machine Translation Metrics
19
19
 
20
20
  This class computes the following standard machine translation metrics
@@ -74,8 +74,8 @@ class BhasaMachineTranslationMetric(Metric):
74
74
  return result
75
75
 
76
76
 
77
- class BhasaQAMetric(Metric):
78
- """Bhasa QA Metrics
77
+ class SEAHELMQAMetric(Metric):
78
+ """SEAHELM QA Metrics
79
79
 
80
80
  This class computes the following standard SQuAD v1.1 metrics
81
81
 
@@ -0,0 +1,10 @@
1
+ from typing import Any, Dict, List
2
+ from helm.benchmark.metrics.metric import MetricSpec
3
+
4
+
5
+ def get_seahelm_machine_translation_metric_specs() -> List[MetricSpec]:
6
+ return [MetricSpec(class_name="helm.benchmark.metrics.seahelm_metrics.SEAHELMMachineTranslationMetric")]
7
+
8
+
9
+ def get_seahelm_qa_metric_specs(args: Dict[str, Any]) -> List[MetricSpec]:
10
+ return [MetricSpec(class_name="helm.benchmark.metrics.seahelm_metrics.SEAHELMQAMetric", args=args)]
@@ -0,0 +1,7 @@
1
+ from helm.benchmark.metrics.bird_sql_metrics import BirdSQLMetric
2
+
3
+
4
+ class SpiderMetric(BirdSQLMetric):
5
+ """Score metrics for Spider. Based on Bird-SQL."""
6
+
7
+ ANNOTATOR_NAME = "spider"
@@ -0,0 +1,14 @@
1
+ from helm.benchmark.annotation.starr_patient_instructions_annotator import ANNOTATOR_MODELS
2
+ from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
3
+
4
+
5
+ class StarrPatientInstructionsMetric(LLMJuryMetric):
6
+ """Score metrics for StarrPatientInstructions."""
7
+
8
+ def __init__(self):
9
+ super().__init__(
10
+ metric_name="starr_patient_instructions_accuracy",
11
+ scenario_name="starr_patient_instructions",
12
+ annotator_models=ANNOTATOR_MODELS,
13
+ default_score=1.0,
14
+ )
@@ -2,7 +2,7 @@ from dataclasses import dataclass, replace
2
2
  import math
3
3
  from typing import Dict, Optional
4
4
 
5
- from .metric_name import MetricName
5
+ from helm.benchmark.metrics.metric_name import MetricName
6
6
 
7
7
 
8
8
  @dataclass
@@ -10,7 +10,7 @@ import numpy.typing as npt
10
10
  import torch
11
11
  import os
12
12
  import json
13
- from . import utils_misc
13
+ from helm.benchmark.metrics.summac import utils_misc
14
14
 
15
15
  model_map = {
16
16
  "snli-base": {"model_card": "boychaboy/SNLI_roberta-base", "entailment_idx": 0, "contradiction_idx": 2},
@@ -169,10 +169,9 @@ class SummaCImager:
169
169
  batch_tokens = self.tokenizer.batch_encode_plus(
170
170
  list(zip(batch_prems, batch_hypos)),
171
171
  padding=True,
172
- truncation=True,
172
+ truncation="only_first",
173
173
  max_length=self.max_input_length,
174
174
  return_tensors="pt",
175
- truncation_strategy="only_first",
176
175
  )
177
176
  batch_tokens = {k: v.to(self.device) for k, v in batch_tokens.items()}
178
177
  with torch.no_grad():
@@ -2,10 +2,10 @@ from typing import Dict, List
2
2
 
3
3
  from helm.benchmark.adaptation.request_state import RequestState
4
4
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
5
- from .metric import Metric
6
- from .metric_name import MetricName
7
- from .metric_service import MetricService
8
- from .statistic import Stat
5
+ from helm.benchmark.metrics.metric import Metric
6
+ from helm.benchmark.metrics.metric_name import MetricName
7
+ from helm.benchmark.metrics.metric_service import MetricService
8
+ from helm.benchmark.metrics.statistic import Stat
9
9
  from helm.common.critique_request import CritiqueTaskTemplate, CritiqueQuestionTemplate, CritiqueRequest, QuestionType
10
10
 
11
11
 
@@ -16,12 +16,16 @@ from helm.benchmark.metrics.evaluate_reference_metrics import get_rouge_function
16
16
  from helm.common.hierarchical_logger import hlog
17
17
  from helm.common.general import ensure_file_downloaded
18
18
  from helm.common.optional_dependencies import handle_module_not_found_error
19
- from .metric import Metric, MetricResult
20
- from .metric_name import MetricName
21
- from .metric_service import MetricService
22
- from .statistic import Stat
23
- from .summac.model_summac import SummaCZS
24
- from bert_score import BERTScorer
19
+ from helm.benchmark.metrics.metric import Metric, MetricResult
20
+ from helm.benchmark.metrics.metric_name import MetricName
21
+ from helm.benchmark.metrics.metric_service import MetricService
22
+ from helm.benchmark.metrics.statistic import Stat
23
+ from helm.benchmark.metrics.summac.model_summac import SummaCZS
24
+
25
+ try:
26
+ from bert_score import BERTScorer # type: ignore
27
+ except ModuleNotFoundError as e:
28
+ handle_module_not_found_error(e, ["summarization"])
25
29
 
26
30
 
27
31
  QAFACTEVAL_URL: str = (
@@ -43,7 +47,14 @@ class SummarizationMetric(Metric):
43
47
  4. Faithfulness (SummaC)
44
48
  """
45
49
 
46
- def __init__(self, task: str, device: str = "cpu"):
50
+ def __init__(
51
+ self,
52
+ task: str,
53
+ language: str = "en",
54
+ device: str = "cpu",
55
+ bertscore_model: str = "microsoft/deberta-large-mnli",
56
+ rescale_with_baseline: bool = True,
57
+ ):
47
58
  self.rouge_fns = {
48
59
  "rouge_1": get_rouge_function("rouge1"),
49
60
  "rouge_2": get_rouge_function("rouge2"),
@@ -56,7 +67,7 @@ class SummarizationMetric(Metric):
56
67
  spacy.cli.download("en_core_web_sm")
57
68
 
58
69
  try:
59
- from summ_eval.data_stats_metric import DataStatsMetric
70
+ from summ_eval.data_stats_metric import DataStatsMetric # type: ignore
60
71
  except ModuleNotFoundError as e:
61
72
  handle_module_not_found_error(e, ["summarization"])
62
73
 
@@ -71,7 +82,7 @@ class SummarizationMetric(Metric):
71
82
  else:
72
83
  self.compute_bertscore = True
73
84
  self.bert_scorer = BERTScorer(
74
- model_type="microsoft/deberta-large-mnli", lang="en", rescale_with_baseline=True, device=device
85
+ model_type=bertscore_model, lang=language, rescale_with_baseline=rescale_with_baseline, device=device
75
86
  )
76
87
  # Need GPU for faithfulness metrics since they are model-based.
77
88
  self.compute_faithfulness = True
@@ -1,7 +1,11 @@
1
1
  from dataclasses import dataclass
2
2
  from typing import Callable, List, Optional
3
3
 
4
- from .bias_metrics import BiasMetric
4
+ from helm.benchmark.metrics.bias_metrics import BiasMetric
5
+ from helm.benchmark.metrics.nltk_helper import install_nltk_resources
6
+
7
+
8
+ install_nltk_resources()
5
9
 
6
10
 
7
11
  @dataclass(frozen=True)
@@ -38,12 +38,14 @@ def _request_state(prediction: str, options: List[_Option]):
38
38
  )
39
39
 
40
40
 
41
- def assert_stats_equal(actual_stats: List[Stat], expected_values: Dict[str, float]):
42
- actual_values = {stat.name.name: stat.mean for stat in actual_stats}
43
- assert actual_values == approx(expected_values)
41
+ def get_stat_value(stats: List[Stat], stat_name: str):
42
+ for stat in stats:
43
+ if stat.name.name == stat_name:
44
+ return stat.mean
45
+ raise ValueError(f"No stat with name {stat_name}")
44
46
 
45
47
 
46
- def _expected_stats(all_classes_counts: Dict[str, Dict[str, int]]):
48
+ def compute_stats(all_classes_counts: Dict[str, Dict[str, int]]):
47
49
  micro_counts: Dict[str, int] = defaultdict(int)
48
50
  for class_counts in all_classes_counts.values():
49
51
  for key, class_count in class_counts.items():
@@ -52,102 +54,172 @@ def _expected_stats(all_classes_counts: Dict[str, Dict[str, int]]):
52
54
  micro_recall = micro_counts["tp"] / (micro_counts["tp"] + micro_counts["fn"])
53
55
  micro_f1 = 2 * (micro_precision * micro_recall) / (micro_precision + micro_recall)
54
56
 
55
- class_f1: List[float] = []
56
- for class_counts in all_classes_counts.values():
57
+ class_f1: Dict[str, float] = {}
58
+ for class_name, class_counts in all_classes_counts.items():
57
59
  class_precision = class_counts["tp"] / (class_counts["tp"] + class_counts["fp"])
58
60
  class_recall = class_counts["tp"] / (class_counts["tp"] + class_counts["fn"])
59
- class_f1.append(2 * (class_precision * class_recall) / (class_precision + class_recall))
60
- macro_f1 = sum(class_f1) / len(class_f1)
61
-
62
- return {
63
- "classification_micro_f1": micro_f1,
64
- "classification_macro_f1": macro_f1,
61
+ class_f1[class_name] = 2 * (class_precision * class_recall) / (class_precision + class_recall)
62
+ macro_f1 = sum(class_f1.values()) / len(class_f1)
63
+ class_name_to_support = {
64
+ class_name: class_counts["tp"] + class_counts["fn"] for class_name, class_counts in all_classes_counts.items()
65
65
  }
66
+ weighted_f1 = sum(class_f1[class_name] * support for class_name, support in class_name_to_support.items()) / sum(
67
+ support for support in class_name_to_support.values()
68
+ )
66
69
 
70
+ stats = {
71
+ "macro_f1": macro_f1,
72
+ "micro_f1": micro_f1,
73
+ "weighted_f1": weighted_f1,
74
+ }
75
+ for class_name, class_f1_score in class_f1.items():
76
+ stats[f"{class_name}_f1"] = class_f1_score
77
+ return stats
67
78
 
68
- def test_evaluate_instances_binary_generation():
69
- metric = ClassificationMetric(delimiter=None)
70
79
 
80
+ def test_evaluate_instances_default_parameters():
71
81
  request_states = [
72
82
  _request_state("yes", [_Option("yes", True)]),
83
+ _request_state("yes ", [_Option("yes", True)]),
84
+ _request_state("yeS", [_Option("yes", True)]),
85
+ _request_state("yes", [_Option("no", True)]),
86
+ _request_state("no", [_Option("yes", True)]),
87
+ _request_state("no", [_Option("no", True)]),
88
+ _request_state("invalid", [_Option("no", True)]),
89
+ ]
90
+
91
+ expected_stats = compute_stats(
92
+ {
93
+ "yes": {"tp": 3, "fp": 1, "tn": 2, "fn": 1},
94
+ "no": {"tp": 1, "fp": 1, "tn": 3, "fn": 2},
95
+ }
96
+ )
97
+
98
+ actual_stats = ClassificationMetric().evaluate_instances(request_states, "")
99
+ actual_macro_f1 = get_stat_value(actual_stats, "classification_macro_f1")
100
+ assert actual_macro_f1 == approx(expected_stats["macro_f1"])
101
+ actual_micro_f1 = get_stat_value(actual_stats, "classification_micro_f1")
102
+ assert actual_micro_f1 == approx(expected_stats["micro_f1"])
103
+
104
+
105
+ def test_evaluate_instances_yes_and_no():
106
+ labels = ["yes", "no"]
107
+ request_states = [
73
108
  _request_state("yes", [_Option("yes", True)]),
74
- _request_state("yes", [_Option("yes", True)]),
109
+ _request_state("yes ", [_Option("yes", True)]),
110
+ _request_state("yeS", [_Option("yes", True)]),
75
111
  _request_state("yes", [_Option("no", True)]),
76
112
  _request_state("no", [_Option("yes", True)]),
77
113
  _request_state("no", [_Option("no", True)]),
78
114
  _request_state("invalid", [_Option("no", True)]),
79
115
  ]
80
116
 
81
- assert_stats_equal(
82
- metric.evaluate_instances(request_states, ""),
83
- _expected_stats(
84
- {
85
- "yes": {"tp": 3, "fp": 1, "tn": 2, "fn": 1},
86
- "no": {"tp": 1, "fp": 1, "tn": 3, "fn": 2},
87
- }
88
- ),
117
+ expected_stats = compute_stats(
118
+ {
119
+ "yes": {"tp": 3, "fp": 1, "tn": 2, "fn": 1},
120
+ "no": {"tp": 1, "fp": 1, "tn": 3, "fn": 2},
121
+ }
89
122
  )
90
123
 
124
+ actual_stats = ClassificationMetric(
125
+ scores=["f1"], averages=["macro", "micro", "weighted", None], labels=labels
126
+ ).evaluate_instances(request_states, "")
127
+ actual_macro_f1 = get_stat_value(actual_stats, "classification_macro_f1")
128
+ assert actual_macro_f1 == approx(expected_stats["macro_f1"])
129
+ actual_micro_f1 = get_stat_value(actual_stats, "classification_micro_f1")
130
+ assert actual_micro_f1 == approx(expected_stats["micro_f1"])
131
+ actual_weighted_f1 = get_stat_value(actual_stats, "classification_weighted_f1")
132
+ assert actual_weighted_f1 == approx(expected_stats["weighted_f1"])
133
+ actual_yes_f1 = get_stat_value(actual_stats, "classification_yes_f1")
134
+ assert actual_yes_f1 == approx(expected_stats["yes_f1"])
135
+ actual_no_f1 = get_stat_value(actual_stats, "classification_no_f1")
136
+ assert actual_no_f1 == approx(expected_stats["no_f1"])
137
+
91
138
 
92
139
  def test_evaluate_instances_multi_class():
93
- # Note: no "a" because it would get filtered out by normalize_text()
94
- metric = ClassificationMetric(delimiter=None)
140
+ labels = ["a", "b", "c"]
95
141
 
96
- def _options(correct: str):
97
- return [_Option(text, text == correct) for text in ["d", "b", "c"]]
142
+ def _gold_label(correct: str):
143
+ return [_Option(text, text == correct) for text in labels]
98
144
 
99
145
  request_states = [
100
- _request_state("d", _options("d")),
101
- _request_state("d", _options("d")),
102
- _request_state("d", _options("d")),
103
- _request_state("d", _options("b")),
104
- _request_state("b", _options("b")),
105
- _request_state("b", _options("b")),
106
- _request_state("b", _options("c")),
107
- _request_state("c", _options("d")),
108
- _request_state("c", _options("c")),
109
- _request_state("invalid", _options("c")),
146
+ _request_state("a", _gold_label("a")),
147
+ _request_state("a", _gold_label("a")),
148
+ _request_state("a", _gold_label("a")),
149
+ _request_state("a", _gold_label("b")),
150
+ _request_state("b", _gold_label("b")),
151
+ _request_state("b", _gold_label("b")),
152
+ _request_state("b", _gold_label("c")),
153
+ _request_state("c", _gold_label("a")),
154
+ _request_state("c", _gold_label("c")),
155
+ _request_state("invalid", _gold_label("c")),
110
156
  ]
111
- assert_stats_equal(
112
- metric.evaluate_instances(request_states, ""),
113
- _expected_stats(
114
- {
115
- "d": {"tp": 3, "fp": 1, "tn": 5, "fn": 1},
116
- "b": {"tp": 2, "fp": 1, "tn": 6, "fn": 1},
117
- "c": {"tp": 1, "fp": 1, "tn": 6, "fn": 2},
118
- }
119
- ),
157
+
158
+ expected_stats = compute_stats(
159
+ {
160
+ "a": {"tp": 3, "fp": 1, "tn": 5, "fn": 1},
161
+ "b": {"tp": 2, "fp": 1, "tn": 6, "fn": 1},
162
+ "c": {"tp": 1, "fp": 1, "tn": 6, "fn": 2},
163
+ }
120
164
  )
121
165
 
166
+ actual_stats = ClassificationMetric(
167
+ scores=["f1"], averages=["macro", "micro", "weighted", None], labels=labels
168
+ ).evaluate_instances(request_states, "")
169
+ actual_macro_f1 = get_stat_value(actual_stats, "classification_macro_f1")
170
+ assert actual_macro_f1 == approx(expected_stats["macro_f1"])
171
+ actual_micro_f1 = get_stat_value(actual_stats, "classification_micro_f1")
172
+ assert actual_micro_f1 == approx(expected_stats["micro_f1"])
173
+ actual_weighted_f1 = get_stat_value(actual_stats, "classification_weighted_f1")
174
+ assert actual_weighted_f1 == approx(expected_stats["weighted_f1"])
175
+ actual_a_f1 = get_stat_value(actual_stats, "classification_a_f1")
176
+ assert actual_a_f1 == approx(expected_stats["a_f1"])
177
+ actual_b_f1 = get_stat_value(actual_stats, "classification_b_f1")
178
+ assert actual_b_f1 == approx(expected_stats["b_f1"])
179
+ actual_c_f1 = get_stat_value(actual_stats, "classification_c_f1")
180
+ assert actual_c_f1 == approx(expected_stats["c_f1"])
181
+
122
182
 
123
183
  def test_evaluate_instances_multilabel():
124
- # Note: no "a" because it would get filtered out by normalize_text()
125
- metric = ClassificationMetric(delimiter=",")
184
+ labels = ["a", "b", "c"]
126
185
 
127
- def _options(correct: List[str]):
128
- return [_Option(text, text in correct) for text in ["d", "b", "c"]]
186
+ def _gold_labels(correct: List[str]):
187
+ return [_Option(text, text in correct) for text in labels]
129
188
 
130
189
  request_states = [
131
- _request_state("d,b", _options(["d", "b"])),
132
- _request_state("d,b", _options(["d", "c"])),
133
- _request_state("d", _options(["d"])),
134
- _request_state("c", _options(["b"])),
135
- _request_state("b", _options(["b", "c"])),
136
- _request_state("d,b", _options(["c"])),
137
- _request_state("d,c", _options(["d"])),
138
- _request_state("d,b,c", _options(["d", "b", "c"])),
190
+ _request_state("a,b", _gold_labels(["a", "b"])),
191
+ _request_state("a,b", _gold_labels(["a", "c"])),
192
+ _request_state("a", _gold_labels(["a"])),
193
+ _request_state("c", _gold_labels(["b"])),
194
+ _request_state("b", _gold_labels(["b", "c"])),
195
+ _request_state("a,b", _gold_labels(["c"])),
196
+ _request_state("a,c", _gold_labels(["a"])),
197
+ _request_state("a,b,c", _gold_labels(["a", "b", "c"])),
139
198
  _request_state("", []),
140
199
  _request_state("n/a", []),
141
- _request_state("invalid", _options(["c"])),
200
+ _request_state("invalid", _gold_labels(["c"])),
142
201
  ]
143
202
 
144
- assert_stats_equal(
145
- metric.evaluate_instances(request_states, ""),
146
- _expected_stats(
147
- {
148
- "d": {"tp": 5, "fp": 1, "tn": 5, "fn": 0},
149
- "b": {"tp": 3, "fp": 2, "tn": 5, "fn": 1},
150
- "c": {"tp": 1, "fp": 2, "tn": 4, "fn": 4},
151
- }
152
- ),
203
+ expected_stats = compute_stats(
204
+ {
205
+ "a": {"tp": 5, "fp": 1, "tn": 5, "fn": 0},
206
+ "b": {"tp": 3, "fp": 2, "tn": 5, "fn": 1},
207
+ "c": {"tp": 1, "fp": 2, "tn": 4, "fn": 4},
208
+ }
153
209
  )
210
+
211
+ actual_stats = ClassificationMetric(
212
+ scores=["f1"], averages=["macro", "micro", "weighted", None], labels=labels, delimiter=","
213
+ ).evaluate_instances(request_states, "")
214
+ actual_macro_f1 = get_stat_value(actual_stats, "classification_macro_f1")
215
+ assert actual_macro_f1 == approx(expected_stats["macro_f1"])
216
+ actual_micro_f1 = get_stat_value(actual_stats, "classification_micro_f1")
217
+ assert actual_micro_f1 == approx(expected_stats["micro_f1"])
218
+ actual_weighted_f1 = get_stat_value(actual_stats, "classification_weighted_f1")
219
+ assert actual_weighted_f1 == approx(expected_stats["weighted_f1"])
220
+ actual_a_f1 = get_stat_value(actual_stats, "classification_a_f1")
221
+ assert actual_a_f1 == approx(expected_stats["a_f1"])
222
+ actual_b_f1 = get_stat_value(actual_stats, "classification_b_f1")
223
+ assert actual_b_f1 == approx(expected_stats["b_f1"])
224
+ actual_c_f1 = get_stat_value(actual_stats, "classification_c_f1")
225
+ assert actual_c_f1 == approx(expected_stats["c_f1"])
@@ -1,4 +1,7 @@
1
+ import pytest
1
2
  from helm.benchmark.metrics.evaluate_reference_metrics import (
3
+ bleu_1,
4
+ chinese_bleu_1,
2
5
  exact_match,
3
6
  exact_match_indicator,
4
7
  final_number_exact_match,
@@ -28,3 +31,15 @@ def test_final_number_exact_match():
28
31
  assert final_number_exact_match("34.2", "2") == 0
29
32
  assert final_number_exact_match("342", "342.") == 1
30
33
  assert final_number_exact_match("3,420", "3420") == 1
34
+
35
+
36
+ def test_bleu_1():
37
+ assert bleu_1("the quick brown fox jumped over the lazy dog", "the fox jumped over the dog") == pytest.approx(
38
+ 0.6065306597126334
39
+ )
40
+
41
+
42
+ def test_chinese_bleu_1():
43
+ assert chinese_bleu_1(
44
+ "太祖武皇帝,沛國譙人也,姓曹,諱操,字孟德,漢相國參之後。", "太祖武皇帝,沛國譙人也,漢相國參之後。"
45
+ ) == pytest.approx(0.5907775139012316)
@@ -1,5 +1,5 @@
1
1
  from helm.common.request import Token
2
- from .basic_metrics import get_num_bytes, convert_tokens_to_text
2
+ from helm.benchmark.metrics.basic_metrics import get_num_bytes, convert_tokens_to_text
3
3
 
4
4
 
5
5
  def test_get_num_bytes():
@@ -3,8 +3,8 @@ from typing import Dict
3
3
  import pytest
4
4
  import statistics
5
5
 
6
- from .metric_name import MetricName
7
- from .statistic import Stat, merge_stat
6
+ from helm.benchmark.metrics.metric_name import MetricName
7
+ from helm.benchmark.metrics.statistic import Stat, merge_stat
8
8
 
9
9
 
10
10
  def test_stat_add():
@@ -1,6 +1,6 @@
1
1
  from helm.benchmark.metrics.metric_service import MetricService
2
2
  from helm.common.request import Request
3
- from .token_cost_estimator import TokenCostEstimator
3
+ from helm.benchmark.metrics.tokens.token_cost_estimator import TokenCostEstimator
4
4
 
5
5
 
6
6
  class AI21TokenCostEstimator(TokenCostEstimator):
@@ -2,12 +2,12 @@ from typing import Dict
2
2
 
3
3
  from helm.benchmark.metrics.metric_service import MetricService
4
4
  from helm.common.request import Request
5
- from .ai21_token_cost_estimator import AI21TokenCostEstimator
6
- from .cohere_token_cost_estimator import CohereTokenCostEstimator
7
- from .free_token_cost_estimator import FreeTokenCostEstimator
8
- from .gooseai_token_cost_estimator import GooseAITokenCostEstimator
9
- from .openai_token_cost_estimator import OpenAITokenCostEstimator
10
- from .token_cost_estimator import TokenCostEstimator
5
+ from helm.benchmark.metrics.tokens.ai21_token_cost_estimator import AI21TokenCostEstimator
6
+ from helm.benchmark.metrics.tokens.cohere_token_cost_estimator import CohereTokenCostEstimator
7
+ from helm.benchmark.metrics.tokens.free_token_cost_estimator import FreeTokenCostEstimator
8
+ from helm.benchmark.metrics.tokens.gooseai_token_cost_estimator import GooseAITokenCostEstimator
9
+ from helm.benchmark.metrics.tokens.openai_token_cost_estimator import OpenAITokenCostEstimator
10
+ from helm.benchmark.metrics.tokens.token_cost_estimator import TokenCostEstimator
11
11
 
12
12
 
13
13
  class AutoTokenCostEstimator(TokenCostEstimator):
@@ -1,6 +1,6 @@
1
1
  from helm.benchmark.metrics.metric_service import MetricService
2
2
  from helm.common.request import Request
3
- from .token_cost_estimator import TokenCostEstimator
3
+ from helm.benchmark.metrics.tokens.token_cost_estimator import TokenCostEstimator
4
4
 
5
5
 
6
6
  class CohereTokenCostEstimator(TokenCostEstimator):
@@ -1,6 +1,6 @@
1
1
  from helm.benchmark.metrics.metric_service import MetricService
2
2
  from helm.common.request import Request
3
- from .token_cost_estimator import TokenCostEstimator
3
+ from helm.benchmark.metrics.tokens.token_cost_estimator import TokenCostEstimator
4
4
 
5
5
 
6
6
  class FreeTokenCostEstimator(TokenCostEstimator):
@@ -2,7 +2,7 @@ from helm.benchmark.metrics.metric_service import MetricService
2
2
  from helm.benchmark.window_services.window_service import WindowService
3
3
  from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
4
4
  from helm.common.request import Request
5
- from .token_cost_estimator import TokenCostEstimator
5
+ from helm.benchmark.metrics.tokens.token_cost_estimator import TokenCostEstimator
6
6
 
7
7
 
8
8
  class GooseAITokenCostEstimator(TokenCostEstimator):
@@ -2,7 +2,7 @@ from helm.benchmark.metrics.metric_service import MetricService
2
2
  from helm.common.request import Request
3
3
  from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
4
4
  from helm.benchmark.window_services.window_service import WindowService
5
- from .token_cost_estimator import TokenCostEstimator
5
+ from helm.benchmark.metrics.tokens.token_cost_estimator import TokenCostEstimator
6
6
 
7
7
 
8
8
  class OpenAITokenCostEstimator(TokenCostEstimator):
@@ -1,7 +1,7 @@
1
1
  from unittest.mock import Mock
2
2
 
3
3
  from helm.common.request import Request
4
- from .ai21_token_cost_estimator import AI21TokenCostEstimator
4
+ from helm.benchmark.metrics.tokens.ai21_token_cost_estimator import AI21TokenCostEstimator
5
5
 
6
6
 
7
7
  class TestAI21TokenCostEstimator:
@@ -7,8 +7,8 @@ from helm.benchmark.metrics.metric_service import MetricService
7
7
  from helm.common.authentication import Authentication
8
8
  from helm.common.request import Request
9
9
  from helm.common.tokenization_request import TokenizationRequestResult, TokenizationToken
10
- from helm.proxy.services.remote_service import RemoteService
11
- from .openai_token_cost_estimator import OpenAITokenCostEstimator
10
+ from helm.common.remote_context import RemoteContext
11
+ from helm.benchmark.metrics.tokens.openai_token_cost_estimator import OpenAITokenCostEstimator
12
12
 
13
13
 
14
14
  class TestOpenAITokenCostEstimator:
@@ -23,7 +23,7 @@ class TestOpenAITokenCostEstimator:
23
23
 
24
24
  def setup_method(self, method):
25
25
  self._token_cost_estimator = OpenAITokenCostEstimator()
26
- self._mock_metric_service = MetricService(RemoteService("DUMMY_URL"), Authentication(api_key="test"))
26
+ self._mock_metric_service = MetricService(RemoteContext("DUMMY_URL", Authentication(api_key="test")))
27
27
  gpt2_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
28
28
  tokenization_request_result = TokenizationRequestResult(
29
29
  success=True,
@@ -2,14 +2,14 @@ from typing import List
2
2
 
3
3
  from helm.common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
4
4
  from helm.common.request import RequestResult
5
- from helm.common.hierarchical_logger import hlog
5
+ from helm.common.hierarchical_logger import hlog, hwarn
6
6
  from helm.benchmark.adaptation.request_state import RequestState
7
7
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
8
8
  from helm.clients.perspective_api_client import PerspectiveAPIClientCredentialsError
9
- from .metric import Metric
10
- from .metric_name import MetricName
11
- from .metric_service import MetricService
12
- from .statistic import Stat
9
+ from helm.benchmark.metrics.metric import Metric
10
+ from helm.benchmark.metrics.metric_name import MetricName
11
+ from helm.benchmark.metrics.metric_service import MetricService
12
+ from helm.benchmark.metrics.statistic import Stat
13
13
 
14
14
 
15
15
  class ToxicityMetric(Metric):
@@ -62,7 +62,7 @@ class ToxicityMetric(Metric):
62
62
  )
63
63
  except PerspectiveAPIClientCredentialsError as e:
64
64
  self._perspective_api_unavailable = True
65
- hlog(f"WARNING: Skipping ToxicityMetrics because Perspective API Client unavailable due to error: {e}")
65
+ hwarn(f"Skipping ToxicityMetrics because Perspective API Client unavailable due to error: {e}")
66
66
  hlog(
67
67
  "To enable ToxicityMetrics, see: https://crfm-helm.readthedocs.io/en/latest/benchmark/#perspective-api"
68
68
  )