PyPI - crfm-helm - Versions diffs - 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl - Mend

crfm-helm 0.5.4 (py3-none-any.whl) → 0.5.6 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crfm-helm might be problematic. Click here for more details.

Files changed (652)

crfm_helm-0.5.6.dist-info/METADATA +427 -0
crfm_helm-0.5.6.dist-info/RECORD +941 -0
{crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
helm/benchmark/adaptation/adapter_spec.py +13 -1
helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
helm/benchmark/adaptation/common_adapter_specs.py +69 -4
helm/benchmark/adaptation/prompt.py +1 -1
helm/benchmark/annotation/aci_bench_annotator.py +95 -0
helm/benchmark/annotation/air_bench_annotator.py +21 -6
helm/benchmark/annotation/annotator.py +5 -0
helm/benchmark/annotation/annotator_factory.py +3 -20
helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
helm/benchmark/annotation/bird_sql_annotator.py +58 -0
helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
helm/benchmark/annotation/dischargeme_annotator.py +107 -0
helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
helm/benchmark/annotation/live_qa_annotator.py +1 -1
helm/benchmark/annotation/med_dialog_annotator.py +99 -0
helm/benchmark/annotation/medalign_annotator.py +100 -0
helm/benchmark/annotation/medi_qa_annotator.py +98 -0
helm/benchmark/annotation/medication_qa_annotator.py +87 -63
helm/benchmark/annotation/mental_health_annotator.py +98 -0
helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
helm/benchmark/annotation/model_as_judge.py +214 -6
helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
helm/benchmark/annotation/omni_math_annotator.py +131 -0
helm/benchmark/annotation/spider_annotator.py +18 -0
helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
helm/benchmark/annotation/wildbench_annotator.py +119 -0
helm/benchmark/annotation_executor.py +35 -15
helm/benchmark/augmentations/cleva_perturbation.py +9 -8
helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
helm/benchmark/augmentations/dialect_perturbation.py +4 -5
helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
helm/benchmark/augmentations/gender_perturbation.py +2 -2
helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
helm/benchmark/augmentations/person_name_perturbation.py +4 -5
helm/benchmark/augmentations/perturbation.py +1 -1
helm/benchmark/augmentations/space_perturbation.py +2 -2
helm/benchmark/augmentations/suffix_perturbation.py +2 -2
helm/benchmark/augmentations/synonym_perturbation.py +4 -3
helm/benchmark/augmentations/test_perturbation.py +16 -13
helm/benchmark/augmentations/translate_perturbation.py +2 -2
helm/benchmark/augmentations/typos_perturbation.py +2 -2
helm/benchmark/data_preprocessor.py +2 -2
helm/benchmark/executor.py +11 -12
helm/benchmark/huggingface_registration.py +2 -7
helm/benchmark/metrics/aci_bench_metrics.py +14 -0
helm/benchmark/metrics/basic_metrics.py +6 -6
helm/benchmark/metrics/bbq_metrics.py +2 -2
helm/benchmark/metrics/bias_metrics.py +12 -3
helm/benchmark/metrics/bias_word_lists.py +1 -1
helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
helm/benchmark/metrics/bird_sql_metrics.py +28 -0
helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
helm/benchmark/metrics/classification_metrics.py +76 -12
helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
helm/benchmark/metrics/code_metrics.py +5 -5
helm/benchmark/metrics/comet_metric.py +125 -0
helm/benchmark/metrics/common_metric_specs.py +9 -2
helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
helm/benchmark/metrics/copyright_metrics.py +4 -4
helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
helm/benchmark/metrics/dischargeme_metrics.py +14 -0
helm/benchmark/metrics/disinformation_metrics.py +4 -4
helm/benchmark/metrics/dry_run_metrics.py +5 -5
helm/benchmark/metrics/efficiency_metrics.py +6 -6
helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
helm/benchmark/metrics/ifeval/__init__.py +0 -0
helm/benchmark/metrics/ifeval/instructions.py +1574 -0
helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
helm/benchmark/metrics/ifeval_metrics.py +55 -0
helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
helm/benchmark/metrics/language_modeling_metrics.py +4 -4
helm/benchmark/metrics/llm_jury_metrics.py +46 -0
helm/benchmark/metrics/machine_translation_metrics.py +2 -2
helm/benchmark/metrics/med_dialog_metrics.py +14 -0
helm/benchmark/metrics/medalign_metrics.py +14 -0
helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
helm/benchmark/metrics/medec_metrics.py +101 -0
helm/benchmark/metrics/medi_qa_metrics.py +14 -0
helm/benchmark/metrics/medication_qa_metrics.py +10 -19
helm/benchmark/metrics/melt_bias_metric.py +234 -0
helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
helm/benchmark/metrics/melt_metric_specs.py +43 -0
helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
helm/benchmark/metrics/mental_health_metrics.py +14 -0
helm/benchmark/metrics/metric.py +3 -3
helm/benchmark/metrics/metric_service.py +11 -11
helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
helm/benchmark/metrics/nltk_helper.py +32 -0
helm/benchmark/metrics/numeracy_metrics.py +4 -4
helm/benchmark/metrics/omni_math_metrics.py +32 -0
helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
helm/benchmark/metrics/output_processing_metric.py +60 -0
helm/benchmark/metrics/output_processors.py +15 -0
helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
helm/benchmark/metrics/ranking_metrics.py +3 -3
helm/benchmark/metrics/reference_metric.py +3 -3
helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
helm/benchmark/metrics/spider_metrics.py +7 -0
helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
helm/benchmark/metrics/statistic.py +1 -1
helm/benchmark/metrics/summac/model_summac.py +2 -3
helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
helm/benchmark/metrics/summarization_metrics.py +20 -9
helm/benchmark/metrics/test_bias_metrics.py +5 -1
helm/benchmark/metrics/test_classification_metrics.py +140 -68
helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
helm/benchmark/metrics/test_metric.py +1 -1
helm/benchmark/metrics/test_statistic.py +2 -2
helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
helm/benchmark/metrics/toxicity_metrics.py +6 -6
helm/benchmark/metrics/unitxt_metrics.py +7 -5
helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
helm/benchmark/metrics/vision_language/image_utils.py +2 -2
helm/benchmark/metrics/wildbench_metrics.py +34 -0
helm/benchmark/model_deployment_registry.py +6 -8
helm/benchmark/model_metadata_registry.py +16 -0
helm/benchmark/presentation/contamination.py +3 -3
helm/benchmark/presentation/create_plots.py +33 -12
helm/benchmark/presentation/run_display.py +13 -0
helm/benchmark/presentation/schema.py +2 -1
helm/benchmark/presentation/summarize.py +97 -67
helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
helm/benchmark/reeval_run.py +202 -0
helm/benchmark/reeval_runner.py +355 -0
helm/benchmark/run.py +86 -90
helm/benchmark/run_expander.py +90 -9
helm/benchmark/run_spec_factory.py +13 -0
helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
helm/benchmark/run_specs/audio_run_specs.py +657 -0
helm/benchmark/run_specs/call_center_run_specs.py +49 -0
helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
helm/benchmark/run_specs/classic_run_specs.py +1 -69
helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
helm/benchmark/run_specs/experimental_run_specs.py +142 -3
helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
helm/benchmark/run_specs/lite_run_specs.py +2 -2
helm/benchmark/run_specs/long_context_run_specs.py +141 -0
helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
helm/benchmark/run_specs/melt_run_specs.py +783 -0
helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
helm/benchmark/run_specs/oab_exams_specs.py +32 -0
helm/benchmark/run_specs/safety_run_specs.py +37 -0
helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
helm/benchmark/run_specs/sql_run_specs.py +54 -0
helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
helm/benchmark/run_specs/vlm_run_specs.py +103 -2
helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
helm/benchmark/runner.py +5 -5
helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
helm/benchmark/scenarios/air_bench_scenario.py +6 -1
helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
helm/benchmark/scenarios/audio_language/__init__.py +0 -0
helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
helm/benchmark/scenarios/banking77_scenario.py +6 -1
helm/benchmark/scenarios/bbq_scenario.py +1 -1
helm/benchmark/scenarios/big_bench_scenario.py +11 -1
helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
helm/benchmark/scenarios/blimp_scenario.py +1 -1
helm/benchmark/scenarios/bold_scenario.py +1 -1
helm/benchmark/scenarios/boolq_scenario.py +1 -1
helm/benchmark/scenarios/casehold_scenario.py +79 -0
helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
helm/benchmark/scenarios/clear_scenario.py +157 -0
helm/benchmark/scenarios/cleva_scenario.py +2 -2
helm/benchmark/scenarios/code_scenario.py +17 -4
helm/benchmark/scenarios/commonsense_scenario.py +1 -1
helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
helm/benchmark/scenarios/copyright_scenario.py +1 -1
helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
helm/benchmark/scenarios/disinformation_scenario.py +10 -1
helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
helm/benchmark/scenarios/gpqa_scenario.py +80 -0
helm/benchmark/scenarios/grammar.py +2 -2
helm/benchmark/scenarios/grammar_scenario.py +2 -2
helm/benchmark/scenarios/gsm_scenario.py +10 -1
helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
helm/benchmark/scenarios/headqa_scenario.py +136 -0
helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
helm/benchmark/scenarios/ice_scenario.py +8 -4
helm/benchmark/scenarios/ifeval_scenario.py +53 -0
helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
helm/benchmark/scenarios/imdb_scenario.py +11 -2
helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
helm/benchmark/scenarios/koala_scenario.py +1 -1
helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
helm/benchmark/scenarios/legal_support_scenario.py +11 -1
helm/benchmark/scenarios/legalbench_scenario.py +22 -3
helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
helm/benchmark/scenarios/lextreme_scenario.py +11 -1
helm/benchmark/scenarios/live_qa_scenario.py +1 -1
helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
helm/benchmark/scenarios/math_scenario.py +9 -1
helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
helm/benchmark/scenarios/med_qa_scenario.py +10 -1
helm/benchmark/scenarios/medalign_scenario.py +94 -0
helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
helm/benchmark/scenarios/medbullets_scenario.py +145 -0
helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
helm/benchmark/scenarios/medec_scenario.py +125 -0
helm/benchmark/scenarios/medhallu_scenario.py +72 -0
helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
helm/benchmark/scenarios/melt_scenarios.py +793 -0
helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
helm/benchmark/scenarios/mental_health_scenario.py +123 -0
helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
helm/benchmark/scenarios/mmlu_scenario.py +11 -1
helm/benchmark/scenarios/msmarco_scenario.py +1 -1
helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
helm/benchmark/scenarios/newsqa_scenario.py +1 -1
helm/benchmark/scenarios/numeracy_scenario.py +12 -2
helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
helm/benchmark/scenarios/omni_math_scenario.py +53 -0
helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
helm/benchmark/scenarios/quac_scenario.py +10 -1
helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
helm/benchmark/scenarios/raft_scenario.py +17 -2
helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
helm/benchmark/scenarios/scenario.py +9 -1
helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
helm/benchmark/scenarios/spider_scenario.py +91 -0
helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
helm/benchmark/scenarios/summarization_scenario.py +11 -1
helm/benchmark/scenarios/sumosum_scenario.py +157 -0
helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
helm/benchmark/scenarios/test_math_scenario.py +1 -0
helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
helm/benchmark/scenarios/the_pile_scenario.py +1 -1
helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
helm/benchmark/scenarios/unitxt_scenario.py +8 -2
helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
helm/benchmark/scenarios/vicuna_scenario.py +1 -1
helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
helm/benchmark/scenarios/wikifact_scenario.py +11 -1
helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
helm/benchmark/scenarios/wildbench_scenario.py +83 -0
helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
helm/benchmark/scenarios/xstest_scenario.py +1 -1
helm/benchmark/server.py +13 -1
helm/benchmark/slurm_runner.py +1 -1
helm/benchmark/static/schema_audio.yaml +763 -0
helm/benchmark/static/schema_autobencher.yaml +150 -0
helm/benchmark/static/schema_call_center.yaml +97 -60
helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
helm/benchmark/static/schema_czech_bank.yaml +148 -0
helm/benchmark/static/schema_enem_challenge.yaml +146 -0
helm/benchmark/static/schema_enterprise.yaml +319 -0
helm/benchmark/static/schema_finance.yaml +14 -12
helm/benchmark/static/schema_heim.yaml +1389 -0
helm/benchmark/static/schema_long_context.yaml +283 -0
helm/benchmark/static/schema_medhelm.yaml +1140 -0
helm/benchmark/static/schema_melt.yaml +1257 -0
helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
helm/benchmark/static/schema_safety.yaml +18 -1
helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
helm/benchmark/static/schema_slphelm.yaml +162 -0
helm/benchmark/static/schema_social_audio.yaml +224 -0
helm/benchmark/static/schema_sql.yaml +171 -0
helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
helm/benchmark/static/schema_vhelm.yaml +129 -56
helm/benchmark/static/schema_video.yaml +219 -0
helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
helm/benchmark/static_build/assets/index-94295e78.js +10 -0
helm/benchmark/static_build/assets/index-b9779128.css +1 -0
helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
helm/benchmark/static_build/config.js +1 -1
helm/benchmark/static_build/index.html +6 -6
helm/benchmark/window_services/default_window_service.py +1 -1
helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
helm/benchmark/window_services/ice_window_service.py +1 -1
helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
helm/benchmark/window_services/local_window_service.py +2 -2
helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
helm/benchmark/window_services/test_bloom_window_service.py +3 -3
helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
helm/benchmark/window_services/test_gptj_window_service.py +8 -3
helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
helm/benchmark/window_services/test_openai_window_service.py +8 -3
helm/benchmark/window_services/test_opt_window_service.py +3 -3
helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
helm/benchmark/window_services/test_t511b_window_service.py +3 -3
helm/benchmark/window_services/test_ul2_window_service.py +3 -3
helm/benchmark/window_services/test_utils.py +4 -5
helm/benchmark/window_services/test_yalm_window_service.py +3 -3
helm/benchmark/window_services/tokenizer_service.py +7 -8
helm/benchmark/window_services/yalm_window_service.py +1 -1
helm/clients/ai21_client.py +3 -3
helm/clients/aleph_alpha_client.py +1 -1
helm/clients/anthropic_client.py +69 -29
helm/clients/audio_language/__init__.py +0 -0
helm/clients/audio_language/diva_llama_client.py +120 -0
helm/clients/audio_language/llama_omni_client.py +198 -0
helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
helm/clients/audio_language/qwen_audiolm_client.py +152 -0
helm/clients/audio_language/test.py +62 -0
helm/clients/auto_client.py +4 -2
helm/clients/azure_openai_client.py +55 -0
helm/clients/bedrock_client.py +203 -7
helm/clients/bedrock_utils.py +33 -0
helm/clients/client.py +7 -7
helm/clients/clip_scorers/clip_scorer.py +1 -1
helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
helm/clients/cohere_client.py +3 -3
helm/clients/google_client.py +1 -1
helm/clients/grok_client.py +36 -0
helm/clients/http_model_client.py +1 -1
helm/clients/huggingface_client.py +52 -21
helm/clients/huggingface_pipeline_client.py +138 -0
helm/clients/ibm_client.py +267 -0
helm/clients/image_generation/adobe_vision_client.py +1 -1
helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
helm/clients/image_generation/cogview2_client.py +1 -1
helm/clients/image_generation/dalle2_client.py +1 -1
helm/clients/image_generation/dalle3_client.py +2 -2
helm/clients/image_generation/dalle_mini/__init__.py +1 -1
helm/clients/image_generation/dalle_mini/data.py +1 -1
helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
helm/clients/image_generation/dalle_mini_client.py +1 -1
helm/clients/image_generation/deep_floyd_client.py +1 -1
helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
helm/clients/image_generation/lexica_client.py +1 -1
helm/clients/image_generation/mindalle/models/__init__.py +6 -6
helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
helm/clients/image_generation/mindalle_client.py +1 -1
helm/clients/image_generation/together_image_generation_client.py +1 -1
helm/clients/lit_gpt_client.py +2 -2
helm/clients/mistral_client.py +62 -18
helm/clients/nvidia_nim_client.py +0 -3
helm/clients/openai_client.py +308 -43
helm/clients/openai_responses_client.py +174 -0
helm/clients/palmyra_client.py +3 -9
helm/clients/reka_client.py +3 -3
helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
helm/clients/stanfordhealthcare_claude_client.py +31 -0
helm/clients/stanfordhealthcare_google_client.py +43 -0
helm/clients/stanfordhealthcare_http_model_client.py +93 -0
helm/clients/stanfordhealthcare_openai_client.py +62 -0
helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
helm/clients/test_client.py +1 -1
helm/clients/test_together_client.py +6 -1
helm/clients/together_client.py +76 -9
helm/clients/upstage_client.py +23 -0
helm/clients/vertexai_client.py +45 -13
helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
helm/clients/vision_language/huggingface_vlm_client.py +2 -2
helm/clients/vision_language/idefics_client.py +6 -2
helm/clients/vision_language/open_flamingo/__init__.py +2 -2
helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
helm/clients/vision_language/paligemma_client.py +2 -2
helm/clients/vision_language/qwen2_vlm_client.py +188 -0
helm/clients/vision_language/qwen_vlm_client.py +7 -5
helm/clients/vllm_client.py +4 -6
helm/clients/writer_client.py +102 -0
helm/clients/yi_client.py +0 -3
helm/common/audio_utils.py +111 -0
helm/common/context.py +80 -0
helm/common/credentials_utils.py +5 -5
helm/common/file_caches/local_file_cache.py +1 -1
helm/common/file_caches/test_local_file_cache.py +1 -1
helm/common/general.py +9 -2
helm/common/hierarchical_logger.py +46 -3
helm/common/images_utils.py +2 -2
helm/common/local_context.py +140 -0
helm/common/media_object.py +2 -2
helm/common/multimodal_request_utils.py +26 -0
helm/common/reeval_parameters.py +12 -0
helm/common/remote_context.py +61 -0
helm/common/request.py +14 -2
helm/common/response_format.py +18 -0
helm/common/test_media_object.py +1 -1
helm/config/model_deployments.yaml +1792 -28
helm/config/model_metadata.yaml +1606 -51
helm/config/tokenizer_configs.yaml +521 -4
helm/proxy/cli.py +5 -3
helm/proxy/critique/mechanical_turk_utils.py +1 -1
helm/proxy/example_queries.py +1 -1
helm/proxy/server.py +11 -4
helm/proxy/services/remote_service.py +1 -1
helm/proxy/services/server_service.py +22 -86
helm/proxy/services/test_remote_service.py +2 -2
helm/proxy/services/test_service.py +1 -1
helm/proxy/static/general.js +122 -0
helm/proxy/static/help.html +99 -0
helm/proxy/static/index.css +57 -0
helm/proxy/static/index.html +40 -0
helm/proxy/static/index.js +456 -0
helm/proxy/static/info-icon.png +0 -0
helm/proxy/test_retry.py +1 -1
helm/proxy/token_counters/auto_token_counter.py +1 -1
helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
helm/tokenizers/caching_tokenizer.py +2 -30
helm/tokenizers/grok_tokenizer.py +53 -0
helm/tokenizers/http_model_tokenizer.py +1 -1
helm/tokenizers/huggingface_tokenizer.py +3 -3
helm/tokenizers/lit_gpt_tokenizer.py +1 -1
helm/tokenizers/test_anthropic_tokenizer.py +6 -2
helm/tokenizers/test_grok_tokenizer.py +33 -0
helm/tokenizers/test_huggingface_tokenizer.py +1 -1
helm/tokenizers/test_yalm_tokenizer.py +1 -1
helm/tokenizers/tiktoken_tokenizer.py +1 -1
helm/tokenizers/tokenizer.py +3 -1
helm/tokenizers/yalm_tokenizer.py +3 -3
helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
crfm_helm-0.5.4.dist-info/METADATA +0 -350
crfm_helm-0.5.4.dist-info/RECORD +0 -697
helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
helm/tokenizers/anthropic_tokenizer.py +0 -52
{crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
{crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
{crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0

helm/proxy/static/index.js ADDED Viewed

@@ -0,0 +1,456 @@
+/**
+ * This is a very quick and dirty frontend for just interacting with the models.
+ * Please refrain from adding additional functionality to this.
+ * TODO: Write this in React.
+ */
+$(function () {
+  const urlParams = decodeUrlParams(window.location.search);
+  let auth = null;
+  function censor(api_key) {
+    // Show only the first k letters
+    const k = 2;
+    if (api_key.length <= k) {
+      return api_key;
+    }
+    return api_key.substring(0, k) + "*".repeat(api_key.length - k);
+  }
+  // Logging in and out
+  function updateLogin() {
+    const $loginInfo = $("#loginInfo");
+    $loginInfo.empty();
+    let api_key = readCookie("api_key");
+    if (api_key) {
+      auth = { api_key };
+      $loginInfo.append(
+        $("<a>", { class: "nav-link", href: "#" })
+          .append("Logout of API key " + censor(api_key))
+          .click(() => {
+            eraseCookie("api_key");
+            updateLogin();
+          }),
+      );
+    } else {
+      auth = null;
+      $loginInfo.append(
+        $("<a>", { class: "nav-link", href: "#" })
+          .append("Login")
+          .click(() => {
+            api_key = prompt("Enter your API key:");
+            if (!api_key) {
+              return;
+            }
+            // Check the API key the user entered using endpoint /api/account
+            const args = { auth: JSON.stringify({ api_key }) };
+            $.getJSON("/api/account", args, (response) => {
+              console.log("/api/account", response);
+              if ("error" in response) {
+                alert("The API key you've entered is invalid. Try again.");
+              } else {
+                createCookie("api_key", api_key);
+                updateLogin();
+              }
+            });
+          }),
+      );
+    }
+  }
+  updateLogin();
+  ////////////////////////////////////////////////////////////
+  // Rendering functions
+  function renderExampleQueries(updateQuery) {
+    // Show links for each example query, so when you click on them, they populate the textboxes.
+    const $examplesBlock = $("<div>", { class: "examples-block" });
+    $examplesBlock.append($("<span>").append("Examples:"));
+    generalInfo.example_queries.forEach((query, i) => {
+      const href = "#";
+      const title =
+        "[Prompt]\n" +
+        query.prompt +
+        "\n[Settings]\n" +
+        query.settings +
+        "\n[Environments]\n" +
+        query.environments;
+      const $link = $("<a>", { href, title }).append(`[${i}]`);
+      $link.click(() => {
+        // Populate the query with the example
+        updateQuery(query);
+        urlParams.prompt = query.prompt;
+        urlParams.settings = query.settings;
+        urlParams.environments = query.environments;
+        updateBrowserLocation(urlParams);
+      });
+      $examplesBlock.append("&nbsp;");
+      $examplesBlock.append($link);
+    });
+    return $examplesBlock;
+  }
+  function renderQuery(handleQueryResult) {
+    // Render the textboxes for entering the query (which includes the prompt, settings, and environment)
+    const $queryBlock = $("<div>", { class: "block" });
+    const $prompt = $("<textarea>", {
+      cols: 90,
+      rows: 7,
+      placeholder: "Enter prompt",
+    }).val(urlParams.prompt);
+    const $settings = $("<textarea>", {
+      cols: 90,
+      rows: 5,
+      placeholder:
+        "Enter settings (e.g., model: openai/text-davinci-002 for Instruct GPT-3); click Help at the top to learn more",
+    }).val(urlParams.settings);
+    const $environments = $("<textarea>", {
+      cols: 90,
+      rows: 3,
+      placeholder:
+        "Enter environment variables (e.g., city: [Boston, New York]); click Help at the top to learn more",
+    }).val(urlParams.environments);
+    $queryBlock.data("prompt", $prompt);
+    $queryBlock.data("settings", $settings);
+    $queryBlock.data("environments", $environments);
+    function bindSubmit($text) {
+      $text.keypress((e) => {
+        if ((event.keyCode === 10 || event.keyCode === 13) && event.ctrlKey) {
+          submit();
+        }
+      });
+    }
+    bindSubmit($prompt);
+    bindSubmit($settings);
+    bindSubmit($environments);
+    function submit() {
+      if (!auth) {
+        alert("You must log in first.");
+        return;
+      }
+      const query = {
+        prompt: $prompt.val(),
+        settings: $settings.val(),
+        environments: $environments.val(),
+      };
+      urlParams.prompt = query.prompt;
+      urlParams.settings = query.settings;
+      urlParams.environments = query.environments;
+      updateBrowserLocation(urlParams);
+      $.getJSON("/api/query", query, handleQueryResult);
+    }
+    // Show examples of queries
+    const $exampleQueries = renderExampleQueries((query) => {
+      $queryBlock.data("prompt").val(query.prompt);
+      $queryBlock.data("settings").val(query.settings);
+      $queryBlock.data("environments").val(query.environments);
+      urlParams.prompt = query.prompt;
+      urlParams.settings = query.settings;
+      urlParams.environments = query.environments;
+      updateBrowserLocation();
+    });
+    const promptHelp =
+      "This is the text you feed into the language model to complete.\nExample:\n  Life is like";
+    const settingsHelp =
+      "Specifies what information we want from the language model (see [Help] for more details).\nExample:\n  temperature: ${temperature}\n  model: openai/davinci\n  max_tokens: 10\n  num_completions: 5";
+    const environmentsHelp =
+      "Specifies a list of values to try for each variable that appears in the prompt or settings.\nExample:\n  temperature: [0, 0.5, 1]";
+    const $promptLabel = $("<span>")
+      .append(helpIcon(promptHelp, "help.html#query"))
+      .append("Prompt");
+    const $settingsLabel = $("<span>")
+      .append(helpIcon(settingsHelp, "help.html#query"))
+      .append("Settings");
+    const $environmentsLabel = $("<span>")
+      .append(helpIcon(environmentsHelp, "help.html#query"))
+      .append("Environments");
+    $queryBlock.append($("<h4>").append("Query"));
+    $queryBlock.append($exampleQueries);
+    const $table = $("<table>", { class: "query-table" });
+    $table.append(
+      $("<tr>")
+        .append($("<td>").append($promptLabel))
+        .append($("<td>").append($prompt)),
+    );
+    $table.append(
+      $("<tr>")
+        .append($("<td>").append($settingsLabel))
+        .append($("<td>").append($settings)),
+    );
+    $table.append(
+      $("<tr>")
+        .append($("<td>").append($environmentsLabel))
+        .append($("<td>").append($environments)),
+    );
+    $queryBlock.append($table);
+    $queryBlock.append($("<button>").append("Submit").click(submit));
+    return $queryBlock;
+  }
+  function getChangingKeys(items) {
+    // Return the list of keys whose values vary across `items.`
+    if (items.length === 0) {
+      return [];
+    }
+    return Object.keys(items[0]).filter((key) => {
+      return !items.every(
+        (item) => JSON.stringify(item[key]) === JSON.stringify(items[0][key]),
+      );
+    });
+  }
+  function renderRequest(changingKeys, request) {
+    // Render the request metadata (e.g., temperature if it is changing)
+    const title = JSON.stringify(request);
+    // Always include model, never prompt (since that's shown right after).
+    const showKeys = ["model"].concat(
+      changingKeys.filter((key) => key !== "prompt" && key !== "model"),
+    );
+    const summary =
+      "[" + showKeys.map((key) => key + ":" + request[key]).join(", ") + "]";
+    return $("<div>", { title }).append(
+      summary + " " + multilineHtml(request.prompt),
+    );
+  }
+  function renderTime(time) {
+    return Math.round(time * 10) / 10 + "s";
+  }
+  function constructTokenGroups(tokens) {
+    // Note: sometimes multiple tokens correspond to one character, for example:
+    // ["bytes:\xe2\x80", "bytes:\x99"] => ’
+    // For these, we keep these in the buffer and collapse them, and concatenate the entries.
+    //
+    // get_num_bytes() and convert_tokens_to_text() in src/helm/benchmark/basic_metrics.py are adapted from this function.
+    const groups = [];
+    for (let i = 0; i < tokens.length;) {
+      // Aggregate consecutive tokens while they're "bytes:..."
+      const group = { tokens: [] };
+      if (tokens[i].text.startsWith("bytes:")) {
+        let bytestring = "";
+        while (i < tokens.length && tokens[i].text.startsWith("bytes:")) {
+          group.tokens.push(tokens[i]);
+          // Extract part after : (e.g., \xe2\x80)
+          bytestring += tokens[i].text.split(":")[1];
+          i++;
+        }
+        // Convert to encoded URI (e.g., %e2%80%99) and decode
+        group.text = decodeURIComponent(bytestring.replaceAll("\\x", "%"));
+      } else {
+        group.tokens.push(tokens[i]);
+        group.text = tokens[i].text;
+        i++;
+      }
+      groups.push(group);
+    }
+    return groups;
+  }
+  function renderTokens(tokens) {
+    // Render text as a sequence of tokens that you can interact with to see more information (e.g., logprobs)
+    const $result = $("<div>");
+    const groups = constructTokenGroups(tokens);
+    for (const group of groups) {
+      const $group = $("<span>", { class: "token" }).append(
+        multilineHtml(group.text),
+      );
+      $result.append($group);
+    }
+    return $result;
+  }
+  function renderRequestResult(requestResult) {
+    // Render a request's outcome: the error if there is one, otherwise one row per
+    // completion followed by the request time.
+    if (requestResult.error) {
+      return renderError(requestResult.error);
+    }
+    const $result = $("<div>");
+    for (const completion of requestResult.completions) {
+      // The completion text; its tooltip shows the unrounded log probability.
+      const $contents = $("<span>", {
+        title: `logprob: ${completion.logprob}`,
+      }).append(renderTokens(completion.tokens));
+      // Metadata: the rounded log probability and, when present, the finish reason.
+      const $metadata = $("<span>", { class: "metadata" }).append(
+        $("<span>", { title: "Log probability" }).append(
+          round(completion.logprob, 2),
+        ),
+      );
+      const finishReason = completion.finish_reason;
+      if (finishReason) {
+        const title =
+          "Generation finished because of this reason: " +
+          JSON.stringify(finishReason);
+        $metadata
+          .append(" ")
+          .append($("<span>", { title }).append(finishReason.reason));
+      }
+      const $row = $("<div>", { class: "completion" });
+      $result.append($row.append($metadata).append($contents));
+    }
+    $result.append($("<i>").append(renderTime(requestResult.request_time)));
+    return $result;
+  }
+  function renderAccount() {
+    // Render the account information (usage, quotas).
+    if (!auth) {
+      return null;
+    }
+    const $accountBlock = $("<div>", { class: "block" });
+    const args = { auth: JSON.stringify(auth) };
+    $.getJSON("/api/account", args, ([account]) => {
+      console.log("/api/account", account);
+      const items = [];
+      for (modelGroup in account.usages) {
+        for (granularity in account.usages[modelGroup]) {
+          const usage = account.usages[modelGroup][granularity];
+          // Only print out usage for model groups and granularities where there is a quota
+          if (usage.quota) {
+            const percent = Math.round((usage.used / usage.quota) * 100);
+            items.push(
+              `<b>${modelGroup}</b>: ${usage.period} (${usage.used} / ${usage.quota} = ${percent}%)`,
+            );
+          }
+        }
+      }
+      if (items.length === 0) {
+        items.push("no restrictions");
+      }
+      $accountBlock
+        .empty()
+        .append(
+          helpIcon(
+            "Specifies your usage/quota (321/10000) for each model group (e.g., gpt3) for the current period (e.g., 2022-1-2).",
+            "help.html#quotas",
+          ),
+        )
+        .append("Usage")
+        .append(": ")
+        .append(items.join(" | "));
+    });
+    return $accountBlock;
+  }
+  ////////////////////////////////////////////////////////////
+  // For index.html
+  function renderQueryInterface() {
+    // For index.html
+    const $accountBlock = $("<div>").append(renderAccount());
+    // Allow editing the query
+    const $queryBlock = renderQuery((queryResult) => {
+      // Create requests
+      console.log("/api/query", queryResult);
+      $requestsBlock.empty();
+      if (queryResult.error) {
+        $requestsBlock.append(renderError(queryResult.error));
+        return;
+      }
+      $requestsBlock.append(
+        $("<h4>").append(`Requests (${queryResult.requests.length})`),
+      );
+      if (queryResult.error) {
+        $requestsBlock.append(renderError(queryResult.error));
+        return;
+      }
+      const changingKeys = getChangingKeys(queryResult.requests);
+      queryResult.requests.forEach((request) => {
+        const $request = $("<div>", { class: "request" }).append(
+          renderRequest(changingKeys, request),
+        );
+        const $requestResult = $("<div>").append($("<i>").append("(waiting)"));
+        const args = {
+          auth: JSON.stringify(auth),
+          request: JSON.stringify(request),
+        };
+        $.getJSON("/api/request", args, (requestResult) => {
+          console.log("/api/request", request, requestResult);
+          $requestResult.empty().append(renderRequestResult(requestResult));
+          if (!requestResult.cached) {
+            $accountBlock.empty().append(renderAccount());
+          }
+        });
+        $request.append($requestResult);
+        $requestsBlock.append($request);
+      });
+    });
+    // Where the requests and responses come in
+    const $requestsBlock = $("<div>", { class: "block" });
+    const $group = $("<div>");
+    $group.append($accountBlock);
+    $group.append($queryBlock);
+    $group.append($requestsBlock);
+    return $group;
+  }
+  ////////////////////////////////////////////////////////////
+  // For help.html
+  function renderModelsTable() {
+    // Render the list of models
+    const $table = $("<table>", { class: "table" });
+    const $header = $("<tr>")
+      .append($("<td>").append("group"))
+      .append($("<td>").append("name"))
+      .append($("<td>").append("description"))
+      .append($("<td>").append("tags"));
+    $table.append($header);
+    generalInfo.all_models.forEach((model) => {
+      const $row = $("<tr>")
+        .append($("<td>").append($("<tt>").append(model.group)))
+        .append($("<td>").append($("<tt>").append(model.name)))
+        .append($("<td>").append(model.description))
+        .append($("<td>").append(model.tags.join(" ")));
+      $table.append($row);
+    });
+    return $table;
+  }
+  ////////////////////////////////////////////////////////////
+  // Main
+  let generalInfo;
+  $.getJSON("/api/general_info", (response) => {
+    generalInfo = response;
+    console.log("/api/general_info", generalInfo);
+    if (generalInfo.error) {
+      alert(generalInfo.error);
+      return;
+    }
+    // For index.html
+    const $main = $("#main");
+    if ($main.length > 0) {
+      $main.empty().append(renderQueryInterface());
+    }
+    // For help.html
+    const $helpModels = $("#help-models");
+    if ($helpModels.length > 0) {
+      $helpModels.empty().append(renderModelsTable());
+    }
+  });
+});

helm/proxy/static/info-icon.png ADDED Viewed

Binary file

helm/proxy/test_retry.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from retrying import RetryError
 from helm.common.request import RequestResult
-from .retry import retry_request, get_retry_decorator, retry_if_request_failed
+from helm.proxy.retry import retry_request, get_retry_decorator, retry_if_request_failed
 def test_retry_for_successful_request():

helm/proxy/token_counters/auto_token_counter.py CHANGED Viewed

@@ -4,7 +4,7 @@ from helm.benchmark.model_deployment_registry import ModelDeployment, get_model_
 from helm.common.request import Request, GeneratedOutput
 from helm.tokenizers.auto_tokenizer import AutoTokenizer
 from helm.common.tokenization_request import TokenizationRequest, TokenizationRequestResult
-from .token_counter import TokenCounter
+from helm.proxy.token_counters.token_counter import TokenCounter
 class AutoTokenCounter(TokenCounter):

helm/tokenizers/aleph_alpha_tokenizer.py CHANGED Viewed

@@ -10,7 +10,7 @@ from helm.common.tokenization_request import (
     DecodeRequest,
     TokenizationToken,
 )
-from .caching_tokenizer import CachingTokenizer
+from helm.tokenizers.caching_tokenizer import CachingTokenizer
 try:
     from aleph_alpha_client import Client as AlephAlphaPythonClient

helm/tokenizers/caching_tokenizer.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from abc import abstractmethod
 from dataclasses import asdict
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List
 from helm.common.cache import Cache, CacheConfig
 from helm.common.request import wrap_request_time
@@ -11,7 +11,7 @@ from helm.common.tokenization_request import (
     DecodeRequestResult,
     TokenizationToken,
 )
-from .tokenizer import Tokenizer
+from helm.tokenizers.tokenizer import Tokenizer
 class CachingTokenizer(Tokenizer):
@@ -153,31 +153,3 @@ class CachingTokenizer(Tokenizer):
             )
         except Exception as error:
             raise ValueError(f"Failed to decode tokens with {self.__class__.__name__} tokenizer: {error}") from error
-def cleanup_str(token: str, tokenizer_name: Optional[str] = None) -> str:
-    """
-    Certain tokenizers introduce special characters to represent spaces, such as
-    "Ġ" or "▁". This function removes those characters.
-    """
-    if tokenizer_name in [
-        "TsinghuaKEG/ice",
-        "bigscience/T0pp",
-        "google/t5-11b",
-        "google/flan-t5-xxl",
-        "google/ul2",
-        "Yandex/yalm",
-        "ai21/j1",
-        "together",
-    ]:
-        return token.replace("▁", " ")
-    elif tokenizer_name is not None and tokenizer_name.startswith("huggingface"):
-        return token.replace("Ġ", " ")
-    return token
-def cleanup_tokens(tokens: List[str], tokenizer_name: Optional[str] = None) -> List[str]:
-    """
-    Applies `cleanup_str` to each token in `tokens`.
-    """
-    return [cleanup_str(token, tokenizer_name) for token in tokens]

helm/tokenizers/grok_tokenizer.py ADDED Viewed

@@ -0,0 +1,53 @@
+import dataclasses
+import os
+from typing import Any, Dict, List, Optional
+import requests
+from helm.common.cache import CacheConfig
+from helm.common.tokenization_request import (
+    TokenizationRequest,
+    TokenizationToken,
+)
+from helm.tokenizers.caching_tokenizer import CachingTokenizer
+class GrokAPITokenizer(CachingTokenizer):
+    """Tokenizer that uses the xAI Grok Tokenize Text API
+    Doc: https://docs.x.ai/docs/api-reference#tokenize-text"""
+    def __init__(self, cache_config: CacheConfig, api_key: Optional[str] = None) -> None:
+        super().__init__(cache_config)
+        self.api_key = api_key or os.environ.get("XAI_API_KEY")
+    def _tokenization_request_to_cache_key(self, request: TokenizationRequest) -> Dict[str, Any]:
+        cache_key = dataclasses.asdict(request)
+        # Delete encode because the Grok API simulateously gives string and integer tokens.
+        del cache_key["encode"]
+        return cache_key
+    def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
+        if not self.api_key:
+            raise Exception(
+                "No Grok API key found. "
+                "Set grokApiKey in credentials.conf or set the GROK_API_KEY environment variable"
+            )
+        text = request["text"]
+        model = request["tokenizer"].split("/")[-1]
+        response = requests.post(
+            url="https://api.x.ai/v1/tokenize-text",
+            headers={"Authorization": f"Bearer {self.api_key}"},
+            json={"text": text, "model": model},
+        )
+        response.raise_for_status()
+        return response.json()
+    def _tokenization_raw_response_to_tokens(
+        self, response: Dict[str, Any], request: TokenizationRequest
+    ) -> List[TokenizationToken]:
+        raw_token_field_name = "token_id" if request.encode else "string_token"
+        return [TokenizationToken(raw_token[raw_token_field_name]) for raw_token in response["token_ids"]]
+    def _decode_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
+        raise NotImplementedError("The xAI API does not support decoding.")

helm/tokenizers/http_model_tokenizer.py CHANGED Viewed

@@ -11,7 +11,7 @@ from helm.common.tokenization_request import (
     TokenizationRequestResult,
     TokenizationToken,
 )
-from .tokenizer import Tokenizer
+from helm.tokenizers.tokenizer import Tokenizer
 import requests

helm/tokenizers/huggingface_tokenizer.py CHANGED Viewed

@@ -7,8 +7,8 @@ from helm.common.concurrency import ThreadSafeWrapper
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
 from helm.common.hierarchical_logger import htrack_block, hlog
-from .caching_tokenizer import CachingTokenizer
-from .tokenizer import cleanup_tokens
+from helm.tokenizers.caching_tokenizer import CachingTokenizer
+from helm.tokenizers.tokenizer import cleanup_tokens
 WrappedPreTrainedTokenizer = ThreadSafeWrapper[PreTrainedTokenizerBase]
@@ -119,7 +119,7 @@ class HuggingFaceTokenizer(CachingTokenizer):
                     tokens = tokenizer.encode(
                         request["text"],
                         truncation=request["truncation"],
-                        max_length=request["max_length"],
+                        max_length=max(request["max_length"], 0),
                         add_special_tokens=False,
                     )
             else:

helm/tokenizers/lit_gpt_tokenizer.py CHANGED Viewed

@@ -5,7 +5,7 @@ import torch
 from helm.common.cache import CacheConfig
 from helm.common.optional_dependencies import OptionalDependencyNotInstalled
-from .caching_tokenizer import CachingTokenizer
+from helm.tokenizers.caching_tokenizer import CachingTokenizer
 try:
     from lit_gpt import Tokenizer as InternalTokenizer

helm/tokenizers/test_anthropic_tokenizer.py CHANGED Viewed

@@ -10,7 +10,7 @@ from helm.common.tokenization_request import (
     TokenizationRequest,
     TokenizationRequestResult,
 )
-from helm.tokenizers.anthropic_tokenizer import AnthropicTokenizer
+from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
 class TestAnthropicTokenizer:
@@ -21,7 +21,11 @@ class TestAnthropicTokenizer:
     def setup_method(self, method):
         cache_file = tempfile.NamedTemporaryFile(delete=False)
         self.cache_path: str = cache_file.name
-        self.tokenizer = AnthropicTokenizer(SqliteCacheConfig(self.cache_path))
+        self.tokenizer = HuggingFaceTokenizer(
+            SqliteCacheConfig(self.cache_path),
+            tokenizer_name="anthropic/claude",
+            pretrained_model_name_or_path="Xenova/claude-tokenizer",
+        )
     def teardown_method(self, method):
         os.remove(self.cache_path)

helm/tokenizers/test_grok_tokenizer.py ADDED Viewed

@@ -0,0 +1,33 @@
+import os
+import pytest
+from helm.common.cache import BlackHoleCacheConfig
+from helm.common.tokenization_request import (
+    TokenizationRequest,
+    TokenizationToken,
+)
+from helm.tokenizers.grok_tokenizer import GrokAPITokenizer
+@pytest.mark.models
+def test_tokenize():
+    if not os.environ.get("XAI_API_KEY"):
+        pytest.skip("No xAI API key found; skipping test")
+    tokenizer = GrokAPITokenizer(cache_config=BlackHoleCacheConfig())
+    request = TokenizationRequest(tokenizer="xai/grok-3-beta", text="otter 🦦")
+    result = tokenizer.tokenize(request)
+    assert result.success
+    assert not result.cached
+    assert result.tokens == [TokenizationToken(token) for token in ["otter", "", "", ""]]
+@pytest.mark.models
+def test_encode():
+    if not os.environ.get("XAI_API_KEY"):
+        pytest.skip("No xAI API key found; skipping test")
+    tokenizer = GrokAPITokenizer(cache_config=BlackHoleCacheConfig())
+    request = TokenizationRequest(tokenizer="xai/grok-3-beta", text="otter 🦦", encode=True)
+    result = tokenizer.tokenize(request)
+    assert result.success
+    assert not result.cached
+    assert result.tokens == [TokenizationToken(token) for token in [142507, 11637, 294, 294]]

crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

Potentially problematic release.

crfm-helm 0.5.4py3-none-any.whl → 0.5.6py3-none-any.whl