PyPI - crfm-helm - Versions diffs - 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl - Mend

crfm-helm 0.5.4py3-none-any.whl → 0.5.6py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crfm-helm might be problematic. Click here for more details.

Files changed (652) hide show

crfm_helm-0.5.6.dist-info/METADATA +427 -0
crfm_helm-0.5.6.dist-info/RECORD +941 -0
{crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
helm/benchmark/adaptation/adapter_spec.py +13 -1
helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
helm/benchmark/adaptation/common_adapter_specs.py +69 -4
helm/benchmark/adaptation/prompt.py +1 -1
helm/benchmark/annotation/aci_bench_annotator.py +95 -0
helm/benchmark/annotation/air_bench_annotator.py +21 -6
helm/benchmark/annotation/annotator.py +5 -0
helm/benchmark/annotation/annotator_factory.py +3 -20
helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
helm/benchmark/annotation/bird_sql_annotator.py +58 -0
helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
helm/benchmark/annotation/dischargeme_annotator.py +107 -0
helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
helm/benchmark/annotation/live_qa_annotator.py +1 -1
helm/benchmark/annotation/med_dialog_annotator.py +99 -0
helm/benchmark/annotation/medalign_annotator.py +100 -0
helm/benchmark/annotation/medi_qa_annotator.py +98 -0
helm/benchmark/annotation/medication_qa_annotator.py +87 -63
helm/benchmark/annotation/mental_health_annotator.py +98 -0
helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
helm/benchmark/annotation/model_as_judge.py +214 -6
helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
helm/benchmark/annotation/omni_math_annotator.py +131 -0
helm/benchmark/annotation/spider_annotator.py +18 -0
helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
helm/benchmark/annotation/wildbench_annotator.py +119 -0
helm/benchmark/annotation_executor.py +35 -15
helm/benchmark/augmentations/cleva_perturbation.py +9 -8
helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
helm/benchmark/augmentations/dialect_perturbation.py +4 -5
helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
helm/benchmark/augmentations/gender_perturbation.py +2 -2
helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
helm/benchmark/augmentations/person_name_perturbation.py +4 -5
helm/benchmark/augmentations/perturbation.py +1 -1
helm/benchmark/augmentations/space_perturbation.py +2 -2
helm/benchmark/augmentations/suffix_perturbation.py +2 -2
helm/benchmark/augmentations/synonym_perturbation.py +4 -3
helm/benchmark/augmentations/test_perturbation.py +16 -13
helm/benchmark/augmentations/translate_perturbation.py +2 -2
helm/benchmark/augmentations/typos_perturbation.py +2 -2
helm/benchmark/data_preprocessor.py +2 -2
helm/benchmark/executor.py +11 -12
helm/benchmark/huggingface_registration.py +2 -7
helm/benchmark/metrics/aci_bench_metrics.py +14 -0
helm/benchmark/metrics/basic_metrics.py +6 -6
helm/benchmark/metrics/bbq_metrics.py +2 -2
helm/benchmark/metrics/bias_metrics.py +12 -3
helm/benchmark/metrics/bias_word_lists.py +1 -1
helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
helm/benchmark/metrics/bird_sql_metrics.py +28 -0
helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
helm/benchmark/metrics/classification_metrics.py +76 -12
helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
helm/benchmark/metrics/code_metrics.py +5 -5
helm/benchmark/metrics/comet_metric.py +125 -0
helm/benchmark/metrics/common_metric_specs.py +9 -2
helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
helm/benchmark/metrics/copyright_metrics.py +4 -4
helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
helm/benchmark/metrics/dischargeme_metrics.py +14 -0
helm/benchmark/metrics/disinformation_metrics.py +4 -4
helm/benchmark/metrics/dry_run_metrics.py +5 -5
helm/benchmark/metrics/efficiency_metrics.py +6 -6
helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
helm/benchmark/metrics/ifeval/__init__.py +0 -0
helm/benchmark/metrics/ifeval/instructions.py +1574 -0
helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
helm/benchmark/metrics/ifeval_metrics.py +55 -0
helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
helm/benchmark/metrics/language_modeling_metrics.py +4 -4
helm/benchmark/metrics/llm_jury_metrics.py +46 -0
helm/benchmark/metrics/machine_translation_metrics.py +2 -2
helm/benchmark/metrics/med_dialog_metrics.py +14 -0
helm/benchmark/metrics/medalign_metrics.py +14 -0
helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
helm/benchmark/metrics/medec_metrics.py +101 -0
helm/benchmark/metrics/medi_qa_metrics.py +14 -0
helm/benchmark/metrics/medication_qa_metrics.py +10 -19
helm/benchmark/metrics/melt_bias_metric.py +234 -0
helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
helm/benchmark/metrics/melt_metric_specs.py +43 -0
helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
helm/benchmark/metrics/mental_health_metrics.py +14 -0
helm/benchmark/metrics/metric.py +3 -3
helm/benchmark/metrics/metric_service.py +11 -11
helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
helm/benchmark/metrics/nltk_helper.py +32 -0
helm/benchmark/metrics/numeracy_metrics.py +4 -4
helm/benchmark/metrics/omni_math_metrics.py +32 -0
helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
helm/benchmark/metrics/output_processing_metric.py +60 -0
helm/benchmark/metrics/output_processors.py +15 -0
helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
helm/benchmark/metrics/ranking_metrics.py +3 -3
helm/benchmark/metrics/reference_metric.py +3 -3
helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
helm/benchmark/metrics/spider_metrics.py +7 -0
helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
helm/benchmark/metrics/statistic.py +1 -1
helm/benchmark/metrics/summac/model_summac.py +2 -3
helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
helm/benchmark/metrics/summarization_metrics.py +20 -9
helm/benchmark/metrics/test_bias_metrics.py +5 -1
helm/benchmark/metrics/test_classification_metrics.py +140 -68
helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
helm/benchmark/metrics/test_metric.py +1 -1
helm/benchmark/metrics/test_statistic.py +2 -2
helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
helm/benchmark/metrics/toxicity_metrics.py +6 -6
helm/benchmark/metrics/unitxt_metrics.py +7 -5
helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
helm/benchmark/metrics/vision_language/image_utils.py +2 -2
helm/benchmark/metrics/wildbench_metrics.py +34 -0
helm/benchmark/model_deployment_registry.py +6 -8
helm/benchmark/model_metadata_registry.py +16 -0
helm/benchmark/presentation/contamination.py +3 -3
helm/benchmark/presentation/create_plots.py +33 -12
helm/benchmark/presentation/run_display.py +13 -0
helm/benchmark/presentation/schema.py +2 -1
helm/benchmark/presentation/summarize.py +97 -67
helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
helm/benchmark/reeval_run.py +202 -0
helm/benchmark/reeval_runner.py +355 -0
helm/benchmark/run.py +86 -90
helm/benchmark/run_expander.py +90 -9
helm/benchmark/run_spec_factory.py +13 -0
helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
helm/benchmark/run_specs/audio_run_specs.py +657 -0
helm/benchmark/run_specs/call_center_run_specs.py +49 -0
helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
helm/benchmark/run_specs/classic_run_specs.py +1 -69
helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
helm/benchmark/run_specs/experimental_run_specs.py +142 -3
helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
helm/benchmark/run_specs/lite_run_specs.py +2 -2
helm/benchmark/run_specs/long_context_run_specs.py +141 -0
helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
helm/benchmark/run_specs/melt_run_specs.py +783 -0
helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
helm/benchmark/run_specs/oab_exams_specs.py +32 -0
helm/benchmark/run_specs/safety_run_specs.py +37 -0
helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
helm/benchmark/run_specs/sql_run_specs.py +54 -0
helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
helm/benchmark/run_specs/vlm_run_specs.py +103 -2
helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
helm/benchmark/runner.py +5 -5
helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
helm/benchmark/scenarios/air_bench_scenario.py +6 -1
helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
helm/benchmark/scenarios/audio_language/__init__.py +0 -0
helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
helm/benchmark/scenarios/banking77_scenario.py +6 -1
helm/benchmark/scenarios/bbq_scenario.py +1 -1
helm/benchmark/scenarios/big_bench_scenario.py +11 -1
helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
helm/benchmark/scenarios/blimp_scenario.py +1 -1
helm/benchmark/scenarios/bold_scenario.py +1 -1
helm/benchmark/scenarios/boolq_scenario.py +1 -1
helm/benchmark/scenarios/casehold_scenario.py +79 -0
helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
helm/benchmark/scenarios/clear_scenario.py +157 -0
helm/benchmark/scenarios/cleva_scenario.py +2 -2
helm/benchmark/scenarios/code_scenario.py +17 -4
helm/benchmark/scenarios/commonsense_scenario.py +1 -1
helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
helm/benchmark/scenarios/copyright_scenario.py +1 -1
helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
helm/benchmark/scenarios/disinformation_scenario.py +10 -1
helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
helm/benchmark/scenarios/gpqa_scenario.py +80 -0
helm/benchmark/scenarios/grammar.py +2 -2
helm/benchmark/scenarios/grammar_scenario.py +2 -2
helm/benchmark/scenarios/gsm_scenario.py +10 -1
helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
helm/benchmark/scenarios/headqa_scenario.py +136 -0
helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
helm/benchmark/scenarios/ice_scenario.py +8 -4
helm/benchmark/scenarios/ifeval_scenario.py +53 -0
helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
helm/benchmark/scenarios/imdb_scenario.py +11 -2
helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
helm/benchmark/scenarios/koala_scenario.py +1 -1
helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
helm/benchmark/scenarios/legal_support_scenario.py +11 -1
helm/benchmark/scenarios/legalbench_scenario.py +22 -3
helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
helm/benchmark/scenarios/lextreme_scenario.py +11 -1
helm/benchmark/scenarios/live_qa_scenario.py +1 -1
helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
helm/benchmark/scenarios/math_scenario.py +9 -1
helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
helm/benchmark/scenarios/med_qa_scenario.py +10 -1
helm/benchmark/scenarios/medalign_scenario.py +94 -0
helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
helm/benchmark/scenarios/medbullets_scenario.py +145 -0
helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
helm/benchmark/scenarios/medec_scenario.py +125 -0
helm/benchmark/scenarios/medhallu_scenario.py +72 -0
helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
helm/benchmark/scenarios/melt_scenarios.py +793 -0
helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
helm/benchmark/scenarios/mental_health_scenario.py +123 -0
helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
helm/benchmark/scenarios/mmlu_scenario.py +11 -1
helm/benchmark/scenarios/msmarco_scenario.py +1 -1
helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
helm/benchmark/scenarios/newsqa_scenario.py +1 -1
helm/benchmark/scenarios/numeracy_scenario.py +12 -2
helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
helm/benchmark/scenarios/omni_math_scenario.py +53 -0
helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
helm/benchmark/scenarios/quac_scenario.py +10 -1
helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
helm/benchmark/scenarios/raft_scenario.py +17 -2
helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
helm/benchmark/scenarios/scenario.py +9 -1
helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
helm/benchmark/scenarios/spider_scenario.py +91 -0
helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
helm/benchmark/scenarios/summarization_scenario.py +11 -1
helm/benchmark/scenarios/sumosum_scenario.py +157 -0
helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
helm/benchmark/scenarios/test_math_scenario.py +1 -0
helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
helm/benchmark/scenarios/the_pile_scenario.py +1 -1
helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
helm/benchmark/scenarios/unitxt_scenario.py +8 -2
helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
helm/benchmark/scenarios/vicuna_scenario.py +1 -1
helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
helm/benchmark/scenarios/wikifact_scenario.py +11 -1
helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
helm/benchmark/scenarios/wildbench_scenario.py +83 -0
helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
helm/benchmark/scenarios/xstest_scenario.py +1 -1
helm/benchmark/server.py +13 -1
helm/benchmark/slurm_runner.py +1 -1
helm/benchmark/static/schema_audio.yaml +763 -0
helm/benchmark/static/schema_autobencher.yaml +150 -0
helm/benchmark/static/schema_call_center.yaml +97 -60
helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
helm/benchmark/static/schema_czech_bank.yaml +148 -0
helm/benchmark/static/schema_enem_challenge.yaml +146 -0
helm/benchmark/static/schema_enterprise.yaml +319 -0
helm/benchmark/static/schema_finance.yaml +14 -12
helm/benchmark/static/schema_heim.yaml +1389 -0
helm/benchmark/static/schema_long_context.yaml +283 -0
helm/benchmark/static/schema_medhelm.yaml +1140 -0
helm/benchmark/static/schema_melt.yaml +1257 -0
helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
helm/benchmark/static/schema_safety.yaml +18 -1
helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
helm/benchmark/static/schema_slphelm.yaml +162 -0
helm/benchmark/static/schema_social_audio.yaml +224 -0
helm/benchmark/static/schema_sql.yaml +171 -0
helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
helm/benchmark/static/schema_vhelm.yaml +129 -56
helm/benchmark/static/schema_video.yaml +219 -0
helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
helm/benchmark/static_build/assets/index-94295e78.js +10 -0
helm/benchmark/static_build/assets/index-b9779128.css +1 -0
helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
helm/benchmark/static_build/config.js +1 -1
helm/benchmark/static_build/index.html +6 -6
helm/benchmark/window_services/default_window_service.py +1 -1
helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
helm/benchmark/window_services/ice_window_service.py +1 -1
helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
helm/benchmark/window_services/local_window_service.py +2 -2
helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
helm/benchmark/window_services/test_bloom_window_service.py +3 -3
helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
helm/benchmark/window_services/test_gptj_window_service.py +8 -3
helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
helm/benchmark/window_services/test_openai_window_service.py +8 -3
helm/benchmark/window_services/test_opt_window_service.py +3 -3
helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
helm/benchmark/window_services/test_t511b_window_service.py +3 -3
helm/benchmark/window_services/test_ul2_window_service.py +3 -3
helm/benchmark/window_services/test_utils.py +4 -5
helm/benchmark/window_services/test_yalm_window_service.py +3 -3
helm/benchmark/window_services/tokenizer_service.py +7 -8
helm/benchmark/window_services/yalm_window_service.py +1 -1
helm/clients/ai21_client.py +3 -3
helm/clients/aleph_alpha_client.py +1 -1
helm/clients/anthropic_client.py +69 -29
helm/clients/audio_language/__init__.py +0 -0
helm/clients/audio_language/diva_llama_client.py +120 -0
helm/clients/audio_language/llama_omni_client.py +198 -0
helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
helm/clients/audio_language/qwen_audiolm_client.py +152 -0
helm/clients/audio_language/test.py +62 -0
helm/clients/auto_client.py +4 -2
helm/clients/azure_openai_client.py +55 -0
helm/clients/bedrock_client.py +203 -7
helm/clients/bedrock_utils.py +33 -0
helm/clients/client.py +7 -7
helm/clients/clip_scorers/clip_scorer.py +1 -1
helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
helm/clients/cohere_client.py +3 -3
helm/clients/google_client.py +1 -1
helm/clients/grok_client.py +36 -0
helm/clients/http_model_client.py +1 -1
helm/clients/huggingface_client.py +52 -21
helm/clients/huggingface_pipeline_client.py +138 -0
helm/clients/ibm_client.py +267 -0
helm/clients/image_generation/adobe_vision_client.py +1 -1
helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
helm/clients/image_generation/cogview2_client.py +1 -1
helm/clients/image_generation/dalle2_client.py +1 -1
helm/clients/image_generation/dalle3_client.py +2 -2
helm/clients/image_generation/dalle_mini/__init__.py +1 -1
helm/clients/image_generation/dalle_mini/data.py +1 -1
helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
helm/clients/image_generation/dalle_mini_client.py +1 -1
helm/clients/image_generation/deep_floyd_client.py +1 -1
helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
helm/clients/image_generation/lexica_client.py +1 -1
helm/clients/image_generation/mindalle/models/__init__.py +6 -6
helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
helm/clients/image_generation/mindalle_client.py +1 -1
helm/clients/image_generation/together_image_generation_client.py +1 -1
helm/clients/lit_gpt_client.py +2 -2
helm/clients/mistral_client.py +62 -18
helm/clients/nvidia_nim_client.py +0 -3
helm/clients/openai_client.py +308 -43
helm/clients/openai_responses_client.py +174 -0
helm/clients/palmyra_client.py +3 -9
helm/clients/reka_client.py +3 -3
helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
helm/clients/stanfordhealthcare_claude_client.py +31 -0
helm/clients/stanfordhealthcare_google_client.py +43 -0
helm/clients/stanfordhealthcare_http_model_client.py +93 -0
helm/clients/stanfordhealthcare_openai_client.py +62 -0
helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
helm/clients/test_client.py +1 -1
helm/clients/test_together_client.py +6 -1
helm/clients/together_client.py +76 -9
helm/clients/upstage_client.py +23 -0
helm/clients/vertexai_client.py +45 -13
helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
helm/clients/vision_language/huggingface_vlm_client.py +2 -2
helm/clients/vision_language/idefics_client.py +6 -2
helm/clients/vision_language/open_flamingo/__init__.py +2 -2
helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
helm/clients/vision_language/paligemma_client.py +2 -2
helm/clients/vision_language/qwen2_vlm_client.py +188 -0
helm/clients/vision_language/qwen_vlm_client.py +7 -5
helm/clients/vllm_client.py +4 -6
helm/clients/writer_client.py +102 -0
helm/clients/yi_client.py +0 -3
helm/common/audio_utils.py +111 -0
helm/common/context.py +80 -0
helm/common/credentials_utils.py +5 -5
helm/common/file_caches/local_file_cache.py +1 -1
helm/common/file_caches/test_local_file_cache.py +1 -1
helm/common/general.py +9 -2
helm/common/hierarchical_logger.py +46 -3
helm/common/images_utils.py +2 -2
helm/common/local_context.py +140 -0
helm/common/media_object.py +2 -2
helm/common/multimodal_request_utils.py +26 -0
helm/common/reeval_parameters.py +12 -0
helm/common/remote_context.py +61 -0
helm/common/request.py +14 -2
helm/common/response_format.py +18 -0
helm/common/test_media_object.py +1 -1
helm/config/model_deployments.yaml +1792 -28
helm/config/model_metadata.yaml +1606 -51
helm/config/tokenizer_configs.yaml +521 -4
helm/proxy/cli.py +5 -3
helm/proxy/critique/mechanical_turk_utils.py +1 -1
helm/proxy/example_queries.py +1 -1
helm/proxy/server.py +11 -4
helm/proxy/services/remote_service.py +1 -1
helm/proxy/services/server_service.py +22 -86
helm/proxy/services/test_remote_service.py +2 -2
helm/proxy/services/test_service.py +1 -1
helm/proxy/static/general.js +122 -0
helm/proxy/static/help.html +99 -0
helm/proxy/static/index.css +57 -0
helm/proxy/static/index.html +40 -0
helm/proxy/static/index.js +456 -0
helm/proxy/static/info-icon.png +0 -0
helm/proxy/test_retry.py +1 -1
helm/proxy/token_counters/auto_token_counter.py +1 -1
helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
helm/tokenizers/caching_tokenizer.py +2 -30
helm/tokenizers/grok_tokenizer.py +53 -0
helm/tokenizers/http_model_tokenizer.py +1 -1
helm/tokenizers/huggingface_tokenizer.py +3 -3
helm/tokenizers/lit_gpt_tokenizer.py +1 -1
helm/tokenizers/test_anthropic_tokenizer.py +6 -2
helm/tokenizers/test_grok_tokenizer.py +33 -0
helm/tokenizers/test_huggingface_tokenizer.py +1 -1
helm/tokenizers/test_yalm_tokenizer.py +1 -1
helm/tokenizers/tiktoken_tokenizer.py +1 -1
helm/tokenizers/tokenizer.py +3 -1
helm/tokenizers/yalm_tokenizer.py +3 -3
helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
crfm_helm-0.5.4.dist-info/METADATA +0 -350
crfm_helm-0.5.4.dist-info/RECORD +0 -697
helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
helm/tokenizers/anthropic_tokenizer.py +0 -52
{crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
{crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
{crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0

helm/benchmark/static_build/config.js CHANGED Viewed

@@ -1,4 +1,4 @@
-window.RELEASE = "v1.0.0";
+window.RELEASE = "v1.9.0";
 window.BENCHMARK_OUTPUT_BASE_URL =
 	"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/";
 window.PROJECT_ID = "lite";

helm/benchmark/static_build/index.html CHANGED Viewed

@@ -2,16 +2,16 @@
 <html lang="en">
   <head>
     <meta charset="UTF-8" />
-    <link rel="icon" type="image/svg+xml" href="./helm.svg" />
+    <link rel="icon" type="image/svg+xml" href="https://crfm.stanford.edu/helm/helm.svg" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" />
     <title>Holistic Evaluation of Language Models (HELM)</title>
     <meta name="description" content="The Holistic Evaluation of Language Models (HELM) serves as a living benchmark for transparency in language models. Providing broad coverage and recognizing incompleteness, multi-metric measurements, and standardization. All data and analysis are freely accessible on the website for exploration and study." />
     <script type="text/javascript" src="./config.js"></script>
-    <script type="module" crossorigin src="./assets/index-3ee38b3d.js"></script>
-    <link rel="modulepreload" crossorigin href="./assets/react-d4a0b69b.js">
-    <link rel="modulepreload" crossorigin href="./assets/recharts-6d337683.js">
-    <link rel="modulepreload" crossorigin href="./assets/tremor-54a99cc4.js">
-    <link rel="stylesheet" href="./assets/index-05c76bb1.css">
+    <script type="module" crossorigin src="./assets/index-94295e78.js"></script>
+    <link rel="modulepreload" crossorigin href="./assets/react-f82877fd.js">
+    <link rel="modulepreload" crossorigin href="./assets/recharts-4037aff0.js">
+    <link rel="modulepreload" crossorigin href="./assets/tremor-38a10867.js">
+    <link rel="stylesheet" href="./assets/index-b9779128.css">
   </head>
   <body class="block">
     <div id="root"></div>

helm/benchmark/window_services/default_window_service.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from .local_window_service import LocalWindowService
+from helm.benchmark.window_services.local_window_service import LocalWindowService
 class DefaultWindowService(LocalWindowService):

helm/benchmark/window_services/encoder_decoder_window_service.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from abc import ABC
-from helm.common.hierarchical_logger import hlog
-from .local_window_service import LocalWindowService
+from helm.common.hierarchical_logger import hwarn
+from helm.benchmark.window_services.local_window_service import LocalWindowService
 class EncoderDecoderWindowService(LocalWindowService, ABC):
@@ -21,8 +21,8 @@ class EncoderDecoderWindowService(LocalWindowService, ABC):
         vs. the completions, we check the two values separately.
         """
         if expected_completion_token_length > self.max_output_length:
-            hlog(
-                f"WARNING: The expected completion token length ({expected_completion_token_length}) "
+            hwarn(
+                f"The expected completion token length ({expected_completion_token_length}) "
                 f"exceeds the max output length ({self.max_output_length})."
             )
         return self.get_num_tokens(text) <= self.max_request_length

helm/benchmark/window_services/ice_window_service.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from .local_window_service import LocalWindowService
+from helm.benchmark.window_services.local_window_service import LocalWindowService
 class ICEWindowService(LocalWindowService):

helm/benchmark/window_services/image_generation/lexica_search_window_service.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from .clip_window_service import CLIPWindowService
+from helm.benchmark.window_services.image_generation.clip_window_service import CLIPWindowService
 class LexicaSearchWindowService(CLIPWindowService):

helm/benchmark/window_services/image_generation/openai_dalle_window_service.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from .clip_window_service import CLIPWindowService
+from helm.benchmark.window_services.image_generation.clip_window_service import CLIPWindowService
 class OpenAIDALLEWindowService(CLIPWindowService):

helm/benchmark/window_services/local_window_service.py CHANGED Viewed

@@ -1,8 +1,8 @@
 from abc import ABC
 from typing import List, Optional, cast
-from .window_service import ConfigurableWindowService, EncodeResult
-from .tokenizer_service import TokenizerService
+from helm.benchmark.window_services.window_service import ConfigurableWindowService, EncodeResult
+from helm.benchmark.window_services.tokenizer_service import TokenizerService
 from helm.common.tokenization_request import (
     DecodeRequest,
     DecodeRequestResult,

helm/benchmark/window_services/test_anthropic_window_service.py CHANGED Viewed

@@ -3,9 +3,9 @@ import tempfile
 from typing import List
 from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
-from .tokenizer_service import TokenizerService
-from .window_service_factory import WindowServiceFactory
-from .test_utils import get_tokenizer_service, TEST_PROMPT
+from helm.benchmark.window_services.tokenizer_service import TokenizerService
+from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
+from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
 class TestAnthropicWindowService:

helm/benchmark/window_services/test_bloom_window_service.py CHANGED Viewed

@@ -3,9 +3,9 @@ import tempfile
 from typing import List
 from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
-from .tokenizer_service import TokenizerService
-from .window_service_factory import WindowServiceFactory
-from .test_utils import get_tokenizer_service, TEST_PROMPT
+from helm.benchmark.window_services.tokenizer_service import TokenizerService
+from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
+from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
 class TestBloomWindowService:

helm/benchmark/window_services/test_gpt2_window_service.py CHANGED Viewed

@@ -3,8 +3,13 @@ import tempfile
 from helm.benchmark.window_services.tokenizer_service import TokenizerService
 from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
-from .test_utils import get_tokenizer_service, TEST_PROMPT, GPT2_TEST_TOKENS, GPT2_TEST_TOKEN_IDS
-from .window_service_factory import WindowServiceFactory
+from helm.benchmark.window_services.test_utils import (
+    get_tokenizer_service,
+    TEST_PROMPT,
+    GPT2_TEST_TOKENS,
+    GPT2_TEST_TOKEN_IDS,
+)
+from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
 class TestGPT2WindowService:

helm/benchmark/window_services/test_gpt4_window_service.py CHANGED Viewed

@@ -2,9 +2,14 @@ import shutil
 import tempfile
 from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
-from .test_utils import get_tokenizer_service, TEST_PROMPT, GPT4_TEST_TOKEN_IDS, GPT4_TEST_TOKENS
-from .tokenizer_service import TokenizerService
-from .window_service_factory import WindowServiceFactory
+from helm.benchmark.window_services.test_utils import (
+    get_tokenizer_service,
+    TEST_PROMPT,
+    GPT4_TEST_TOKEN_IDS,
+    GPT4_TEST_TOKENS,
+)
+from helm.benchmark.window_services.tokenizer_service import TokenizerService
+from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
 class TestOpenAIWindowService:

helm/benchmark/window_services/test_gptj_window_service.py CHANGED Viewed

@@ -2,9 +2,14 @@ import shutil
 import tempfile
 from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
-from .tokenizer_service import TokenizerService
-from .window_service_factory import WindowServiceFactory
-from .test_utils import get_tokenizer_service, GPT2_TEST_TOKENS, GPT2_TEST_TOKEN_IDS, TEST_PROMPT
+from helm.benchmark.window_services.tokenizer_service import TokenizerService
+from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
+from helm.benchmark.window_services.test_utils import (
+    get_tokenizer_service,
+    GPT2_TEST_TOKENS,
+    GPT2_TEST_TOKEN_IDS,
+    TEST_PROMPT,
+)
 class TestGPTJWindowService:

helm/benchmark/window_services/test_gptneox_window_service.py CHANGED Viewed

@@ -3,9 +3,9 @@ import tempfile
 from typing import List
 from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
-from .tokenizer_service import TokenizerService
-from .window_service_factory import WindowServiceFactory
-from .test_utils import get_tokenizer_service, TEST_PROMPT
+from helm.benchmark.window_services.tokenizer_service import TokenizerService
+from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
+from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
 class TestGPTNeoXWindowService:

helm/benchmark/window_services/test_openai_window_service.py CHANGED Viewed

@@ -2,9 +2,14 @@ import shutil
 import tempfile
 from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
-from .test_utils import get_tokenizer_service, TEST_PROMPT, GPT2_TEST_TOKENS, GPT2_TEST_TOKEN_IDS
-from .tokenizer_service import TokenizerService
-from .window_service_factory import WindowServiceFactory
+from helm.benchmark.window_services.test_utils import (
+    get_tokenizer_service,
+    TEST_PROMPT,
+    GPT2_TEST_TOKENS,
+    GPT2_TEST_TOKEN_IDS,
+)
+from helm.benchmark.window_services.tokenizer_service import TokenizerService
+from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
 class TestOpenAIWindowService:

helm/benchmark/window_services/test_opt_window_service.py CHANGED Viewed

@@ -2,9 +2,9 @@ import shutil
 import tempfile
 from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
-from .test_utils import get_tokenizer_service, TEST_PROMPT
-from .tokenizer_service import TokenizerService
-from .window_service_factory import WindowServiceFactory
+from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
+from helm.benchmark.window_services.tokenizer_service import TokenizerService
+from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
 class TestOPTWindowService:

helm/benchmark/window_services/test_palmyra_window_service.py CHANGED Viewed

@@ -2,9 +2,9 @@ from tempfile import TemporaryDirectory
 from typing import List
 from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
-from .tokenizer_service import TokenizerService
-from .window_service_factory import WindowServiceFactory
-from .test_utils import get_tokenizer_service, TEST_PROMPT
+from helm.benchmark.window_services.tokenizer_service import TokenizerService
+from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
+from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
 class TestPalmyraWindowService:

helm/benchmark/window_services/test_t0pp_window_service.py CHANGED Viewed

@@ -3,9 +3,9 @@ import tempfile
 from typing import List
 from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
-from .tokenizer_service import TokenizerService
-from .window_service_factory import WindowServiceFactory
-from .test_utils import get_tokenizer_service, TEST_PROMPT
+from helm.benchmark.window_services.tokenizer_service import TokenizerService
+from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
+from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
 class TestT0ppWindowService:

helm/benchmark/window_services/test_t511b_window_service.py CHANGED Viewed

@@ -3,9 +3,9 @@ import tempfile
 from typing import List
 from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
-from .tokenizer_service import TokenizerService
-from .window_service_factory import WindowServiceFactory
-from .test_utils import get_tokenizer_service, TEST_PROMPT
+from helm.benchmark.window_services.tokenizer_service import TokenizerService
+from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
+from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
 class TestT511bWindowService:

helm/benchmark/window_services/test_ul2_window_service.py CHANGED Viewed

@@ -3,9 +3,9 @@ import tempfile
 from typing import List
 from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
-from .tokenizer_service import TokenizerService
-from .window_service_factory import WindowServiceFactory
-from .test_utils import get_tokenizer_service, TEST_PROMPT
+from helm.benchmark.window_services.tokenizer_service import TokenizerService
+from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
+from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
 class TestUL2WindowService:

helm/benchmark/window_services/test_utils.py CHANGED Viewed

@@ -1,10 +1,9 @@
 from typing import List
-from helm.common.authentication import Authentication
+from helm.common.local_context import LocalContext
 from helm.common.cache_backend_config import CacheBackendConfig
-from helm.proxy.services.server_service import ServerService
 from helm.benchmark.metrics.metric_service import MetricService
-from .tokenizer_service import TokenizerService
+from helm.benchmark.window_services.tokenizer_service import TokenizerService
 TEST_PROMPT: str = (
@@ -229,5 +228,5 @@ GPT4_TEST_TOKENS: List[str] = [
 def get_tokenizer_service(local_path: str, cache_backend_config: CacheBackendConfig) -> TokenizerService:
-    service = ServerService(base_path=local_path, root_mode=True, cache_backend_config=cache_backend_config)
-    return MetricService(service, Authentication("test"))
+    context = LocalContext(base_path=local_path, cache_backend_config=cache_backend_config)
+    return MetricService(context)

helm/benchmark/window_services/test_yalm_window_service.py CHANGED Viewed

@@ -2,9 +2,9 @@ import shutil
 import tempfile
 from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
-from .test_utils import get_tokenizer_service, TEST_PROMPT
-from .tokenizer_service import TokenizerService
-from .window_service_factory import WindowServiceFactory
+from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
+from helm.benchmark.window_services.tokenizer_service import TokenizerService
+from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
 class TestYaLMWindowService:

helm/benchmark/window_services/tokenizer_service.py CHANGED Viewed

@@ -1,26 +1,25 @@
-from helm.common.authentication import Authentication
+from helm.common.context import Context
 from helm.common.tokenization_request import (
     TokenizationRequest,
     TokenizationRequestResult,
     DecodeRequest,
     DecodeRequestResult,
 )
-from helm.proxy.services.service import Service
+# TODO: Rename this to TokenizerContext
 class TokenizerService:
     """
-    A wrapper around `Service` that makes only necessary server requests to tokenize.
+    A wrapper around `Context` that makes only necessary server requests to tokenize.
     """
-    def __init__(self, service: Service, auth: Authentication):
-        self._service: Service = service
-        self._auth: Authentication = auth
+    def __init__(self, context: Context):
+        self._context: Context = context
     def tokenize(self, request: TokenizationRequest) -> TokenizationRequestResult:
         """Tokenize via an API."""
-        return self._service.tokenize(self._auth, request)
+        return self._context.tokenize(request)
     def decode(self, request: DecodeRequest) -> DecodeRequestResult:
         """Decode via an API."""
-        return self._service.decode(self._auth, request)
+        return self._context.decode(request)

helm/benchmark/window_services/yalm_window_service.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from .local_window_service import LocalWindowService
+from helm.benchmark.window_services.local_window_service import LocalWindowService
 class YaLMWindowService(LocalWindowService):

helm/clients/ai21_client.py CHANGED Viewed

@@ -11,8 +11,8 @@ from helm.common.request import (
     GeneratedOutput,
     Token,
 )
-from .client import CachingClient, truncate_sequence, cleanup_str
-from .ai21_utils import AI21RequestError, handle_failed_request
+from helm.clients.client import CachingClient, truncate_sequence, cleanup_str
+from helm.clients.ai21_utils import AI21RequestError, handle_failed_request
 try:
     from ai21 import AI21Client as AISDKClient
@@ -186,7 +186,7 @@ class AI21ChatClient(CachingClient):
         completions: List[GeneratedOutput] = []
         for choice in response["choices"]:
-            completions.append(GeneratedOutput(text=choice["message"]["content"], logprob=0.0, tokens=[]))
+            completions.append(GeneratedOutput(text=choice["message"]["content"] or "", logprob=0.0, tokens=[]))
         return RequestResult(
             success=True,

helm/clients/aleph_alpha_client.py CHANGED Viewed

@@ -4,7 +4,7 @@ from helm.common.cache import CacheConfig
 from helm.common.media_object import TEXT_TYPE
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token
-from .client import CachingClient, truncate_sequence, generate_uid_for_multimodal_prompt
+from helm.clients.client import CachingClient, truncate_sequence, generate_uid_for_multimodal_prompt
 try:
     from aleph_alpha_client import Client, CompletionRequest, CompletionResponse, Image, Prompt

helm/clients/anthropic_client.py CHANGED Viewed

@@ -1,3 +1,4 @@
+import dataclasses
 from typing import Any, Dict, List, Optional, TypedDict, Union, cast
 import json
 import os
@@ -7,10 +8,11 @@ import time
 import urllib.parse
 from helm.common.cache import CacheConfig
-from helm.common.hierarchical_logger import htrack_block, hlog
+from helm.common.hierarchical_logger import htrack_block, hlog, hwarn
 from helm.common.media_object import IMAGE_TYPE, TEXT_TYPE
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import (
+    Thinking,
     wrap_request_time,
     EMBEDDING_UNAVAILABLE_REQUEST_RESULT,
     Request,
@@ -30,8 +32,12 @@ from helm.clients.client import CachingClient, truncate_sequence, truncate_and_t
 try:
     from anthropic import Anthropic, BadRequestError
     from anthropic.types import MessageParam
+    from anthropic.types.message import Message
+    from anthropic.types.text_block import TextBlock
+    from anthropic.types.thinking_block import ThinkingBlock
     from anthropic.types.image_block_param import ImageBlockParam
     from anthropic.types.text_block_param import TextBlockParam
+    from anthropic.types.thinking_config_enabled_param import ThinkingConfigEnabledParam
     import websocket
 except ModuleNotFoundError as e:
     handle_module_not_found_error(e, ["anthropic"])
@@ -231,30 +237,41 @@ class AnthropicMessagesRequest(TypedDict, total=False):
     temperature: float
     top_k: int
     top_p: float
+    thinking: ThinkingConfigEnabledParam
 class AnthropicMessagesRequestError(NonRetriableException):
     pass
-class AnthropicMessagesResponseError(Exception):
+class AnthropicMessagesEmptyContentError(Exception):
     pass
 class AnthropicMessagesClient(CachingClient):
     # Source: https://docs.anthropic.com/claude/docs/models-overview
-    MAX_OUTPUT_TOKENS: int = 4096
+    MAX_OUTPUT_TOKENS: int = 64000
     MAX_IMAGE_SIZE_BYTES: int = 5242880  # 5MB
     def __init__(
-        self, tokenizer: Tokenizer, tokenizer_name: str, cache_config: CacheConfig, api_key: Optional[str] = None
+        self,
+        tokenizer: Tokenizer,
+        tokenizer_name: str,
+        cache_config: CacheConfig,
+        thinking_budget_tokens: Optional[int] = None,
+        anthropic_model_name: Optional[str] = None,
+        api_key: Optional[str] = None,
+        stream: Optional[bool] = None,
     ):
         super().__init__(cache_config=cache_config)
         self.tokenizer = tokenizer
         self.tokenizer_name = tokenizer_name
         self.client = Anthropic(api_key=api_key)
         self.api_key: Optional[str] = api_key
+        self.anthropic_model_name: Optional[str] = anthropic_model_name
+        self.thinking_budget_tokens: Optional[int] = thinking_budget_tokens
+        self.stream: Optional[bool] = stream
     def make_request(self, request: Request) -> RequestResult:
         if request.max_tokens > AnthropicMessagesClient.MAX_OUTPUT_TOKENS:
@@ -293,8 +310,8 @@ class AnthropicMessagesClient(CachingClient):
                         image_width > AnthropicClient.MAX_IMAGE_DIMENSION
                         or image_height > AnthropicClient.MAX_IMAGE_DIMENSION
                     ):
-                        hlog(
-                            f"WARNING: Image {image_location} exceeds max allowed size: "
+                        hwarn(
+                            f"Image {image_location} exceeds max allowed size: "
                             f"{AnthropicClient.MAX_IMAGE_DIMENSION} pixels"
                         )
                         # Save the resized image to a temporary file
@@ -309,8 +326,8 @@ class AnthropicMessagesClient(CachingClient):
                             base64_image = encode_base64(temp_file.name, format="JPEG")
                     elif os.path.getsize(image_location) > AnthropicMessagesClient.MAX_IMAGE_SIZE_BYTES:
-                        hlog(
-                            f"WARNING: Image {image_location} exceeds max allowed size: "
+                        hwarn(
+                            f"Image {image_location} exceeds max allowed size: "
                             f"{AnthropicMessagesClient.MAX_IMAGE_SIZE_BYTES} bytes"
                         )
                         # Resize the image so it is smaller than the max allowed size
@@ -351,7 +368,7 @@ class AnthropicMessagesClient(CachingClient):
         raw_request: AnthropicMessagesRequest = {
             "messages": messages,
-            "model": request.model_engine,
+            "model": self.anthropic_model_name or request.model_engine,
             "stop_sequences": request.stop_sequences,
             "max_tokens": request.max_tokens,
             "temperature": request.temperature,
@@ -360,6 +377,15 @@ class AnthropicMessagesClient(CachingClient):
         }
         if system_message is not None:
             raw_request["system"] = cast(str, system_message["content"])
+        if self.thinking_budget_tokens:
+            raw_request["thinking"] = {
+                "type": "enabled",
+                "budget_tokens": self.thinking_budget_tokens,
+            }
+            # Avoid error:
+            # `top_k` must be unset when thinking is enabled. Please consult our documentation at https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#important-considerations-when-using-extended-thinking  # noqa: E501
+            del raw_request["top_k"]
         completions: List[GeneratedOutput] = []
         # `num_completions` is not supported, so instead make `num_completions` separate requests.
@@ -367,11 +393,15 @@ class AnthropicMessagesClient(CachingClient):
             def do_it() -> Dict[str, Any]:
                 try:
-                    result = self.client.messages.create(**raw_request).model_dump()
+                    if self.stream:
+                        with self.client.messages.stream(**raw_request) as message_stream:
+                            result = message_stream.get_final_message().model_dump()
+                    else:
+                        result = self.client.messages.create(**raw_request).model_dump()
                     if "content" not in result or not result["content"]:
-                        raise AnthropicMessagesResponseError(f"Anthropic response has empty content: {result}")
-                    elif "text" not in result["content"][0]:
-                        raise AnthropicMessagesResponseError(f"Anthropic response has non-text content: {result}")
+                        raise AnthropicMessagesEmptyContentError(f"Anthropic response has empty content: {result}")
+                    elif "text" not in result["content"][-1]:
+                        raise AnthropicMessagesEmptyContentError(f"Anthropic response has non-text content: {result}")
                     return result
                 except BadRequestError as e:
                     response = e.response.json()
@@ -387,9 +417,10 @@ class AnthropicMessagesClient(CachingClient):
                     },
                     request,
                 )
-                response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
-            except AnthropicMessagesResponseError:
-                hlog("WARNING: Response has empty content")
+                raw_response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
+            except AnthropicMessagesEmptyContentError:
+                hwarn("Anthropic response has empty content")
                 return RequestResult(
                     success=False,
                     cached=False,
@@ -399,32 +430,41 @@ class AnthropicMessagesClient(CachingClient):
                     error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
                 )
-            if _is_content_moderation_failure(response):
-                hlog(
-                    f"WARNING: Returning empty request for {request.model_deployment} "
-                    "due to content moderation filter"
-                )
+            if _is_content_moderation_failure(raw_response):
+                hwarn(f"Returning empty request for {request.model_deployment} " "due to content moderation filter")
                 return RequestResult(
                     success=False,
                     cached=cached,
-                    error=response["error"]["message"],
+                    error=raw_response["error"]["message"],
                     completions=[],
                     embedding=[],
                     error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
-                    request_time=response["request_time"],
-                    request_datetime=response["request_datetime"],
+                    request_time=raw_response["request_time"],
+                    request_datetime=raw_response["request_datetime"],
                 )
+            response_message: Message = Message.model_validate(raw_response)
+            response_text: Optional[str] = None
+            response_thinking: Optional[str] = None
+            for content in response_message.content:
+                if isinstance(content, TextBlock):
+                    response_text = content.text
+                elif isinstance(content, ThinkingBlock):
+                    response_thinking = content.thinking
+            if response_text is None:
+                raise Exception("Anthropic response did not contain text block")
             completion = truncate_and_tokenize_response_text(
-                response["content"][0]["text"], request, self.tokenizer, self.tokenizer_name, original_finish_reason=""
+                response_text, request, self.tokenizer, self.tokenizer_name, original_finish_reason=""
             )
+            if response_thinking is not None:
+                completion = dataclasses.replace(completion, thinking=Thinking(text=response_thinking))
             completions.append(completion)
         return RequestResult(
             success=True,
             cached=cached,
-            request_time=response["request_time"],
-            request_datetime=response["request_datetime"],
+            request_time=raw_response["request_time"],
+            request_datetime=raw_response["request_datetime"],
             completions=completions,
             embedding=[],
         )
@@ -617,8 +657,8 @@ class AnthropicLegacyClient(CachingClient):
                     if logprobs["tokens"] != tokens:
                         # This is a known limitation with the Anthropic API. For now keep track of the
                         # entries with the mismatch.
-                        hlog(
-                            f"WARNING: naive truncation for logprobs did not work."
+                        hwarn(
+                            f"naive truncation for logprobs did not work."
                             f"\nRequest:{raw_request}\nExpected: {tokens}\nActual: {logprobs['tokens']}"
                         )
                         check_logprobs = True

helm/clients/audio_language/__init__.py ADDED Viewed

File without changes

crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

Potentially problematic release.

crfm-helm 0.5.4py3-none-any.whl → 0.5.6py3-none-any.whl