crfm-helm 0.5.6__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crfm-helm has been flagged as potentially problematic; see the package's advisory page on the registry for details.

Files changed (394):
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/bbq_metrics.py +12 -0
  27. helm/benchmark/metrics/classification_metrics.py +19 -1
  28. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  29. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  30. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  31. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  32. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  33. helm/benchmark/metrics/comet_metric.py +1 -1
  34. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  35. helm/benchmark/metrics/copyright_metrics.py +1 -1
  36. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  37. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  38. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  39. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  40. helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
  41. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  42. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  43. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  44. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  45. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  46. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  47. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  48. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  49. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  50. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  51. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  52. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  53. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  54. helm/benchmark/metrics/medec_metrics.py +25 -2
  55. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  56. helm/benchmark/metrics/metric.py +25 -0
  57. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  58. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  59. helm/benchmark/metrics/safety_metrics.py +13 -1
  60. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  61. helm/benchmark/metrics/summac/model_summac.py +3 -3
  62. helm/benchmark/metrics/summarization_metrics.py +129 -1
  63. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  64. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  65. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  66. helm/benchmark/model_deployment_registry.py +11 -19
  67. helm/benchmark/presentation/create_plots.py +11 -2
  68. helm/benchmark/presentation/run_display.py +13 -3
  69. helm/benchmark/presentation/run_entry.py +2 -2
  70. helm/benchmark/presentation/schema.py +10 -22
  71. helm/benchmark/presentation/summarize.py +189 -14
  72. helm/benchmark/presentation/taxonomy_info.py +20 -0
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/run.py +15 -4
  75. helm/benchmark/run_expander.py +4 -0
  76. helm/benchmark/run_specs/arabic_run_specs.py +197 -0
  77. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  78. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  79. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  80. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  81. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  82. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  83. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  84. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  85. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  86. helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
  87. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  88. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
  89. helm/benchmark/runner.py +7 -0
  90. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  91. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  92. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  93. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  94. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  95. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  96. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  97. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  98. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  99. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  100. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  101. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
  102. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
  103. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
  104. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
  105. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
  106. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  107. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  108. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  109. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  110. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  111. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  112. helm/benchmark/scenarios/bold_scenario.py +15 -0
  113. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  115. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  116. helm/benchmark/scenarios/clear_scenario.py +23 -0
  117. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  118. helm/benchmark/scenarios/code_scenario.py +28 -0
  119. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  120. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  121. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  122. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  123. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  124. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  125. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  126. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  127. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  128. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  129. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  130. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  131. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  132. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  133. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  134. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  135. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  136. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  137. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  138. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  139. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  140. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  141. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  142. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  143. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  144. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  145. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  146. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  147. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  148. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  149. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  150. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  151. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  152. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  153. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  154. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  155. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  156. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  157. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  158. helm/benchmark/scenarios/ice_scenario.py +21 -1
  159. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  160. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  161. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
  162. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  163. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  164. helm/benchmark/scenarios/koala_scenario.py +21 -1
  165. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  166. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  167. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  168. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  169. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  170. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  171. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  172. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  173. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  174. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  175. helm/benchmark/scenarios/math_scenario.py +54 -20
  176. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  177. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  178. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  179. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  180. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  181. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  182. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  183. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  184. helm/benchmark/scenarios/medec_scenario.py +23 -0
  185. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  186. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  187. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  188. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  189. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  190. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  191. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  192. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  193. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  194. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  195. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  196. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  197. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  198. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  199. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  200. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  201. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  202. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  203. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  204. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  205. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  206. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  207. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  208. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  209. helm/benchmark/scenarios/quac_scenario.py +14 -0
  210. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  211. helm/benchmark/scenarios/raft_scenario.py +15 -0
  212. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  213. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  214. helm/benchmark/scenarios/scenario.py +31 -0
  215. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  216. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  217. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  218. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  219. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  220. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  221. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  222. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  223. helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
  224. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  225. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  226. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  227. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  228. helm/benchmark/scenarios/spider_scenario.py +18 -0
  229. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  230. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  231. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  232. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  233. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  234. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  235. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  236. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  237. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  238. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  239. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  240. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  241. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  242. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  243. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  244. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  245. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  246. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  247. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  248. helm/benchmark/slurm_jobs.py +1 -2
  249. helm/benchmark/slurm_runner.py +8 -1
  250. helm/benchmark/static/schema_arabic.yaml +271 -0
  251. helm/benchmark/static/schema_classic.yaml +0 -17
  252. helm/benchmark/static/schema_long_context.yaml +17 -18
  253. helm/benchmark/static/schema_medhelm.yaml +36 -0
  254. helm/benchmark/static/schema_slp.yaml +219 -0
  255. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  256. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  257. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  258. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  259. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  260. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  261. helm/benchmark/static_build/index.html +5 -6
  262. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  263. helm/clients/ai21_client.py +2 -0
  264. helm/clients/aleph_alpha_client.py +2 -0
  265. helm/clients/anthropic_client.py +7 -1
  266. helm/clients/audio_language/diva_llama_client.py +2 -0
  267. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  268. helm/clients/audio_language/llama_omni/constants.py +9 -0
  269. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  270. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  271. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  272. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  273. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  274. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  275. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  276. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  277. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  278. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  279. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  280. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  281. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  282. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  283. helm/clients/audio_language/llama_omni/utils.py +202 -0
  284. helm/clients/audio_language/llama_omni_client.py +2 -1
  285. helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
  286. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  287. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  288. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  289. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  290. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  291. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  292. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  293. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  294. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  295. helm/clients/bedrock_client.py +63 -6
  296. helm/clients/cohere_client.py +3 -0
  297. helm/clients/dspy_client.py +135 -0
  298. helm/clients/google_client.py +2 -0
  299. helm/clients/http_model_client.py +2 -0
  300. helm/clients/huggingface_client.py +4 -3
  301. helm/clients/ibm_client.py +3 -1
  302. helm/clients/image_generation/adobe_vision_client.py +2 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  304. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  305. helm/clients/image_generation/cogview2_client.py +2 -1
  306. helm/clients/image_generation/dalle2_client.py +2 -0
  307. helm/clients/image_generation/dalle_mini_client.py +2 -1
  308. helm/clients/image_generation/deep_floyd_client.py +2 -0
  309. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  310. helm/clients/image_generation/lexica_client.py +2 -0
  311. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  312. helm/clients/image_generation/mindalle_client.py +2 -1
  313. helm/clients/image_generation/together_image_generation_client.py +2 -0
  314. helm/clients/megatron_client.py +2 -0
  315. helm/clients/mistral_client.py +2 -0
  316. helm/clients/moderation_api_client.py +2 -0
  317. helm/clients/openai_client.py +38 -21
  318. helm/clients/openai_responses_client.py +34 -8
  319. helm/clients/openrouter_client.py +31 -0
  320. helm/clients/palmyra_client.py +2 -1
  321. helm/clients/reka_client.py +2 -1
  322. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  323. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  324. helm/clients/test_huggingface_client.py +3 -3
  325. helm/clients/test_openrouter_client.py +69 -0
  326. helm/clients/together_client.py +52 -13
  327. helm/clients/vertexai_client.py +23 -11
  328. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  329. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  330. helm/clients/vision_language/idefics_client.py +2 -1
  331. helm/clients/vision_language/open_flamingo_client.py +2 -1
  332. helm/clients/vision_language/paligemma_client.py +2 -1
  333. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  334. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  335. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  336. helm/clients/vllm_client.py +43 -7
  337. helm/clients/vllm_granite_thinking_client.py +56 -0
  338. helm/clients/writer_client.py +5 -2
  339. helm/common/critique_request.py +0 -1
  340. helm/common/hierarchical_logger.py +103 -34
  341. helm/common/object_spec.py +23 -8
  342. helm/common/optional_dependencies.py +1 -1
  343. helm/common/test_general.py +4 -0
  344. helm/common/test_logging.py +94 -0
  345. helm/config/model_deployments.yaml +1001 -187
  346. helm/config/model_metadata.yaml +602 -18
  347. helm/config/tokenizer_configs.yaml +202 -5
  348. helm/proxy/cli.py +1 -1
  349. helm/proxy/example_queries.py +8 -8
  350. helm/proxy/retry.py +5 -0
  351. helm/proxy/server.py +2 -1
  352. helm/proxy/static/index.css +4 -0
  353. helm/proxy/static/index.js +7 -1
  354. helm/tokenizers/auto_tokenizer.py +2 -2
  355. helm/tokenizers/grok_tokenizer.py +2 -0
  356. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  357. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  358. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  359. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  360. helm/benchmark/metrics/medalign_metrics.py +0 -14
  361. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  362. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  363. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  364. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  365. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  366. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  367. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  368. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  369. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  370. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  371. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
  372. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  373. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  374. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  375. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  376. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  377. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  378. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
  379. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
  380. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
  381. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
  382. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  383. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  384. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  385. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  386. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  387. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  388. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  389. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  390. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  391. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  392. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  393. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  394. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
@@ -8,7 +8,7 @@ from transformers.generation.stopping_criteria import (
 from typing import Any, Dict, List, Optional, TypedDict
 
 from helm.common.cache import CacheConfig
-from helm.common.hierarchical_logger import htrack_block, hlog, hwarn
+from helm.common.hierarchical_logger import hexception, htrack_block, hlog, hwarn
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import (
     wrap_request_time,
@@ -293,12 +293,12 @@ class HuggingFaceClient(CachingClient):
         if self._apply_chat_template:
             with self._wrapped_tokenizer as tokenizer:
                 if request.messages:
-                    prompt = tokenizer.apply_chat_template(request.messages, tokenize=False)
+                    prompt = tokenizer.apply_chat_template(request.messages, tokenize=False, add_generation_prompt=True)
                     assert isinstance(prompt, str)
                     return prompt
                 else:
                     prompt = tokenizer.apply_chat_template(
-                        [{"role": "user", "content": request.prompt}], tokenize=False
+                        [{"role": "user", "content": request.prompt}], tokenize=False, add_generation_prompt=True
                     )
                     assert isinstance(prompt, str)
                     return prompt
@@ -345,6 +345,7 @@ class HuggingFaceClient(CachingClient):
             response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except Exception as e:  # Do something if error is encountered.
             error: str = f"HuggingFace error: {e}"
+            hexception(e)
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
         completions = []
@@ -1,7 +1,7 @@
 from abc import ABC
 from abc import abstractmethod
 
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hexception, hlog
 from helm.common.cache import CacheConfig
 from helm.common.request import (
     Request,
@@ -249,6 +249,7 @@ class IbmChatClient(IbmClient):
             )
 
         except Exception as e:
+            hexception(e)
             error: str = f"IBM Chat client Model error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
@@ -263,5 +264,6 @@ class IbmTextClient(IbmClient):
                 inference_handler=GenerateInferenceHandler(inference_engine=self.inference_engine), request=request
             )
         except Exception as e:
+            hexception(e)
             error: str = f"IBM Text client Model error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
@@ -1,6 +1,7 @@
 from typing import List, Dict
 
 from helm.common.cache import Cache, CacheConfig
+from helm.common.hierarchical_logger import hexception
 from helm.common.request import Request, RequestResult, GeneratedOutput
 from helm.common.tokenization_request import (
     TokenizationRequest,
@@ -54,6 +55,7 @@ class AdobeVisionClient(Client):
 
             response, cached = self._cache.get(cache_key, fail)
         except RuntimeError as e:
+            hexception(e)
             error: str = f"Adobe Vision Client error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
@@ -1,6 +1,7 @@
 from typing import List, Dict
 
 from helm.common.cache import Cache, CacheConfig
+from helm.common.hierarchical_logger import hexception
 from helm.common.request import Request, RequestResult, GeneratedOutput
 from helm.common.tokenization_request import (
     TokenizationRequest,
@@ -74,6 +75,7 @@ class AlephAlphaImageGenerationClient(Client):
 
             response, cached = self._cache.get(cache_key, fail)
         except RuntimeError as e:
+            hexception(e)
             error: str = f"AlephAlphaVisionClient error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
@@ -184,7 +184,7 @@ def sparse_attention_2d_light(
     attention_dropout=None,
     log_attention_weights=None,
     add_scalar=0,
-    **kwargs
+    **kwargs,
 ):
     """
     q0, k0, v0: [batch_size, 1088, hidden_size]
@@ -9,7 +9,7 @@ from torchvision.utils import save_image
 
 from helm.common.cache import CacheConfig, Cache
 from helm.common.file_caches.file_cache import FileCache
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import Request, RequestResult, GeneratedOutput, wrap_request_time
 from helm.common.tokenization_request import (
@@ -167,6 +167,7 @@ class CogView2Client(Client):
             )
             results, cached = self._cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as e:
+            hexception(e)
             error: str = f"CogView2Client error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
@@ -4,6 +4,7 @@ import base64
 from helm.common.cache import CacheConfig, Cache
 from helm.common.general import hlog
 from helm.common.file_caches.file_cache import FileCache
+from helm.common.hierarchical_logger import hexception
 from helm.common.media_object import MultimediaObject
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import Request, RequestResult, GeneratedOutput, wrap_request_time
@@ -124,6 +125,7 @@ class DALLE2Client(Client):
124
125
  hlog(f"Failed safety check: {request.prompt}")
125
126
  return self.get_content_policy_violated_result(request)
126
127
  else:
128
+ hexception(error)
127
129
  return RequestResult(
128
130
  success=False, cached=False, error=f"DALL-E error: {error}", completions=[], embedding=[]
129
131
  )
@@ -5,7 +5,7 @@ from functools import partial
5
5
 
6
6
  from helm.common.cache import CacheConfig, Cache
7
7
  from helm.common.file_caches.file_cache import FileCache
8
- from helm.common.hierarchical_logger import hlog, htrack_block
8
+ from helm.common.hierarchical_logger import hexception, hlog, htrack_block
9
9
  from helm.common.optional_dependencies import handle_module_not_found_error
10
10
  from helm.common.request import Request, RequestResult, GeneratedOutput, wrap_request_time
11
11
  from helm.common.tokenization_request import (
@@ -166,6 +166,7 @@ class DALLEMiniClient(Client):
166
166
  )
167
167
  results, cached = self._cache.get(cache_key, wrap_request_time(do_it))
168
168
  except RuntimeError as e:
169
+ hexception(e)
169
170
  error: str = f"DALLEMiniClient error: {e}"
170
171
  return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
171
172
 
@@ -1,6 +1,7 @@
1
1
  from typing import List, Dict
2
2
 
3
3
  from helm.common.cache import Cache, CacheConfig
4
+ from helm.common.hierarchical_logger import hexception
4
5
  from helm.common.request import Request, RequestResult, GeneratedOutput
5
6
  from helm.common.tokenization_request import (
6
7
  TokenizationRequest,
@@ -54,6 +55,7 @@ class DeepFloydClient(Client):
54
55
 
55
56
  response, cached = self._cache.get(cache_key, fail)
56
57
  except RuntimeError as e:
58
+ hexception(e)
57
59
  error: str = f"DeepFloyd Client error: {e}"
58
60
  return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
59
61
 
@@ -7,7 +7,7 @@ import torch
7
7
  from helm.common.cache import CacheConfig, Cache
8
8
  from helm.common.file_caches.file_cache import FileCache
9
9
  from helm.common.gpu_utils import get_torch_device_name, is_cuda_available
10
- from helm.common.hierarchical_logger import hlog, htrack_block
10
+ from helm.common.hierarchical_logger import hexception, hlog, htrack_block
11
11
  from helm.common.optional_dependencies import handle_module_not_found_error
12
12
  from helm.common.request import Request, RequestResult, GeneratedOutput, wrap_request_time
13
13
  from helm.common.tokenization_request import (
@@ -178,6 +178,7 @@ class HuggingFaceDiffusersClient(Client):
178
178
  )
179
179
  results, cached = self._cache.get(cache_key, wrap_request_time(do_it))
180
180
  except RuntimeError as ex:
181
+ hexception(ex)
181
182
  error: str = f"HuggingFaceDiffusersClient error: {ex}"
182
183
  return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
183
184
 
@@ -5,6 +5,7 @@ import urllib.parse
5
5
 
6
6
  from helm.common.cache import CacheConfig, Cache
7
7
  from helm.common.file_caches.file_cache import FileCache
8
+ from helm.common.hierarchical_logger import hexception
8
9
  from helm.common.images_utils import encode_base64
9
10
  from helm.common.request import Request, RequestResult, GeneratedOutput, wrap_request_time
10
11
  from helm.common.tokenization_request import (
@@ -62,6 +63,7 @@ class LexicaClient(Client):
62
63
 
63
64
  response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
64
65
  except RuntimeError as e:
66
+ hexception(e)
65
67
  error: str = f"LexicaClient error: {e}"
66
68
  return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
67
69
 
@@ -141,7 +141,7 @@ class Encoder(nn.Module):
141
141
  in_channels: int,
142
142
  resolution: int,
143
143
  z_channels: int,
144
- double_z: Optional[bool] = None
144
+ double_z: Optional[bool] = None,
145
145
  ) -> None:
146
146
  super().__init__()
147
147
  self.ch = ch
@@ -232,7 +232,7 @@ class Decoder(nn.Module):
232
232
  in_channels: int,
233
233
  resolution: int,
234
234
  z_channels: int,
235
- double_z: bool
235
+ double_z: bool,
236
236
  ) -> None:
237
237
  super().__init__()
238
238
  self.ch = ch
@@ -5,7 +5,7 @@ import numpy as np
5
5
  from helm.common.cache import CacheConfig, Cache
6
6
  from helm.common.file_caches.file_cache import FileCache
7
7
  from helm.common.gpu_utils import get_torch_device_name
8
- from helm.common.hierarchical_logger import hlog, htrack_block
8
+ from helm.common.hierarchical_logger import hexception, hlog, htrack_block
9
9
  from helm.common.optional_dependencies import handle_module_not_found_error
10
10
  from helm.common.request import Request, RequestResult, GeneratedOutput, wrap_request_time
11
11
  from helm.common.tokenization_request import (
@@ -91,6 +91,7 @@ class MinDALLEClient(Client):
91
91
  )
92
92
  results, cached = self._cache.get(cache_key, wrap_request_time(do_it))
93
93
  except RuntimeError as ex:
94
+ hexception(ex)
94
95
  error: str = f"MinDALLEClient error: {ex}"
95
96
  return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
96
97
 
@@ -4,6 +4,7 @@ import requests
4
4
 
5
5
  from helm.common.cache import CacheConfig, Cache
6
6
  from helm.common.file_caches.file_cache import FileCache
7
+ from helm.common.hierarchical_logger import hexception
7
8
  from helm.common.request import Request, RequestResult, GeneratedOutput, wrap_request_time
8
9
  from helm.common.tokenization_request import (
9
10
  TokenizationRequest,
@@ -84,6 +85,7 @@ class TogetherImageGenerationClient(Client):
84
85
 
85
86
  response, cached = self._cache.get(cache_key, wrap_request_time(do_it))
86
87
  except RuntimeError as e:
88
+ hexception(e)
87
89
  error: str = f"TogetherVisionClient error: {e}"
88
90
  return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
89
91
 
@@ -4,6 +4,7 @@ from typing import Any, Dict, List
4
4
  import traceback
5
5
  from helm.common.cache import CacheConfig
6
6
 
7
+ from helm.common.hierarchical_logger import hexception
7
8
  from helm.common.request import (
8
9
  wrap_request_time,
9
10
  EMBEDDING_UNAVAILABLE_REQUEST_RESULT,
@@ -103,6 +104,7 @@ class MegatronClient(CachingClient):
103
104
  try:
104
105
  return self._make_request(request)
105
106
  except Exception as e:
107
+ hexception(e)
106
108
  return RequestResult(
107
109
  success=False,
108
110
  cached=False,
@@ -1,6 +1,7 @@
1
1
  import requests
2
2
  from typing import Any, Dict, List, Optional, TypedDict, Union
3
3
 
4
+ from helm.common.hierarchical_logger import hexception
4
5
  from helm.proxy.retry import NonRetriableException
5
6
  from helm.common.cache import CacheConfig
6
7
  from helm.common.media_object import IMAGE_TYPE, TEXT_TYPE
@@ -156,6 +157,7 @@ class MistralAIClient(CachingClient):
156
157
 
157
158
  response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
158
159
  except (requests.exceptions.RequestException, AssertionError) as e:
160
+ hexception(e)
159
161
  error: str = f"MistralClient error: {e}"
160
162
  return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
161
163
 
@@ -1,5 +1,6 @@
1
1
  from typing import Any, Dict
2
2
 
3
+ from helm.common.hierarchical_logger import hexception
3
4
  from helm.common.request import wrap_request_time
4
5
  from helm.common.cache import Cache, CacheConfig
5
6
  from helm.common.moderations_api_request import (
@@ -64,6 +65,7 @@ class ModerationAPIClient:
64
65
 
65
66
  response, cached = self.cache.get(raw_request, wrap_request_time(do_it))
66
67
  except openai.OpenAIError as e:
68
+ hexception(e)
67
69
  error: str = f"Moderation API error: {e}"
68
70
  return ModerationAPIRequestResult(
69
71
  success=False, cached=False, error=error, flagged=None, flagged_results=None, scores=None
@@ -10,7 +10,7 @@ from helm.common import multimodal_request_utils
10
10
  from helm.common.cache import CacheConfig
11
11
  from helm.common.media_object import TEXT_TYPE, MultimediaObject, MediaObject
12
12
  from helm.common.request import ErrorFlags, Thinking, wrap_request_time, Request, RequestResult, GeneratedOutput, Token
13
- from helm.common.hierarchical_logger import hlog, hwarn
13
+ from helm.common.hierarchical_logger import hlog, hwarn, hexception
14
14
  from helm.common.object_spec import get_class_by_name
15
15
  from helm.common.optional_dependencies import handle_module_not_found_error
16
16
  from helm.common.tokenization_request import (
@@ -33,9 +33,12 @@ class OpenAIClientUtils:
33
33
  @classmethod
34
34
  def is_reasoning_model(cls, model_engine: str) -> bool:
35
35
  # All OpenAI reasoning models start "o[somenumber]", so we regexp for that to future proof things
36
- return bool(re.match(r"^o\d+", model_engine))
36
+ return bool(re.match(r"^o\d+", model_engine)) or bool(re.match(r"^gpt-5", model_engine))
37
37
 
38
38
  # Error OpenAI throws when the image in the prompt violates their content policy
39
+ HARMFUL_INFORMATION_ERROR: str = (
40
+ "Invalid prompt: we've limited access to this content for safety reasons. This type of information may be used to benefit or to harm people." # noqa: E501
41
+ )
39
42
  INAPPROPRIATE_IMAGE_ERROR: str = "Your input image may contain content that is not allowed by our safety system"
40
43
  INAPPROPRIATE_PROMPT_ERROR: str = "Invalid prompt: your prompt was flagged"
41
44
  INAPPROPRIATE_PROMPT_AZURE_ERROR: str = (
@@ -44,12 +47,10 @@ class OpenAIClientUtils:
44
47
  INAPPROPRIATE_PROMPT_MICROSOFT_ERROR: str = (
45
48
  "The response was filtered due to the prompt triggering Microsoft's content management policy."
46
49
  )
47
-
48
- # OpenAI server error
49
- OPENAI_SERVER_ERROR: str = (
50
- "The server had an error processing your request. Sorry about that! You can retry your request, "
51
- "or contact us through our help center at help.openai.com if you keep seeing this error."
52
- )
50
+ # Grok content safety guidelines error message
51
+ # TODO: Refactor so that this is owned by the Grok client instead.
52
+ SAFETY_GUIDELINES_GROK_ERROR: str = "Content violates safety guidelines."
53
+ USAGE_GUIDELINES_GROK_ERROR: str = "Content violates usage guidelines."
53
54
 
54
55
  # Set the finish reason to this if the prompt violates OpenAI's content policy
55
56
  CONTENT_POLICY_VIOLATED_FINISH_REASON: str = (
@@ -74,21 +75,14 @@ class OpenAIClientUtils:
74
75
  completions=[empty_completion] * request.num_completions,
75
76
  embedding=[],
76
77
  )
77
- elif cls.OPENAI_SERVER_ERROR in str(e):
78
- # Handle these errors by returning an empty completion to unblock
79
- hwarn(f"OpenAI server error for request: {str(request)}")
80
- empty_completion = GeneratedOutput(
81
- text="",
82
- logprob=0,
83
- tokens=[],
84
- finish_reason={"reason": cls.OPENAI_SERVER_ERROR},
85
- )
78
+ elif cls.HARMFUL_INFORMATION_ERROR in str(e):
86
79
  return RequestResult(
87
- success=True,
80
+ success=False,
88
81
  cached=False,
89
- request_time=0,
90
- completions=[empty_completion] * request.num_completions,
82
+ error="Prompt blocked by OpenAI's safety filter",
83
+ completions=[],
91
84
  embedding=[],
85
+ error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
92
86
  )
93
87
  elif cls.INAPPROPRIATE_PROMPT_AZURE_ERROR in str(e) or cls.INAPPROPRIATE_PROMPT_MICROSOFT_ERROR in str(e):
94
88
  return RequestResult(
@@ -99,7 +93,26 @@ class OpenAIClientUtils:
99
93
  embedding=[],
100
94
  error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
101
95
  )
96
+ elif cls.SAFETY_GUIDELINES_GROK_ERROR in str(e):
97
+ return RequestResult(
98
+ success=False,
99
+ cached=False,
100
+ error="Grok API error: Content violates safety guidelines",
101
+ completions=[],
102
+ embedding=[],
103
+ error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
104
+ )
105
+ elif cls.USAGE_GUIDELINES_GROK_ERROR in str(e):
106
+ return RequestResult(
107
+ success=False,
108
+ cached=False,
109
+ error="Grok API error: Content violates usage guidelines",
110
+ completions=[],
111
+ embedding=[],
112
+ error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
113
+ )
102
114
 
115
+ hexception(e)
103
116
  error: str = f"OpenAI error: {e}"
104
117
  return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
105
118
 
@@ -118,11 +131,12 @@ class OpenAIClient(CachingClient):
118
131
  reasoning_effort: Optional[str] = None,
119
132
  openai_model_name: Optional[str] = None,
120
133
  output_processor: Optional[str] = None,
134
+ **kwargs,
121
135
  ):
122
136
  super().__init__(cache_config=cache_config)
123
137
  self.tokenizer = tokenizer
124
138
  self.tokenizer_name = tokenizer_name
125
- self.client = OpenAI(api_key=api_key, organization=org_id, base_url=base_url)
139
+ self.client = OpenAI(api_key=api_key, organization=org_id, base_url=base_url, **kwargs)
126
140
  self.reasoning_effort = reasoning_effort
127
141
  self.openai_model_name = openai_model_name
128
142
  self.output_processor: Optional[Callable[[str], str]] = (
@@ -157,6 +171,7 @@ class OpenAIClient(CachingClient):
157
171
  cache_key = self._get_cache_key(raw_request, request)
158
172
  response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
159
173
  except openai.OpenAIError as e:
174
+ hexception(e)
160
175
  error: str = f"OpenAI error: {e}"
161
176
  return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
162
177
 
@@ -423,6 +438,7 @@ class OpenAIClient(CachingClient):
423
438
  cache_key = self._get_cache_key(raw_request, request)
424
439
  response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
425
440
  except openai.OpenAIError as e:
441
+ hexception(e)
426
442
  error: str = f"OpenAI error: {e}"
427
443
  return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
428
444
 
@@ -478,6 +494,7 @@ class OpenAIClient(CachingClient):
478
494
  cache_key = self._get_cache_key({"audio": audio_path, "model": model}, request)
479
495
  response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
480
496
  except openai.OpenAIError as e:
497
+ hexception(e)
481
498
  error: str = f"OpenAI error: {e}"
482
499
  return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
483
500
 
@@ -5,6 +5,7 @@ from typing import Any, Dict, List, Optional, Union
5
5
 
6
6
  from helm.clients.openai_client import OpenAIClientUtils
7
7
  from helm.common.cache import CacheConfig
8
+ from helm.common.hierarchical_logger import hwarn
8
9
  from helm.common.media_object import TEXT_TYPE
9
10
  from helm.common.request import (
10
11
  Thinking,
@@ -60,7 +61,28 @@ class OpenAIResponseClient(CachingClient):
60
61
 
61
62
  def _make_raw_request(self, request: Request) -> dict[str, Any]:
62
63
  input: Union[str, List[Dict[str, Any]]]
63
- if request.multimodal_prompt is not None:
64
+
65
+ if (
66
+ (request.prompt and request.messages)
67
+ or (request.prompt and request.multimodal_prompt)
68
+ or (request.messages and request.multimodal_prompt)
69
+ ):
70
+ raise ValueError(
71
+ f"More than one of `prompt`, `messages` and `multimodal_prompt` was set in request: {request}"
72
+ )
73
+
74
+ if request.messages is not None:
75
+ # Checks that all messages have a role and some content
76
+ for message in request.messages:
77
+ if not message.get("role") or not message.get("content"):
78
+ raise ValueError("All messages must have a role and content")
79
+ # Checks that the last role is "user"
80
+ if request.messages[-1]["role"] != "user":
81
+ raise ValueError("Last message must have role 'user'")
82
+ if request.prompt != "":
83
+ hwarn("Since message is set, prompt will be ignored")
84
+ input = request.messages
85
+ elif request.multimodal_prompt is not None:
64
86
  content = []
65
87
  request.validate()
66
88
  for media_object in request.multimodal_prompt.media_objects:
@@ -101,6 +123,8 @@ class OpenAIResponseClient(CachingClient):
101
123
  # Plus other changes
102
124
  model_engine: str = request.model_engine
103
125
  if OpenAIClientUtils.is_reasoning_model(model_engine):
126
+ if "reasoning" not in raw_request:
127
+ raw_request["reasoning"] = {}
104
128
  raw_request["reasoning"]["summary"] = "detailed"
105
129
  # Avoid error:
106
130
  # "Error code: 400 - {'error': {'message': "Unsupported parameter: 'temperature' is
@@ -145,13 +169,15 @@ class OpenAIResponseClient(CachingClient):
145
169
  if request.echo_prompt:
146
170
  text_output += request.prompt
147
171
  for output in response["output"]:
148
- output_type = output["type"] # one of "message" or "reasoning" from API observation
149
- is_reasoning_output = output_type == "reasoning"
150
-
151
- if is_reasoning_output:
152
- reasoning_output += "\n".join([raw_output["text"] for raw_output in output["summary"]])
153
- else:
154
- text_output += "\n".join([raw_output["text"] for raw_output in output["content"]])
172
+ output_type = output[
173
+ "type"
174
+ ] # one of "message" or "reasoning" from API observation, but can also include tool calls
175
+
176
+ if output_type == "reasoning":
177
+ reasoning_output += "\n\n".join([raw_output["text"] for raw_output in output["summary"]])
178
+ elif output_type == "message":
179
+ text_output += "\n\n".join([raw_output["text"] for raw_output in output["content"]])
180
+ # (Other output types are ignored)
155
181
 
156
182
  completion = truncate_and_tokenize_response_text(
157
183
  text_output,
@@ -0,0 +1,31 @@
1
+ import os
2
+ from typing import Optional
3
+ from helm.clients.openai_client import OpenAIClient
4
+ from helm.common.cache import CacheConfig
5
+ from helm.tokenizers.tokenizer import Tokenizer
6
+
7
+
8
+ class OpenRouterClient(OpenAIClient):
9
+ def __init__(
10
+ self,
11
+ tokenizer_name: str,
12
+ tokenizer: Tokenizer,
13
+ cache_config: CacheConfig,
14
+ api_key: Optional[str] = None,
15
+ model_name: Optional[str] = None,
16
+ output_processor: Optional[str] = None,
17
+ ):
18
+ self.api_key = api_key or os.getenv("OPENROUTER_API_KEY")
19
+ self.base_url = "https://openrouter.ai/api/v1/"
20
+ super().__init__(
21
+ tokenizer,
22
+ tokenizer_name,
23
+ cache_config=cache_config,
24
+ output_processor=output_processor,
25
+ base_url=self.base_url,
26
+ api_key=self.api_key,
27
+ )
28
+ self.model_name = model_name
29
+
30
+ def _get_model_for_request(self, request):
31
+ return self.model_name or request.model
@@ -5,7 +5,7 @@ from typing import Any, Dict, List
5
5
 
6
6
  from helm.clients.openai_client import OpenAIClient
7
7
  from helm.common.cache import CacheConfig
8
- from helm.common.hierarchical_logger import hwarn
8
+ from helm.common.hierarchical_logger import hexception, hwarn
9
9
  from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token, ErrorFlags
10
10
  from helm.common.tokenization_request import (
11
11
  TokenizationRequest,
@@ -99,6 +99,7 @@ class PalmyraClient(CachingClient):
99
99
 
100
100
  response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
101
101
  except (requests.exceptions.RequestException, AssertionError) as e:
102
+ hexception(e)
102
103
  error: str = f"PalmyraClient error: {e}"
103
104
  return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
104
105
 
@@ -6,7 +6,7 @@ from helm.proxy.retry import NonRetriableException
6
6
  from helm.common.cache import CacheConfig
7
7
  from helm.common.media_object import TEXT_TYPE
8
8
  from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput
9
- from helm.common.hierarchical_logger import hwarn
9
+ from helm.common.hierarchical_logger import hexception, hwarn
10
10
  from helm.common.optional_dependencies import handle_module_not_found_error
11
11
  from helm.tokenizers.tokenizer import Tokenizer
12
12
  from helm.clients.client import CachingClient, truncate_and_tokenize_response_text
@@ -167,6 +167,7 @@ class RekaClient(CachingClient):
167
167
 
168
168
  response, cached = self.cache.get(raw_request, wrap_request_time(do_it))
169
169
  except (requests.exceptions.RequestException, AssertionError) as e:
170
+ hexception(e)
170
171
  error: str = f"RekaClient error: {e}"
171
172
  return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
172
173
 
@@ -39,7 +39,7 @@ class StanfordHealthCareAzureOpenAIClient(AzureOpenAIClient):
39
39
  tokenizer=tokenizer,
40
40
  tokenizer_name=tokenizer_name,
41
41
  cache_config=cache_config,
42
- api_key="unused",
42
+ api_key=api_key,
43
43
  base_url=base_url,
44
44
  azure_openai_deployment_name=openai_model_name,
45
45
  api_version=api_version,
@@ -50,7 +50,7 @@ class StanfordHealthCareAzureOpenAIClient(AzureOpenAIClient):
50
50
  tokenizer=tokenizer,
51
51
  tokenizer_name=tokenizer_name,
52
52
  cache_config=cache_config,
53
- api_key="unused",
53
+ api_key=api_key,
54
54
  endpoint=endpoint,
55
55
  azure_openai_deployment_name=openai_model_name,
56
56
  api_version=api_version,
@@ -5,6 +5,7 @@ from dataclasses import asdict
5
5
  from typing import Any, Dict, List, Optional
6
6
 
7
7
  from helm.common.cache import CacheConfig
8
+ from helm.common.hierarchical_logger import hexception
8
9
  from helm.common.request import (
9
10
  wrap_request_time,
10
11
  Request,
@@ -82,6 +83,7 @@ class StanfordHealthCareHTTPModelClient(CachingClient, ABC):
82
83
  request_time=response["request_time"],
83
84
  )
84
85
  except requests.exceptions.RequestException as e:
86
+ hexception(e)
85
87
  return RequestResult(success=False, cached=False, error=f"Request error: {e}", completions=[], embedding=[])
86
88
 
87
89
  @abstractmethod
@@ -9,7 +9,7 @@ from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
9
9
  class TestHuggingFaceClient:
10
10
  def test_gpt2(self):
11
11
  tokenizer = HuggingFaceTokenizer(
12
- BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai/gpt2"
12
+ BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai-community/gpt2"
13
13
  )
14
14
  client = HuggingFaceClient(
15
15
  cache_config=BlackHoleCacheConfig(),
@@ -36,7 +36,7 @@ class TestHuggingFaceClient:
36
36
  @pytest.mark.skip(reason="GPT-J 6B is 22 GB and extremely slow without a GPU.")
37
37
  def test_gptj_6b(self):
38
38
  tokenizer = HuggingFaceTokenizer(
39
- BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai/gpt2"
39
+ BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai-community/gpt2"
40
40
  )
41
41
  client = HuggingFaceClient(
42
42
  cache_config=BlackHoleCacheConfig(),
@@ -57,7 +57,7 @@ class TestHuggingFaceClient:
57
57
 
58
58
  def test_logprob(self):
59
59
  tokenizer = HuggingFaceTokenizer(
60
- BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai/gpt2"
60
+ BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai-community/gpt2"
61
61
  )
62
62
  client = HuggingFaceClient(
63
63
  cache_config=BlackHoleCacheConfig(),