crfm-helm 0.5.6__py3-none-any.whl → 0.5.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
- helm/benchmark/adaptation/adapter_spec.py +10 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/bbq_metrics.py +12 -0
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/safety_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/model_deployment_registry.py +11 -19
- helm/benchmark/presentation/create_plots.py +11 -2
- helm/benchmark/presentation/run_display.py +13 -3
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +10 -22
- helm/benchmark/presentation/summarize.py +189 -14
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/run.py +15 -4
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +197 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +2 -55
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +48 -1
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
- helm/benchmark/runner.py +7 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/banking77_scenario.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +480 -1
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +32 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
- helm/benchmark/scenarios/financebench_scenario.py +21 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +21 -0
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
- helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +19 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +54 -20
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +20 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/melt_scenarios.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +21 -0
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +350 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +18 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_long_context.yaml +17 -18
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/index.html +5 -6
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/clients/ai21_client.py +2 -0
- helm/clients/aleph_alpha_client.py +2 -0
- helm/clients/anthropic_client.py +7 -1
- helm/clients/audio_language/diva_llama_client.py +2 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
- helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/bedrock_client.py +63 -6
- helm/clients/cohere_client.py +3 -0
- helm/clients/dspy_client.py +135 -0
- helm/clients/google_client.py +2 -0
- helm/clients/http_model_client.py +2 -0
- helm/clients/huggingface_client.py +4 -3
- helm/clients/ibm_client.py +3 -1
- helm/clients/image_generation/adobe_vision_client.py +2 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/cogview2_client.py +2 -1
- helm/clients/image_generation/dalle2_client.py +2 -0
- helm/clients/image_generation/dalle_mini_client.py +2 -1
- helm/clients/image_generation/deep_floyd_client.py +2 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
- helm/clients/image_generation/lexica_client.py +2 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/image_generation/mindalle_client.py +2 -1
- helm/clients/image_generation/together_image_generation_client.py +2 -0
- helm/clients/megatron_client.py +2 -0
- helm/clients/mistral_client.py +2 -0
- helm/clients/moderation_api_client.py +2 -0
- helm/clients/openai_client.py +38 -21
- helm/clients/openai_responses_client.py +34 -8
- helm/clients/openrouter_client.py +31 -0
- helm/clients/palmyra_client.py +2 -1
- helm/clients/reka_client.py +2 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
- helm/clients/stanfordhealthcare_http_model_client.py +2 -0
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +52 -13
- helm/clients/vertexai_client.py +23 -11
- helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
- helm/clients/vision_language/huggingface_vlm_client.py +2 -0
- helm/clients/vision_language/idefics_client.py +2 -1
- helm/clients/vision_language/open_flamingo_client.py +2 -1
- helm/clients/vision_language/paligemma_client.py +2 -1
- helm/clients/vision_language/palmyra_vision_client.py +2 -0
- helm/clients/vision_language/qwen2_vlm_client.py +2 -1
- helm/clients/vision_language/qwen_vlm_client.py +2 -1
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +5 -2
- helm/common/critique_request.py +0 -1
- helm/common/hierarchical_logger.py +103 -34
- helm/common/object_spec.py +23 -8
- helm/common/optional_dependencies.py +1 -1
- helm/common/test_general.py +4 -0
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +1001 -187
- helm/config/model_metadata.yaml +602 -18
- helm/config/tokenizer_configs.yaml +202 -5
- helm/proxy/cli.py +1 -1
- helm/proxy/example_queries.py +8 -8
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/tokenizers/auto_tokenizer.py +2 -2
- helm/tokenizers/grok_tokenizer.py +2 -0
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
- helm/benchmark/scenarios/numeracy_scenario.py +0 -794
- helm/benchmark/static_build/assets/index-94295e78.js +0 -10
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
- helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
- helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
- /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
- /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
- /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
- /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
helm/clients/huggingface_client.py
CHANGED

@@ -8,7 +8,7 @@ from transformers.generation.stopping_criteria import (
 from typing import Any, Dict, List, Optional, TypedDict
 
 from helm.common.cache import CacheConfig
-from helm.common.hierarchical_logger import htrack_block, hlog, hwarn
+from helm.common.hierarchical_logger import hexception, htrack_block, hlog, hwarn
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import (
     wrap_request_time,
@@ -293,12 +293,12 @@ class HuggingFaceClient(CachingClient):
         if self._apply_chat_template:
             with self._wrapped_tokenizer as tokenizer:
                 if request.messages:
-                    prompt = tokenizer.apply_chat_template(request.messages, tokenize=False)
+                    prompt = tokenizer.apply_chat_template(request.messages, tokenize=False, add_generation_prompt=True)
                     assert isinstance(prompt, str)
                     return prompt
                 else:
                     prompt = tokenizer.apply_chat_template(
-                        [{"role": "user", "content": request.prompt}], tokenize=False
+                        [{"role": "user", "content": request.prompt}], tokenize=False, add_generation_prompt=True
                     )
                     assert isinstance(prompt, str)
                     return prompt
@@ -345,6 +345,7 @@ class HuggingFaceClient(CachingClient):
             response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except Exception as e:  # Do something if error is encountered.
             error: str = f"HuggingFace error: {e}"
+            hexception(e)
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
         completions = []
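A quick illustration of what the add_generation_prompt=True change does when a chat template is applied (a minimal sketch, not HELM code; the model id is an arbitrary example):

from transformers import AutoTokenizer

# Any chat model with a template works here; this id is just an example.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
messages = [{"role": "user", "content": "What is 2 + 2?"}]

without = tokenizer.apply_chat_template(messages, tokenize=False)
with_gen = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# with_gen ends with the template's assistant header (e.g. "<|im_start|>assistant\n"),
# so the model completes an assistant reply instead of continuing the user turn.
print(with_gen[len(without):])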
helm/clients/ibm_client.py
CHANGED

@@ -1,7 +1,7 @@
 from abc import ABC
 from abc import abstractmethod
 
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hexception, hlog
 from helm.common.cache import CacheConfig
 from helm.common.request import (
     Request,
@@ -249,6 +249,7 @@ class IbmChatClient(IbmClient):
             )
 
         except Exception as e:
+            hexception(e)
             error: str = f"IBM Chat client Model error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
@@ -263,5 +264,6 @@ class IbmTextClient(IbmClient):
                 inference_handler=GenerateInferenceHandler(inference_engine=self.inference_engine), request=request
             )
         except Exception as e:
+            hexception(e)
             error: str = f"IBM Text client Model error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
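The hexception additions above follow one pattern that repeats across the client diffs below: log the exception through the hierarchical logger before returning a failed RequestResult. A minimal sketch of that pattern (assuming hexception(e) records the exception and its traceback; call_with_logging and do_it are illustrative names, not HELM code):

from helm.common.hierarchical_logger import hexception
from helm.common.request import RequestResult

def call_with_logging(do_it):
    try:
        return do_it()
    except Exception as e:
        hexception(e)  # log the traceback instead of silently swallowing it
        return RequestResult(success=False, cached=False, error=f"Client error: {e}",
                             completions=[], embedding=[])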
helm/clients/image_generation/adobe_vision_client.py
CHANGED

@@ -1,6 +1,7 @@
 from typing import List, Dict
 
 from helm.common.cache import Cache, CacheConfig
+from helm.common.hierarchical_logger import hexception
 from helm.common.request import Request, RequestResult, GeneratedOutput
 from helm.common.tokenization_request import (
     TokenizationRequest,
@@ -54,6 +55,7 @@ class AdobeVisionClient(Client):
 
             response, cached = self._cache.get(cache_key, fail)
         except RuntimeError as e:
+            hexception(e)
             error: str = f"Adobe Vision Client error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
helm/clients/image_generation/aleph_alpha_image_generation_client.py
CHANGED

@@ -1,6 +1,7 @@
 from typing import List, Dict
 
 from helm.common.cache import Cache, CacheConfig
+from helm.common.hierarchical_logger import hexception
 from helm.common.request import Request, RequestResult, GeneratedOutput
 from helm.common.tokenization_request import (
     TokenizationRequest,
@@ -74,6 +75,7 @@ class AlephAlphaImageGenerationClient(Client):
 
             response, cached = self._cache.get(cache_key, fail)
         except RuntimeError as e:
+            hexception(e)
             error: str = f"AlephAlphaVisionClient error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
helm/clients/image_generation/cogview2_client.py
CHANGED

@@ -9,7 +9,7 @@ from torchvision.utils import save_image
 
 from helm.common.cache import CacheConfig, Cache
 from helm.common.file_caches.file_cache import FileCache
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import Request, RequestResult, GeneratedOutput, wrap_request_time
 from helm.common.tokenization_request import (
@@ -167,6 +167,7 @@ class CogView2Client(Client):
             )
             results, cached = self._cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as e:
+            hexception(e)
             error: str = f"CogView2Client error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
helm/clients/image_generation/dalle2_client.py
CHANGED

@@ -4,6 +4,7 @@ import base64
 from helm.common.cache import CacheConfig, Cache
 from helm.common.general import hlog
 from helm.common.file_caches.file_cache import FileCache
+from helm.common.hierarchical_logger import hexception
 from helm.common.media_object import MultimediaObject
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import Request, RequestResult, GeneratedOutput, wrap_request_time
@@ -124,6 +125,7 @@ class DALLE2Client(Client):
             hlog(f"Failed safety check: {request.prompt}")
             return self.get_content_policy_violated_result(request)
         else:
+            hexception(error)
             return RequestResult(
                 success=False, cached=False, error=f"DALL-E error: {error}", completions=[], embedding=[]
             )
helm/clients/image_generation/dalle_mini_client.py
CHANGED

@@ -5,7 +5,7 @@ from functools import partial
 
 from helm.common.cache import CacheConfig, Cache
 from helm.common.file_caches.file_cache import FileCache
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import Request, RequestResult, GeneratedOutput, wrap_request_time
 from helm.common.tokenization_request import (
@@ -166,6 +166,7 @@ class DALLEMiniClient(Client):
             )
             results, cached = self._cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as e:
+            hexception(e)
             error: str = f"DALLEMiniClient error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
helm/clients/image_generation/deep_floyd_client.py
CHANGED

@@ -1,6 +1,7 @@
 from typing import List, Dict
 
 from helm.common.cache import Cache, CacheConfig
+from helm.common.hierarchical_logger import hexception
 from helm.common.request import Request, RequestResult, GeneratedOutput
 from helm.common.tokenization_request import (
     TokenizationRequest,
@@ -54,6 +55,7 @@ class DeepFloydClient(Client):
 
             response, cached = self._cache.get(cache_key, fail)
         except RuntimeError as e:
+            hexception(e)
             error: str = f"DeepFloyd Client error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
helm/clients/image_generation/huggingface_diffusers_client.py
CHANGED

@@ -7,7 +7,7 @@ import torch
 from helm.common.cache import CacheConfig, Cache
 from helm.common.file_caches.file_cache import FileCache
 from helm.common.gpu_utils import get_torch_device_name, is_cuda_available
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import Request, RequestResult, GeneratedOutput, wrap_request_time
 from helm.common.tokenization_request import (
@@ -178,6 +178,7 @@ class HuggingFaceDiffusersClient(Client):
             )
             results, cached = self._cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as ex:
+            hexception(ex)
             error: str = f"HuggingFaceDiffusersClient error: {ex}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
helm/clients/image_generation/lexica_client.py
CHANGED

@@ -5,6 +5,7 @@ import urllib.parse
 
 from helm.common.cache import CacheConfig, Cache
 from helm.common.file_caches.file_cache import FileCache
+from helm.common.hierarchical_logger import hexception
 from helm.common.images_utils import encode_base64
 from helm.common.request import Request, RequestResult, GeneratedOutput, wrap_request_time
 from helm.common.tokenization_request import (
@@ -62,6 +63,7 @@ class LexicaClient(Client):
 
             response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as e:
+            hexception(e)
             error: str = f"LexicaClient error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
helm/clients/image_generation/mindalle/models/stage1/layers.py
CHANGED

@@ -141,7 +141,7 @@ class Encoder(nn.Module):
         in_channels: int,
         resolution: int,
         z_channels: int,
-        double_z: Optional[bool] = None
+        double_z: Optional[bool] = None,
     ) -> None:
         super().__init__()
         self.ch = ch
@@ -232,7 +232,7 @@ class Decoder(nn.Module):
         in_channels: int,
         resolution: int,
         z_channels: int,
-        double_z: bool
+        double_z: bool,
     ) -> None:
         super().__init__()
         self.ch = ch
helm/clients/image_generation/mindalle_client.py
CHANGED

@@ -5,7 +5,7 @@ import numpy as np
 from helm.common.cache import CacheConfig, Cache
 from helm.common.file_caches.file_cache import FileCache
 from helm.common.gpu_utils import get_torch_device_name
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import Request, RequestResult, GeneratedOutput, wrap_request_time
 from helm.common.tokenization_request import (
@@ -91,6 +91,7 @@ class MinDALLEClient(Client):
             )
             results, cached = self._cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as ex:
+            hexception(ex)
             error: str = f"MinDALLEClient error: {ex}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
helm/clients/image_generation/together_image_generation_client.py
CHANGED

@@ -4,6 +4,7 @@ import requests
 
 from helm.common.cache import CacheConfig, Cache
 from helm.common.file_caches.file_cache import FileCache
+from helm.common.hierarchical_logger import hexception
 from helm.common.request import Request, RequestResult, GeneratedOutput, wrap_request_time
 from helm.common.tokenization_request import (
     TokenizationRequest,
@@ -84,6 +85,7 @@ class TogetherImageGenerationClient(Client):
 
             response, cached = self._cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as e:
+            hexception(e)
             error: str = f"TogetherVisionClient error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
helm/clients/megatron_client.py
CHANGED

@@ -4,6 +4,7 @@ from typing import Any, Dict, List
 import traceback
 from helm.common.cache import CacheConfig
 
+from helm.common.hierarchical_logger import hexception
 from helm.common.request import (
     wrap_request_time,
     EMBEDDING_UNAVAILABLE_REQUEST_RESULT,
@@ -103,6 +104,7 @@ class MegatronClient(CachingClient):
         try:
             return self._make_request(request)
         except Exception as e:
+            hexception(e)
             return RequestResult(
                 success=False,
                 cached=False,
helm/clients/mistral_client.py
CHANGED

@@ -1,6 +1,7 @@
 import requests
 from typing import Any, Dict, List, Optional, TypedDict, Union
 
+from helm.common.hierarchical_logger import hexception
 from helm.proxy.retry import NonRetriableException
 from helm.common.cache import CacheConfig
 from helm.common.media_object import IMAGE_TYPE, TEXT_TYPE
@@ -156,6 +157,7 @@ class MistralAIClient(CachingClient):
 
             response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except (requests.exceptions.RequestException, AssertionError) as e:
+            hexception(e)
             error: str = f"MistralClient error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
helm/clients/moderation_api_client.py
CHANGED

@@ -1,5 +1,6 @@
 from typing import Any, Dict
 
+from helm.common.hierarchical_logger import hexception
 from helm.common.request import wrap_request_time
 from helm.common.cache import Cache, CacheConfig
 from helm.common.moderations_api_request import (
@@ -64,6 +65,7 @@ class ModerationAPIClient:
 
             response, cached = self.cache.get(raw_request, wrap_request_time(do_it))
         except openai.OpenAIError as e:
+            hexception(e)
             error: str = f"Moderation API error: {e}"
             return ModerationAPIRequestResult(
                 success=False, cached=False, error=error, flagged=None, flagged_results=None, scores=None
helm/clients/openai_client.py
CHANGED

@@ -10,7 +10,7 @@ from helm.common import multimodal_request_utils
 from helm.common.cache import CacheConfig
 from helm.common.media_object import TEXT_TYPE, MultimediaObject, MediaObject
 from helm.common.request import ErrorFlags, Thinking, wrap_request_time, Request, RequestResult, GeneratedOutput, Token
-from helm.common.hierarchical_logger import hlog, hwarn
+from helm.common.hierarchical_logger import hlog, hwarn, hexception
 from helm.common.object_spec import get_class_by_name
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.tokenization_request import (
@@ -33,9 +33,12 @@ class OpenAIClientUtils:
     @classmethod
     def is_reasoning_model(cls, model_engine: str) -> bool:
         # All OpenAI reasoning models start "o[somenumber]", so we regexp for that to future proof things
-        return bool(re.match(r"^o\d+", model_engine))
+        return bool(re.match(r"^o\d+", model_engine)) or bool(re.match(r"^gpt-5", model_engine))
 
     # Error OpenAI throws when the image in the prompt violates their content policy
+    HARMFUL_INFORMATION_ERROR: str = (
+        "Invalid prompt: we've limited access to this content for safety reasons. This type of information may be used to benefit or to harm people."  # noqa: E501
+    )
     INAPPROPRIATE_IMAGE_ERROR: str = "Your input image may contain content that is not allowed by our safety system"
     INAPPROPRIATE_PROMPT_ERROR: str = "Invalid prompt: your prompt was flagged"
     INAPPROPRIATE_PROMPT_AZURE_ERROR: str = (
@@ -44,12 +47,10 @@ class OpenAIClientUtils:
     INAPPROPRIATE_PROMPT_MICROSOFT_ERROR: str = (
         "The response was filtered due to the prompt triggering Microsoft's content management policy."
     )
-
-    #
-
-
-        "or contact us through our help center at help.openai.com if you keep seeing this error."
-    )
+    # Grok content safety guidelines error message
+    # TODO: Refactor so that this is owned by the Grok client instead.
+    SAFETY_GUIDELINES_GROK_ERROR: str = "Content violates safety guidelines."
+    USAGE_GUIDELINES_GROK_ERROR: str = "Content violates usage guidelines."
 
     # Set the finish reason to this if the prompt violates OpenAI's content policy
     CONTENT_POLICY_VIOLATED_FINISH_REASON: str = (
@@ -74,21 +75,14 @@ class OpenAIClientUtils:
                 completions=[empty_completion] * request.num_completions,
                 embedding=[],
             )
-        elif cls.
-            # Handle these errors by returning an empty completion to unblock
-            hwarn(f"OpenAI server error for request: {str(request)}")
-            empty_completion = GeneratedOutput(
-                text="",
-                logprob=0,
-                tokens=[],
-                finish_reason={"reason": cls.OPENAI_SERVER_ERROR},
-            )
+        elif cls.HARMFUL_INFORMATION_ERROR in str(e):
             return RequestResult(
-                success=
+                success=False,
                 cached=False,
-
-                completions=[
+                error="Prompt blocked by OpenAI's safety filter",
+                completions=[],
                 embedding=[],
+                error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
             )
         elif cls.INAPPROPRIATE_PROMPT_AZURE_ERROR in str(e) or cls.INAPPROPRIATE_PROMPT_MICROSOFT_ERROR in str(e):
             return RequestResult(
@@ -99,7 +93,26 @@ class OpenAIClientUtils:
                 embedding=[],
                 error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
             )
+        elif cls.SAFETY_GUIDELINES_GROK_ERROR in str(e):
+            return RequestResult(
+                success=False,
+                cached=False,
+                error="Grok API error: Content violates safety guidelines",
+                completions=[],
+                embedding=[],
+                error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
+            )
+        elif cls.USAGE_GUIDELINES_GROK_ERROR in str(e):
+            return RequestResult(
+                success=False,
+                cached=False,
+                error="Grok API error: Content violates usage guidelines",
+                completions=[],
+                embedding=[],
+                error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
+            )
 
+        hexception(e)
         error: str = f"OpenAI error: {e}"
         return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
@@ -118,11 +131,12 @@ class OpenAIClient(CachingClient):
         reasoning_effort: Optional[str] = None,
         openai_model_name: Optional[str] = None,
         output_processor: Optional[str] = None,
+        **kwargs,
     ):
         super().__init__(cache_config=cache_config)
         self.tokenizer = tokenizer
         self.tokenizer_name = tokenizer_name
-        self.client = OpenAI(api_key=api_key, organization=org_id, base_url=base_url)
+        self.client = OpenAI(api_key=api_key, organization=org_id, base_url=base_url, **kwargs)
         self.reasoning_effort = reasoning_effort
         self.openai_model_name = openai_model_name
         self.output_processor: Optional[Callable[[str], str]] = (
@@ -157,6 +171,7 @@ class OpenAIClient(CachingClient):
             cache_key = self._get_cache_key(raw_request, request)
             response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except openai.OpenAIError as e:
+            hexception(e)
             error: str = f"OpenAI error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
@@ -423,6 +438,7 @@ class OpenAIClient(CachingClient):
             cache_key = self._get_cache_key(raw_request, request)
             response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except openai.OpenAIError as e:
+            hexception(e)
             error: str = f"OpenAI error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
@@ -478,6 +494,7 @@ class OpenAIClient(CachingClient):
             cache_key = self._get_cache_key({"audio": audio_path, "model": model}, request)
             response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except openai.OpenAIError as e:
+            hexception(e)
             error: str = f"OpenAI error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
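The updated is_reasoning_model check now also treats gpt-5 engines as reasoning models. A standalone illustration of the two regexes:

import re

def is_reasoning_model(model_engine: str) -> bool:
    return bool(re.match(r"^o\d+", model_engine)) or bool(re.match(r"^gpt-5", model_engine))

assert is_reasoning_model("o3-mini")       # starts with "o" + digit
assert is_reasoning_model("gpt-5-mini")    # starts with "gpt-5"
assert not is_reasoning_model("gpt-4o")    # matches neither pattern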
helm/clients/openai_responses_client.py
CHANGED

@@ -5,6 +5,7 @@ from typing import Any, Dict, List, Optional, Union
 
 from helm.clients.openai_client import OpenAIClientUtils
 from helm.common.cache import CacheConfig
+from helm.common.hierarchical_logger import hwarn
 from helm.common.media_object import TEXT_TYPE
 from helm.common.request import (
     Thinking,
@@ -60,7 +61,28 @@ class OpenAIResponseClient(CachingClient):
 
     def _make_raw_request(self, request: Request) -> dict[str, Any]:
         input: Union[str, List[Dict[str, Any]]]
-
+
+        if (
+            (request.prompt and request.messages)
+            or (request.prompt and request.multimodal_prompt)
+            or (request.messages and request.multimodal_prompt)
+        ):
+            raise ValueError(
+                f"More than one of `prompt`, `messages` and `multimodal_prompt` was set in request: {request}"
+            )
+
+        if request.messages is not None:
+            # Checks that all messages have a role and some content
+            for message in request.messages:
+                if not message.get("role") or not message.get("content"):
+                    raise ValueError("All messages must have a role and content")
+            # Checks that the last role is "user"
+            if request.messages[-1]["role"] != "user":
+                raise ValueError("Last message must have role 'user'")
+            if request.prompt != "":
+                hwarn("Since message is set, prompt will be ignored")
+            input = request.messages
+        elif request.multimodal_prompt is not None:
             content = []
             request.validate()
             for media_object in request.multimodal_prompt.media_objects:
@@ -101,6 +123,8 @@ class OpenAIResponseClient(CachingClient):
         # Plus other changes
         model_engine: str = request.model_engine
         if OpenAIClientUtils.is_reasoning_model(model_engine):
+            if "reasoning" not in raw_request:
+                raw_request["reasoning"] = {}
             raw_request["reasoning"]["summary"] = "detailed"
             # Avoid error:
             # "Error code: 400 - {'error': {'message': "Unsupported parameter: 'temperature' is
@@ -145,13 +169,15 @@ class OpenAIResponseClient(CachingClient):
         if request.echo_prompt:
             text_output += request.prompt
         for output in response["output"]:
-            output_type = output[
-
-
-
-
-
-
+            output_type = output[
+                "type"
+            ]  # one of "message" or "reasoning" from API observation, but can also include tool calls
+
+            if output_type == "reasoning":
+                reasoning_output += "\n\n".join([raw_output["text"] for raw_output in output["summary"]])
+            elif output_type == "message":
+                text_output += "\n\n".join([raw_output["text"] for raw_output in output["content"]])
+            # (Other output types are ignored)
 
         completion = truncate_and_tokenize_response_text(
             text_output,
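The new validation in _make_raw_request requires every message to carry a non-empty role and content, with the final turn from the user. An example message list that passes those checks (illustrative content):

messages = [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "Summarize HELM in one sentence."},
]

assert all(m.get("role") and m.get("content") for m in messages)
assert messages[-1]["role"] == "user"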
helm/clients/openrouter_client.py
ADDED

@@ -0,0 +1,31 @@
+import os
+from typing import Optional
+from helm.clients.openai_client import OpenAIClient
+from helm.common.cache import CacheConfig
+from helm.tokenizers.tokenizer import Tokenizer
+
+
+class OpenRouterClient(OpenAIClient):
+    def __init__(
+        self,
+        tokenizer_name: str,
+        tokenizer: Tokenizer,
+        cache_config: CacheConfig,
+        api_key: Optional[str] = None,
+        model_name: Optional[str] = None,
+        output_processor: Optional[str] = None,
+    ):
+        self.api_key = api_key or os.getenv("OPENROUTER_API_KEY")
+        self.base_url = "https://openrouter.ai/api/v1/"
+        super().__init__(
+            tokenizer,
+            tokenizer_name,
+            cache_config=cache_config,
+            output_processor=output_processor,
+            base_url=self.base_url,
+            api_key=self.api_key,
+        )
+        self.model_name = model_name
+
+    def _get_model_for_request(self, request):
+        return self.model_name or request.model
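A sketch of constructing the new OpenRouterClient directly (in normal runs HELM builds clients from model_deployments.yaml; the tokenizer setup mirrors the test file below, and the model id is a hypothetical example):

from helm.clients.openrouter_client import OpenRouterClient
from helm.common.cache import BlackHoleCacheConfig
from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer

tokenizer = HuggingFaceTokenizer(
    BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai-community/gpt2"
)
client = OpenRouterClient(
    tokenizer_name="huggingface/gpt2",
    tokenizer=tokenizer,
    cache_config=BlackHoleCacheConfig(),
    model_name="meta-llama/llama-3.1-8b-instruct",  # hypothetical OpenRouter model id
)
# With api_key omitted, the client reads OPENROUTER_API_KEY from the environment.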
helm/clients/palmyra_client.py
CHANGED

@@ -5,7 +5,7 @@ from typing import Any, Dict, List
 
 from helm.clients.openai_client import OpenAIClient
 from helm.common.cache import CacheConfig
-from helm.common.hierarchical_logger import hwarn
+from helm.common.hierarchical_logger import hexception, hwarn
 from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token, ErrorFlags
 from helm.common.tokenization_request import (
     TokenizationRequest,
@@ -99,6 +99,7 @@ class PalmyraClient(CachingClient):
 
             response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except (requests.exceptions.RequestException, AssertionError) as e:
+            hexception(e)
             error: str = f"PalmyraClient error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
helm/clients/reka_client.py
CHANGED

@@ -6,7 +6,7 @@ from helm.proxy.retry import NonRetriableException
 from helm.common.cache import CacheConfig
 from helm.common.media_object import TEXT_TYPE
 from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput
-from helm.common.hierarchical_logger import hwarn
+from helm.common.hierarchical_logger import hexception, hwarn
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.tokenizers.tokenizer import Tokenizer
 from helm.clients.client import CachingClient, truncate_and_tokenize_response_text
@@ -167,6 +167,7 @@ class RekaClient(CachingClient):
 
             response, cached = self.cache.get(raw_request, wrap_request_time(do_it))
         except (requests.exceptions.RequestException, AssertionError) as e:
+            hexception(e)
             error: str = f"RekaClient error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
helm/clients/stanfordhealthcare_azure_openai_client.py
CHANGED

@@ -39,7 +39,7 @@ class StanfordHealthCareAzureOpenAIClient(AzureOpenAIClient):
             tokenizer=tokenizer,
             tokenizer_name=tokenizer_name,
             cache_config=cache_config,
-            api_key=
+            api_key=api_key,
             base_url=base_url,
             azure_openai_deployment_name=openai_model_name,
             api_version=api_version,
@@ -50,7 +50,7 @@ class StanfordHealthCareAzureOpenAIClient(AzureOpenAIClient):
             tokenizer=tokenizer,
             tokenizer_name=tokenizer_name,
             cache_config=cache_config,
-            api_key=
+            api_key=api_key,
             endpoint=endpoint,
             azure_openai_deployment_name=openai_model_name,
             api_version=api_version,
helm/clients/stanfordhealthcare_http_model_client.py
CHANGED

@@ -5,6 +5,7 @@ from dataclasses import asdict
 from typing import Any, Dict, List, Optional
 
 from helm.common.cache import CacheConfig
+from helm.common.hierarchical_logger import hexception
 from helm.common.request import (
     wrap_request_time,
     Request,
@@ -82,6 +83,7 @@ class StanfordHealthCareHTTPModelClient(CachingClient, ABC):
                 request_time=response["request_time"],
             )
         except requests.exceptions.RequestException as e:
+            hexception(e)
             return RequestResult(success=False, cached=False, error=f"Request error: {e}", completions=[], embedding=[])
 
     @abstractmethod
|
|
|
9
9
|
class TestHuggingFaceClient:
|
|
10
10
|
def test_gpt2(self):
|
|
11
11
|
tokenizer = HuggingFaceTokenizer(
|
|
12
|
-
BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai/gpt2"
|
|
12
|
+
BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai-community/gpt2"
|
|
13
13
|
)
|
|
14
14
|
client = HuggingFaceClient(
|
|
15
15
|
cache_config=BlackHoleCacheConfig(),
|
|
@@ -36,7 +36,7 @@ class TestHuggingFaceClient:
|
|
|
36
36
|
@pytest.mark.skip(reason="GPT-J 6B is 22 GB and extremely slow without a GPU.")
|
|
37
37
|
def test_gptj_6b(self):
|
|
38
38
|
tokenizer = HuggingFaceTokenizer(
|
|
39
|
-
BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai/gpt2"
|
|
39
|
+
BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai-community/gpt2"
|
|
40
40
|
)
|
|
41
41
|
client = HuggingFaceClient(
|
|
42
42
|
cache_config=BlackHoleCacheConfig(),
|
|
@@ -57,7 +57,7 @@ class TestHuggingFaceClient:
|
|
|
57
57
|
|
|
58
58
|
def test_logprob(self):
|
|
59
59
|
tokenizer = HuggingFaceTokenizer(
|
|
60
|
-
BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai/gpt2"
|
|
60
|
+
BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai-community/gpt2"
|
|
61
61
|
)
|
|
62
62
|
client = HuggingFaceClient(
|
|
63
63
|
cache_config=BlackHoleCacheConfig(),
|
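The test fixture change reflects the GPT-2 weights moving under the openai-community organization on the Hugging Face Hub. Loading by the new id directly (a sketch assuming transformers is installed):

from transformers import AutoTokenizer

# "openai-community/gpt2" is the current canonical repo id; the bare "gpt2"
# id still resolves to it as an alias.
tok = AutoTokenizer.from_pretrained("openai-community/gpt2")
print(tok("Hello, world!")["input_ids"])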